#!/usr/bin/python
#encoding:utf-8
"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:cp9_p178.py
@time: 5/19/2019 7:00 PM
"""
from urllib.request import urlopen
from random import randint
def wordListSum(wordList):
    """Return the total of all counts stored in ``wordList``.

    Args:
        wordList: Mapping of word -> numeric count.

    Returns:
        The sum of the mapping's values; 0 for an empty mapping.
    """
    # Use the builtin directly instead of shadowing it with a local named
    # ``sum``; only the values are needed, so the keys are not iterated.
    return sum(wordList.values())
def retrieveRandomWord(wordList):
    """Pick one key of ``wordList`` at random, weighted by its count.

    A target is drawn uniformly from 1..total (inclusive), where total is
    ``wordListSum(wordList)``. The entries are then walked in iteration
    order while accumulating their counts; the first entry whose running
    total reaches the target is returned, so each word is chosen with
    probability count / total.

    Args:
        wordList: Mapping of candidate word -> occurrence count.

    Returns:
        The selected word, or None if the loop finishes without the running
        total reaching the target.
    """
    target = randint(1, wordListSum(wordList))
    runningTotal = 0
    for candidate, weight in wordList.items():
        runningTotal += weight
        if runningTotal >= target:
            return candidate
    return None
def buildWordDict(text):
    """Build a first-order Markov transition table from ``text``.

    Double quotes are dropped, and each of the punctuation marks
    ``, . ; :`` is treated as a separate "word" so that it takes part in
    the chain. The text is then split on whitespace and every adjacent
    pair of words is counted.

    Args:
        text: The source text as a single string.

    Returns:
        A dictionary of dictionaries: ``result[word][nextWord]`` is the
        number of times ``nextWord`` directly follows ``word``. A word that
        is never followed by anything has no top-level entry. Empty or
        single-word input yields ``{}``.
    """
    # One pass over the text: delete double quotes and pad each punctuation
    # mark with spaces so the split below isolates it as its own token.
    replacements = {'"': None}
    for symbol in (',', '.', ';', ':'):
        replacements[symbol] = ' {} '.format(symbol)
    text = text.translate(str.maketrans(replacements))

    # split() with no argument splits on any run of whitespace and never
    # yields empty strings, so newlines, carriage returns and tabs all
    # separate words (splitting on ' ' alone left '\r' or '\t' attached).
    words = text.split()

    wordDict = {}
    # Pair every word with its successor and count the transition.
    for previous, current in zip(words, words[1:]):
        followers = wordDict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1
    return wordDict
# Download the source text. The context manager closes the HTTP response
# even if reading or decoding raises.
with urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    text = response.read().decode('utf-8')
wordDict = buildWordDict(text)

# Generate a Markov chain of up to 100 words following the seed word
length = 100
chain = ['Called']
for _ in range(length):
    # A word with no recorded successor has no entry in wordDict; stop the
    # chain there instead of failing with a KeyError.
    followers = wordDict.get(chain[-1])
    if not followers:
        break
    # Markov step: the next word depends only on the current last word.
    newWord = retrieveRandomWord(followers)
    chain.append(newWord)
print(' '.join(chain))