#!/usr/bin/python
#encoding:utf-8
"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:cp9_p172.py
@time: 5/19/2019 3:40 AM
"""
from urllib.request import urlopen
import re
import string
from collections import Counter

def cleanSentence(sentence):
    # Split the sentence into words on single spaces.
    wordList = sentence.split(' ')
    # Strip punctuation and whitespace from both ends of each word.
    # string.punctuation is: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    wordList = [word.strip(string.punctuation + string.whitespace) for word in wordList]
    # Drop single-character words, except for 'a' and 'I'.
    wordList = [word for word in wordList if len(word) > 1
                or (word.lower() == 'a' or word.lower() == 'i')]
    return wordList
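
# Quick sanity check (my own illustrative sentence, not taken from the speech):
# punctuation is stripped from the word edges and the one-letter words 'A'/'I'
# survive the length filter.
assert cleanSentence('A fellow-citizen, I greet you!') == \
    ['A', 'fellow-citizen', 'I', 'greet', 'you']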

def cleanInput(content):
    # Replace newlines and citation markers such as [123] or [1] with a space.
    content = re.sub(r'\n|\[\d+\]', ' ', content)
    # Collapse runs of spaces so that words are separated by a single space.
    content = re.sub(' +', ' ', content)
    # Drop non-ASCII escape characters by round-tripping through UTF-8/ASCII.
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    # Split into sentences, then into cleaned word lists: [[word, ...], ...]
    sentenceList = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentenceList]
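
# Illustrative example (hypothetical input, not from the book): the newline and
# the citation marker are removed before the text is split into word lists.
assert cleanInput('Hello world.[1]\nGoodbye now. The end.') == \
    [['Hello', 'world'], ['Goodbye', 'now'], ['The', 'end']]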

def getNgramsFromSentence(wordList, n):
    # Slide a window of n words over the sentence: [[n words], [n words], ...]
    output = []
    for i in range(len(wordList) - n + 1):
        output.append(wordList[i:i+n])
    return output
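
# Illustrative example: the 2-grams produced from a three-word sentence.
assert getNgramsFromSentence(['THE', 'GENERAL', 'GOVERNMENT'], 2) == \
    [['THE', 'GENERAL'], ['GENERAL', 'GOVERNMENT']]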

def getNgrams(content, n):
    content = content.upper()
    # cleanInput returns a list of sentences, each a list of cleaned words.
    content = cleanInput(content)
    # A plain list of the joined n-grams would also work; a Counter additionally
    # records how often each n-gram occurs.
    ngrams = Counter()
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams.update(newNgrams)
    return ngrams
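
# Illustrative example (made-up text, not from the speech): the Counter records
# that the 2-gram 'THE GENERAL' occurs twice across the two sentences.
assert getNgrams('the general government. The general welfare.', 2)['THE GENERAL'] == 2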

def isCommon(ngram):
    # ngram is expected to be a list of uppercase words; return True if any of
    # them is one of the ~100 most common English words.
    commonWords = ['THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I',
                   'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY',
                   'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'THAT', 'NOT', 'BY',
                   'SHE', 'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF',
                   'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'AS',
                   'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK',
                   'WHEN', 'WHICH', 'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO',
                   'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE',
                   'OTHER', 'HOW', 'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT',
                   'WAY', 'LOOK', 'FIRST', 'ALSO', 'NEW', 'BECAUSE', 'DAY', 'MORE', 'USE',
                   'NO', 'MAN', 'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL']
    for word in ngram:
        if word in commonWords:
            return True
    return False
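
# Illustrative examples with hypothetical word lists: a 2-gram counts as
# "common" as soon as it contains one of the common words above.
assert isCommon(['THE', 'CONSTITUTION']) is True
assert isCommon(['EXECUTIVE', 'DEPARTMENT']) is False
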
content = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
ngrams = getNgrams(content,2)
#print( list(ngrams.elements()))
# Keep only the 2-grams that occur at least three times AND contain at least
# one common word; iterate over a snapshot of the keys while deleting.
for ngram in list(ngrams.keys()):
    if ngrams[ngram] < 3 or not isCommon(ngram.split(' ')):
        del ngrams[ngram]

import operator
# Sort the surviving 2-grams by frequency, least frequent first.
sortedNGrams = sorted(ngrams.items(), key=operator.itemgetter(1), reverse=False)
print(sortedNGrams)
print('2-grams count is: ' + str(len(sortedNGrams)))  # number of surviving 2-grams
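
# Note (my addition, not in the original listing): collections.Counter already
# provides most_common(), which returns (ngram, count) pairs sorted from most
# to least frequent, so the same information could also be printed with:
print(ngrams.most_common(10))  # the ten most frequent surviving 2-grams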