#!/usr/bin/python
#encoding:utf-8
"""
@author: LlQ
@contact:LIQINGLIN54951@gmail.com
@file:cp9_p172.py
@time: 5/19/2019 3:40 AM
"""
from urllib.request import urlopen
import re
import string
from collections import Counter

def cleanSentence(sentence):
    # Split the sentence into words on single spaces.
    wordList = sentence.split(' ')
    # Strip punctuation and whitespace from both ends of each word.
    # string.punctuation is: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    wordList = [word.strip(string.punctuation + string.whitespace) for word in wordList]
    # Drop single-character words, except for 'a' and 'I'.
    wordList = [word for word in wordList if len(word) > 1
                or (word.lower() == 'a' or word.lower() == 'i')]
    return wordList
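
# Quick sanity check (my own illustrative sentence, not taken from the speech):
# punctuation is stripped from the word edges and the one-letter words 'A'/'I'
# survive the length filter.
assert cleanSentence('A fellow-citizen, I greet you!') == \
    ['A', 'fellow-citizen', 'I', 'greet', 'you']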

def cleanInput(content):
    # Replace newlines and citation markers such as [123] or [1] with a space.
    content = re.sub(r'\n|\[\d+\]', ' ', content)
    # Collapse runs of spaces so that words are separated by a single space.
    content = re.sub(' +', ' ', content)
    # Drop non-ASCII escape characters by round-tripping through UTF-8/ASCII.
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    # Split into sentences, then into cleaned word lists: [[word, ...], ...]
    sentenceList = content.split('. ')
    return [cleanSentence(sentence) for sentence in sentenceList]
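
# Illustrative example (hypothetical input, not from the book): the newline and
# the citation marker are removed before the text is split into word lists.
assert cleanInput('Hello world.[1]\nGoodbye now. The end.') == \
    [['Hello', 'world'], ['Goodbye', 'now'], ['The', 'end']]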

def getNgramsFromSentence(wordList, n):
    # Slide a window of n words over the sentence: [[n words], [n words], ...]
    output = []
    for i in range(len(wordList) - n + 1):
        output.append(wordList[i:i+n])
    return output
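
# Illustrative example: the 2-grams produced from a three-word sentence.
assert getNgramsFromSentence(['THE', 'GENERAL', 'GOVERNMENT'], 2) == \
    [['THE', 'GENERAL'], ['GENERAL', 'GOVERNMENT']]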

def getNgrams(content, n):
    content = content.upper()
    # cleanInput returns a list of sentences, each a list of cleaned words.
    content = cleanInput(content)
    # A plain list of the joined n-grams would also work; a Counter additionally
    # records how often each n-gram occurs.
    ngrams = Counter()
    for sentence in content:
        newNgrams = [' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n)]
        ngrams.update(newNgrams)
    return ngrams
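
# Illustrative example (made-up text, not from the speech): the Counter records
# that the 2-gram 'THE GENERAL' occurs twice across the two sentences.
assert getNgrams('the general government. The general welfare.', 2)['THE GENERAL'] == 2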

def isCommon(ngram):
    # ngram is expected to be a list of uppercase words; return True if any of
    # them is one of the ~100 most common English words.
    commonWords = ['THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I',
                   'THAT', 'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY',
                   'IS', 'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'THAT', 'NOT', 'BY',
                   'SHE', 'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF',
                   'WOULD', 'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'AS',
                   'UP', 'ONE', 'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK',
                   'WHEN', 'WHICH', 'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO',
                   'JUST', 'SEE', 'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE',
                   'OTHER', 'HOW', 'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT',
                   'WAY', 'LOOK', 'FIRST', 'ALSO', 'NEW', 'BECAUSE', 'DAY', 'MORE', 'USE',
                   'NO', 'MAN', 'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL']
    for word in ngram:
        if word in commonWords:
            return True
    return False
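
# Illustrative examples with hypothetical word lists: a 2-gram counts as
# "common" as soon as it contains one of the common words above.
assert isCommon(['THE', 'CONSTITUTION']) is True
assert isCommon(['EXECUTIVE', 'DEPARTMENT']) is False
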
content = str(urlopen('http://pythonscraping.com/files/inaugurationSpeech.txt').read(), 'utf-8')
ngrams = getNgrams(content,2)
#print( list(ngrams.elements()))
# Keep only the 2-grams that occur at least three times AND contain at least
# one common word; iterate over a snapshot of the keys while deleting.
for ngram in list(ngrams.keys()):
    if ngrams[ngram] < 3 or not isCommon(ngram.split(' ')):
        del ngrams[ngram]

import operator
# Sort the surviving 2-grams by frequency, least frequent first.
sortedNGrams = sorted(ngrams.items(), key=operator.itemgetter(1), reverse=False)
print(sortedNGrams)
print('2-grams count is: ' + str(len(sortedNGrams)))  # number of surviving 2-grams
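
# Note (my addition, not in the original listing): collections.Counter already
# provides most_common(), which returns (ngram, count) pairs sorted from most
# to least frequent, so the same information could also be printed with:
print(ngrams.most_common(10))  # the ten most frequent surviving 2-grams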