import nltk
import csv
import json
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import blankline_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer # import Porter stemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import urllib.request as urllib
from bs4 import BeautifulSoup
from nltk.metrics import edit_distance
# Download the NLTK data used below: tokenizer models, WordNet, and stop word lists.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
# csv load
with open('600000.csv', 'r') as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    # for line in reader:
    #     print(line[1])  # assuming the second field is the raw string
    rows = [row for row in reader]
print(rows)
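# If the CSV file has a header row, csv.DictReader exposes each row as a dict
# keyed by column name instead of position. A minimal sketch; the layout of
# '600000.csv' is not shown here, so 'text' is a hypothetical column name.
with open('600000.csv', 'r') as f:
    dict_reader = csv.DictReader(f)
    dict_rows = [row for row in dict_reader]
# print(dict_rows[0]['text'])  # hypothetical column name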
# json load
with open('example.json') as jsonfile:
    data = json.load(jsonfile)
print(data)
# Text cleaning, as in Chapter 1
# Sentence splitter
# from nltk.tokenize import sent_tokenize
inputstring = ' This is an example sent. The sentence splitter will split on sent markers. Ohh really !!'
allsent = sent_tokenize(inputstring)
print(allsent)
# Tokenization
# from nltk.tokenize import word_tokenize
s = "Hi Everyone ! hola gr_8" # simplest tokenizer
print(s.split())
# from nltk.tokenize import word_tokenize
word = word_tokenize(s)
print(word)
# from nltk.tokenize import regexp_tokenize
word1 = regexp_tokenize(s, pattern=r"\w+")
print(word1)
word2 = regexp_tokenize(s, pattern=r"\d+")
print(word2)
# from nltk.tokenize import blankline_tokenize
word3 = blankline_tokenize(s)
print(word3)
# from nltk.tokenize import wordpunct_tokenize
word4 = wordpunct_tokenize(s)
print(word4)
# Stemming
# from nltk.stem import PorterStemmer # import Porter stemmer
pst = PorterStemmer()  # create a PorterStemmer object
print(pst.stem('shopping'))
# from nltk.stem.lancaster import LancasterStemmer
lst = LancasterStemmer()
print(lst.stem('eating'))
# from nltk.stem.snowball import SnowballStemmer
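# A minimal Snowball stemmer sketch; the 'english' language name is an
# assumption, any language supported by SnowballStemmer can be passed in.
from nltk.stem.snowball import SnowballStemmer
sst = SnowballStemmer('english')  # Snowball ("Porter2") stemmer for English
print(sst.stem('shopping'))       # comparable to the Porter result above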
# Lemmatization
# from nltk.stem import WordNetLemmatizer
wlem = WordNetLemmatizer()
print(wlem.lemmatize("ate"))
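# lemmatize() treats words as nouns by default, which is why "ate" comes back
# unchanged above; passing the part of speech returns the verb lemma instead.
print(wlem.lemmatize("ate", pos="v"))  # -> 'eat'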
# Stop word removal (English)
# from nltk.corpus import stopwords
stoplist = stopwords.words('english')  # load the stop word list for the given language
text = "This is just a test"
cleanwordlist = [word for word in text.split() if word not in stoplist]
print(cleanwordlist)
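# Note: the comprehension above compares tokens case-sensitively, so the
# capitalized "This" is not filtered out. A minimal case-insensitive variant:
cleanwordlist_ci = [word for word in text.split() if word.lower() not in stoplist]
print(cleanwordlist_ci)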
# Stop word removal (Chinese)
stoplist1 = stopwords.words('chinese')
text1 = '这是我写的一段中文文字,用于停用次测试,我也不知道可不可以,反正我也不敢问'
cleanwordlist1 = [word for word in text1.split() if word not in stoplist1]
print(cleanwordlist1)
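# Note: str.split() does not segment Chinese text, so the comprehension above
# keeps the whole sentence as a single token. A minimal sketch using the
# third-party jieba segmenter (an assumption -- it is not imported above and
# must be installed separately, e.g. `pip install jieba`):
import jieba
tokens_zh = jieba.lcut(text1)  # segment the sentence into a list of words
cleanwordlist2 = [word for word in tokens_zh if word not in stoplist1]
print(cleanwordlist2)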
# Rare word removal
response = urllib.urlopen('http://python.org/')
html = response.read()
clean = BeautifulSoup(html, "html5lib").get_text()
# clean now holds the page text with all the HTML markup stripped out
tokens = clean.split()
freq_dist = nltk.FreqDist(tokens)
# most_common() sorts by frequency, so the tail of the list holds the 50 rarest tokens
rarewords = [word for word, _ in freq_dist.most_common()[-50:]]
after_rare_words = [word for word in tokens if word not in rarewords]
print(after_rare_words)
# Spelling correction
# from nltk.metrics import edit_distance
print(edit_distance("rain", "shine"))
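# edit_distance only measures how far apart two strings are; a minimal
# spelling-correction sketch on top of it picks the candidate with the
# smallest distance to the misspelling (the vocabulary below is a made-up
# example, not from the original).
vocabulary = ['rain', 'shine', 'brain', 'rein']
misspelled = 'shinee'
suggestion = min(vocabulary, key=lambda w: edit_distance(misspelled, w))
print(suggestion)  # -> 'shine', the closest candidate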