文本清洗,導出到文件
import re
# Clean English text.
def clean_en_text(text):
    """Replace every character that is not an ASCII letter, digit or space.

    Each disallowed character (punctuation, tabs, newlines, non-ASCII
    characters, ...) is replaced by a single space, so the result has the
    same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Inside a character class only the leading '^' negates; any further '^'
    # is a literal caret.  The class therefore lists the kept ranges once,
    # without repeating '^', so that caret characters are removed as well.
    return re.sub(r'[^A-Za-z0-9 ]', ' ', text)
# Clean Chinese text.
def clean_zh_text(text):
    """Replace every character that is not a letter, digit or CJK ideograph.

    Kept characters are ASCII letters, ASCII digits and code points in the
    range U+4E00..U+9FA5.  Every other character (including whitespace and
    punctuation) is replaced by a single space, so the result has the same
    length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Only the leading '^' negates the class; repeating it would whitelist
    # the literal caret character, so the kept ranges are listed just once.
    return re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', ' ', text)
def file_en_clean(r_file_ad, w_file_ad):
    """Clean an English text file line by line and write the result out.

    Every line of the input file is passed through ``clean_en_text`` and
    written to the output file followed by a newline.  The name of each file
    is printed when it is opened.

    Args:
        r_file_ad: Path of the text file to read.
        w_file_ad: Path of the file to write; it is opened in 'w' mode, so
            any existing content is overwritten.
    """
    # Context managers guarantee both handles are closed even if reading,
    # cleaning or writing raises.
    with open(r_file_ad, 'rt') as src:
        print('讀取文件的名字:', src.name)
        # The input is consumed completely before the output is opened.
        output = [clean_en_text(line) for line in src]

    with open(w_file_ad, 'w') as dst:
        print('寫入文件的名字:', dst.name)
        for cleaned in output:
            dst.write(cleaned)
            dst.write('\n')
if __name__ == '__main__':
    # The file containing this code and the two .txt files are expected to
    # be in the same directory.
    file_en_clean(r_file_ad='./e2.txt', w_file_ad='./new_test.txt')
加入詞頻統計,導出到文件
import re
# Clean English text.
def clean_en_text(text):
    """Replace every character that is not an ASCII letter, digit or space.

    Each disallowed character (punctuation, tabs, newlines, non-ASCII
    characters, ...) is replaced by a single space, so the result has the
    same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Inside a character class only the leading '^' negates; any further '^'
    # is a literal caret.  The class therefore lists the kept ranges once,
    # without repeating '^', so that caret characters are removed as well.
    return re.sub(r'[^A-Za-z0-9 ]', ' ', text)
# Clean Chinese text.
def clean_zh_text(text):
    """Replace every character that is not a letter, digit or CJK ideograph.

    Kept characters are ASCII letters, ASCII digits and code points in the
    range U+4E00..U+9FA5.  Every other character (including whitespace and
    punctuation) is replaced by a single space, so the result has the same
    length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Only the leading '^' negates the class; repeating it would whitelist
    # the literal caret character, so the kept ranges are listed just once.
    return re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', ' ', text)
def dealed_list(filename):
    """Read a text file and return its lines after English cleaning.

    The file name is printed when the file is opened, and every line is
    passed through ``clean_en_text``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one cleaned string per line of the file.
    """
    # The context manager closes the file even if cleaning a line raises.
    with open(filename, 'rt') as src:
        print('讀取文件的名字:', src.name)
        return [clean_en_text(line) for line in src]
def readlist(dealed_list):
    """Split cleaned lines into a flat list of lower-cased words.

    Args:
        dealed_list: An iterable of strings (one per line).

    Returns:
        A list of all whitespace-separated words, lower-cased, in the order
        in which they appear.
    """
    words = []  # accumulates the words of every line
    for line in dealed_list:
        # split() without arguments already ignores leading and trailing
        # whitespace, and extend() appends in place instead of rebuilding
        # the whole list for every line.
        words.extend(line.lower().split())
    return words
# Count the frequency of every word and store it in a dictionary,
# then sort the entries by frequency from largest to smallest.
def count(wordsL):
    """Count word occurrences, skipping words rejected by ``Judge``.

    Args:
        wordsL: An iterable of words.

    Returns:
        A list of ``(word, frequency)`` tuples sorted by frequency in
        descending order.  Words with equal frequency keep the order of
        their first occurrence because the sort is stable.
    """
    frequencies = {}
    for word in wordsL:
        # Skip the words we do not want to count.
        if Judge(word):
            continue
        # Start from 0 so that the first occurrence is counted as 1;
        # initialising to 1 and then incrementing would report every
        # frequency one too high.
        frequencies[word] = frequencies.get(word, 0) + 1
    # Sort the entries by value from large to small.
    return sorted(frequencies.items(), key=lambda entry: entry[1], reverse=True)
# Judge whether a word should be dropped, e.g. punctuation or a single letter.
# Extend the two collections below to drop more words, such as numbers.
def Judge(word):
    """Return True when *word* is one of the tokens excluded from counting.

    Args:
        word: The token to test.

    Returns:
        True if the token is one of the listed punctuation/whitespace
        characters or one of the listed single letters, otherwise False.
    """
    # Punctuation and whitespace tokens to drop.
    punctuation = (' ', '\t', '\n', ',', '.', ':', '?')
    # Single-letter tokens to drop.
    single_letters = ('a', 'b', 'c', 'd', 'm', 'n', 'x', 'p', 't')
    return word in punctuation or word in single_letters
if __name__ == '__main__':
    # Process e1.txt .. e5.txt and write one frequency table per input file.
    for x in range(1, 6):
        filename = 'e' + str(x) + '.txt'
        # Remove the unwanted characters.
        L = dealed_list(filename)
        wordsL = readlist(L)
        words = count(wordsL)
        # The context manager closes the output file even if a write fails.
        # NOTE: the ./results directory must already exist.
        with open('./results/words_e' + str(x) + '.txt', 'w') as fw:
            # One "<word>\t<frequency>" line per entry.
            for item in words:
                fw.write(item[0] + '\t' + str(item[1]) + '\n')
參考博文:
用Python實現針對英文論文的詞頻分析
Python正則表達式做文本預處理,去掉特殊符號