麻豆传媒丝袜制度视频,欧美亚州日韩国产

文本清洗，導出到文件

import re

# Clean English text.
def clean_en_text(text):
    """Blank out everything except ASCII letters, digits and spaces.

    Every character outside ``A-Z``, ``a-z``, ``0-9`` and the space character
    (punctuation, tabs, newlines, non-ASCII text, ...) is replaced by a single
    space, so the result always has the same length as *text*.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # '^' negates a character class only when it is the first character.
    # Repeating it before every range (as the pattern used to) adds a literal
    # '^' to the allowed set, which let carets survive the cleaning.
    comp = re.compile(r'[^A-Za-z0-9 ]')
    return comp.sub(' ', text)

# Clean Chinese text.
def clean_zh_text(text):
    """Blank out everything except ASCII letters, digits and Chinese characters.

    Characters outside ``A-Z``, ``a-z``, ``0-9`` and the range
    U+4E00..U+9FA5 are each replaced by a single space, so the result always
    has the same length as *text*.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # '^' negates a character class only when it is the first character.
    # Repeating it before every range (as the pattern used to) adds a literal
    # '^' to the allowed set, which let carets survive the cleaning.
    comp = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub(' ', text)

def file_en_clean(r_file_ad, w_file_ad):
    """Clean a text file line by line and write the result to another file.

    Each line of the input file is passed through ``clean_en_text`` and then
    written to the output file followed by ``'\\n'``. The input line is passed
    with its line terminator still attached, so whatever ``clean_en_text``
    turns that terminator into is written before the added newline.

    The names of both files are printed to standard output.

    Args:
        r_file_ad: Path of the file to read.
        w_file_ad: Path of the file to write; it is opened in ``'w'`` mode, so
            existing content is overwritten.
    """
    # Context managers guarantee both handles are closed even if reading,
    # cleaning or writing raises.
    with open(r_file_ad, 'rt') as src:
        print('讀取文件的名字：', src.name)
        output = [clean_en_text(line) for line in src]

    # The output file is opened only after the input has been fully read.
    with open(w_file_ad, 'w') as dst:
        print('寫入文件的名字：', dst.name)
        for cleaned in output:
            dst.write(cleaned)
            dst.write('\n')
if __name__ == '__main__':
    # This script and the .txt files it works on live in the same directory.
    source_path, target_path = './e2.txt', './new_test.txt'
    file_en_clean(source_path, target_path)

加入詞頻統計，導出到文件

import re

# Clean English text.
def clean_en_text(text):
    """Blank out everything except ASCII letters, digits and spaces.

    Every character outside ``A-Z``, ``a-z``, ``0-9`` and the space character
    (punctuation, tabs, newlines, non-ASCII text, ...) is replaced by a single
    space, so the result always has the same length as *text*.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # '^' negates a character class only when it is the first character.
    # Repeating it before every range (as the pattern used to) adds a literal
    # '^' to the allowed set, which let carets survive the cleaning.
    comp = re.compile(r'[^A-Za-z0-9 ]')
    return comp.sub(' ', text)

# Clean Chinese text.
def clean_zh_text(text):
    """Blank out everything except ASCII letters, digits and Chinese characters.

    Characters outside ``A-Z``, ``a-z``, ``0-9`` and the range
    U+4E00..U+9FA5 are each replaced by a single space, so the result always
    has the same length as *text*.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # '^' negates a character class only when it is the first character.
    # Repeating it before every range (as the pattern used to) adds a literal
    # '^' to the allowed set, which let carets survive the cleaning.
    comp = re.compile('[^A-Za-z0-9\u4e00-\u9fa5]')
    return comp.sub(' ', text)

def dealed_list(filename):
    """Read a text file and return its lines after cleaning.

    The name of the file is printed to standard output, then every line
    (line terminator included) is passed through ``clean_en_text``.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one cleaned string per input line, in file order.
    """
    # The context manager closes the file even if cleaning a line raises.
    with open(filename, 'rt') as src:
        print('讀取文件的名字：', src.name)
        return [clean_en_text(line) for line in src]

def readlist(dealed_list):
    """Flatten a list of text lines into a list of lower-cased words.

    Each line is lower-cased and split on runs of whitespace; the resulting
    words of all lines are collected in order.

    Args:
        dealed_list: An iterable of strings (one per line).

    Returns:
        A list of the lower-cased words of all lines, in their original order.
    """
    wordsL = []  # accumulates the words of every line
    for line in dealed_list:
        # split() with no argument already ignores leading and trailing
        # whitespace, so no separate strip() is needed. extend() appends in
        # place instead of copying the whole list for every line.
        wordsL.extend(line.lower().split())
    return wordsL

# Count the frequency of every word and return the result sorted by
# frequency, from the most frequent word to the least frequent one.
def count(wordsL):
    """Count word occurrences, skipping the words rejected by ``Judge``.

    Args:
        wordsL: An iterable of words (hashable values).

    Returns:
        A list of ``(word, occurrences)`` tuples sorted by occurrences in
        descending order. Words with equal counts keep the order in which
        they were first seen.
    """
    wordsD = {}
    for x in wordsL:
        # Skip the words we do not want to count.
        if Judge(x):
            continue
        # Start from 0 for a new word so its first occurrence counts as 1.
        # (Initialising to 1 and then incrementing unconditionally made every
        # count one too high.)
        wordsD[x] = wordsD.get(x, 0) + 1
    # Sort the (word, count) pairs by count, largest first.
    wordsInorder = sorted(wordsD.items(), key=lambda x: x[1], reverse=True)
    return wordsInorder

# Decide whether a word should be left out of the count, e.g. because it is
# a punctuation mark or a single letter.
# Extend this function to filter out more words, such as numbers.
def Judge(word):
    """Return True when *word* is on one of the two exclusion lists."""
    # Punctuation and whitespace tokens that must not be counted.
    excluded_punctuation = (' ', '\t', '\n', ',', '.', ':', '?')
    # Single letters that must not be counted.
    excluded_letters = ('a', 'b', 'c', 'd', 'm', 'n', 'x', 'p', 't')
    return word in excluded_punctuation or word in excluded_letters

if __name__ == '__main__':
    # Process e1.txt .. e5.txt and write one word-frequency table per file
    # into the ./results directory.
    for x in range(1, 6):
        filename = 'e' + str(x) + '.txt'
        # Remove the characters we do not need.
        L = dealed_list(filename)
        wordsL = readlist(L)
        words = count(wordsL)
        # The context manager closes the output file even if a write fails.
        with open('./results/words_e' + str(x) + '.txt', 'w') as fw:
            # One "<word>\t<count>" line per word.
            for item in words:
                fw.write(item[0] + '\t' + str(item[1]) + '\n')

參考博文：
用Python實現針對英文論文的詞頻分析
 Python正則表達式做文本預處理，去掉特殊符號

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

文本清洗+python+正則表達式+詞頻統計

文本清洗+python+正則表達式+詞頻統計

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

文本清洗+python+正則表達式+詞頻統計

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av