fasttext文本分類

1.簡(jiǎn)介

fasttext是facebook開(kāi)源的一個(gè)詞向量與文本分類工具,在2016年開(kāi)源,典型應(yīng)用場(chǎng)景是“帶監(jiān)督的文本分類問(wèn)題”。提供簡(jiǎn)單而高效的文本分類和表征學(xué)習(xí)的方法,性能比肩深度學(xué)習(xí)而且速度更快。
fastText結(jié)合了自然語(yǔ)言處理和機(jī)器學(xué)習(xí)中最成功的理念。這些包括了使用詞袋以及n-gram袋表征語(yǔ)句,還有使用子字(subword)信息,并通過(guò)隱藏表征在類別間共享信息。我們另外采用了層級(jí)softmax(hierarchical softmax,利用了類別不均衡分布的優(yōu)勢(shì))來(lái)加速運(yùn)算過(guò)程。

2.訓(xùn)練實(shí)例

# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import warnings
import jieba
import re
import time
import fasttext
import random
from stop_words import stop_word
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')

# Load the labelled corpus: one text column plus a sensitivity label
# (1 = highly sensitive, 2 = sensitive, 3 = not sensitive).
data_content = pd.read_excel('語(yǔ)料.xlsx', index_col = None, encoding = 'utf-8')
contents = data_content['語(yǔ)料'].values
targets = data_content['敏感等級(jí)(1高度、2敏感、3不敏感)'].values
# Domain keyword dictionary so jieba keeps these terms as single tokens.
jieba.load_userdict("key_word.csv")

# Build fastText training lines of the form "__label__<target> , w1 w2 ...".
source = []
for content, target in zip(contents, targets):
    # Strip pandas-repr noise (uid/Name/dtype/...) and CJK/ASCII punctuation.
    content_string = re.sub("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]", "", content)
    # Remove all remaining whitespace before word segmentation.
    content_cut = ''.join(content_string.split())
    content_seglist = jieba.lcut(content_cut, cut_all=False)
    # Drop stop words; strip any BOM residue from individual tokens.
    content_seglist = [word.strip().replace('\ufeff', '') for word in content_seglist if word not in stop_word]
    source.append("__label__" + str(target) + " , " + ' '.join(content_seglist))

x_train, x_test, y_train, y_test = train_test_split(source, targets, test_size = 0.1, random_state=33)

# Use context managers so both files are flushed and closed before fastText
# reads them (the original left the handles open, risking a truncated file).
with open('data/train_data.txt', 'w', encoding = 'utf-8') as train_text:
    for sentence in x_train:
        train_text.write(sentence + "\n")
with open('data/test_data.txt', 'w', encoding = 'utf-8') as test_text:
    for sentence in x_test:
        test_text.write(sentence + "\n")

classifier = fasttext.supervised('data/train_data.txt', 'model/classifier.model', label_prefix='__label__')
# BUG FIX: `result` was printed below, but the call defining it was commented
# out, so the script crashed with NameError. Evaluate on the held-out test
# split (the commented-out original evaluated on the training file, which
# overstates precision/recall).
result = classifier.test('data/test_data.txt')
labels = classifier.predict_proba('data/test_data.txt', k=3)
print ('輸出預(yù)測(cè)結(jié)果')
print (result)
print (labels)
print ('P@1:', result.precision)
print ('R@1:', result.recall)
print ('F@1:', result.f1score)
print ('Number of examples:', result.nexamples)

3.多進(jìn)程預(yù)測(cè)

# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import warnings
import jieba
import re
import time
import fasttext
import random
import pymysql as mydb
import threading,time
import queue
from  multiprocessing import Process, Pool, freeze_support
from multiprocessing import cpu_count
from stop_words import stop_word
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')

# Fetch one day's worth of posts joined with their replies. The SQL CASE
# expressions coalesce NULL/empty text columns to the literals 'null'/'empty'
# so every row hands a plain string to the cleaning step downstream.
# NOTE: to use this database the SQL query also needs b.yes_no added.
db = mydb.connect(host='XXXXXX', port=XXXX, user='XXXXX', passwd='XXXXX', db='XXXX', charset='utf8')
sql_cmd = "select a.tieba_name, a.post_url, case a.title when '' then 'empty' else a.title end title, case a.content when '' then 'empty' else a.content end content, a.floor,  (case  when b.reply IS NULL then 'null' when b.reply = '' then 'empty' else b.reply end) reply, from_unixtime(a.time, '%Y-%m-%d') time from s_content_tieba a left join s_huifu_tieba b on  a.content_id = b.post_id where from_unixtime(a.time, '%Y-%m-%d') between '2018-11-26' and '2018-11-27'"
data_set = pd.read_sql(sql_cmd, db)
db.close()

lens = len(data_set)        # number of rows to classify
idx = list(range(lens))     # row indices handed out to the worker pool
contents = data_set['content'].values
replies = data_set['reply'].values

# Domain keyword dictionary so jieba keeps these terms as single tokens.
jieba.load_userdict("key_word.csv")
# Trained classifier produced by the training script above.
pre_model = fasttext.load_model('model/classifier.model.bin',  label_prefix='__label__')
print ('開(kāi)始預(yù)測(cè)')
# Pattern stripping pandas-repr noise (uid/Name/dtype/...) plus CJK/ASCII
# punctuation; shared by post bodies and replies.
_CONTENT_PATTERN = "\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]"
# Replies additionally carry "回復(fù) xxx:" quote headers that must be stripped.
_REPLY_PATTERN = _CONTENT_PATTERN + "|回復(fù).*?:|回復(fù).*?:|回復(fù)\s(\S+)"

def _label_text(text, pattern):
    """Clean one text snippet and return its predicted fastText label.

    Pipeline: regex-strip noise -> drop whitespace -> jieba segmentation ->
    stop-word removal. Returns the string 'empty' when nothing survives
    cleaning, otherwise the top predicted label from the loaded model.
    """
    stripped = re.sub(pattern, "", text)
    compact = ''.join(stripped.split())
    words = [w.strip().replace('\ufeff', '') for w in jieba.lcut(compact, cut_all=False) if w not in stop_word]
    if not words:
        return 'empty'
    result_pre = pre_model.predict([' '.join(words)])
    return result_pre[0][0]

def consumer(i):
    """Pool worker: classify row i's post content and its reply.

    Reads the module-level `contents`/`replies` arrays; returns a
    (content_label, reply_label) tuple. The original duplicated the whole
    clean-segment-predict pipeline for content and reply; both now share
    _label_text, differing only in the regex pattern.
    """
    print(i)  # progress indicator per row
    content_labels = _label_text(contents[i], _CONTENT_PATTERN)
    reply_labels = _label_text(replies[i], _REPLY_PATTERN)
    return content_labels, reply_labels
# Fan the per-row classification out across all CPU cores.
# NOTE(review): Pool is created at module top level without an
# `if __name__ == "__main__"` guard — fine on fork-based platforms,
# breaks on Windows spawn; confirm the deployment target.
start_ts = time.time()
pool = Pool(cpu_count())
async_result = pool.map_async(consumer, idx)
pool.close()
pool.join()

# map_async preserves input order: one (content_label, reply_label) per row.
pairs = async_result.get()
data_set['content_label'] = [pair[0] for pair in pairs]
data_set['reply_label'] = [pair[1] for pair in pairs]
print (time.time() - start_ts)

# Collapse duplicate rows: an index-only pivot groups by the listed columns
# (numeric columns, here only 序號(hào), are mean-aggregated), then renumber and
# dump to a dated CSV.
data_set['序號(hào)'] = list(range(len(data_set)))
data_mg1 = pd.pivot_table(data_set, index=['tieba_name', 'post_url', 'title', 'content', 'content_label', 'floor', 'reply', 'reply_label', 'time'])
data_mg1['序號(hào)'] = list(range(len(data_mg1)))
now_date = time.strftime('%Y%m%d',time.localtime(time.time()))
data_mg1.to_csv('data/匹配結(jié)果'+now_date+'.csv')

4.總結(jié)

fasttext非常簡(jiǎn)單易用,如果你想快速感受一下類深度學(xué)習(xí)的效果,可以嘗試一把。它可以完成無(wú)監(jiān)督的詞向量的學(xué)習(xí),學(xué)習(xí)出來(lái)詞向量,保持住詞和詞之間,相關(guān)詞之間是一個(gè)距離比較近的情況;
也可以用于有監(jiān)督學(xué)習(xí)的文本分類任務(wù)(如新聞文本分類、垃圾郵件分類、情感分析中的文本情感分析、電商中用戶評(píng)論的褒貶分析)。詳細(xì)原理及詞向量應(yīng)用可參考https://blog.csdn.net/john_bh/article/details/79268850

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容