1.簡(jiǎn)介
fasttext是facebook開(kāi)源的一個(gè)詞向量與文本分類工具,在2016年開(kāi)源,典型應(yīng)用場(chǎng)景是“帶監(jiān)督的文本分類問(wèn)題”。提供簡(jiǎn)單而高效的文本分類和表征學(xué)習(xí)的方法,性能比肩深度學(xué)習(xí)而且速度更快。
fastText結(jié)合了自然語(yǔ)言處理和機(jī)器學(xué)習(xí)中最成功的理念。這些包括了使用詞袋以及n-gram袋表征語(yǔ)句,還有使用子字(subword)信息,并通過(guò)隱藏表征在類別間共享信息。我們另外采用了一個(gè)softmax層級(jí)(利用了類別不均衡分布的優(yōu)勢(shì))來(lái)加速運(yùn)算過(guò)程。
2.訓(xùn)練實(shí)例
# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import warnings
import jieba
import re
import time
import fasttext
import random
from stop_words import stop_word
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
# Silence library deprecation chatter so console output stays readable.
warnings.filterwarnings('ignore')

# ---- Load the labelled corpus and build fastText training lines ----
# Each produced line has the form "__label__<label> , <space-joined tokens>".
# NOTE(review): the `encoding` kwarg of read_excel was removed in pandas 1.0 —
# confirm the pandas version before running.
data_content = pd.read_excel('語(yǔ)料.xlsx', index_col = None, encoding = 'utf-8')
contents = data_content['語(yǔ)料'].values
targets = data_content['敏感等級(jí)(1高度、2敏感、3不敏感)'].values
jieba.load_userdict("key_word.csv")  # custom dictionary so domain keywords are kept whole

# Compile the noise-stripping pattern once instead of on every row.
noise_pattern = re.compile("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]")

source = []
for content, target in zip(contents, targets):
    # Strip punctuation/boilerplate, collapse all whitespace, then segment.
    cleaned = noise_pattern.sub("", content)
    cleaned = ''.join(cleaned.split())
    words = jieba.lcut(cleaned, cut_all=False)
    # Drop stop words and any BOM residue left inside tokens.
    words = [w.strip().replace('\ufeff', '') for w in words if w not in stop_word]
    source.append("__label__" + str(target) + " , " + ' '.join(words))

# Hold out 10% for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(source, targets, test_size=0.1, random_state=33)
# Write one fastText-formatted example per line.  `with` guarantees both
# handles are flushed and closed (the original left them open, so the last
# buffered lines could be missing when training read the file).
with open('data/train_data.txt', 'w', encoding = 'utf-8') as train_text:
    for sentence in x_train:
        train_text.write(sentence + "\n")
with open('data/test_data.txt', 'w', encoding = 'utf-8') as test_text:
    for sentence in x_test:
        test_text.write(sentence + "\n")
# Train a supervised fastText classifier; labels in the training file carry
# the "__label__" prefix produced during preprocessing.
classifier = fasttext.supervised('data/train_data.txt', 'model/classifier.model', label_prefix='__label__')
# BUG FIX: this assignment was commented out in the original, so every use of
# `result` below raised NameError.  NOTE(review): testing on the training file
# only sanity-checks the fit; point this at 'data/test_data.txt' to measure
# generalization.
result = classifier.test('data/train_data.txt')
# Top-3 label probabilities for each held-out example.
labels = classifier.predict_proba('data/test_data.txt', k=3)
print ('輸出預(yù)測(cè)結(jié)果')
print (result)
print (labels)
print ('P@1:', result.precision)
print ('R@1:', result.recall)
print ('F@1:', result.f1score)
print ('Number of examples:', result.nexamples)
3.多進(jìn)程預(yù)測(cè)
# -*- coding: utf-8 -*-
from sklearn.externals import joblib
import pandas as pd
import numpy as np
import warnings
import jieba
import re
import time
import fasttext
import random
import pymysql as mydb
import threading,time
import queue
from multiprocessing import Process, Pool, freeze_support
from multiprocessing import cpu_count
from stop_words import stop_word
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
# Silence library deprecation chatter so console output stays readable.
warnings.filterwarnings('ignore')

# ---- Fetch posts (content) and replies from MySQL for prediction ----
# Connection parameters are redacted placeholders; fill them in before running.
# (When using this database, also select b.yes_no in the SQL query.)
db = mydb.connect(host='XXXXXX', port=XXXX, user='XXXXX', passwd='XXXXX', db='XXXX', charset='utf8')
# LEFT JOIN replies onto posts; empty/NULL text is normalised to 'empty'/'null'
# in SQL so the cleaning code below never receives None.
sql_cmd = "select a.tieba_name, a.post_url, case a.title when '' then 'empty' else a.title end title, case a.content when '' then 'empty' else a.content end content, a.floor, (case when b.reply IS NULL then 'null' when b.reply = '' then 'empty' else b.reply end) reply, from_unixtime(a.time, '%Y-%m-%d') time from s_content_tieba a left join s_huifu_tieba b on a.content_id = b.post_id where from_unixtime(a.time, '%Y-%m-%d') between '2018-11-26' and '2018-11-27'"
data_set = pd.read_sql(sql_cmd, db)
db.close()

lens = (len(data_set))
idx = list(range(lens))  # row indices handed out to the worker pool
contents = data_set['content'].values
replies = data_set['reply'].values
jieba.load_userdict("key_word.csv")  # same custom dictionary used at training time
pre_model = fasttext.load_model('model/classifier.model.bin', label_prefix='__label__')
print ('開(kāi)始預(yù)測(cè)')
# Noise-stripping patterns, compiled once per process.  The reply pattern
# additionally removes "回復(fù) <user>:"-style quoting prefixes.  Both patterns
# are copied verbatim from the training script so cleaning stays consistent.
_content_noise = re.compile("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]")
_reply_noise = re.compile("\|uid|Name|content|dtype|object|[\]\[\:\...\:\.\!\,\,\·\…\~\。\;\;\?\-\─\*\—\”\《\》]|[\/\?\?\、\~\】\【\(\)\)\__\____]|回復(fù).*?:|回復(fù).*?:|回復(fù)\s(\S+)")

def _label_text(text, noise_re):
    """Clean + segment one string and return its predicted label.

    Returns 'empty' when nothing survives noise removal and stop-word
    filtering (fastText cannot predict on an empty document).
    """
    cleaned = ''.join(noise_re.sub("", text).split())
    words = [w.strip().replace('\ufeff', '')
             for w in jieba.lcut(cleaned, cut_all=False)
             if w not in stop_word]
    if not words:
        return 'empty'
    # predict() takes a list of documents; [0][0] is the top label of the
    # first (only) document.
    return pre_model.predict([' '.join(words)])[0][0]

def consumer(i):
    """Pool worker: predict labels for row *i* of the fetched data.

    Reads the module-level `contents`/`replies` arrays and returns
    (content_label, reply_label).
    """
    print(i)  # progress marker, one line per processed row
    content_label = _label_text(contents[i], _content_noise)
    reply_label = _label_text(replies[i], _reply_noise)
    return content_label, reply_label
# ---- Fan the per-row prediction out across all CPU cores ----
# NOTE(review): creating the Pool at module level without an
# `if __name__ == "__main__":` guard breaks on Windows (spawn start method) —
# confirm this only runs on fork-based platforms.
b_time1 = time.time()
pool = Pool(cpu_count())
async_result = pool.map_async(consumer, idx)  # one (content_label, reply_label) per row
pool.close()
pool.join()
results = async_result.get()  # preserves input row order

data_set['content_label'] = [pair[0] for pair in results]
data_set['reply_label'] = [pair[1] for pair in results]
print (time.time() - b_time1)  # wall-clock seconds for the whole prediction pass

data_set['序號(hào)'] = list(range(len(data_set)))
# pivot_table with only index columns collapses duplicate rows.
data_mg1 = pd.pivot_table(data_set, index=['tieba_name', 'post_url', 'title', 'content', 'content_label', 'floor', 'reply', 'reply_label', 'time'])
data_mg1['序號(hào)'] = list(range(len(data_mg1)))
now_date = time.strftime('%Y%m%d',time.localtime(time.time()))
data_mg1.to_csv('data/匹配結(jié)果'+now_date+'.csv')
4.總結(jié)
fasttext非常簡(jiǎn)單易用,如果你想快速感受一下類深度學(xué)習(xí)的效果,可以嘗試一把。它可以完成無(wú)監(jiān)督的詞向量的學(xué)習(xí),學(xué)習(xí)出來(lái)詞向量,保持住詞和詞之間,相關(guān)詞之間是一個(gè)距離比較近的情況;
也可以用于有監(jiān)督學(xué)習(xí)的文本分類任務(wù)(如新聞文本分類、垃圾郵件分類、情感分析中的文本情感分析、電商中用戶評(píng)論的褒貶分析)。詳細(xì)原理及詞向量應(yīng)用可參考https://blog.csdn.net/john_bh/article/details/79268850