1. Code
"""Based on: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/"""
"""
CBOW mode infers the target word from the rest of the sentence, i.e. fill-in-the-blank;
SG (Skip-Gram) mode is the reverse: it derives the surrounding context from the target word.
In addition, randomly drawn noise words are used during training, known as Negative Sampling.
Loss function: Noise-Contrastive Estimation (NCE) loss
"""
# 1. Import the required libraries
import collections
import math
import os
import random
import zipfile
import numpy as np
import urllib.request  # import the submodule explicitly so urllib.request.urlretrieve works
import pprint
import tensorflow as tf
import matplotlib.pyplot as plt
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# 2. Prepare the dataset
url = "http://mattmahoney.net/dc/"
def maybe_download(filename, expected_bytes):
    """
    Download the dataset unless the file is already present locally.
    """
    if not os.path.exists(filename):
        # The dataset does not exist yet, start the download
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    # Verify the file size
    stateinfo = os.stat(filename)
    if stateinfo.st_size == expected_bytes:
        print("Dataset exists and the file size is correct!", filename)
    else:
        print(stateinfo.st_size)
        raise Exception(
            "Wrong file size! Please download again from: " + url
        )
    return filename
"""
Check that the file is in place
"""
filename = maybe_download("text8.zip", 31344016)
# 3. Unzip the file
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
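# Note: text8 is a single long line of lowercase, space-separated words,
# so split() directly yields the corpus as a list of words.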
words = read_data(filename)
print("Total number of words:", len(words))
# 4. Build the vocabulary and count each word's frequency, stored as a dictionary;
#    keep only the 50,000 most frequent words
vocabulary_size = 50000
def build_dataset(words):
    count = [["UNK", -1]]
    # collections.Counter(words).most_common(...) returns pairs such as [("the", 4), ("physics", 2)]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {}
    # Map every word to an id ordered by frequency. Only the top 50,000 words are kept;
    # anything else is treated as unknown with id 0, and we count how many such words occur.
    for word, _ in count:
        dictionary[word] = len(dictionary)
    # e.g. {"UNK": 0, "the": 1, "a": 12}
    data = []
    unk_count = 0  # counter for words outside the top 50,000
    for word in words:
        # For each word, first check whether it appears in the dictionary
        if word in dictionary:
            # If it is in the dictionary, replace it with its id
            index = dictionary[word]
        else:
            # Otherwise map it to id 0
            index = 0
            unk_count += 1
        data.append(index)  # the words are now replaced by their ids
    """
    print(data[:10])
    [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
    """
    count[0][1] = unk_count  # store the final unknown-word count in count
    # Invert the dictionary, e.g. {3: "the", 4: "an"}
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
data, count, dictionary, reverse_dictionary = build_dataset(words)
# Delete the raw word list to save memory
del words
# Show part of the result
#print("Most frequent words (including the unknown class):", count[:10])
# Print the id-encoded data: ids come from data, the corresponding words from the inverted dictionary
#print("Sample data (ranks):", data[:10], "\ncorresponding words", [reverse_dictionary[i] for i in data[:10]])
# 5. Generate Word2Vec training samples, using the skip-gram mode
data_index = 0
def generate_batch(batch_size, num_skips, skip_window):
    """
    :param batch_size: number of samples per training batch; must be an integer multiple of num_skips
    :param num_skips: number of samples generated per target word; must not exceed twice skip_window
    :param skip_window: farthest distance a word can be related over; 1 means each word only
                        considers its immediate neighbours. Also called the sliding-window size.
    :return: one batch of samples together with the corresponding labels
    """
    global data_index  # declared global, so the position in the corpus is kept across calls
    # Validate the arguments up front with assertions, to avoid hard-to-trace bugs later
    assert batch_size % num_skips == 0
    assert num_skips <= skip_window * 2
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)  # uninitialised int32 array of size batch_size, arbitrary contents
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # shape [batch_size, 1]
    span = 2 * skip_window + 1  # length of the window (the target word plus its context)
    buffer = collections.deque(maxlen=span)  # double-ended queue with maximum length span
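    # For intuition: a deque with maxlen discards from the left once it is full, e.g.
    #   d = collections.deque(maxlen=3); d.extend([1, 2, 3]); d.append(4)
    #   d is now deque([2, 3, 4], maxlen=3) -- this is how the window slides below.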
"""
print(batch,"\n",labels)
batch :[0 ,-805306368 ,405222565 ,1610614781 ,-2106392574 ,2721-2106373584 ,163793]
labels: [[ 0]
[-805306368]
[ 407791039]
[ 536872957]
[ 2]
[ 0]
[ 0]
[ 131072]]
"""
    # Fill the deque with its initial values
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    """
    print(buffer, "\n", data_index) prints:
    deque([5234, 3081, 12], maxlen=3)
    3
    """
    # Outer loop: i counts how many windows are pushed through the double-ended queue
    for i in range(batch_size // num_skips):
        target = skip_window  # the word at position skip_window in the buffer is the target
        targets_avoid = [skip_window]  # positions to avoid when sampling; we predict context
        # words only, not the target itself, so the list starts with position skip_window
        for j in range(num_skips):
            # Inner loop: each iteration generates one sample for one context word. Draw random
            # positions until one is found that is not in the avoid list, i.e. a usable context word.
            while target in targets_avoid:
                target = random.randint(0, span - 1)
            targets_avoid.append(target)  # this context word has been used, so avoid it from now on
            batch[i * num_skips + j] = buffer[skip_window]  # target word
            labels[i * num_skips + j, 0] = buffer[target]  # context word
        # The buffer is already full, so appending pushes out the oldest entry
        #print(batch, labels)
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
batch, labels = generate_batch(8, 2, 1)
"""
for i in range(8):
    print("target word:", reverse_dictionary[batch[i]], "id:", batch[i],
          " context word:", reverse_dictionary[labels[i, 0]], "id:", labels[i, 0])
Test output:
target word: originated  id: 3081  context word: as          id: 12
target word: originated  id: 3081  context word: anarchism   id: 5234
target word: as          id: 12    context word: originated  id: 3081
target word: as          id: 12    context word: a           id: 6
target word: a           id: 6     context word: as          id: 12
target word: a           id: 6     context word: term        id: 195
target word: term        id: 195   context word: of          id: 2
target word: term        id: 195   context word: a           id: 6
"""
# 6. Define the training parameters
batch_size = 128  # batch size for training
embedding_size = 128  # dimensionality of the dense word vectors
skip_window = 1  # farthest distance a word can be related over
num_skips = 1  # number of samples generated per target word
# 7. Define the validation parameters
valid_size = 16  # number of validation words
valid_window = 100  # validation words are drawn only from the 100 most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # draw at random
num_sampled = 64  # number of noise words used as negative samples during training
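# Note: np.random.choice(100, 16, replace=False) draws 16 distinct ids from [0, 100),
# so the validation set is 16 different words among the 100 most frequent ones.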
# 8. Define the network structure of the Skip-Gram Word2Vec model
# 8.1 Create a graph as the default computation graph, declare placeholders for the input
#     data and the labels, and store the random validation ids as a TensorFlow constant
graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32, [batch_size])
    train_labels = tf.placeholder(tf.int32, [batch_size, 1])
    valid_dataset = tf.constant(valid_examples, tf.int32)
    # Run on the CPU
    with tf.device("/cpu:0"):
        # vocabulary size 50000, vector dimension 128, floats sampled uniformly from (-1, 1)
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        # tf.nn.embedding_lookup() looks up the vectors embed that correspond to train_inputs
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # The optimisation objective is the NCE loss
        # Initialise the NCE weights with a truncated normal distribution, and the biases with zeros
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Compute the loss of the learned embeddings on this training batch and average it with tf.reduce_mean()
    loss = tf.reduce_mean(tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        labels=train_labels,
        inputs=embed,
        num_sampled=num_sampled,
        num_classes=vocabulary_size
    ))
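    # For intuition, a rough sketch of the objective (not the exact internals of tf.nn.nce_loss):
    # NCE replaces the full 50000-way softmax with a binary classifier that separates the true
    # context word from num_sampled randomly drawn noise words, per example roughly:
    #   loss ~ -log sigmoid(w_label . embed + b_label) - sum_k log sigmoid(-(w_noise_k . embed + b_noise_k))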
    # Use SGD as the optimiser with a learning rate of 1.0, then compute the L2 norm of the
    # embeddings and derive the normalised normalized_embeddings
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))  # L2 norm of the embedding vectors
    normalized_embeddings = embeddings / norm  # normalised embeddings
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)  # look up the embeddings of the validation words
    # Compute the similarity between the validation words and every word in the vocabulary
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True
    )
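    # Because every row has been divided by its L2 norm, this matrix product is exactly
    # cosine similarity; in plain numpy terms (illustration only):
    #   cos = (v / np.linalg.norm(v)) @ (u / np.linalg.norm(u))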
    init = tf.global_variables_initializer()  # variable initialisation op
# 9. Run the training
num_steps = 150001  # 150,000 training iterations
# Create a session and make it the default
with tf.Session(graph=graph) as session:
    init.run()  # run the variable initialisation
    print("Initialisation finished!")
    average_loss = 0  # accumulates the loss
    # Start the training iterations
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)  # generate one batch and its labels
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}  # data to feed in
        # Run the optimizer and the loss op with the feed data
        optimizer_trained, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val  # accumulate the NCE loss
        # For convenience, average and report the loss every 2000 steps
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Loss after {} iterations: {}".format(step, average_loss))
            average_loss = 0
        # Every 10000 iterations, compute the similarity between the validation words and all
        # words, and show the 8 words most similar to each validation word
        if step % 10000 == 0:
            sim = similarity.eval()  # evaluate the similarity matrix
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]  # the validation word itself
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # ids of the top_k most similar words; index 0 is the word itself, so skip it
                log_str = "Most similar to {}: ".format(str(valid_word))
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]  # a highly similar word
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
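# A small follow-up sketch (illustration only; "queen" is a hypothetical query word and is
# assumed to be in the vocabulary): since final_embeddings is row-normalised, nearest
# neighbours of any word can be queried with numpy alone:
#   q = final_embeddings[dictionary["queen"]]
#   nearest = (-final_embeddings @ q).argsort()[1:9]
#   print([reverse_dictionary[i] for i in nearest])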
# 10. Visualise the Word2Vec result
def plot_with_labels(low_dim_embs, labels, filename="tsne.png"):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embedding vectors!"
    plt.figure(figsize=(20, 20))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(
            label,
            xy=(x, y),
            xytext=(5, 2),
            textcoords="offset points",
            ha="right",
            va="bottom"
        )
    plt.savefig(filename)
from sklearn.manifold import TSNE
tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
plot_only = 100  # only plot the 100 most frequent words
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
Labels = [reverse_dictionary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, Labels)
"""
第142000輪迭代后的損失為:4.46674475479126
第144000輪迭代后的損失為:4.460033647537231
第146000輪迭代后的損失為:4.479593712329865
第148000輪迭代后的損失為:4.463101862192154
第150000輪迭代后的損失為:4.3655951328277585
與單詞 can 最相似的: may, will, would, could, should, must, might, cannot,
與單詞 were 最相似的: are, was, have, had, been, be, those, including,
與單詞 is 最相似的: was, has, are, callithrix, landesverband, cegep, contains, became,
與單詞 been 最相似的: be, become, were, was, acuity, already, banded, had,
與單詞 new 最相似的: repertory, rium, real, ursus, proclaiming, cegep, mesoplodon, bolster,
與單詞 their 最相似的: its, his, her, the, our, some, these, landesverband,
與單詞 when 最相似的: while, if, where, before, after, although, was, during,
與單詞 of 最相似的: vah, in, neutronic, widehat, abet, including, nine, cegep,
與單詞 first 最相似的: second, last, biggest, cardiomyopathy, next, cegep, third, burnt,
與單詞 other 最相似的: different, some, various, many, thames, including, several, bearings,
與單詞 its 最相似的: their, his, her, the, simplistic, dativus, landesverband, any,
與單詞 from 最相似的: into, through, within, in, akita, bde, during, lawless,
與單詞 would 最相似的: will, can, could, may, should, might, must, shall,
與單詞 people 最相似的: those, men, pisa, lep, arctocephalus, protectors, saguinus, builders,
與單詞 had 最相似的: has, have, was, were, having, ascribed, wrote, nitrile,
與單詞 all 最相似的: auditum, some, scratch, both, several, many, katydids, two,
"""
2. Word vector plots
[Figure: t-SNE visualization of the word vector space]