Chatbot------用PyTorch做對話機器人
引用
[TOC]
導入頭文件
使用了from __future__ import xxx可以在python2,python3環(huán)境下運行同一份代碼而不出錯,編寫的時候使用python3規(guī)范即可。
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
進行數(shù)據(jù)處理
數(shù)據(jù)正規(guī)化
首先將句子中的單詞都轉(zhuǎn)換成小寫并去除兩邊的空白格,接著將句子的unicode編碼轉(zhuǎn)換成Ascii編碼,便于使用正則表達式對文本進行正規(guī)化,再接著為了便于分詞,我們將無效字符轉(zhuǎn)換成我們能處理的字符。s = re.sub(r"([.!?])", r" \1", s)的意思是在句號、感嘆號、疑問號前面加上一個空格,\1的意思是將之前()里面匹配到的內(nèi)容原封不動復制一下,所以原來匹配到的是句號那么替換后還是句號,原來匹配到的是感嘆號替換后還是感嘆號。s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)的意思是將不是大小寫字母、句號、疑問號、感嘆號的無效字符全部用空格替換,注意有個+號,作用是將連在一起的多個無效字符只替換一次,所以加號一定不要忘了。s = re.sub(r"\s+", r" ", s).strip()的意思是將所有的空白符替換成空格,如果多個空白符連接在一起那么只替換一次。
def unicodeToAscii(s):
    """Strip accents: decompose to NFD and drop all combining marks (Mn)."""
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalize a sentence for tokenization.

    Lowercases and trims, strips accents, pads each of . ! ? with a
    leading space, replaces every other non-letter run with a single
    space, and collapses remaining whitespace.
    """
    s = unicodeToAscii(s.lower().strip())
    # \1 re-emits the matched punctuation with a space in front of it.
    s = re.sub(r"([.!?])", r" \1", s)
    # Any run of characters that are not letters or . ! ? becomes one space.
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Collapse whitespace runs and trim the ends.
    s = re.sub(r"\s+", r" ", s).strip()
    return s
讀取數(shù)據(jù)
lines = open(datafile, encoding='utf-8').read().strip().split('\n'),datafile是一個query/response中間用\t分隔,
# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    """Read tab-separated query/response lines from datafile.

    Returns (voc, pairs): a fresh Voc named corpus_name and a list of
    [query, response] sentence lists, each normalized by normalizeString.
    """
    print("Reading lines...")
    # FIX: use a context manager so the file handle is closed
    # deterministically (the original left it open until GC).
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs
這段代碼寫得很有靈性:判斷pairs里面的句子長度是不是超過了最大長度,如果超過了最大長度那么就丟掉。
def filterPair(p):
    """True when both sentences of the pair are short enough to keep."""
    # Input sequences need to preserve the last word for the EOS token,
    # hence the strict '<' against MAX_LENGTH. Check the query first so
    # the response is only inspected when necessary (short-circuit).
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    """Drop every pair rejected by filterPair."""
    return [p for p in pairs if filterPair(p)]
讀取并整合數(shù)據(jù),voc就是一個大的單詞對應表,里面有word2index,有word2count,有index2word
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    """Build the vocabulary and the filtered sentence pairs for training.

    Reads the corpus, filters over-long pairs, then registers every
    sentence of the surviving pairs in the vocabulary.
    """
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    # Register query and response of every kept pair in the vocab.
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs
# Load/Assemble voc and pairs
# NOTE(review): `corpus`, `corpus_name` and `datafile` must be defined
# earlier in the full script — they are not visible in this excerpt.
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)
根據(jù)單詞表對句子進行裁剪,首先先把單詞表中詞頻小于MIN_COUNT的單詞全部丟棄,然后對于句子判斷如果句子中有詞頻過小的單詞,那么整個句子也不用保留。
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Drop rare words from the vocab, then drop every pair that still
    contains a word no longer in the vocab.

    Returns the filtered list of pairs (the input list is not mutated).
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)

    def _all_known(sentence):
        # A sentence survives only if every word is still in the vocab.
        return all(word in voc.word2index for word in sentence.split(' '))

    # Keep a pair only when both its input and output sentences survive.
    keep_pairs = [pair for pair in pairs
                  if _all_known(pair[0]) and _all_known(pair[1])]
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs
# Trim voc and pairs
# Rebinds the module-level `pairs` to the filtered list (sentences with
# rare words removed).
pairs = trimRareWords(voc, pairs, MIN_COUNT)
將句子轉(zhuǎn)換成Tensor
預處理的最后一步。在訓練翻譯模型的時候我們都是一個句子一個句子進行訓練的,但是為了加速訓練,最好還是使用Mini-batch進行訓練;如果想要使用Mini-batch進行訓練,那么就要讓一個batch里面的句子長度一樣長,如果一個句子太短了,那么就在EOS_token后面進行0填充,這樣構(gòu)建出來的Mini-batch的形狀是batch_size*max_length。為了編寫代碼方便,我們需要batch[0]指示所有句子的第一個單詞(總共有batch_size個),所以我們還需要把這個Mini-batch的矩陣轉(zhuǎn)置一下。
- 將所有的單詞轉(zhuǎn)換成
index,然后加上表示結(jié)束的index---EOS_token。
def indexesFromSentence(voc, sentence):
    """Map each word of the sentence to its vocab index, ending with EOS.

    Raises KeyError for words not in the vocab (handled by callers).
    """
    indexes = [voc.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes
2.itertools.zip_longest函數(shù)第一個參數(shù)是聚合在一起的元素(l是一個二維矩陣,*l(解壓操作)是去除最外面的中括號,所以就變成了一堆聚合在一起的元素了),函數(shù)第二個參數(shù)是要填充的字符值,函數(shù)功能是對長度不一樣的聚合在一起的元素,使用fillvalue進行填充,填充成最大元素的長度。
itertools.zip_longest:Make an iterator that aggregates elements from each of the iterables. If the iterables are of uneven length, missing values are filled-in with fillvalue. Iteration continues until the longest iterable is exhausted.
def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists into time-major order, padding
    short sequences with fillvalue up to the longest sequence length."""
    time_major_rows = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(time_major_rows)
- 制作掩碼矩陣,填充的部分設置為0,非填充的部分設置為1。
def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask for a padded, time-major batch of token indexes.

    Entries equal to `value` (the padding token) become 0; every other
    entry becomes 1.

    BUG FIX: the original compared each token against the global
    PAD_token, silently ignoring the `value` parameter — a caller
    passing a custom pad value got a wrong mask. The comparison now
    honors `value` (default unchanged, so existing callers behave the
    same).
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]
-
inputVar和outputVar是將一個batch的數(shù)據(jù)進行word和tensor的相互轉(zhuǎn)換。inputVar,第一句是先將sentence轉(zhuǎn)換成index矩陣,第二句是將創(chuàng)建和index矩陣同樣大小的Tensor矩陣,torch.tensor([a,b,c,d])創(chuàng)建a * b * c * d大小的Tensor矩陣。接下來兩句是對index矩陣進行填充,然后成成對應的Tensor矩陣,torch.LongTensor(list)創(chuàng)建一個和list同樣形狀的Tensor矩陣,batch2TrainData就是將,隨機采樣到的小批數(shù)據(jù),生成訓練中需要的tensor數(shù)據(jù)。
# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Tensorize a list of input sentences.

    Returns (padVar, lengths): a time-major LongTensor of padded word
    indexes and a tensor holding each sentence's unpadded length.
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(seq) for seq in indexes_batch])
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths
# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Tensorize a list of target sentences.

    Returns (padVar, mask, max_target_len): the padded time-major index
    tensor, a boolean mask that is True at real tokens and False at
    padding, and the longest target length (number of decoder steps).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    # FIX: BoolTensor instead of ByteTensor — uint8 masks for indexing /
    # masked_select are deprecated in modern PyTorch; bool is the
    # supported mask dtype and behaves identically here.
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len
# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of [query, response] pairs into training tensors.

    Sorts by query length, descending, as required by
    pack_padded_sequence. NOTE: sorts pair_batch in place.
    """
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in pair_batch]
    output_batch = [pair[1] for pair in pair_batch]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len
# Example for validation: build one small random batch and inspect
# the tensors it produces.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)
構(gòu)建模型
Encoder
nn.GRU(input_size,hidden_size,n_layers,...),第一個參數(shù)輸入vector的長度,第二個根據(jù)GRU的計算公式,是輸出和隱藏vector的長度(因為是上一個time_step的輸出和這個階段的h共同組合成這個階段的輸出),n_layers就是需要堆疊的GRU的層數(shù),如果大于1的話,就是將n層GRU摞起來,nn.Embedding(voc_size,dimension,..),第一個參數(shù)是表示需要embedding的單詞的數(shù)目,dimension是embedding之后每一個單詞的vector的長度,torch.nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=False),將一個填充過的tensor打包起來,input的形狀是max_length * batch_size,如果batch_first=True則更換兩者的順序,并且輸入是一個已經(jīng)排好序(降序)的tensor矩陣,長的在前,短的在后,lengths是batch中輸入序列每個元素的長度的一個列表,這一函數(shù)的作用就是打包,進行批處理(將多個Tensor結(jié)合在一起,而且排序也是很有必要的,能和原來的元素一一對應起來),torch.nn.utils.rnn.pad_packed_sequence,是上個函數(shù)的逆操作,相當于解壓操作,接下來一句,因為我們使用的是雙向的GRU所以會有兩倍的輸出,我們將其相加起來組成最后的輸出。
Packs a Tensor containing padded sequences of variable length.Input can be of size T x B x * where T is the length of the longest sequence (equal to lengths[0]), B is the batch size, and * is any number of dimensions (including 0). If batch_first is True B x T x * inputs are expected.The sequences should be sorted by length in a decreasing order, i.e. input[:,0] should be the longest sequence, and input[:,B-1] the shortest one.This function accepts any input that has at least two dimensions. You can apply it to pack the labels, and use the output of the RNN with them to compute the loss directly.
class EncoderRNN(nn.Module):
def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
super(EncoderRNN, self).__init__()
self.n_layers = n_layers
self.hidden_size = hidden_size
self.embedding = embedding
# Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
# because our input size is a word embedding with number of features == hidden_size
self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
def forward(self, input_seq, input_lengths, hidden=None):
# Convert word indexes to embeddings
embedded = self.embedding(input_seq)
# Pack padded batch of sequences for RNN module
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
# Forward pass through GRU
outputs, hidden = self.gru(packed, hidden)
# Unpack padding
outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
# Sum bidirectional GRU outputs
outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
# Return output and final hidden state
return outputs, hidden

Attention
完成了一個關(guān)于注意力機制的類,這樣就不用每個模型都寫一個attention了,拿來即用,W是一個矩陣變換,在函數(shù)中相當于torch.nn.Linear,就是實現(xiàn)這個W轉(zhuǎn)換,所以在__init__里面的首先要初始化這個能進行W變換的submodule,torch.nn.Linear(in_features, out_features, bias=True),以general為例,in_features是h_hat_s的長度,out_features就是經(jīng)過線性變換以后特征的維度,(hidden.expand(encoder_output.size(0), -1, -1),作用是將某一個維度是1的維擴大到指定的維度,這個例子中hidden 的形狀是1*n*m ,然后擴展到encoder_output.size(0)*n*m,-1表示不擴展這個維度,值得注意的是擴展維度并不會多耗費內(nèi)存,只是更改了視圖(view)把stride設置為0,torch.cat(A,B,n)表示在第n維上連接兩個tensor,torch.sum(input,dims,keepdim=False...)表示在維度(可以為一個列表)上進行加法,并使用torch.squeeze()對結(jié)果進行擠壓,如果keepdim=True則不進行擠壓,tensor.t()對tensor進行轉(zhuǎn)置,這個函數(shù)只能對tensor是二維張量,將矩陣進行轉(zhuǎn)置,
that have a very special property when used with Module s - when they’re assigned as Module attributes they are automatically added to the list of its parameters, and will appear e.g. in parameters() iterator. Assigning a Tensor doesn’t have such effect.
@三種attention方式|center
@tensor.expand()|center
# Luong attention layer
class Attn(torch.nn.Module):
def __init__(self, method, hidden_size):
super(Attn, self).__init__()
self.method = method
if self.method not in ['dot', 'general', 'concat']:
raise ValueError(self.method, "is not an appropriate attention method.")
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))
def dot_score(self, hidden, encoder_output):
return torch.sum(hidden * encoder_output, dim=2)
def general_score(self, hidden, encoder_output):
energy = self.attn(encoder_output)
return torch.sum(hidden * energy, dim=2)
def concat_score(self, hidden, encoder_output):
energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
return torch.sum(self.v * energy, dim=2)
def forward(self, hidden, encoder_outputs):
# Calculate the attention weights (energies) based on the given method
if self.method == 'general':
attn_energies = self.general_score(hidden, encoder_outputs)
elif self.method == 'concat':
attn_energies = self.concat_score(hidden, encoder_outputs)
elif self.method == 'dot':
attn_energies = self.dot_score(hidden, encoder_outputs)
# Transpose max_length and batch_size dimensions
attn_energies = attn_energies.t()
# Return the softmax normalized probability scores (with added dimension)
return F.softmax(attn_energies, dim=1).unsqueeze(1)
Decoder
nn.Dropout(),embedded = self.embedding_dropout(embedded)就是隨機歸0一些tensor,tensorA.bmm(tensorB)相當于torch.bmm(tensorA,tensorB)都是對兩個tensor進行批量相乘,注意第一維度是batch_size,tensor.squeeze(tensorA,dim=None),如果不指定維度那么會將張量里面的所有維度為1的全部消除[2,1,2,1,2]->[2,2,2],如果指定了dim=1,[2,1,2,1,2]->[2,2,1,2]。torch.tanh的目的是加入一些非線性變換,注意rnn_output, hidden = self.gru(embedded, last_hidden)這里得到的是一個時間步里面的結(jié)果,每次一個輸出和一個隱藏狀態(tài),做.transpose(0,1)也是因為訓練的時候batch_size在第一維現(xiàn)在要求是第零維。。
m = nn.Dropout(p=0.2)
input = torch.randn(20, 16)
output = m(input)
print(output)
class AttnDecoderRNN(nn.Module):
    """One-step Luong-attention decoder (runs one word per forward call)."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        # BUG FIX: the original called super(LuongAttnDecoderRNN, self) —
        # a name that does not exist in this file, raising NameError on
        # instantiation. The zero-argument Python 3 form is equivalent
        # and always correct.
        super().__init__()
        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one time step.

        input_step: (1, batch) current input word indexes.
        last_hidden: previous GRU hidden state.
        encoder_outputs: time-major (max_length, batch, hidden).
        Returns (softmaxed vocab distribution of shape (batch, output_size),
        new hidden state).
        """
        # Note: we run this one step (word) at a time
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Attention weights over encoder outputs, then the weighted-sum
        # context vector: (batch, 1, T) bmm (batch, T, hidden).
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word (Luong eq. 6)
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden
LossFunction
torch.gather(input, dim, index, out=None),這個函數(shù)的作用是將input按照dim所指定的維度,按照index中的次序進行收集,輸出的尺寸和index是一樣的,比如下面的例子dim=1指示對input的第1維進行收集,且有2個第一維,所以index的尺寸是2*n,此處n=2,指示了每個第一維元素應該如何收集([1,2]按照[0,0]收集,[3,4]按照[0,1]收集),input和index的維度,只能是dim所指示的哪一個維度可以不一樣,其他的必須完全一樣。tensor.masked_select(mask).mean(),torch.masked_select(input, mask, out=None) → Tensor,將input按照mask的中為1的元素的順序收集成一個一維的tensor,mask不是必須要和input同一緯度,但是必須是可以broadcastable。
If input is an n-dimensional tensor with size (x0,x1...,xi?1,xi,xi+1,...,xn?1) and dim =i, then index must be an n-dimensional tensor with size (x0,x1,...,xi?1,y,xi+1,...,xn?1) where y≥1 and out will have the same size as index.
Returns a new 1-D tensor which indexes the input tensor according to the binary mask mask which is a ByteTensor.The shapes of the mask tensor and the input tensor don’t need to match, but they must be broadcastable.
>>> t = torch.tensor([[1,2],[3,4]])
>>> torch.gather(t, 1, torch.tensor([[0,0],[1,0]]))
tensor([[ 1, 1],
[ 4, 3]])
>>> x = torch.randn(3, 4)
>>> x
tensor([[ 0.3552, -2.3825, -0.8297, 0.3477],
[-1.2035, 1.2252, 0.5002, 0.6248],
[ 0.1307, -2.0608, 0.1244, 2.0139]])
>>> mask = x.ge(0.5)
>>> mask
tensor([[ 0, 0, 0, 0],
[ 0, 1, 1, 1],
[ 0, 0, 0, 1]], dtype=torch.uint8)
>>> torch.masked_select(x, mask)
tensor([ 1.2252, 0.5002, 0.6248, 2.0139])
def maskNLLLoss(inp, target, mask):
    """Masked negative log-likelihood loss for one decoder time step.

    inp: (batch, vocab) softmax probabilities from the decoder.
    target: (batch,) gold token indexes.
    mask: (batch,) boolean mask — True for real tokens, False for padding.

    Returns (loss, nTotal): mean -log p(target) over the unmasked
    positions only, and the number of unmasked tokens.
    """
    nTotal = mask.sum()
    # BUG FIX: .squeeze(1) is essential. Without it crossEntropy keeps
    # shape (B, 1) and masked_select broadcasts it against the (B,) mask
    # to (B, B), which silently averages over padded positions as well —
    # the mask had no effect on the mean.
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    # Use the inputs' device instead of the module-level `device` global,
    # keeping the function self-contained (inputs are already moved to
    # the training device by the caller, so behavior is unchanged).
    loss = loss.to(inp.device)
    return loss, nTotal.item()
訓練
一個時間步的訓練
train指示一個Mini_batch的一個時間步的訓練。tensor.topk(n),返回tensor最大的n個數(shù)及其下標_, topi = decoder_output.topk(1),decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])此處的是一個batch,所以先取到topi[i]在top[i].item()取出元素。torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)進行梯度裁剪,防止梯度爆炸,第二個參數(shù)是max norm of the gradients,正規(guī)化以后梯度的最大值。
>>> x = torch.arange(1., 6.)
>>> x
tensor([ 1., 2., 3., 4., 5.])
>>> torch.topk(x, 3)
(tensor([ 5., 4., 3.]), tensor([ 4, 3, 2]))
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimization step over a single mini-batch.

    input_variable / target_variable are time-major (max_len, batch)
    index tensors; mask flags real target tokens vs padding;
    max_target_len is the number of decoder time steps to run.

    Returns the average per-token loss for this batch.

    NOTE(review): relies on the module-level `device`, `SOS_token` and
    `teacher_forcing_ratio` globals; the `embedding` and `max_length`
    parameters are accepted but never used in this body.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Initialize variables
    loss = 0                 # accumulated graph loss used for backprop
    print_losses = []        # per-step reported losses, weighted by token count
    n_totals = 0             # total number of unmasked target tokens
    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    # Set initial decoder hidden state to the encoder's final hidden state
    # (keep only the first n_layers slices of the bidirectional stack).
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    # Perform backpropagation
    loss.backward()
    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses) / n_totals
訓練
training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_iteration)],首先把所有的訓練數(shù)據(jù)采樣下來。然后每個循環(huán)拿出一小批數(shù)據(jù)進行訓練。if loadFilename:start_iteration = checkpoint['iteration'] + 1很有靈性的寫法,雖然感覺用處不會很大,input_variable, lengths, target_variable, mask, max_target_len = training_batch,從training_batch中取出各個字段。保存數(shù)據(jù)這一段寫的太經(jīng)典了。
# NOTE(review): this snippet is an excerpt of the checkpoint-saving code
# inside trainIters below, shown here for explanation; it is not runnable
# standalone (`iteration`, `save_every`, `model_name`, ... are
# trainIters locals and module globals).
if (iteration % save_every == 0):
    directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Bundle everything needed to resume training into one .tar file.
    torch.save({
        'iteration': iteration,
        'en': encoder.state_dict(),
        'de': decoder.state_dict(),
        'en_opt': encoder_optimizer.state_dict(),
        'de_opt': decoder_optimizer.state_dict(),
        'loss': loss,
        'voc_dict': voc.__dict__,
        'embedding': embedding.state_dict()
    }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Outer training loop: pre-sample n_iteration random batches, run
    train() on each, report the running loss, and save checkpoints.

    NOTE(review): relies on the module-level `checkpoint` (when
    resuming) and `hidden_size` globals — neither is a parameter.
    """
    # Load batches for each iteration (sampled up-front, with replacement).
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iteration)]
    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        # Resume counting from the iteration stored in the loaded checkpoint.
        start_iteration = checkpoint['iteration'] + 1
    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss
        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0
        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
模型評估
貪婪搜索
decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token,*一個常數(shù)就是將tensor的每一個元素都乘以這個常數(shù),做成1*1的tensor可能就是因為說明這一批就只有一個元素,torch.max(input,dim),返回inputTensor中指定維度的最大值以及下標,如果不指定維度那么只返回最大的數(shù),這個最大值是遍歷所有元素值所得到的最大的元素,是總體的最大元素,所以input的維度可以是m*n,貪婪搜索decoder只用將所有的輸入序列進行一次encoder,但是對decoder要運行max_length次,比較有意思的是,all_tokens首先被初始化成起始符0,然后每次將最有可能的那個詞使用torch.cat連接起來,因為torch.max輸出的decoder_input是一維的,而輸入要求是二維的所以還要使用torch.unsqueeze進行擴維。
>>> a = torch.randn(4, 4)
>>> a
tensor([[-1.2360, -0.2942, -0.1222, 0.8475],
[ 1.1949, -1.1127, -2.2379, -0.6702],
[ 1.5717, -0.9207, 0.1297, -1.8768],
[-0.6172, 1.0036, -0.6060, -0.2432]])
>>> torch.max(a, 1)
(tensor([ 0.8475, 1.1949, 1.5717, 1.0036]), tensor([ 3, 0, 0, 1]))
>>> a = torch.randn(1, 3)
>>> a
tensor([[ 0.6763, 0.7445, -2.2369]])
>>> torch.max(a)
tensor(0.7445)
class GreedySearchDecoder(nn.Module):
    """Greedy decoding wrapper: encode the input once, then repeatedly
    feed the argmax word back into the decoder for max_length steps.

    Returns (all_tokens, all_scores): the chosen word indexes and their
    softmax probabilities, both 1-D tensors of length max_length.
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input
        # to the decoder.
        # BUG FIX: the original read `decoder.n_layers` from the
        # module-level global instead of `self.decoder`, so the wrapper
        # silently depended on an unrelated global being defined and in
        # sync with the decoder it was constructed with.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Re-add the (1, batch) dimension for the next decoder step
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores
評估函數(shù)
-
indexes_batch = [indexesFromSentence(voc, sentence)]得到句子的index下標,句子sentence是問答系統(tǒng)的一個句子,lengths的Tensor矩陣,input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)將batch_size放在了第二維,decoded_words = [voc.index2word[token.item()] for token in tokens],index2word是voc實例里面的一個屬性。 -
input(">")作用是先輸出一個>然后獲取直到回車的字符串并返回,然后對輸入的句子進行noramlizeString,加空格,去除非字母的符號,output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]很有靈性的一段代碼,去除結(jié)束符和填充符號,但是寫[:]好像沒用呀。print('Bot:', ' '.join(output_words))將列表中的元素連接成字符串。
"""
output_words[:]相當先把output_words的元素全部復制一遍然后存到以地址output_words[:]開始的地方。
嚴格的說,python沒有賦值,只有名字到對象的綁定。所以L1=L是把L所指的對象綁定到名字L1上,而L2=L[:]則是把L通過切片運算取得的新列表對象綁定到L2上。前者兩個名字指向同一個對象,后者兩個名字指向不同對象。換句話說,L1和L是指的同一個東西,那么修改L1也就修改了L;L2則是不同的東西,修改L2不會改變L。注意這個引用的概念對于所有的東西都成立,例如容器內(nèi)部存儲的都是引用。
"""
i=[1,2,3,4,5]
l=i[:]
i[2]=9
print(i)
print(l)
---------------------
[1, 2, 9, 4, 5]
[1, 2, 3, 4, 5]
---------------------
i=[1,2,3,4,5]
i[:]=[6,7,8]
print(i)
--------------------
[6, 7, 8]
[6, 7, 8]
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode one normalized input sentence and return the reply words.

    Raises KeyError (via indexesFromSentence) when the sentence contains
    a word that is not in the vocabulary.
    """
    ### Format the input sentence as a batch of size one: words -> indexes.
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Time-major layout, (max_length, batch), as the encoder expects.
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode the sentence with the searcher, then map indexes -> words.
    tokens, scores = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words
def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, print the bot reply, repeat
    until the user types 'q' or 'quit'."""
    while True:
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence in ('q', 'quit'):
                break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Strip EOS/PAD markers before printing the reply.
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))
        except KeyError:
            print("Error: Encountered unknown word.")
運行模型
-
__dict__,python中的類,都會從object里繼承一個__dict__屬性,這個屬性中存放著類的屬性和方法對應的鍵值對。一個類實例化之后,這個類的實例也具有這么一個__dict__屬性。但是二者并不相同。
In [26]: class A:
...: some = 1
...: def __init__(self,num):
...: self.num = num
...:
In [27]: a = A(10)
In [28]: print(a.__dict__)
{'num': 10}
In [30]: a.age = 10
In [31]: print(a.__dict__)
{'num': 10, 'age': 10}
- 加載模型,1.首先創(chuàng)建模型,2.使用
model.load_state_dict(embedding_sd)加載模型參數(shù)。
# NOTE(review): excerpt — this model-construction code is repeated in
# full (with prints) further below; shown here to illustrate the
# load_state_dict pattern.
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
# NOTE(review): LuongAttnDecoderRNN is referenced here but the class
# defined above is named AttnDecoderRNN — confirm the intended name.
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
encoder = encoder.to(device)
decoder = decoder.to(device)
# Configure models
model_name = 'cb_model'
attn_model = 'dot'     # attention scoring method: 'dot', 'general' or 'concat'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500      # GRU hidden size == embedding dimension
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))
# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    # Unpack the state dicts saved by trainIters' checkpoint code.
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary by overwriting the instance __dict__.
    voc.__dict__ = checkpoint['voc_dict']
print('Building encoder and decoder ...')
# Initialize word embeddings (shared between encoder and decoder)
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
# NOTE(review): LuongAttnDecoderRNN is referenced here but the class
# defined above is named AttnDecoderRNN — confirm the intended name.
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')
運行模型
# Configure training/optimization
clip = 50.0                   # gradient-norm clipping threshold
teacher_forcing_ratio = 1.0   # 1.0 = always feed the ground-truth token
learning_rate = 0.0001
decoder_learning_ratio = 5.0  # decoder LR = learning_rate * this factor
n_iteration = 4000
print_every = 1
save_every = 500
# Ensure dropout layers are in train mode
encoder.train()
decoder.train()
# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    # Restore optimizer state so Adam's moments continue from the checkpoint.
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)
# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)
運行模型,與模型進行交流
將最后一行的注釋去掉就可以與machine交流了。
# Set dropout layers to eval mode (disables dropout so replies are
# deterministic during chatting)
encoder.eval()
decoder.eval()
# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)
# Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)

