介紹
RNN(Recurrent Neural Network, 循環(huán)神經(jīng)網(wǎng)絡(luò)),自然語言處理常用的一種神經(jīng)網(wǎng)絡(luò)類型。因為它的輸入和輸出(通常為時間序列)是可變長的,詳細(xì)介紹參考:https://blog.csdn.net/heyongluoyao8/article/details/48636251
準(zhǔn)備
數(shù)據(jù)集
全唐詩(43030首):鏈接: https://pan.baidu.com/s/10rcjAVmrPJwEWF0blglldQ
提取碼: 666g
參考代碼
自動生成英文詩歌:https://github.com/karpathy/char-rnn
博客:http://blog.topspeedsnail.com/archives/10542
代碼部分
數(shù)據(jù)預(yù)處理
import collections
ORIGIN_DATA = 'data/poetry.txt'  # path to the raw poem corpus (one "title:content" per line)
OUTPUT_DATA = 'data/o_poetry.txt'  # output path for the id-sequence file
VOCAB_DATA = 'data/poetry.vocab'  # output path for the vocabulary file (one character per line)
def word_to_id(word, id_dict):
    """Map a character to its vocabulary id.

    Falls back to the id of the '<unknow>' token when the character is
    not present in the vocabulary.
    """
    try:
        return id_dict[word]
    except KeyError:
        return id_dict['<unknow>']
poetrys = []  # collected poem bodies, each wrapped with start/end markers

# Read the raw poems; each line has the form "title:content".
with open(ORIGIN_DATA, 'r', encoding='utf-8') as f:
    f_lines = f.readlines()
print('唐詩總數(shù) : {}'.format(len(f_lines)))

# Process line by line.
for line in f_lines:
    # Strip leading/trailing whitespace.
    strip_line = line.strip()
    try:
        # Split the poem into title and content.
        title, content = strip_line.split(':')
    except ValueError:
        # Lines with zero or multiple ':' are discarded.
        # (Bug fix: the previous bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit and masked unrelated errors.)
        continue
    # Remove internal spaces from the content.
    content = content.strip().replace(' ', '')
    # Discard poems containing bracket/markup characters.
    if '(' in content or '(' in content or '<' in content or '《' in content or '_' in content or '[' in content:
        continue
    # Discard poems that are too short or too long.
    length = len(content)  # renamed from misspelled `lenth`
    if length < 20 or length > 100:
        continue
    # Wrap with start marker 's' and end marker 'e' for the RNN.
    poetrys.append('s' + content + 'e')
print('用于訓(xùn)練的唐詩數(shù) : {}'.format(len(poetrys)))
分割結(jié)果:
['[寒隨窮律變,春逐鳥聲開。初風(fēng)飄帶柳,晚雪間花梅。碧林青舊竹,綠沼翠新苔。芝田初雁去,綺樹巧鶯來。]', '[晚霞聊自怡,初晴彌可喜。日晃百花色,風(fēng)動千林翠。池魚躍不同,園鳥聲還異。寄言博通者,知予物外志。]', '[一朝春夏改,隔夜鳥花遷。陰陽深淺葉,曉夕重輕煙。哢鶯猶響殿,橫絲正網(wǎng)天。珮高蘭影接,綬細(xì)草紋連。碧鱗驚棹側(cè),玄燕舞檐前。何必汾陽處,始復(fù)有山泉。]']
# Sort poems by length so nearby poems have similar sequence lengths.
poetry_list = sorted(poetrys, key=lambda x: len(x))
words_list = []
# Collect every character appearing in the corpus.
for poetry in poetry_list:
    words_list.extend([word for word in poetry])
# Count character frequencies.
counter = collections.Counter(words_list)
# Order characters by descending frequency.
sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)
# Vocabulary: the unknown token followed by characters in frequency order.
words_list = ['<unknow>'] + [x[0] for x in sorted_words]
# Keep-all cap: fewer than ~7000 distinct characters, so nothing is cut.
words_list = words_list[:len(words_list)]
# Bug fix: report the vocabulary *size* — the old code printed the entire list.
print('詞匯表大小 : {}'.format(len(words_list)))
# Write the vocabulary, one character per line (line number == id).
with open(VOCAB_DATA, 'w', encoding='utf-8') as f:
    for word in words_list:
        f.write(word + '\n')
# Map each character to its id (index in the vocabulary).
word_id_dict = dict(zip(words_list, range(len(words_list))))
# Convert every poem into a sequence of string ids.
id_list = []
for poetry in poetry_list:
    id_list.append([str(word_to_id(word, word_id_dict)) for word in poetry])
# Write one space-separated id sequence per line.
with open(OUTPUT_DATA, 'w', encoding='utf-8') as f:
    for id_l in id_list:
        f.write(' '.join(id_l) + '\n')
RNN
import tensorflow as tf
import functools
VOCAB_SIZE = 6272  # vocabulary size
SHARE_EMD_WITH_SOFTMAX = True  # share parameters between the embedding and softmax layers
MAX_GRAD = 5.0  # gradient clipping norm, guards against exploding gradients
LEARN_RATE = 0.0005  # initial learning rate
LR_DECAY = 0.92  # learning-rate decay factor
LR_DECAY_STEP = 600  # number of steps between decay applications
BATCH_SIZE = 64  # batch size
CKPT_PATH = 'ckpt/model_ckpt'  # checkpoint save path
VOCAB_PATH = 'vocab/poetry.vocab'  # vocabulary file path
EMB_KEEP = 0.5  # embedding-layer dropout keep probability
RNN_KEEP = 0.5  # LSTM-layer dropout keep probability
HIDDEN_SIZE = 128  # number of LSTM hidden units
NUM_LAYERS = 2  # number of stacked RNN layers
def doublewrap(function):
    """Allow a decorator to be used both with and without arguments.

    ``@decorator`` and ``@decorator(scope=...)`` both work: when the sole
    positional argument is a callable (the bare form), the wrapped decorator
    is applied immediately; otherwise the arguments are captured and a
    one-shot decorator is returned.
    """
    # Fix: preserve the wrapped decorator's metadata (functools was imported
    # but never used — the upstream version of this pattern applies wraps here).
    @functools.wraps(function)
    def decorator(*args, **kwargs):
        if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
            # Bare use: @decorator — the decorated function is the only argument.
            return function(args[0])
        else:
            # Parameterized use: @decorator(...) — defer until the target arrives.
            return lambda wrapee: function(wrapee, *args, **kwargs)
    return decorator
@doublewrap
def define_scope(function, scope=None, *args, **kwargs):
    """Lazy-property decorator: build a graph section once, inside a scope.

    The decorated method is evaluated exactly once, inside a
    ``tf.variable_scope`` named after it (or *scope*); the result is cached
    on the instance and returned on every subsequent attribute access.

    Fix: the model classes access the decorated methods as attributes
    (``self.cell.zero_state``, ``model.loss`` fed to ``tf.gradients``),
    which requires property semantics — the ``@property`` /
    ``@functools.wraps`` / ``@doublewrap`` lines of the original pattern
    were restored here (they appear to have been lost in formatting;
    ``functools`` was imported but unused).

    :param function: graph-building method taking only ``self``
    :param scope: optional variable-scope name (defaults to the method name)
    :return: a read-only, caching property
    """
    attribute = '_cache_' + function.__name__
    name = scope or function.__name__

    @property
    @functools.wraps(function)
    def decorator(self):
        # Build on first access only; afterwards return the cached tensor/op.
        if not hasattr(self, attribute):
            with tf.variable_scope(name, *args, **kwargs):
                setattr(self, attribute, function(self))
        return getattr(self, attribute)

    return decorator
class TrainModel(object):
    """
    Training model for the character-level poetry RNN (TF1 graph mode).
    """

    def __init__(self, data, labels, emb_keep, rnn_keep):
        self.data = data  # input batch of word ids, placeholder [BATCH_SIZE, None]
        self.labels = labels  # target ids (inputs shifted one step)
        self.emb_keep = emb_keep  # embedding-layer dropout keep probability
        self.rnn_keep = rnn_keep  # LSTM-layer dropout keep probability
        # Touch each lazy property once so the whole graph is built eagerly.
        # NOTE(review): this pattern requires the methods below to be
        # decorated with @define_scope (property semantics) — the decorator
        # lines appear lost in this paste; without them `self.cell` is a bound
        # method and `self.cell.zero_state` in predict() fails. Confirm
        # against the upstream source.
        self.global_step
        self.cell
        self.predict
        self.loss
        self.optimize

    def cell(self):
        """
        Build the multi-layer LSTM cell with output dropout.
        :return: MultiRNNCell of NUM_LAYERS dropout-wrapped BasicLSTMCells
        """
        lstm_cell = [
            tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE), output_keep_prob=self.rnn_keep) for
            _ in range(NUM_LAYERS)]
        cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cell)
        return cell

    def predict(self):
        """
        Forward pass: embedding -> dropout -> dynamic RNN -> softmax logits.
        :return: logits of shape [batch * time, VOCAB_SIZE]
        """
        # Embedding matrix, shared with the softmax weights when configured.
        embedding = tf.get_variable('embedding', shape=[VOCAB_SIZE, HIDDEN_SIZE])
        # Softmax parameters: either tied (transposed embedding) or separate.
        if SHARE_EMD_WITH_SOFTMAX:
            softmax_weights = tf.transpose(embedding)
        else:
            softmax_weights = tf.get_variable('softmaweights', shape=[HIDDEN_SIZE, VOCAB_SIZE])
        softmax_bais = tf.get_variable('softmax_bais', shape=[VOCAB_SIZE])
        # Look up embeddings for the input ids.
        emb = tf.nn.embedding_lookup(embedding, self.data)
        # Dropout on the embedded inputs.
        emb_dropout = tf.nn.dropout(emb, self.emb_keep)
        # Run the RNN; the initial state is kept as an attribute so inference
        # code can feed a state back in.
        self.init_state = self.cell.zero_state(BATCH_SIZE, dtype=tf.float32)
        outputs, last_state = tf.nn.dynamic_rnn(self.cell, emb_dropout, scope='d_rnn', dtype=tf.float32,
                                                initial_state=self.init_state)
        # Flatten the time dimension so one matmul yields all logits.
        outputs = tf.reshape(outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(outputs, softmax_weights) + softmax_bais
        return logits

    def loss(self):
        """
        Average cross-entropy loss between logits and flattened targets.
        :return: scalar loss tensor
        """
        # Flatten targets to match the [batch * time] logits layout.
        outputs_target = tf.reshape(self.labels, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.predict, labels=outputs_target, )
        # Mean over all positions.
        cost = tf.reduce_mean(loss)
        return cost

    def global_step(self):
        """
        Non-trainable step counter, incremented by the optimizer.
        :return: global-step variable
        """
        global_step = tf.Variable(0, trainable=False)
        return global_step

    def optimize(self):
        """
        Backward pass: decayed learning rate, clipped gradients, Adam update.
        :return: the training op
        """
        # Exponential learning-rate decay driven by the global step.
        learn_rate = tf.train.exponential_decay(LEARN_RATE, self.global_step, LR_DECAY_STEP,
                                                LR_DECAY)
        # Clip gradients by global norm to prevent explosion.
        trainable_variables = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, trainable_variables), MAX_GRAD)
        # Apply the clipped gradients with Adam; also increments global_step.
        optimizer = tf.train.AdamOptimizer(learn_rate)
        train_op = optimizer.apply_gradients(zip(grads, trainable_variables), self.global_step)
        return train_op
class EvalModel(object):
    """
    Inference model: generates a poem one character at a time (batch size 1).
    """

    def __init__(self, data, emb_keep, rnn_keep):
        self.data = data  # input ids, placeholder [1, None]
        self.emb_keep = emb_keep  # embedding-layer dropout keep prob (fed 1.0 at inference)
        self.rnn_keep = rnn_keep  # LSTM-layer dropout keep prob (fed 1.0 at inference)
        # Touch each lazy property once to build the graph.
        # NOTE(review): requires the @define_scope property decorators, which
        # appear to have been lost in this paste — confirm against upstream.
        self.cell
        self.predict
        self.prob

    def cell(self):
        """
        Build the multi-layer LSTM cell (same structure as the training model).
        :return: MultiRNNCell of NUM_LAYERS dropout-wrapped BasicLSTMCells
        """
        lstm_cell = [
            tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE), output_keep_prob=self.rnn_keep) for
            _ in range(NUM_LAYERS)]
        cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cell)
        return cell

    def predict(self):
        """
        Forward pass for generation.

        Unlike the training model, the batch size is 1 and the final RNN
        state is stored (self.last_state) so characters can be generated one
        by one, feeding the state back in between steps.
        :return: logits over the vocabulary
        """
        embedding = tf.get_variable('embedding', shape=[VOCAB_SIZE, HIDDEN_SIZE])
        if SHARE_EMD_WITH_SOFTMAX:
            softmax_weights = tf.transpose(embedding)
        else:
            softmax_weights = tf.get_variable('softmaweights', shape=[HIDDEN_SIZE, VOCAB_SIZE])
        softmax_bais = tf.get_variable('softmax_bais', shape=[VOCAB_SIZE])
        emb = tf.nn.embedding_lookup(embedding, self.data)
        emb_dropout = tf.nn.dropout(emb, self.emb_keep)
        # Only one poem is generated at a time, hence batch_size = 1.
        self.init_state = self.cell.zero_state(1, dtype=tf.float32)
        outputs, last_state = tf.nn.dynamic_rnn(self.cell, emb_dropout, scope='d_rnn', dtype=tf.float32,
                                                initial_state=self.init_state)
        outputs = tf.reshape(outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(outputs, softmax_weights) + softmax_bais
        # Keep the final state so the next generation step can resume from it.
        self.last_state = last_state
        return logits

    def prob(self):
        """
        Softmax over the logits.
        :return: probability distribution over the vocabulary
        """
        probs = tf.nn.softmax(self.predict)
        return probs
訓(xùn)練
使用LSTM模型,直接一輪訓(xùn)練,50000次,耗時大約2小時訓(xùn)練完成。
import tensorflow as tf
from rnn_model import TrainModel
import org  # NOTE(review): project-local data module providing Dataset — confirm its origin

# Hyperparameters (duplicated from rnn_model — keep the two copies in sync).
SHARE_EMD_WITH_SOFTMAX = True  # share parameters between embedding and softmax layers
MAX_GRAD = 5.0  # gradient clipping norm, guards against exploding gradients
LEARN_RATE = 0.0005  # initial learning rate
LR_DECAY = 0.92  # learning-rate decay factor
LR_DECAY_STEP = 600  # number of steps between decay applications
BATCH_SIZE = 64  # batch size
CKPT_PATH = 'ckpt/model_ckpt'  # checkpoint save path
VOCAB_PATH = 'vocab/poetry.vocab'  # vocabulary file path
EMB_KEEP = 0.5  # embedding-layer dropout keep probability
RNN_KEEP = 0.5  # LSTM-layer dropout keep probability
HIDDEN_SIZE = 128  # number of LSTM hidden units
NUM_LAYERS = 2  # number of stacked RNN layers
TRAIN_TIMES = 30000  # total iteration count (not epoch-based)
SHOW_STEP = 1  # how often to print the loss
SAVE_STEP = 100  # how often to save a checkpoint

x_data = tf.placeholder(tf.int32, [BATCH_SIZE, None])  # input id batch
y_data = tf.placeholder(tf.int32, [BATCH_SIZE, None])  # target id batch
emb_keep = tf.placeholder(tf.float32)  # embedding-layer dropout keep probability
rnn_keep = tf.placeholder(tf.float32)  # LSTM-layer dropout keep probability

data = org.Dataset(BATCH_SIZE)  # create the dataset / batch provider
model = TrainModel(x_data, y_data, emb_keep, rnn_keep)  # build the training graph
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initialize all variables
    for step in range(TRAIN_TIMES):
        # Fetch the next training batch.
        x, y = data.next_batch()
        # One optimization step; also fetch the loss for logging.
        loss, _ = sess.run([model.loss, model.optimize],
                           {model.data: x, model.labels: y, model.emb_keep: EMB_KEEP,
                            model.rnn_keep: RNN_KEEP})
        if step % SHOW_STEP == 0:
            print('step {}, loss is {}'.format(step, loss))
        # Periodically checkpoint the model.
        if step % SAVE_STEP == 0:
            saver.save(sess, CKPT_PATH, global_step=model.global_step)
經(jīng)過50000次的迭代后,最終的loss值大概在4~5左右,這里忘記截圖了。
測試
import sys
import tensorflow as tf
import numpy as np
from rnn_model import EvalModel
import utils
import os

# Disable CUDA for evaluation so the GPU stays free for concurrent training.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

x_data = tf.placeholder(tf.int32, [1, None])  # one id sequence at a time
emb_keep = tf.placeholder(tf.float32)  # embedding-layer dropout keep probability
rnn_keep = tf.placeholder(tf.float32)  # LSTM-layer dropout keep probability

# Model used for generation.
model = EvalModel(x_data, emb_keep, rnn_keep)
saver = tf.train.Saver()
# word -> id mapping
word2id_dict = utils.read_word_to_id_dict()
# id -> word mapping
id2word_dict = utils.read_id_to_word_dict()
def generate_word(prob):
"""
選擇概率最高的前100個詞,并用輪盤賭法選取最終結(jié)果
:param prob: 概率向量
:return: 生成的詞
"""
prob = sorted(prob, reverse=True)[:100]
index = np.searchsorted(np.cumsum(prob), np.random.rand(1) * np.sum(prob))
return id2word_dict[int(index)]
# def generate_word(prob):
# """
# 從所有詞中,使用輪盤賭法選取最終結(jié)果
# :param prob: 概率向量
# :return: 生成的詞
# """
# index = int(np.searchsorted(np.cumsum(prob), np.random.rand(1) * np.sum(prob)))
# return id2word_dict[index]
MAX_POEM_LEN = 200  # hard cap on generated length, in case 'e' is never sampled


def generate_poem():
    """
    Generate one poem by sampling the model character by character.

    Starts from the begin-of-poem marker 's' and repeatedly feeds the sampled
    character plus the RNN state back into the model until the end marker 'e'
    is produced (or MAX_POEM_LEN characters have been emitted — robustness
    fix: the original loop could run forever if the model never sampled 'e').
    :return: None (prints the poem)
    """
    with tf.Session() as sess:
        # Restore the newest checkpoint.
        ckpt = tf.train.get_checkpoint_state('ckpt')
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Predict the first character from the start token.
        rnn_state = sess.run(model.cell.zero_state(1, tf.float32))
        x = np.array([[word2id_dict['s']]], np.int32)
        prob, rnn_state = sess.run([model.prob, model.last_state],
                                   {model.data: x, model.init_state: rnn_state, model.emb_keep: 1.0,
                                    model.rnn_keep: 1.0})
        word = generate_word(prob)
        poem = ''
        # Loop until the end marker 'e'; the RNN state carries the context
        # from one character to the next.
        while word != 'e' and len(poem) < MAX_POEM_LEN:
            poem += word
            x = np.array([[word2id_dict[word]]])
            prob, rnn_state = sess.run([model.prob, model.last_state],
                                       {model.data: x, model.init_state: rnn_state, model.emb_keep: 1.0,
                                        model.rnn_keep: 1.0})
            word = generate_word(prob)
        # Print the generated poem.
        print(poem)


if __name__ == '__main__':
    generate_poem()
結(jié)果:
江川重舌助清懸,風(fēng)起別蘇臨夜新。
江月吳籠罷白客,空夜山山許可悠。
-----------------------------
傷能題家節(jié),相態(tài)不今多。
斟軍笑不與,莫應(yīng)伴朝情。
-----------------------------
勞是孤商欲醉含,人相能處轉(zhuǎn)坐由。
瀑鶯共君全賞處,袁輪行上愛何心。
可以看出來,格式起碼是正確的。語法上還是存在一些問題,可以在對數(shù)據(jù)進(jìn)行預(yù)處理的時候,使用一些NLP方法(分詞、語法分析等)來進(jìn)行優(yōu)化。