TF之LSTM:基于TensorFlow框架采用PTB數據集建立LSTM網絡的自然語言建模
關于PTB數據集
PTB(Penn Treebank Dataset)文本數據集是語言模型學習中目前使用最廣泛的數據集。
ptb.test.txt    # 測試集數據文件
ptb.train.txt   # 訓練集數據文件
ptb.valid.txt   # 驗證集數據文件
這三個數據文件中的數據已經經過了預處理,包含了10000個不同的詞語和語句結束標記符(在文本中就是換行符)以及標記稀有詞語的特殊符號<unk>。
為了讓使用PTB數據集更加方便,TensorFlow提供了兩個函數來幫助實現數據的預處理。首先,TensorFlow提供了ptb_raw_data函數來讀取PTB的原始數據,并將原始數據中的單詞轉化為單詞ID。
訓練數據中總共包含了929589個單詞,而這些單詞被組成了一個非常長的序列。這個序列通過特殊的標識符給出了每句話結束的位置。在這個數據集中,句子結束的標識符ID為2。
數據集的下載地址:TF的PTB數據集(別的數據集不匹配的話會出現錯誤)
?
代碼實現
本代碼使用2層LSTM網絡,且每層有200個隱藏單元。在訓練中截斷的輸入序列長度為32,且使用Dropout和梯度截斷等方法控制模型的過擬合與梯度爆炸等問題。當簡單地訓練3個Epoch后,測試困惑度(Perplexity)降低到了210,如果多輪訓練會更低。
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import sys
import tensorflow as tf
# True when running under Python 3; on Python 2 file contents are decoded by hand.
Py3 = sys.version_info[0] == 3


def _read_words(filename):
    """Return the whitespace-separated tokens stored in `filename`.

    Every newline character is replaced by the literal text "<eos>" before
    the content is split on whitespace. The replacement itself adds no
    whitespace, so "<eos>" only becomes a token of its own when the newline
    is already surrounded by whitespace in the file.

    Args:
      filename: path of the text file, opened through tf.gfile.GFile.

    Returns:
      A list of token strings.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()
    if not Py3:
        # Python 2 returns bytes here; convert to unicode before tokenizing.
        text = text.decode("utf-8")
    return text.replace("\n", "<eos>").split()
def _build_vocab(filename):
    """Build a word -> integer id mapping from the tokens in `filename`.

    Ids are assigned in order of decreasing frequency; words with equal
    frequency are ordered alphabetically so the mapping is deterministic.

    Args:
      filename: path of the text file to read with _read_words.

    Returns:
      A dict mapping each distinct token to its id (0 is the most frequent
      token). An empty dict is returned when the file contains no tokens.
    """
    counter = collections.Counter(_read_words(filename))
    # Primary key: higher count first; secondary key: the word itself.
    ranked = sorted(counter.items(), key=lambda item: (-item[1], item[0]))
    # enumerate() copes with an empty vocabulary, unlike unpacking zip(*pairs).
    return {word: index for index, (word, _) in enumerate(ranked)}
def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of `filename` into a list of integer ids.

    Args:
      filename: path of the text file to read with _read_words.
      word_to_id: mapping from token to integer id.

    Returns:
      A list with the id of every token, in file order. Tokens that are not
      keys of `word_to_id` are skipped.
    """
    ids = []
    for token in _read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids
def ptb_raw_data(data_path=None):
    """Load the PTB text files found in directory `data_path` as word ids.

    Reads ptb.train.txt, ptb.valid.txt and ptb.test.txt and converts every
    word to an integer id. The vocabulary is built from the training file
    only; words of the validation and test files that are missing from that
    vocabulary are dropped.

    The PTB dataset comes from Tomas Mikolov's webpage:
    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
      data_path: string path to the directory that contains the three
        ptb.*.txt files (the "data" directory of the extracted
        simple-examples.tgz).

    Returns:
      tuple (train_data, valid_data, test_data, vocabulary)
      where the first three items are lists of word ids that can be passed
      to ptb_producer, and vocabulary is the number of distinct tokens in
      the training file.

    Raises:
      TypeError: if data_path is None.
    """
    if data_path is None:
        # Fail early with a readable message instead of deep inside os.path.
        raise TypeError(
            "data_path must be the directory containing ptb.train.txt, "
            "ptb.valid.txt and ptb.test.txt, got None")

    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Build Tensors that step through `raw_data` one batch at a time.

    The id sequence is cut to a multiple of batch_size, reshaped into
    batch_size rows, and consecutive windows of num_steps columns are drawn
    from it in order (no shuffling).

    Args:
      raw_data: one of the raw data outputs from ptb_raw_data.
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.
      name: the name of this operation (optional).

    Returns:
      A pair of Tensors, each shaped [batch_size, num_steps]. The second
      element is the same window of data shifted by one position, i.e. the
      next-word targets for the first element.

    Raises:
      tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        total_len = tf.size(raw_data)
        row_len = total_len // batch_size
        # Drop the tail that does not fill a whole row, then lay rows out.
        rows = tf.reshape(raw_data[0 : batch_size * row_len],
                          [batch_size, row_len])

        # One column is reserved so the shifted targets stay in range.
        windows_per_epoch = (row_len - 1) // num_steps
        is_positive = tf.assert_positive(
            windows_per_epoch,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([is_positive]):
            windows_per_epoch = tf.identity(windows_per_epoch, name="epoch_size")

        # Queue that yields the window indices 0..epoch_size-1 in order.
        window = tf.train.range_input_producer(
            windows_per_epoch, shuffle=False).dequeue()

        x_begin = [0, window * num_steps]
        x_end = [batch_size, (window + 1) * num_steps]
        x = tf.strided_slice(rows, x_begin, x_end)
        x.set_shape([batch_size, num_steps])

        y_begin = [0, window * num_steps + 1]
        y_end = [batch_size, (window + 1) * num_steps + 1]
        y = tf.strided_slice(rows, y_begin, y_end)
        y.set_shape([batch_size, num_steps])
        return x, y
from reader import *
import tensorflow as tf
import numpy as np
# Directory that holds ptb.train.txt / ptb.valid.txt / ptb.test.txt;
# main() passes it to ptb_raw_data. Adjust to the local dataset location.
data_path = 'F:/File_Python/Python_daydayup/data/simple-examples/data'
# Number of hidden units per LSTM layer and number of stacked LSTM layers.
hidden_size = 200
num_layers = 2
# Vocabulary size (rows of the embedding matrix, width of the output logits).
vocab_size = 10000
# Learning rate handed to the gradient-descent optimizer in PTBModel.
learning_rate = 1.0
# Batch size used by the training model.
train_batch_size = 16
# Truncation length (number of unrolled time steps) of the training data.
train_num_step = 32
# No truncation is needed at evaluation time: the evaluation data is treated
# as one very long sequence, consumed one word at a time.
eval_batch_size = 1
eval_num_step = 1
# Number of passes over the training data.
num_epoch = 3
# Probability that a node is kept (i.e. NOT dropped) by dropout.
keep_prob = 0.5
# Global-norm threshold used to clip gradients and limit gradient explosion.
max_grad_norm = 5
# The language model is described by the PTBModel class.
class PTBModel(object):
    """Multi-layer LSTM language model built as a TensorFlow graph.

    The constructor reads the module-level hyperparameters hidden_size,
    num_layers, vocab_size, keep_prob, max_grad_norm and learning_rate.

    Attributes set by the constructor:
      batch_size, num_steps: the values passed in.
      input_data, targets: int32 placeholders of shape [batch_size, num_steps].
      initial_state: zero state of the stacked LSTM cell.
      final_state: cell state after the last unrolled time step.
      cost: summed cross-entropy over all positions divided by batch_size.
      train_op: clipped gradient-descent update (only when is_training).
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the graph.

        Args:
          is_training: when true, dropout is applied and train_op is created.
          batch_size: number of sequences per batch.
          num_steps: truncation length (number of unrolled time steps).
        """
        # Remember the batch size and truncation length in use.
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Input word ids and expected next-word ids, [batch_size, num_steps].
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def make_cell():
            # One LSTM layer, wrapped with output dropout during training only.
            layer = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            if is_training:
                layer = tf.nn.rnn_cell.DropoutWrapper(
                    layer, output_keep_prob=keep_prob)
            return layer

        # Every layer needs its own cell object. `[cell] * num_layers` would
        # repeat one object, so the layers would share the same weights (or,
        # depending on the TensorFlow version, fail when the cell is reused
        # under a different variable scope).
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [make_cell() for _ in range(num_layers)])

        # The recurrent state starts at zero.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Word embeddings, shape [vocab_size, hidden_size].
        embedding = tf.get_variable('embedding', [vocab_size, hidden_size])
        # Look up the batch of ids: [batch_size, num_steps, hidden_size].
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        # Dropout on the inputs is used during training only.
        if is_training:
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Collect the LSTM output of every time step; the fully connected
        # layer below is applied to all of them at once.
        outputs = []
        # `state` carries the recurrent state from one time step to the next.
        state = self.initial_state
        with tf.variable_scope('RNN'):
            for time_step in range(num_steps):
                # All time steps after the first reuse the same variables.
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                # Feed the current step's input and the previous state.
                cell_output, state = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Concatenate to [batch, hidden * num_steps], then reshape to
        # [batch * num_steps, hidden].
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Fully connected layer producing, for every position, a vector of
        # vocab_size logits over the next word.
        weight = tf.get_variable('weight', [hidden_size, vocab_size])
        bias = tf.get_variable('bias', [vocab_size])
        logits = tf.matmul(output, weight) + bias

        # Cross-entropy of every position.
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],  # predictions
            # Targets flattened from [batch_size, num_steps] to one dimension.
            [tf.reshape(self.targets, [-1])],
            # Loss weights: all ones, so every position counts equally.
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

        # Average loss per sequence in the batch.
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # Back-propagation ops are defined for the training model only.
        if not is_training:
            return

        trainable_variable = tf.trainable_variables()
        # Clip by global norm to keep gradients from exploding.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable_variable), max_grad_norm)
        # To use Adam instead, switch to tf.train.AdamOptimizer(learning_rate)
        # and lower the learning rate to roughly 0.001.
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        # The training step applies the clipped gradients.
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variable))
def run_epoch(session, model, data, train_op, output_log, epoch_size):
    """Run `model` over `epoch_size` batches and return the perplexity.

    Args:
      session: object whose run() evaluates fetches (a tf.Session in main).
      model: provides initial_state, final_state, cost, input_data, targets
        and num_steps.
      data: fetch that yields one (inputs, targets) batch per session.run.
      train_op: op executed together with the cost on every batch.
      output_log: when true, print the running perplexity every 100 steps.
      epoch_size: number of batches to process.

    Returns:
      exp(sum of batch costs / number of time steps processed).
    """
    cost_sum = 0.0
    steps_seen = 0
    # Start from the model's initial state, then thread the final state of
    # each batch into the next one.
    current_state = session.run(model.initial_state)
    for step in range(epoch_size):
        batch_inputs, batch_targets = session.run(data)
        feed = {
            model.input_data: batch_inputs,
            model.targets: batch_targets,
            model.initial_state: current_state,
        }
        fetches = [model.cost, model.final_state, train_op]
        batch_cost, current_state, _ = session.run(fetches, feed)

        # The accumulated cost over the accumulated steps is the log of the
        # perplexity; exponentiating it gives the perplexity.
        cost_sum += batch_cost
        steps_seen += model.num_steps

        # Progress output is requested for training runs only.
        if output_log and step % 100 == 0:
            print("After %d steps, perplexity is %.3f" % (step, np.exp(cost_sum / steps_seen)))
    return np.exp(cost_sum / steps_seen)
def _epoch_size(data, batch_size, num_steps):
    """Return how many batches one pass over `data` yields.

    Uses the same arithmetic as ptb_producer: rows of len(data) // batch_size
    ids, one column held back for the shifted targets.
    """
    batch_len = len(data) // batch_size
    return (batch_len - 1) // num_steps


def main():
    """Train the LSTM language model on PTB and report perplexities.

    Loads the data from data_path, builds a training model and an evaluation
    model that share variables, trains for num_epoch epochs (printing the
    validation perplexity after each) and finally prints the test perplexity.
    """
    train_data, valid_data, test_data, _ = ptb_raw_data(data_path)

    # Number of batches needed for one epoch of each dataset.
    train_epoch_size = _epoch_size(train_data, train_batch_size, train_num_step)
    valid_epoch_size = _epoch_size(valid_data, eval_batch_size, eval_num_step)
    test_epoch_size = _epoch_size(test_data, eval_batch_size, eval_num_step)

    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, train_batch_size, train_num_step)
    # Same scope with reuse=True: the evaluation model shares the variables.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, eval_batch_size, eval_num_step)

    # Train the model.
    with tf.Session() as session:
        tf.global_variables_initializer().run()

        train_queue = ptb_producer(train_data, train_model.batch_size, train_model.num_steps)
        eval_queue = ptb_producer(valid_data, eval_model.batch_size, eval_model.num_steps)
        test_queue = ptb_producer(test_data, eval_model.batch_size, eval_model.num_steps)

        # Evaluation runs no update; create the placeholder op once instead of
        # adding a new node to the graph on every evaluation.
        eval_op = tf.no_op()

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=session, coord=coord)
        try:
            for i in range(num_epoch):
                print("In iteration: %d" % (i + 1))
                run_epoch(session, train_model, train_queue, train_model.train_op, True, train_epoch_size)
                valid_perplexity = run_epoch(session, eval_model, eval_queue, eval_op, False, valid_epoch_size)
                print("Epoch: %d Validation Perplexity: %.3f" % (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, eval_model, test_queue, eval_op, False, test_epoch_size)
            print("Test Perplexity: %.3f" % test_perplexity)
        finally:
            # Always stop the queue-runner threads, even when an epoch fails.
            coord.request_stop()
            coord.join(threads)


if __name__ == "__main__":
    main()