import tensorflow as tf
from numpy.random import RandomState
import math

# hyperparameters
NUM_BATCH = 100
MAX_STEPS = 9
hidden_size = 8
n_inputs = 2
n_steps = 3
batch_size = 5
dataset_size = n_steps * batch_size * NUM_BATCH

# use exponential decay for the learning rate
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(0.01, global_step, 5, 0.96, staircase=True)

# generate the dataset
rdm = RandomState(1)
X = rdm.rand(dataset_size, n_inputs)
Y = [[int(x1 + x2 < 1)] for (x1, x2) in X]

# number of batches held out for validation/testing (20%)
validation_prop = math.floor(0.2 * NUM_BATCH)

# placeholders
x = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="x-input")
y_ = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="y-input")

# variables
weights = {
    # shape: n_inputs x hidden_size
    "in": tf.Variable(tf.random_normal([n_inputs, hidden_size]), name="input_weight"),
    # shape: hidden_size x 1
    "out": tf.Variable(tf.random_normal([hidden_size, 1]), name="output-weight")
}
biases = {
    # shape: hidden_size
    "in": tf.Variable(tf.constant(0.1, shape=[hidden_size, ])),
    # shape: 1
    "out": tf.Variable(tf.constant(0.1, shape=[1, ]))
}

# shape: None x hidden_size
input = tf.matmul(x, weights["in"]) + biases["in"]
# shape: None x n_steps x hidden_size
input = tf.reshape(input, [-1, n_steps, hidden_size])

cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_size, forget_bias=0.98)
init_state = cell.zero_state(batch_size, dtype=tf.float32)
# outputs shape: [batch_size, n_steps, hidden_size];
# final_state is an LSTMStateTuple of two [batch_size, hidden_size] tensors
outputs, final_state = tf.nn.dynamic_rnn(cell=cell, inputs=input, initial_state=init_state)
# print(outputs.get_shape().as_list())  # inspect the outputs shape

# shape: None x hidden_size
output0 = tf.reshape(outputs, [-1, hidden_size])
# shape: None x 1
y = tf.matmul(output0, weights["out"]) + biases["out"]

# compute the loss and optimize it
cross_entropy = -tf.reduce_mean(
    y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)) +
    (1 - y_) * tf.log(tf.clip_by_value(1 - y, 1e-10, 1.0))) / (batch_size * n_steps)
tf.summary.scalar("cross_entropy", cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
    cross_entropy, global_step=global_step)
merged = tf.summary.merge_all()

num_step = 0
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    summary_writer1 = tf.summary.FileWriter("./train", sess.graph)
    summary_writer2 = tf.summary.FileWriter("./test", sess.graph)
    summary_writer3 = tf.summary.FileWriter("./train_sum", sess.graph)

    # hand-built summaries for the averaged losses
    loss = 0.0
    loss_summary = tf.Summary()
    loss_summary.value.add(tag='sum cross-entropy', simple_value=loss)
    loss_sum_train = 0.0
    loss_st_summary = tf.Summary()
    loss_st_summary.value.add(tag='sum cross-entropy', simple_value=loss_sum_train)

    for i in range(MAX_STEPS):
        for step in range(validation_prop, NUM_BATCH):
            summary_train, _, cross_entropy1 = sess.run(
                [merged, optimizer, cross_entropy],
                feed_dict={x: X[(step - 1) * n_steps * batch_size: step * n_steps * batch_size],
                           y_: Y[(step - 1) * n_steps * batch_size: step * n_steps * batch_size]})
            print(cross_entropy1)
            summary_writer1.add_summary(summary_train, num_step)

            if num_step % 2 == 0:
                # average cross-entropy over the held-out (test) batches
                loss = 0.0  # reset the accumulator before averaging
                for step1 in range(1, validation_prop):
                    cross_entropy2 = sess.run(cross_entropy, feed_dict={
                        x: X[(step1 - 1) * n_steps * batch_size: step1 * n_steps * batch_size],
                        y_: Y[(step1 - 1) * n_steps * batch_size: step1 * n_steps * batch_size]})
                    loss += cross_entropy2
                loss = loss / validation_prop
                print("loss: ", loss)
                loss_summary.value[0].simple_value = loss
                summary_writer2.add_summary(loss_summary, num_step)

                # average cross-entropy over the whole training split
                loss_sum_train = 0.0  # reset the accumulator before averaging
                for step2 in range(validation_prop, NUM_BATCH):
                    cross_entropy3 = sess.run(cross_entropy, feed_dict={
                        x: X[(step2 - 1) * n_steps * batch_size: step2 * n_steps * batch_size],
                        y_: Y[(step2 - 1) * n_steps * batch_size: step2 * n_steps * batch_size]})
                    loss_sum_train += cross_entropy3
                loss_sum_train = loss_sum_train / (NUM_BATCH - validation_prop)
                print("loss_sum_train: ", loss_sum_train)
                loss_st_summary.value[0].simple_value = loss_sum_train
                summary_writer3.add_summary(loss_st_summary, num_step)

            num_step += 1

    summary_writer1.close()
    summary_writer2.close()
    summary_writer3.close()

The dataset is generated at random, and the binary class labels are then constructed from it (label 1 when x1 + x2 < 1).
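Because x1 and x2 are drawn uniformly from [0, 1), roughly half of the samples satisfy x1 + x2 < 1, so the two classes are approximately balanced. A minimal standalone sketch (reusing the same seed and dataset size as above) to check this:

# standalone check of the generated labels (same seed and sizes as in the script above)
from numpy.random import RandomState

rdm = RandomState(1)
X = rdm.rand(1500, 2)                    # dataset_size = 3 * 5 * 100
Y = [int(x1 + x2 < 1) for (x1, x2) in X]
print(sum(Y) / len(Y))                   # close to 0.5: the classes are balanced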
Monitored variables: the cross-entropy of every training batch, the average cross-entropy over the whole training split, and the average cross-entropy over the test split are all collected as scalar summaries.
Note that 0.96 raised to the 100th power is only about 0.017, so the initial learning rate and the decay schedule have to be chosen with care; chosen badly, the decayed learning rate becomes so small that gradient descent effectively stops making progress.
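As a rough illustration of how fast this schedule shrinks, the staircase decay used above is equivalent to 0.01 * 0.96 ** (global_step // 5). A standalone sketch of the values it takes (720 is the total number of training steps here: 9 epochs times 80 training batches):

# staircase exponential decay with the parameters used above:
# initial rate 0.01, decay every 5 steps, decay rate 0.96
for g in (0, 100, 500, 720):
    lr = 0.01 * 0.96 ** (g // 5)
    print(g, lr)
# by step 720 the learning rate has dropped to roughly 2.8e-05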

Figure 3: per-step cross-entropy on the training set

Figure 4: average cross-entropy on the training set and test set at each logging step
As the figures show, dynamically decaying the learning_rate lets the cross-entropy fall smoothly.
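To confirm the schedule while training, the current decayed rate can simply be added to the fetch list of the training sess.run call. A sketch assuming the graph built above (lr_now is just an illustrative name):

# inside the training loop, fetch the decayed learning rate together with the loss
summary_train, _, cross_entropy1, lr_now = sess.run(
    [merged, optimizer, cross_entropy, learning_rate],
    feed_dict={x: X[(step - 1) * n_steps * batch_size: step * n_steps * batch_size],
               y_: Y[(step - 1) * n_steps * batch_size: step * n_steps * batch_size]})
print("step", num_step, "lr", lr_now, "loss", cross_entropy1)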