The code in this post is mainly based on the open-source "Basic word2vec example" on GitHub, but it keeps only the parts needed to build the network. To make it easier for me to follow as a beginner, the wording has been simplified in places (the model itself is not simplified), and I have added some annotations of my own.
The main goal is to get familiar with TensorFlow and to deepen my understanding of word2vec, so I am recording the process here.
The data we read in is ultimately stored word by word in vocabulary. Note that the words must keep the original word order of the text and must not be shuffled (word2vec's training objective requires this), for example vocabulary = [I, like, eating, ChongQing, food, …].
import collections
import math
import os
import random
import zipfile
import urllib.request
from tempfile import gettempdir

import numpy as np
import tensorflow as tf
from six.moves import xrange  # pylint: disable=redefined-builtin

url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  local_filename = os.path.join(gettempdir(), filename)
  if not os.path.exists(local_filename):
    local_filename, _ = urllib.request.urlretrieve(url + filename,
                                                   local_filename)
  statinfo = os.stat(local_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + local_filename +
                    '. Can you get to it with a browser?')
  return local_filename


filename = maybe_download('text8.zip', 31344016)


# Read the data into a list of strings.
def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words."""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data


vocabulary = read_data(filename)
print('Data size', len(vocabulary))
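As a quick sanity check (not part of the original script), you can print the first few tokens to confirm that vocabulary really is just the corpus split into words in their original order; the words shown in the comment are only what I would expect the text8 corpus to start with:

print(vocabulary[:10])
# expected style of output: ['anarchism', 'originated', 'as', 'a', 'term', 'of', ...]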
Count the frequency of each word in vocabulary, then assign each word an integer ID in order of decreasing frequency (the more frequent the word, the smaller its ID). In particular, all low-frequency words that do not make it into the kept vocabulary are mapped to ID 0 ('UNK').
vocabulary_size = 50000  # keep only the 50,000 most frequent words; the rest become UNK


def build_dataset(words, n_words):
  """Process raw inputs into a dataset."""
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(n_words - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    index = dictionary.get(word, 0)
    if index == 0:  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reversed_dictionary


# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - list of [word, count] pairs for the most common words (plus UNK)
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
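To see concretely what build_dataset returns, here is a minimal sketch on a made-up toy word list (toy_words and n_words=4 are purely for illustration, and the ordering of words with equal counts may vary):

toy_words = ['I', 'like', 'eating', 'ChongQing', 'food', 'I', 'like', 'food']
toy_data, toy_count, toy_dict, toy_rev = build_dataset(toy_words, n_words=4)
print(toy_count)  # [['UNK', 2], ('I', 2), ('like', 2), ('food', 2)]
print(toy_dict)   # {'UNK': 0, 'I': 1, 'like': 2, 'food': 3}
print(toy_data)   # [1, 2, 0, 0, 3, 1, 2, 3]  -- 'eating' and 'ChongQing' became UNK

The key point is that data is the original word sequence with every word replaced by its ID, so word order is preserved for the batch generator below.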
Following the skip-gram idea, each training pair fed to the model should have the form (x=center_word, y=context_word). For example, suppose we set num_skips=2 (how many context words each center_word is used to predict within one window) and skip_window=1 (how many words to consider on each side of the center word); then the training pairs we generate look like (like, I), (like, eating), (eating, like), (eating, ChongQing), …
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  for i in range(batch_size // num_skips):
    context_words = [w for w in range(span) if w != skip_window]
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels
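A quick way to check that the pairs come out as described above is to generate a tiny batch and print it (the original example does a similar check; the parameter values here are just for illustration):

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
  print(batch[i], reverse_dictionary[batch[i]],
        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
# Each printed line is one (center_word -> context_word) training pair.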
This step builds the network and trains it. The part worth noting is tf.nn.nce_loss: this function handles negative sampling and the corresponding loss computation for us, so we only need to feed in the training data and the parameters to be trained. The rest follows the skip-gram idea of word2vec almost exactly; readers who are still unclear about the algorithm can refer to the reference cited in the earlier fastText post: "Word2vec 中的数学原理" (the mathematics behind word2vec).
batch_size = 128
embedding_size = 50  # Dimension of the embedding vector.
skip_window = 2      # How many words to consider left and right.
num_skips = 4        # How many times to reuse an input to generate a label.
num_sampled = 5      # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  with tf.name_scope('inputs'):
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

  # Look up embeddings for inputs.
  with tf.name_scope('embeddings'):
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

  # Construct the variables for the NCE loss.
  with tf.name_scope('weights'):
    nce_weights = tf.Variable(
        tf.truncated_normal(
            [vocabulary_size, embedding_size],
            stddev=1.0 / math.sqrt(embedding_size)))
  with tf.name_scope('biases'):
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nn.nce_loss automatically draws a new sample of the negative labels
  # each time we evaluate the loss.
  with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))

  with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer().minimize(loss)

  # Add variable initializer.
  init = tf.global_variables_initializer()

num_steps = 100000

with tf.Session(graph=graph) as session:
  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  for step in xrange(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
    session.run(optimizer, feed_dict=feed_dict)
    if step % 10000 == 0:
      print(session.run(loss, feed_dict=feed_dict))
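After training you usually want to pull out the learned word vectors and inspect them qualitatively. The original example does this with a cosine-similarity lookup inside the graph; the sketch below is a simplified numpy version of the same idea. It assumes the lines are added at the end of the session block above (while the session is still open), and the names final_embeddings and query, as well as the query word 'france', are my own choices for illustration:

final_embeddings = embeddings.eval()  # (vocabulary_size, embedding_size) numpy array

# Normalize rows, then rank all words by cosine similarity to the query word.
norm = np.sqrt((final_embeddings ** 2).sum(axis=1, keepdims=True))
normalized = final_embeddings / norm

query = 'france'                      # hypothetical query word
q = normalized[dictionary[query]]
sim = normalized.dot(q)               # cosine similarity to every word
nearest = (-sim).argsort()[1:9]       # top 8 neighbours, skipping the word itself
print(query, '->', [reverse_dictionary[i] for i in nearest])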
Summary:
The above is a simple end-to-end flow for training word2vec with TensorFlow. I have to say TensorFlow is very powerful: almost everything you can think of has already been prepared for you.