ELMo代码详解(一)：数据准备

首先根据词汇表文件生成word和索引之间的相互对应关系，即_id_to_word和_word_to_id，前者是一个数组（下标为id，值为word），后者是一个字典（键为word，值为id）。当然，词汇表中还需要包含几个特殊的词：<S>、</S>、<UNK>（分别表示句首、句尾和未知词）。主要的代码如下:

def __init__(self, filename, validate_file=False):
    """Load a word vocabulary from a text file.

    Args:
        filename: path to the vocabulary file, a flat text file with one
            (normalized) token per line.  It should also contain the
            special tokens <S>, </S> and <UNK> (case sensitive).
        validate_file: when true, raise if any of the three special
            tokens is missing from the file.

    Raises:
        ValueError: if ``validate_file`` is true and <S>, </S> or <UNK>
            was not found.
    """
    self._id_to_word = []   # id -> word (list position is the id)
    self._word_to_id = {}   # word -> id
    # -1 means "not seen in the file".
    self._unk = -1
    self._bos = -1
    self._eos = -1

    # Read explicitly as UTF-8 so loading does not depend on the platform's
    # default locale encoding.
    with open(filename, encoding='utf-8') as f:
        idx = 0
        for line in f:
            # One vocabulary entry per line.
            word_name = line.strip()
            if word_name == '!!!MAXTERMID':
                # Marker line: skipped and does not consume an id.
                continue

            if word_name == '<S>':
                self._bos = idx
            elif word_name == '</S>':
                self._eos = idx
            elif word_name == '<UNK>':
                self._unk = idx

            self._id_to_word.append(word_name)
            self._word_to_id[word_name] = idx
            idx += 1

    # Check to ensure the file has the special tokens.
    if validate_file:
        if self._bos == -1 or self._eos == -1 or self._unk == -1:
            raise ValueError("Ensure the vocabulary file has "
                             "<S>, </S>, <UNK> tokens")

当然，类中还有两个很实用的函数，一个是编码函数encode，另一个是解码函数decode。编码器encode的作用是将一条句子sentence转化为一个word-ids列表，注意要加上句首和句尾token。当然包括反转选项，用来做双向的LSTM。而解码器decode就是将word-ids列表转化为相应的单词。

def encode(self, sentence, reverse=False, split=True):
    """Turn a sentence into an int32 array of word ids with BOS/EOS added.

    Args:
        sentence: a single whitespace-separated string when ``split`` is
            true, otherwise an iterable of tokens.
        reverse: when true the sentence is assumed to be already reversed,
            so the boundary ids are swapped: EOS goes first, BOS last.
        split: whether to tokenize ``sentence`` with ``str.split()``.

    Returns:
        1-D ``np.int32`` array: boundary id, one id per token, boundary id.
    """
    tokens = sentence.split() if split else sentence
    word_ids = [self.word_to_id(token) for token in tokens]

    if reverse:
        first, last = self.eos, self.bos
    else:
        first, last = self.bos, self.eos
    return np.array([first] + word_ids + [last], dtype=np.int32)


def decode(self, cur_ids):
    """Map a sequence of ids back to words and join them with spaces."""
    return ' '.join(map(self.id_to_word, cur_ids))

接下来是字符词汇表类UnicodeCharsVocabulary。注意这个类是上面word词汇表Vocabulary的子类，这意味着这个字符类继承了Vocabulary的所有变量和方法！

def _convert_word_to_char_ids(self, word):
    """Convert one word into a fixed-length array of character ids.

    The word is encoded as UTF-8 bytes (truncated to
    ``self.max_word_length - 2`` bytes), wrapped in the begin-of-word and
    end-of-word ids, and right-padded with ``self.pad_char``.  For example
    the bytes of "english" are e:101, n:110, g:103, l:108, i:105, s:115,
    h:104.

    Args:
        word: the token as a ``str``.  An empty word is valid and yields
            just the begin/end-of-word markers followed by padding.

    Returns:
        1-D ``np.int32`` array of length ``self.max_word_length``.
    """
    code = np.full([self.max_word_length], self.pad_char, dtype=np.int32)

    word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
    code[0] = self.bow_char
    for k, chr_id in enumerate(word_encoded, start=1):
        code[k] = chr_id
    # Index by length rather than by the loop variable: the loop variable is
    # never bound when the word encodes to zero bytes.
    code[len(word_encoded) + 1] = self.eow_char

    return code


def __init__(self, filename, max_word_length, **kwargs):
    """Build the word vocabulary plus a per-word table of character ids.

    Args:
        filename: vocabulary file, forwarded to the parent constructor.
        max_word_length: number of character ids stored per word,
            including the begin/end-of-word markers.
        **kwargs: forwarded unchanged to the parent constructor.
    """
    # The parent constructor populates the word <-> id mappings.
    super(UnicodeCharsVocabulary, self).__init__(filename, **kwargs)
    self._max_word_length = max_word_length

    # char ids 0-255 come from utf-8 encoding bytes
    # assign 256-300 to special chars
    self.bos_char = 256  # <begin sentence>
    self.eos_char = 257  # <end sentence>
    self.bow_char = 258  # <begin word>
    self.eow_char = 259  # <end word>
    self.pad_char = 260  # <padding>

    num_words = len(self._id_to_word)

    # One row of character ids per vocabulary word.
    self._word_char_ids = np.zeros([num_words, max_word_length],
                                   dtype=np.int32)

    # The character representation of the begin/end of sentence tokens:
    # <begin word>, the sentence marker, <end word>, then padding.
    # NOTE(review): reads self.max_word_length, which is not assigned here;
    # presumably a property over self._max_word_length defined elsewhere in
    # the class - confirm.
    def _make_bos_eos(c):
        r = np.zeros([self.max_word_length], dtype=np.int32)
        r[:] = self.pad_char
        r[0] = self.bow_char
        r[1] = c
        r[2] = self.eow_char
        return r
    self.bos_chars = _make_bos_eos(self.bos_char)
    self.eos_chars = _make_bos_eos(self.eos_char)

    for i, word in enumerate(self._id_to_word):
        self._word_char_ids[i] = self._convert_word_to_char_ids(word)

    # The sentence boundary tokens get their dedicated encodings instead of
    # the byte encoding of their literal spelling.
    self._word_char_ids[self.bos] = self.bos_chars
    self._word_char_ids[self.eos] = self.eos_chars

通过以上两个函数，我们就可以得到每个单词(word)对应的字符id序列(char-ids)，包括句首和句尾的字符id序列表示。

def word_to_char_ids(self, word):
    """Return the character-id array for ``word``.

    Words present in ``self._word_to_id`` are read from the table
    ``self._word_char_ids``; any other word is converted on the fly with
    ``self._convert_word_to_char_ids``.
    """
    word_id = self._word_to_id.get(word)
    if word_id is None:
        return self._convert_word_to_char_ids(word)
    return self._word_char_ids[word_id]


def encode_chars(self, sentence, reverse=False, split=True):
    """Encode a whole sentence as a stack of per-word character-id rows.

    Args:
        sentence: a whitespace-delimited string when ``split`` is true,
            otherwise an iterable of tokens.
        reverse: when true, the end-of-sentence row is placed first and the
            begin-of-sentence row last.
        split: whether to tokenize ``sentence`` with ``str.split()``.

    Returns:
        The result of ``np.vstack`` over: boundary row, one row per token,
        boundary row.
    """
    tokens = sentence.split() if split else sentence
    rows = [self.word_to_char_ids(token) for token in tokens]

    if reverse:
        head, tail = self.eos_chars, self.bos_chars
    else:
        head, tail = self.bos_chars, self.eos_chars
    return np.vstack([head] + rows + [tail])

def batch_sentences(self, sentences: List[List[str]]):
    """Batch tokenized sentences into one zero-padded 2-D id array.

    Each sentence is a list of tokens without <s> or </s>, e.g.
    [['The', 'first', 'sentence', '.'], ['Second', '.']]

    Every sentence is encoded with ``self._lm_vocab.encode(..., split=False)``
    and each resulting id is shifted up by one so that 0 can serve as the
    mask/padding value.

    Returns:
        ``np.int64`` array of shape (n_sentences, longest sentence + 2);
        the extra 2 columns hold the ids added around each sentence by
        ``encode``.
    """
    n_rows = len(sentences)
    padded_len = max(len(tokens) for tokens in sentences) + 2

    batch = np.zeros((n_rows, padded_len), dtype=np.int64)
    for row, tokens in enumerate(sentences):
        encoded = self._lm_vocab.encode(tokens, split=False)
        # add one so that 0 is the mask value
        batch[row, :len(tokens) + 2] = encoded + 1

    return batch

def batch_sentences(self, sentences: List[List[str]]):
    """Batch tokenized sentences into one zero-padded 3-D character-id array.

    Each sentence is a list of tokens without <s> or </s>, e.g.
    [['The', 'first', 'sentence', '.'], ['Second', '.']]

    Every sentence is encoded with
    ``self._lm_vocab.encode_chars(..., split=False)`` and each id is shifted
    up by one so that 0 can serve as the mask/padding value.

    Returns:
        ``np.int64`` array of shape
        (n_sentences, longest sentence + 2, self._max_token_length); the
        extra 2 rows per sentence hold the boundary rows added by
        ``encode_chars``.
    """
    n_rows = len(sentences)
    padded_len = max(len(tokens) for tokens in sentences) + 2

    shape = (n_rows, padded_len, self._max_token_length)
    batch = np.zeros(shape, dtype=np.int64)

    for row, tokens in enumerate(sentences):
        encoded = self._lm_vocab.encode_chars(tokens, split=False)
        # add one so that 0 is the mask value; rows past the sentence's own
        # length keep their initial zeros
        batch[row, :len(tokens) + 2, :] = encoded + 1

    return batch

def _get_batch(generator, batch_size, num_steps, max_word_length):
    """Yield fixed-size training batches cut from a stream of sentences.

    Args:
        generator: iterator whose items are indexable pairs; item[0] is the
            sequence of token ids and item[1] the matching per-token
            character ids (item[1] is only touched when ``max_word_length``
            is not None).
        batch_size: number of rows per batch.
        num_steps: number of positions per row.
        max_word_length: character ids per token, or None to skip the
            character inputs altogether.

    Yields:
        dict with keys 'token_ids' and 'next_token_id' (int32 arrays of
        shape [batch_size, num_steps]) and 'tokens_characters' (int32 array
        of shape [batch_size, num_steps, max_word_length], or None).
        'next_token_id' holds the ids at the positions of 'token_ids'
        shifted right by one.

    The generator ends as soon as ``generator`` is exhausted; the batch
    being assembled at that moment is dropped, not yielded.
    """
    with_chars = max_word_length is not None
    # One partially consumed sentence per batch row, carried across batches.
    streams = [None] * batch_size
    exhausted = False

    while True:
        token_ids = np.zeros([batch_size, num_steps], np.int32)
        if with_chars:
            char_ids = np.zeros([batch_size, num_steps, max_word_length],
                                np.int32)
        else:
            char_ids = None
        next_ids = np.zeros([batch_size, num_steps], np.int32)

        for row in range(batch_size):
            filled = 0

            # A row may need several sentences (or pieces of sentences)
            # before all num_steps positions are filled.
            while filled < num_steps:
                stream = streams[row]
                if stream is None or len(stream[0]) <= 1:
                    # Fewer than two ids left: no (input, target) pair can
                    # be formed any more, so fetch the next sentence.
                    try:
                        stream = list(next(generator))
                    except StopIteration:
                        # No more data, exhaust current streams and quit
                        exhausted = True
                        break
                    streams[row] = stream

                # Each consumed position needs a following id as target,
                # hence the "- 1".
                take = min(len(stream[0]) - 1, num_steps - filled)
                stop = filled + take

                token_ids[row, filled:stop] = stream[0][:take]
                if with_chars:
                    char_ids[row, filled:stop] = stream[1][:take]
                next_ids[row, filled:stop] = stream[0][1:take + 1]

                filled = stop

                # Drop what was consumed; the last consumed target stays at
                # the front as the next input.
                stream[0] = stream[0][take:]
                if with_chars:
                    stream[1] = stream[1][take:]

        if exhausted:
            # There is no more data.  Note: this will not return data
            # for the incomplete batch
            return

        yield {'token_ids': token_ids, 'tokens_characters': char_ids,
               'next_token_id': next_ids}

def get_sentence(self):
    """Endless generator yielding one entry of ``self._ids`` at a time.

    Whenever the cursor ``self._i`` equals ``self._nids``, ``self._ids`` is
    replaced by the result of ``self._load_random_shard()`` before reading.
    NOTE(review): the cursor is never reset here; presumably
    ``_load_random_shard`` resets ``self._i`` and ``self._nids`` - confirm.
    """
    while True:
        shard_consumed = self._i == self._nids
        if shard_consumed:
            self._ids = self._load_random_shard()
        current = self._ids[self._i]
        self._i += 1
        yield current


def iter_batches(self, batch_size, num_steps):
    """Iterate over the batches ``_get_batch`` builds from this dataset.

    Each yielded item is whatever ``_get_batch`` yields when called with
    ``self.get_sentence()``, ``batch_size``, ``num_steps`` and
    ``self.max_word_length``.
    """
    # token_ids = (batch_size, num_steps)
    # char_inputs = (batch_size, num_steps, 50) of character ids
    # targets = word ID of next word (batch_size, num_steps)
    yield from _get_batch(self.get_sentence(), batch_size, num_steps,
                          self.max_word_length)

def __init__(self, filepattern, vocab, test=False, shuffle_on_load=False):
    """Bidirectional version of LMDataset.

    Holds two ``LMDataset`` instances built from the same arguments, one
    with ``reverse=False`` (forward direction) and one with ``reverse=True``
    (backward direction).
    """
    shared = dict(test=test, shuffle_on_load=shuffle_on_load)
    self._data_forward = LMDataset(filepattern, vocab, reverse=False,
                                   **shared)
    self._data_reverse = LMDataset(filepattern, vocab, reverse=True,
                                   **shared)


def iter_batches(self, batch_size, num_steps):
    """Yield forward batches augmented with the matching reverse batches.

    Forward and reverse batch streams are consumed in lockstep; every entry
    of a reverse batch is copied into the forward batch dict under its key
    plus the suffix '_reverse', and the merged dict is yielded.
    """
    max_word_length = self._data_forward.max_word_length

    forward_batches = _get_batch(self._data_forward.get_sentence(),
                                 batch_size, num_steps, max_word_length)
    reverse_batches = _get_batch(self._data_reverse.get_sentence(),
                                 batch_size, num_steps, max_word_length)

    for X, Xr in zip(forward_batches, reverse_batches):
        # Produces e.g. 'token_ids_reverse', 'tokens_characters_reverse'.
        X.update((key + '_reverse', value) for key, value in Xr.items())
        yield X

文章来源: ELMo代码详解(一)：数据准备

标签

char函数

char

num

ids