import torch
from torch import nn
import torch.nn.functional as F


class SelfAttention(nn.Module):
    """Scores each element of the sequence with a linear layer and uses the
    normalized scores to compute a context vector over the sequence."""

    def __init__(self, d_hid, dropout=0.):
        super().__init__()
        self.scorer = nn.Linear(d_hid, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq, lens):
        batch_size, seq_len, feature_dim = input_seq.size()
        input_seq = self.dropout(input_seq)
        # One scalar score per time step -> [batch_size, seq_len]
        scores = self.scorer(input_seq.contiguous().view(-1, feature_dim)).view(batch_size, seq_len)
        # Mask padded positions so they receive zero weight after the softmax
        max_len = max(lens)
        for i, l in enumerate(lens):
            if l < max_len:
                scores.data[i, l:] = -float('inf')
        scores = F.softmax(scores, dim=1)
        # Weighted sum over the sequence dimension -> [batch_size, feature_dim]
        context = scores.unsqueeze(2).expand_as(input_seq).mul(input_seq).sum(1)
        return context  # named "context" because it represents the whole sentence
The input is [batch_size, seq_len, feature_dim].
The output is [batch_size, feature_dim].
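A minimal shape check of the module above (the sizes here are hypothetical, chosen only for illustration):

import torch

attn = SelfAttention(d_hid=16)   # hypothetical hidden size
x = torch.randn(2, 5, 16)        # [batch_size=2, seq_len=5, feature_dim=16]
lens = [5, 3]                    # true lengths; the second sequence has two padded steps
context = attn(x, lens)
print(context.shape)             # torch.Size([2, 16]) -> one vector per sequence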
In contrast, the Transformer's multihead_attention with memory set to None also reduces to self-attention, but its output is [batch_size, seq_len, feature_dim], i.e. one vector per time step rather than one per sequence, as illustrated below.
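For comparison, a sketch using PyTorch's nn.MultiheadAttention as self-attention (query = key = value; batch_first needs PyTorch 1.9+; the sizes are hypothetical, and this is only meant to show the shape difference, not the memory-based implementation the post refers to):

import torch
from torch import nn

mha = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)
x = torch.randn(2, 5, 16)    # [batch_size, seq_len, feature_dim]
out, weights = mha(x, x, x)  # self-attention: query = key = value = x
print(out.shape)             # torch.Size([2, 5, 16]) -> still one vector per time step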
Source: 51CTO
Author: guotong1988
Link: https://blog.csdn.net/guotong1988/article/details/100775514