Question
I am trying to make a chatbot in Keras. I am assigning every word in the vocabulary its own ID. One training sample looks like this:
[0 0 0 0 0 0 32 328 2839 13 192 1 ] -> [23 3289 328 2318 12 0 0 0 0 0 0 0]
Then I am using the Embedding layer in Keras to embed these IDs into vectors of size 32, and LSTM layers as the hidden layers. The problem is that my output is a list of embedded IDs, like so:
[ 0.16102183 0.1238187 0.1159694 0.13688719 0.12964118 0.12848872
0.13515817 0.13582146 0.16919741 0.15453722 ... ]
How can I convert these embeddings back to the words in my original vocabulary?
Here is my code:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import os
import numpy as np
import cPickle as pickle
class Chatbot(object):
    def __init__(self, h_layers=1):
        # self.name = name
        self.h_layers = h_layers
        self.seq2seq = None
        self.max_length = 0
        self.vocabulary = {}

    @staticmethod
    def load(model_name):
        with open('models/{}/chatbot_object.pkl'.format(model_name), 'rb') as pickle_file:
            obj = pickle.load(pickle_file)
        obj.seq2seq = load_model('models/{}/seq2seq.h5'.format(model_name))
        return obj

    def train(self, x_train, y_train):
        count_vect = CountVectorizer()
        count_vect.fit(x_train)
        count_vect.fit(y_train)
        self.vocabulary = count_vect.vocabulary_
        self.vocabulary.update({'<START>': len(self.vocabulary),
                                '<END>': len(self.vocabulary) + 1,
                                '<PAD>': len(self.vocabulary) + 2,
                                '<UNK>': len(self.vocabulary) + 3})

        for i in range(len(x_train)):
            x_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(x_train[i])] + ['<END>']
        for i in range(len(y_train)):
            y_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(y_train[i])] + ['<END>']

        for sample in x_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)
        for sample in y_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)

        for i in range(len(x_train)):
            x_train[i] = [self.vocabulary[w] for w in x_train[i] if w in self.vocabulary]
        for i in range(len(y_train)):
            y_train[i] = [self.vocabulary[w] for w in y_train[i] if w in self.vocabulary]

        x_train = sequence.pad_sequences(x_train, maxlen=self.max_length, value=self.vocabulary['<PAD>'])
        y_train = sequence.pad_sequences(y_train, maxlen=self.max_length, padding='post',
                                         value=self.vocabulary['<PAD>'])
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)

        embedding_vector_length = 32

        self.seq2seq = Sequential()
        self.seq2seq.add(Embedding(len(self.vocabulary), embedding_vector_length, input_length=self.max_length))
        for _ in range(self.h_layers):
            self.seq2seq.add(LSTM(self.max_length, return_sequences=True))
        self.seq2seq.add(LSTM(self.max_length))

        self.seq2seq.compile(loss='cosine_proximity', optimizer='adam', metrics=['accuracy'])
        self.seq2seq.fit(x_train[:100], y_train[:100], epochs=5, batch_size=32)

    def save(self, filename):
        if filename not in os.listdir('models'):
            os.system('mkdir models/{}'.format(filename))
        self.seq2seq.save('models/{}/seq2seq.h5'.format(filename))
        self.seq2seq = None
        with open('models/{}/chatbot_object.pkl'.format(filename), 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    def respond(self, text):
        tokens = ['<START>'] + [w.lower() for w in word_tokenize(text)] + ['<END>']
        for i in range(len(tokens)):
            if tokens[i] in self.vocabulary:
                tokens[i] = self.vocabulary[tokens[i]]
            else:
                tokens[i] = self.vocabulary['<PAD>']
        x = sequence.pad_sequences([tokens], maxlen=self.max_length, value=self.vocabulary['<PAD>'])
        prediction = self.seq2seq.predict(x, batch_size=1)
        return prediction[0]
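(Note on the code above: as built, the final LSTM has max_length units, so predict returns a single dense vector rather than token IDs. One common way to map a dense vector back to a vocabulary word is a nearest-neighbour lookup against the trained Embedding layer's weight matrix. The sketch below is a minimal illustration of that idea, assuming the model's output is given the same dimensionality as the embeddings, e.g. a final layer with embedding_vector_length units; the helper name nearest_word and the choice of cosine similarity are assumptions, not part of the original code.)

import numpy as np

def nearest_word(pred_vec, embedding_weights, id_to_word):
    """Illustrative helper (not from the original post): return the vocabulary
    word whose embedding row is closest to pred_vec by cosine similarity.
    pred_vec must have the same length as one embedding row."""
    norms = np.linalg.norm(embedding_weights, axis=1, keepdims=True)
    normed = embedding_weights / np.maximum(norms, 1e-8)
    pred = pred_vec / max(np.linalg.norm(pred_vec), 1e-8)
    sims = normed.dot(pred)                  # cosine similarity per vocabulary row
    return id_to_word[int(np.argmax(sims))]

# Usage sketch (assumes bot is a trained Chatbot instance):
# embedding_weights = bot.seq2seq.layers[0].get_weights()[0]   # shape (vocab_size, 32)
# id_to_word = {i: w for w, i in bot.vocabulary.items()}
# word = nearest_word(prediction_vector, embedding_weights, id_to_word)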
Answer 1:
I could not find the answer to this either, so I wrote a lookup function.
def lookup(tokenizer, vec, returnIntNotWord=True):
    # (word, index) pairs sorted by the tokenizer's word index
    twordkey = [(k, tokenizer.word_index[k]) for k in sorted(tokenizer.word_index, key=tokenizer.word_index.get, reverse=False)]
    oneHotVec = []  # captures the indices of the words
    engVec = []     # holds the (word, index) pairs; make sure returnIntNotWord is False to get these
    for eachRow, notUsed in enumerate(vec):
        for index, item in enumerate(vec[0]):
            if vec[eachRow][index] == 1:
                oneHotVec.append(index)
    for index in oneHotVec:
        engVec.append(twordkey[index])
    if returnIntNotWord:
        return oneHotVec
    else:
        return engVec
tokenizer is the Keras Tokenizer, vec is a 2D array of one-hot encoded labels, and returnIntNotWord switches between returning the integer indices and the (word, index) pairs, as noted in the comments.
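A minimal usage sketch for the lookup above, assuming a Keras Tokenizer fitted on the corpus and labels that are already one-hot encoded (the example texts and variable names are illustrative):

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Illustrative setup: fit a tokenizer and one-hot encode a short sequence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(["hello how are you", "i am fine thanks"])

seq = tokenizer.texts_to_sequences(["how are you"])[0]
one_hot = to_categorical(seq, num_classes=len(tokenizer.word_index) + 1)

print(lookup(tokenizer, one_hot, returnIntNotWord=True))   # integer indices of the hot positions
print(lookup(tokenizer, one_hot, returnIntNotWord=False))  # (word, index) pairs from word_index

Note that word_index is 1-based while the sorted list inside lookup is 0-based, so depending on how the one-hot labels were built the (word, index) pairs may come back shifted by one position.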
Source: https://stackoverflow.com/questions/45773660/reverse-word-embeddings-in-keras-python