Question
I am trying to make a chatbot in Keras. I am assigning every word in the vocabulary its own ID. One training sample looks like this:
[0 0 0 0 0 0 32 328 2839 13 192 1 ] -> [23 3289 328 2318 12 0 0 0 0 0 0 0]
Then I am using the Embedding layer in Keras to embed these IDs into vectors of size 32, and LSTM layers as the hidden layers. The problem is that my output is a list of embedded IDs, like so:
[ 0.16102183 0.1238187 0.1159694 0.13688719 0.12964118 0.12848872
0.13515817 0.13582146 0.16919741 0.15453722 ... ]
How can I convert these embeddings back to the words in my original vocabulary?
Here is my code:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import os
import numpy as np
import cPickle as pickle
class Chatbot(object):
    def __init__(self, h_layers=1):
        # self.name = name
        self.h_layers = h_layers
        self.seq2seq = None
        self.max_length = 0
        self.vocabulary = {}

    @staticmethod
    def load(model_name):
        with open('models/{}/chatbot_object.pkl'.format(model_name), 'rb') as pickle_file:
            obj = pickle.load(pickle_file)
        obj.seq2seq = load_model('models/{}/seq2seq.h5'.format(model_name))
        return obj

    def train(self, x_train, y_train):
        count_vect = CountVectorizer()
        count_vect.fit(x_train)
        count_vect.fit(y_train)
        self.vocabulary = count_vect.vocabulary_
        self.vocabulary.update({'<START>': len(self.vocabulary),
                                '<END>': len(self.vocabulary) + 1,
                                '<PAD>': len(self.vocabulary) + 2,
                                '<UNK>': len(self.vocabulary) + 3})

        for i in range(len(x_train)):
            x_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(x_train[i])] + ['<END>']
        for i in range(len(y_train)):
            y_train[i] = ['<START>'] + [w.lower() for w in word_tokenize(y_train[i])] + ['<END>']

        for sample in x_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)
        for sample in y_train:
            if len(sample) > self.max_length:
                self.max_length = len(sample)

        for i in range(len(x_train)):
            x_train[i] = [self.vocabulary[w] for w in x_train[i] if w in self.vocabulary]
        for i in range(len(y_train)):
            y_train[i] = [self.vocabulary[w] for w in y_train[i] if w in self.vocabulary]

        x_train = sequence.pad_sequences(x_train, maxlen=self.max_length, value=self.vocabulary['<PAD>'])
        y_train = sequence.pad_sequences(y_train, maxlen=self.max_length, padding='post',
                                         value=self.vocabulary['<PAD>'])
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)

        embedding_vector_length = 32

        self.seq2seq = Sequential()
        self.seq2seq.add(Embedding(len(self.vocabulary), embedding_vector_length, input_length=self.max_length))
        for _ in range(self.h_layers):
            self.seq2seq.add(LSTM(self.max_length, return_sequences=True))
        self.seq2seq.add(LSTM(self.max_length))

        self.seq2seq.compile(loss='cosine_proximity', optimizer='adam', metrics=['accuracy'])
        self.seq2seq.fit(x_train[:100], y_train[:100], epochs=5, batch_size=32)

    def save(self, filename):
        if filename not in os.listdir('models'):
            os.system('mkdir models/{}'.format(filename))
        self.seq2seq.save('models/{}/seq2seq.h5'.format(filename))
        self.seq2seq = None
        with open('models/{}/chatbot_object.pkl'.format(filename), 'wb') as pickle_file:
            pickle.dump(self, pickle_file)

    def respond(self, text):
        tokens = ['<START>'] + [w.lower() for w in word_tokenize(text)] + ['<END>']
        for i in range(len(tokens)):
            if tokens[i] in self.vocabulary:
                tokens[i] = self.vocabulary[tokens[i]]
            else:
                tokens[i] = self.vocabulary['<PAD>']
        x = sequence.pad_sequences([tokens], maxlen=self.max_length, value=self.vocabulary['<PAD>'])
        prediction = self.seq2seq.predict(x, batch_size=1)
        return prediction[0]
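(Note on the code above: as built, the final LSTM has max_length units, so predict returns a single dense vector rather than token IDs. One common way to map a dense vector back to a vocabulary word is a nearest-neighbour lookup against the trained Embedding layer's weight matrix. The sketch below is a minimal illustration of that idea, assuming the model's output is given the same dimensionality as the embeddings, e.g. a final layer with embedding_vector_length units; the helper name nearest_word and the choice of cosine similarity are assumptions, not part of the original code.)

import numpy as np

def nearest_word(pred_vec, embedding_weights, id_to_word):
    """Illustrative helper (not from the original post): return the vocabulary
    word whose embedding row is closest to pred_vec by cosine similarity.
    pred_vec must have the same length as one embedding row."""
    norms = np.linalg.norm(embedding_weights, axis=1, keepdims=True)
    normed = embedding_weights / np.maximum(norms, 1e-8)
    pred = pred_vec / max(np.linalg.norm(pred_vec), 1e-8)
    sims = normed.dot(pred)                  # cosine similarity per vocabulary row
    return id_to_word[int(np.argmax(sims))]

# Usage sketch (assumes bot is a trained Chatbot instance):
# embedding_weights = bot.seq2seq.layers[0].get_weights()[0]   # shape (vocab_size, 32)
# id_to_word = {i: w for w, i in bot.vocabulary.items()}
# word = nearest_word(prediction_vector, embedding_weights, id_to_word)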
Answer 1:
I could not find the answer to this either, so I wrote a lookup function.
def lookup(tokenizer, vec, returnIntNotWord=True):
    # (word, index) pairs sorted by the tokenizer's word index
    twordkey = [(k, tokenizer.word_index[k]) for k in sorted(tokenizer.word_index, key=tokenizer.word_index.get, reverse=False)]
    oneHotVec = []  # captures the indices of the words
    engVec = []     # holds the (word, index) pairs; make sure returnIntNotWord is False to get these
    for eachRow, notUsed in enumerate(vec):
        for index, item in enumerate(vec[0]):
            if vec[eachRow][index] == 1:
                oneHotVec.append(index)
    for index in oneHotVec:
        engVec.append(twordkey[index])
    if returnIntNotWord:
        return oneHotVec
    else:
        return engVec
tokenizer is the Keras Tokenizer, vec is a 2D array of one-hot encoded labels, and returnIntNotWord switches between returning the integer indices and the (word, index) pairs, as noted in the comments.
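A minimal usage sketch for the lookup above, assuming a Keras Tokenizer fitted on the corpus and labels that are already one-hot encoded (the example texts and variable names are illustrative):

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Illustrative setup: fit a tokenizer and one-hot encode a short sequence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(["hello how are you", "i am fine thanks"])

seq = tokenizer.texts_to_sequences(["how are you"])[0]
one_hot = to_categorical(seq, num_classes=len(tokenizer.word_index) + 1)

print(lookup(tokenizer, one_hot, returnIntNotWord=True))   # integer indices of the hot positions
print(lookup(tokenizer, one_hot, returnIntNotWord=False))  # (word, index) pairs from word_index

Note that word_index is 1-based while the sorted list inside lookup is 0-based, so depending on how the one-hot labels were built the (word, index) pairs may come back shifted by one position.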
Source: https://stackoverflow.com/questions/45773660/reverse-word-embeddings-in-keras-python