Error when checking input: expected embedding_1_input to have shape (4,) but got array with shape (1,)

混江龙づ霸主 提交于 2021-02-10 14:40:36

问题


I use pretrained embedding vectors for my keras model. Before I did it everything worked and now I get this error:

ValueError: Error when checking input: expected embedding_1_input to have shape (4,) but got array with shape (1,)

Maybe somebody can help me, what I do wrong here. I am not sure if I did correct model.fit and model.evaluate. Maybe there is a problem?

import csv
import numpy as np
np.random.seed(42)
from keras.models import Sequential, Model
from keras.layers import *
from random import shuffle
from sklearn.model_selection import train_test_split
from keras import optimizers
from keras.callbacks import EarlyStopping
from itertools import groupby
from numpy import asarray
from numpy import zeros 
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#function makes a list of antonyms and synonyms from the files
def preprocessing(filename):
    list_words = []
    with open(filename) as tsv:
       for line in csv.reader(tsv, dialect="excel-tab"):
           list_words.append([line[0], line[1]])
    return list_words

#function make a list of not relevant pairs by mixing synonyms and 
antonyms
def notrelevant(filename, filename2):
    list_words = []
    with open(filename) as tsv:
        with open(filename2) as tsv2:
           for lines in zip(csv.reader(tsv, dialect="excel-tab"),csv.reader(tsv2, dialect="excel-tab")):
                list_words.append([lines[0][0], lines[1][1]])
    return list_words

antonyms_list = preprocessing("antonyms.tsv")
synonyms_list = preprocessing("synonyms.tsv")
notrelevant_list = notrelevant("antonyms.tsv", "synonyms.tsv")

# function combines all antonyms, synonyms in one list with labels, 
shuffle them
def data_prepare(ant,syn,nrel):
        data = []
    for  elem1,elem2 in ant:
        data.append([[elem1,elem2], "Antonyms"])
    for elem1, elem2 in syn:
        data.append([[elem1, elem2], "Synonyms"])
    for elem1, elem2 in nrel:
        data.append([[elem1, elem2], "Not relevant"])
    shuffle(data)
    return data


data_with_labels_shuffled = 
data_prepare(antonyms_list,synonyms_list,notrelevant_list)

def label_to_onehot(labels):
    mapping = {label: i for i, label in enumerate(set(labels))}

    one_hot = np.empty((len(labels), 3))
    for i, label in enumerate(labels):
        entry = [0] * len(mapping)
        entry[mapping[label]] = 1
        one_hot[i] = entry
    return (one_hot)

def words_to_ids(labels):
    vocabulary = []
    word_to_id = {}
    ids = []
    for word1,word2 in labels:
        vocabulary.append(word1)
        vocabulary.append(word2)
    counter = 0
    for word in vocabulary:
        if word not in word_to_id:
            word_to_id[word] = counter
            counter += 1
    for word1,word2 in labels:
        ids.append([word_to_id [word1], word_to_id [word2]])
    return (ids)

def split_data(datas):
    data = np.array(datas)
    X, y = data[:, 0], data[:, 1]
    # split the data to get 60% train and 40% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
    y_train = label_to_onehot(y_train)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
    y_dev = label_to_onehot(y_dev)
    y_test = label_to_onehot(y_test)
    return X_train, y_train, X_dev, y_dev, X_test, y_test

X_train, y_train, X_dev, y_dev, X_test, y_test = split_data(data_with_labels_shuffled)

# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(X_train)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(X_train)


# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector



VOCABSIZE = len(data_with_labels_shuffled)
EMBSIZE = 50
HIDDENSIZE = 50
KERNELSIZE = 5
MAXEPOCHS = 5

model = Sequential()
model.add(Embedding(vocab_size, 50, weights=[embedding_matrix], 
input_length=4, trainable=False))
model.add(Dropout(0.25))
model.add(Bidirectional(GRU(units = HIDDENSIZE // 2)))
#model.add(Flatten())
model.add(Dense(units = 3, activation = "softmax"))
model.compile(loss='categorical_crossentropy', optimizer="adam", 
metrics=['accuracy'])


earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='min') 
model.fit (X_train, y_train,
       batch_size=64,
       callbacks = [earlystop],
       epochs=100,
       validation_data=(X_dev, y_dev),
       verbose=1)
scores = model.evaluate(X_test, y_testbatch_size=64)

print("Accuracy is: %.2f%%" %(scores[1] * 100))

回答1:


I think the problem is that you should pass encoded_docs to your model.fit() function instead of X_train since encoded_docs contains the tokenization of your training data and X_train still only contains a list of words. Moreover, you have to make sure that the input_length parameter of your Embedding layer matches the length of these tokenized training examples that you have created in encoded_docs.



来源:https://stackoverflow.com/questions/54332745/error-when-checking-input-expected-embedding-1-input-to-have-shape-4-but-got

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!