Items of feature_columns must be a _FeatureColumn

问题

I am getting this error:

ValueError: Items of feature_columns must be a _FeatureColumn. Given (type ): Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited'], dtype='object').

I am using tensorFlow lib. I want to get prediction results but I can not run m.train(input_fn=get_input_fn ,steps=5000) code. I always got the same error whatever I did. I used these input functions in the following but nothing changed.

def input_fn_train():
     x=tf.constant(df_train.astype(np.float64)),
     y=tf.constant(df_train[LABEL].astype(np.float64))
     return x, y

and

def get_input_fn(data_set, num_epochs=None, shuffle=False):
     return tf.estimator.inputs.pandas_input_fn(
      x=pd.DataFrame({k: data_set[k].values for k in data_set.columns}),
      y=pd.Series(data_set[LABEL].values), num_epochs=num_epochs,
                  shuffle=shuffle)

I can not understand what should I do. What the error is about? I've been googling but never found useful thing. How can I handle this error. The code is below. Thanks!

import pandas as pd
import tensorflow as tf
import numpy as np
import tempfile

COLS= ["RowNumber","CustomerId","Surname","CreditScore","Geography",
"Gender","Age","Tenure","Balance","NumOfProducts","HasCrCard",
"IsActiveMember","EstimatedSalary","Exited"]


FEATURES = ["CreditScore","Age","Tenure","Balance","NumOfProducts",
       "HasCrCard","IsActiveMember", "EstimatedSalary"]

LABEL="Exited"

df_train = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, 
header=0)
df_test = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, 
header=0)
test_label = df_test[LABEL].astype(float)
df_test.drop("Surname", axis = 1, inplace=True)
df_test.drop("RowNumber", axis = 1, inplace=True)
df_test.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("Surname", axis = 1, inplace=True)
df_train.drop("RowNumber", axis = 1, inplace=True)
df_train.drop("Geography", axis = 1, inplace=True)
df_train.drop("Gender", axis = 1, inplace=True)

def get_input_fn():
    return {'x': tf.constant(df_train[FEATURES].as_matrix(), tf.float32, 
           df_train.shape),
           'y': tf.constant(df_train[LABEL].as_matrix(), tf.float32, 
            df_train.shape)
           }

 df=df_train.select_dtypes(exclude=['object'])
 numeric_cols=df.columns

 m = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=
[numeric_cols])

 m.train(input_fn=get_input_fn ,steps=5000)
 results = m.evaluate(input_fn= get_input_fn(df_test, num_epochs=1, 
 shuffle=False),steps=None)

 y = m.predict(input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False))
 pred = list(y)

 rowNumber=0
 for i in pred:
     print(str(rowNumber)+': '+str(pred[i]))
     rowNumber=rowNumber+1

回答1:

Your first mistake is how you create tf.estimator.LinearClassifier. You're passing the dataframe index df.columns into feature_columns, but should pass the list of tensorflow feature columns. The columns should define if it's numerical or categorical and in the later case the encoding type.

Secondly, the input function can be simplified a lot, since you're reading pandas dataframe. Just use tf.estimator.inputs.pandas_input_fn.

Your .csv is most likely different, I've made a dummy one with some values. So here's a way to read the input and fit the model correctly:

import pandas as pd
import tensorflow as tf

FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", 
            "HasCrCard", "IsActiveMember", "EstimatedSalary", "Exited"]

credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
has_card = tf.feature_column.categorical_column_with_vocabulary_list("HasCrCard", ["True", "False"])
is_active_member = tf.feature_column.categorical_column_with_vocabulary_list("IsActiveMember", ["True", "False"])
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")
feature_columns = [credit_score, age, tenure, balance, num_of_products, has_card, is_active_member, estimated_salary]

def input_fn(num_epochs=None, shuffle=True, batch_size=100):
  df = pd.read_csv('Churn_Modelling.csv',
                   names=FEATURES,
                   dtype={'HasCrCard': str, 'IsActiveMember': str},
                   skipinitialspace=True,
                   header=0)
  df = df.dropna(how='any', axis=0)   # remove NaN elements
  labels = df["Exited"]
  return tf.estimator.inputs.pandas_input_fn(x=df,
                                             y=labels,
                                             batch_size=batch_size,
                                             num_epochs=num_epochs,
                                             shuffle=shuffle,
                                             num_threads=5)

model = tf.estimator.LinearClassifier(model_dir=None,
                                      feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)

回答2:

It is working clearly.

import pandas as pd
import tensorflow as tf
import tempfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score


def split_data(data, rate, label):
    data = data.dropna()

    train_data, test_data = train_test_split(data, test_size=rate)

    train_label = train_data[label]
    train_data = train_data.drop(label, 1)

    test_label = test_data[label]
    test_data = test_data.drop(label, 1)
    return train_data, train_label, test_data, test_label



LABEL = "Exited"

data = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True, 
    header=0)

data.drop("Surname", axis=1, inplace=True)
data.drop("RowNumber", axis=1, inplace=True)
data.drop("CustomerId", axis=1, inplace=True)
data.drop("Geography", axis=1, inplace=True)
data.drop("Gender", axis=1, inplace=True)
x_train, y_train, x_test, y_test = split_data(data, 0.20, LABEL)



def get_input_fn_train():
    input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x_train,
        y=y_train,
        shuffle=False
    )
    return input_fn

def get_input_fn_test():
    input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x_test,
        y=y_test,
        shuffle=False
    )
    return input_fn


feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn
(get_input_fn_train())


model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(model_dir=model_dir, 
feature_columns=feature_columns)

# train data
m.train(input_fn=get_input_fn_train(), steps=5000)

# you can get accuracy, accuracy_baseline, auc, auc_precision_recall, 
#average_loss, global_step, label/mean, lossprediction/mean

results = m.evaluate(input_fn=get_input_fn_test(), steps=None)

print("model directory = %s" % model_dir)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

# get prediction results
y = m.predict(input_fn=get_input_fn_test())
predictions = list(y)
pred1=pd.DataFrame(data=predictions)
prediction=pd.DataFrame(data=pred1['class_ids'])
pred=[]
for row in prediction["class_ids"]:
    pred.append(row[0])

rowNumber = 0
for i in pred:
    print(str(rowNumber) + ': ' + str(i))
    rowNumber = rowNumber + 1


def calculate(prediction, LABEL):
    arr = {"accuracy": accuracy_score(prediction, LABEL),
           "report": classification_report(prediction, LABEL),
           "Confusion_Matrix": confusion_matrix(prediction, LABEL),
           "F1 score": f1_score(prediction, LABEL),
           "Recall Score": recall_score(prediction, LABEL),
           "cohen_kappa": cohen_kappa_score(prediction, LABEL)
           }
    return arr


pred2 = pd.DataFrame(data=pred)

print(calculate(pred2.round(), y_test))

回答3:

I'm going to make some small changes to @Maxim's answer (thanks, btw) and post a minimum working example with random numpy data. This seems to run fine on my windows machine. Note the suppressed warning due to my particular hardware.

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
import tensorflow as tf

FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary", "Exited"]
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")
feature_columns = [credit_score, age, tenure, balance, num_of_products, estimated_salary]

def input_fn(num_epochs=None, shuffle=True, batch_size=100):
  N_features = len(FEATURES)
  print(N_features)
  N_examples = 5000
  X_train = np.random.rand(N_examples,N_features)
  Y_train  = np.random.rand(N_examples)
  columns = [str(i) for i in range(N_features)]
  columns = FEATURES
  df = pd.DataFrame(data = X_train, columns = columns)
  labels = df["Exited"]
  return tf.estimator.inputs.pandas_input_fn(x=df,
                                             y=labels,
                                             batch_size=batch_size,
                                             num_epochs=num_epochs,
                                             shuffle=shuffle,
                                             num_threads=5)

model = tf.estimator.LinearClassifier(model_dir='model_dir',
                                      feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)

来源：https://stackoverflow.com/questions/47197989/items-of-feature-columns-must-be-a-featurecolumn

标签

python

machine-learning

tensorflow

deep-learning

feature-selection