问题
I am trying to reproduce the TensorFlow tutorial code from here, which is supposed to download a CSV file and preprocess the data (up to combining the numerical columns together).
The reproducible example goes as follows:
import pandas as pd
import tensorflow as tf
# Report the TensorFlow release this example runs against.
print(f"TF version is: {tf.__version__}")
# Download data
train_url = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
test_url = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
# get_file fetches each URL under the given file name; the returned values
# are passed to get_dataset() as CSV paths.
train_path = tf.keras.utils.get_file("train.csv", train_url)
test_path = tf.keras.utils.get_file("test.csv", test_url)
# Get data into batched dataset
def get_dataset(path):
    """Build a batched dataset of (features, label) pairs from a CSV file.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``,
        with the ``survived`` column split off as the label.
    """
    return tf.data.experimental.make_csv_dataset(
        path,
        batch_size=5,            # five rows per batch
        num_epochs=1,            # a single pass over the file
        label_name='survived',   # column returned separately as the label
        na_value='?',            # extra string to treat as a missing value
        ignore_errors=True,
    )
# Build the batched train and eval datasets from the downloaded CSV files.
raw_train_dataset = get_dataset(train_path)
raw_test_dataset = get_dataset(test_path)
# Define numerical and categorical column lists
def get_df_batch(dataset):
    """Return the first batch of ``dataset`` as a pandas DataFrame.

    Args:
        dataset: Object whose ``take(1)`` yields ``(batch, label)`` pairs,
            where ``batch`` maps column names to values exposing
            ``.numpy()`` and ``label`` exposes ``.numpy()`` as well.

    Returns:
        A DataFrame with a ``survived`` column holding the label, followed
        by one column per feature in ``batch``. An empty DataFrame is
        returned when the dataset yields no batch.
    """
    # Created up front so an empty dataset still returns a DataFrame instead
    # of hitting an undefined local.
    df = pd.DataFrame()
    for batch, label in dataset.take(1):
        df['survived'] = label.numpy()
        for key, value in batch.items():
            df[key] = value.numpy()
    return df
# Materialise one batch as a DataFrame so column dtypes can be inspected.
dfb = get_df_batch(raw_train_dataset)
# Numeric features: every non-object column except the label.
num_columns = [col for col in dfb if col != 'survived' and dfb[col].dtype != 'O']
# Categorical features: the object-dtype columns.
cat_columns = [col for col in dfb if dfb[col].dtype == 'O']
# Combine numerical columns into one `numerics` column
class Pack:
    """Callable that merges the named feature columns into one tensor.

    Intended to be passed to ``Dataset.map``; each call receives the
    features mapping and the labels of one dataset element.
    """

    def __init__(self, names):
        """Remember the names of the columns to pack.

        Args:
            names: Iterable of feature keys to merge.
        """
        self.names = names

    def __call__(self, features, labels):
        """Replace the named columns with a single ``numerics`` entry.

        Args:
            features: Mutable mapping of column name to tensor. The named
                columns are removed from it and ``"numerics"`` is added.
            labels: Passed through unchanged.

        Returns:
            The ``(features, labels)`` pair, with ``features`` modified in
            place.
        """
        # Remove each column from the mapping and cast it to float32 so the
        # columns share one dtype before stacking.
        numeric = [tf.cast(features.pop(name), tf.float32) for name in self.names]
        # axis=1 places the columns side by side; for batched 1-D columns
        # this yields one row per example.
        features["numerics"] = tf.stack(numeric, axis=1)
        return features, labels
# Apply Pack to every dataset element so the numeric columns end up under
# the "numerics" key.
packed_train = raw_train_dataset.map(Pack(num_columns))
# Show what we got
def show_batch(dataset):
    """Print every feature of the first batch of ``dataset``.

    Args:
        dataset: Object whose ``take(1)`` yields ``(batch, label)`` pairs,
            where ``batch`` maps feature names to values exposing
            ``.numpy()``. The label is not printed.
    """
    for batch, _label in dataset.take(1):
        for key, value in batch.items():
            # Left-align the key in a 20-character field so values line up.
            print(f"{key:20s}: {value.numpy()}")
# Print the first batch of the packed dataset.
show_batch(packed_train)
TF version is: 2.0.0
sex : [b'female' b'female' b'male' b'male' b'male']
class : [b'Third' b'First' b'Second' b'First' b'Third']
deck : [b'unknown' b'E' b'unknown' b'C' b'unknown']
embark_town : [b'Queenstown' b'Cherbourg' b'Southampton' b'Cherbourg' b'Queenstown']
alone : [b'n' b'n' b'y' b'n' b'n']
numerics : [[ 28. 1. 0. 15.5 ]
[ 40. 1. 1. 134.5 ]
[ 32. 0. 0. 10.5 ]
[ 49. 1. 0. 89.1042]
[ 2. 4. 1. 29.125 ]]
Then I try, and fail, to combine the numeric features in a functional way:
@tf.function
def pack_func(row, num_columns=num_columns):
    # NOTE: this is the failing attempt the question is about; its logic is
    # kept unchanged on purpose. Dataset.map() passes the components of a
    # (features, labels) element as separate positional arguments, so `row`
    # receives only the features mapping (and `num_columns` receives the
    # labels). Unpacking that mapping into two names then raises
    # "ValueError: too many values to unpack (expected 2)".
    features, labels = row
    num_features = [features.pop(name) for name in num_columns]
    num_features = [tf.cast(feat, tf.float32) for feat in num_features]
    num_features = tf.stack(num_features, axis=1)
    features['numerics'] = num_features
    return features, labels
# This call raises the ValueError quoted in the partial traceback.
packed_train = raw_train_dataset.map(pack_func)
Partial traceback:
ValueError: in converted code:
    :3 pack_func  *
        features, labels = row
    ValueError: too many values to unpack (expected 2)
Two questions here:
1. How do `features` and `labels` get assigned in `def __call__(self, features, labels):` in the definition of class `Pack`? My intuition is that they should be passed in as defined variables, though I absolutely do not understand where they get defined.
2. When I do
# Inspect one element of the raw dataset: its type, its length, and the two
# components it unpacks into.
for row in raw_train_dataset.take(1):
    print(type(row))
    print(len(row))
    f, l = row
    print(f)
    print(l)
I see that each row in raw_train_dataset is a 2-tuple, which can be successfully unpacked into features and labels. Why can't this be done via the map API? Can you suggest the right way of combining the numerical features in a functional way?
Many thanks in advance!!!
回答1:
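After some research and trial, the answer to the second question seems to be the following. `Dataset.map` passes the components of each tuple element as separate positional arguments, so the mapped function has to accept `features` and `labels` as two parameters rather than a single `row`: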
def pack_func(features, labels, num_columns=num_columns):
    """Merge the numeric feature columns into a single ``numerics`` tensor.

    Written to be passed to ``Dataset.map``, which supplies the features
    mapping and the labels of each element as two positional arguments.

    Args:
        features: Mutable mapping of column name to tensor. The columns
            listed in ``num_columns`` are removed from it and ``'numerics'``
            is added.
        labels: Passed through unchanged.
        num_columns: Names of the columns to merge; defaults to the
            module-level ``num_columns`` list as it was when this function
            was defined.

    Returns:
        The ``(features, labels)`` pair, with ``features`` modified in place.
    """
    # Remove each column and cast it to float32 so the columns share one
    # dtype before stacking.
    numeric = [tf.cast(features.pop(name), tf.float32) for name in num_columns]
    # axis=1 places the columns side by side, one row per example.
    features['numerics'] = tf.stack(numeric, axis=1)
    return features, labels
# Map the two-argument pack_func over the dataset and print the first batch.
packed_train = raw_train_dataset.map(pack_func)
show_batch(packed_train)
sex : [b'male' b'male' b'male' b'female' b'male']
class : [b'Third' b'Third' b'Third' b'First' b'Third']
deck : [b'unknown' b'unknown' b'unknown' b'E' b'unknown']
embark_town : [b'Southampton' b'Southampton' b'Queenstown' b'Cherbourg' b'Queenstown']
alone : [b'y' b'n' b'n' b'n' b'y']
numerics : [[24. 0. 0. 8.05 ]
[14. 5. 2. 46.9 ]
[ 2. 4. 1. 29.125 ]
[39. 1. 1. 83.1583]
[21. 0. 0. 7.7333]]
来源:https://stackoverflow.com/questions/58841401/tensorflow-2-0-packing-numerical-features-of-a-dataset-together-in-a-functional