How can I declare that a given Column in my DataFrame contains categorical information?
I have a Spark SQL DataFrame which I loaded from a
Hey zero323 I used the same technique to look at the metadata and I coded up this Transformer.
def _transform(self, data):
maxValues = self.getOrDefault(self.maxValues)
categoricalCols = self.getOrDefault(self.categoricalCols)
new_schema = types.StructType(data.schema.fields)
new_data = data
for (col, maxVal) in zip(categoricalCols, maxValues):
# I have not decided if I should make a new column or
# overwrite the original column
new_col_name = col + "_categorical"
new_data = new_data.withColumn(new_col_name,
data[col].astype(types.DoubleType()))
# metadata for a categorical column
meta = {u'ml_attr' : {u'vals' : [unicode(i) for i in range(maxVal + 1)],
u'type' : u'nominal',
u'name' : new_col_name}}
new_schema.add(new_col_name, types.DoubleType(), True, meta)
return data.sql_ctx.createDataFrame(new_data.rdd, new_schema)