I have a DF with a huge parseable metadata as a single string column in a Dataframe, lets call it DFA, with ColmnA.
I would like to break this column, ColmnA into mu
I opted to create a function to flatten one column and then just call it simultaneously with the udf.
First define this:
implicit class DfOperations(df: DataFrame) {
def flattenColumn(col: String) = {
def addColumns(df: DataFrame, cols: Array[String]): DataFrame = {
if (cols.isEmpty) df
else addColumns(
df.withColumn(col + "_" + cols.head, df(col + "." + cols.head)),
cols.tail
)
}
val field = df.select(col).schema.fields(0)
val newCols = field.dataType.asInstanceOf[StructType].fields.map(x => x.name)
addColumns(df, newCols).drop(col)
}
def withColumnMany(colName: String, col: Column) = {
df.withColumn(colName, col).flattenColumn(colName)
}
}
Then usage is very simple:
case class MyClass(a: Int, b: Int)
val df = sc.parallelize(Seq(
(0),
(1)
)).toDF("x")
val f = udf((x: Int) => MyClass(x*2,x*3))
df.withColumnMany("test", f($"x")).show()
// +---+------+------+
// | x|test_a|test_b|
// +---+------+------+
// | 0| 0| 0|
// | 1| 2| 3|
// +---+------+------+