Group by column “grp” and compress DataFrame - (take last not null value for each column ordering by column “ord”)

血红的双手。 提交于 2019-12-04 09:46:30

I'd take the same approach as @LeoC, but I don't think there is any need to manipulate column names as strings, so I would go with a more Spark-SQL-like answer.

import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, first, last}

// In case there is more than one group column
val nonAggCols = Seq("grp")

// Per group, ordered by "ord": the frame runs from the current row to the end of the partition,
// so last(..., ignoreNulls = true) yields the last non-null value at or after the current row.
val win = Window.partitionBy(nonAggCols.map(col): _*).orderBy(col("ord")).rowsBetween(Window.currentRow, Window.unboundedFollowing)

// Select columns to aggregate on
val cols: Seq[String] = df.columns.diff(nonAggCols).toSeq

// Map over selection and apply fct
val aggregations: Seq[Column] = cols.map(c => first(col(c), ignoreNulls = true).as(c))

// Compute every windowed column in ONE select, so that each window is ordered by the original
// "ord". Chaining withColumn per column overwrites "ord" first, after which the remaining
// windows are ordered by a constant column and only work by accident of the sort order.
// I'd rather cache this step as it might get expensive.
val step1 = df.select(nonAggCols.map(col) ++ cols.map(c => last(col(c), ignoreNulls = true).over(win).as(c)): _*).cache

// Finally we can aggregate our results as follows: any non-null windowed value within a group
// is that group's last non-null value, so first(..., ignoreNulls = true) picks it.
val results = step1.groupBy(nonAggCols.head, nonAggCols.tail: _*).agg(aggregations.head, aggregations.tail: _*)

results.show
// +---+--------+---+----+----+
// |grp|null_col|ord|col1|col2|
// +---+--------+---+----+----+
// |  1|    null| 12| s13|  11|
// |  2|    null| 19| a23|  77|
// +---+--------+---+----+----+

I hope this helps.

EDIT: The reason you are not getting the same results is because the reader that you are using isn't correct.

It interprets null from the file as the string "null" rather than as a real null value, e.g.:

// Keep only the rows whose col1 is not SQL NULL; rows printed as "null" below still pass the filter.
scala> df.filter($"col1".isNotNull).show
// +---+--------+---+----+----+
// |grp|null_col|ord|col1|col2|
// +---+--------+---+----+----+
// |  1|    null|  3|null|  11|
// |  2|    null|  2| xxx|  22|
// |  1|    null|  1| yyy|null|
// |  2|    null|  7|null|  33|
// |  1|    null| 12|null|null|
// |  2|    null| 19|null|  77|
// |  1|    null| 10| s13|null|
// |  2|    null| 11| a23|null|
// +---+--------+---+----+----+

Here is my version of readSparkOutput :

/**
 * Reads a '|'-delimited text file (lines starting with '+' are treated as comments) into a
 * DataFrame, dropping the columns whose name starts with "_c" and turning the literal
 * string "null" into a real null.
 *
 * @param filePath path of the file to read
 * @return the parsed DataFrame with "null" strings replaced by null values
 */
def readSparkOutput(filePath: String): org.apache.spark.sql.DataFrame = {
  // Local imports keep the function self-contained.
  import org.apache.spark.sql.functions.{col, when}
  import org.apache.spark.sql.types.StringType

  val raw = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", "|")
    .option("parserLib", "UNIVOCITY")
    .option("ignoreLeadingWhiteSpace", "true")
    .option("ignoreTrailingWhiteSpace", "true")
    .option("comment", "+")
    .csv(filePath)

  // Drop the columns named "_c..." — presumably the unnamed fields produced by the leading
  // and trailing '|' of each line; verify against the input file.
  val named = raw.select(raw.columns.filterNot(_.startsWith("_c")).map(raw(_)): _*)

  // Only string columns can hold the text "null". Comparing a non-string column with the
  // literal relies on an implicit cast — NOTE(review): on Spark versions that cast the
  // literal to the column type the comparison is NULL and would blank the whole column.
  val stringCols = named.schema.fields.collect { case f if f.dataType == StringType => f.name }

  // when(...) without otherwise(...) yields null for the rows equal to "null".
  stringCols.foldLeft(named)((acc, c) => acc.withColumn(c, when(col(c) =!= "null", col(c))))
}

Consider the following approach that applies Window function last(c, ignoreNulls=true) ordered by "ord" per "grp" to each of the selected columns; followed by a groupBy("grp") to fetch the first agg(colFcnMap) result:

import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Sample input, one tuple per row as (grp, ord, col1, col2); None marks a missing value.
val sampleRows: Seq[(Int, Int, Option[String], Option[Int])] = Seq(
  (1, 3, None, Some(11)),
  (2, 2, Some("aaa"), Some(22)),
  (1, 1, Some("s12"), None),
  (2, 7, None, Some(33)),
  (1, 12, None, None),
  (2, 19, None, Some(77)),
  (1, 10, Some("s13"), None),
  (2, 11, Some("a23"), None)
)
val df0 = sampleRows.toDF("grp", "ord", "col1", "col2")

// Add a column that is null on every row.
val df = df0.withColumn("null_col", lit(null))

// Display the rows sorted by group, then by the ordering key.
df.sort("grp", "ord").show()
// +---+---+----+----+--------+
// |grp|ord|col1|col2|null_col|
// +---+---+----+----+--------+
// |  1|  1| s12|null|    null|
// |  1|  3|null|  11|    null|
// |  1| 10| s13|null|    null|
// |  1| 12|null|null|    null|
// |  2|  2| aaa|  22|    null|
// |  2|  7|null|  33|    null|
// |  2| 11| a23|null|    null|
// |  2| 19|null|  77|    null|
// +---+---+----+----+--------+

// Whole-partition frame: every row of a group sees the same, complete set of rows, so
// last(c, ignoreNulls = true) gives each row the group's last non-null value by "ord".
// That makes the plain first() used in the aggregation below independent of row order.
val win = Window.partitionBy("grp").orderBy("ord").
  rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

val nonAggCols = Array("grp")
val cols = df.columns.diff(nonAggCols)  // Columns to be aggregated

val colFcnMap = cols.map(_ -> "first").toMap
// colFcnMap: scala.collection.immutable.Map[String,String] =
//   Map(ord -> first, col1 -> first, col2 -> first, null_col -> first)

// Compute all windowed columns in a single select so each window is ordered by the original
// "ord"; a withColumn chain overwrites "ord" before the later columns are processed.
val windowed = df.select(
  nonAggCols.map(col) ++ cols.map(c => last(c, ignoreNulls = true).over(win).as(c)): _*
)

// Iterate over cols (not the Map) so the output keeps the DataFrame's column order
// regardless of how many columns there are.
val renamed = cols.map(c => col(s"${colFcnMap(c)}($c)").as(c))

windowed.
  groupBy("grp").agg(colFcnMap).
  select(col("grp") +: renamed: _*).
  show
// +---+---+----+----+--------+
// |grp|ord|col1|col2|null_col|
// +---+---+----+----+--------+
// |  1| 12| s13|  11|    null|
// |  2| 19| a23|  77|    null|
// +---+---+----+----+--------+

Note that the final select is for stripping the function name (in this case first()) from the aggregated column names.

I have worked something out; here is the code and its output:

import org.apache.spark.sql.functions._
import spark.implicits._

// Input rows as (grp, ord, col1, col2); an empty Option becomes a null cell.
val df0 = List(
  (1, 3, Option.empty[String], Option(11)),
  (2, 2, Option("aaa"), Option(22)),
  (1, 1, Option("s12"), Option.empty[Int]),
  (2, 7, Option.empty[String], Option(33)),
  (1, 12, Option.empty[String], Option.empty[Int]),
  (2, 19, Option.empty[String], Option(77)),
  (1, 10, Option("s13"), Option.empty[Int]),
  (2, 11, Option("a23"), Option.empty[Int])
).toDF("grp", "ord", "col1", "col2")

// Print the unsorted input.
df0.show()

//+---+---+----+----+
//|grp|ord|col1|col2|
//+---+---+----+----+
//|  1|  3|null|  11|
//|  2|  2| aaa|  22|
//|  1|  1| s12|null|
//|  2|  7|null|  33|
//|  1| 12|null|null|
//|  2| 19|null|  77|
//|  1| 10| s13|null|
//|  2| 11| a23|null|
//+---+---+----+----+

Ordering the data on first 2 columns

// Keep the four input columns and sort by group, then by the ordering key.
val df1 = df0.select($"grp", $"ord", $"col1", $"col2").sort($"grp", $"ord")

df1.show()

//+---+---+----+----+
//|grp|ord|col1|col2|
//+---+---+----+----+
//|  1|  1| s12|null|
//|  1|  3|null|  11|
//|  1| 10| s13|null|
//|  1| 12|null|null|
//|  2|  2| aaa|  22|
//|  2|  7|null|  33|
//|  2| 11| a23|null|
//|  2| 19|null|  77|
//+---+---+----+----+

// collect_set neither preserves row order nor keeps duplicates, so its "last" element is not
// guaranteed to be the last non-null value by "ord". Instead, collect (ord, value) pairs for
// the non-null values only, sort them by "ord", and keep just the values.
def orderedNonNull(c: String): org.apache.spark.sql.Column =
  sort_array(collect_list(when(col(c).isNotNull, struct(col("ord"), col(c))))).getField(c).alias(c)

// Last element of an array column, or null when the array is empty (group with no non-null value).
def lastOrNull(c: String): org.apache.spark.sql.Column =
  when(size(col(c)) > 0, col(c)(size(col(c)) - 1))

val df2 = df1.groupBy("grp").agg(max("ord").alias("ord"), orderedNonNull("col1"), orderedNonNull("col2"))

val df3 = df2.withColumn("new_col1", lastOrNull("col1")).withColumn("new_col2", lastOrNull("col2"))

df3.show()

//+---+---+----------+------------+--------+--------+
//|grp|ord|      col1|        col2|new_col1|new_col2|
//+---+---+----------+------------+--------+--------+
//|  1| 12|[s12, s13]|        [11]|     s13|      11|
//|  2| 19|[aaa, a23]|[22, 33, 77]|     a23|      77|
//+---+---+----------+------------+--------+--------+

You can drop the columns you don't need by using .drop("column_name")

So here we are grouping by a and selecting the max of all other columns in the group:

scala> val df = List((1,2,11), (1,1,1), (2,1,4), (2,3,5)).toDF("a", "b", "c")
df: org.apache.spark.sql.DataFrame = [a: int, b: int ... 1 more field]

//Build one max(...) aggregation per column other than the grouping key "a", aliased as max_<column>

scala> val aggCols = df.schema.map(_.name).filter(_ != "a").map(colName => max(col(colName)).alias(s"max_$colName"))
aggCols: Seq[org.apache.spark.sql.Column] = List(max(b) AS `max_b`, max(c) AS `max_c`)

scala> df.groupBy(col("a")).agg(aggCols.head, aggCols.tail: _*)
res0: org.apache.spark.sql.DataFrame = [a: int, max_b: int ... 1 more field]

Here is your answer (and hopefully my bounty!!!)

//Sample rows as (grp, null_col, ord, col1, col2); None becomes a null cell

scala> val df = Seq[(Int, Option[String], Int, Option[String], Option[Int])](
     |   (1, None, 3, None, Some(11)),
     |   (2, None, 2, Some("xxx"), Some(22)),
     |   (1, None, 1, Some("yyy"), None),
     |   (2, None, 7, None, Some(33)),
     |   (1, None, 12, None, None),
     |   (2, None, 19, None, Some(77)),
     |   (1, None, 10, Some("s13"), None),
     |   (2, None, 11, Some("a23"), None)).toDF("grp","null_col","ord","col1","col2")
df: org.apache.spark.sql.DataFrame = [grp: int, null_col: string ... 3 more fields]

scala> df.show
+---+--------+---+----+----+
|grp|null_col|ord|col1|col2|
+---+--------+---+----+----+
|  1|    null|  3|null|  11|
|  2|    null|  2| xxx|  22|
|  1|    null|  1| yyy|null|
|  2|    null|  7|null|  33|
|  1|    null| 12|null|null|
|  2|    null| 19|null|  77|
|  1|    null| 10| s13|null|
|  2|    null| 11| a23|null|
+---+--------+---+----+----+

//Create window specification

scala> import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.expressions.Window

scala> val win = Window.partitionBy("grp").orderBy($"ord".desc).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
win: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@71878833

//Apply first(..., ignoreNulls=true) over the window to all columns in one select and take distinct.
//The frame spans the whole partition, so every row of a group gets the same values and distinct
//leaves one row per group; a single select keeps each window ordered by the original "ord"
//(a withColumn chain would overwrite "ord" before col1 and col2 are processed).

scala> val result = df.select(df.columns.map(colName => first(colName, ignoreNulls=true).over(win).as(colName)): _*).distinct
result: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [grp: int, null_col: string ... 3 more fields]

scala> result.show
+---+--------+---+----+----+
|grp|null_col|ord|col1|col2|
+---+--------+---+----+----+
|  1|    null| 12| s13|  11|
|  2|    null| 19| a23|  77|
+---+--------+---+----+----+

Hope this helps.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!