How to get all columns after groupby on Dataset in spark sql 2.1.0

孤独总比滥情好 2020-12-29 11:13

First, I am very new to Spark.

I have millions of records in my Dataset and I want to group by the name column and find the maximum age for each name, while still keeping the other columns. I am gett…
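For reference, a plain groupBy/agg returns only the grouping key and the aggregated values, which is why the other columns disappear. A minimal sketch of the problem (assuming a spark-shell session where `spark.implicits._` is already in scope; the toy data is made up):

    import org.apache.spark.sql.functions.max

    // stand-in for the real Dataset
    val people = Seq(("Moe", 118), ("Moe", 115), ("Larry", 115))
      .toDF("name", "age")

    // only `name` and `maxAge` come back; every other column is dropped
    people.groupBy("name").agg(max("age").as("maxAge")).show()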

5 Answers
  •  鱼传尺愫
    2020-12-29 11:17

    You need to remember that aggregate functions reduce the rows, so you need to specify which row's age you want by means of a reducing function. If you want to retain all rows of a group (warning: this can cause explosions or skewed partitions), you can collect them as a list. You can then use a UDF (user-defined function) to reduce them down to one row by your criteria, in this example funniness_of_requisite, and then extract the columns of the reduced row with another UDF. For the purpose of this answer I assume you wish to retain the age of the person who has the max funniness_of_requisite.

    import org.apache.spark.sql._
    import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.types.IntegerType
    
    import scala.collection.mutable
    
    
    object TestJob4 {
    
      def main(args: Array[String]): Unit = {
    
        val sparkSession = SparkSession
          .builder()
          .appName(this.getClass.getName.replace("$", ""))
          .master("local")
          .getOrCreate()
    
        import sparkSession.implicits._
    
        val rawDf = Seq(
          (1, "Moe", "Slap", 7.9, 118),
          (2, "Larry", "Spank", 8.0, 115),
          (3, "Curly", "Twist", 6.0, 113),
          (4, "Laurel", "Whimper", 7.53, 119),
          (5, "Hardy", "Laugh", 6.0, 18),
          (6, "Charley", "Ignore", 9.7, 115),
          (2, "Moe", "Spank", 6.8, 118),
          (3, "Larry", "Twist", 6.0, 115),
          (3, "Charley", "fall", 9.0, 115)
        ).toDF("id", "name", "requisite", "funniness_of_requisite", "age")
    
        rawDf.show(false)
        rawDf.printSchema
    
        val rawSchema = rawDf.schema
    
        // UDF that collapses the collected rows of a group to the single row
        // with the highest funniness_of_requisite; it returns a whole row,
        // so its return type is the schema of rawDf
        val fUdf = udf(reduceByFunniness, rawSchema)
    
        // UDF that pulls the age field back out of the reduced row
        val ageUdf = udf(extractAge, IntegerType)
    
        val aggDf = rawDf
          .groupBy("name")
          .agg(
            count(struct("*")).as("count"),
            max(col("funniness_of_requisite")),
            collect_list(struct("*")).as("horizontal")
          )
          .withColumn("short", fUdf($"horizontal"))
          .withColumn("age", ageUdf($"short"))
          .drop("horizontal")
    
        aggDf.printSchema
        aggDf.show(false)
      }
    
      // pairwise-reduce the collected rows, keeping whichever row has the
      // larger funniness_of_requisite
      def reduceByFunniness = (x: Any) => {
    
        val d = x.asInstanceOf[mutable.WrappedArray[GenericRowWithSchema]]
    
        d.reduce((r1, r2) => {
          val funniness1 = r1.getAs[Double]("funniness_of_requisite")
          val funniness2 = r2.getAs[Double]("funniness_of_requisite")
    
          if (funniness1 >= funniness2) r1 else r2
        })
      }
    
      def extractAge = (x: Any) => {
        val d = x.asInstanceOf[GenericRowWithSchema]
        d.getAs[Int]("age")
      }
    }
    

    Here is the output:

    +-------+-----+---------------------------+-------------------------------+---+
    |name   |count|max(funniness_of_requisite)|short                          |age|
    +-------+-----+---------------------------+-------------------------------+---+
    |Hardy  |1    |6.0                        |[5, Hardy, Laugh, 6.0, 18]     |18 |
    |Moe    |2    |7.9                        |[1, Moe, Slap, 7.9, 118]       |118|
    |Curly  |1    |6.0                        |[3, Curly, Twist, 6.0, 113]    |113|
    |Larry  |2    |8.0                        |[2, Larry, Spank, 8.0, 115]    |115|
    |Laurel |1    |7.53                       |[4, Laurel, Whimper, 7.53, 119]|119|
    |Charley|2    |9.7                        |[6, Charley, Ignore, 9.7, 115] |115|
    +-------+-----+---------------------------+-------------------------------+---+
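
    A UDF-free sketch of the same idea (my variation, assuming the same rawDf as above): take max over a struct whose first field is the value you are maximising. Spark compares structs field by field, so the whole winning row survives the aggregation:

    val altDf = rawDf
      .groupBy("name")
      .agg(max(struct(
        // comparison key first; the remaining fields just ride along
        col("funniness_of_requisite"),
        col("id"),
        col("requisite"),
        col("age")
      )).as("best"))
      // expand the winning struct back into ordinary columns
      .select(col("name"), col("best.*"))

    altDf.show(false)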
    
