Flattening JSON into a Tabular Structure using Spark-Scala RDD-only functions

闹比i 2021-01-07 06:01

I have nested JSON and would like to have the output in a tabular structure. I am able to parse the JSON values individually, but I am having some problems tabularizing it.
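
For reference, a single line of the input file product_rdd.json presumably looks like the pretty-printed document below. The question text is truncated, so this shape is an assumption reconstructed from the field paths and output values in the accepted answer:

    {
      "level": {
        "productReference": { "prodID": "1234", "unitOfMeasure": "EA" },
        "states": [
          { "state": "SELL", "effectiveDateTime": "2015-10-09T00:55:23.6345Z",
            "stockQuantity": { "quantity": 1400.0, "stockKeepingLevel": "A" } },
          { "state": "HELD", "effectiveDateTime": "2015-10-09T00:55:23.6345Z",
            "stockQuantity": { "quantity": 800.0, "stockKeepingLevel": "B" } }
        ]
      }
    }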

3 Answers
  •  情歌与酒
    2021-01-07 06:17

    There are two versions of the solution to your question.

    Version 1:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types._
    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    def main(args: Array[String]): Unit = {

      val conf = new SparkConf().setAppName("JSON Read and Write using Spark RDD").setMaster("local[1]")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)

      // Every column is StringType, so the per-state lists below are
      // stored as their toString renderings, e.g. "List(SELL, HELD)".
      val salesSchema = StructType(Array(
        StructField("prodID", StringType, true),
        StructField("unitOfMeasure", StringType, true),
        StructField("state", StringType, true),
        StructField("effectiveDateTime", StringType, true),
        StructField("quantity", StringType, true),
        StructField("stockKeepingLevel", StringType, true)
      ))

      // One JSON document per line of the input file
      val readAllJsonMessagesRDD = sc.textFile("product_rdd.json")

      val rows = readAllJsonMessagesRDD.map(eachJsonMessage => {

        parse(eachJsonMessage)

      }).map(insideEachJson => {
        implicit val formats = org.json4s.DefaultFormats

        val prodID = (insideEachJson \ "level" \ "productReference" \ "prodID").extract[String]
        val unitOfMeasure = (insideEachJson \ "level" \ "productReference" \ "unitOfMeasure").extract[String]

        // Collect each field across all entries of the "states" array,
        // then render the resulting List as a String.
        val states = (insideEachJson \ "level" \ "states").extract[List[JValue]]
        val state = states.map(x => (x \ "state").extract[String]).toString
        val effectiveDateTime = states.map(x => (x \ "effectiveDateTime").extract[String]).toString
        val quantity = states.map(x => (x \ "stockQuantity" \ "quantity").extract[Double]).toString
        val stockKeepingLevel = states.map(x => (x \ "stockQuantity" \ "stockKeepingLevel").extract[String]).toString

        Row(prodID, unitOfMeasure, state, effectiveDateTime, quantity, stockKeepingLevel)

      })

      sqlContext.createDataFrame(rows, salesSchema).show(truncate = false)

    }
    

    This gives the following output; note that each multi-valued column is simply the toString rendering of a Scala List:

    +------+-------------+----------------+----------------------------------------------------------+-------------------+-----------------+
    |prodID|unitOfMeasure|state           |effectiveDateTime                                         |quantity           |stockKeepingLevel|
    +------+-------------+----------------+----------------------------------------------------------+-------------------+-----------------+
    |1234  |EA           |List(SELL, HELD)|List(2015-10-09T00:55:23.6345Z, 2015-10-09T00:55:23.6345Z)|List(1400.0, 800.0)|List(A, B)       |
    +------+-------------+----------------+----------------------------------------------------------+-------------------+-----------------+
    

    Version 2:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types._
    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    def main(args: Array[String]): Unit = {

      val conf = new SparkConf().setAppName("JSON Read and Write using Spark RDD").setMaster("local[1]")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)

      // The multi-valued columns are now typed as Arrays instead of
      // being flattened to Strings.
      val salesSchema = StructType(Array(
        StructField("prodID", StringType, true),
        StructField("unitOfMeasure", StringType, true),
        StructField("state", ArrayType(StringType, true), true),
        StructField("effectiveDateTime", ArrayType(StringType, true), true),
        StructField("quantity", ArrayType(DoubleType, true), true),
        StructField("stockKeepingLevel", ArrayType(StringType, true), true)
      ))

      val readAllJsonMessagesRDD = sc.textFile("product_rdd.json")

      val rows = readAllJsonMessagesRDD.map(eachJsonMessage => {

        parse(eachJsonMessage)

      }).map(insideEachJson => {
        implicit val formats = org.json4s.DefaultFormats

        val prodID = (insideEachJson \ "level" \ "productReference" \ "prodID").extract[String]
        val unitOfMeasure = (insideEachJson \ "level" \ "productReference" \ "unitOfMeasure").extract[String]

        // Keep the per-state values as Lists; they map directly onto
        // the ArrayType columns of the schema.
        val states = (insideEachJson \ "level" \ "states").extract[List[JValue]]
        val state = states.map(x => (x \ "state").extract[String])
        val effectiveDateTime = states.map(x => (x \ "effectiveDateTime").extract[String])
        val quantity = states.map(x => (x \ "stockQuantity" \ "quantity").extract[Double])
        val stockKeepingLevel = states.map(x => (x \ "stockQuantity" \ "stockKeepingLevel").extract[String])

        Row(prodID, unitOfMeasure, state, effectiveDateTime, quantity, stockKeepingLevel)

      })

      sqlContext.createDataFrame(rows, salesSchema).show(truncate = false)

    }
    

    This gives the following output:

    +------+-------------+------------+------------------------------------------------------+---------------+-----------------+
    |prodID|unitOfMeasure|state       |effectiveDateTime                                     |quantity       |stockKeepingLevel|
    +------+-------------+------------+------------------------------------------------------+---------------+-----------------+
    |1234  |EA           |[SELL, HELD]|[2015-10-09T00:55:23.6345Z, 2015-10-09T00:55:23.6345Z]|[1400.0, 800.0]|[A, B]           |
    +------+-------------+------------+------------------------------------------------------+---------------+-----------------+
    

    The difference between Version 1 and Version 2 is the schema. In Version 1 every column is extracted as a String (the per-state lists end up as toString renderings), whereas in Version 2 the multi-valued columns are typed as Arrays.
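
    If by "tabular" you want one fully flat row per state (no array columns and no stringified lists), a third variant can flatMap each parsed document into multiple Rows. This is not part of the original answer; it is a minimal sketch assuming the same input shape and the json4s/Spark setup used above:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types._
    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    def main(args: Array[String]): Unit = {

      val conf = new SparkConf().setAppName("JSON flatten per state").setMaster("local[1]")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)

      // One row per entry of the "states" array, so every column is scalar.
      val flatSchema = StructType(Array(
        StructField("prodID", StringType, true),
        StructField("unitOfMeasure", StringType, true),
        StructField("state", StringType, true),
        StructField("effectiveDateTime", StringType, true),
        StructField("quantity", DoubleType, true),
        StructField("stockKeepingLevel", StringType, true)
      ))

      val rows = sc.textFile("product_rdd.json")
        .map(line => parse(line))
        .flatMap { json =>
          implicit val formats = org.json4s.DefaultFormats

          val prodID = (json \ "level" \ "productReference" \ "prodID").extract[String]
          val unitOfMeasure = (json \ "level" \ "productReference" \ "unitOfMeasure").extract[String]

          // Emit one Row per state, repeating the product-level fields
          (json \ "level" \ "states").extract[List[JValue]].map { st =>
            Row(
              prodID,
              unitOfMeasure,
              (st \ "state").extract[String],
              (st \ "effectiveDateTime").extract[String],
              (st \ "stockQuantity" \ "quantity").extract[Double],
              (st \ "stockQuantity" \ "stockKeepingLevel").extract[String]
            )
          }
        }

      sqlContext.createDataFrame(rows, flatSchema).show(truncate = false)

    }

    With the sample input this would print two rows, one for SELL and one for HELD, with prodID and unitOfMeasure repeated on each.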
