Flattening JSON into a Tabular Structure using Spark-Scala RDD-only functions

闹比i 2021-01-07 06:01

I have nested JSON and would like to have the output in a tabular structure. I am able to parse the JSON values individually, but I am having some problems tabularizing it.
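
For reference, a single line of the input file product_rdd.json presumably looks like the pretty-printed document below. The question text is truncated, so this shape is an assumption reconstructed from the field paths and output values in the accepted answer:

    {
      "level": {
        "productReference": { "prodID": "1234", "unitOfMeasure": "EA" },
        "states": [
          { "state": "SELL", "effectiveDateTime": "2015-10-09T00:55:23.6345Z",
            "stockQuantity": { "quantity": 1400.0, "stockKeepingLevel": "A" } },
          { "state": "HELD", "effectiveDateTime": "2015-10-09T00:55:23.6345Z",
            "stockQuantity": { "quantity": 800.0, "stockKeepingLevel": "B" } }
        ]
      }
    }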

3 Answers
  •  情歌与酒
    2021-01-07 06:17

    There are two versions of the solution to your question.

    Version 1:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types._
    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    def main(args: Array[String]): Unit = {

      val conf = new SparkConf().setAppName("JSON Read and Write using Spark RDD").setMaster("local[1]")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)

      // Every column is StringType, so the per-state lists below are
      // stored as their toString renderings, e.g. "List(SELL, HELD)".
      val salesSchema = StructType(Array(
        StructField("prodID", StringType, true),
        StructField("unitOfMeasure", StringType, true),
        StructField("state", StringType, true),
        StructField("effectiveDateTime", StringType, true),
        StructField("quantity", StringType, true),
        StructField("stockKeepingLevel", StringType, true)
      ))

      // One JSON document per line of the input file
      val readAllJsonMessagesRDD = sc.textFile("product_rdd.json")

      val rows = readAllJsonMessagesRDD.map(eachJsonMessage => {

        parse(eachJsonMessage)

      }).map(insideEachJson => {
        implicit val formats = org.json4s.DefaultFormats

        val prodID = (insideEachJson \ "level" \ "productReference" \ "prodID").extract[String]
        val unitOfMeasure = (insideEachJson \ "level" \ "productReference" \ "unitOfMeasure").extract[String]

        // Collect each field across all entries of the "states" array,
        // then render the resulting List as a String.
        val states = (insideEachJson \ "level" \ "states").extract[List[JValue]]
        val state = states.map(x => (x \ "state").extract[String]).toString
        val effectiveDateTime = states.map(x => (x \ "effectiveDateTime").extract[String]).toString
        val quantity = states.map(x => (x \ "stockQuantity" \ "quantity").extract[Double]).toString
        val stockKeepingLevel = states.map(x => (x \ "stockQuantity" \ "stockKeepingLevel").extract[String]).toString

        Row(prodID, unitOfMeasure, state, effectiveDateTime, quantity, stockKeepingLevel)

      })

      sqlContext.createDataFrame(rows, salesSchema).show(truncate = false)

    }
    

    This gives the following output; note that each multi-valued column is simply the toString rendering of a Scala List:

    +------+-------------+----------------+----------------------------------------------------------+-------------------+-----------------+
    |prodID|unitOfMeasure|state           |effectiveDateTime                                         |quantity           |stockKeepingLevel|
    +------+-------------+----------------+----------------------------------------------------------+-------------------+-----------------+
    |1234  |EA           |List(SELL, HELD)|List(2015-10-09T00:55:23.6345Z, 2015-10-09T00:55:23.6345Z)|List(1400.0, 800.0)|List(A, B)       |
    +------+-------------+----------------+----------------------------------------------------------+-------------------+-----------------+
    

    Version 2:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types._
    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    def main(args: Array[String]): Unit = {

      val conf = new SparkConf().setAppName("JSON Read and Write using Spark RDD").setMaster("local[1]")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)

      // The multi-valued columns are now typed as Arrays instead of
      // being flattened to Strings.
      val salesSchema = StructType(Array(
        StructField("prodID", StringType, true),
        StructField("unitOfMeasure", StringType, true),
        StructField("state", ArrayType(StringType, true), true),
        StructField("effectiveDateTime", ArrayType(StringType, true), true),
        StructField("quantity", ArrayType(DoubleType, true), true),
        StructField("stockKeepingLevel", ArrayType(StringType, true), true)
      ))

      val readAllJsonMessagesRDD = sc.textFile("product_rdd.json")

      val rows = readAllJsonMessagesRDD.map(eachJsonMessage => {

        parse(eachJsonMessage)

      }).map(insideEachJson => {
        implicit val formats = org.json4s.DefaultFormats

        val prodID = (insideEachJson \ "level" \ "productReference" \ "prodID").extract[String]
        val unitOfMeasure = (insideEachJson \ "level" \ "productReference" \ "unitOfMeasure").extract[String]

        // Keep the per-state values as Lists; they map directly onto
        // the ArrayType columns of the schema.
        val states = (insideEachJson \ "level" \ "states").extract[List[JValue]]
        val state = states.map(x => (x \ "state").extract[String])
        val effectiveDateTime = states.map(x => (x \ "effectiveDateTime").extract[String])
        val quantity = states.map(x => (x \ "stockQuantity" \ "quantity").extract[Double])
        val stockKeepingLevel = states.map(x => (x \ "stockQuantity" \ "stockKeepingLevel").extract[String])

        Row(prodID, unitOfMeasure, state, effectiveDateTime, quantity, stockKeepingLevel)

      })

      sqlContext.createDataFrame(rows, salesSchema).show(truncate = false)

    }
    

    This gives the following output:

    +------+-------------+------------+------------------------------------------------------+---------------+-----------------+
    |prodID|unitOfMeasure|state       |effectiveDateTime                                     |quantity       |stockKeepingLevel|
    +------+-------------+------------+------------------------------------------------------+---------------+-----------------+
    |1234  |EA           |[SELL, HELD]|[2015-10-09T00:55:23.6345Z, 2015-10-09T00:55:23.6345Z]|[1400.0, 800.0]|[A, B]           |
    +------+-------------+------------+------------------------------------------------------+---------------+-----------------+
    

    The difference between Version 1 and Version 2 is the schema. In Version 1 every column is extracted as a String (the per-state lists end up as toString renderings), whereas in Version 2 the multi-valued columns are typed as Arrays.
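
    If by "tabular" you want one fully flat row per state (no array columns and no stringified lists), a third variant can flatMap each parsed document into multiple Rows. This is not part of the original answer; it is a minimal sketch assuming the same input shape and the json4s/Spark setup used above:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.{Row, SQLContext}
    import org.apache.spark.sql.types._
    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    def main(args: Array[String]): Unit = {

      val conf = new SparkConf().setAppName("JSON flatten per state").setMaster("local[1]")
      val sc = new SparkContext(conf)
      val sqlContext = new SQLContext(sc)

      // One row per entry of the "states" array, so every column is scalar.
      val flatSchema = StructType(Array(
        StructField("prodID", StringType, true),
        StructField("unitOfMeasure", StringType, true),
        StructField("state", StringType, true),
        StructField("effectiveDateTime", StringType, true),
        StructField("quantity", DoubleType, true),
        StructField("stockKeepingLevel", StringType, true)
      ))

      val rows = sc.textFile("product_rdd.json")
        .map(line => parse(line))
        .flatMap { json =>
          implicit val formats = org.json4s.DefaultFormats

          val prodID = (json \ "level" \ "productReference" \ "prodID").extract[String]
          val unitOfMeasure = (json \ "level" \ "productReference" \ "unitOfMeasure").extract[String]

          // Emit one Row per state, repeating the product-level fields
          (json \ "level" \ "states").extract[List[JValue]].map { st =>
            Row(
              prodID,
              unitOfMeasure,
              (st \ "state").extract[String],
              (st \ "effectiveDateTime").extract[String],
              (st \ "stockQuantity" \ "quantity").extract[Double],
              (st \ "stockQuantity" \ "stockKeepingLevel").extract[String]
            )
          }
        }

      sqlContext.createDataFrame(rows, flatSchema).show(truncate = false)

    }

    With the sample input this would print two rows, one for SELL and one for HELD, with prodID and unitOfMeasure repeated on each.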
