Filter a Spark DataFrame by a row field that is an array of strings

后端 未结 2 1376
遥遥无期
遥遥无期 2021-01-03 23:57

Using Spark 1.5 and Scala 2.10.6

I'm trying to filter a dataframe via a field "tags" that is an array of strings. Looking for all rows that have the tag 'private'…

2条回答
  •  暗喜
    暗喜 (楼主)
    2021-01-04 00:23

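    You can use an ordinal index to refer to an element of the JSON array, e.g. in your case df("tags")(0). Note that this only inspects the first element of the array; to match rows where the tag appears at any position, use array_contains(df("tags"), "private") from org.apache.spark.sql.functions (available since Spark 1.5). Here is a working sample: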

    scala> val stringRDD = sc.parallelize(Seq("""
         |       { "name": "ed",
         |         "tags": ["private"]
         |       }""",
         |       """{ "name": "fred",
         |         "tags": ["public"]
         |       }""")
         |     )
    stringRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[87] at parallelize at <console>:22
    
    scala> import sqlContext.implicits._
    import sqlContext.implicits._
    
    scala> sqlContext.read.json(stringRDD)
    res28: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> val df=sqlContext.read.json(stringRDD)
    df: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> df.columns
    res29: Array[String] = Array(name, tags)
    
    scala> df.dtypes
    res30: Array[(String, String)] = Array((name,StringType), (tags,ArrayType(StringType,true)))
    
    scala> val report = df.select("*").where(df("tags")(0).contains("private"))
    report: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
    
    scala> report.show
    +----+-------------+
    |name|         tags|
    +----+-------------+
    |  ed|List(private)|
    +----+-------------+
    

提交回复
热议问题