Using Spark 1.5 and Scala 2.10.6
I'm trying to filter a DataFrame on a field "tags" that is an array of strings, looking for all rows whose tags contain 'private'.
You can refer to an element of the JSON array by ordinal, e.g. df("tags")(0) for the first element. Here is a working sample:
scala> val stringRDD = sc.parallelize(Seq("""
| { "name": "ed",
| "tags": ["private"]
| }""",
| """{ "name": "fred",
| "tags": ["public"]
| }""")
| )
stringRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[87] at parallelize at <console>:22
scala> import sqlContext.implicits._
import sqlContext.implicits._
scala> sqlContext.read.json(stringRDD)
res28: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
scala> val df=sqlContext.read.json(stringRDD)
df: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
scala> df.columns
res29: Array[String] = Array(name, tags)
scala> df.dtypes
res30: Array[(String, String)] = Array((name,StringType), (tags,ArrayType(StringType,true)))
scala> val report = df.select("*").where(df("tags")(0).contains("private"))
report: org.apache.spark.sql.DataFrame = [name: string, tags: array<string>]
scala> report.show
+----+-------------+
|name| tags|
+----+-------------+
| ed|List(private)|
+----+-------------+
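Note that df("tags")(0) only inspects the first element of the array. If 'private' can appear at any position in tags, a sketch using array_contains (available in org.apache.spark.sql.functions since Spark 1.5) on the same df should handle that; the name report2 is just for illustration:

import org.apache.spark.sql.functions.array_contains

// array_contains matches the value at any position in the array,
// not just element 0
val report2 = df.where(array_contains(df("tags"), "private"))
report2.show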