Error with spark Row.fromSeq for a text file

Posted by 久未见 on 2021-02-19 08:25:07

Question


import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

object fixedLength {

  def main(args: Array[String]) {

    def getRow(x: String): Row = {
      val columnArray = new Array[String](4)
      columnArray(0) = x.substring(0, 3)
      columnArray(1) = x.substring(3, 13)
      columnArray(2) = x.substring(13, 18)
      columnArray(3) = x.substring(18, 22)
      Row.fromSeq(columnArray)
    }

    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()


    val conf = new SparkConf().setAppName("FixedLength").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
    val sc = new SparkContext(conf)
    val fruits = sc.textFile("in/fruits.txt")

    val schemaString = "id,fruitName,isAvailable,unitPrice"
    val fields = schemaString.split(",").map(field => StructField(field, StringType, nullable = true))
    val schema = StructType(fields)

    val df = spark.createDataFrame(fruits.map { x => getRow(x) }, schema)
    df.show() // Error
    println("End of the program")
  }
}

I'm getting an error on the df.show() call. My file content is:

56 apple     TRUE 0.56
45 pear      FALSE1.34
34 raspberry TRUE 2.43
34 plum      TRUE 1.31
53 cherry    TRUE 1.4 
23 orange    FALSE2.34
56 persimmon FALSE23.2

ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to [B
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:81)

Can you please help?


Answer 1:


You are creating the RDD the old way, via SparkContext(conf):

val conf = new SparkConf().setAppName("FixedLength").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
val fruits = sc.textFile("in/fruits.txt")

whereas you are creating the DataFrame the new way, using SparkSession:

val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()
val df = spark.createDataFrame(fruits.map { x => getRow(x) }, schema)

Ultimately, you are mixing an RDD created with the old, standalone SparkContext and a DataFrame created through the new SparkSession, so the job ends up spanning two different contexts.
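
For illustration, a quick way to see the mismatch in this program (a hypothetical check, reusing the variable names spark and sc from above):

println(spark.sparkContext eq sc) // false: two separate live SparkContexts

The RDD fruits belongs to sc, while createDataFrame executes under spark's own context, which is consistent with the broadcast-related ClassCastException in the stack trace.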

I would suggest you use only one of them.

I suspect that's the reason for the issue.

Update

Doing the following should work for you:

def getRow(x: String): Row = {
  val columnArray = new Array[String](4)
  columnArray(0) = x.substring(0, 3)
  columnArray(1) = x.substring(3, 13)
  columnArray(2) = x.substring(13, 18)
  columnArray(3) = x.substring(18, 22)
  Row.fromSeq(columnArray)
}

Logger.getLogger("org").setLevel(Level.ERROR)

val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()

val fruits = spark.sparkContext.textFile("in/fruits.txt")

val schemaString = "id,fruitName,isAvailable,unitPrice"
val fields = schemaString.split(",").map(field => StructField(field, StringType, nullable = true))
val schema = StructType(fields)

val df = spark.createDataFrame(fruits.map { x => getRow(x) }, schema)
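
With a single context the DataFrame should now materialize. As a sanity check (same variable names as above; the expected output is described from the schema, not from an actual run):

df.printSchema() // four nullable StringType fields: id, fruitName, isAvailable, unitPrice
df.show()        // renders the seven fixed-width rows without the ClassCastException

Because each column is parsed as a raw substring, values keep their padding spaces; trimming them (for example with the trim function from org.apache.spark.sql.functions) is a sensible follow-up if typed columns are needed.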


Source: https://stackoverflow.com/questions/49069720/error-with-spark-row-fromseq-for-a-text-file
