Question
I am trying to build a simple TF-IDF vectorizer in Spark and compile it to a jar so I can test it locally. However, I keep getting "No TypeTag available for (Int, String)". Here is my code:
package com.valiant.ml

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer, StopWordsRemover}

object TextClassification {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("TextClassification")
      .getOrCreate()

    val sentenceData = spark.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat")
    )).toDF("label", "text")

    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)

    val remover = new StopWordsRemover()
      .setInputCol("words")
      .setOutputCol("filtered_words")
    val filteredWordsData = remover.transform(wordsData)

    val hashingTF = new HashingTF()
      .setInputCol("filtered_words")
      .setOutputCol("rawFeatures")
      .setNumFeatures(20)
    val featurizedData = hashingTF.transform(filteredWordsData)
    // alternatively, CountVectorizer can also be used to get term frequency vectors
    // (see the sketch after this block)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("features", "label").take(3).foreach(println)

    spark.stop()
  }
}
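The comment in the code above mentions CountVectorizer as an alternative way to get term frequency vectors. Here is a minimal sketch of that variant, assuming the same "filtered_words" column produced by StopWordsRemover; the variable names (countVectorizer, cvModel, countFeaturizedData) are illustrative and not part of the original program. Unlike HashingTF, CountVectorizer is an Estimator, so it has to be fit before it can transform:

import org.apache.spark.ml.feature.CountVectorizer

// learn a vocabulary from the filtered tokens; setVocabSize caps the
// vocabulary, roughly analogous to setNumFeatures(20) on HashingTF
val countVectorizer = new CountVectorizer()
  .setInputCol("filtered_words")
  .setOutputCol("rawFeatures")
  .setVocabSize(20)

val cvModel = countVectorizer.fit(filteredWordsData)
val countFeaturizedData = cvModel.transform(filteredWordsData)
// countFeaturizedData could then be passed to IDF exactly as featurizedData is above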
Here is the build file:
name := "text-classification"
version := "0.0.1"
scalaVersion := "2.11.8"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.0.0" % "provided",
  "org.apache.spark" %% "spark-sql" % "2.0.0",
  "org.apache.spark" %% "spark-mllib" % "2.0.0",
  "org.apache.spark" %% "spark-catalyst" % "2.0.0"
)
Here is the top of my error report:
[info] Set current project to text-classification (in build file:/arete/repos/ml/classification/text/)
[info] Compiling 1 Scala source to /arete/repos/ml/classification/text/target/scala-2.11/classes...
[error] /arete/repos/ml/classification/text/text.scala:15: No TypeTag available for (Int, String)
[error]     val sentenceData = spark.createDataFrame(Seq(
[error]     ^
[error]
[error]      while compiling: /arete/repos/ml/classification/text/text.scala
[error]         during phase: typer
[error]      library version: version 2.10.4
[error]     compiler version: version 2.10.4
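One thing that stands out in this output: the compiler and library versions are reported as 2.10.4, even though the build file above sets scalaVersion := "2.11.8", so sbt appears to be compiling against a different Scala version than the one declared. For reference, a sketch of the same build file with the Spark version factored into a single value so every Spark module resolves against the same Scala 2.11 binary version; this is only how I would expect the dependencies to line up, not a confirmed fix:

name := "text-classification"

version := "0.0.1"

scalaVersion := "2.11.8"

// single source of truth for the Spark version used by all modules below
val sparkVersion = "2.0.0"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"     % sparkVersion % "provided",
  "org.apache.spark" %% "spark-sql"      % sparkVersion,
  "org.apache.spark" %% "spark-mllib"    % sparkVersion,
  "org.apache.spark" %% "spark-catalyst" % sparkVersion
)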
Source: https://stackoverflow.com/questions/38761672/spark-sbt-package-no-typetag-available