import java.util.Arrays
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{DataFrame, Row, SparkSession, functions}
import org.apache.spark.sql.functions.{col, desc, length, row_number, trim, when}
import org.apache.spark.sql.functions.{countDistinct, sum, count, avg}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SaveMode
object WordCount {

  // Wraps SparkSession initialization and test-data loading so the later examples can reuse it
  def initSparkAndData(): DataFrame = {
    val sparkSession = SparkSession.builder().master("local").appName("AppName").getOrCreate()
    val javasc = new JavaSparkContext(sparkSession.sparkContext)
    // Three JSON strings used as sample data
    val nameRDD = javasc.parallelize(Arrays.asList(
      "{'name':'wangwu','age':'18','vip':'t'}",
      "{'name':'sunliu','age':'19','vip':'t'}",
      "{'name':'zhangsan','age':'20','vip':'f'}"))
    val namedf = sparkSession.read.json(nameRDD)
    namedf
  }

  def main(args: Array[String]): Unit = {
    val data = initSparkAndData()
  }
}
Above, the SparkSession initialization and the data loading are wrapped into a single method, so the examples that follow can simply call it.
select, filter, groupBy, sum
val data = initSparkAndData()
val simpleoption = data.select(col("name"), col("age"), col("vip"))
  .filter(col("name") =!= "zhangsan" && col("vip") === "t") // effectively filters out zhangsan; mainly here to show === and =!=
  .groupBy(col("vip"))                                      // same as a SQL GROUP BY clause
  .agg(sum(col("age")) as "sumage")                         // same as a SQL SUM
simpleoption.show(100)                                      // prints the two columns vip and sumage
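Since the comments above map each DataFrame operation to its SQL counterpart, the same pipeline can also be written as a plain SQL query against a temporary view. A minimal sketch, assuming an arbitrarily chosen view name people:

data.createOrReplaceTempView("people")   // "people" is an assumed view name for this sketch
data.sparkSession.sql(
  """SELECT vip, SUM(age) AS sumage
    |FROM people
    |WHERE name != 'zhangsan' AND vip = 't'
    |GROUP BY vip""".stripMargin)
  .show(100)

Both versions group the remaining rows by vip and sum their age values, so they print the same vip/sumage result.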
Parameter list
The backslash (\) is the Linux line-continuation character; it just makes the parameters easier to read.
spark-submit \
--class WordCount
--class specifies the class that contains your main function; in this example that is the WordCount object defined above.
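One thing worth noting when submitting the example above: settings made directly in code, such as the hard-coded .master("local"), take precedence over the corresponding spark-submit flags. A minimal sketch of a builder that leaves the master to spark-submit instead:

// Leave the master unset in code so that the --master flag passed to spark-submit takes effect
val sparkSession = SparkSession.builder()
  .appName("AppName")
  .getOrCreate()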
Reduce phase
set mapreduce.job.running.reduce.limit=80; limits how many reduce tasks run concurrently (for example, a job with 100 reduces can be made to run them in batches of 10 by setting this limit to 10).
Merging files
Hive merges small files by launching a separate job just to do the merge. This parameter does not feel very worthwhile: the time spent on the merge job would be better spent writing properly sized output in the first place (this applies to both the map and the reduce stages).
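For a Spark job like the example above, "writing the output directly" usually means reducing the number of output partitions before the write, so no separate merge job is needed afterwards. A minimal sketch, assuming a hypothetical output path /tmp/people_out:

// coalesce(1) shrinks the result to a single partition so the write produces one file
// instead of many small ones; the output path is only a placeholder for this sketch
data.coalesce(1)
  .write
  .mode(SaveMode.Overwrite)
  .json("/tmp/people_out")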