I. Basic Spark SQL Operations
1. Build a SparkSession
import org.apache.spark.sql.SparkSession

val spark: SparkSession = SparkSession
  .builder()
  .enableHiveSupport()
  .config(initConf()) // initConf() is assumed to be a user-defined method that returns a SparkConf
  .getOrCreate()
2. Stop the SparkSession
spark.stop()
3. Read a Parquet file
val df1 = spark.read.parquet("hdfs://localhost:8020/tmp/testdata.parquet")
4. Write a Parquet file
import org.apache.spark.sql.SaveMode
df1.write.mode(SaveMode.Overwrite).parquet("hdfs://localhost:8020/tmp/testdata_out.parquet")
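If the output should be laid out by a partition column, DataFrameWriter also supports partitionBy. A minimal sketch, assuming df1 actually contains a column named col_A:
// Write one Parquet subdirectory per distinct value of col_A (col_A is an assumed column name)
df1.write.mode(SaveMode.Overwrite).partitionBy("col_A").parquet("hdfs://localhost:8020/tmp/testdata_out.parquet")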
5. View the data
// Show 100 rows without truncating column contents
df1.show(100, false)
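Two other inspection calls that are often used alongside show():
// Print the column names and types
df1.printSchema()
// Count the total number of rows (this triggers a job)
df1.count()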
II. Querying Data with SQL
1. Read a Parquet file and register a temporary view
val df1 = spark.read.parquet("hdfs://localhost:8020/tmp/testdata.parquet")
df1.createOrReplaceTempView("table_df1")
2. Query data with SQL
COUNT(1) is used as an example here; other SQL statements work the same way.
val res = spark.sql("select count(1) from table_df1")
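The result of spark.sql is itself a DataFrame. If you just want the count as a plain Scala value, one way (a sketch) is to read it out of the first row:
// count(1) comes back as a bigint, so read it as a Long
val cnt: Long = res.first().getLong(0)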
III. Operating on Data with DataFrame Operators
1. Read a Parquet file and select the target columns
val df1 = spark.read.parquet("hdfs://localhost:8020/tmp/testdata.parquet")
val df2 = df1.select("col_A", "col_B")
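select also has a SQL-flavored variant, selectExpr, which accepts expression strings instead of plain column names. A small sketch; df2b and col_AB are illustrative names only:
val df2b = df1.selectExpr("col_A", "col_A + col_B as col_AB")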
2. Add a column col_C whose value is col_A + col_B
import org.apache.spark.sql.functions.col

val df3 = df2.withColumn("col_C", col("col_A") + col("col_B"))
3. Add a column col_C with a default value of 0
import org.apache.spark.sql.functions.lit

val df3 = df2.withColumn("col_C", lit(0))
4. Define a user-defined function (UDF)
import org.apache.spark.sql.functions.udf

// Map a Double value to a range label
val udf_make_tag = udf((x: Double) => {
  if (x < 0.5) "0~0.5"
  else if (x >= 0.5 && x < 1) "0.5~1"
  else if (x >= 1 && x < 1.5) "1~1.5"
  else if (x >= 1.5 && x < 2) "1.5~2"
  else ">2"
})
5. Call the UDF
val df3 = df2.withColumn("col_C", udf_make_tag(col("col_B")))
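A UDF can also be registered under a name so that it is callable from SQL on the temp view from section II. A sketch, assuming table_df1 is still registered in the same session; the name make_tag is arbitrary:
// Register the same mapping function for use in SQL statements
spark.udf.register("make_tag", (x: Double) => {
  if (x < 0.5) "0~0.5"
  else if (x < 1) "0.5~1"
  else if (x < 1.5) "1~1.5"
  else if (x < 2) "1.5~2"
  else ">2"
})
val df4 = spark.sql("select col_A, make_tag(col_B) as col_C from table_df1")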
A complete program
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SaveMode

object Info {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("yarn").appName("testApp").getOrCreate()
    val df1 = spark.read.parquet("hdfs://localhost:8020/tmp/testdata.parquet")
    df1.createOrReplaceTempView("table_df1")
    val res = spark.sql(
      """
        |select colA, count(1)
        |from table_df1
        |group by colA
      """.stripMargin)
    res.show()
    res.write.mode(SaveMode.Overwrite).parquet("hdfs://localhost:8020/tmp/testdata_out.parquet")
    spark.stop()
  }
}
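To actually run this on a cluster, the program would typically be packaged into a jar and launched with spark-submit; roughly (the jar path is a placeholder):
spark-submit \
  --class Info \
  --master yarn \
  /path/to/your-app.jar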