Refer to the official documentation:
http://spark.apache.org/docs/2.2.0/sql-programming-guide.html
How to read external data
Read: spark.read.format(format)
Supported data formats
Built-in: json, parquet, jdbc, csv (built in since 2.x)
External: see https://spark-packages.org/ , which provides many third-party data source packages
Write: people.write.format("parquet").save("path")
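A minimal sketch of the generic read/write API (assuming an existing SparkSession named spark; the paths below are placeholders):

// Read: pick a format explicitly, then load from a path
val peopleDF = spark.read.format("json").load("file:///path/to/people.json")

// Write: pick an output format and save to a directory that does not exist yet
peopleDF.write.format("parquet").save("file:///path/to/output")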
Working with Parquet files
I tested the code locally here, after downloading the sample file from the server's Spark directory to my machine:
/home/hadoop/app/spark-2.2.0-bin-hadoop2.6/examples/src/main/resources/users.parquet
The code is as follows:
package com.yy.spark

import org.apache.spark.sql.SparkSession

/**
 * Read a Parquet file
 */
object ParquetApp extends App {

  val path = "file:///D:\\data\\users.parquet"

  val spark = SparkSession.builder().appName("ParquetApp").master("local[2]").getOrCreate()

  // Standard way
  val userDF = spark.read.format("parquet").load(path)

  // Shorthand: parquet is the default format
  val userDF2 = spark.read.load(path)

  // Another way
  val userDF3 = spark.read.option("path", path).load()

  userDF.printSchema()
  userDF.show()
  userDF.select("name", "favorite_color").show()

  // Write out as JSON; note: the target folder must not already exist
  userDF.select("name", "favorite_color").write.format("json").save("file:///D:\\data\\output")

  spark.stop()
}
Processing with SQL
Start the spark-sql console on the server:
$ ./spark-sql --master local[2] --jars ~/software/mysql-connector-java-5.1.45.jar
Run the following statements:
CREATE TEMPORARY VIEW parquetTable
USING org.apache.spark.sql.parquet
OPTIONS (
  path "/home/hadoop/app/spark-2.2.0-bin-hadoop2.6/examples/src/main/resources/users.parquet"
);

SELECT * FROM parquetTable;
Working with Hive table data
Start the spark-shell console on the server
Note: start the Hadoop environment before launching
$ ./spark-shell --master local[2] --jars ~/software/mysql-connector-java-5.1.45.jar
Read table data
spark.table(tableName)
Example:
scala> spark.table("emp").show
Write table data
df.write.saveAsTable(tableName)
Example:
scala> spark.sql("select deptno,count(1) as count from emp group by deptno").write.saveAsTable("emp_count")
Note: count(1) must be given an alias, otherwise the following error is thrown:
org.apache.spark.sql.AnalysisException: Attribute name "count(1)" contains invalid character(s) among " ,;{}()\n\t=". Please use alias to rename it.;
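The same aliasing applies when doing the aggregation through the DataFrame API (a sketch; the output table name emp_count2 is just an example):

import org.apache.spark.sql.functions.{count, lit}

// alias("count") avoids the invalid attribute name "count(1)";
// emp_count2 is a hypothetical target table name
spark.table("emp")
  .groupBy("deptno")
  .agg(count(lit(1)).alias("count"))
  .write.saveAsTable("emp_count2")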
Number of partitions: shuffles and aggregations use 200 partitions by default.
In production, be sure to tune the number of partitions; it can be set as follows:
scala> spark.sqlContext.setConf("spark.sql.shuffle.partitions", "10")
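The same property can also be set when building the SparkSession in application code (a minimal sketch; the value 10 is only an example):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("PartitionConfApp")
  .master("local[2]")
  .config("spark.sql.shuffle.partitions", "10") // default is 200
  .getOrCreate()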
Working with MySQL
Add the MySQL driver to the project's pom.xml:
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.45</version>
</dependency>
The code is as follows:
package com.yy.spark

import java.util.Properties

import org.apache.spark.sql.SparkSession

/**
 * Work with MySQL table data
 */
object MysqlApp extends App {

  val spark = SparkSession.builder().appName("MysqlApp").master("local[2]").getOrCreate()

  // Option 1: read.format("jdbc") with options
  val jdbcDF = spark.read
    .format("jdbc")
    .option("url", "jdbc:mysql://192.168.175.128:3306/bootscala")
    .option("dbtable", "meta_database")
    .option("user", "root")
    .option("password", "123456")
    .option("driver", "com.mysql.jdbc.Driver")
    .load()

  jdbcDF.printSchema()
  jdbcDF.show()
  jdbcDF.select("name", "lcotion").show()

  // Option 2: read.jdbc with connection properties
  val connectionProperties = new Properties()
  connectionProperties.put("user", "root")
  connectionProperties.put("password", "123456")
  connectionProperties.put("driver", "com.mysql.jdbc.Driver")

  val jdbcDF2 = spark.read
    .jdbc("jdbc:mysql://192.168.175.128:3306/bootscala", "meta_database", connectionProperties)
  jdbcDF2.show()

  spark.stop()
}
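The JDBC source works in the write direction as well. A sketch that appends jdbcDF2 back into MySQL, reusing the connectionProperties from above (the target table name meta_database_copy is hypothetical):

// Append the DataFrame into a MySQL table (meta_database_copy is a hypothetical target table)
jdbcDF2.write
  .mode("append")
  .jdbc("jdbc:mysql://192.168.175.128:3306/bootscala", "meta_database_copy", connectionProperties)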
spark-sql approach
spark-sql> CREATE TEMPORARY VIEW jdbcTable
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://192.168.175.128:3306/bootscala",
  dbtable "meta_database",
  user 'root',
  password '123456',
  driver 'com.mysql.jdbc.Driver'
);

spark-sql> show tables;
spark-sql> select * from jdbctable;
Using Hive and MySQL together
MySQL script:
create database spark;
use spark;

CREATE TABLE DEPT(
  DEPTNO int(2) PRIMARY KEY,
  DNAME VARCHAR(14),
  LOC VARCHAR(13)
);

INSERT INTO DEPT VALUES(10,'ACCOUNTING','NEW YORK');
INSERT INTO DEPT VALUES(20,'RESEARCH','DALLAS');
INSERT INTO DEPT VALUES(30,'SALES','CHICAGO');
INSERT INTO DEPT VALUES(40,'OPERATIONS','BOSTON');
The Hive table emp contains data such as:
+-----+-----+--------+----+----------+------+-----+------+
|empno|ename| job| mgr| hiredate|salary| comm|deptno|
+-----+-----+--------+----+----------+------+-----+------+
| 7369|SMITH| CLERK|7902|1980-12-17| 800.0| null| 20|
| 7499|ALLEN|SALESMAN|7698|1981-02-20|1600.0|300.0| 30|
| 7521| WARD|SALESMAN|7698|1981-02-22|1250.0|500.0| 30|
+-----+-----+--------+----+----------+------+-----+------+
The code to join the MySQL and Hive tables is as follows:
package com.yy.spark

import org.apache.spark.sql.SparkSession

/**
 * Query MySQL and Hive table data together via external data sources
 */
object HiveMysqlApp extends App {

  // enableHiveSupport is required so spark.table can resolve tables in the Hive metastore
  val spark = SparkSession.builder()
    .appName("HiveMysqlApp")
    .master("local[2]")
    .enableHiveSupport()
    .getOrCreate()

  // Load the Hive table
  val hiveDF = spark.table("emp")

  // Load the MySQL table
  val mysqlDF = spark.read
    .format("jdbc")
    .option("url", "jdbc:mysql://192.168.175.128:3306/spark")
    .option("dbtable", "DEPT")
    .option("user", "root")
    .option("password", "123456")
    .option("driver", "com.mysql.jdbc.Driver")
    .load()

  // Join on the department number
  val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO"))
  resultDF.show()

  resultDF.select(hiveDF.col("empno"), hiveDF.col("ename"),
    mysqlDF.col("deptno"), mysqlDF.col("dname")).show()

  spark.stop()
}
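Equivalently, both DataFrames can be registered as temporary views and the join written in SQL (a sketch reusing hiveDF and mysqlDF from the code above; the view names are arbitrary):

// Register both DataFrames as temporary views and join them with SQL
hiveDF.createOrReplaceTempView("emp_view")
mysqlDF.createOrReplaceTempView("dept_view")

spark.sql(
  """SELECT e.empno, e.ename, d.DEPTNO, d.DNAME
    |FROM emp_view e
    |JOIN dept_view d ON e.deptno = d.DEPTNO""".stripMargin).show()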