Sample transactions data (transactionId,productId,userId,quantity,price):

t1,p3,u1,1,300
t2,p1,u2,1,100
t3,p1,u1,1,100
t4,p2,u2,1,10
t5,p4,u4,1,9
t6,p1,u1,1,100
t7,p4,u1,1,9
t8,p4,u5,2,40
1. LeftOuterJoin on RDDs

1.1 The RDD leftOuterJoin method definition
/**
 * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the
 * resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the
 * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output
 * using the existing partitioner/parallelism level.
 */
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] = self.withScope {
  leftOuterJoin(other, defaultPartitioner(self, other))
}

/**
 * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the
 * resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the
 * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output
 * into `numPartitions` partitions.
 */
def leftOuterJoin[W](
    other: RDD[(K, W)],
    numPartitions: Int): RDD[(K, (V, Option[W]))] = self.withScope {
  leftOuterJoin(other, new HashPartitioner(numPartitions))
}

/**
 * Perform a right outer join of `this` and `other`. For each element (k, w) in `other`, the
 * resulting RDD will either contain all pairs (k, (Some(v), w)) for v in `this`, or the
 * pair (k, (None, w)) if no elements in `this` have key k. Hash-partitions the resulting
 * RDD using the existing partitioner/parallelism level.
 */
def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] = self.withScope {
  rightOuterJoin(other, defaultPartitioner(self, other))
}
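As the Scaladoc above describes, every record on the left side is kept, and keys with no match in `other` produce `None` on the right. A minimal sketch of that behavior (the keys and values here are made up for illustration, not taken from the article's dataset):

import org.apache.spark.sql.SparkSession

object LeftOuterJoinSemantics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("LeftOuterJoinSemantics").getOrCreate()
    val sc = spark.sparkContext

    // Left side keeps key "c" even though it has no match on the right.
    val left  = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
    val right = sc.parallelize(Seq(("a", "x"), ("b", "y")))

    // Result type is RDD[(String, (Int, Option[String]))]
    left.leftOuterJoin(right).collect().foreach(println)
    // Prints (order may vary): (a,(1,Some(x))) (b,(2,Some(y))) (c,(3,None))

    spark.stop()
  }
}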
1.2 Implementing LeftOuterJoin with an RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Left outer join with Spark RDDs.
 */
object RDDLeftOuterJoin {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: RDDLeftOuterJoin <users-data-path> <transactions-data-path> <output-path>")
      sys.exit(1)
    }
    // Users data file
    val usersFile: String = args(0)
    // Transactions data file
    val transactionsFile: String = args(1)
    // Output path
    val output: String = args(2)

    val sparkConf: SparkConf = new SparkConf().setMaster("local").setAppName("RDDLeftOuterJoinExample")
    // Create the SparkContext
    val sparkContext: SparkContext = SparkSession.builder().config(sparkConf).getOrCreate().sparkContext

    // Load the users data as an RDD
    val usersRaw: RDD[String] = sparkContext.textFile(usersFile)
    // Load the transactions data as an RDD
    val transactionsRaw: RDD[String] = sparkContext.textFile(transactionsFile)

    // (userId, location)
    val rddUsers: RDD[(String, String)] = usersRaw.map(line => {
      val tokens = line.split(",")
      (tokens(0), tokens(1))
    })

    // (userId, (productId, quantity, price))
    val rddTransactions: RDD[(String, (String, String, String))] = transactionsRaw.map(line => {
      val tokens = line.split(",")
      (tokens(2), (tokens(1), tokens(3), tokens(4)))
    })

    val rddJoined: RDD[(String, ((String, String, String), Option[String]))] =
      rddTransactions.leftOuterJoin(rddUsers)
    rddJoined.foreach(println)

    // (productId, location); unmatched users become "unknown"
    val rddProductLocations: RDD[(String, String)] =
      rddJoined.values.map(f => (f._1._1, f._2.getOrElse("unknown")))

    val rddProductByLocations: RDD[(String, Iterable[String])] = rddProductLocations.groupByKey()
    // Convert to a Set to drop duplicate locations
    val productWithUniqueLocations: RDD[(String, Set[String])] = rddProductByLocations.mapValues(_.toSet)
    // Count distinct locations per product, returning tuples of (product, location count)
    val rddProductCount: RDD[(String, Int)] = productWithUniqueLocations.map(f => (f._1, f._2.size))

    // Save the result to the output path
    rddProductCount.saveAsTextFile(output)
  }
}
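The listing above deduplicates locations with `groupByKey` followed by `toSet`, which shuffles every (productId, location) pair. A possible alternative, sketched here against the same `rddProductLocations` and `output` values from the listing, is `aggregateByKey`, which builds the per-product location sets during the shuffle:

// Sketch only: assumes rddProductLocations: RDD[(String, String)] and output as defined above.
val productUniqueLocations: RDD[(String, Set[String])] =
  rddProductLocations.aggregateByKey(Set.empty[String])(
    (locations, loc) => locations + loc, // add a location within a partition
    (s1, s2) => s1 ++ s2)                // merge partial sets across partitions

val productLocationCount: RDD[(String, Int)] = productUniqueLocations.mapValues(_.size)
productLocationCount.saveAsTextFile(output + "/rdd_agg_output")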
1.3 Output
(p1,2)
(p2,1)
(p3,1)
(p4,3)
2. LeftOuterJoin on DataFrames
2.1 The DataFrame LeftOuterJoin method definition
/**
 * Join with another `DataFrame`, using the given join expression. The following performs
 * a full outer join between `df1` and `df2`.
 *
 * {{{
 *   // Scala:
 *   import org.apache.spark.sql.functions._
 *   df1.join(df2, $"df1Key" === $"df2Key", "outer")
 *
 *   // Java:
 *   import static org.apache.spark.sql.functions.*;
 *   df1.join(df2, col("df1Key").equalTo(col("df2Key")), "outer");
 * }}}
 *
 * @param right Right side of the join.
 * @param joinExprs Join expression.
 * @param joinType Type of join to perform. Default `inner`. Must be one of:
 *                 `inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`,
 *                 `right`, `right_outer`, `left_semi`, `left_anti`.
 *
 * @group untypedrel
 * @since 2.0.0
 */
def join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame = {
  // Note that in this function, we introduce a hack in the case of self-join to automatically
  // resolve ambiguous join conditions into ones that might make sense [SPARK-6231].
  // Consider this case: df.join(df, df("key") === df("key"))
  // Since df("key") === df("key") is a trivially true condition, this actually becomes a
  // cartesian join. However, most likely users expect to perform a self join using "key".
  // With that assumption, this hack turns the trivially true condition into equality on join
  // keys that are resolved to both sides.

  // Trigger analysis so in the case of self-join, the analyzer will clone the plan.
  // After the cloning, left and right side will have distinct expression ids.
  val plan = withPlan(
    Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr)))
    .queryExecution.analyzed.asInstanceOf[Join]

  // If auto self join alias is disabled, return the plan.
  if (!sparkSession.sessionState.conf.dataFrameSelfJoinAutoResolveAmbiguity) {
    return withPlan(plan)
  }

  // If left/right have no output set intersection, return the plan.
  val lanalyzed = withPlan(this.logicalPlan).queryExecution.analyzed
  val ranalyzed = withPlan(right.logicalPlan).queryExecution.analyzed
  if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) {
    return withPlan(plan)
  }

  // Otherwise, find the trivially true predicates and automatically resolves them to both sides.
  // By the time we get here, since we have already run analysis, all attributes should've been
  // resolved and become AttributeReference.
  val cond = plan.condition.map { _.transform {
    case catalyst.expressions.EqualTo(a: AttributeReference, b: AttributeReference)
        if a.sameRef(b) =>
      catalyst.expressions.EqualTo(
        withPlan(plan.left).resolve(a.name),
        withPlan(plan.right).resolve(b.name))
  }}

  withPlan {
    plan.copy(condition = cond)
  }
}
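For a left outer join specifically, the `joinType` argument is "left" or "left_outer" (the two are synonyms). A minimal sketch with two small DataFrames invented for illustration, showing the unmatched key padded with nulls:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("JoinTypeSketch").getOrCreate()
import spark.implicits._

val orders = Seq(("o1", "u1"), ("o2", "u2"), ("o3", "u9")).toDF("orderId", "userId")
val users  = Seq(("u1", "UT"), ("u2", "GA")).toDF("userId", "location")

// u9 has no matching user, so its user columns come back as null.
orders.join(users, orders("userId") === users("userId"), "left_outer").show()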
2.2 Implementing LeftOuterJoin with a DataFrame
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}

/**
 * Left outer join with Spark DataFrames.
 */
object DataFrameLeftOuterJoin {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: DataFrameLeftOuterJoin <users-data-path> <transactions-data-path> <output-path>")
      sys.exit(1)
    }
    // Users data file
    val usersFile: String = args(0)
    // Transactions data file
    val transactionsFile: String = args(1)
    // Output path
    val output: String = args(2)

    val sparkConf = new SparkConf()
      .setMaster("local[1]")
      .setAppName("DataFrameLeftOuterJoinExample")
    val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext

    // Schema for the users data
    val userSchema = StructType(Seq(
      StructField("userId", StringType, false),
      StructField("location", StringType, false)))

    // Schema for the transactions data
    val transactionSchema = StructType(Seq(
      StructField("transactionId", StringType, false),
      StructField("productId", StringType, false),
      StructField("userId", StringType, false),
      StructField("quantity", IntegerType, false),
      StructField("price", DoubleType, false)))

    // Load the users data
    val usersRaw: RDD[String] = sparkContext.textFile(usersFile)
    // Convert to RDD[org.apache.spark.sql.Row]
    val userRDDRows: RDD[Row] = usersRaw.map(line => {
      val tokens = line.split(",")
      Row(tokens(0), tokens(1))
    })
    // Create a DataFrame from the RDD
    val dfUsers: DataFrame = sparkSession.createDataFrame(userRDDRows, userSchema)
    dfUsers.printSchema()

    // Load the transactions data
    val transactionsRaw = sparkContext.textFile(transactionsFile)
    // Convert to RDD[org.apache.spark.sql.Row]
    val transactionsRDDRows = transactionsRaw.map(line => {
      val tokens = line.split(",")
      Row(tokens(0), tokens(1), tokens(2), tokens(3).toInt, tokens(4).toDouble)
    })
    // Create a DataFrame from the RDD
    val dfTransactions = sparkSession.createDataFrame(transactionsRDDRows, transactionSchema)
    dfTransactions.printSchema()

    // DataFrame left outer join: attach user info to each transaction
    val dfLeftJoin: DataFrame = dfTransactions.join(dfUsers,
      dfTransactions("userId") === dfUsers("userId"), "left")
    dfLeftJoin.printSchema()
    dfLeftJoin.show()

    // Select each product's user and location
    val dfProductLocation: DataFrame = dfLeftJoin.select(
      dfUsers.col("userId"), dfLeftJoin.col("productId"), dfLeftJoin.col("location"))
    dfProductLocation.show()

    val dfProductLocationDistinct: Dataset[Row] = dfProductLocation.distinct
    dfProductLocationDistinct.show()

    val dfProductsCount: DataFrame = dfProductLocationDistinct.groupBy("productId").count()
    dfProductsCount.show()

    // Repartition so the result lands in a single file; fine for small data
    // dfProductsCount.repartition(1).write.save(output + "/df")
    dfProductsCount.rdd.repartition(1).saveAsTextFile(output + "/df_output")
  }
}
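Note that the listing above counts distinct (userId, productId, location) rows per product, which happens to match the RDD version's distinct-location count on this dataset. A sketch that counts distinct locations directly, assuming the `dfLeftJoin` DataFrame built above (keep in mind `countDistinct` ignores null locations from unmatched users):

import org.apache.spark.sql.functions.countDistinct

// Single aggregation instead of distinct + groupBy + count
val dfProductsCountAgg = dfLeftJoin
  .groupBy("productId")
  .agg(countDistinct("location").as("count"))
dfProductsCountAgg.show()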
2.3 Output
[p2,1]
[p1,2]
[p3,1]
[p4,3]
3. LeftOuterJoin with Spark SQL
// Spark SQL: register a temporary view for the users DataFrame
dfUsers.createOrReplaceTempView("users")
// Spark SQL: register a temporary view for the transactions DataFrame
dfTransactions.createOrReplaceTempView("transactions")

val sql =
  """
    |SELECT productId, count(distinct location) locCount FROM transactions
    |  LEFT OUTER JOIN users
    |  ON transactions.userId = users.userId
    |GROUP BY productId
  """.stripMargin
val dfSqlResult = sparkSession.sql(sql)
dfSqlResult.show()

// Repartition so the result lands in a single file; fine for small data
// dfSqlResult.repartition(1).write.save(output + "/sql")
dfSqlResult.rdd.repartition(1).saveAsTextFile(output + "/sql_output")
Output:
[p2,1]
[p1,2]
[p3,1]
[p4,3]
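The commented-out `write.save` calls in the listings above would default to the Parquet format. If a human-readable file is preferred over `rdd.saveAsTextFile`, one option (a sketch, with an invented path suffix) is the CSV writer available since Spark 2.0:

dfSqlResult.repartition(1)
  .write
  .option("header", "true")
  .csv(output + "/sql_csv_output")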
The sections above covered the LeftOuterJoin operation on Spark RDDs, DataFrames, and Spark SQL. Spark also provides inner join and right outer join methods for the same datasets; they are used in essentially the same way as LeftOuterJoin, differing only in the arguments passed and the shape of the result.
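For reference, a sketch of the corresponding calls on the pair RDDs from section 1.2; only the join method changes, along with which side of the result is wrapped in Option:

// Inner join: keeps only keys present in both RDDs, no Option wrapper.
val rddInnerJoined: RDD[(String, ((String, String, String), String))] =
  rddTransactions.join(rddUsers)

// Right outer join: keeps every user, wrapping the transaction side in Option.
val rddRightJoined: RDD[(String, (Option[(String, String, String)], String))] =
  rddTransactions.rightOuterJoin(rddUsers)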