Sample transactions data (transactionId,productId,userId,quantity,price):

t1,p3,u1,1,300
t2,p1,u2,1,100
t3,p1,u1,1,100
t4,p2,u2,1,10
t5,p4,u4,1,9
t6,p1,u1,1,100
t7,p4,u1,1,9
t8,p4,u5,2,40
1. LeftOuterJoin on RDDs

1.1 The RDD leftOuterJoin method definition
/**
 * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the
 * resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the
 * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output
 * using the existing partitioner/parallelism level.
 */
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] = self.withScope {
  leftOuterJoin(other, defaultPartitioner(self, other))
}

/**
 * Perform a left outer join of `this` and `other`. For each element (k, v) in `this`, the
 * resulting RDD will either contain all pairs (k, (v, Some(w))) for w in `other`, or the
 * pair (k, (v, None)) if no elements in `other` have key k. Hash-partitions the output
 * into `numPartitions` partitions.
 */
def leftOuterJoin[W](
    other: RDD[(K, W)],
    numPartitions: Int): RDD[(K, (V, Option[W]))] = self.withScope {
  leftOuterJoin(other, new HashPartitioner(numPartitions))
}

/**
 * Perform a right outer join of `this` and `other`. For each element (k, w) in `other`, the
 * resulting RDD will either contain all pairs (k, (Some(v), w)) for v in `this`, or the
 * pair (k, (None, w)) if no elements in `this` have key k. Hash-partitions the resulting
 * RDD using the existing partitioner/parallelism level.
 */
def rightOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] = self.withScope {
  rightOuterJoin(other, defaultPartitioner(self, other))
}
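As the Scaladoc above describes, every record on the left side is kept, and keys with no match in `other` produce `None` on the right. A minimal sketch of that behavior (the keys and values here are made up for illustration, not taken from the article's dataset):

import org.apache.spark.sql.SparkSession

object LeftOuterJoinSemantics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("LeftOuterJoinSemantics").getOrCreate()
    val sc = spark.sparkContext

    // Left side keeps key "c" even though it has no match on the right.
    val left  = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
    val right = sc.parallelize(Seq(("a", "x"), ("b", "y")))

    // Result type is RDD[(String, (Int, Option[String]))]
    left.leftOuterJoin(right).collect().foreach(println)
    // Prints (order may vary): (a,(1,Some(x))) (b,(2,Some(y))) (c,(3,None))

    spark.stop()
  }
}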
1.2 Implementing LeftOuterJoin with an RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Left outer join with Spark RDDs.
 */
object RDDLeftOuterJoin {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: RDDLeftOuterJoin <users-data-path> <transactions-data-path> <output-path>")
      sys.exit(1)
    }
    // Users data file
    val usersFile: String = args(0)
    // Transactions data file
    val transactionsFile: String = args(1)
    // Output path
    val output: String = args(2)

    val sparkConf: SparkConf = new SparkConf().setMaster("local").setAppName("RDDLeftOuterJoinExample")
    // Create the SparkContext
    val sparkContext: SparkContext = SparkSession.builder().config(sparkConf).getOrCreate().sparkContext

    // Load the users data as an RDD
    val usersRaw: RDD[String] = sparkContext.textFile(usersFile)
    // Load the transactions data as an RDD
    val transactionsRaw: RDD[String] = sparkContext.textFile(transactionsFile)

    // (userId, location)
    val rddUsers: RDD[(String, String)] = usersRaw.map(line => {
      val tokens = line.split(",")
      (tokens(0), tokens(1))
    })

    // (userId, (productId, quantity, price))
    val rddTransactions: RDD[(String, (String, String, String))] = transactionsRaw.map(line => {
      val tokens = line.split(",")
      (tokens(2), (tokens(1), tokens(3), tokens(4)))
    })

    val rddJoined: RDD[(String, ((String, String, String), Option[String]))] =
      rddTransactions.leftOuterJoin(rddUsers)
    rddJoined.foreach(println)

    // (productId, location); unmatched users become "unknown"
    val rddProductLocations: RDD[(String, String)] =
      rddJoined.values.map(f => (f._1._1, f._2.getOrElse("unknown")))

    val rddProductByLocations: RDD[(String, Iterable[String])] = rddProductLocations.groupByKey()
    // Convert to a Set to drop duplicate locations
    val productWithUniqueLocations: RDD[(String, Set[String])] = rddProductByLocations.mapValues(_.toSet)
    // Count distinct locations per product, returning tuples of (product, location count)
    val rddProductCount: RDD[(String, Int)] = productWithUniqueLocations.map(f => (f._1, f._2.size))

    // Save the result to the output path
    rddProductCount.saveAsTextFile(output)
  }
}
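The listing above deduplicates locations with `groupByKey` followed by `toSet`, which shuffles every (productId, location) pair. A possible alternative, sketched here against the same `rddProductLocations` and `output` values from the listing, is `aggregateByKey`, which builds the per-product location sets during the shuffle:

// Sketch only: assumes rddProductLocations: RDD[(String, String)] and output as defined above.
val productUniqueLocations: RDD[(String, Set[String])] =
  rddProductLocations.aggregateByKey(Set.empty[String])(
    (locations, loc) => locations + loc, // add a location within a partition
    (s1, s2) => s1 ++ s2)                // merge partial sets across partitions

val productLocationCount: RDD[(String, Int)] = productUniqueLocations.mapValues(_.size)
productLocationCount.saveAsTextFile(output + "/rdd_agg_output")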
1.3 Output
(p1,2)
(p2,1)
(p3,1)
(p4,3)
2. LeftOuterJoin on DataFrames
2.1 The DataFrame LeftOuterJoin method definition
/**
 * Join with another `DataFrame`, using the given join expression. The following performs
 * a full outer join between `df1` and `df2`.
 *
 * {{{
 *   // Scala:
 *   import org.apache.spark.sql.functions._
 *   df1.join(df2, $"df1Key" === $"df2Key", "outer")
 *
 *   // Java:
 *   import static org.apache.spark.sql.functions.*;
 *   df1.join(df2, col("df1Key").equalTo(col("df2Key")), "outer");
 * }}}
 *
 * @param right Right side of the join.
 * @param joinExprs Join expression.
 * @param joinType Type of join to perform. Default `inner`. Must be one of:
 *                 `inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`,
 *                 `right`, `right_outer`, `left_semi`, `left_anti`.
 *
 * @group untypedrel
 * @since 2.0.0
 */
def join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame = {
  // Note that in this function, we introduce a hack in the case of self-join to automatically
  // resolve ambiguous join conditions into ones that might make sense [SPARK-6231].
  // Consider this case: df.join(df, df("key") === df("key"))
  // Since df("key") === df("key") is a trivially true condition, this actually becomes a
  // cartesian join. However, most likely users expect to perform a self join using "key".
  // With that assumption, this hack turns the trivially true condition into equality on join
  // keys that are resolved to both sides.

  // Trigger analysis so in the case of self-join, the analyzer will clone the plan.
  // After the cloning, left and right side will have distinct expression ids.
  val plan = withPlan(
    Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr)))
    .queryExecution.analyzed.asInstanceOf[Join]

  // If auto self join alias is disabled, return the plan.
  if (!sparkSession.sessionState.conf.dataFrameSelfJoinAutoResolveAmbiguity) {
    return withPlan(plan)
  }

  // If left/right have no output set intersection, return the plan.
  val lanalyzed = withPlan(this.logicalPlan).queryExecution.analyzed
  val ranalyzed = withPlan(right.logicalPlan).queryExecution.analyzed
  if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) {
    return withPlan(plan)
  }

  // Otherwise, find the trivially true predicates and automatically resolves them to both sides.
  // By the time we get here, since we have already run analysis, all attributes should've been
  // resolved and become AttributeReference.
  val cond = plan.condition.map { _.transform {
    case catalyst.expressions.EqualTo(a: AttributeReference, b: AttributeReference)
        if a.sameRef(b) =>
      catalyst.expressions.EqualTo(
        withPlan(plan.left).resolve(a.name),
        withPlan(plan.right).resolve(b.name))
  }}

  withPlan {
    plan.copy(condition = cond)
  }
}
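For a left outer join specifically, the `joinType` argument is "left" or "left_outer" (the two are synonyms). A minimal sketch with two small DataFrames invented for illustration, showing the unmatched key padded with nulls:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("JoinTypeSketch").getOrCreate()
import spark.implicits._

val orders = Seq(("o1", "u1"), ("o2", "u2"), ("o3", "u9")).toDF("orderId", "userId")
val users  = Seq(("u1", "UT"), ("u2", "GA")).toDF("userId", "location")

// u9 has no matching user, so its user columns come back as null.
orders.join(users, orders("userId") === users("userId"), "left_outer").show()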
2.2 Implementing LeftOuterJoin with a DataFrame
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}

/**
 * Left outer join with Spark DataFrames.
 */
object DataFrameLeftOuterJoin {
  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      println("Usage: DataFrameLeftOuterJoin <users-data-path> <transactions-data-path> <output-path>")
      sys.exit(1)
    }
    // Users data file
    val usersFile: String = args(0)
    // Transactions data file
    val transactionsFile: String = args(1)
    // Output path
    val output: String = args(2)

    val sparkConf = new SparkConf()
      .setMaster("local[1]")
      .setAppName("DataFrameLeftOuterJoinExample")
    val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext

    // Schema for the users data
    val userSchema = StructType(Seq(
      StructField("userId", StringType, false),
      StructField("location", StringType, false)))

    // Schema for the transactions data
    val transactionSchema = StructType(Seq(
      StructField("transactionId", StringType, false),
      StructField("productId", StringType, false),
      StructField("userId", StringType, false),
      StructField("quantity", IntegerType, false),
      StructField("price", DoubleType, false)))

    // Load the users data
    val usersRaw: RDD[String] = sparkContext.textFile(usersFile)
    // Convert to RDD[org.apache.spark.sql.Row]
    val userRDDRows: RDD[Row] = usersRaw.map(line => {
      val tokens = line.split(",")
      Row(tokens(0), tokens(1))
    })
    // Create a DataFrame from the RDD
    val dfUsers: DataFrame = sparkSession.createDataFrame(userRDDRows, userSchema)
    dfUsers.printSchema()

    // Load the transactions data
    val transactionsRaw = sparkContext.textFile(transactionsFile)
    // Convert to RDD[org.apache.spark.sql.Row]
    val transactionsRDDRows = transactionsRaw.map(line => {
      val tokens = line.split(",")
      Row(tokens(0), tokens(1), tokens(2), tokens(3).toInt, tokens(4).toDouble)
    })
    // Create a DataFrame from the RDD
    val dfTransactions = sparkSession.createDataFrame(transactionsRDDRows, transactionSchema)
    dfTransactions.printSchema()

    // DataFrame left outer join: attach user info to each transaction
    val dfLeftJoin: DataFrame = dfTransactions.join(dfUsers,
      dfTransactions("userId") === dfUsers("userId"), "left")
    dfLeftJoin.printSchema()
    dfLeftJoin.show()

    // Select each product's user and location
    val dfProductLocation: DataFrame = dfLeftJoin.select(
      dfUsers.col("userId"), dfLeftJoin.col("productId"), dfLeftJoin.col("location"))
    dfProductLocation.show()

    val dfProductLocationDistinct: Dataset[Row] = dfProductLocation.distinct
    dfProductLocationDistinct.show()

    val dfProductsCount: DataFrame = dfProductLocationDistinct.groupBy("productId").count()
    dfProductsCount.show()

    // Repartition so the result lands in a single file; fine for small data
    // dfProductsCount.repartition(1).write.save(output + "/df")
    dfProductsCount.rdd.repartition(1).saveAsTextFile(output + "/df_output")
  }
}
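Note that the listing above counts distinct (userId, productId, location) rows per product, which happens to match the RDD version's distinct-location count on this dataset. A sketch that counts distinct locations directly, assuming the `dfLeftJoin` DataFrame built above (keep in mind `countDistinct` ignores null locations from unmatched users):

import org.apache.spark.sql.functions.countDistinct

// Single aggregation instead of distinct + groupBy + count
val dfProductsCountAgg = dfLeftJoin
  .groupBy("productId")
  .agg(countDistinct("location").as("count"))
dfProductsCountAgg.show()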
2.3 Output
[p2,1]
[p1,2]
[p3,1]
[p4,3]
3. LeftOuterJoin with Spark SQL
// Spark SQL: register a temporary view for the users DataFrame
dfUsers.createOrReplaceTempView("users")
// Spark SQL: register a temporary view for the transactions DataFrame
dfTransactions.createOrReplaceTempView("transactions")

val sql =
  """
    |SELECT productId, count(distinct location) locCount FROM transactions
    |  LEFT OUTER JOIN users
    |  ON transactions.userId = users.userId
    |GROUP BY productId
  """.stripMargin
val dfSqlResult = sparkSession.sql(sql)
dfSqlResult.show()

// Repartition so the result lands in a single file; fine for small data
// dfSqlResult.repartition(1).write.save(output + "/sql")
dfSqlResult.rdd.repartition(1).saveAsTextFile(output + "/sql_output")
Output:
[p2,1]
[p1,2]
[p3,1]
[p4,3]
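The commented-out `write.save` calls in the listings above would default to the Parquet format. If a human-readable file is preferred over `rdd.saveAsTextFile`, one option (a sketch, with an invented path suffix) is the CSV writer available since Spark 2.0:

dfSqlResult.repartition(1)
  .write
  .option("header", "true")
  .csv(output + "/sql_csv_output")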
The sections above covered the LeftOuterJoin operation on Spark RDDs, DataFrames, and Spark SQL. Spark also provides inner join and right outer join methods for the same datasets; they are used in essentially the same way as LeftOuterJoin, differing only in the arguments passed and the shape of the result.
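For reference, a sketch of the corresponding calls on the pair RDDs from section 1.2; only the join method changes, along with which side of the result is wrapped in Option:

// Inner join: keeps only keys present in both RDDs, no Option wrapper.
val rddInnerJoined: RDD[(String, ((String, String, String), String))] =
  rddTransactions.join(rddUsers)

// Right outer join: keeps every user, wrapping the transaction side in Option.
val rddRightJoined: RDD[(String, (Option[(String, String, String)], String))] =
  rddTransactions.rightOuterJoin(rddUsers)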