val rdd = sc.parallelize(Seq((\"vskp\", Array(2.0, 1.0, 2.1, 5.4)),(\"hyd\",Array(1.5, 0.5, 0.9, 3.7)),(\"hyd\", Array(1.5, 0.5, 0.9, 3.2)),(\"tvm\", Array(8.0, 2.9,
Thanks to Tomer's Answer
For scala - The issue came up when I tried to use the column in the self-join clause, to fix it use the method
// To `and` all the column conditions
def andAll(cols: Iterable[Column]): Column =
if (cols.isEmpty) lit(true)
else cols.tail.foldLeft(cols.head) { case (soFar, curr) => soFar.and(curr) }
// To perform join different col name
def renameColAndJoin(leftDf: DataFrame, joinCols: Seq[String], joinType: String = "inner")(rightDf: DataFrame): DataFrame = {
val renamedCols: Seq[String] = joinCols.map(colName => s"${colName}_renamed")
val zippedCols: Seq[(String, String)] = joinCols.zip(renamedCols)
val renamedRightDf: DataFrame = zippedCols.foldLeft(rightDf) {
case (df, (origColName, renamedColName)) => df.withColumnRenamed(origColName, renamedColName)
}
val joinExpr: Column = andAll(zippedCols.map {
case (origCol, renamedCol) => renamedRightDf(renamedCol).equalTo(rightDf(origCol))
})
leftDf.join(renamedRightDf, joinExpr, joinType)
}