I am trying to solve the age-old problem of adding a sequence number to a data set. I am working with DataFrames, and there appears to be no DataFrame equivalent to `RDD.zipWithIndex`.
Here is my proposal, the advantages of which are:
- It operates directly on the `DataFrame`'s `InternalRow`s, avoiding a conversion round trip [1].
- It relies on `RDD.zipWithIndex`.
Its major downsides are:
- The code has to be placed under `package org.apache.spark.sql;`.

Imports:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.functions.lit
/**
 * Optimized Spark SQL equivalent of `RDD.zipWithIndex`.
 *
 * A placeholder `Long` column is prepended to `df`, the rows of the resulting
 * plan are obtained as `InternalRow`s, zipped with their position, and the
 * placeholder is overwritten in place with that position. The rows are wrapped
 * back into a DataFrame without leaving the `InternalRow` representation.
 *
 * @param df           the DataFrame to index. Any existing column named
 *                     `indexColName` is dropped before the index is added.
 * @param indexColName name of the generated index column (defaults to
 *                     `"index"`); it is the first column of the result.
 * @return `df` with a column named `indexColName` of consecutive unique ids.
 */
def zipWithIndex(df: DataFrame, indexColName: String = "index"): DataFrame = {
  import df.sparkSession.implicits._

  // Reserve ordinal 0 for the index: drop any same-named column, then put a
  // constant Long placeholder in front of all remaining columns.
  val dfWithIndexCol: DataFrame = df
    .drop(indexColName)
    .select(lit(0L).as(indexColName), $"*")

  // Work on the internal row representation and overwrite the placeholder at
  // ordinal 0 with the row's position as reported by RDD.zipWithIndex.
  val internalRows: RDD[InternalRow] = dfWithIndexCol
    .queryExecution
    .toRdd
    .zipWithIndex()
    .map {
      case (internalRow: InternalRow, index: Long) =>
        internalRow.setLong(0, index)
        internalRow
    }

  // Re-wrap the indexed rows as a DataFrame, reusing the schema of the
  // placeholder DataFrame (index column first, original columns after).
  Dataset.ofRows(
    df.sparkSession,
    LogicalRDD(dfWithIndexCol.schema.toAttributes, internalRows)(df.sparkSession)
  )
}
[1]: (from/to `InternalRow`'s underlying bytes array <--> `GenericRow`'s underlying JVM objects collection `Array[Any]`).