Using Spark 1.5.1,
I've been trying to forward fill null values with the last known observation for one column of my DataFrame.
I hope you find this forward fill function useful. It is written using only native PySpark functions; neither UDFs nor RDDs are used (both of them are very slow, especially UDFs!).
Let's use example provided by @Sid.
# Sample observations, one tuple per row: (cookie_ID, Time, User_ID).
# None marks a missing User_ID, i.e. a cell that forward fill should populate.
values = [
    # cookie_ID 1: starts with a gap, contains a duplicated row, ends with gaps.
    (1, "2015-12-01", None),
    (1, "2015-12-02", "U1"),
    (1, "2015-12-02", "U1"),
    (1, "2015-12-03", "U2"),
    (1, "2015-12-04", None),
    (1, "2015-12-05", None),
    # cookie_ID 2: rows are not listed in Time order.
    (2, "2015-12-04", None),
    (2, "2015-12-03", None),
    (2, "2015-12-02", "U3"),
    (2, "2015-12-05", None),
]

# `spark` is expected to be an already-created SparkSession.
df = spark.createDataFrame(
    data=values,
    schema=['cookie_ID', 'Time', 'User_ID'],
)
Functions:
def cum_sum(df, sum_col, order_col, cum_sum_col_nm='cum_sum'):
    '''Append a running (cumulative) sum of a column to a DataFrame.

    The sum runs over the whole DataFrame as a single partition, ordered by
    ``order_col``. A range frame is used, so rows that tie on ``order_col``
    all receive the same cumulative value (the total including every tied row).

    Parameters
    -----------
    df : DataFrame
        Input DataFrame.
    sum_col : String
        Column to perform cumulative sum on.
    order_col : String or List
        Column/columns to sort by for the cumulative sum.
    cum_sum_col_nm : String
        The name of the resulting cumulative sum column.

    Return
    -------
    df : DataFrame
        Input DataFrame with the additional column ``cum_sum_col_nm``.
        The column is cast to string, as in earlier versions of this helper.
    '''
    # Local imports: `F.sum` is referenced explicitly so the function never
    # depends on the builtin `sum` having been shadowed by a star-import.
    from pyspark.sql import Window
    from pyspark.sql import functions as F
    from pyspark.sql.types import StringType

    # Partition by a literal instead of adding (and later dropping) a helper
    # column named 'tmp', which would destroy a caller's column of that name.
    running_frame = (Window.partitionBy(F.lit('tmp'))
                     .orderBy(order_col)
                     .rangeBetween(Window.unboundedPreceding, Window.currentRow))

    running_total = F.sum(sum_col).over(running_frame).cast(StringType())

    # Honour the requested output name (it used to be hard-coded to 'cum_sum').
    return df.withColumn(cum_sum_col_nm, running_total)
def forward_fill(df, order_col, fill_col, fill_col_name=None):
    '''Forward fill a column, ordering rows by a column/set of columns.

    Every row receives the most recent non-null value of ``fill_col`` seen at
    or before it in ``order_col`` order. Rows that precede the first non-null
    value stay null. The whole DataFrame is treated as one ordered sequence,
    so to keep groups apart put the grouping column first in ``order_col``;
    note that a group starting with nulls is then filled from the previous
    group's last known value.

    Parameters:
    ------------
    df: Dataframe
    order_col: String or List of string
        Column/columns that define the row order.
    fill_col: String (Only work for a column for this version.)
        Column whose nulls are to be filled.
    fill_col_name: String, optional
        Name of the output column. Defaults to ``'ffill_<fill_col>'``.

    Return:
    ---------
    df: Dataframe
        Return df with the additional filled column; ``fill_col`` itself is
        left untouched.
    '''
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    if not fill_col_name:
        fill_col_name = 'ffill_{}'.format(fill_col)

    # Everything from the start of the ordering up to the current row. A range
    # frame means rows tied on `order_col` see each other's values.
    seen_so_far = (Window.partitionBy(F.lit('tmp'))
                   .orderBy(order_col)
                   .rangeBetween(Window.unboundedPreceding, Window.currentRow))

    # `last(..., ignorenulls=True)` picks the latest known observation directly.
    # This avoids the string sentinel 'constant', which nulled out genuine
    # 'constant' values and coerced non-string columns to string, and it needs
    # no helper columns ('value', 'cum_sum') that could clobber caller columns.
    last_known = F.last(F.col(fill_col), ignorenulls=True).over(seen_so_far)

    return df.withColumn(fill_col_name, last_known)
Let's see the results.
# Forward fill User_ID in (cookie_ID, Time) order, writing the result to a new
# column named 'User_ID_ffil'.
ffilled_df = forward_fill(df, ['cookie_ID', 'Time'], 'User_ID', 'User_ID_ffil')

# Display the rows in the same order that was used for filling.
ffilled_df.sort('cookie_ID', 'Time').show()