Is there a way to replace null values in pyspark dataframe with the last valid value? There is addtional timestamp and session columns
This seems to be doing the trick using Window functions:
import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as func
def fill_nulls(df):
df_na = df.na.fill(-1)
lag = df_na.withColumn('id_lag', func.lag('id', default=-1)\
.over(Window.partitionBy('session')\
.orderBy('timestamp')))
switch = lag.withColumn('id_change',
((lag['id'] != lag['id_lag']) &
(lag['id'] != -1)).cast('integer'))
switch_sess = switch.withColumn(
'sub_session',
func.sum("id_change")
.over(
Window.partitionBy("session")
.orderBy("timestamp")
.rowsBetween(-sys.maxsize, 0))
)
fid = switch_sess.withColumn('nn_id',
func.first('id')\
.over(Window.partitionBy('session', 'sub_session')\
.orderBy('timestamp')))
fid_na = fid.replace(-1, 'null')
ff = fid_na.drop('id').drop('id_lag')\
.drop('id_change')\
.drop('sub_session').\
withColumnRenamed('nn_id', 'id')
return ff
Here is the full null_test.py.