I am trying to create a dataframe using a random uniform distribution in Spark. I couldn't find anything on how to create such a dataframe directly, but when I read the documentation I found that `RandomRDDs` can generate random RDDs.
The following solution ignores my own concern I mentioned in the question
I can create a few rdds and use them to create a dataframe but the dataset I am using has many fields(100+) and creating 100s of rdds and then zipping them doesn't seem efficient.
def create_uniform_rdd(nrow, ncol, schema=None):
    """Build an ``nrow`` x ``ncol`` DataFrame of uniform(0, 1) random values.

    Parameters
    ----------
    nrow : int
        Number of rows to generate.
    ncol : int
        Number of columns to generate.
    schema : StructType, optional
        Schema for the resulting DataFrame; when ``None``, a schema of
        ``ncol`` non-nullable FloatType columns named "0".."ncol-1" is built.

    Returns
    -------
    DataFrame with one uniform-random column per requested column.
    """
    # RandomRDDs.uniformRDD is a static method; no instance is needed.
    # NOTE(review): collect() pulls every column to the driver — fine for
    # small data, but see uniformVectorRDD for a distributed alternative.
    columns = [RandomRDDs.uniformRDD(sc, nrow).collect() for _ in range(ncol)]
    # Transpose the column-wise samples into row tuples, because Spark
    # DataFrames are row oriented.
    rows = list(zip(*columns))
    if schema is None:
        schema = StructType(
            [StructField(str(i), FloatType(), False) for i in range(ncol)]
        )
    return sqlContext.createDataFrame(rows, schema)
I had to do the `zip` bit because Spark dataframes are row oriented. I could have switched `ncol` with `nrow` in the `for` loop, but since the number of rows I have is far larger than the number of columns, looping over the columns is the cheaper choice.
Adding a time comparison of eliasah's method and my method
def create_uniform_rdd_vector(nrow, ncol, schema=None):
    """Build an ``nrow`` x ``ncol`` DataFrame of uniform(0, 1) random values
    using a single distributed vector RDD (no per-column collect()).

    Parameters
    ----------
    nrow : int
        Number of rows to generate.
    ncol : int
        Number of columns to generate.
    schema : StructType, optional
        Schema for the resulting DataFrame; inferred when ``None``.

    Returns
    -------
    DataFrame with ``ncol`` uniform-random columns.
    """
    # Each RDD element is a Vector; convert to a plain list so toDF()
    # sees one row per element.
    rdd = RandomRDDs.uniformVectorRDD(sc, nrow, ncol).map(lambda v: v.tolist())
    # Honor an explicit schema — the original silently ignored the parameter.
    if schema is None:
        return rdd.toDF()
    return rdd.toDF(schema)
def create_uniform_rdd(nrow, ncol, schema=None):
    """Build an ``nrow`` x ``ncol`` DataFrame of uniform(0, 1) random values,
    one uniformRDD per column (the slow, per-column approach being timed).

    Parameters
    ----------
    nrow : int
        Number of rows to generate.
    ncol : int
        Number of columns to generate.
    schema : StructType, optional
        Schema for the resulting DataFrame; when ``None``, ``ncol``
        non-nullable FloatType columns named "0".."ncol-1" are used.

    Returns
    -------
    DataFrame with one uniform-random column per requested column.
    """
    # RandomRDDs.uniformRDD is static — the RandomRDDs() instance the
    # original created was unnecessary.
    per_column = [RandomRDDs.uniformRDD(sc, nrow).collect() for _ in range(ncol)]
    # zip(*...) transposes columns into rows: Spark DataFrames are row oriented.
    rows = list(zip(*per_column))
    if schema is None:
        schema = StructType(
            [StructField(str(i), FloatType(), False) for i in range(ncol)]
        )
    return sqlContext.createDataFrame(rows, schema)
def timer_func(func, niter=10):
    """Return the total wall-clock seconds spent calling *func* ``niter``
    times with linearly growing problem sizes.

    Iteration ``k`` (1-based) calls ``func(k * 1000, k * 10, schema=None)``,
    so the measurement covers a range of sizes, not one fixed shape.

    Parameters
    ----------
    func : callable
        Must accept ``(nrow, ncol, schema=None)``; its result is discarded.
    niter : int, optional
        Number of timed calls (default 10).
    """
    start = time()
    for step in range(1, niter + 1):
        _ = func(step * 1000, step * 10, schema=None)
    return time() - start
# Compare cumulative wall-clock time of the two approaches over five
# linearly growing problem sizes (1000x10 up to 5000x50).
niter = 5
create_uniform_rdd_time = timer_func(create_uniform_rdd, niter=niter)  # 4.27 secs
create_uniform_rdd_vector_time = timer_func(create_uniform_rdd_vector, niter=niter)  # 1.31 secs