I am loading a file of JSON objects as a PySpark SchemaRDD. I want to change the "shape" of the objects (basically, I'm flattening them) and then insert the result into
You can try this one — it's a bit long, but it works:
def flat_table(df, table_name):
    """Build a SQL string that flattens a nested-schema Spark DataFrame.

    Walks ``df.schema`` recursively, collecting the dotted path of every leaf
    field, then emits a two-level SELECT: the inner query pulls each nested
    field up to the top level (exploding an array of structs it meets), and
    the outer query aliases every field by its bare leaf name.

    :param df: a (Py)Spark DataFrame-like object; as a side effect it is
        registered as a temp table named ``table_name``.
    :param table_name: name used both for the temp table and as the root
        segment of every generated field path.
    :return: the flattening SQL query as a string (it is not executed here).
    """

    def rec(l, in_array, name):
        # Depth-first walk over a list of schema-field dicts (the shape
        # produced by StructType.jsonValue()['fields']), appending each
        # leaf's dotted path to field_list. Leaves inside an array get a
        # trailing '.array' marker so the SQL builder below can spot them.
        for v in l:
            if isinstance(v['type'], dict):
                if 'fields' in v['type']:
                    # Struct: descend into its fields.
                    rec(name=name + [v['name']], l=v['type']['fields'], in_array=False)
                if 'elementType' in v['type']:
                    # Array of structs: descend into the element's fields,
                    # flagging every leaf found there as an array member.
                    rec(name=name + [v['name']], l=v['type']['elementType']['fields'], in_array=True)
            else:
                # Recursion stop rule: a primitive leaf field.
                prefix = ".".join(name) + '.' if name else ''
                # Array members carry a '.array' marker so they can be
                # exploded (one row per element) when the query is built.
                suffix = '.array' if in_array else ''
                field_list.append('{node}{subnode}{suffix}'.format(
                    node=prefix, subnode=v['name'], suffix=suffix))

    field_list = []
    df.registerTempTable(table_name)
    rec(df.schema.jsonValue()['fields'], in_array=False, name=[table_name])

    # Create the select statement: the inner query extracts (and explodes)
    # nested fields; the outer query renames each to its bare leaf name.
    inner_fields = []
    outer_fields = []
    exploded = False
    for x in field_list:
        f = x.split('.')
        if f[-1] != 'array':  # plain (non-array) leaf; original used Python-2-only '<>'
            inner_fields.append('{field} as {name}'.format(field=".".join(f), name=f[-1]))
            of = ['a'] + f[-1:]
            outer_fields.append('{field} as {name}'.format(field=".".join(of), name=of[-1]))
        else:
            # Add the explode to the inner query only once.
            # NOTE(review): 'exploded' is reset never, so with more than one
            # distinct array column only the first is exploded and later
            # array fields reference a column the inner query never
            # produced — confirm callers pass at most one array column
            # (Spark SQL also allows only one generator per SELECT).
            if not exploded:
                inner_fields.append('explode({field}) as {name}'.format(
                    field=".".join(f[:-2]), name=f[-3]))
                exploded = True
            of = ['a'] + f[-3:-1]
            outer_fields.append('{field} as {name}'.format(field=".".join(of), name=of[-1]))

    q = """select {outer_fields}
from (select {inner_fileds}
from {table_name}) a""".format(outer_fields=',\n'.join(outer_fields),
                               inner_fileds=',\n'.join(inner_fields),
                               table_name=table_name)
    return q