I have a dataframe df that loads data from a database. Most of the columns are json strings while some are even list of jsons. For example:
id
TL;DR Copy-paste the following function and use it like this: flatten_nested_json_df(df)
This is the most general function I could come up with:
def flatten_nested_json_df(df):
df = df.reset_index()
print(f"original shape: {df.shape}")
print(f"original columns: {df.columns}")
# search for columns to explode/flatten
s = (df.applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df.applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
print(f"lists: {list_columns}, dicts: {dict_columns}")
while len(list_columns) > 0 or len(dict_columns) > 0:
new_columns = []
for col in dict_columns:
print(f"flattening: {col}")
# explode dictionaries horizontally, adding new columns
horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
horiz_exploded.index = df.index
df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
new_columns.extend(horiz_exploded.columns) # inplace
for col in list_columns:
print(f"exploding: {col}")
# explode lists vertically, adding new columns
df = df.drop(columns=[col]).join(df[col].explode().to_frame())
new_columns.append(col)
# check if there are still dict o list fields to flatten
s = (df[new_columns].applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df[new_columns].applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
print(f"lists: {list_columns}, dicts: {dict_columns}")
print(f"final shape: {df.shape}")
print(f"final columns: {df.columns}")
return df
It takes a dataframe that may have nested lists and/or dicts in its columns, and recursively explodes/flattens those columns.
It uses pandas' pd.json_normalize to explode the dictionaries (creating new columns), and pandas' explode to explode the lists (creating new rows).
Simple to use:
# Test
df = pd.DataFrame(
columns=['id','name','columnA','columnB'],
data=[
[1,'John',{"dist": "600", "time": "0:12.10"},[{"pos": "1st", "value": "500"},{"pos": "2nd", "value": "300"},{"pos": "3rd", "value": "200"}, {"pos": "total", "value": "1000"}]],
[2,'Mike',{"dist": "600"},[{"pos": "1st", "value": "500"},{"pos": "2nd", "value": "300"},{"pos": "total", "value": "800"}]]
])
flatten_nested_json_df(df)
It's not the most efficient thing on earth, and it has the side effect of resetting your dataframe's index, but it gets the job done. Feel free to tweak it.