问题
I am reading log files in my python code which contains some nested json data. I have a nested for loop containing 4 for-loops from which values of certain keys are extracted and appended to a dataframe.
The nested for-loop is taking too much time and I saw from other answers that multiprocessing is the way to go for nested for-loops but did not find an example for json data.
What is the best approach for this ? Below is my code to extract data from log files and into dataframes. recommendation_list
is a list of json objects.
for recommendation in recommendation_list:
if recommendation['type'] == "httpRequest":
session_id = recommendation['query'].split('sessionId=')[1].split('&')[0]
category_id = recommendation['query'].split('categoryId=')[1].split('&')[0]
if recommendation['type'] == "httpResponse":
recommendation_count = recommendation_count + 1
user_id = recommendation['userId']
time_stamp = recommendation['ts']
event_date = time_stamp.split("T")[0]
time = time_stamp.split("T")[-1]
try:
product_list = json.loads(recommendation['body'])['products']
except:
product_list = []
if len(product_list) > 0:
for product in product_list:
product_id = product["id"]
if 'recommendationMeta' in product:
data_frame = data_frame.append({
"transaction_id": last_id,
"user_id": user_id,
"session_id": session_id,
"category_id": category_id,
"product_id": product_id,
"date": event_date,
"time": time[0:12],
"event": "recommendation",
"ab_bucket": "B",
"recommendation_count": recommendation_count,
}, ignore_index=True)
for learning_unit in product['recommendationMeta']:
lu_name = learning_unit['lu']
lu_value = learning_unit['value']
recommendation_mode = learning_unit['recommendationMode']
prod_def1 = products[(products["product_id"] == product_id) &
(products["lu_value"].str.lower() == lu_value)]
if len(prod_def1) != 0:
product_list = prod_def1.to_dict('records')
for product_id in product_list:
category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
(categories["lu_name"].str.lower() == lu_name)]
if len(category) != 0:
product_def_id = product_id['product_def_id']
lu_df = lu_df.append({
"lu_data_id": lu_id,
"product_def_id": product_def_id,
"transaction_id": last_id,
"rec_mode": recommendation_mode,
}, ignore_index=True)
lu_id = lu_id+1
last_id = last_id + 1
I figure that the innermost for-loop executes most number of times and decided to use multiprocessing for it.
I replaced
for product_id in product_list:
category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
(categories["lu_name"].str.lower() == lu_name)]
if len(category) != 0:
product_def_id = product_id['product_def_id']
lu_df = lu_df.append({
"lu_data_id": lu_id,
"product_def_id": product_def_id,
"transaction_id": last_id,
"rec_mode": recommendation_mode,
}, ignore_index=True)
lu_id = lu_id+1
with this...
for product_id in product_list:
pool = Pool() # Create a multiprocessing Pool
data = pool.starmap(create_lu_data, [last_id, categories, recommendation_mode,
lu_name, lu_df, lu_id, product_id])
lu_id = lu_id + 1
p.close()
print(data)
where create_lu_data
is
def create_lu_data(last_id, categories, recommendation_mode, lu_name, lu_df, lu_id, product_id):
category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
(categories["lu_name"].str.lower() == lu_name)]
if len(category) != 0:
product_def_id = product_id['product_def_id']
lu_df = lu_df.append({
"lu_data_id": lu_id,
"product_def_id": product_def_id,
"transaction_id": last_id,
"rec_mode": recommendation_mode,
}, ignore_index=True)
return lu_df
I didn't get any errors, but the output dataframe has several times the expected number of rows.
来源:https://stackoverflow.com/questions/60387457/how-to-optimize-a-nested-for-loop-looping-over-json-data-to-extract-values-of-c