Naver Crawler: Combining DataFrame per each loop Python

匿名 (未验证) 提交于 2019-12-03 01:09:02

问题:

I am working on my Naver crawler (Naver is Korea's Google :P). I have been working on this code for a week now, and I have one last task to solve! The code below crawls data through the Naver API, receiving the response into "js" on each loop iteration. All I need to do is take the DataFrame built in each iteration (dfdfdf) and combine them all at the bottom, but my result always contains only the data from the last iteration. The bottom line is that I want to accumulate the DataFrame from every iteration. I tried merge and join, but neither seems to work. Please let me know how to do this, and if my code below does not make sense (or is too messy), let me know!

import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time
from urllib.error import HTTPError

ex = pd.ExcelFile('mat_hierarchy.xlsx').parse('Sheet1')

# Collect keywords from the 4th column, then the 3rd, and drop duplicates
# while keeping first-seen order.
DNA1 = []
DNA1.extend(ex.iloc[:, 3])
DNA1.extend(ex.iloc[:, 2])
seen = set()
DNA = []
for item in DNA1:
    if item not in seen:
        seen.add(item)
        DNA.append(item)

# Weekly (Monday) index from 2016-01-01 up to two days ago; every response is
# left-joined onto this index, so only these dates survive in the output.
dd = pd.date_range(start='2016-01-01', end=datetime.now().date() - timedelta(2), freq='W-MON')

setendDate = datetime.now().date() - timedelta(1)
endDate = setendDate.strftime('%Y-%m-%d')

# Empty frame that only carries the date index.
Data = pd.DataFrame(index=dd)

# Naver API connection
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"

# Request body fragments. A body is
#   body_intro + endDate + body_endDate + <groups> + body_last
# where each group is  keyword + body_keywords + keyword  and consecutive
# groups are separated by body_groupName.
body_intro = "{\"startDate\":\"2016-01-01\",\"endDate\":\""
body_endDate = "\",\"timeUnit\":\"date\",\"keywordGroups\":[{\"groupName\":\""
body_keywords = "\",\"keywords\":[\""
body_groupName = "\"]},{\"groupName\":\""
body_last = "\"]}],\"ages\":[\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\"10\",\"11\"]}"

# The original loop started at keyword 2270 (presumably resuming an earlier
# run -- set to 0 to crawl every keyword) and sent five keywords per request.
START_INDEX = 2270
CHUNK_SIZE = 5


def _build_body(keywords):
    """Return the JSON request body holding one keyword group per keyword."""
    groups = body_groupName.join(k + body_keywords + k for k in keywords)
    return body_intro + endDate + body_endDate + groups + body_last


def _fetch_json(body):
    """POST *body* to the API and return the decoded JSON, or None on failure.

    urlopen raises HTTPError for 4xx/5xx answers, so both that exception and
    an unexpected non-200 status are reported and turned into None instead of
    letting the caller parse a missing or stale payload.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    try:
        with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
            rescode = response.getcode()
            if rescode != 200:
                # rescode is an int; it must be converted before concatenation.
                print("Error Code:" + str(rescode))
                return None
            return json.loads(response.read().decode('utf-8'))
    except HTTPError as err:
        print("Error Code:" + str(err.code))
        return None


def _results_to_frame(d):
    """Turn one decoded API response into a frame aligned on the weekly index.

    Each result becomes one column named after its title. A result without
    data becomes a single NaN row at the response's startDate so the column
    still exists after the join.
    """
    lst = [
        pd.DataFrame(r['data']).set_index('period').rename(columns={'ratio': r['title']})
        if len(r['data']) > 0
        else pd.DataFrame([np.nan], columns=[r['title']], index=[d['startDate']])
        for r in d['results']
    ]
    df = pd.concat(lst, axis=1)
    # 'period' labels are strings; parse them explicitly rather than relying
    # on the join to coerce them to match Data's DatetimeIndex.
    df.index = pd.to_datetime(df.index)
    return Data.join(df)


df_list = []

for i in range(START_INDEX, len(DNA), CHUNK_SIZE):
    # Slicing yields a full group of five, or the shorter remainder at the end.
    chunk = DNA[i:i + CHUNK_SIZE]
    print(len(chunk))

    d = _fetch_json(_build_body(chunk))
    if d is not None:
        df_list.append(_results_to_frame(d))

    print("end")
    time.sleep(.5)

# Every frame shares the same date index, so combine them column-wise.
# pd.concat rejects an empty list; fall back to the bare date index.
Final = pd.concat(df_list, axis=1) if df_list else Data.copy()
Final.to_csv("Naver123.csv")

回答1:

Consider collecting the dataframes in a list and concatenating them outside of the for loop. Whereas each loop iteration runs a horizontal merge, the final master combine runs a vertical append.

Also, for a DRY-er solution, consider defining a function that turns the response into a dataframe, taking as its parameter the body variable, which is the only thing that differs between the if blocks.

...  # setup from the question (imports, DNA, Data, endDate, url, credentials, body_* fragments)


def response_to_df(body):
    """POST *body* to the API and return the result joined onto ``Data``.

    Returns a DataFrame with one column per result title, aligned on the
    index of ``Data``, or None when the request fails (the error code is
    printed).
    """
    from urllib.error import HTTPError

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    try:
        with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
            rescode = response.getcode()
            if rescode != 200:
                # rescode is an int; convert it before concatenating.
                print("Error Code:" + str(rescode))
                return None
            js = response.read().decode('utf-8')
    except HTTPError as err:
        # urlopen raises for 4xx/5xx instead of returning the status code.
        print("Error Code:" + str(err.code))
        return None

    d = json.loads(js)
    # A result without data has no 'period' column to index on, so it becomes
    # a single NaN row at the response's startDate instead.
    lst = [
        pd.DataFrame(r['data']).set_index('period').rename(columns={'ratio': r['title']})
        if len(r['data']) > 0
        else pd.DataFrame([float("nan")], columns=[r['title']], index=[d['startDate']])
        for r in d['results']
    ]

    # HORIZONTAL MERGE
    df = pd.concat(lst, axis=1)
    # 'period' labels are strings; parse them explicitly rather than relying
    # on the join to coerce them to match Data's index.
    df.index = pd.to_datetime(df.index)
    df = Data.join(df)
    return df


df_list = []
# range(len(DNA), 5) would start at len(DNA) and never iterate; step through
# the keywords five at a time instead.
for i in range(0, len(DNA), 5):
    # Full group of five, or the shorter remainder on the last iteration.
    chunk = DNA[i:i + 5]
    body = body_intro + endDate + body_endDate + \
           body_groupName.join(k + body_keywords + k for k in chunk) + \
           body_last
    print(len(chunk))

    tmp = response_to_df(body)
    if tmp is not None:
        df_list.append(tmp)

# Combining all Data (VERTICAL APPEND): the rows of every chunk are stacked,
# so the date index repeats once per chunk. Use axis=1 instead to get one
# column per keyword on a single shared date index.
# pd.concat rejects an empty list; fall back to the bare date index.
Naver = pd.concat(df_list, axis=0) if df_list else Data.copy()
print("ddd")
Naver


标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!