Naver Crawler: Combining DataFrame per each loop Python

放肆的年华 提交于 2019-12-14 03:02:14

问题


I am working on my Naver crawler (Naver is Korea's Google :P). I have been working on this code for a week now, and I have one last task to solve! The code below crawls data through the Naver API and receives the response into "js" on each loop iteration. All I need to do is collect the DataFrame produced by each iteration (dfdfdf) and combine them all at the bottom. But my result always shows only the data from the last iteration. The bottom line is that I want to accumulate a DataFrame for every iteration of the loop. I tried merge and join, but neither seems to work. Please let me know how to do this, and if my code below does not make sense (or is too messy), let me know!

import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time

# Load the keyword sheet. The context manager closes the workbook handle as
# soon as the sheet has been parsed instead of leaving it open.
with pd.ExcelFile('mat_hierarchy.xlsx') as workbook:
    ex = workbook.parse('Sheet1')

# Candidate keywords: every value from positional column 3, followed by every
# value from positional column 2.
DNA1 = []
DNA1.extend(ex.iloc[:, 3])
DNA1.extend(ex.iloc[:, 2])

# Drop duplicates while keeping the first-seen order of the keywords.
seen = set()
DNA = []
for item in DNA1:
    if item not in seen:
        seen.add(item)
        DNA.append(item)

# len(DNA)

# Date index: one entry per Monday, from 2016-01-01 up to two days ago.
# (Dropping the freq argument would give a daily index instead.)
dd = pd.date_range(
    start='2016-01-01',
    end=datetime.now().date() - timedelta(days=2),
    freq='W-MON',
)

# Yesterday's date, also kept as a YYYY-MM-DD string.
setendDate = datetime.now().date() - timedelta(days=1)
endDate = setendDate.isoformat()

# Column-less frame that only carries the date index.
Data = pd.DataFrame(index=dd)

# Naver API credentials and endpoint.
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"

# Fragments of the JSON request body. They are concatenated around the end
# date and the keywords to form one request.
body_intro = '{"startDate":"2016-01-01","endDate":"'
body_endDate = '","timeUnit":"date","keywordGroups":[{"groupName":"'
body_keywords = '","keywords":["'
body_groupName = '"]},{"groupName":"'
body_last = '"]}],"ages":[' + ','.join('"%d"' % age for age in range(1, 12)) + ']}'

def _escape_json_text(keyword):
    """Return *keyword* escaped for embedding inside a JSON string literal.

    Raises:
        TypeError: if *keyword* is not a string.
    """
    if not isinstance(keyword, str):
        raise TypeError("keyword must be a string, got %r" % (keyword,))
    # json.dumps yields a quoted literal; the surrounding quotes are stripped
    # because the body templates already supply them.
    return json.dumps(keyword, ensure_ascii=False)[1:-1]


def _build_body(keywords):
    """Build the JSON request body for one batch of keywords.

    Every keyword becomes its own keyword group, named after the keyword.
    """
    escaped = [_escape_json_text(keyword) for keyword in keywords]
    groups = body_groupName.join(kw + body_keywords + kw for kw in escaped)
    return body_intro + endDate + body_endDate + groups + body_last


def _fetch_results(body):
    """POST *body* to the API and return the decoded JSON response.

    Returns None, after printing the status code, when the status is not 200,
    so the caller never parses a response left over from an earlier request.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The with-statement closes the HTTP response once it has been read.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int, so it must be converted before concatenation.
            print("Error Code:" + str(rescode))
            return None
        return json.loads(response.read().decode('utf-8'))


def _results_to_frame(payload):
    """Turn one API response into a frame with one column per result title."""
    frames = []
    for result in payload['results']:
        if len(result['data']) > 0:
            frame = (pd.DataFrame(result['data'])
                     .set_index('period')
                     .rename(columns={'ratio': result['title']}))
        else:
            # No data for this keyword: keep its column as a single NaN row.
            frame = pd.DataFrame([np.nan], columns=[result['title']],
                                 index=[payload['startDate']])
        # Parse the date labels so the later join against the DatetimeIndex of
        # Data matches on timestamps instead of relying on implicit coercion.
        frame.index = pd.to_datetime(frame.index)
        frames.append(frame)
    return pd.concat(frames, axis=1)


# Number of keywords sent per request.
batch_size = 5
# NOTE(review): the crawl starts at keyword 2270 rather than 0 -- presumably
# resuming an interrupted run; confirm before reusing this script.
start_index = 2270

df_list = []
for i in range(start_index, len(DNA), batch_size):
    # Slicing yields a full batch, or the shorter remainder at the end of DNA.
    batch = DNA[i:i + batch_size]
    print(str(len(batch)))

    payload = _fetch_results(_build_body(batch))
    if payload is not None:
        df_list.append(Data.join(_results_to_frame(payload)))

    print("end")
    time.sleep(.5)

# Combine the per-batch frames side by side (they share the index of Data).
# pd.concat rejects an empty list, so fall back to the bare date index.
if df_list:
    Final = pd.concat(df_list, axis=1)
else:
    Final = Data.copy()
Final.to_csv("Naver123.csv")

回答1:


Consider using a list of dataframes that is concatenated outside of the for loop. Whereas each individual iteration runs a horizontal merge, the final master combine runs a vertical append.

Also, for a DRY-er solution, consider defining a function that converts the response to a dataframe and takes the body variable as a parameter, since the body is the only difference between the if blocks.

...
def response_to_df(body):
    """Send one request body to the API and return the result joined onto ``Data``.

    Args:
        body: The JSON request body, as a string.

    Returns:
        ``Data`` joined with one column per entry in the response's
        ``results`` list; each column is named after the entry's ``title``.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The with-statement closes the HTTP response once it has been read.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int, so it must be converted before concatenation.
            # Raising here avoids parsing a response that was never read.
            raise RuntimeError("Error Code:" + str(rescode))
        js = response.read().decode('utf-8')

    d = json.loads(js)
    lst = []
    for r in d['results']:
        if len(r['data']) > 0:
            frame = (pd.DataFrame(r['data'])
                     .set_index('period')
                     .rename(columns={'ratio': r['title']}))
        else:
            # No data for this keyword: keep its column as a single NaN row
            # rather than failing on the missing 'period' column.
            frame = pd.DataFrame([float('nan')], columns=[r['title']],
                                 index=[d['startDate']])
        # Parse the date labels so the join against Data matches on timestamps
        # instead of relying on implicit coercion.
        frame.index = pd.to_datetime(frame.index)
        lst.append(frame)

    # HORIZONTAL MERGE
    df = pd.concat(lst, axis=1)
    df = Data.join(df)
    return df


df_list = []
# Walk the keywords five at a time, starting from the first one. The start
# must be given explicitly: range(len(DNA), 5) would treat len(DNA) as the
# start and 5 as the stop.
for i in range(0, len(DNA), 5):
    # Slicing yields a full batch of five, or the shorter remainder at the end.
    batch = DNA[i:i + 5]

    # One keyword group per keyword, each named after the keyword itself.
    groups = body_groupName.join(kw + body_keywords + kw for kw in batch)
    body = body_intro + endDate + body_endDate + groups + body_last
    print(str(len(batch)))

    tmp = response_to_df(body)
    df_list.append(tmp)


# Combining all Data (VERTICAL APPEND)
# NOTE(review): every frame is Data joined with that batch's own columns, so
# stacking on axis=0 repeats the index once per batch; confirm whether axis=1
# (side-by-side columns) is what is actually wanted.
Naver = pd.concat(df_list, axis=0)
print("ddd")
Naver


来源:https://stackoverflow.com/questions/47619920/naver-crawler-combining-dataframe-per-each-loop-python

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!