How to scrape multiple results that share the same tags and class

核能气质少年 提交于 2020-03-28 06:41:46

问题


My code works for a single page, but when I run it over multiple records in a for loop, missing data breaks the extraction. I select the person, location, phone number and cell number by index ([1], [2], …), so if a field such as the person name is missing, the value that belongs to the next field is extracted into the person variable instead. Could you please fix this issue? Here is my code:

import requests
from bs4 import BeautifulSoup
import re


def get_page(url):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Raises requests.HTTPError on a non-2xx response.  (The original
    version only printed the status code and then hit an
    UnboundLocalError on ``return soup`` because ``soup`` was never
    assigned on the failure path.)
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        response.raise_for_status()  # fail loudly instead of crashing later
    return BeautifulSoup(response.text, 'lxml')  # 1. html, 2. parser

def get_detail_data(soup):
    """Extract business details from a hipages detail-page soup.

    Returns a dict with title, person name, address, phone/cell numbers,
    ABN and website.  Any field that cannot be found gets an
    'Empty ...' placeholder instead of raising.
    """
    def first_text(finder, default):
        # Catch only the "element missing" errors (None.text ->
        # AttributeError, empty result list -> IndexError) instead of a
        # bare except that would also hide real bugs.
        try:
            return finder()
        except (AttributeError, IndexError):
            return default

    title = first_text(
        lambda: soup.find("h1", {'class': 'sc-AykKI'}).text, 'Empty Title')

    # Fetch the contact spans once; the original queried twice.
    contact_items = soup.findAll(
        "span", {'class': 'Contact__Item-sc-1giw2l4-2 kBpGee'})
    person = first_text(lambda: contact_items[0].text.strip(), 'Empty Person')
    addr = first_text(lambda: contact_items[1].text, 'Empty Address')

    # NOTE(review): computed but never returned in the original either;
    # kept for parity — consider adding it to the result dict.
    ratting = first_text(
        lambda: soup.find(
            "div", {'class': 'Rating__RatingText-sc-1r9ytu8-1 jIdgkl'}).text,
        'Empty Ratting')

    def json_field(key, default):
        # The page embeds an escaped JSON blob in the HTML, hence the
        # literal \" sequences in the pattern.
        match = re.search(key + '\\\\":\\\\"(.*?)\\\\"', soup.text)
        return match.group(1) if match is not None else default

    abn = json_field('abn', 'Empty ABN')
    website = json_field('website', 'Empty Website')
    phone = json_field('phone', 'Empty Phone No')
    cell = json_field('mobile', 'Empty Cell No')

    return {
        'title': title,
        'peron name': person,  # NOTE(review): key typo kept for compatibility
        'address': addr,
        'phone no': phone,
        'cell no': cell,
        'abn no': abn,
        'website': website,
    }
def get_index_data(soup):
    """Collect the absolute detail-page URLs linked from a results soup."""
    headings = soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'})
    return [
        f"https://hipages.com.au{h.previous_element.get('href')}"
        for h in headings
    ]

def Main():
    """Scrape and print details for every listing on the results page."""
    # Removed the unused hard-coded detail URL the original declared.
    mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
    for detail_url in get_index_data(get_page(mainurl)):
        print(get_detail_data(get_page(detail_url)))


if __name__ == '__main__':  # don't fire network requests on import
    Main()

回答1:


You need to parse your data from the script tag rather than the spans and divs.

Try this:

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from pandas import json_normalize
import json

def get_page(url):
    """Fetch *url* and return a parsed BeautifulSoup document.

    Raises requests.HTTPError on a non-2xx response; the original
    returned an unbound local (`soup`) in that case, which raised
    UnboundLocalError.
    """
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
        response.raise_for_status()  # surface the failure explicitly
    return BeautifulSoup(response.text, 'lxml')

def get_detail_data(url):
    """Download a hipages detail page and return its data as a DataFrame.

    The page ships its data as a JSON blob assigned to
    ``window.__INITIAL_STATE__`` inside a <script> tag; parsing that is
    more robust than scraping spans/divs whose CSS classes change.
    """
    res = requests.get(url)
    # Removed the unused BeautifulSoup parse the original did here —
    # it cost a full lxml pass and the soup was never referenced.

    # Everything between the marker and the closing </script> is the payload.
    raw = res.text.split("<script> window.__INITIAL_STATE__=")[1]
    raw = raw.split("</script>")[0]
    # The payload is double-encoded: a JSON string whose value is JSON.
    data = json.loads(json.loads(raw))

    cols = ['abn', 'address', 'name', 'primary_location', 'service_area',
            'state', 'suburb', 'website']
    df = pd.DataFrame(data["sites"]["list"]).T
    df = df[cols].reset_index(drop=True)

    # Flatten the nested primary_location dict into top-level columns.
    primary_location = json_normalize(df.primary_location[0])
    df = pd.concat([df, primary_location], axis=1)
    to_drop = ["primary_location", "is_primary", "suburb_seo_key",
               "capital_city_seo_key"]
    df.drop(to_drop, axis=1, inplace=True)

    return df


def get_index_data(soup):
    """Return the absolute URL for every result heading on the index page."""
    base = "https://hipages.com.au"
    urls = []
    for heading in soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
        href = heading.previous_element.get('href')
        urls.append(f"{base}{href}")
    return urls

def Main():
    """Fetch detail DataFrames for every listing on the results page."""
    mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
    listing_urls = get_index_data(get_page(mainurl))
    return [get_detail_data(listing_url) for listing_url in listing_urls]

data = Main()

df = pd.concat(data).reset_index(drop=True)
# `display` only exists inside IPython/Jupyter; `print` works everywhere,
# so the script no longer raises NameError when run as plain Python.
print(df)

This gives you much more detailed data by the way.



来源:https://stackoverflow.com/questions/60556883/how-to-scrape-multiple-result-having-same-tags-and-class

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!