How to speed up parsing using BeautifulSoup?


Question


I want to make a list of music festivals in Korea, so I tried to crawl a website selling festival tickets:

import requests
from bs4 import BeautifulSoup

INTERPARK_BASE_URL = 'http://ticket.interpark.com'

# Festival List Page
req = requests.get('http://ticket.interpark.com/TPGoodsList.asp?Ca=Liv&SubCa=Fes')
html = req.text
soup = BeautifulSoup(html, 'lxml')
for title_raw in soup.find_all('span', class_='fw_bold'):
    title = str(title_raw.find('a').text)
    url_raw = str(title_raw.find('a').get('href'))
    url = INTERPARK_BASE_URL + url_raw

    # Detail Page
    req_detail = requests.get(url)
    html_detail = req_detail.text
    soup_detail = BeautifulSoup(html_detail, 'lxml')
    details_1 = soup_detail.find('table', class_='table_goods_info')
    details_2 = soup_detail.find('ul', class_='info_Lst')
    image = soup_detail.find('div', class_='poster')

    singers = str(details_1.find_all('td')[4].text)
    place = str(details_1.find_all('td')[5].text)
    date_text = str(details_2.find('span').text)
    image_url = str(image.find('img').get('src'))

    print(title)
    print(url)
    print(singers)
    print(place)
    print(date_text)
    print(image_url)

I used a for loop to visit every detail page linked from the list page, but loading each detail page is too slow.

How can I speed up my code?


Answer 1:


import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime as dt
import csv


def Soup(content):
    return BeautifulSoup(content, 'html.parser')


def Main(url):
    # Fetch the list page and collect the absolute URL of every detail page.
    r = requests.get(url)
    soup = Soup(r.content)
    spans = soup.find_all('span', class_='fw_bold')
    # url[:27] is the site root, "http://ticket.interpark.com"
    links = [f"{url[:27]}{span.a['href']}" for span in spans]
    return links


def Parent():
    links = Main(
        "http://ticket.interpark.com/TPGoodsList.asp?Ca=Liv&SubCa=Fes")
    with open("result.csv", 'w', newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Singers", "Location", "Date", "ImageUrl"])
        # Reuse one connection for all detail pages instead of opening a new
        # one per request.
        with requests.Session() as req:
            for link in links:
                r = req.get(link)
                soup = Soup(r.content)
                # Each detail page embeds its data as JSON-LD, so one parse
                # replaces several separate find() calls.
                script = json.loads(
                    soup.find("script", type="application/ld+json").text)
                name = script["name"]
                print(f"Extracting: {name}")
                singers = script["performer"]["name"]
                location = script["location"]["name"]
                # The 4th and 5th values are assumed to be the start and end
                # dates in YYYYMMDD form.
                datelist = list(script.values())[3:5]
                datest = []
                image = script["image"]
                for date in datelist:
                    date = dt.strptime(date,
                                       '%Y%m%d').strftime('%d-%m-%Y')
                    datest.append(date)
                writer.writerow(
                    [name, singers, location, " : ".join(datest), *image])


Parent()
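
This version is faster mainly because it reuses a single requests.Session connection and reads each detail page's embedded JSON-LD block in one parse instead of several find() calls. If the pages are still slow to load, the remaining bottleneck is the sequential network requests themselves. Below is a minimal sketch (not part of the original answer) of fetching the detail pages concurrently with concurrent.futures.ThreadPoolExecutor; the parse_detail helper and the worker count are illustrative assumptions:

import json
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

BASE = "http://ticket.interpark.com"
LIST_URL = BASE + "/TPGoodsList.asp?Ca=Liv&SubCa=Fes"


def get_links():
    # Collect the absolute URL of every detail page from the list page.
    soup = BeautifulSoup(requests.get(LIST_URL).content, "html.parser")
    return [BASE + span.a["href"]
            for span in soup.find_all("span", class_="fw_bold")]


def parse_detail(link):
    # Hypothetical helper: download one detail page and read its JSON-LD block.
    soup = BeautifulSoup(requests.get(link).content, "html.parser")
    data = json.loads(soup.find("script", type="application/ld+json").text)
    return data["name"], data["performer"]["name"], data["location"]["name"]


if __name__ == "__main__":
    links = get_links()
    # Fetch several detail pages at the same time; 8 workers is an arbitrary choice.
    with ThreadPoolExecutor(max_workers=8) as pool:
        for name, singers, place in pool.map(parse_detail, links):
            print(name, singers, place)

pool.map returns results in the same order as the links, so rows can still be written to the CSV in list order.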


Source: https://stackoverflow.com/questions/60794997/how-to-speed-up-parsing-using-beautifulsoup
