How to write code to read the output file to figure out how far it got in scraping a website, and then start from where it left off


Question


I'm writing a program to scrape the article title, date, and body text from each article in this website's archive and export them to a CSV file. The website seems to block me at some point, and I get this error: HTTPError: Service Unavailable.

I believe this is because I am accessing their website too many times in a short period. I want my code to be able to read where the error happened and pick up where it left off.

I've tried adding a 2-second delay after every 10 articles, and random delays after every ten articles as well. I could add longer delays, but I want the code to be able to pick up where it left off to be foolproof.

from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
from time import sleep
from random import randint

csvfile = "C:/Users/k/Dropbox/granularitygrowth/Politico/pol.csv"
with open(csvfile, mode='w', newline='', encoding='utf-8') as pol:
    csvwriter = csv.writer(pol, delimiter='~', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(["Date", "Title", "Article"])

    #for each page on Politico archive
    for p in range(0,412):
        url = urlopen("https://www.politico.com/newsletters/playbook/archive/%d" % p)
        content = url.read()

        #Parse article links from page
        soup = BeautifulSoup(content,"lxml")
        articleLinks = soup.findAll('article', attrs={'class':'story-frag format-l'})

        #Each article link on page
        for article in articleLinks:
            link = article.find('a', attrs={'target':'_top'}).get('href')

            #Open and read each article link
            articleURL = urlopen(link)
            articleContent = articleURL.read()

            #Parse body text from article page
            soupArticle = BeautifulSoup(articleContent, "lxml")

            #Limits to div class = story-text tag (where article text is)
            articleText = soupArticle.findAll('div', attrs={'class':'story-text'})
            for div in articleText:

                #Find date
                footer = div.find('footer', attrs={'class':'meta'})
                date = footer.find('time').get('datetime')
                print(date)

                #Find title
                headerSection = div.find('header')
                title = headerSection.find('h1').text
                print(title)

                #Find body text
                textContent = ""
                bodyText = div.findAll('p')
                # p.text is already a str; the loop variable is renamed so it
                # doesn't shadow the page counter p from the outer loop
                for para in bodyText:
                    textContent += para.text + ' '
                print(textContent)

                #Adds data to csv file
                csvwriter.writerow([date, title, textContent])

        sleep(randint(3, 8))  # sleep was imported from time, so call it directly

I expect my code to still hit this error, but then pick up from where it left off and continue printing and exporting data to the CSV file.


Answer 1:


You can count the number of records already saved in the CSV and integer-divide by 10 (the number of articles per page) to find the page to resume from: page = 1 + records // 10 (the +1 is because pages start at 1). For example, with 40 records saved you'd resume at page 1 + 40 // 10 = 5.

I've refactored your code like this:

import csv
import time
from random import randint
from urllib.request import urlopen

from bs4 import BeautifulSoup

HEADERS = ["Date", "Title", "Article"]


def count_rows(csv_path: str) -> int:
    with open(csv_path, encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        return len(list(reader))


def write_articles(csv_path: str, articles: list):
    # note the append mode, write mode would delete everything and start fresh
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f,
                                quoting=csv.QUOTE_MINIMAL,
                                fieldnames=HEADERS)
        writer.writerows(articles)


def init_csv(csv_path: str):
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=HEADERS, quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()


def get_page_soup(url: str) -> BeautifulSoup:
    response = urlopen(url)
    html = response.read()

    soup = BeautifulSoup(html, "lxml")
    return soup


def scrape_article(url: str) -> dict:
    article_soup = get_page_soup(url)

    # Limits to div class = story-text tag (where article text is)
    story_el = article_soup.select_one('.story-text')

    # find date
    date = story_el.select_one('.timestamp time')['datetime']

    # find title
    title = story_el.find('h1').text

    # find body text
    article_text = ''
    for p in story_el.find_all('p'):
        article_text += p.text + ' '

    return {
        'Title': title,
        'Date': date,
        'Article': article_text
    }


def main():
    csvfile = "test.csv"

    try:
        record_count = count_rows(csvfile)
    except FileNotFoundError:
        init_csv(csvfile)
        print('Initialized CSV file')
        record_count = 0

    article_per_page = 10
    page = 1 + record_count // article_per_page

    print('Continuing from page', page)

    for p in range(page, 413):
        # collect this page's articles only, so each page is written once
        articles = []
        url = "https://www.politico.com/newsletters/playbook/archive/%d" % p
        soup = get_page_soup(url)
        article_links = soup.select('article.story-frag.format-l')

        # Each article link on page
        for article in article_links:
            link = article.select_one('a[target=_top]')['href']
            scraped_article = scrape_article(link)
            print(scraped_article)
            articles.append(scraped_article)

        write_articles(csvfile, articles)
        print('Finished page', p)
        time.sleep(randint(3, 8))


if __name__ == '__main__':
    main()

This gives you output like the following:

Finished page 48
{'Title': 'Playbook: Scalise takes several Republicans to ...
{'Title': 'Playbook: Four unfolding events that show the  ...
{'Title': 'Playbook: Texas kicks off primary season, as D ...
{'Title': 'Playbook: The next gen: McCarthy and Crowley’s ...
{'Title': 'INSIDE THE GRIDIRON DINNER: What Trump said an ...
{'Title': 'DEMS spending millions already to boost vulner ...
{'Title': 'Playbook: Inside the Republican super PAC mone ...
{'Title': 'Playbook: Who would want to be White House com ...
{'Title': "Playbook: Jared Kushner's bad day", 'Date': '2 ...
{'Title': 'Playbook: Gun control quickly stalls in the Se ...
Finished page 49
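
One refinement worth noting (my own sketch, not part of the original answer): the code above simply crashes on an HTTPError and relies on you re-running it, with the resume logic picking the right page back up. You could instead retry inside get_page_soup with a growing delay, so a single run can ride out a temporary block. The retry count and base delay below are arbitrary assumptions:

import time
from urllib.error import HTTPError
from urllib.request import urlopen

from bs4 import BeautifulSoup


def get_page_soup(url: str, retries: int = 3, backoff: int = 30) -> BeautifulSoup:
    # Retry wrapper around the answer's get_page_soup: on a 503
    # (Service Unavailable) wait and try again, doubling the delay each
    # attempt; any other HTTP error, or a 503 that persists through all
    # retries, is re-raised.
    for attempt in range(retries):
        try:
            response = urlopen(url)
            return BeautifulSoup(response.read(), "lxml")
        except HTTPError as e:
            if e.code != 503 or attempt == retries - 1:
                raise
            wait = backoff * 2 ** attempt
            print('Got 503, retrying in', wait, 'seconds')
            time.sleep(wait)

Since write_articles runs after every page, anything scraped before a crash that still gets through is already on disk, and the next run resumes from the correct page.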


Source: https://stackoverflow.com/questions/56845903/how-to-write-code-to-read-output-file-to-figure-out-how-far-it-got-in-scraping-w
