Web Scraping Dynamic Pages - Adjusting the code

有些话、适合烂在心里 提交于 2020-03-25 18:46:08

问题


αԋɱҽԃ αмєяιcαη helped me construct this code for scraping reviews from this page, where the reviews are loaded dynamically. I then tried to adjust it so that it scrapes not just the comment body but also the commenters' names, dates, and ratings, and saves the extracted data to an Excel file, but I failed to do so. Could someone help me adjust the code correctly?

This is the code from αԋɱҽԃ αмєяιcαη:

import requests
from bs4 import BeautifulSoup
import math


def PageNum():
    """Return the exclusive upper bound for the review-page loop.

    Fetches the BoxyCharm box page, reads the number embedded in the text of
    the ``a.show-more-reviews`` link and turns it into the ``stop`` value that
    ``Main`` passes to ``range(1, stop)``.

    Returns:
        int: ``num // 3 + 1`` when the count is a multiple of 3, otherwise
        ``math.ceil(num / 3) + 2``.

    Raises:
        AttributeError: if the page has no ``a.show-more-reviews`` element.
        IndexError, ValueError: if the link text does not have the expected
            "word word word (N) ..." shape.
    """
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    # The count is the 4th space-separated token of the link text; [1:-1]
    # drops one wrapping character on each side (presumably parentheses).
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        # Floor division keeps the result an int. The previous `num / 3`
        # produced a float, which range() rejects with TypeError.
        return num // 3 + 1
    return math.ceil(num / 3) + 2


def Main():
    """Print the review text found on every review page.

    Asks ``PageNum`` for the loop bound, then requests each page of the
    ``get_user_reviews`` endpoint with an ``X-Requested-With: XMLHttpRequest``
    header and prints a slice of the text of every comment-body element.
    """
    # PageNum() may hand back a float; range() only accepts integers.
    num = int(PageNum())
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            # The class value is matched including its backslash-escaped quotes.
            # NOTE(review): presumably the endpoint returns HTML embedded in an
            # escaped string rather than plain HTML - confirm against a response.
            for com in soup.find_all("div", class_=r'\"comment-body\"'):
                # Skip the first 5 characters and stop at the first literal
                # backslash-n sequence found from index 3 onwards.
                print(com.text[5:com.text.find(r"\n", 3)])


# Script entry point: run the scraper immediately.
Main()

This is my adjusted code, which raises errors that I couldn't resolve:

import requests
from bs4 import BeautifulSoup
import math
import pandas as pd

# Module-level result table: Main() assigns to it and the end of the script
# writes it out with df.to_csv().
df = pd.DataFrame()

def PageNum():
    """Return the exclusive upper bound for the review-page loop.

    Fetches the BoxyCharm box page, reads the number embedded in the text of
    the ``a.show-more-reviews`` link and turns it into the ``stop`` value that
    ``Main`` passes to ``range(1, stop)``.

    Returns:
        int: ``num // 3 + 1`` when the count is a multiple of 3, otherwise
        ``math.ceil(num / 3) + 2``.

    Raises:
        AttributeError: if the page has no ``a.show-more-reviews`` element.
        IndexError, ValueError: if the link text does not have the expected
            "word word word (N) ..." shape.
    """
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    # The count is the 4th space-separated token of the link text; [1:-1]
    # drops one wrapping character on each side (presumably parentheses).
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        # Floor division keeps the result an int. The previous `num / 3`
        # produced a float, which range() rejects with TypeError.
        return num // 3 + 1
    return math.ceil(num / 3) + 2


def _clean_fragment(text):
    """Return the part of *text* before the first escaped closing-div marker.

    Literal backslash-n sequences are replaced by spaces and the result is
    stripped of surrounding whitespace.
    """
    # NOTE(review): assumes the response carries escaped markup, so closing
    # tags and newlines arrive as literal text - confirm against a response.
    head = text.partition(r"<\/div>")[0]
    return head.replace(r"\n", " ").strip()


def _div_text(review, class_name):
    """Return the cleaned text of the first matching div in *review*, or 'NA'."""
    node = review.find('div', class_=class_name)
    return 'NA' if node is None else _clean_fragment(node.text)


def Main():
    """Scrape every review page and store the rows in the module-level ``df``.

    For each ``user-review`` element the reviewer name, comment body, rating
    and date are collected; a field that cannot be found is recorded as
    ``'NA'`` so all columns stay the same length. The resulting rows are
    appended to the global ``df`` with the columns ``User Name``, ``Body``,
    ``Rating`` and ``Published Date``.
    """
    # Without this declaration the assignment below makes `df` a local name
    # and reading it raises UnboundLocalError.
    global df

    # PageNum() may hand back a float; range() only accepts integers.
    num = int(PageNum())
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    rows = []
    with requests.Session() as req:
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            # All selectors use the same backslash-escaped attribute values as
            # the outer `user-review` match.
            # NOTE(review): if the parser never sees real closing tags, a
            # lookup inside one review may reach into the next - verify.
            for com in soup.find_all("div", class_=r'\"user-review\"'):
                meta = com.find('meta', itemprop=r'\"ratingValue\"')
                content = meta.get('content') if meta is not None else None
                rows.append({
                    'User Name': _div_text(com, r'\"name\"'),
                    'Body': _div_text(com, r'\"comment-body\"'),
                    # Drop the escaped quotes / trailing slash around the value.
                    'Rating': content.strip('\\"/ ') if content else 'NA',
                    'Published Date': _div_text(com, r'\"comment-date\"'),
                })

    scraped = pd.DataFrame(
        rows, columns=['User Name', 'Body', 'Rating', 'Published Date'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    if df.empty:
        df = scraped
    else:
        df = pd.concat([df, scraped], sort=False, ignore_index=True)

# Run the scraper, then dump the module-level table to disk.
Main()

df.to_csv('Allure10.csv', encoding='utf-8', index=False)
print('excel done')

回答1:


import requests
from bs4 import BeautifulSoup
import math
import csv


def PageNum():
    """Return the exclusive upper bound for the review-page loop.

    Fetches the BoxyCharm box page, reads the number embedded in the text of
    the ``a.show-more-reviews`` link and turns it into the ``stop`` value that
    ``Main`` passes to ``range(1, stop)``.

    Returns:
        int: ``num // 3 + 1`` when the count is a multiple of 3, otherwise
        ``math.ceil(num / 3) + 2``.

    Raises:
        AttributeError: if the page has no ``a.show-more-reviews`` element.
        IndexError, ValueError: if the link text does not have the expected
            "word word word (N) ..." shape.
    """
    r = requests.get(
        "https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
    soup = BeautifulSoup(r.text, 'html.parser')
    # The count is the 4th space-separated token of the link text; [1:-1]
    # drops one wrapping character on each side (presumably parentheses).
    num = int(
        soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
    if num % 3 == 0:
        # Floor division keeps the result an int. The previous `num / 3`
        # produced a float, which range() rejects with TypeError.
        return num // 3 + 1
    return math.ceil(num / 3) + 2


def _before(text, marker, start=0, skip=0):
    """Return ``text[skip:i]`` where ``i`` is the first *marker* at or after *start*.

    When the marker is absent the text from *skip* to the end is returned, so
    a missing marker no longer chops off the final character.
    """
    end = text.find(marker, start)
    return text[skip:] if end == -1 else text[skip:end]


def Main():
    """Scrape all review pages and return the collected fields.

    Returns:
        zip: an iterator of ``(name, date, rating, comment)`` tuples built
        from four lists that are filled independently for every page.
    """
    # PageNum() may hand back a float; range() only accepts integers.
    num = int(PageNum())
    headers = {
        'X-Requested-With': 'XMLHttpRequest'
    }
    with requests.Session() as req:
        names = []
        dates = []
        comments = []
        rating = []
        for item in range(1, num):
            print(f"Extracting Page# {item}")
            r = req.get(
                f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            # Attribute values are matched including their backslash-escaped
            # quotes, and text is cut at literal escaped markers.
            # NOTE(review): presumably the endpoint returns HTML embedded in an
            # escaped string - confirm against a real response.
            for com in soup.find_all("div", class_=r'\"comment-body\"'):
                comments.append(_before(com.text, r"\n", start=3, skip=5))
            for name in soup.find_all("div", class_=r'\"name\"'):
                names.append(_before(name.text, r"<\/div>", start=1))
            for date in soup.find_all("div", class_=r'\"comment-date\"'):
                dates.append(_before(date.text, r"<\/div>", start=1))
            for rate in soup.find_all("meta", itemprop=r'\"ratingValue\"'):
                # Drop 2 leading and 3 trailing characters around the value.
                rating.append(rate.get("content")[2:-3])
    # NOTE: zip() stops at the shortest list, and the lists are filled
    # independently, so a review missing one field shifts later rows.
    return zip(names, dates, rating, comments)


def Save():
    """Run ``Main`` and write its rows, preceded by a header, to ``oka.csv``."""
    column_titles = ["Name", "Dates", "Rating", "Comments"]
    rows = Main()
    with open("oka.csv", 'w', newline="", encoding="UTF-8") as out_file:
        # One call writes the header row followed by every scraped row.
        csv.writer(out_file).writerows([column_titles, *rows])


# Script entry point: scrape the reviews and write the CSV file.
Save()

Output: check-online



来源:https://stackoverflow.com/questions/60595024/web-scraping-dynamic-pages-adjusting-the-code

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!