问题
αԋɱҽԃ αмєяιcαη helped me in constructing this code for scraping reviews from this page where reviews are dynamically loaded. I then tried to adjust it so that it scrapes not just the comment-body, but also the commentors' names, dates, and ratings, and for the code to save the extracted data into an excel file. But I failed to do so. Could someone help me in adjusting the code correctly?
This is the code from αԋɱҽԃ αмєяιcαη
import requests
from bs4 import BeautifulSoup
import math
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
for item in range(1, num):
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"comment-body\"'):
print(com.text[5:com.text.find(r"\n", 3)])
Main()
This is the code I adjusted but then got errors that I couldn't resolve
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd
df = pd.DataFrame()
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
for item in range(1, num):
names = []
headers = []
bodies = []
ratings = []
published = []
updated = []
reported = []
dateElements = []
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"user-review\"'):
names.append(article.find('div', attrs={'class': 'name'}).text.strip())
try:
bodies.append(article.find('div', attrs={'class': 'comment-body'}).text.strip())
except:
bodies.append('NA')
try:
ratings.append(article.find('meta', attrs={'itemprop': 'ratingValue'})['content'])
except:
ratings.append('NA')
dateElements.append(article.find('div', attrs={'class': 'comment-date'}).text.strip())
print(com.text[5:com.text.find(r"\n", 3)])
temp_df = pd.DataFrame(
{'User Name': names, 'Body': bodies, 'Rating': ratings, 'Published Date': dateElements})
df = df.append(temp_df, sort=False).reset_index(drop=True)
Main()
df.to_csv('Allure10.csv', index=False, encoding='utf-8')
print ('excel done')
回答1:
import requests
from bs4 import BeautifulSoup
import math
import csv
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
names = []
dates = []
comments = []
rating = []
for item in range(1, num):
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"comment-body\"'):
comments.append(com.text[5:com.text.find(r"\n", 3)])
for name in soup.findAll("div", class_=r'\"name\"'):
names.append(name.text[:name.text.find(r"<\/div>", 1)])
for date in soup.findAll("div", class_=r'\"comment-date\"'):
dates.append(date.text[:date.text.find(r"<\/div>", 1)])
for rate in soup.findAll("meta", itemprop=r'\"ratingValue\"'):
rating.append(rate.get("content")[2:-3])
return zip(names, dates, rating, comments)
def Save():
data = Main()
with open("oka.csv", 'w', newline="", encoding="UTF-8") as f:
writer = csv.writer(f)
writer.writerow(["Name", "Dates", "Rating", "Comments"])
writer.writerows(data)
Save()
Output: check-online
来源:https://stackoverflow.com/questions/60595024/web-scraping-dynamic-pages-adjusting-the-code