Some data are not appearing while scrapping using for loop in Selenium, Python?

问题

I am scrapping booking.com for multiple pages using for loop and selenium web driver. However, some of the items are not appearing. Items are available when I checked the pages. Can you please advise what would be the problem and solution for this? I checked other posts here and they all advised to use a timer. I used timer whenever it reads a new page but not successful.

I can able to get the complete record if I scrape single page but it consumes a lot of time. Hence, I wanted to automate the same. Each page gives 28 records and second page offset to 25 as per the booking.com link.

Here I tried to extract hotels for wellington and it has 4 pages. I tested for two pages as per my code. Please help and advise what went wrong?

My code below


#Importing necessary library

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re
import requests

from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt


from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from itertools import zip_longest

# Creating an empty list for hotel name, ratings, locations, and description links and appending the list using loop
names = []
rating = []
location = []
links = []
reviews = []
price = []
p1 = []
desc = []
loc = []
src_link = []
category = []
desc = []

driver = webdriver.Chrome(ChromeDriverManager().install())
for pageno in range(0,50,25):

    print(pageno)

    driver.get("https://www.booking.com/searchresults.en-gb.html?aid=304142&label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKhtYn0BcACAQ&sid=560904567b64f1e8c80d883e4882616f&tmpl=searchresults&checkin_month=8&checkin_monthday=1&checkin_year=2020&checkout_month=8&checkout_monthday=4&checkout_year=2020&class_interval=1&dest_id=-1521348&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&src=index&src_elem=sb&srpvid=47769c9973ad002d&ss=Wellington&ss_all=0&ssb=empty&sshis=0&ssne=Wellington&ssne_untouched=Wellington&top_ufis=1&rows=25&offset=0" + str(pageno))
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

#Hotel name
    for item in soup.findAll('span', {'class': 'sr-hotel__name'}):
        names.append(item.get_text(strip=True))
        #print(names)
# Number of reviews
    for item in soup.findAll('div', {'class' : 'bui-review-score__text'}):
        reviews.append(item.get_text(strip = True))
        #print(reviews)
#Number of ratings
    for item in soup.findAll("div", {'class': 'bui-review-score__badge'}):
        rating.append(item.get_text(strip=True))
        #print(rating)
# Extracting each hotel links
    for item in soup.findAll("a", {'class': 'hotel_name_link url'}):
        item = item.get("href").strip("\n")
        links.append(f"https://www.booking.com{item}")
#Extracting each hotel image link
    for link in soup.find_all("img", class_='hotel_image'):
        a = link.attrs["src"]
        src_link.append(a)
#Opening each hotel link and extracting location and hotel description
    for item in links:
        r = requests.get(item)
        soup = BeautifulSoup(r.text, 'html.parser')
        for item in soup.findAll("div", {'id': 'property_description_content'}):
            desc.append(item.get_text("\n", strip=True))
        for item in soup.findAll("span", {'class': 'hp_address_subtitle'}):
            loc.append(item.get_text(strip = True))
#Extracting hotel category type
    for item in links:


        driver.get(item)
        WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,"h2#hp_hotel_name")))

        try:
            job_title = driver.find_element_by_css_selector("h2#hp_hotel_name>span").text
            category.append(job_title)
        #print(category)

        except:
            category.append("None")

# Converting all the details into dataframe and csv file
final = []
for item in zip_longest(names, reviews, rating, desc, loc, src_link, links, category):
    final.append(item)

df5 = pd.DataFrame(
    final, columns=['Names', 'Reviews','Rating', 'Description', 'Location', 'image', 'links', 'category'])
#df.to_csv('booked.csv')
#driver.quit()

Output: Hotel names, reviews, ratings are not appearing for last 20 records.

回答1:

If you can get everything from hotel's main page then I don't think you should implement lot of for loops as some are very illogical as mentioned in @ThePyGuy answer's.

So first of, get response

url="https://www.booking.com/searchresults.en-gb.html?dest_id=-1521348;dest_type=city;offset=0;ss=Wellington;tmpl=searchresults"

response=requests.get(url)

Now call a method which should be implemented like that

def pagination(response):
data={}
soup = BeautifulSoup(response.text, 'html.parser')
urls = soup.findAll("a", {'class': 'hotel_name_link url'})
img_urls=soup.findAll("img", class_='hotel_image')
for i in urls:
    resp=requests.get(urljoin(response.url,i.get("href").strip("\n")))
    sp = BeautifulSoup(resp.text, 'html.parser')
    data['Names']=sp.h2.text.strip() #You can get Category From here also
    data['Rating']=sp.find("div", {'class': 'bui-review-score__badge'}).get_text(strip=True)
    data['Reviews'] =sp.find('div', {'class' : 'bui-review-score__text'}).get_text(strip=True)
    data['Description']=next(iter([item.get_text("\n", strip=True) for item in sp.findAll("div", {'id': 'property_description_content'})]),'None')
    data['Location']=sp.find("span", {'class': 'hp_address_subtitle'}).get_text(strip=True)
    data['image']=img_urls[urls.index(i)].attrs["src"]
    data['links']=resp.url
    print (data)
try:
    next_page=soup.find("a", {'title': re.compile("Next page")}).attrs['href']
    if next_page:
        response = requests.get(next_page)
        print(response.url)
        pagination(response)
except AttributeError:
    print('Scraping Completed...!')

You can use dictionaries which are best fit for making csv files. If you think that you're unable to get data from the hotel's main page then here is the screenshot of the data

Just add a csv file code in place of print(data) and use it wherever you want.

回答2:

Use Pythons built in method for strings format. i.e. "There are {} pages.".format('4') The old way of doing this is "There are %s pages." % 4

You already had the idea right of counting the number of items and stepping by 25 with the range function. Also note if I do the following:

for i in range(1, 25):

It would only count to 24 and not actually count 25. So in your range function its not actually counting 50 so there is no second page. I would do this:

for pageno in range(0,51,25):

And change the driver.get string to operate with .format() it will replace the curlys with what you place in format.

driver.get("https://www.booking.com/searchresults.en-gb.html?aid=304142&label=gen173nr-1DCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAKhtYn0BcACAQ&sid=560904567b64f1e8c80d883e4882616f&tmpl=searchresults&checkin_month=8&checkin_monthday=1&checkin_year=2020&checkout_month=8&checkout_monthday=4&checkout_year=2020&class_interval=1&dest_id=-1521348&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&shw_aparth=1&slp_r_match=0&src=index&src_elem=sb&srpvid=47769c9973ad002d&ss=Wellington&ss_all=0&ssb=empty&sshis=0&ssne=Wellington&ssne_untouched=Wellington&top_ufis=1&rows=25&offset={}".format(pageno)

来源：https://stackoverflow.com/questions/60940942/some-data-are-not-appearing-while-scrapping-using-for-loop-in-selenium-python

标签

python-3.x

pandas

selenium

selenium-webdriver