Need help scraping past a "Show more" button

Submitted by 我是研究僧i on 2021-01-25 22:12:24

Question


I have the following code:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time


url_list = [
        'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
#       'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::3-300',

   ]

df_list = [] 

for url in url_list:

    headers = ({'User-Agent':
         'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
         'Accept-Language': 'es-ES, es;q=0.5'})
    print (url)
    r = requests.get(url, headers = headers)
    print(r.status_code)
    soup = BeautifulSoup(r.content,'html.parser')
    items = soup.find_all('div',class_='col-xs-12 col-sm-6 col-sm-6 col-md-6 col-lg-3 col-product col-custom-width')
    # print(items)
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)
    for item in items:
        product_name = item.find('div',class_ = 'product-name').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        try:
            price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            # fall back to the alternative price class, then to a placeholder
            try:
                price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
            except AttributeError:
                price = "No price"
        
        # old_price = item.find(class_ = 'old-price product-price').text[:-2] if item.find(class_ = 'old-price product-price') != None else None
        try:
            availability = item.find('div', class_ = 'product-availability cat-product-availability').text.replace('\t','').replace('\n', '').replace('\r', '')
        # except AttributeError:
        #     availability = item.find('span', class_ = 'btn-addtocart btn-icon disabled').text.replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            availability = "No info"
        # stock = [item.find(class_ = 'item-availability').get_text() if item.find(class_ = 'item-availability') != None else None for item in items]
        product_info = {
                'product_name' : product_name,
                'price' : price,
             #  'old_price' : old_price,
                'availability' : availability,
                'store' : store,
                'date_extraction' : extraction_date,
            }
        df_list.append(product_info)
    time.sleep(3)

df = pd.DataFrame(df_list)
print(df)

It works fine and returns a dataframe with the expected results. The problem is that it only retrieves the first twenty records; after that, a "Show more" button has to be pressed to load the next twenty products, and so on.

I have looked at the page source and inspected it in the browser, but I can't find a way to interact with the button.

Any idea or suggestion would be much appreciated.

Regards.


Answer 1:


Finally I got it working. The idea is to load the page with Selenium, keep clicking the "Show more" button until it disappears, and then parse the full page source with BeautifulSoup:

from selenium import webdriver 
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import random

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless')

driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe", options=options)


url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'

driver.get(url)

sleep(random.uniform(5.0, 7.5))

# Dismiss the confirmation / cookie popup if it appears.
try:
    driver.find_element_by_class_name('confirm').click()
except NoSuchElementException:
    pass


# Keep clicking "Show more" until the button disappears from the page.
while True:
    sleep(random.uniform(3.5, 6.5))
    try:
        ver_mas = driver.find_element_by_class_name('button-load-more')
        actions = ActionChains(driver)
        actions.move_to_element(ver_mas).perform()
        driver.execute_script("arguments[0].click();", ver_mas)

    except NoSuchElementException:
        break

page_source = driver.page_source

soup = BeautifulSoup(page_source, 'lxml')
# print(soup)

items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')
# print(len(items))

df_list = []
store = 'Coolmod'
extraction_date = datetime.datetime.today().replace(microsecond=0)
for item in items:
    product_name = item.find('div',class_ = 'product-name').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
    try:
        price = item.find('div', class_ = 'margin-top-20 mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
    except AttributeError:
        # fall back to the alternative price class, then to a placeholder
        try:
            price = item.find('div', class_ = 'mod-product-price text-big').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            price = "No price"
    try:
        availability = item.find('div', class_ = 'product-availability cat-product-availability').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
    except AttributeError:
        # fall back to the "local available" variant of the class, then to a placeholder
        try:
            availability = item.find('div', class_ = 'product-availability cat-product-availability local-available').text.strip().replace('\t','').replace('\n', '').replace('\r', '')
        except AttributeError:
            availability = "No info"
    
    product_info = {
            'product_name' : product_name,
            'price' : price,
            'availability' : availability,
            'store' : store,
            'date_extraction' : extraction_date,
        }
    df_list.append(product_info)

df = pd.DataFrame(df_list)
print(df)
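
One possible refinement: instead of the fixed random sleeps, Selenium's explicit waits can click the button as soon as it becomes clickable and end the loop cleanly once it is gone. A minimal sketch of the same click loop using WebDriverWait, assuming the same button-load-more class and the driver created above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

wait = WebDriverWait(driver, 10)
while True:
    try:
        # Wait up to 10 s for a visible, enabled "Show more" button.
        ver_mas = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'button-load-more')))
    except TimeoutException:
        break  # button gone (or hidden) -> all products are loaded
    driver.execute_script("arguments[0].click();", ver_mas)

page_source = driver.page_source
driver.quit()  # close the browser and free the ChromeDriver process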

Thanks @Alin Stelian for the help
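
A requests-only alternative may also be worth checking: if the "Show more" button simply fires an XHR request, the paginated endpoint shown in the browser's Network tab can sometimes be called directly, avoiding Selenium altogether. The URL and parameters below are hypothetical placeholders, not the site's real endpoint; they would need to be copied from DevTools.

import requests
from bs4 import BeautifulSoup

# Hypothetical placeholders -- copy the real URL and parameters from the
# request the page sends when "Show more" is clicked (DevTools > Network tab).
AJAX_URL = 'https://www.coolmod.com/some-pagination-endpoint'  # not a real endpoint
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)',
    'X-Requested-With': 'XMLHttpRequest',  # commonly required by XHR endpoints
}
r = requests.get(AJAX_URL, params={'page': 2}, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')  # parse the returned HTML fragment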



Source: https://stackoverflow.com/questions/65793234/need-help-to-scrape-show-more-button
