问题
I am trying to scrape agents data here. I am able to get the links from the first page. I am using numbered loops because I know the total number of pages. I tried to run this as long as the "next" page option is there. I tried both "try" and "if not" but wasn't able to figure it out. Any help is welcome. Here is the code.
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links():
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
driver.refresh()
if driver.find_element_by_partial_link_text('next'):
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
time.sleep(2)
new_data = driver.find_elements_by_tag_name('td')
for new in new_data:
links = new.find_elements_by_tag_name('a')
for link in links:
new_link = link.get_attribute("href")
links_total.append(new_link)
for i in range(1, 23):
first_links()
for link in links_total:
print(link)
回答1:
Try-catch would be better option
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
driver.implicitly_wait(10)
# links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links(links_total=[]):
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
# driver.refresh()
try:
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
time.sleep(2)
first_links(links_total)
except (TimeoutError, ElementNotVisibleException, NoSuchElementException):
print("NEXT btn not found : ")
pass
return links_total
all_links = first_links()
for link in all_links:
print(link)
You don't need to use Selenium actually. You could do it with BeautifulSoap like so :
import requests
from bs4 import BeautifulSoup
page_num=0
url_cbp = r"https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"
def get_links(links_total=[], page_num=0):
page = requests.get(url_cbp.format(page_num))
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='region-content')
table_cells = results.find_all('td', class_='views-field')
for cell in table_cells:
# print(cell )
# print('\n\n')
cell_link = cell.find('a')
page_link = cell_link["href"]
links_total.append(page_link)
next_page = results.find('li', class_='pager-next')
if next_page:
page_num += 1
get_links(links_total, page_num)
return links_total
all_links = get_links()
for link in all_links:
print(link)
来源:https://stackoverflow.com/questions/62057651/if-or-try-loop-for-an-element-in-a-page-selenium