问题
I'm trying to scrape this website using Selenium.
I have the code working but it currently only scrapes the first page. The page uses input buttons as a way to navigate through pages so I thought to click each button one by one but it doesn't work, has anyone got any other way to handle navigation for this type of pagination?
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
# Configure Chrome: maximized window, no info bars.
# Headless mode is deliberately left commented out (visible browser for debugging).
options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
# NOTE(review): `chrome_options=` is deprecated in newer Selenium releases in
# favour of `options=` -- confirm against the installed Selenium version.
driver=webdriver.Chrome(chrome_options=options,
executable_path=r'/Users/liban/Downloads/chromedriver')
# Planning-application search page with the advanced-search form pre-expanded.
url = 'http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true'
driver.get(url)
def get_Data():
    """Scrape the planning-application records from the current results page.

    Returns a list of dicts with keys: caseRef, propDesc, address, caseUrl,
    status -- one dict per result div on the page.
    """
    data = []
    results_form = driver.find_element_by_xpath('//*[@id="content"]/form')
    divs = results_form.find_elements_by_tag_name('div')
    for div in divs:
        # BUG FIX: the original re-queried the driver with absolute XPaths
        # ending in /div[1]/..., so every iteration scraped the FIRST result
        # again.  Query relative to each result div instead.
        # NOTE(review): relative paths assume each result div holds one
        # h4/a (case ref + link) and the p elements below -- confirm against
        # the live page markup.
        app_number = div.find_element_by_xpath('./h4/a').text
        address = div.find_element_by_xpath('./p[5]').text
        status = div.find_element_by_xpath('./p[1]/strong').text
        link = div.find_element_by_xpath('./h4/a').get_attribute("href")
        proposals = div.find_element_by_xpath('./p[3]').text
        data.append({"caseRef": app_number, "propDesc": proposals, "address": address, "caseUrl": link, "status": status})
    print(data)
    return data
def navigation():
    """Scrape every results page by clicking each numbered page button.

    Returns the combined list of records from all pages.
    """
    # Page 1 is already displayed after the search was submitted.
    data = get_Data()
    # BUG FIX: the original used find_element_by_xpath (singular), which
    # returns a single element -- iterating it raises TypeError.  Use the
    # plural form to get a list, and only use it to COUNT the pages:
    # clicking a button reloads the page, so pre-fetched elements go stale
    # and each button must be re-located just before it is clicked.
    page_buttons_xpath = ('//div[ contains( concat( " ", normalize-space( @class ), " "), '
                          '" grid_13 ") ]/form/table/tbody/tr/td/input')
    num_pages = len(driver.find_elements_by_xpath(page_buttons_xpath))
    for page in range(2, num_pages + 1):
        # Re-locate the button for this page number on the freshly loaded page.
        driver.find_element_by_xpath(
            '//input[@name="searchResults_Page" and @value="%d"]' % page).click()
        data.extend(get_Data())
    # BUG FIX: the original had no return, so callers received None.
    return data
def main():
    """Fill in the advanced-search form, submit it, and scrape all pages.

    Returns the combined list of scraped records.
    """
    all_data = []
    # Choose the preset date range.
    # NOTE(review): index 7 is assumed to be 'Last Month' -- confirm against
    # the live <select id="DatePresets"> options.
    select = Select(driver.find_element_by_xpath('//*[@id="DatePresets"]'))
    select.select_by_index(7)
    # Search by 'Both Dates' via the received-date radio button.
    search_by = driver.find_element_by_xpath('//*[@id="radio-ReceivedDate"]')
    search_by.click()
    # Results-per-page dropdown; index 4 is assumed to be the largest size.
    show = Select(driver.find_element_by_xpath('//*[@id="ResultSize"]'))
    show.select_by_index(4)
    # Submit the search form.
    search_button = driver.find_element_by_xpath('//*[@id="content"]/form/input[3]')
    search_button.click()
    # BUG FIX: guard against navigation() returning None (the original
    # version had no return statement), which made extend() raise TypeError.
    all_data.extend(navigation() or [])
    return all_data


if __name__ == "__main__":
    main()
How the website handles pagination:
<td align="center">
<input type="submit" class="pageNumberButton selected" name="searchResults_Page" value="1" disabled="disabled"/>
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="2" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="3" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="4" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="5" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="6" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="7" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="8" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="9" />
<input type="submit" class="pageNumberButton " name="searchResults_Page" value="10" />
</td>
Manual Steps:
- Choose a preset date = 'Last Month'
- Search by = 'Both Dates'
- Click Search
- After you scrape each page, go to the next page, and so on until there are no more pages; then go back to the original URL.
回答1:
As per your question to handle Pagination on the website http://www.boston.gov.uk/index.aspx?articleid=6207 you can use the following solution:
Code Block:
from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.select import Select options = Options() options.add_argument("start-maximized") options.add_argument("disable-infobars") options.add_argument("--disable-extensions") driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe') driver.get('http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true') mySelectElement = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#DatePresets[name='DatePresets']")))) mySelectElement.select_by_visible_text('Last month') driver.find_element_by_css_selector("input.button[name='searchFilter']").click() numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "input.pageNumberButton")))) print(numLinks) for i in range(numLinks): print("Perform your scrapping here on page {}".format(str(i+1))) WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='pageNumberButton selected']//following::input[1]"))).click() driver.quit()Console Output:
DevTools listening on ws://127.0.0.1:12115/devtools/browser/2ece3f6a-0431-4b74-9276-f61fcf70dd6d 10 Perform your scrapping here on page 1 Perform your scrapping here on page 2 Perform your scrapping here on page 3 Perform your scrapping here on page 4 Perform your scrapping here on page 5 Perform your scrapping here on page 6 Perform your scrapping here on page 7 Perform your scrapping here on page 8 Perform your scrapping here on page 9 Perform your scrapping here on page 10
回答2:
Try with:
find_elements_by_xpath instead of find_element_by_xpath — the plural form returns a list of all matching elements, which you can iterate.
PS: I didn't try your code locally, but the error you mentioned matches the fix described above.
来源:https://stackoverflow.com/questions/52364188/handling-pagination-on-website-using-input-buttons