Handling pagination on website using input buttons

问题

In trying to scrape this website using selenium.

I have the code working but it currently only scrapes the first page. The page uses input buttons as a way to navigate through pages so I thought to click each button one by one but it doesn't work, has anyone got any other way to handle navigation for this type of pagination?

import requests
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options

options = Options()
# options.add_argument('--headless')
options.add_argument("start-maximized")
options.add_argument('disable-infobars')
driver=webdriver.Chrome(chrome_options=options, 
executable_path=r'/Users/liban/Downloads/chromedriver')

url = 'http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true'
driver.get(url)


def get_Data():
    data = []
    divs = driver.find_element_by_xpath('//*[@id="content"]/form').find_elements_by_tag_name('div')
    for div in divs:
        app_number = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/h4/a').text
        address = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[5]').text
        status = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[1]/strong').text
        link = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/h4/a').get_attribute("href")
        proposals = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/div[1]/p[3]').text

        data.append({"caseRef": app_number, "propDesc": proposals, "address": address,  "caseUrl": link, "status": status})
    print(data)
    return data

def navigation():
    data = []
    total_inputs = driver.find_element_by_xpath('//div[ contains( concat( " ", normalize-space( @class ), " "), " grid_13 ") ]/form/table/tbody/tr/td/input')
    for input in total_inputs:
        input.click()
        data.extend(get_Data())

def main():
    all_data = []
    select = Select(driver.find_element_by_xpath('//*[@id="DatePresets"]'))
    select.select_by_index(7)
    search_by = driver.find_element_by_xpath('//*[@id="radio-ReceivedDate"]')
    search_by.click()
    show = Select(driver.find_element_by_xpath('//*[@id="ResultSize"]'))
    show.select_by_index(4)
    search_button = driver.find_element_by_xpath('//*[@id="content"]/form/input[3]')
    search_button.click()

    all_data.extend(navigation())

 if __name__ == "__main__":
       main()

How the website handles pagination:

  <td align="center">
           <input type="submit" class="pageNumberButton selected" name="searchResults_Page" value="1" disabled="disabled"/>
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="2" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="3" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="4" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="5" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="6" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="7" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="8" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="9" />
           <input type="submit" class="pageNumberButton " name="searchResults_Page" value="10" />
    </td>

Manual Steps:

Choose a preset date = 'Last Month'
Search by = 'Both Dates'
Click Search
After you scrape each page go to the next page and so on until there are no more pages then go back to original URL.

回答1:

As per your question to handle Pagination on the website http://www.boston.gov.uk/index.aspx?articleid=6207 you can use the following solution:

Code Block:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select

options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\WebDrivers\ChromeDriver\chromedriver_win32\chromedriver.exe')
driver.get('http://www.boston.gov.uk/index.aspx?articleid=6207&ShowAdvancedSearch=true')
mySelectElement = Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "select#DatePresets[name='DatePresets']"))))
mySelectElement.select_by_visible_text('Last month')
driver.find_element_by_css_selector("input.button[name='searchFilter']").click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "input.pageNumberButton"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scrapping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//input[@class='pageNumberButton selected']//following::input[1]"))).click()
driver.quit()

Console Output:

DevTools listening on ws://127.0.0.1:12115/devtools/browser/2ece3f6a-0431-4b74-9276-f61fcf70dd6d
10
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Perform your scrapping here on page 9
Perform your scrapping here on page 10