Scraping Duckduckgo with Python 3.6

后端 未结 2 1187
暖寄归人
暖寄归人 2021-01-22 05:36

A simple question: I can scrape results from the first page of a DuckDuckGo search. However, I am struggling to get onto the 2nd and subsequent pages. I have used Python with the

2条回答
  •  日久生厌
    2021-01-22 06:13

    Selecting the button by the class 'btn--alt' will not work when you go to the second page, because both the 'Next' and 'Previous' buttons share that class name — so it was clicking the 'Previous' button and sending me back to the first page.

    The code change below worked perfectly for me:

    nextButton = driver.find_element_by_xpath('//input[@value="Next"]')
    nextButton.click()
    

    full function:

    def duckduckGoSearch(query, searchPages=None, filterTheSearch=False, searchFilter=None):
        """Scrape DuckDuckGo search results with Selenium + BeautifulSoup.

        Opens the DuckDuckGo HTML interface in Chrome, submits ``query``,
        and walks through up to ``searchPages`` result pages, collecting
        result URLs and their link text.

        Args:
            query: Search terms to submit.
            searchPages: Number of result pages to visit; ``None`` means
                just the first page.
            filterTheSearch: When True, keep only URLs that contain
                ``searchFilter``.
            searchFilter: Substring a URL must contain to be kept (only
                consulted when ``filterTheSearch`` is True).

        Returns:
            dict mapping result URL -> link text for every collected hit.
        """
        URL_ = 'https://duckduckgo.com/html?'
        driver = webdriver.Chrome()
        driver.get(URL_)

        searchResults = {}

        # BUG FIX: with the default argument the original evaluated
        # `page_number <= None`, which raises TypeError in Python 3.
        # Treat None as "scrape one page".
        if searchPages is None:
            searchPages = 1

        # Type the query into the search box and press Enter to search.
        driver.find_element_by_xpath('//*[@id="search_form_input_homepage"]').send_keys(query)
        driver.find_element_by_xpath('//*[@id="search_form_input_homepage"]').send_keys(Keys.RETURN)

        time.sleep(2)  # crude wait for the first results page to render

        page_number = 1

        while page_number <= searchPages:
            # Parse the CURRENT page before navigating away.
            # BUG FIX: the original clicked "Next" first, so the first
            # page of results was never scraped.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            result_tags = soup.findAll('h2') + soup.findAll(
                'div', {'class': 'result__body links_main links_deep'})

            for tag in result_tags:
                anchors = tag.findAll('a')
                if not anchors:
                    # Nothing to parse in this tag.  Skip it instead of
                    # reusing stale values: the original's bare
                    # `except: pass` left resultURL/resultDescription
                    # unbound (or stale from a prior iteration) and then
                    # used them anyway.
                    print('nothing to parse')
                    continue
                resultDescription = anchors[0].text
                resultURL = anchors[0].get('href')
                if resultURL is None or resultURL in searchResults:
                    continue
                if filterTheSearch:
                    if searchFilter is not None and searchFilter in resultURL:
                        searchResults[resultURL] = resultDescription
                else:
                    searchResults[resultURL] = resultDescription

            if page_number == searchPages:
                break  # requested number of pages scraped

            try:
                # Locate "Next" by its value attribute; the shared class
                # 'btn--alt' is ambiguous (also matches "Previous").
                nextButton = driver.find_element_by_xpath('//input[@value="Next"]')
                nextButton.click()
            except Exception:
                # No "Next" button found: we ran out of result pages.
                print('no more pages')
                break

            page_number += 1
            time.sleep(2)  # let the next results page load

        print('search is done , found ', len(searchResults), 'Results')
        driver.quit()
        return searchResults

提交回复
热议问题