Scraping Duckduckgo with Python 3.6

后端 未结 2 1187
暖寄归人
暖寄归人 2021-01-22 05:36

A simple question: I can scrape results from the first page of a DuckDuckGo search. However, I am struggling to get onto the 2nd and subsequent pages. I have used Python with the

2条回答
  •  日久生厌
    2021-01-22 06:13

    Selecting the button by the class 'btn--alt' will not work when you go to the second page, because both the 'Next' and 'Previous' buttons share that class name — so it was clicking the 'Previous' button and sending me back to the first page.

    The code change below worked perfectly for me:

    nextButton = driver.find_element_by_xpath('//input[@value="Next"]')
    nextButton.click()
    

    full function:

    def duckduckGoSearch(query, searchPages=None, filterTheSearch=False, searchFilter=None):
        """Scrape DuckDuckGo search results with Selenium + BeautifulSoup.

        Opens the DuckDuckGo HTML interface in Chrome, submits ``query``,
        and walks through up to ``searchPages`` result pages, collecting
        result URLs and their link text.

        Args:
            query: Search terms to submit.
            searchPages: Number of result pages to visit; ``None`` means
                just the first page.
            filterTheSearch: When True, keep only URLs that contain
                ``searchFilter``.
            searchFilter: Substring a URL must contain to be kept (only
                consulted when ``filterTheSearch`` is True).

        Returns:
            dict mapping result URL -> link text for every collected hit.
        """
        URL_ = 'https://duckduckgo.com/html?'
        driver = webdriver.Chrome()
        driver.get(URL_)

        searchResults = {}

        # BUG FIX: with the default argument the original evaluated
        # `page_number <= None`, which raises TypeError in Python 3.
        # Treat None as "scrape one page".
        if searchPages is None:
            searchPages = 1

        # Type the query into the search box and press Enter to search.
        driver.find_element_by_xpath('//*[@id="search_form_input_homepage"]').send_keys(query)
        driver.find_element_by_xpath('//*[@id="search_form_input_homepage"]').send_keys(Keys.RETURN)

        time.sleep(2)  # crude wait for the first results page to render

        page_number = 1

        while page_number <= searchPages:
            # Parse the CURRENT page before navigating away.
            # BUG FIX: the original clicked "Next" first, so the first
            # page of results was never scraped.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            result_tags = soup.findAll('h2') + soup.findAll(
                'div', {'class': 'result__body links_main links_deep'})

            for tag in result_tags:
                anchors = tag.findAll('a')
                if not anchors:
                    # Nothing to parse in this tag.  Skip it instead of
                    # reusing stale values: the original's bare
                    # `except: pass` left resultURL/resultDescription
                    # unbound (or stale from a prior iteration) and then
                    # used them anyway.
                    print('nothing to parse')
                    continue
                resultDescription = anchors[0].text
                resultURL = anchors[0].get('href')
                if resultURL is None or resultURL in searchResults:
                    continue
                if filterTheSearch:
                    if searchFilter is not None and searchFilter in resultURL:
                        searchResults[resultURL] = resultDescription
                else:
                    searchResults[resultURL] = resultDescription

            if page_number == searchPages:
                break  # requested number of pages scraped

            try:
                # Locate "Next" by its value attribute; the shared class
                # 'btn--alt' is ambiguous (also matches "Previous").
                nextButton = driver.find_element_by_xpath('//input[@value="Next"]')
                nextButton.click()
            except Exception:
                # No "Next" button found: we ran out of result pages.
                print('no more pages')
                break

            page_number += 1
            time.sleep(2)  # let the next results page load

        print('search is done , found ', len(searchResults), 'Results')
        driver.quit()
        return searchResults

提交回复
热议问题