Parse the html code for a whole webpage scrolled down

Asked by 谎友^ on 2020-12-03 12:55
from bs4 import BeautifulSoup
import urllib, sys

reload(sys)
sys.setdefaultencoding("utf-8")

r = urllib.urlopen('https://twitter.com/ndtv').read()
soup = BeautifulSoup(r)


        
1 Answer
  • Answered 2020-12-03 13:38

    I would still insist on using the Twitter API.
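
    For reference, here is a minimal sketch of the API route using the tweepy library (this is not part of the original answer; the credential strings are hypothetical placeholders for your own keys):

    import tweepy

    # hypothetical placeholders: substitute your own Twitter API credentials
    auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
    auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
    api = tweepy.API(auth)

    # page through the account's timeline instead of scraping the rendered page
    for status in tweepy.Cursor(api.user_timeline, screen_name="ndtv").items(200):
        print status.text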

    Alternatively, here is how you can approach the problem with selenium:

    • use Explicit Waits and define a custom Expected Condition to wait for tweets to load on scroll
    • perform the scroll to the last loaded tweet via scrollIntoView()

    Implementation:

    from selenium import webdriver
    from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    
    class wait_for_more_than_n_elements_to_be_present(object):
        def __init__(self, locator, count):
            self.locator = locator
            self.count = count
    
        def __call__(self, driver):
            try:
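                # EC._find_elements is a private helper in expected_conditions
                # that simply resolves the stored locator against the driver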
                elements = EC._find_elements(driver, self.locator)
                return len(elements) > self.count
            except StaleElementReferenceException:
                return False
    
    
    url = "https://twitter.com/ndtv"
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
    
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
    
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    

    This scrolls down as many times as needed to load all of the existing tweets on this account: once no new tweets appear within the wait timeout, the TimeoutException ends the loop.
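
    As an aside, a common alternative scrolling strategy (not from the original answer) is to scroll the whole window and compare document.body.scrollHeight between iterations, trading the explicit wait for a fixed sleep:

    import time

    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # jump to the bottom of the page and give newly loaded tweets time to render
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # nothing new was appended, so the end of the timeline was reached
        last_height = new_height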


    Here is the HTML-parsing snippet, extracting tweets:

    from bs4 import BeautifulSoup

    page_source = driver.page_source
    driver.close()

    soup = BeautifulSoup(page_source, "html.parser")
    for tweet in soup.select("div.tweet div.content"):
        print tweet.p.text
    

    It prints:

    Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://goo.gl/aPqlxf  pic.twitter.com/JUqmdWNQ3c
    #HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
    Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK  pic.twitter.com/gohX21Gibi
    ...
    