Parse the html code for a whole webpage scrolled down

可紊 提交于 2019-11-27 14:33:39
alecxe

I would still insist on using the Twitter API.

Alternatively, here is how you can approach the problem with selenium:

Implementation:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False


url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)

    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

    try:
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break

This would scroll down as much as it is needed to load all of the existing tweets in this channel.


Here is the HTML-parsing snippet, extracting tweets:

page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source)
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text

It prints:

Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://goo.gl/aPqlxf  pic.twitter.com/JUqmdWNQ3c
#HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK  pic.twitter.com/gohX21Gibi
...
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!