automatic crawling using selenium

Submitted by 不羁的心 on 2019-12-11 01:19:23

Question


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

OUTPUT_FILE_NAME = 'output0.txt'
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

def get_text():
    driver.get("http://law.go.kr/precSc.do?tabMenuId=tab67")
    elem = wait.until(EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "#viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a")))

    title = elem.text.strip().split(" ")[0]
    elem.click()

    wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#viewwrapCenter h2"), title))
    content = driver.find_element_by_css_selector("#viewwrapCenter").text
    return content

def main():
    open_output_file = open(OUTPUT_FILE_NAME, 'w')
    result_text = get_text()
    open_output_file.write(result_text)
    open_output_file.close()

main()

Based on this code, I want to crawl this website: starting from the original URL, Selenium should open the 1st link, save its text to a txt file, go back to the original URL, open the 2nd link, and keep going. The problem is that the css_selector for the 1st link is #viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a and for the 2nd link it is #viewHeightDiv > table > tbody > tr:nth-child(3) > td.s_tit > a. The only difference between them is the number inside nth-child, and it seems to follow no rule; it goes 1, 3, 5, 9, ... so I'm stuck here.


Answer 1:


To scrape all posts you don't need Selenium. You can do everything with the Requests and BeautifulSoup libraries:

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':

    # Request the first page with 50 items: pg=1 is the page number, outmax=50 is the number of items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")

    # Parse html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")

    # Find "go to last page" element and get "onclick" attribute, inside "onlick" attribute parse last page number
    # for "outmax=50" (used before)
    onclick = str(page.select(".paging > a:last-child")[0].attrs["onclick"])
    last_page_number = int(''.join([n for n in onclick if n.isdigit()]))

    # To test uncomment code below to get items only from first page
    # last_page_number = 1

    # Go through all pages and collect posts numbers in items
    items = []
    for i in range(1, last_page_number + 1):
        if i > 1:
            # Go to the next page (same outmax=50 as the first request)
            response = requests.post(
                "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=%d&fsort=21,10,30&precSeq=0&dtlYn=N" % i)
            # Re-parse the newly fetched page; otherwise the links from page 1 would be reused
            page = BeautifulSoup(response.text, "html.parser")

        # Get all links on the current page
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop all links and collect post numbers
        for link in links:
            # Parse post number from "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))

    # Open all posts and collect in posts dictionary with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        t = BeautifulSoup(response.text, "html.parser").find('div', attrs={'id': 'contentBody'}).text
        posts.append({'number': item, 'url': url, 'text': t})
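
At this point posts is a list of dictionaries; as a quick sanity check you could, for example, print how many posts were collected and inspect the first one:

print("collected %d posts" % len(posts))
print(posts[0]['number'], posts[0]['url'])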

To save each post to a file, change the last part of the code to the version below, replacing /yourfullpath/ with your own path, e.g. "C://files/" or "/Users/myuser/files/":

# Open all posts and collect in posts dictionary with keys: number, url and text
posts = []
for item in items:
    url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
    response = requests.get(url)
    parsed = BeautifulSoup(response.text, "html.parser")
    text = parsed.find('div', attrs={'id': 'contentBody'}).text
    title = parsed.select_one("h2").text
    posts.append({'number': item, 'url': url, 'text': text, 'title': title})

    # encoding='utf-8' so the Korean text is written correctly on any platform
    with open('/yourfullpath/' + title + '.text', 'w', encoding='utf-8') as f:
        f.write(text)
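
One thing to watch out for: the post title is used directly as the file name, and titles may contain characters that are not valid in file names (for example "/" or ":"). A minimal sketch of stripping such characters before writing, using a hypothetical safe_filename helper:

import re

def safe_filename(name):
    # Replace characters that are commonly invalid in file names with "_"
    return re.sub(r'[\\/:*?"<>|]', '_', name)

with open('/yourfullpath/' + safe_filename(title) + '.text', 'w', encoding='utf-8') as f:
    f.write(text)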



Answer 2:


You can use a locator like:

td.s_tit > a
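
This selector matches every result link at once, so you don't have to address rows by their irregular nth-child index. A minimal Selenium sketch, assuming the same page structure as in the question:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://law.go.kr/precSc.do?tabMenuId=tab67")

# Wait for the result list to render, then collect all result links in one call
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#viewHeightDiv td.s_tit > a")))
links = driver.find_elements_by_css_selector("#viewHeightDiv td.s_tit > a")
print(len(links))  # number of result links on the current page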


Source: https://stackoverflow.com/questions/54784490/automatic-crawling-using-selenium
