Question
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

OUTPUT_FILE_NAME = 'output0.txt'

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

def get_text():
    driver.get("http://law.go.kr/precSc.do?tabMenuId=tab67")
    elem = wait.until(EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "#viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a")))
    title = elem.text.strip().split(" ")[0]
    elem.click()
    wait.until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, "#viewwrapCenter h2"), title))
    content = driver.find_element_by_css_selector("#viewwrapCenter").text
    return content

def main():
    open_output_file = open(OUTPUT_FILE_NAME, 'w')
    result_text = get_text()
    open_output_file.write(result_text)
    open_output_file.close()

main()
Based on this code I want to crawl this website: Selenium starts from the original URL, goes into the 1st link, saves its text to a txt file, goes back to the original URL, goes into the 2nd link, and keeps going. The problem is that the CSS selector for the 1st link is #viewHeightDiv > table > tbody > tr:nth-child(1) > td.s_tit > a and for the 2nd link it is #viewHeightDiv > table > tbody > tr:nth-child(3) > td.s_tit > a. The only difference between them is the nth-child number, and it seems to follow no rule (it goes 1, 3, 5, 9, ...), so I'm stuck here.
Answer 1:
To scrape all the posts you don't need Selenium. You can do it all with the Requests and BeautifulSoup libraries:
import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Using requests, get 50 items from the first page. pg=1 is the page number, outmax=50 is items per page
    response = requests.post(
        "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=1&fsort=21,10,30&precSeq=0&dtlYn=N")
    # Parse the html using BeautifulSoup
    page = BeautifulSoup(response.text, "html.parser")
    # Find the "go to last page" element, take its "onclick" attribute and parse the last page number
    # out of it (valid for the outmax=50 used above)
    onclick = str(page.select(".paging > a:last-child")[0].attrs["onclick"])
    last_page_number = int(''.join([n for n in onclick if n.isdigit()]))
    # To test, uncomment the line below to get items only from the first page
    # last_page_number = 1

    # Go through all pages and collect the post numbers in items
    items = []
    for i in range(1, last_page_number + 1):
        if i > 1:
            # Go to the next page and re-parse it
            response = requests.post(
                "http://law.go.kr/precScListR.do?q=*&section=evtNm&outmax=50&pg=%d&fsort=21,10,30&precSeq=0&dtlYn=N" % i)
            page = BeautifulSoup(response.text, "html.parser")
        # Get all links on the current page
        links = page.select("#viewHeightDiv .s_tit a")
        # Loop over the links and collect the post numbers
        for link in links:
            # Parse the post number from the "onclick" attribute
            items.append(''.join([n for n in link.attrs["onclick"] if n.isdigit()]))

    # Open all posts and collect them in the posts list as dicts with keys: number, url and text
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        t = BeautifulSoup(response.text, "html.parser").find('div', attrs={'id': 'contentBody'}).text
        posts.append({'number': item, 'url': url, 'text': t})
To save to file, change the last part of the code to the version below, replacing /yourfullpath/ with your own path, e.g. "C://files/" or "/Users/myuser/files/":
    # Open all posts and collect them in the posts list as dicts with keys: number, url, text and title
    posts = []
    for item in items:
        url = "http://law.go.kr/precInfoR.do?precSeq=%s&vSct=*" % item
        response = requests.get(url)
        parsed = BeautifulSoup(response.text, "html.parser")
        text = parsed.find('div', attrs={'id': 'contentBody'}).text
        title = parsed.select_one("h2").text
        posts.append({'number': item, 'url': url, 'text': text, 'title': title})
        # encoding='utf-8' so the Korean text is written safely regardless of the system default
        with open('/yourfullpath/' + title + '.text', 'w', encoding='utf-8') as f:
            f.write(text)
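One practical caveat: the case titles used as file names can contain characters that are not valid in paths (slashes, colons, quotes). A minimal sketch of sanitizing the title first; the safe_title helper and its character set are assumptions, not part of the original answer:

import re

def safe_title(title):
    # Replace characters that are commonly forbidden in file names with underscores
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

With that helper, the open call above would become open('/yourfullpath/' + safe_title(title) + '.text', 'w', encoding='utf-8').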
Answer 2:
You can use a locator like:
td.s_tit > a
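If you prefer to stay with Selenium, a minimal sketch along those lines is below (see the code that follows). It assumes, as in the question, that the case links sit under #viewHeightDiv td.s_tit > a on the listing page and that clicking one loads the detail into #viewwrapCenter; the per-link output file names are an assumption. Re-loading the listing page before each click avoids stale element references and side-steps the irregular nth-child numbers entirely:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

LIST_URL = "http://law.go.kr/precSc.do?tabMenuId=tab67"
LINK_SELECTOR = "#viewHeightDiv td.s_tit > a"

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

# Load the listing page once to count how many case links it shows
driver.get(LIST_URL)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, LINK_SELECTOR)))
link_count = len(driver.find_elements(By.CSS_SELECTOR, LINK_SELECTOR))

for index in range(link_count):
    # Re-load the listing page each time so the links are fresh (not stale) after navigating away
    driver.get(LIST_URL)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, LINK_SELECTOR)))
    elem = driver.find_elements(By.CSS_SELECTOR, LINK_SELECTOR)[index]
    title = elem.text.strip().split(" ")[0]
    elem.click()
    # Wait until the detail view shows the clicked case title, then save its text
    wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#viewwrapCenter h2"), title))
    content = driver.find_element(By.CSS_SELECTOR, "#viewwrapCenter").text
    with open('output%d.txt' % index, 'w', encoding='utf-8') as f:
        f.write(content)

driver.quit()

The trade-off compared with Answer 1 is speed: reloading the listing page for every case is slower than posting directly to precScListR.do, but it keeps the browser-driven flow from the question.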
Source: https://stackoverflow.com/questions/54784490/automatic-crawling-using-selenium