import scrapy
import re
import os


class MySpider(scrapy.Spider):
    name = "deeplearning4j-api"
    start_urls = ["https://deeplearning4j.org/api/latest/index-files/index-1.html"]
    crawledLinks = {}

    def parse(self, response):
        # Write the raw HTML of the current page to a local file,
        # mirroring the URL path under the output folder.
        def output2html(htmlcontent, filepath, folder='./document-deeplearning4j/'):
            filepath = folder + filepath.lstrip('/')  # avoid a double slash in the path
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(htmlcontent)

        # Normalize a link into a uniform format: strip the site prefix and
        # drop everything after the endStr suffix (fragments, query strings).
        def htmlprocess(url, preStr="https://deeplearning4j.org", endStr=".html"):
            url = url.replace(preStr, '')
            pattern = re.compile(re.escape(endStr) + r'.*$')
            return pattern.sub(endStr, url)
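
        # For example (illustrative values, not from the original source):
        #   htmlprocess("https://deeplearning4j.org/api/latest/foo.html#anchor")
        #   returns "/api/latest/foo.html"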

        # Save the current page to disk.
        url = htmlprocess(response.url)
        output2html(response.body, url)

        # Select page elements: extract every link on the page.
        links = response.xpath('//a/@href').extract()
        # Restrict to absolute links under the deeplearning4j.org API docs.
        linkPattern = re.compile(r"^https://deeplearning4j\.org/api/")
        for link in links:
            link = response.urljoin(link)  # resolve relative links into absolute ones
            # Keep the domain this time (preStr=''); only normalize the suffix.
            link = htmlprocess(link, preStr='', endStr=".html")
            if linkPattern.match(link) and link not in self.crawledLinks:
                self.crawledLinks[link] = 1
                yield scrapy.Request(link, callback=self.parse)
                item = {}
                item["link"] = link
                yield item
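

# A minimal standalone runner (a sketch: it assumes Scrapy >= 2.1 for the FEEDS
# setting; inside a full Scrapy project, `scrapy crawl deeplearning4j-api`
# works as well).
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "FEEDS": {"links.json": {"format": "json"}},  # collected link items land here
    })
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes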