用 Python 爬虫批量下载 PDF 文档
# Update: the code in the original post was written for Python 2; for a
# Python 3 version, see this blogger's post:
# https://blog.csdn.net/baidu_28479651/article/details/76158051
# The code is as follows:

# coding: utf-8
# NOTE: an encoding declaration is only honoured on line 1 or 2 of a script.
# Scrape Li Dongfeng's PDF documents from:
# http://www.math.pku.edu.cn/teachers/lidf/docs/textrick/index.htm
import urllib.request
import re
import os

# Captures the target of an href attribute that ends in ".pdf".
# The character class deliberately excludes quotes, angle brackets and line
# breaks so that a match can never run from one attribute/tag into the next
# (a plain ".+?" would turn `href="index.htm">..<a href="x.pdf"` into one
# bogus link).
_PDF_HREF_RE = re.compile(r'(?:href|HREF)="?((?:http://)?[^"<>\r\n]+?\.pdf)')


def getHtml(url):
    """Open *url* and return the raw response body.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(url) as page:
        return page.read()


def getUrl(html):
    """Extract every PDF link target from a GB2312-encoded HTML page.

    Args:
        html: The page body as ``bytes`` (as returned by ``getHtml``).

    Returns:
        A list of the href values ending in ``.pdf``, in document order.
        Relative links are returned as written; they are not resolved.
    """
    # Link targets are what matter here, so a stray byte that is not valid
    # GB2312 is replaced instead of aborting the whole extraction.
    text = html.decode('gb2312', errors='replace')
    return _PDF_HREF_RE.findall(text)


# NOTE(review): the excerpt is cut off here, immediately after the next
# "def" keyword; that definition is not visible and is not reproduced.