"""Introductory urllib crawler examples.

The module runs a sequence of small demos in order:

1. A plain GET request with ``urllib.request.urlopen``.
2. A GET request that carries a custom ``User-Agent`` header.
3. URL-encoding a query dict with ``urllib.parse.urlencode``.
4. A search-page downloader (``get_url`` / ``request_url``).
5. A paginated downloader (``BaiduTieBaSpider``).
6. A non-greedy regular-expression capture demo.

NOTE: demos 1-3 and 6 run at import time; demos 1 and 2 perform live HTTP
requests. Demos 4 and 5 prompt on stdin and only run when the module is
executed as a script.
"""
import random
import re
import time
import urllib.parse
import urllib.request

from fake_useragent import UserAgent

# --- Demo 1: simplest possible request ---------------------------------
# Send a request and obtain the response object. read() returns bytes;
# decode('utf-8') converts them to str using the given encoding.
# The ``with`` block closes the HTTP response once the body has been read.
with urllib.request.urlopen('http://httpbin.org/get') as response:
    html = response.read().decode('utf-8')
print(html)

# --- Demo 2: request with a custom User-Agent header -------------------
request = urllib.request.Request(
    url='http://httpbin.org/get',
    headers={'User-Agent': 'Mozilla/5.0'},
)
with urllib.request.urlopen(request) as response:
    html = response.read().decode()
print(html)

# --- Demo 3: URL-encode a query dict -----------------------------------
query_string = {'wd': '美女'}
result = urllib.parse.urlencode(query_string)
print(result)


# --- Demo 4: download a search result page -----------------------------
def get_url(word: str) -> str:
    """Return the so.com search URL for *word*.

    The keyword is percent-encoded with ``urllib.parse.quote`` and
    substituted into the ``q`` query parameter.

    An equivalent approach is to use the template
    ``'https://www.so.com/s?{}'`` together with
    ``urllib.parse.urlencode({'q': word})``.
    """
    url = 'https://www.so.com/s?q={}'
    params = urllib.parse.quote(word)
    return url.format(params)


def request_url(url: str, filename: str) -> None:
    """Fetch *url* and write the UTF-8 decoded body to *filename*.

    The request is sent with a fixed ``Mozilla/5.0`` User-Agent header.
    The target file is opened in ``'w'`` mode, so existing content is
    replaced.
    """
    request = urllib.request.Request(
        url=url,
        headers={'User-Agent': 'Mozilla/5.0'},
    )
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)


if __name__ == '__main__':
    word = input('请输入要搜索的内容:')
    url = get_url(word)
    filename = word + '.html'
    request_url(url, filename)
    print(url)


# --- Demo 5: paginated downloader --------------------------------------
class BaiduTieBaSpider:
    """Download consecutive listing pages of a Baidu Tieba forum."""

    def __init__(self):
        # URL template: first slot is the percent-encoded forum name (kw),
        # second slot is the page offset (pn).
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'

    def get_html(self, url: str) -> str:
        """Fetch *url* with a random User-Agent and return the body as str.

        The body is decoded as UTF-8.
        """
        headers = {
            'User-Agent': UserAgent().random
        }
        request = urllib.request.Request(url=url, headers=headers)
        # Close the response as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            return response.read().decode('utf-8')

    def parse_html(self):
        """Placeholder for page parsing; currently does nothing."""
        pass

    def write_html(self, filename: str, html: str) -> None:
        """Write *html* to *filename* as UTF-8, replacing existing content."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def run(self) -> None:
        """Prompt for a forum name and page range, then save each page.

        Reads the forum name, first page and last page (inclusive) from
        stdin. Each page is saved to its own HTML file in the current
        working directory, with a random 1-2 second pause after each page.

        Raises:
            ValueError: if a page number entered is not an integer.
        """
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        params = urllib.parse.quote(name)
        for page in range(begin, end + 1):
            # The pn offset advances by 50 per page (page 1 -> 0, page 2 -> 50).
            pn = (page - 1) * 50
            url = self.url.format(params, pn)
            html = self.get_html(url)
            filename = '{}-第{}页.html'.format(name, page)
            self.write_html(filename, html)
            # Pause between requests so pages are not fetched back to back.
            time.sleep(random.randint(1, 2))
            print('第%d页爬取完成' % page)


if __name__ == '__main__':
    start = time.time()
    spider = BaiduTieBaSpider()
    spider.run()
    end = time.time()
    print('执行时间:%.2f' % (end - start))


# --- Demo 6: non-greedy regex capture ----------------------------------
html = '''
<div><p>九霄龙吟惊天变</p></div>
<div><p>风云际汇潜水游</p></div>
'''
# re.S lets '.' match newlines, so a single match can span both <div> lines.
pattern = re.compile(r'<div><p>(.*?)</p></div>.*?<div><p>(.*?)</p></div>', re.S)
r_list = pattern.findall(html)
print(r_list)
# Source: https://www.cnblogs.com/bishopmarcel/p/12165219.html