spider_01 | 易学教程

import urllib.request
import urllib.parse

# Send a GET request to the site and obtain the response object.
# read() returns the body as bytes; decode('utf-8') turns it into a str
# using the given encoding. The with-statement closes the connection as
# soon as the body has been read instead of leaving it open.
with urllib.request.urlopen('http://httpbin.org/get') as response:
    html = response.read().decode('utf-8')
print(html)

# Build a Request object so a custom User-Agent header is sent with the
# GET request, then print the decoded response body.
request = urllib.request.Request(
    url='http://httpbin.org/get',
    headers={'User-Agent': 'Mozilla/5.0'},
)
# The with-statement closes the response once the body has been read.
with urllib.request.urlopen(request) as response:
    html = response.read().decode()
print(html)

# Percent-encode a dict of query parameters into a "key=value" string
# and show the result.
query_string = {'wd': '美女'}
result = urllib.parse.urlencode(query=query_string)
print(result)

def get_url(word):
    """Return the so.com search URL for the keyword *word*.

    The keyword is percent-encoded with urllib.parse.quote and inserted
    as the value of the ``q`` query parameter.
    """
    encoded_word = urllib.parse.quote(word)
    return 'https://www.so.com/s?q={}'.format(encoded_word)
def request_url(url,filename):
    """Download *url* and save the page text to *filename*.

    The request is sent with a 'Mozilla/5.0' User-Agent header. The
    response body is decoded as UTF-8 and written to *filename* (also
    UTF-8), overwriting any existing file.

    Errors from urlopen (e.g. an unreachable URL), from decoding a
    non-UTF-8 body, or from opening the output file propagate to the
    caller.
    """
    request = urllib.request.Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the response as soon as the body is read so the underlying
    # connection is not left open.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode('utf-8')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
if __name__ == '__main__':
    # Prompt for a search keyword, save the matching so.com result page
    # as "<keyword>.html", then print the URL that was fetched.
    word = input('请输入要搜索的内容:')
    filename = ''.join((word, '.html'))
    url = get_url(word)
    request_url(url, filename)
    print(url)


import time,random
from fake_useragent import UserAgent

class BaiduTieBaSpider:
    """Download list pages of a Baidu Tieba forum and save each as HTML.

    run() prompts for the forum name and a page range, fetches every
    page in that range and writes one file per page.
    """

    def __init__(self):
        # URL template: the first slot takes the percent-encoded forum
        # name (kw), the second the offset (pn) that run() computes as
        # (page - 1) * 50.
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'

    def get_html(self,url):
        """Fetch *url* and return the response body decoded as UTF-8.

        A random User-Agent string from fake_useragent is sent with
        every request.
        """
        headers = {
            'User-Agent': UserAgent().random
        }
        request = urllib.request.Request(url=url, headers=headers)
        # Close the response once the body is read so connections do not
        # pile up while run() loops over many pages.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')

        return html

    def parse_html(self):
        """Placeholder for page parsing; not implemented yet."""
        pass

    def write_html(self,filename,html):
        """Write *html* to *filename* as UTF-8, replacing any existing file."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)

    def run(self):
        """Prompt for forum name and page range, then download each page.

        Pages from the start page to the end page (inclusive) are saved
        as '<name>-第<page>页.html'. After each page the crawler sleeps
        for 1 or 2 seconds (chosen at random) before continuing.
        """
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))

        # Encode the forum name once; it is the same for every page.
        params = urllib.parse.quote(name)
        for page in range(begin, end + 1):
            # Page 1 maps to pn=0, page 2 to pn=50, and so on.
            pn = (page - 1) * 50
            url = self.url.format(params, pn)
            html = self.get_html(url)
            filename = '{}-第{}页.html'.format(name, page)
            self.write_html(filename, html)
            time.sleep(random.randint(1, 2))
            print('第%d页爬取完成' % page)

if __name__ == '__main__':
    # Run the Tieba spider and report the wall-clock time of the whole
    # run (this includes the time spent waiting for user input).
    start = time.time()
    spider = BaiduTieBaSpider()
    spider.run()
    end = time.time()
    elapsed = end - start
    print('执行时间:%.2f' % elapsed)


import re
# Sample markup: two lines, each wrapping a verse in <div><p>...</p></div>.
html = '''
<div><p>九霄龙吟惊天变</p></div>
<div><p>风云际汇潜水游</p></div>
'''
# DOTALL (same flag as re.S) lets '.' match the newline between the two
# lines, so a single match captures both verses as a 2-tuple.
pattern = re.compile('<div><p>(.*?)</p></div>.*?<div><p>(.*?)</p></div>', flags=re.DOTALL)
r_list = re.findall(pattern, html)
print(r_list)
来源：https://www.cnblogs.com/bishopmarcel/p/12165219.html
标签
url
response