When writing a web crawler you often find that a site stops responding after you send too many requests. Routing traffic through proxy IPs solves this, but the proxies available online are either paid or come without an API. In the spirit of spending nothing, let's build our own proxy IP pool.
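The core mechanism the whole script relies on is the proxies argument of requests.get, which maps a URL scheme to a proxy address. A minimal sketch of that pattern (the proxy address below is just a placeholder, and httpbin.org is only used here as a convenient echo service):

import requests

# Route plain-HTTP traffic through a proxy; replace the address with a live proxy.
proxies = {"http": "http://1.2.3.4:8080"}
resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
print(resp.text)  # shows the IP address the target site sees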
Without further ado, here is the code:
import requests
from bs4 import BeautifulSoup


# Send a request through a proxy and return the page HTML
def GetInfo(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # hard-coded proxy used to fetch the proxy-list pages themselves
    proxies = {"http": "https://119.180.173.81:8060"}
    response = requests.get(url=url, proxies=proxies, headers=headers)
    response.encoding = "utf8"
    return response.text


# Write the usable proxies from each list page to a file
def WriteData():
    for i in range(100):
        url = "https://www.xicidaili.com/nn/" + str(i + 1)
        data = GetData(url)
        file = open('Proxies.txt', 'a+')
        file.write(str(data))
        file.close()


# Verify that a proxy works; return 0 if the request fails or times out
def verify(proxies):
    try:
        req = requests.get("https://www.baidu.com", proxies=proxies, timeout=5)
        return req.status_code
    except requests.RequestException:
        return 0


# Parse one list page and keep only the proxies that pass verification
def GetData(url):
    data = list()
    html = GetInfo(url)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find_all("table", id="ip_list")
    soup = BeautifulSoup(str(table[0]), "lxml")
    trs = soup.find_all("tr")
    del trs[0]  # drop the table header row
    for tr in trs:
        ip = tr.select("td")[1].get_text()
        port = tr.select("td")[2].get_text()
        protocol = tr.select("td")[5].get_text()
        address = protocol.lower() + "://" + ip + ":" + port
        proxies = {'http': address}
        if verify(proxies) == 200:
            data.append(address)
    return data


if __name__ == '__main__':
    WriteData()
Returned data:
['http://111.222.141.127:8118', 'https://117.88.177.101:3000', 'http://183.166.136.144:8888', 'http://27.208.231.100:8060', 'http://123.169.99.177:9999', 'http://119.84.84.185:12345', 'http://101.132.190.101:80', 'https://114.99.54.65:8118', 'https://119.4.13.26:1133', 'http://58.253.158.177:9999', 'http://114.223.208.165:8118', 'http://112.84.73.53:9999']
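Since WriteData appends each page's result as a Python list literal, the simplest way to reuse the pool later is to pull the addresses back out with a regular expression and rotate through them. A minimal sketch, assuming Proxies.txt was produced by the script above and using httpbin.org only as a test target:

import re
import random
import requests

# Pull every saved address back out of the concatenated list literals
with open('Proxies.txt') as f:
    pool = re.findall(r"'(https?://[^']+)'", f.read())

# Pick a random proxy from the pool and use it for both http and https targets
proxy = random.choice(pool)
proxies = {"http": proxy, "https": proxy}
resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
print(resp.text)  # the IP reported should be the proxy's, not yours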
Original article: https://www.lizaza.cn/page23.html
Source: oschina
Link: https://my.oschina.net/u/4299292/blog/4258091