xpath




<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>xpath的基本语法</title>
</head>
<body>
<script>
/*
* Basic XPath syntax (the examples refer to the <note> document below):
* 1. Path from the root: /   e.g. /html/body/note/book
* 2. Match at any depth: //   e.g. //book
* 3. Index predicate: [] -- indexes start at 1 and count among siblings under the same parent.
*    e.g. //book[1] works as expected; //title[1] does NOT pick out one single title,
*    because every <title> here is the first title under its own parent.
*    Counting from the end uses last(): //book[last()] is the last book,
*    book[last()-1] is the second to last, and so on.
*    A position test also works: book[position()>1]
* 4. Exact match with an attribute predicate: e.g. //title[@name="zcb1"]
* The four forms above all select elements (tags).
*
* The forms below select concrete values:
* 5. Text wrapped by a tag: use text() --> string   e.g. //book[2]/title/text() yields the string itself
* 6. Value of an attribute: use @attribute-name --> string   e.g. //book[1]/title/@name
* */
</script>
<note>
<book>
<title name="zcb1">哈利波特1</title>
<price>100</price>
</book>
<book>
<title>哈利波特2</title>
<price>101</price>
</book>
<book>
<title>哈利波特3</title>
<price>102</price>
</book>
</note>
</body>
</html>
xpath 练习:
https://movie.douban.com/top250

//ol[@class="grid_view"]/li//span[@class="title"][1]/text()

//ol/li//div[@class="hd"]/a/@href

//ol/li//div[@class="star"]/span[last()]

//a[text()='下一页>']/@href
lxml:
lxml 是一个第三方库(需要先安装:pip install lxml),它可以把 HTML 字符串解析成元素树,然后用 xpath 语法进行查询:


1 from lxml import etree
2
def test():
    """Demonstrate basic lxml + XPath usage on an inline HTML snippet.

    Parses the snippet with ``etree.HTML`` and prints the result of four
    queries: all ``<li>`` elements, the text of one link, the ``class``
    attribute of the third ``<li>``, and a ``contains()`` fuzzy match.
    Returns nothing; the results are only printed.
    """
    # The last <li> is deliberately left unclosed; per the original note this
    # does not affect the query results.
    html_str = '''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Title</title>
    </head>
    <body>
        <div>
            <ul>
                <li class="item1"><a href="link1.html">first item</a></li>
                <li class="item2"><a href="link2.html">ist item</a></li>
                <li class="item3"><a href="link3.html">f item</a></li>
                <li class="item4"><a href="link4.html">st item</a></li>
                <li class="item5"><a href="link5.html">fi item</a></li>
                <li class="item6"><a href="link6.html">tte item</a></li>
                <li> <!-- 此处 li 标签未封闭 。但是它也是不影响结果的-->
            </ul>
        </div>
    </body>
    </html>
    '''

    # 1. Turn the string into an element tree.
    data = etree.HTML(html_str)

    # 2. Query it with XPath; the results below all come back as lists.
    result_obj = data.xpath("/html//ul/li")
    print(result_obj)

    # Text wrapped by a tag: the first <a>, expected ['first item'].
    content = data.xpath("//a[@href='link1.html']/text()")
    print(content)

    # Value of an attribute.
    value = data.xpath("//li[3]/@class")
    print(value)

    # Fuzzy match with contains(): every <li> whose class contains "it".
    ret = data.xpath("//li[contains(@class,'it')]")
    print(ret)

    # Tip from the original notes: for messy input,
    #     etree.tostring(data).decode("utf8")
    # serialises the parsed tree with the missing closing tags filled in.


if __name__ == '__main__':
    test()
xpath 和 lxml 的 代码使用步骤:


1 import requests
2 import random
3
4 USER_AGENT = [
5 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
6 # IPhone
7 "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
8 # IPod
9 "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
10 # IPAD
11 "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
12 "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
13 # Android
14 "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
15 "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
16 # QQ浏览器 Android版本
17 "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
18 # Android Opera Mobile
19 "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
20 # Android Pad Moto Xoom
21 "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
22 # BlackBerry
23 "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
24 # WebOS HP Touchpad
25 "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
26 # Nokia N97
27 "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
28 # Windows Phone Mango
29 "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
30 # UC浏览器
31 "UCWEB7.0.2.37/28/999",
32 "NOKIA5700/ UCWEB7.0.2.37/28/999",
33 # UCOpenwave
34 "Openwave/ UCWEB7.0.2.37/28/999",
35 # UC Opera
36 "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
37 ]
38
# Target page: https://search.chinahr.com/bj/job/pn1/?key=python
class ChinaHRSpider:
    """Download the first ChinaHR search-result page and save it as HTML."""

    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://search.chinahr.com/bj/job/pn1/"
        # One User-Agent is picked at random per spider instance.
        self.headers = {"User-Agent": random.choice(USER_AGENT)}

    # 1. Send the request.
    def send_requests(self, params):
        """Fetch ``base_url`` with *params* and return the body as text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status,
                so an error page is never saved as if it were a result page.
        """
        response = requests.get(
            self.base_url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.content.decode("utf8")

    # 2. Parse the data (to be done with XPath).
    def parase_data(self, data):
        """Placeholder for the XPath parsing step; currently does nothing."""
        pass

    # 3. Save the data.
    def write_file(self, data, path="ChinaHR.html"):
        """Write the text *data* to *path* as UTF-8."""
        with open(path, "w", encoding="utf8") as f:
            f.write(data)

    # 4. Orchestrate the steps.
    def run(self):
        """Fetch the "python" search page and store it in ``ChinaHR.html``."""
        params = {"key": "python"}
        data_str = self.send_requests(params)
        self.write_file(data_str)


if __name__ == '__main__':
    ChinaHRSpider().run()
下面解析上面拿到的html文档。

1 import requests
2 import random
3 from lxml import etree
4
5 USER_AGENT = [
6 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
7 # IPhone
8 "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
9 # IPod
10 "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
11 # IPAD
12 "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
13 "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
14 # Android
15 "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
16 "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
17 # QQ浏览器 Android版本
18 "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
19 # Android Opera Mobile
20 "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
21 # Android Pad Moto Xoom
22 "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
23 # BlackBerry
24 "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
25 # WebOS HP Touchpad
26 "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
27 # Nokia N97
28 "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
29 # Windows Phone Mango
30 "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
31 # UC浏览器
32 "UCWEB7.0.2.37/28/999",
33 "NOKIA5700/ UCWEB7.0.2.37/28/999",
34 # UCOpenwave
35 "Openwave/ UCWEB7.0.2.37/28/999",
36 # UC Opera
37 "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
38 ]
39
# Target page: https://search.chinahr.com/bj/job/pn1/?key=python
class ChinaHRSpider:
    """Download the first ChinaHR search-result page and save it as HTML."""

    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://search.chinahr.com/bj/job/pn1/"
        # One User-Agent is picked at random per spider instance.
        self.headers = {"User-Agent": random.choice(USER_AGENT)}

    # 1. Send the request.
    def send_requests(self, params):
        """Fetch ``base_url`` with *params* and return the body as text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status,
                so an error page is never saved as if it were a result page.
        """
        response = requests.get(
            self.base_url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.content.decode("utf8")

    # 2. Parse the data with XPath.
    def parase_data(self, data_str):
        """Parse *data_str* into an lxml element tree and return it.

        The XPath queries themselves are not written yet in this step of the
        tutorial; returning the tree lets the caller run them.
        """
        # a. Convert the string into an element tree.
        html_data = etree.HTML(data_str)
        # b. XPath queries go here.
        return html_data

    # 3. Save the data.
    def write_file(self, data, path="ChinaHR.html"):
        """Write the text *data* to *path* as UTF-8."""
        with open(path, "w", encoding="utf8") as f:
            f.write(data)

    # 4. Orchestrate the steps.
    def run(self):
        """Fetch the "python" search page and store it in ``ChinaHR.html``."""
        params = {"key": "python"}
        data_str = self.send_requests(params)
        self.write_file(data_str)
def parase_data(data_str):
    """Return the job titles found in a saved ChinaHR result page.

    Args:
        data_str: HTML text of the result page.

    Returns:
        The list produced by the XPath query: the ``title`` attribute of each
        ``<li>`` in the first ``<ul>`` under the ``job-list-box`` div.
    """
    # a. Convert the string into an element tree.
    html_data = etree.HTML(data_str)
    # b. Run the XPath query.
    return html_data.xpath('//div[@class="job-list-box"]//ul[1]/li/@title')


def test():
    """Parse the previously saved ``ChinaHR.html`` and print the job titles."""
    with open("ChinaHR.html", "r", encoding="utf8") as f:
        data_str = f.read()
    jobname_list = parase_data(data_str)
    # Show the result; the original computed it and threw it away.
    print(jobname_list)


if __name__ == '__main__':
    # Run ChinaHRSpider().run() once beforehand to create ChinaHR.html.
    test()

1 import requests
2 import random
3 from lxml import etree
4 import json
5
6 USER_AGENT = [
7 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
8 # IPhone
9 "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
10 # IPod
11 "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
12 # IPAD
13 "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
14 "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
15 # Android
16 "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
17 "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
18 # QQ浏览器 Android版本
19 "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
20 # Android Opera Mobile
21 "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
22 # Android Pad Moto Xoom
23 "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
24 # BlackBerry
25 "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
26 # WebOS HP Touchpad
27 "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
28 # Nokia N97
29 "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
30 # Windows Phone Mango
31 "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
32 # UC浏览器
33 "UCWEB7.0.2.37/28/999",
34 "NOKIA5700/ UCWEB7.0.2.37/28/999",
35 # UCOpenwave
36 "Openwave/ UCWEB7.0.2.37/28/999",
37 # UC Opera
38 "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
39 ]
# Target page: https://search.chinahr.com/bj/job/pn1/?key=python
class ChinaHRSpider:
    """Scrape the first ChinaHR result page for "python" jobs into JSON."""

    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://search.chinahr.com/bj/job/pn1/"
        # One User-Agent is picked at random per spider instance.
        self.headers = {"User-Agent": random.choice(USER_AGENT)}

    # 1. Send the request.
    def send_requests(self, params):
        """Fetch ``base_url`` with *params* and return the body as text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        response = requests.get(
            self.base_url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.content.decode("utf8")

    @staticmethod
    def _parse_row(div):
        """Turn one result-row element into a dict, or None if it is not a job.

        *div* only needs an ``xpath(expr)`` method returning a list. Missing
        text fields become "" instead of raising ``IndexError``.
        """
        titles = div.xpath('ul/li[@class="job-name"]/@title')
        if not titles:
            # Not a job row (no job-name cell), so there is nothing to extract.
            return None

        def first_text(expr):
            found = div.xpath(expr)
            return found[0].strip("\n ") if found else ""

        # The address cell packs "address|experience|education" in one string;
        # query it once and pad so short rows still unpack into three fields.
        parts = first_text('ul/li[@class="job-address"]/text()').split('|')
        parts += [""] * (3 - len(parts))
        job_address, jingyan, xueli = parts[:3]

        return {
            "job_name": str(titles[0]),
            "fabu_date": first_text('ul/li[@class="fabu-date"]/text()'),
            "job_address": job_address,
            "jingyan": jingyan,
            "xueli": xueli,
            "company_name": first_text('ul/li[@class="job-company"]/text()'),
        }

    # 2. Parse the data.
    def parase_data(self, data_str):
        """Extract one dict per job row from the page HTML.

        Returns:
            A list of dicts with the keys ``job_name``, ``fabu_date``,
            ``job_address``, ``jingyan``, ``xueli`` and ``company_name``;
            an empty list when *data_str* is empty.
        """
        if not data_str or not data_str.strip():
            return []
        # a. Convert the string into an element tree.
        html_data = etree.HTML(data_str)
        # b. Each child div of the list box holds one row of job information.
        divs = html_data.xpath('//div[@class="job-list-box"]/div')
        rows = (self._parse_row(div) for div in divs)
        return [row for row in rows if row is not None]

    # 3. Save the data.
    def write_file(self, data, path="ChinaHR.json"):
        """Dump *data* to *path* as UTF-8 JSON, keeping non-ASCII text readable."""
        with open(path, "w", encoding="utf8") as f:
            json.dump(data, f, ensure_ascii=False)

    # 4. Orchestrate the steps.
    def run(self):
        """Fetch, parse and store the first "python" result page."""
        params = {"key": "python"}
        data_str = self.send_requests(params)
        ret = self.parase_data(data_str)
        self.write_file(ret)


if __name__ == '__main__':
    ChinaHRSpider().run()

1 import requests
2 import random
3 from lxml import etree
4 import json
5
6 USER_AGENT = [
7 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
8 # IPhone
9 "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
10 # IPod
11 "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
12 # IPAD
13 "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
14 "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
15 # Android
16 "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
17 "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
18 # QQ浏览器 Android版本
19 "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
20 # Android Opera Mobile
21 "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
22 # Android Pad Moto Xoom
23 "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
24 # BlackBerry
25 "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
26 # WebOS HP Touchpad
27 "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
28 # Nokia N97
29 "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
30 # Windows Phone Mango
31 "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
32 # UC浏览器
33 "UCWEB7.0.2.37/28/999",
34 "NOKIA5700/ UCWEB7.0.2.37/28/999",
35 # UCOpenwave
36 "Openwave/ UCWEB7.0.2.37/28/999",
37 # UC Opera
38 "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
39 ]
# Result pages look like https://search.chinahr.com/bj/job/pn9/?key=python
class ChinaHRSpider:
    """Scrape several ChinaHR result pages for "python" jobs into one JSON file."""

    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # The page number and a trailing "/" are appended per request.
        self.base_url = "https://search.chinahr.com/bj/job/pn"
        # One User-Agent is picked at random per spider instance.
        self.headers = {"User-Agent": random.choice(USER_AGENT)}
        # Rows accumulated across all pages.
        self.job_list = []

    # 1. Send the request.
    def send_requests(self, new_url, params):
        """Fetch *new_url* with *params* and return the body as text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
        """
        response = requests.get(
            new_url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.content.decode("utf8")

    @staticmethod
    def _parse_row(div):
        """Turn one result-row element into a dict, or None if it is not a job.

        *div* only needs an ``xpath(expr)`` method returning a list. Missing
        text fields become "" instead of raising ``IndexError``.
        """
        titles = div.xpath('ul/li[@class="job-name"]/@title')
        if not titles:
            # Not a job row (no job-name cell), so there is nothing to extract.
            return None

        def first_text(expr):
            found = div.xpath(expr)
            return found[0].strip("\n ") if found else ""

        # The address cell packs "address|experience|education" in one string;
        # query it once and pad so short rows still unpack into three fields.
        parts = first_text('ul/li[@class="job-address"]/text()').split('|')
        parts += [""] * (3 - len(parts))
        job_address, jingyan, xueli = parts[:3]

        return {
            "job_name": str(titles[0]),
            "fabu_date": first_text('ul/li[@class="fabu-date"]/text()'),
            "job_address": job_address,
            "jingyan": jingyan,
            "xueli": xueli,
            "company_name": first_text('ul/li[@class="job-company"]/text()'),
        }

    # 2. Parse the data.
    def parase_data(self, data_str):
        """Append one dict per job row in *data_str* to ``self.job_list``.

        Returns nothing. Empty input adds no rows.
        """
        if not data_str or not data_str.strip():
            return
        # a. Convert the string into an element tree.
        html_data = etree.HTML(data_str)
        # b. Each child div of the list box holds one row of job information.
        divs = html_data.xpath('//div[@class="job-list-box"]/div')
        for div in divs:
            row = self._parse_row(div)
            if row is not None:
                self.job_list.append(row)

    # 3. Save the data.
    def write_file(self, path="ChinaHR.json"):
        """Dump ``self.job_list`` to *path* as UTF-8 JSON."""
        with open(path, "w", encoding="utf8") as f:
            json.dump(self.job_list, f, ensure_ascii=False)

    # 4. Orchestrate the steps.
    def run(self, pages=range(1, 10)):
        """Download and parse every page in *pages*, then write the JSON file.

        Args:
            pages: iterable of page numbers; defaults to pages 1-9.
        """
        params = {"key": "python"}
        completed = False
        try:
            for page_num in pages:
                # Announce the page before fetching it, as the message says.
                print("正在下载第{}页......".format(page_num))
                new_url = self.base_url + str(page_num) + "/"
                data_str = self.send_requests(new_url, params)
                self.parase_data(data_str)
            completed = True
        finally:
            # Write once instead of rewriting the file after every page. On a
            # failure keep the rows gathered so far, but do not replace an
            # existing file with an empty list.
            if completed or self.job_list:
                self.write_file()


if __name__ == '__main__':
    ChinaHRSpider().run()
bs4
来源:https://www.cnblogs.com/zach0812/p/12005571.html
