爬取前程无忧python职位信息

匿名 (未验证) 提交于 2019-12-02 22:11:45

1.re实现

 1 import re,os  2 import requests  3 from requests.exceptions import RequestException  4   5 MAX_PAGE = 10 #最大页数  6 KEYWORD = python  7 headers = {  8     User-Agent:  9         Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 10 } 11 file_name = re_job51_python.txt 12  13 # 获取网页源码 14 def getHtml(page): 15     try: 16         url = https://search.51job.com/list/040000,000000,0000,00,9,99,{0},2,{1}.html?.format(KEYWORD,page) 17         response = requests.get(url,headers=headers) 18         response.encoding = response.apparent_encoding 19         return response.text 20     except RequestException: 21         print(请求出错) 22         return None 23  24 # 解析网页源码,得到目标信息 25 def getTarget(html): 26     reg = re.compile( 27         rclass="t1 ">.*? <a target="_blank"  28         title="(.*?)".*? <span class="t2"><a target="_blank"  29         title="(.*?)".*?<span  30         class="t3">(.*?)</span>.*?<span  31         class="t4">(.*?)</span>.*? <span  32         class="t5">(.*?)</span>, 33         re.S)  # 匹配换行符 34     target = re.findall(reg,html) 35     return target 36  37  38 # 保存到文本中 39 def save_to_txt(item): 40     with open(file_name,a,newline=‘‘) as f:  # newline参数防止两行之间有空行 41         for i in range(len(item)): 42             # 最后一个元素换行,非最后则以’,‘隔开 43             if i == len(item)-1: 44                 f.write(item[i]) 45                 f.write(\n) 46             else: 47                 f.write(item[i]+,) 48  49 def main(): 50     # 每次执行前检查文件是否存在,存在则删除 51     if os.path.exists(file_name): 52         os.remove(file_name) 53  54     # 分页爬取 55     for page in range(MAX_PAGE+1): 56         html = getHtml(page) 57         content = getTarget(html) 58         for item in content: 59             save_to_txt(item) 60  61 if __name__ == __main__: 62     main()
View Code

 2.xpath实现

  1 import os   2 import requests   3 from requests.exceptions import RequestException   4 from lxml import etree   5 import pymongo   6 from spiders.前程无忧.mongo_config import *   7    8 # mongo数据库设置   9 client = pymongo.MongoClient(MONGO_URL)  10 db = client[MONGO_DB]  11   12 MAX_PAGE = 5  13 KEYWORD = python  14 headers = {  15     User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)  16     Chrome/63.0.3239.132 Safari/537.36  17 }  18 file_name = xpath_job51_python.txt  19   20 # 获取网页  21 def get_html(page):  22     try:  23         url = https://search.51job.com/list/040000,000000,0000,00,9,99,{},2,{}.html?.format(KEYWORD,page)  24         response = requests.get(url,headers=headers)  25         response.encoding = response.apparent_encoding  26         return response.text  27     except RequestException:  28         return None  29   30 # 解析网页  31 def parse_html(html):  32     # 构造xpath解析对象,可自动修整HTML文本  33     html = etree.HTML(html)  34     # 获取文本 /text()  35     # 获取属性 /@href  36     # 获取第i个标签 /tar_name[i]  从1开始  37     # normalize-space-->去空格换行符  38     # position_name = html.xpath(‘normalize-space(//div[@class="el"]/p/span/a/text())‘)  39   40     # 职位名称,  41     position_names = []  42     for name in html.xpath(//div[@class="el"]/p/span/a/text()):  43         position_name = name.strip()  44         position_names.append(position_name)  45   46     # 职位地址  47     position_urls = html.xpath(//div[@class="el"]/p/span/a/@href)  48   49     # 公司名称  50     company_names = html.xpath(//div[@class="el"]/span[1]/a/text())  51   52     # 公司地址  53     company_urls = html.xpath(//div[@class="el"]/span[1]/a/@href)  54   55     # 位置  56     locations = html.xpath(//div[@class="el"]/span[@class="t3"]/text())  57   58     # 薪资  59     salarys = html.xpath(//div[@class="el"]/span[@class="t4"]/text())  60   61     # 发布时间  62     release_dates = html.xpath(//div[@class="el"]/span[4]/text())  63   64     result = 
zip(position_names,position_urls,company_names,company_urls,locations,salarys,release_dates)  65     return result  66   67   68 def save_to_txt(element):  69     with open(file_name,a,newline=‘‘) as f:  70         for i in range(len(element)):  71             # data = ‘,‘.join(element[i])  72             if i == len(element)-1:  73                 f.write(element[i])  74                 f.write(\n)  75             else:  76                 f.write(element[i]+,)  77   78   79 def save_to_mongo(element):  80     keys = [position_name,position_url,company_name,  81             company_url,location,salary,release_date]  82     result = dict(zip(keys,list(element)))  83     if db[MONGO_TABLE_XPATH].insert(result):  84         print(数据成功存储到mongo数据库中)  85         return True  86     return False  87   88     # 遍历字典元素  89     # for k,v in result.items():  90     #     print(k,‘:‘,v)  91     for key in result:  92         print(key,:,result[key])  93   94   95   96 def main():  97     if os.path.exists(file_name):  98         os.remove(file_name)  99     for page in range(1,MAX_PAGE+1): 100         html = get_html(page) 101         elements = parse_html(html) 102         if elements: 103             for element in elements: 104                 save_to_txt(element) 105                 save_to_mongo(element) 106  107 if __name__ == __main__: 108     main()
View Code

 



原文:https://www.cnblogs.com/ray-mmss/p/9373742.html

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!