python爬虫:爬取拉勾网数据

匿名 (未验证) 提交于 2019-12-02 22:54:36
"""Scrape Python job postings for Chengdu from Lagou and save them to lagou.json."""
import json

import requests
from bs4 import BeautifulSoup


def crawl_detail(id):
    """Fetch the job-description text for a single Lagou posting.

    :param id: numeric posting id, interpolated into the job-detail URL.
    :return: plain text of the ``dd.job_bt`` description element, or '' when
        the element is absent (e.g. an anti-crawler page was returned).
    """
    url = 'https://www.lagou.com/jobs/%s.html' % id
    # Host / Referer / User-Agent mimic a real browser session; Lagou
    # rejects requests that lack them.
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E6%88%90%E9%83%BD',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    # find() returns None when the element is missing (blocked request or
    # changed markup); guard instead of raising AttributeError on .text.
    if job_bt is None:
        return ''
    return job_bt.text


def main():
    """Crawl the Lagou Python job list (Chengdu) and dump results to lagou.json.

    Posts the list-page AJAX endpoint, then fetches each posting's detail
    page via :func:`crawl_detail`. Output is a UTF-8 JSON array of dicts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E6%88%90%E9%83%BD',
        'X-Anit-Forge-Code': '0',
        # A None value tells requests to omit this header entirely.
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest'
    }

    positions = []
    for x in range(1, 2):  # only page 1; widen the range to crawl more pages
        form_data = {
            'first': 'true',
            'pn': x,
            'kd': 'python'
        }
        result = requests.post(
            'https://www.lagou.com/jobs/positionAjax.json?'
            'px=default&city=%E6%88%90%E9%83%BD&'
            'needAddtionalResult=false',
            headers=headers,
            data=form_data)
        json_result = result.json()
        page_positions = json_result['content']['positionResult']['result']
        for position in page_positions:
            position_dict = {
                'workYear': position['workYear'],
                'positionName': position['positionName'],
                'salary': position['salary'],
                'district': position['district'],
                'companyFullName': position['companyFullName']
            }
            position_id = position['positionId']
            position_detail = crawl_detail(position_id)
            position_dict['position_detail'] = position_detail
            positions.append(position_dict)

    # Text mode with an explicit encoding replaces the original
    # binary-write-plus-manual-encode dance; ensure_ascii=False keeps
    # Chinese text readable in the output file.
    with open('lagou.json', 'w', encoding='utf-8') as f:
        json.dump(positions, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
    # crawl_detail('4613044')
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!