A thin wrapper around Scrapyd's JSON API for easier use.
To upload an egg package, create an eggs folder in the directory containing this file and place the egg file in it.
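To produce the egg itself, one common approach (a sketch, assuming the Scrapy project carries a setuptools setup.py) is to build it from the project root and copy the result from dist/ into the eggs folder:

python setup.py bdist_egg

scrapyd-client's scrapyd-deploy --build-egg option is another way to write an egg without deploying.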
import requests
import demjson
import pandas as pd
import os
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
DEFAULT_SCRAPYDIP = '127.0.0.1'
DEFAULT_SCRAPYDPORT = '6800'
class ScrapydApi(object):
    def __init__(self, ip=DEFAULT_SCRAPYDIP, port=DEFAULT_SCRAPYDPORT):
        self.ip = ip
        self.port = port
        self.url = 'http://' + ip + ':' + port

    def addversion(self, project_name, version, egg):
        """
        Upload an egg as a new version of a project
        :param project_name: project name
        :param version: version string
        :param egg: egg file name inside the eggs folder
        :return:
        """
        # Note: os.chdir changes the working directory for the whole process.
        npth = os.path.join(os.getcwd(), 'eggs')
        os.chdir(npth)
        os.system('curl %s/addversion.json -F project=%s -F version=%s -F egg=@%s' % (self.url, project_name, version, egg))
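    def addversion_via_requests(self, project_name, version, egg):
        # A minimal sketch of the same upload done in-process with requests
        # instead of shelling out to curl; this method name is ours, not part
        # of the original wrapper. Scrapyd's addversion.json accepts a
        # multipart form with 'project', 'version' and an 'egg' file field.
        egg_path = os.path.join(os.getcwd(), 'eggs', egg)
        with open(egg_path, 'rb') as f:
            ret = requests.post('%s/addversion.json' % self.url,
                                data={'project': project_name, 'version': version},
                                files={'egg': f}).text
        result = demjson.decode(ret)
        logger.info(result)
        return result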
    def _scheduler(self, project_name, spider_name, setting=None, jobid=None, _version=None):
        """
        Run a spider
        :param project_name: project name
        :param spider_name: spider name
        :return:
        """
        # requests drops form keys whose value is None, so the optional
        # fields are simply omitted from the request.
        data = {
            'project': project_name,
            'spider': spider_name,
            'setting': setting,
            'jobid': jobid,
            '_version': _version
        }
        ret = requests.post('%s/schedule.json' % self.url, data=data).text
        result = demjson.decode(ret)
        return result
    def scheduler(self, project_name, spider_name, setting=None, jobid=None, _version=None):
        ret = self._scheduler(project_name, spider_name, setting, jobid, _version)
        self._scheduler_log(project_name, spider_name, ret)

    def scheduler_all(self, project_name, setting=None, jobid=None, _version=None):
        spiders = self._get_spiders_list(project_name)['spiders']
        for i in spiders:
            ret = self._scheduler(project_name, i, setting, jobid, _version)
            self._scheduler_log(project_name, i, ret)

    def _scheduler_log(self, project_name, spider_name, ret):
        if ret['status'] == 'ok':
            result = 'project: %s spider: %s jobid: %s' % (project_name, spider_name, ret['jobid'])
            logger.info(result)
        else:
            logger.error(ret['message'])
    def _get_spiders_list(self, project_name):
        # .text already yields a decoded str, so no encoding argument is
        # needed for demjson.decode.
        ret = requests.get('%s/listspiders.json?project=%s' % (self.url, project_name)).text
        json = demjson.decode(ret)
        if json['status'] == 'ok':
            return json
        else:
            logger.error(json['message'])
    def show_spiders_list(self, project_name):
        """
        List the spiders in a project
        :param project_name: project name
        :return:
        """
        json = self._get_spiders_list(project_name)
        spiders = json['spiders']
        if len(spiders):
            result = 'project: %s -- spiders: %s' % (project_name, ' '.join(spiders))
        else:
            result = 'The spider list is empty'
        logger.info(result)
    def _cancel(self, project_name, jobid):
        data = {
            'project': project_name,
            'job': jobid
        }
        ret = requests.post('%s/cancel.json' % self.url, data=data).text
        ret = demjson.decode(ret)
        return ret

    def cancel(self, project_name, jobid):
        """
        Cancel a running spider
        :param project_name: project name
        :param jobid: job id
        :return:
        """
        ret = self._cancel(project_name, jobid)
        self._cancel_log(project_name, jobid, ret)

    def cancel_all(self, project_name):
        ret = self._get_jobs_list(project_name)
        if ret['status'] == 'ok':
            for i in ret['running']:
                ret = self._cancel(project_name, i['id'])
                self._cancel_log(project_name, i['id'], ret)
        else:
            logger.error(ret['message'])

    def _cancel_log(self, project_name, jobid, ret):
        if ret['status'] == 'ok':
            result = 'project: %s job: %s cancelled, previous state: %s' % (project_name, jobid, ret['prevstate'])
            logger.info(result)
        else:
            logger.error(ret['message'])
    def show_version_list(self, project_name):
        """
        List the versions of a project
        :param project_name: project name
        :return:
        """
        ret = requests.get('%s/listversions.json?project=%s' % (self.url, project_name)).text
        json = demjson.decode(ret)
        if json['status'] == 'ok':
            version = json.get('versions')
            if not len(version):
                result = 'Project %s does not exist' % project_name
                logger.error(result)
            else:
                result = 'project: %s -- versions: %s' % (project_name, version)
                logger.info(result)
        else:
            result = json['message']
            logger.error(result)
    def daemonstatus(self):
        """
        Check the load status of the Scrapyd server
        :return:
        """
        ret = requests.get('%s/daemonstatus.json' % self.url).text
        logger.info(ret)
    def show_projects_list(self):
        """
        List the projects deployed on the server
        :return:
        """
        ret = requests.get('%s/listprojects.json' % self.url).text
        ret = demjson.decode(ret)
        if len(ret):
            if not len(ret['projects']):
                ret['projects'] = 'no projects yet'
            result = 'projects: %s' % ret['projects']
        else:
            result = 'The project list is empty'
        logger.info(result)
    def _get_jobs_list(self, project_name):
        ret = requests.get('%s/listjobs.json?project=%s' % (self.url, project_name)).text
        ret = demjson.decode(ret)
        return ret
    def show_jobs_list(self, project_name):
        """
        List pending/running/finished jobs for a project
        :param project_name: project name
        :return:
        """
        ret = self._get_jobs_list(project_name)
        if ret['status'] == 'ok':
            logger.info('project: %s' % project_name)
            pd.set_option('display.max_columns', None)
            pd.set_option('display.max_rows', None)
            pd.set_option('display.width', 1000)
            logger.info('pending jobs:')
            logger.info('\n%s', pd.DataFrame(ret['pending'], columns=['id', 'spider', 'start_time', 'end_time']))
            logger.info('running jobs:')
            logger.info('\n%s', pd.DataFrame(ret['running'], columns=['id', 'spider', 'start_time', 'end_time']))
            logger.info('finished jobs:')
            logger.info('\n%s', pd.DataFrame(ret['finished'], columns=['id', 'spider', 'start_time', 'end_time']))
        else:
            logger.error(ret['message'])
    def del_version(self, project_name, version):
        """
        Delete a project version
        :param project_name: project name
        :param version: version string
        :return:
        """
        data = {
            'project': project_name,
            'version': version
        }
        ret = requests.post('%s/delversion.json' % self.url, data=data).text
        json = demjson.decode(ret)
        if json['status'] == 'ok':
            logger.info('project: %s version: %s deleted' % (project_name, version))
        else:
            logger.error('request failed: %s' % json['message'])

    def del_project(self, project_name):
        """
        Delete a project
        :param project_name: project name
        :return:
        """
        data = {
            'project': project_name
        }
        ret = requests.post('%s/delproject.json' % self.url, data=data).text
        ret = demjson.decode(ret)
        if ret['status'] == 'ok':
            logger.info('project: %s deleted' % project_name)
        else:
            logger.error('request failed: %s' % ret['message'])
if __name__ == '__main__':
    project_name = ''
    spider_name = ''
    sa = ScrapydApi()
    # sa.show_projects_list()
    # sa.show_version_list(project_name)
    # sa.show_spiders_list(project_name)
    # sa.scheduler(project_name, spider_name)
    # sa.cancel(project_name, '5822a5caefc211e9ab6300e04c68038a')
    # sa.show_jobs_list(project_name)
    # sa.del_version(project_name, 'r23')
    # sa.del_project(project_name)
    # sa.addversion(project_name, 'v0.1', 'eggs.egg')
    # sa.daemonstatus()
    # sa.scheduler_all(project_name)
    # sa.cancel_all(project_name)
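    # A hypothetical end-to-end example (host, project and spider names are
    # placeholders): schedule one spider with a per-run settings override,
    # which Scrapyd's schedule.json accepts as setting=KEY=VALUE.
    # sa = ScrapydApi(ip='192.168.1.10', port='6800')
    # sa.scheduler('myproject', 'myspider', setting='DOWNLOAD_DELAY=2')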
Source: https://www.cnblogs.com/lpapython/p/11685424.html