```python
# pip install flask-swagger-ui
# pip install flask_swagger
from flask import Flask, jsonify, abort, request, make_response
from flask_swagger import swagger
from flask_swagger_ui import get_swaggerui_blueprint

import jiebahelper

app = Flask(__name__)

SWAGGER_URL = '/api/docs'  # URL for exposing Swagger UI (without trailing '/')
API_URL = '/swagger'       # URL of the spec generated by flask_swagger below

# Call factory function to create our blueprint
swaggerui_blueprint = get_swaggerui_blueprint(
    # Swagger UI static files will be mapped to '{SWAGGER_URL}/dist/'
    SWAGGER_URL,
    API_URL,
    config={  # Swagger UI config overrides
        'app_name': "Jiebao Application"
    }
)

# Register blueprint at URL
# (URL must match the one given to factory function above)
app.register_blueprint(swaggerui_blueprint, url_prefix=SWAGGER_URL)


# https://github.com/OAI/OpenAPI-Specification/blob/master/versions/2.0.md#parameter-object
@app.route("/swagger")
def spec():
    swag = swagger(app)
    swag['info']['version'] = "1.0"
    swag['info']['title'] = "Segment API"
    return jsonify(swag)


@app.route('/')
def index():
    return 'Jiebao Segment API by Python.'


@app.errorhandler(404)
def not_found(error):
    # Flask's default 404 response is an HTML page; the client expects JSON,
    # so return a JSON error body instead.
    return make_response(jsonify({'error': 'Not found'}), 404)


@app.errorhandler(400)
def para_error(error):
    # Bad request data
    return make_response(jsonify({'error': 'Parameter Error'}), 400)


@app.route('/segment', methods=['POST'])
def segment():
    '''
    Segment text. No POS tags; stopwords removed.
    ---
    tags:
      - segment
    parameters:
      - in: body
        name: body
        description: text to segment
        required: true
        schema:
          type: string
    '''
    # request.data is the raw request body (bytes); reject an empty body
    a = request.data.strip()
    if not a:
        abort(400)
    ret = jiebahelper.dosegment(a)
    return ret


@app.route('/segmentpos', methods=['POST'])
def segmentpos():
    '''
    Segment text. With POS tags; stopwords removed.
    ---
    tags:
      - segment
    parameters:
      - in: body
        name: body
        description: text to segment
        required: true
        schema:
          type: string
    '''
    a = request.data.strip()
    if not a:
        abort(400)
    ret = jiebahelper.dosegment_with_pos(a)
    return ret


@app.route('/segmentall', methods=['POST'])
def segmentall():
    '''
    Segment text. With POS tags; stopwords kept.
    ---
    tags:
      - segment
    parameters:
      - in: body
        name: body
        description: text to segment
        required: true
        schema:
          type: string
    '''
    a = request.data.strip()
    if not a:
        abort(400)
    ret = jiebahelper.dosegment_all(a)
    return ret


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)
```
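flask_swagger builds a Swagger 2.0 spec from the YAML block after the `---` marker in each view's docstring, and the Swagger UI blueprint then serves that spec from `API_URL`. A quick way to confirm the spec is being generated is to fetch `/swagger` and inspect it. The following is only a sketch, assuming the server above is already running on 127.0.0.1:5000 and the `requests` package is installed:

```python
import requests

# Fetch the spec that spec() returns via flask_swagger
spec = requests.get('http://127.0.0.1:5000/swagger').json()

print(spec['info'])           # should show title 'Segment API' and version '1.0'
print(sorted(spec['paths']))  # should list the routes whose docstrings contain a
                              # '---' YAML block: /segment, /segmentall, /segmentpos
```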
Start the site by running `python SegmentAPI.py` from the command line.
Then open 127.0.0.1:5000 in a browser.
The endpoints can be tested directly in the Swagger UI at /api/docs; the result is shown in the figure below.
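Besides the Swagger UI, the endpoints can also be exercised from a small client script. This is only a sketch, assuming the service is running on 127.0.0.1:5000 and `requests` is installed; the sample sentence is the one used in jieba's own documentation:

```python
import requests

BASE = 'http://127.0.0.1:5000'
text = '我来到北京清华大学'

# /segment returns space-separated words with stopwords removed
r = requests.post(BASE + '/segment', data=text.encode('utf-8'))
print(r.status_code, r.text)

# /segmentpos returns comma-separated "word/flag" pairs
r = requests.post(BASE + '/segmentpos', data=text.encode('utf-8'))
print(r.status_code, r.text)

# An empty body triggers the 400 handler and returns a JSON error
r = requests.post(BASE + '/segment', data=b'')
print(r.status_code, r.json())   # expected: 400 {'error': 'Parameter Error'}
```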
Segmentation is handled by jieba; the source of jiebahelper.py follows:
```python
# jiebahelper: a thin wrapper module around jieba segmentation
import jieba
import jieba.analyse
import jieba.posseg
import re
import datetime

# Load the user-defined dictionary
jieba.load_userdict('userdict.txt')

# Regex matching strings that contain at least one Chinese character
contains_hanzi_pattern = re.compile(r'.*[\u4e00-\u9fa5]+.*')


# Build the stopword list from a file, one word per line
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


stopwords = stopwordslist('./stopwordshit.txt')  # path of the stopword file
emptyList = ["\t", "\r\n", "\r", "\n"]           # whitespace tokens to drop


def dosegment(sentence, must_contains_hanzi=False):
    '''
    Segment a sentence (no POS tags, stopwords removed).
    :param sentence: input text
    :param must_contains_hanzi: if False (default) every word is returned;
        if True, words without any Chinese character are dropped
    :return: space-separated words
    '''
    start = datetime.datetime.now()
    sentence_seged = jieba.cut(sentence.strip())
    step1 = datetime.datetime.now()
    # Keep a word unless it is a stopword, pure whitespace, or (when
    # must_contains_hanzi is True) contains no Chinese character
    outstr = " ".join(
        word for word in sentence_seged
        if word not in stopwords
        and word not in emptyList
        and (not must_contains_hanzi or contains_hanzi_pattern.match(word))
    )
    step2 = datetime.datetime.now()
    print("cut:{}us filter:{}us".format((step1 - start).microseconds,
                                        (step2 - step1).microseconds))
    return outstr


def dosegment_with_pos(sentence, must_contains_hanzi=False):
    '''
    Segment a sentence with POS tags, stopwords removed.
    :param sentence: input text
    :param must_contains_hanzi: if False (default) every word is returned;
        if True, words without any Chinese character are dropped
    :return: comma-separated "word/flag" pairs
    '''
    start = datetime.datetime.now()
    sentence_seged = jieba.posseg.cut(sentence.strip())
    step1 = datetime.datetime.now()
    outstr = ''
    for x in sentence_seged:
        # Optionally require each word to contain at least one Chinese character
        if (x.word not in stopwords and x.word not in emptyList
                and (not must_contains_hanzi or contains_hanzi_pattern.match(x.word))):
            outstr += "{}/{},".format(x.word, x.flag)
    step2 = datetime.datetime.now()
    print("poscut:{}us filter:{}us".format((step1 - start).microseconds,
                                           (step2 - step1).microseconds))
    return outstr


def dosegment_all(sentence):
    '''
    Segment a sentence with POS tags, without removing stopwords.
    :param sentence: input text
    :return: comma-separated "word/flag" pairs
    '''
    sentence_seged = jieba.posseg.cut(sentence.strip())
    outstr = ''
    for x in sentence_seged:
        outstr += "{}/{},".format(x.word, x.flag)
    return outstr


# Extract keywords with jieba's TF-IDF based extractor
def extract_tags(content, topk):
    content = content.strip()
    tags = jieba.analyse.extract_tags(content, topK=topk)
    return ','.join(tags)
```
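The module can also be used directly, without going through the HTTP layer. A minimal sketch, assuming `userdict.txt` and `stopwordshit.txt` exist in the working directory as loaded above:

```python
import jiebahelper

text = '我来到北京清华大学'

print(jiebahelper.dosegment(text))           # space-separated words, stopwords removed
print(jiebahelper.dosegment_with_pos(text))  # "word/flag," pairs, stopwords removed
print(jiebahelper.dosegment_all(text))       # "word/flag," pairs, stopwords kept
print(jiebahelper.extract_tags(text, 3))     # top-3 keywords from jieba.analyse
```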