MeCab: extracting only the nouns from a phrase

Submitted by 余生长醉 on 2019-12-26 00:36:50
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#@Time  : 2019/12/24 13:57
#@Author: 李现伟
#@File  : TinyTokenizerTest.py
import os

# On Windows the mecab-python bindings read MECAB_PATH / MECAB_CHARSET
# at import time, so set them before importing MeCab.
os.environ['MECAB_PATH'] = 'C:\\Program Files (x86)\\MeCab\\bin\\libmecab.dll'
os.environ['MECAB_CHARSET'] = 'utf-8'

import MeCab
from tiny_tokenizer import WordTokenizer


sentence = '僕は李現偉と申します。日本語を勉強しています。'

# Tokenize the sentence with tiny_tokenizer's MeCab backend, then join
# the tokens back into a whitespace-separated string.
tokenizer1 = WordTokenizer('MeCab')
str_mecab_tokens = tokenizer1.tokenize(sentence)
str_mecab = ' '.join(str(i) for i in str_mecab_tokens)
# print(str_mecab)

# tokenizer = WordTokenizer(tokenizer="mecab", with_postag=False)
# print(tokenizer.tokenize(sentence))
#
# tokenizer = WordTokenizer('Sentencepiece', model_path="data/model.spm")
# print(tokenizer.tokenize(sentence))

# Build a MeCab tagger; extra command-line arguments can be passed via
# the MECAB_TAGGER_ARGS environment variable.
TAGGER = MeCab.Tagger(os.environ.get("MECAB_TAGGER_ARGS", ""))

# Walk the parsed node list and keep the surface forms whose first
# feature field is 名詞 (noun).
node = TAGGER.parseToNode(str_mecab)
res = []
while node:
    # print(node.surface + '----' + node.feature)
    # node.feature is a comma-separated string; field 0 is the POS tag.
    features = node.feature.split(',')
    if features[0] == "名詞":
        res.append(node.surface)
    node = node.next
print(res)
# print(TAGGER.parseNBest(2,"日本"))


Output:

['僕', '李', '日本語', '勉強']
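
Note that the tiny_tokenizer pass is not strictly required: MeCab can parse the raw sentence directly, so the join-then-re-parse step can be dropped. Below is a minimal sketch of the same noun filter as a standalone function, assuming the same MeCab installation as above; extract_nouns is an illustrative name, not part of either library.

import MeCab

def extract_nouns(text):
    # Collect the surface form of every node whose first feature
    # field (the POS tag) is 名詞 (noun).
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)
    nouns = []
    while node:
        if node.feature.split(',')[0] == '名詞':
            nouns.append(node.surface)
        node = node.next
    return nouns

print(extract_nouns('僕は李現偉と申します。日本語を勉強しています。'))
# How unknown words such as the name 李現偉 are split depends on the
# dictionary, so the result may differ slightly from the two-pass version.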