#!/usr/bin/env python
# -*- coding:utf-8 -*-
#@Time : 2019/12/24 13:57
#@Author: 李现伟
#@File : TinyTokenizerTest.py
import os
import MeCab
from tiny_tokenizer import WordTokenizer
# Tell the MeCab binding where libmecab lives and which charset to use (Windows setup)
os.environ['MECAB_PATH'] = 'C:\\Program Files (x86)\\MeCab\\bin\\libmecab.dll'
os.environ['MECAB_CHARSET'] = 'utf-8'
sentence = '僕は李現偉と申します。日本語を勉強しています。'
# Tokenize with tiny_tokenizer's MeCab backend, then join the tokens with spaces
tokenizer1 = WordTokenizer('MeCab')
str_mecab_arry = tokenizer1.tokenize(sentence)
str_mecab = ' '.join(str(i) for i in str_mecab_arry)
# print(str_mecab)
# tokenizer = WordTokenizer(tokenizer="mecab", with_postag=False)
# print(tokenizer.tokenize(sentence))
#
# tokenizer = WordTokenizer('Sentencepiece', model_path="data/model.spm")
# print(tokenizer.tokenize(sentence))
# Re-parse the space-joined tokens with a MeCab Tagger to get POS features for each token
TAGGER = MeCab.Tagger(os.environ.get("MECAB_TAGGER_ARGS", ""))
node = TAGGER.parseToNode(str_mecab)
res = []
while node:
    # print(node.surface + '----' + node.feature)
    # node.feature is a CSV string; its first field is the part of speech
    features = node.feature.split(',')
    if features[0] == "名詞":  # keep nouns only
        res.append(node.surface)
    node = node.next
print(res)
# print(TAGGER.parseNBest(2,"日本"))
Execution result:
['僕', '李', '日本語', '勉強']
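
For comparison, the commented-out call above (WordTokenizer(tokenizer="mecab", with_postag=False)) suggests the tokenizer can also attach part-of-speech information itself. The lines below are a minimal sketch, assuming tiny_tokenizer's Token objects expose surface and postag attributes when with_postag=True; under that assumption the second MeCab pass in the script is unnecessary.

# Sketch only: assumes Token has .surface and .postag when with_postag=True
from tiny_tokenizer import WordTokenizer

sentence = '僕は李現偉と申します。日本語を勉強しています。'
tokenizer = WordTokenizer(tokenizer="MeCab", with_postag=True)
nouns = [t.surface for t in tokenizer.tokenize(sentence) if t.postag == "名詞"]
print(nouns)  # noun surfaces only
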
Source: CSDN
Author: Rigenyi
Link: https://blog.csdn.net/redhat1986/article/details/103703415