1. Processing frameworks
LTP: Chinese word segmentation, POS tagging, out-of-vocabulary word recognition, syntactic parsing, semantic role labeling
Stanford NLP: Chinese word segmentation, POS tagging, out-of-vocabulary word recognition, syntactic parsing
FudanNLP: Chinese word segmentation, syntactic parsing
HanLP: Chinese word segmentation, syntactic parsing, and a range of other algorithms
ICTCLAS: a landmark Chinese word segmentation system
Ansj: a medium-scale Chinese word segmentation system
jieba: lightweight Chinese word segmentation
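
Besides the LTP and jieba examples below, HanLP is also easy to try from Python through the pyhanlp wrapper; a minimal sketch, assuming pyhanlp (and the JVM it needs) is installed:

# Minimal sketch using the pyhanlp wrapper for HanLP (an assumption:
# requires `pip install pyhanlp` plus a JVM; models download on first use).
from pyhanlp import HanLP

# HanLP.segment returns a list of terms, each carrying the word and its POS tag.
for term in HanLP.segment('大家好我是一个例子'):
    print(term.word, term.nature)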
2. Word segmentation
(1). LTP segmentation

from pyltp import Segmentor

seg = Segmentor()
seg.load('model path')  # path to the LTP cws.model segmentation model
words = seg.segment('大家好我是一个例子')
print('|'.join(words))
seg.release()
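
If the default model splits domain terms badly, pyltp's Segmentor can also load a user lexicon alongside the model via load_with_lexicon; a minimal sketch, where 'lexicon path' stands in for a plain-text file with one custom word per line:

from pyltp import Segmentor

seg = Segmentor()
# load_with_lexicon takes the segmentation model plus a plain-text
# lexicon of one custom word per line ('lexicon path' is a placeholder).
seg.load_with_lexicon('model path', 'lexicon path')
words = seg.segment('大家好我是一个例子')
print('|'.join(words))
seg.release()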
(2). jieba segmentation

import jieba

# Full mode: list every word the dictionary can find in the sentence
wordlist = jieba.cut("大家好我是一个例子", cut_all=True)
print("|".join(wordlist))
# Accurate mode (the default): a single best segmentation
wordlist = jieba.cut("大家好我是一个例子")
print("|".join(wordlist))
# Search-engine mode: re-splits long words to improve recall
wordlist = jieba.cut_for_search("大家好我是一个例子")
print("|".join(wordlist))
3. POS tagging

from pyltp import Postagger

words = ['我', '是', '一个', '人']
pst = Postagger()
pst.load('model path')  # path to the LTP pos.model tagging model
tags = pst.postag(words)
for word, tag in zip(words, tags):
    print(word + '-' + tag)
pst.release()
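
In practice the word list usually comes straight from the segmenter, so the two steps chain naturally; a minimal sketch with the same placeholder model paths as above:

from pyltp import Segmentor, Postagger

seg = Segmentor()
seg.load('model path')  # cws.model placeholder, as above
pst = Postagger()
pst.load('model path')  # pos.model placeholder, as above

words = seg.segment('大家好我是一个例子')
tags = pst.postag(words)  # postag accepts the segmenter's output directly
for word, tag in zip(words, tags):
    print(word + '/' + tag)

pst.release()
seg.release()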
4. Named entity recognition

from pyltp import NamedEntityRecognizer

rec = NamedEntityRecognizer()
rec.load('model path')  # path to the LTP ner.model recognition model
ntags = rec.recognize(words, tags)  # words and tags from the previous step
print('|'.join(ntags))
rec.release()
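
The recognizer emits one tag per word in LTP's position-type scheme: O for non-entities, and S/B/I/E prefixes (single, begin, inside, end) combined with Nh (person), Ni (organization), or Ns (place). A small helper, written here purely for illustration (it is not part of pyltp), can collapse these tags back into entity spans:

def extract_entities(words, ntags):
    """Collapse LTP's O / S-xx / B-xx / I-xx / E-xx tags into (text, type) spans.
    Illustrative helper, not a pyltp API."""
    entities, buf, etype = [], [], None
    for word, tag in zip(words, ntags):
        if tag == 'O':
            continue
        pos, _, typ = tag.partition('-')
        if pos == 'S':       # single-word entity
            entities.append((word, typ))
        elif pos == 'B':     # entity starts
            buf, etype = [word], typ
        elif pos == 'I':     # entity continues
            buf.append(word)
        elif pos == 'E':     # entity ends: flush the buffer
            buf.append(word)
            entities.append((''.join(buf), etype))
            buf, etype = [], None
    return entities

print(extract_entities(['我', '是', '一个', '人'], ['O', 'O', 'O', 'O']))  # -> []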
5. Syntactic parsing

from pyltp import Parser
from nltk.parse import DependencyGraph

parser = Parser()
parser.load('model path')  # path to the LTP parser.model parsing model
arcs = parser.parse(words, tags)  # words and tags from the steps above
# Build the tab-separated "word  tag  head  relation" lines
# (one token per line) that nltk's DependencyGraph expects.
conll = ''
for i in range(len(arcs)):
    relation = 'ROOT' if arcs[i].head == 0 else arcs[i].relation
    conll += '%s\t%s\t%d\t%s\n' % (words[i], tags[i], arcs[i].head, relation)
print(conll)
ctree = DependencyGraph(conll)
ctree.tree().draw()
parser.release()
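
If draw() is not an option (e.g., on a headless machine with no display), the arcs can also be printed directly as head-dependent triples; a short sketch reusing the words and arcs from above (arc.head is a 1-based index into words, with 0 marking the sentence root):

# Print each dependency as "head <-relation- dependent".
for i, arc in enumerate(arcs):
    head_word = 'ROOT' if arc.head == 0 else words[arc.head - 1]
    print('%s <-%s- %s' % (head_word, arc.relation, words[i]))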
Source: https://www.cnblogs.com/yangyang12138/p/12417889.html
