Chinese Word Segmentation - jieba
import re
import jieba
news_CN = '''
央视315晚会曝光湖北省知名的神丹牌、莲田牌“土鸡蛋”实为普通鸡蛋冒充,同时在商标上玩猫腻,
分别注册“鲜土”、注册“好土”商标,让消费者误以为是“土鸡蛋”。3月15日晚间,新京报记者就此
事致电湖北神丹健康食品有限公司方面,其工作人员表示不知情,需要了解清楚情况,截至发稿暂未
取得最新回应。新京报记者还查询发现,湖北神丹健康食品有限公司为农业产业化国家重点龙头企
业、高新技术企业,此前曾因涉嫌虚假宣传“中国最大的蛋品企业”而被罚6万元。
'''
string = re.sub(r'[^\w]', '', news_CN)  # strip punctuation with a regex; this cleaned string is reused below
seg_list = jieba.cut(string, cut_all=False)  # accurate mode (the default)
print('/'.join(seg_list))
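# For comparison, jieba's other two segmentation modes on the same cleaned string
# (a small aside; only the accurate mode is used in the rest of these notes):
print('/'.join(jieba.cut(string, cut_all=True)))  # full mode: lists every word the dictionary can find
print('/'.join(jieba.cut_for_search(string)))     # search-engine mode: accurate mode plus extra splits of long words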
'''
# Alternative: load the user dictionary from a file at `path`
path = ''
file = open(path, 'r', encoding='utf-8')
jieba.load_userdict(file)
file.close()
'''
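# For reference, a user-dict file holds one entry per line in the form
# "word [frequency] [POS tag]", with frequency and tag optional, e.g.
#   神丹牌 3 nz
#   土鸡蛋 3 n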
jieba.load_userdict(['神丹牌', '莲田牌', '土鸡蛋', '新京报'])  # load the custom dictionary entries
seg_list = jieba.cut(string, cut_all=False)  # accurate mode again, on the same cleaned string
print('/'.join(seg_list))  # words listed in the user dictionary are no longer split apart
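Besides loading a whole dictionary, single words can also be registered on the fly with jieba.add_word and jieba.suggest_freq; a minimal sketch of that alternative:

import jieba

jieba.add_word('神丹牌')                 # register one word
jieba.add_word('土鸡蛋', tag='n')        # an optional POS tag can be attached
jieba.suggest_freq('新京报', tune=True)  # tune the internal frequency so the word stands as one token
print('/'.join(jieba.cut('央视曝光神丹牌土鸡蛋,新京报记者跟进报道', cut_all=False)))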
English Tokenization - NLTK
import nltk
from nltk.corpus import names
from nltk.classify import NaiveBayesClassifier
# One-time data downloads needed by the tokenizer, tagger and name corpus:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('names')

# 'news' is not defined in these notes; it is assumed to be a corpus object with a
# .data list of raw documents, e.g. sklearn's 20 newsgroups:
# from sklearn.datasets import fetch_20newsgroups
# news = fetch_20newsgroups(subset='all')

# Split the first document into sentences
sentences = nltk.sent_tokenize(news.data[0])
postag_data = []
for sent in sentences:
    # POS-tag each token, e.g. [('RULE', 'NNP'), ...]
    postag_data += nltk.pos_tag(nltk.word_tokenize(sent))
for word in postag_data:
    # caveat: capitalized words tend to be tagged NNP (proper noun) even when they are not
    if word[1] == 'NNP':
        print(word)
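Because plain NNP filtering over-collects capitalized words, NLTK's chunk-based named-entity recognizer is often a better filter; a minimal sketch on a made-up sentence (it additionally needs the maxent_ne_chunker and words data packages):

import nltk

# nltk.download('maxent_ne_chunker'); nltk.download('words')
sentence = "Frank works for the New York Times in Chicago."
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
tree = nltk.ne_chunk(tagged)  # groups NNP runs into subtrees labelled PERSON, ORGANIZATION, GPE, ...
for subtree in tree:
    if hasattr(subtree, 'label'):
        print(subtree.label(), ' '.join(token for token, tag in subtree.leaves()))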
Example 1: Predicting gender from a name
# Load the data: [(u'Aaron', 'male'), (u'Abbey', 'male'), ...]
data = ([(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')])
# Feature extraction: use only the last letter of the name
def gender_features(word):
    return {'last_letter': word[-1]}
train_set = [(gender_features(n), g) for (n, g) in data]
# Train the model and classify a new name
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('Frank')))
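To check how well the last-letter feature actually performs, the data can be shuffled and part of it held out; a minimal sketch reusing data and gender_features from above (the split size of 500 is arbitrary):

import random
from nltk.classify import accuracy

random.shuffle(data)
test_set = [(gender_features(n), g) for (n, g) in data[:500]]
train_set = [(gender_features(n), g) for (n, g) in data[500:]]
classifier = NaiveBayesClassifier.train(train_set)
print(accuracy(classifier, test_set))         # accuracy on the 500 held-out names
classifier.show_most_informative_features(5)  # which last letters carry the most signal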
Example 2: Estimating the proportion of positive and negative words in a review
positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']
neutral_vocab = ['movie', 'the', 'sound', 'was', 'is', 'actors', 'did', 'know', 'words', 'not']
# Turn a list of words into a bag-of-words feature dict
def word_feats(words):
    return dict([(word, True) for word in words])
# Each vocabulary entry is wrapped in a list so word_feats sees the whole word
# rather than iterating over its individual characters
positive_features = [(word_feats([pos]), 'pos') for pos in positive_vocab]
negative_features = [(word_feats([neg]), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats([neu]), 'neu') for neu in neutral_vocab]
train_set = negative_features + positive_features + neutral_features
classifier = NaiveBayesClassifier.train(train_set)
neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    # classify one word at a time; strip punctuation so 'movie,' matches 'movie'
    classResult = classifier.classify(word_feats([word.strip(',.!?')]))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1
print('Positive: ' + str(float(pos) / len(words)))
print('Negative: ' + str(float(neg) / len(words)))
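The per-word vote above can also be wrapped into a small reusable helper and tried on a clearly negative sentence; a minimal sketch reusing word_feats and classifier from above, where the function name sentiment_ratios is only for illustration:

def sentiment_ratios(text, classifier):
    # Return (positive_ratio, negative_ratio) over the words of the text
    words = text.lower().split(' ')
    pos = sum(1 for w in words if classifier.classify(word_feats([w.strip(',.!?')])) == 'pos')
    neg = sum(1 for w in words if classifier.classify(word_feats([w.strip(',.!?')])) == 'neg')
    return pos / len(words), neg / len(words)

print(sentiment_ratios("Awesome movie, I liked it", classifier))
print(sentiment_ratios("The sound was terrible and the actors were bad", classifier))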