我在《三国演义》人物出场次数统计程序的基础上做了一些改进,使程序只显示我想要统计的人物。
三国统计:
import jieba

# Count how often each multi-character token occurs in the novel text and
# print the ten most frequent ones, after merging a few aliases of the main
# characters and dropping some frequent tokens listed in `excludes`.

# Frequent tokens that should not appear in the ranking.
excludes = {"将军","却说","荆州","二人","不可","不能","如此"}

# Alternative tokens that are counted under one canonical character name.
aliases = {
    "诸葛亮": "孔明",
    "孔明曰": "孔明",
    "关公": "关羽",
    "云长": "关羽",
    "玄德": "刘备",
    "玄德曰": "刘备",
    "孟德": "曹操",
    "丞相": "曹操",
}

# Use a context manager so the file handle is closed after reading.
with open("三国演义.txt", "r", encoding='utf-8') as f:
    txt = f.read()
words = jieba.lcut(txt)

counts = {}
for word in words:
    # Single-character tokens are skipped entirely.
    if len(word) == 1:
        continue
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

# pop() with a default avoids a KeyError when an excluded word never
# occurred in the text.
for word in excludes:
    counts.pop(word, None)

items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# Slicing tolerates texts that yield fewer than ten distinct tokens.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))
#输出
曹操 595
孔明 534
张飞 228
主公 223
刘备 196
引兵 190
次日 179
大喜 174
左右 172
天下 171
红楼梦统计:
import jieba

# Count how often a fixed set of character names occurs in the novel text
# and print the ten most frequent ones. Only names listed below are shown.

# NOTE(review): some entries (e.g. "贾巧", "司琪") may not match the spelling
# used in the novel text — confirm against the source file.
list_12jinchai= ["黛玉", "宝钗", "元春", "探春", "惜春", "李纨","妙玉","凤姐", "秦可卿", "贾巧"]
list_yahuan = ["晴雯", "袭人", "麝月", "香菱", "司琪"]
list_jiafu = ["宝玉","贾敬", "贾赦", "贾政", "贾琏", "贾珍", "贾环"]
list_furen = ["贾母", "王夫人"]
lists = list_12jinchai + list_furen + list_jiafu + list_yahuan

# Register every target name with jieba before segmenting the text.
for s in lists:
    jieba.add_word(s)

# Use a context manager so the file handle is closed after reading.
with open("红楼梦.txt", "r", encoding='utf_8') as f:
    txt = f.read()
words = jieba.lcut(txt)

# Alternative forms of address that are counted under the canonical name.
aliases = {
    "林妹妹": "黛玉",
    "老太太": "贾母",
}

counts = {}
for word in words:
    # Single-character tokens are skipped entirely.
    if len(word) == 1:
        continue
    # The original used `==` (a comparison) here instead of an assignment,
    # so aliases were never mapped and the count went to a stale `rword`.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1

items = list(counts.items())

# Keep only the target characters; a set makes each membership test O(1).
targets = set(lists)
items1 = [s for s in items if s[0] in targets]
items1.sort(key=lambda x: x[1], reverse=True)

# Slicing tolerates the case where fewer than ten targets were found.
for word, count in items1[:10]:
    print("{0:<10}{1:>5}".format(word, count))
# 输出
宝玉 3445
贾母 1183
凤姐 1135
袭人 1070
黛玉 994
王夫人 969
宝钗 736
贾琏 691
贾政 449
探春 407
来源:CSDN
作者:撒哈拉沙漠的眼泪
链接:https://blog.csdn.net/wdx1993/article/details/103654602