1、知识点
"""
1)cut()
a) codecs.open() 解决编码问题
b) f.readline() 每次读取一行;也可以使用 f.readlines() 一次读取剩余的所有行(返回由各行组成的 list)
c) words =" ".join(jieba.cut(line))分词,每个词用空格分隔
2)lcut()
返回一个list列表
"""
2、标点符号处理,并分词,存储到文件中
def fenCi():
    """Segment the novel text with jieba and store the cleaned result.

    Reads ``深渊主宰系统.txt`` (UTF-8) line by line, joins the jieba tokens of
    each line with single spaces, removes a fixed set of punctuation marks and
    writes the remaining text to ``seg.txt``. A line is skipped when its
    cleaned text starts with ``-`` or ``.``, is a bare CRLF, or is shorter
    than 10 characters.

    Both files are opened with ``with`` so they are closed (and the output is
    flushed) even if segmentation raises part-way through.

    :return: None
    """
    # Removed in this exact order, one after another, so that the result is
    # the same as the original chained ``replace`` calls.
    punctuation = (",", "!", "“", "”", "。", "?", ":", "...", "、")

    with codecs.open("深渊主宰系统.txt", 'r', encoding='utf-8') as src, \
            open("seg.txt", 'w', encoding='utf-8') as dst:
        for line in src:
            # Tokens are joined with a single space.
            words = " ".join(jieba.cut(line.strip(' ')))
            for mark in punctuation:
                words = words.replace(mark, "")
            words = words.strip(' ')
            # Progress/debug output: length of the cleaned line.
            print(len(words))
            if words.startswith(('-', '.')) or words == '\r\n' or len(words) < 10:
                continue
            # Segments are written back to back; the line's own newline is dropped.
            dst.write(words.strip('\n'))
3、中文分词统计
def zhongwen():
    """Print the most frequent multi-character words of the novel text.

    Reads the whole of ``深渊主宰系统.txt`` (UTF-8), segments it with
    ``jieba.lcut`` (which returns a list of tokens), strips a fixed set of
    punctuation marks from every token and counts the tokens that are still
    at least two characters long. The (up to) 15 most frequent words are
    printed in descending order of frequency.

    :return: None
    """
    # Removed in this exact order, matching the original chained replaces.
    punctuation = (",", "!", "“", "”", "。", "?", ":", "...", "、")

    # Close the file as soon as its content has been read.
    with codecs.open("深渊主宰系统.txt", 'r', encoding='utf-8') as src:
        text = src.read()

    counts = {}
    for word in jieba.lcut(text):
        for mark in punctuation:
            word = word.replace(mark, "")
        word = word.strip(' ').strip('\r\n')
        # Ignore empty tokens and single characters.
        if len(word) <= 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing instead of indexing 0..14 avoids an IndexError when the text
    # contains fewer than 15 distinct words.
    for word, counter in ranked[:15]:
        print("单词:{},次数:{}".format(word, counter))
4、英文分词统计
def get_txt():
    """Return the lower-cased content of ``1.txt`` with special characters blanked.

    The file is read as UTF-8, converted to lower case, and every character
    from the fixed special-character set below is replaced by a single space
    so that a later ``str.split()`` yields plain words.

    :return: the normalised text as a ``str``
    """
    special_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    # One translation table replaces all special characters in a single pass.
    to_space = str.maketrans(dict.fromkeys(special_chars, " "))

    # ``with`` guarantees the file handle is closed after reading.
    with open("1.txt", "r", encoding='UTF-8') as src:
        txt = src.read()
    return txt.lower().translate(to_space)
def yingwen():
    """Print the most frequent English words of the text from ``get_txt()``.

    The normalised text is split on whitespace, single-character words are
    ignored, and the (up to) 5 most frequent words are printed together with
    their counts in descending order of frequency.

    :return: None
    """
    file_txt = get_txt()

    counts = {}
    for word in file_txt.split():  # whitespace split yields the word list
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing instead of indexing 0..4 avoids an IndexError when the text
    # contains fewer than 5 distinct words.
    for word, count in ranked[:5]:
        print("{0:<5}->{1:>5}".format(word, count))
来源:https://www.cnblogs.com/ywjfx/p/11003872.html