Chinese Word Frequency Statistics and Word Cloud Generation

Anonymous (unverified), submitted 2019-12-02 23:26:52

text = open("C:三体.txt", "r", encoding="UTF-8").read()      # read the novel text

import jieba


text = open("C:三体.txt", "r", encoding="UTF-8").read()      # read the novel text
word_txt = jieba.lcut(text)                                  # segment the Chinese text into a word list
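jieba.lcut returns the segmentation result directly as a list, while jieba.cut yields the same tokens lazily as a generator. A quick illustration (the sample sentence is just the title of this article, used only for demonstration):

import jieba

sample = "中文词频统计与词云生成"        # demo sentence only
print(jieba.lcut(sample))              # a list of tokens
print(list(jieba.cut(sample)))         # the generator form, materialized for comparison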

jieba.load_userdict(r'C:三体词汇.txt')                       # load a user dictionary of terms specific to the novel
jieba.add_word("量子力学")                                   # enrich the vocabulary
jieba.add_word("万有引力")
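jieba.load_userdict expects a plain-text file with one entry per line, in the form word [frequency] [POS tag], where the last two fields are optional. A hypothetical sketch of what C:三体词汇.txt might contain:

三体 10 n
智子 10 n
面壁者 10 n
罗辑 10 nr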

# -*- coding: utf-8 -*-
import struct
import os

# offset of the pinyin table
startPy = 0x1540

# offset of the Chinese phrase table
startChinese = 0x2628

# global pinyin table
GPy_Table = {}

# parse result:
# a list of tuples (frequency, pinyin, Chinese phrase)


# convert raw bytes to a string
def byte2str(data):
    pos = 0
    s = ''
    while pos < len(data):
        c = chr(struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0])
        if c != chr(0):
            s += c
        pos += 2
    return s

# read the pinyin table
def getPyTable(data):
    data = data[4:]
    pos = 0
    while pos < len(data):
        index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        lenPy = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        py = byte2str(data[pos:pos + lenPy])

        GPy_Table[index] = py
        pos += lenPy

# get the pinyin of one phrase
def getWordPy(data):
    pos = 0
    ret = ''
    while pos < len(data):
        index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        ret += GPy_Table[index]
        pos += 2
    return ret

# read the Chinese phrase table
def getChinese(data):
    GTable = []
    pos = 0
    while pos < len(data):
        # number of homophone phrases
        same = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

        # length of the pinyin index table
        pos += 2
        py_table_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

        # pinyin index table
        pos += 2
        py = getWordPy(data[pos: pos + py_table_len])

        # Chinese phrases
        pos += py_table_len
        for i in range(same):
            # length of the Chinese phrase
            c_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            # the Chinese phrase itself
            pos += 2
            word = byte2str(data[pos: pos + c_len])
            # length of the extension data
            pos += c_len
            ext_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            # frequency
            pos += 2
            count = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

            # save the result
            GTable.append((count, py, word))

            # jump to the offset of the next phrase
            pos += ext_len
    return GTable


def scel2txt(file_name):
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    print("Dictionary name:", byte2str(data[0x130:0x338]))  # .encode('GB18030')
    print("Dictionary type:", byte2str(data[0x338:0x540]))
    print("Description:", byte2str(data[0x540:0xd40]))
    print("Examples:", byte2str(data[0xd40:startPy]))

    getPyTable(data[startPy:startChinese])
    return getChinese(data[startChinese:])

if __name__ == '__main__':
    # folder containing the .scel files
    in_path = r"C:\Users\Administrator\Downloads"   # change this to the folder holding your dictionary files
    # folder for the converted dictionaries
    out_path = r"C:\Users\Administrator\Downloads\text"  # folder where the converted files are written
    fin = [fname for fname in os.listdir(in_path) if fname[-5:] == ".scel"]
    for f in fin:
        try:
            for word in scel2txt(os.path.join(in_path, f)):
                file_path = os.path.join(out_path, str(f).split('.')[0] + '.txt')
                # save the result
                with open(file_path, 'a+', encoding='utf-8') as file:
                    file.write(word[2] + '\n')
            os.remove(os.path.join(in_path, f))
        except Exception as e:
            print(e)
            pass
scel_to_text
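The converter writes one .txt per .scel file into out_path. To use them with jieba, the converted files can be merged into the single user dictionary loaded earlier; a minimal sketch, assuming the same folder and output name as in the examples above:

import os

out_path = r"C:\Users\Administrator\Downloads\text"          # folder written by scel2txt above
with open(r'C:三体词汇.txt', 'w', encoding='utf-8') as merged:
    for fname in os.listdir(out_path):
        if fname.endswith('.txt'):
            with open(os.path.join(out_path, fname), encoding='utf-8') as part:
                merged.write(part.read())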

for word in word_list:
    if len(word) == 1:
        continue
    else:
        word_lists.append(word)                                            # keep only words longer than one character
        word_dict[word] = word_dict.get(word, 0) + 1                       # get() returns the value for the key, or the default if the key is absent

wd = list(word_dict.items())                                               # turn the dict into a list so it can be sorted
wd.sort(key=lambda x: x[1], reverse=True)                                  # sort by the dict values (word frequencies)
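The counting loop and the sort can also be collapsed into collections.Counter from the standard library; an equivalent sketch:

from collections import Counter

word_dict = Counter(w for w in word_list if len(w) > 1)    # count only words longer than one character
wd = word_dict.most_common()                               # already sorted by descending frequency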

stops_word = open("C:stops_chinese.txt", "r", encoding="UTF-8").read()     # read the stopword file
exclude = {'两个', '东西', '很快', '一种', '这是', '看着', '真的', '发出', '回答',
           '感觉', '仿佛', '\u3000', '\n', '中'}                            # custom stopwords
stop_list = stops_word.split()
stops_all = set(stop_list).union(exclude)                                  # union of the two stopword sets
word_list = [element for element in word_txt if element not in stops_all]  # drop the stopwords
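Because split() is used, the stopword file is simply whitespace-separated tokens, one or more per line. A hypothetical sketch of what C:stops_chinese.txt might look like (any common Chinese stopword list will do):

的 了 是 在 和 就 都 而 与
我 你 他 她 它 我们 你们 他们
这 那 也 不 有 着 吗 啊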

for i in range(20):                                                        # print the 20 most frequent words
    print(wd[i])
word_csv = wd                                                              # write the result to a csv file
pd.DataFrame(data=word_csv[0:20]).to_csv('The_three_body.csv', encoding='UTF-8')
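By default to_csv also writes the row index and uses numeric column names. Naming the columns and dropping the index gives a cleaner file; the parameter choices below are a suggestion rather than part of the original script:

import pandas as pd

df = pd.DataFrame(word_csv[0:20], columns=['word', 'count'])    # label the two columns
df.to_csv('The_three_body.csv', encoding='UTF-8', index=False)  # omit the row index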

from wordcloud import WordCloud

import matplotlib.pyplot as plt
import jieba
import pandas as pd


jieba.load_userdict(r'C:三体词汇.txt')                                     # load a user dictionary of terms specific to the novel
jieba.add_word("量子力学")                                                 # enrich the vocabulary
jieba.add_word("万有引力")
text = open("C:三体.txt", "r", encoding="UTF-8").read()                    # read the novel text
word_txt = jieba.lcut(text)                                                # segment the Chinese text into a word list
# jieba.add_word('')                                                       # add more novel-specific words here
stops_word = open("C:stops_chinese.txt", "r", encoding="UTF-8").read()     # read the stopword file
exclude = {'两个', '东西', '很快', '一种', '这是', '看着', '真的', '发出', '回答',
           '感觉', '仿佛', '\u3000', '\n', '中'}                            # custom stopwords
stop_list = stops_word.split()
stops_all = set(stop_list).union(exclude)                                  # union of the two stopword sets
word_list = [element for element in word_txt if element not in stops_all]  # drop the stopwords
word_dict = {}
word_lists = []
for word in word_list:
    if len(word) == 1:
        continue
    else:
        word_lists.append(word)                                            # keep only words longer than one character
        word_dict[word] = word_dict.get(word, 0) + 1                       # get() returns the value for the key, or the default if the key is absent
wd = list(word_dict.items())                                               # turn the dict into a list so it can be sorted
wd.sort(key=lambda x: x[1], reverse=True)                                  # sort by the dict values (word frequencies)
for i in range(20):                                                        # print the 20 most frequent words
    print(wd[i])
word_csv = wd                                                              # write the result to a csv file
pd.DataFrame(data=word_csv[0:20]).to_csv('The_three_body.csv', encoding='UTF-8')


mywc = WordCloud(background_color='black', margin=2).generate(' '.join(word_lists))
plt.imshow(mywc)
plt.axis("off")
plt.show()
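One caveat: the font bundled with wordcloud cannot render Chinese characters, so the cloud may come out as empty boxes; passing font_path to a Chinese-capable font fixes this. The counts in word_dict can also be fed in directly with generate_from_frequencies instead of re-joining the word list. A sketch, assuming a SimHei font is available at the path shown (the path is an assumption):

from wordcloud import WordCloud
import matplotlib.pyplot as plt

mywc = WordCloud(font_path=r'C:\Windows\Fonts\simhei.ttf',    # any Chinese-capable font; adjust the path as needed
                 background_color='black',
                 margin=2).generate_from_frequencies(word_dict)
plt.imshow(mywc)
plt.axis("off")
plt.show()
mywc.to_file('The_three_body_wordcloud.png')                  # optionally save the image to disk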