1. Text clustering is usually performed on an existing batch of historical data, using methods such as k-means or DBSCAN. These methods are not well suited when the text arrives as a stream and each document must be clustered as soon as it comes in. There are many other approaches for dynamically clustering streaming data, but dynamic clustering brings its own challenges: the number of clusters is not fixed, and the similarity threshold is hard to choose, so these problems remain open for further study. This article implements a simple single-pass clustering method. Similarity between texts is measured with the cosine distance, and a text can be represented either by a tf-idf vector (the idf statistics can be computed on a large document collection in advance and then applied directly to the words of a new text) or by vectors from Chinese pretrained models such as word2vec or BERT.
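To make the tf-idf variant concrete, here is a minimal sketch (not part of the project code below): the idf is fitted once on a historical corpus and then reused for new, incoming texts before computing cosine similarity. It uses scikit-learn's TfidfVectorizer with toy whitespace-tokenized documents as an assumption, whereas the program below uses gensim.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Historical corpus: the idf statistics are estimated once on this collection.
# Toy whitespace-tokenized examples; real Chinese text would be segmented first.
history = [
    "stock market rises sharply",
    "stock market falls on weak data",
    "new phone model released this week",
]
vectorizer = TfidfVectorizer().fit(history)

# New streaming documents are transformed with the fixed idf weights.
doc_a = vectorizer.transform(["stock market rises again"])
doc_b = vectorizer.transform(["another new phone released"])

# Cosine similarity between the two incoming documents.
print(cosine_similarity(doc_a, doc_b)[0, 0])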
2. Program
import numpy as np
import os
import sys
import pickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora, models, matutils
from utils.tokenizer import load_stopwords, load_samples, tokenizer, word_segment, load_data, read_data_to_list
from gensim.models import doc2vec, Doc2Vec
from sklearn.metrics.pairwise import cosine_similarity

'''
Overall flow:
input: doc vectors; threshold
output: clusters
begin
    input doc vectors
    input threshold
    take the first doc as the first cluster and its vector as the cluster center
    while(doc vectors){
        while(clusters){
            max_sim, max_cluster = similarity(doc vector, cluster);
        }
        if(max_sim > threshold){
            max_cluster.put(doc vector);
            max_cluster.update_center()
        }
        else{
            build new cluster(doc vector);
        }
    }
end
'''
class SingelPassCluster(object):

    '''
    1. Compute cosine similarity from tf-idf vectors
    '''
    def tfidf_vec(self, corpus, pivot=10, slope=0.25):
        dictionary = corpora.Dictionary(corpus)  # build the word-to-id mapping
        self.dict_size = len(dictionary)
        print('dictionary size:{}'.format(len(dictionary)))
        corpus = [dictionary.doc2bow(text) for text in corpus]  # bag-of-words representation of each document
        tfidf = models.TfidfModel(corpus, pivot=pivot, slope=slope)
        corpus_tfidf = tfidf[corpus]
        return corpus_tfidf

    def get_max_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        print('vector:{}'.format(vector))
        for k, core in cluster_cores.items():
            print('core:{}'.format(core))
            similarity = matutils.cossim(vector, core)
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_max_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    text_matrix = matutils.corpus2dense(clusters[max_index], num_terms=self.dict_size,
                                                        num_docs=len(clusters[max_index])).T  # sparse to dense
                    core = np.mean(text_matrix, axis=0)  # update the cluster center
                    core = matutils.any2sparse(core)  # convert the dense center back to a sparse vector
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

    def fit_transform(self, corpus, raw_data, theta=0.5):
        tfidf_vec = self.tfidf_vec(corpus)  # tfidf_vec is a sparse (bag-of-words style) corpus
        clusters, cluster_text = self.single_pass(tfidf_vec, raw_data, theta)
        return clusters, cluster_text

    '''
    2. Compute cosine similarity from doc2vec vectors
    '''
    def fit(self, doc2vec_model, corpus, raw_data, theta=0.5):
        doc_vec = self.doc_vec(doc2vec_model, corpus)
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, raw_data, theta)
        return clusters, cluster_text

    def fit_2(self, doc_vec, text2index, theta):
        clusters, cluster_text = self.doc2vec_single_pass(doc_vec, text2index, theta)
        return clusters, cluster_text

    def doc_vec(self, doc2vec_model, x_train):
        print('doc2vec infered vec...')
        infered_vectors_list = []
        for text, label in x_train:
            vector = doc2vec_model.infer_vector(text)
            infered_vectors_list.append(vector)
            print('infered vector size:{}'.format(len(infered_vectors_list)))
            if len(infered_vectors_list) >= 100:  # only infer vectors for the first 100 documents
                break
        return infered_vectors_list

    def get_doc2vec_similarity(self, cluster_cores, vector):
        max_value = 0
        max_index = -1
        for k, core in cluster_cores.items():  # core -> np.ndarray
            similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
            similarity = similarity[0, 0]
            if similarity > max_value:
                max_value = similarity
                max_index = k
        return max_index, max_value

    def doc2vec_single_pass(self, corpus_vec, corpus, theta):
        clusters = {}
        cluster_cores = {}
        cluster_text = {}
        num_topic = 0
        cnt = 0
        for vector, text in zip(corpus_vec, corpus):
            if num_topic == 0:
                clusters.setdefault(num_topic, []).append(vector)
                cluster_cores[num_topic] = vector
                cluster_text.setdefault(num_topic, []).append(text)
                num_topic += 1
            else:
                max_index, max_value = self.get_doc2vec_similarity(cluster_cores, vector)
                if max_value > theta:
                    clusters[max_index].append(vector)
                    core = np.mean(clusters[max_index], axis=0)  # update the cluster center
                    cluster_cores[max_index] = core
                    cluster_text[max_index].append(text)
                else:  # create a new cluster
                    clusters.setdefault(num_topic, []).append(vector)
                    cluster_cores[num_topic] = vector
                    cluster_text.setdefault(num_topic, []).append(text)
                    num_topic += 1
            cnt += 1
            if cnt % 100 == 0:
                print('processing {}...'.format(cnt))
        return clusters, cluster_text

def sim(doc_vec):
    vector = doc_vec[0]
    print('vector:{}'.format(type(vector)))
    for core in doc_vec:
        similarity = cosine_similarity(vector.reshape(1, -1), core.reshape(1, -1))
        similarity = similarity[0, 0]
        print("similarity:{}".format(similarity))

if __name__ == '__main__':
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
    process_text = base_path + '/data/process_text.txt'  # path to the preprocessed samples
    doc2vec_path = base_path + '/data/doc2vec.pkl'
    cluster_result = base_path + '/data/cluster_result.txt'
    doc_vec_path = base_path + '/data/doc_vec.vec'  # document vectors inferred by doc2vec

    corpus = load_data(process_text)
    raw_text = load_samples(process_text)

    index2corpus = collections.OrderedDict()
    for index, line in enumerate(raw_text):
        index2corpus[index] = line
    text2index = list(index2corpus.keys())
    print('docs total size:{}'.format(len(text2index)))

    single_cluster = SingelPassCluster()

    cal_vec_type = 'doc2vec'

    if cal_vec_type == 'tfidf':
        clusters, cluster_text = single_cluster.fit_transform(corpus, text2index, theta=0.4)

    if cal_vec_type == 'doc2vec':
        with open(doc_vec_path, 'rb') as file:
            infered_vectors_list = pickle.load(file)
        clusters, cluster_text = single_cluster.fit_2(infered_vectors_list, text2index, theta=0.6)

        '''
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
        x_train = read_data_to_list(process_text)
        clusters, cluster_text = single_cluster.fit(doc2vec_model, x_train, text2index, theta=0.6)
        '''

    if cal_vec_type == 'd2vsim':
        if os.path.exists(doc2vec_path):
            print('doc2vec model loading...')
            doc2vec_model = Doc2Vec.load(doc2vec_path)
            x_train = read_data_to_list(process_text)
            doc_vec = single_cluster.doc_vec(doc2vec_model, x_train)
            sim(doc_vec)

    print("............................................................................................")
    print("number of clusters obtained: {} ...".format(len(clusters)))
    print("............................................................................................\n")
    # sort the clusters by the number of documents they contain, in descending order
    clusterTopic_list = sorted(cluster_text.items(), key=lambda x: len(x[1]), reverse=True)
    with open(cluster_result, 'w', encoding='utf-8') as file_write:
        for k in clusterTopic_list:
            cluster_lines = []  # renamed from cluster_text to avoid shadowing the clustering result
            for index, value in enumerate(k[1], start=1):
                cluster_lines.append('(' + str(index) + '): ' + index2corpus[value])
            cluster_lines = '\n'.join(cluster_lines)
            file_write.write("[cluster id]: {} \n[documents in cluster]: {} \n[documents]:\n{}".format(k[0], len(k[1]), cluster_lines))
            file_write.write('\n')
            file_write.flush()
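The main program reads pre-inferred document vectors from data/doc_vec.vec with pickle and can also load a trained model from data/doc2vec.pkl, but it does not show how those files are produced. Below is a minimal sketch of one way to create them with gensim's Doc2Vec; the toy documents and all training parameters (vector_size, window, epochs, etc.) are illustrative assumptions rather than the settings used in the original project.

import pickle
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Stand-in for the tokenized documents read from process_text.txt;
# in the real pipeline they would come from read_data_to_list(process_text).
tokenized_docs = [
    ['股市', '上涨'],
    ['股市', '下跌'],
    ['新款', '手机', '发布'],
]
tagged = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(tokenized_docs)]

# Train a small Doc2Vec model; the hyperparameters here are illustrative only.
model = Doc2Vec(tagged, vector_size=50, window=2, min_count=1, workers=2, epochs=20)
model.save('doc2vec.pkl')  # analogous to data/doc2vec.pkl above

# Infer a vector for every document and pickle the list, which is what fit_2 later loads.
infered_vectors_list = [model.infer_vector(doc) for doc in tokenized_docs]
with open('doc_vec.vec', 'wb') as f:  # analogous to data/doc_vec.vec above
    pickle.dump(infered_vectors_list, f)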