Scene Classification with the LDA Topic Model

  A previous project used LDA, the classic generative topic model from NLP, for scene classification. This post records the core of that source code; the main optimization is using multiprocessing to run the computation on several cores in parallel.
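
The speed-up comes from the standard Pool.apply_async pattern: submit one model fit per candidate parameter, collect the AsyncResult handles, and call get() on each after the pool finishes. A minimal, self-contained sketch of that pattern (fit_one is just a hypothetical stand-in for an expensive model fit):

from multiprocessing import Pool

def fit_one(param):
    # stand-in for an expensive model fit
    return param * param

if __name__ == '__main__':
    with Pool(4) as p:
        handles = [p.apply_async(fit_one, args=(x,)) for x in range(8)]
        results = [h.get() for h in handles]
    print(results)

The full script follows.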

# -*- coding: utf-8 -*-
# Reference: https://www.cnblogs.com/pinard/p/6908150.html
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from multiprocessing import Pool
import pandas as pd
import numpy as np
import jieba
import time
import os


def run_lda(cntTf, n_topic, idx):
    print("# Topic nums: %d, " % n_topic, "Start analysing")
    t0 = time.time()
    lda = LatentDirichletAllocation(n_components=n_topic,
                                    max_iter=200,
                                    learning_method='batch',
                                    evaluate_every=200,
                                    verbose=0)
    lda.fit(cntTf)
    perplexity = lda.perplexity(cntTf)
    # doc-topic distribution of the fitted model
    prob = lda.transform(cntTf)
    # topic-word distribution, normalized so each topic's weights sum to 1
    comp = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
    print("# Topic nums: %d, " % n_topic, "end time: %0.3fs," % (time.time() - t0), "perplexity: %0.3f" % perplexity)
    return (perplexity, prob, comp)
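# run_lda returns (perplexity, doc-topic matrix, topic-word matrix);
# lower perplexity means the model fits the corpus better, which is
# what the main block uses below to pick the number of topics.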


def run_kmeans(X, k, i):
    print("# Category nums: %d, " % k, "Start analysing")
    t0 = time.time()
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    # mean distortion: average euclidean distance of each sample
    # to its nearest cluster center
    SSE = sum(np.min(cdist(X, kmeans.cluster_centers_, "euclidean"), axis=1)) / X.shape[0]
    print("# Category nums: %d, " % k, "end time: %0.3fs," % (time.time() - t0), "SSE: %0.3f" % SSE)
    return SSE


def get_perplexityLst_flag(perplexityLst):
    # pick the first local minimum of the perplexity curve
    flag_id = 0
    for i in range(len(perplexityLst) - 1):
        if perplexityLst[i] < perplexityLst[i + 1]:
            flag_id = i
            break
    return flag_id
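# Example: perplexityLst = [310.2, 295.7, 301.4, 322.9] gives flag_id = 1,
# since index 1 is the first point lower than its successor.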


def get_kmeans_flag(meandistortions):
    # elbow heuristic: flag the first k whose mean distortion falls below
    # a threshold placed 3/4 of the way from the first value down to the last
    flag_id = 0
    target = (meandistortions[0] + 3 * meandistortions[-1]) / 4
    for i in range(len(meandistortions) - 1):
        if meandistortions[i] > target and meandistortions[i + 1] < target:
            flag_id = i + 1
            break
    return flag_id
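# Example: meandistortions = [0.90, 0.55, 0.40, 0.36] gives
# target = (0.90 + 3 * 0.36) / 4 = 0.495; the curve first drops below
# the target between index 1 and 2, so flag_id = 2.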


def get_time_stamp():
    # the time stamp can be used as the id of an analysis run,
    # e.g. "2018-07-20 15:04:05.123"
    ct = time.time()
    local_time = time.localtime(ct)
    data_head = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
    data_secs = (ct - int(ct)) * 1000
    time_stamp = "%s.%03d" % (data_head, data_secs)
    # stamp = ("".join(time_stamp.split()[0].split("-")) + "".join(time_stamp.split()[1].split(":"))).replace('.', '')
    return time_stamp


if __name__ == '__main__':
    workfolder = "D:\\Research\\Datasets\\NLP"
    Rpt_no = get_time_stamp()
    pool_maxnum = 4
    n_topics = list(range(2, 6))
    # tell jieba which phrases should be kept whole instead of split
    jieba.suggest_freq('银行', True)
    jieba.suggest_freq('张继科', True)
    # load the stop-word list if one is present
    stpwrdlst = []
    file_list = os.listdir(workfolder)
    try:
        with open(os.path.join(workfolder, 'stop_words.txt')) as stpwrd_dic:
            stpwrd_content = stpwrd_dic.read()
        stpwrdlst = stpwrd_content.splitlines()
        file_list.remove('stop_words.txt')
    except FileNotFoundError:
        pass

    # segment every txt file and join the tokens with spaces
    all_docs = []
    for file_name in file_list:
        if os.path.splitext(file_name)[1] == '.txt':
            doc = os.path.splitext(file_name)[0]  # document name without extension
            data_path = os.path.join(workfolder, file_name)
            with open(data_path, 'rb') as f:
                document = f.read()
            # jieba.cut accepts raw bytes and decodes them internally
            document_cut = jieba.cut(document)
            result = ' '.join(document_cut)
            all_docs.append(result)
    print('Word preparation done!')

    # build the document-term count matrix
    cntVector = CountVectorizer(stop_words=stpwrdlst)
    cntTf = cntVector.fit_transform(all_docs)

    # use a process pool to fit models with different topic numbers in parallel
    resLst = [1.0] * len(n_topics)
    perplexityLst = [1.0] * len(n_topics)
    # PROB[i] is the doc-topic matrix of the i-th model
    PROB = [1.0] * len(n_topics)
    # COMP[i] is the topic-word matrix of the i-th model
    COMP = [1.0] * len(n_topics)

    p = Pool(pool_maxnum)
    print('multiprocess start, begin to calculate LDA')
    for i in range(len(n_topics)):
        res = p.apply_async(run_lda, args=(cntTf, n_topics[i], i,))
        resLst[i] = res
    p.close()
    p.join()
    print('LDA finish')

    # collect the results from the AsyncResult handles
    for i in range(len(n_topics)):
        res = resLst[i].get()
        perplexityLst[i] = res[0]
        PROB[i] = res[1]
        COMP[i] = res[2]
    # find the best topic number (first local minimum of perplexity)
    flag_id = get_perplexityLst_flag(perplexityLst)
    # print the doc-topic matrix of the flagged model
    # print(PROB[flag_id])

    Tbl_RPT_Cfg_Perplexity = pd.DataFrame(columns=['Rpt_no', 'Topic_Nums', 'Perplexity', 'Flag'])
    Tbl_RPT_Cfg_Perplexity.Topic_Nums = n_topics
    Tbl_RPT_Cfg_Perplexity.Perplexity = perplexityLst
    Tbl_RPT_Cfg_Perplexity.Rpt_no = Rpt_no
    Tbl_RPT_Cfg_Perplexity.Flag = 0
    Tbl_RPT_Cfg_Perplexity.loc[flag_id, 'Flag'] = 1
    # print(Tbl_RPT_Cfg_Perplexity)

    print('start using the doc-topic prob matrix to apply clustering')
    prob = PROB[flag_id]
    X = np.array(prob)
    K = n_topics.copy()  # candidate cluster numbers
    resLst = [1.0] * len(K)
    meandistortions = [1.0] * len(K)
    KMEANS = [1.0] * len(K)

    p = Pool(pool_maxnum)
    for i in range(len(K)):
        res = p.apply_async(run_kmeans, args=(X, K[i], i,))
        resLst[i] = res
    p.close()
    p.join()
    print('KMEANS finish')

    # collect the results
    for i in range(len(K)):
        meandistortions[i] = resLst[i].get()
    flag_id = get_kmeans_flag(meandistortions)
    Tbl_RPT_Cfg_Kmeans = pd.DataFrame(columns=['Rpt_no', 'KMEANS', 'SSE', 'Flag'])
    Tbl_RPT_Cfg_Kmeans.KMEANS = K
    Tbl_RPT_Cfg_Kmeans.SSE = meandistortions
    Tbl_RPT_Cfg_Kmeans.Rpt_no = Rpt_no
    Tbl_RPT_Cfg_Kmeans.Flag = 0
    Tbl_RPT_Cfg_Kmeans.loc[flag_id, 'Flag'] = 1
    # print(Tbl_RPT_Cfg_Kmeans)

    # the table above relates k to SSE; take the flagged k and cluster once
    # more to obtain the final scene labels
    kmeans = KMeans(n_clusters=K[flag_id])
    kmeans.fit(X)
    output = kmeans.labels_
    print(output)
    print('finish')
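
The script only prints the raw label array. To read the result, the labels usually need to be tied back to the source files; a minimal sketch with toy data (doc_names and labels are hypothetical stand-ins for the file names collected in the loop above and for kmeans.labels_):

import pandas as pd

# hypothetical follow-up: doc_names would be collected in the file loop
# (doc_names.append(doc)) so it lines up with all_docs / kmeans.labels_
doc_names = ['news_a', 'news_b', 'news_c']   # toy stand-ins
labels = [0, 1, 0]                           # toy stand-ins for output
result_tbl = pd.DataFrame({'doc': doc_names, 'label': labels})
print(result_tbl.sort_values('label'))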

Appendix:
Sample news articles used: 文档样例.zip
Stop-word list: stop_words.txt