├── README.md
├── cn_lda_text.py
├── lda_logfile.zip
├── lda_model.py
└── text_test1.zip

/README.md:
--------------------------------------------------------------------------------
# LDA
# LDA algorithm with a usage example
# 1. Create a MySQL database named ldasql with the utf8 character set
import cn_lda_text

content = cn_lda_text.lda('ldasql')
# 2. Create the tables
content.create_indextables()
# 3. Load the training set into the database. Unzip the two zip archives and
#    put the extracted folders in the project directory.
#    The corpus path and the stop-word path are set inside addtodb().
content.addtodb()
# 4. Run the LDA model
content.run_lda()
--------------------------------------------------------------------------------
/cn_lda_text.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
import os
import jieba
import re
import json
import lda_model
import MySQLdb
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: force utf-8 as the default encoding

class lda:
    def __init__(self, dbname):
        self.conn = MySQLdb.connect(user='root', passwd='mysql', host='localhost', db=dbname, charset="utf8")
        self.con = self.conn.cursor()  # con is used in place of the usual cur
        self.dbname = dbname  # remembered so run_lda can reuse the same database
        self.stop_word = []

    def __del__(self):
        self.conn.close()

    def dbcommit(self):
        self.conn.commit()

    def readfile(self, path):
        fp = open(path, 'rb')
        content = fp.read()
        fp.close()
        return content

    def create_indextables(self):
        self.con.execute(
            "CREATE TABLE IF NOT EXISTS urllist (rowid int(100) NOT NULL AUTO_INCREMENT PRIMARY KEY, url varchar(200),flag int(100),"
            "score_pagerank varchar(200),score_nn varchar(200),score_content varchar(200),score_lda varchar(200)) CHARACTER SET utf8")
        self.con.execute(
            'CREATE TABLE IF NOT EXISTS wordlist(rowid int(100) NOT NULL AUTO_INCREMENT PRIMARY KEY, word varchar(200)) CHARACTER SET utf8')
        self.con.execute(
            'CREATE TABLE IF NOT EXISTS wordlocation(urlid int(100) ,wordid int(100),location int(100),lda_flag int(100)) CHARACTER SET utf8')
        self.con.execute(
            'CREATE TABLE IF NOT EXISTS link(rowid int(100) NOT NULL AUTO_INCREMENT PRIMARY KEY,fromid int(100),toid int(100)) CHARACTER SET utf8')
        # Note: MySQL has no CREATE INDEX IF NOT EXISTS, so these statements
        # fail on a second run; call create_indextables only once per database.
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

    def separatewords(self, text):
        # Keep only runs of CJK characters and filter stop words during
        # segmentation, so callers always get back a list (possibly empty).
        words = re.findall(ur"[\u4e00-\u9fa5]+", text)
        return [w for w in words if w not in self.stop_word]

    def addtodb(self):
        # Load the stop-word list before indexing anything, so stop words can
        # be ignored when documents are written to the database. Called once
        # before all processing starts.
        path_stopword = 'lda_logfile/'
        stopwordfile = path_stopword + 'stopwords.txt'
        with codecs.open(stopwordfile, 'rb', 'utf-8') as f:
            stop_words = f.readlines()
        for sword in stop_words:
            strword = sword.replace('\r\n', '').replace(' ', '').strip()
            if strword != '':
                self.stop_word.append(strword)  # shared state: self.stop_word

        path = 'text_test1/'  # training corpus, one sub-directory per category
        catlist = os.listdir(path)
        for dir in catlist:
            class_path = path + dir + '/'
            file_list = os.listdir(class_path)
            for file in file_list:
                fullname = class_path + file
                if fullname != '':
                    self.addtoindex(fullname)
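    # Illustrative sketch, not part of the original pipeline: how one raw
    # string flows through jieba segmentation and separatewords(). The sample
    # sentence is hypothetical.
    def demo_segment(self, text=u'今天天气很好'):
        words = []
        for token in jieba.cut(text, cut_all=False):
            words.extend(self.separatewords(token))
        return words  # CJK tokens with stop words already removed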
    def isindexed(self, url):
        # A url counts as indexed only if its flag is 1.
        self.con.execute("select flag from urllist where url='%s' " % url)
        u = self.con.fetchone()
        if u != None:
            if u[0] == 1:
                return True
            else:
                return False
        else:
            return False

    def getentryid(self, table, field, value, flag=0, createnew=True):
        # flag=1: mark the url as indexed
        if flag == 1:
            self.con.execute("update %s set %s=1 where url='%s' " % (table, field, value))
            self.dbcommit()
        # flag=0: look the value up, inserting it first if it is new
        elif flag == 0:
            self.con.execute("select rowid from %s where %s='%s'" % (table, field, value))
            res = self.con.fetchone()
            if res == None:
                self.con.execute("insert into %s (%s) values ('%s')" % (table, field, value))
                v = self.con.lastrowid
                self.dbcommit()
                return v
            else:
                return res[0]

    def addtoindex(self, url):
        words = []
        if self.isindexed(url): return
        print 'Indexing ' + url
        content = self.readfile(url).strip()
        content = content.replace("\r\n", "").strip()
        content_seg = jieba.cut(content, cut_all=False)

        for word in content_seg:
            if word:  # skip empty tokens
                words.extend(self.separatewords(word))

        # rowid of this URL
        urlid = self.getentryid('urllist', 'url', url)
        # flag=1 records that this url has been indexed
        self.getentryid('urllist', 'flag', url, 1)

        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word.replace(' ', '') == '' or word in self.stop_word: continue
            # Stop words are skipped, which leaves gaps in location; an
            # alternative would be to filter them before building words.
            wordid = self.getentryid('wordlist', 'word', word)  # dictionary id (deduplicated)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid, wordid, i))
        self.dbcommit()

#-----------------------------------------------------------------------------------------------------------------------
    # LDA driver
    def run_lda(self):
        doc = {}
        i = 0
        self.con.execute('select max(rowid) from wordlist')
        words_count = self.con.fetchone()[0]  # vocabulary size (distinct words)
        self.con.execute('select DISTINCT (urlid) from wordlocation')
        cur = self.con.fetchall()
        for (urlid,) in cur:
            doc[i] = urlid  # dict doc maps dense index 0..M-1 to the real docid
            i += 1
        docs_count = len(doc)
        url_location = {}
        print 'preparing url_wordlocation.....'
        for docid in doc.values():  # cache wordid/location in memory to avoid a MySQL bottleneck
            self.con.execute('select wordid,location from wordlocation where urlid=%d' % docid)
            res = self.con.fetchall()
            a = {}  # must be a fresh dict per document; a single shared dict would make every entry identical
            j = 0
            for (wordid, location) in res:
                a[j] = wordid
                j += 1
            url_location[docid] = a  # keyed by the true urlid
        mylda = lda_model.lda_model(self.dbname, words_count + 1, docs_count, doc, url_location)
        mylda.runlda()
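    # Shape of the data handed to lda_model (illustrative, hypothetical ids):
    #   doc          = {0: 17, 1: 23, ...}            dense index -> urlid
    #   url_location = {17: {0: 5, 1: 42, ...}, ...}  urlid -> {position: wordid}
    # lda_model iterates documents by dense index m and looks word ids up via
    # url_location[doc[m]].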
#---------------------------------------------------------------------------------------------------------------------
    # demo: load the stop-word file and print it
    def set_stopwords(self):
        w = []
        path = 'lda_logfile/'
        trainfile = path + 'stopwords.txt'
        with codecs.open(trainfile, 'rb', 'utf-8') as f:
            words = f.readlines()
        for word in words:
            strword = word.replace('\r\n', '').strip()
            if strword != '':
                w.append(strword)
        print json.dumps(w, encoding="UTF-8", ensure_ascii=False) + '\t'
        print ("和" in w)  # membership test for a common stop word
--------------------------------------------------------------------------------
/lda_logfile.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renpengcheng-github/LDA/97a153e94dc17048d3150339dbb9f8cd644ebc16/lda_logfile.zip
--------------------------------------------------------------------------------
/lda_model.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import numpy as np
import MySQLdb
import random
import codecs
import logging.config
logger = logging.getLogger()

class lda_model:
    def __init__(self, dbname, words_count, docs_count, doc, url_location):
        self.conn = MySQLdb.connect(user='root', passwd='mysql', host='localhost', db=dbname, charset="utf8")
        self.con = self.conn.cursor()  # con is used in place of the usual cur
        #---------------------------------------------------------------------------------------------------------------
        # model parameters
        self.K = 100            # number of topics
        self.beta = 0.1
        self.alpha = 0.1
        self.iter_times = 300   # Gibbs sampling iterations

        self.docs = doc  # dict: dense index -> real urlid; always iterated by index
        self.docs_count = docs_count    # len(self.docs)
        self.words_count = words_count  # vocabulary size; word ids run 0..N, i.e. N+1 rows
        # nd[M][K]    number of words in doc m assigned to topic k
        # ndsum[M]    number of words in doc m
        # nw[V][K]    number of times word w is assigned to topic k
        # nwsum[K]    number of words assigned to topic k

        self.nd = np.zeros((docs_count, self.K), dtype="int")
        self.ndsum = np.zeros(docs_count, dtype="int")
        self.nw = np.zeros((words_count, self.K), dtype="int")
        self.nwsum = np.zeros(self.K, dtype="int")

        self.p = np.zeros(self.K, dtype=float)
        # p: probability vector of doubles, scratch space for sampling
        self.z = [[0 for y in xrange(len(url_location[doc[m]]))] for m in xrange(docs_count)]
        # z: M rows, one topic id per word position in each document

        self.theta = np.array([[0.0 for y in xrange(self.K)] for x in xrange(docs_count)])
        self.phi = np.array([[0.0 for y in xrange(words_count)] for x in xrange(self.K)])
        self.url_location = url_location
        # Output files:
        #   thetafile      document-topic distribution
        #   phifile        word-topic distribution
        #   topicwordfile  top-N words of each document's dominant topic
        #   test_topic     dominant topic id per document
        #   test_word      top-N word ids per document
        self.path = 'lda_logfile/'
        self.phifile = self.path + 'model_phi.dat'
        self.thetafile = self.path + 'model_theta.dat'
        self.topicwordfile = self.path + 'model_topicword.dat'
        self.test_topic = self.path + 'test_topic.dat'
        self.test_word = self.path + 'test_word.dat'
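    # Sanity-check sketch (an addition, not in the original code): after
    # data_preparing and after every full Gibbs sweep, each topic assignment
    # is counted exactly once in nd, nw, nwsum and ndsum, so all four totals
    # must agree.
    def check_counts(self):
        assert self.nd.sum() == self.nw.sum() == self.nwsum.sum() == self.ndsum.sum()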
    def __del__(self):
        self.conn.close()

    def dbcommit(self):
        self.conn.commit()

    # ------------------------------------------------------------------------------------------------------------------
    # 1. Data input and random topic initialization
    def data_preparing(self):
        # Initialization: assign every word position a topic uniformly at random
        print 'preparing data.....'
        for m in xrange(self.docs_count):  # self.docs maps m to the real urlid
            true_urlid = self.docs[m]
            res = len(self.url_location[true_urlid])
            self.ndsum[m] = res  # number of words in this document
            for i in xrange(res):
                topicid = random.randint(0, self.K - 1)  # topics are numbered 0..K-1
                dict_wordid = self.url_location[true_urlid][i]
                self.z[m][i] = topicid  # record the topic of every word position
                self.nw[dict_wordid][topicid] += 1
                self.nwsum[topicid] += 1
                self.nd[m][topicid] += 1
        return 'prepare end!'
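    # With the counts above, the point estimates computed by calc_theta and
    # calc_phi below are the standard collapsed-Gibbs LDA estimators:
    #   theta[m][k] = (nd[m][k] + alpha) / (ndsum[m] + K * alpha)
    #   phi[k][w]   = (nw[w][k] + beta)  / (nwsum[k] + V * beta)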
    # ------------------------------------------------------------------------------------------------------------------
    # Compute theta from nd and ndsum
    def calc_theta(self):
        print 'calculating theta.....'
        nd = self.nd
        nd_sum = self.ndsum
        alpha = self.alpha
        topic_number = self.K
        doc_num = self.docs_count
        topic_alpha = topic_number * alpha  # K*alpha
        theta = [[0 for t in range(topic_number)] for d in range(doc_num)]
        for m in range(doc_num):
            for k in range(topic_number):
                theta[m][k] = (nd[m][k] + alpha) / (nd_sum[m] + topic_alpha)
        return theta

    # ------------------------------------------------------------------------------------------------------------------
    # Compute phi from nw and nwsum, one topic row at a time (vectorized with numpy)
    def calc_phi(self):
        print 'calculating phi.....'
        nw = self.nw
        nw_sum = self.nwsum
        beta = self.beta
        topic_number = self.K
        word_num = self.words_count
        word_beta = word_num * beta  # V*beta
        phi = [[0 for t in range(word_num)] for d in range(topic_number)]
        for k in xrange(topic_number):
            phi[k] = (nw.T[k] + beta) / (nw_sum[k] + word_beta)
        return phi

    # ------------------------------------------------------------------------------------------------------------------
    # Gibbs sampling step for one word position
    def sampling(self, m, dict_wordid, i):
        topic = self.z[m][i]
        # remove the current assignment from all counts
        self.nw[dict_wordid][topic] -= 1
        self.nd[m][topic] -= 1
        self.nwsum[topic] -= 1
        self.ndsum[m] -= 1

        Vbeta = self.words_count * self.beta
        Kalpha = self.K * self.alpha
        # full conditional topic distribution, computed as a length-K vector;
        # p is refreshed on every call
        self.p = (self.nw[dict_wordid] + self.beta) / (self.nwsum + Vbeta) * \
                 (self.nd[m] + self.alpha) / (self.ndsum[m] + Kalpha)

        # Roulette-wheel draw: turn p into a cumulative distribution, throw a
        # uniform u, and take the first topic whose cumulative mass exceeds u
        # (cf. LDA 漫游指南, p.56).
        for k in xrange(1, self.K):
            self.p[k] += self.p[k - 1]
        u = random.uniform(0, self.p[self.K - 1])
        new_topicid = 0
        for k in xrange(self.K):
            if self.p[k] > u:
                new_topicid = k
                break
        self.nw[dict_wordid][new_topicid] += 1
        self.nwsum[new_topicid] += 1
        self.nd[m][new_topicid] += 1
        self.ndsum[m] += 1
        return new_topicid
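    # Worked sketch of the roulette-wheel step in sampling(), with K = 3 and
    # hypothetical probabilities:
    #   p = [0.2, 0.5, 0.3]  ->  cumulative p = [0.2, 0.7, 1.0]
    #   u is drawn uniformly from [0, 1.0); u = 0.64 falls in (0.2, 0.7],
    #   so the first index with p[k] > u is k = 1 and topic 1 is returned.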
    # ------------------------------------------------------------------------------------------------------------------
    # Run LDA training
    def runlda(self):
        self.data_preparing()
        print 'prepare data end,starting simple iteration.....'
        for x in xrange(self.iter_times):
            print 'iteration: ' + str(x + 1)
            for m in xrange(self.docs_count):  # for every document
                true_urlid = self.docs[m]
                res = len(self.url_location[true_urlid])
                for i in xrange(res):
                    dict_wordid = self.url_location[true_urlid][i]
                    # resample the topic of word position i in document m
                    topic = self.sampling(m, dict_wordid, i)
                    self.z[m][i] = topic
        self.theta = self.calc_theta()
        self.phi = self.calc_phi()
        print 'calculate end.....'
        print 'starting sign lda flag in mysql.....'
        for m in range(self.docs_count):
            self.updateandsave_topicword(m)
        self.save()
        return 'lda process end!'

    # ------------------------------------------------------------------------------------------------------------------
    # Update the lda_flag state and save each document's top topic words to disk
    def updateandsave_topicword(self, m):
        true_docid = self.docs[m]
        topk_word = 0
        top_topicid = self.theta[m].index(max(self.theta[m]))  # most probable topic for this document
        temp = list(enumerate(self.phi[top_topicid]))
        sorted_tmp = sorted(temp, key=lambda x: x[1], reverse=True)  # topic words, highest probability first
        with codecs.open(self.topicwordfile, 'a') as f:
            f.write('true docid: ' + str(true_docid) + '\n')
            for (wordid, p) in sorted_tmp:  # the index into phi is the dictionary word id
                if wordid != 0:  # rowid 0 never occurs (AUTO_INCREMENT starts at 1)
                    self.con.execute('select word from wordlist where rowid=%d' % wordid)
                    word = self.con.fetchone()[0]
                    if topk_word < 10:
                        f.write('topic word ' + str(topk_word + 1) + ': ' + str(word) + '\t' + 'probability: ' + str(p) + '\n')
                        # self.con.execute(
                        #     'update wordlocation set lda_flag=1 where urlid=%d and wordid=%d ' % (true_docid, wordid))
                        topk_word += 1
                    else:
                        break
        # ------------------------------------------------------------------------------------
        # test output: the ten highest-probability word ids
        topk_word = 0
        with codecs.open(self.test_word, 'a') as f:
            f.write('true docid: ' + str(true_docid) + '\n')
            for (wordid, p) in sorted_tmp:
                if topk_word < 10:
                    f.write('top word ' + str(topk_word + 1) + ': ' + str(wordid) + '\t' + 'probability: ' + str(p) + '\n')
                    topk_word += 1
                else:
                    break
        # ------------------------------------------------------------------------------------
        # test output: the dominant topic id
        with codecs.open(self.test_topic, 'a') as f:
            f.write('true docid: ' + str(true_docid) + '\n')
            f.write('document topic id: ' + str(top_topicid) + '\n')
        self.dbcommit()
        return 'update and save done!'
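    # Convenience sketch (an addition, not in the original code): read the
    # saved document-topic matrix back for inspection. save() below writes
    # thetafile as tab-separated floats, one document per row, which
    # np.loadtxt parses directly.
    def load_theta(self):
        return np.loadtxt(self.thetafile)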
    # ------------------------------------------------------------------------------------------------------------------
    # Persist the results
    def save(self):
        print 'starting save.....'
        # theta: document-topic distribution, one document per row
        with codecs.open(self.thetafile, 'w') as f:
            for x in xrange(self.docs_count):
                for y in xrange(self.K):
                    if self.theta[x][y] != None:
                        f.write(str(self.theta[x][y]) + '\t')
                    else:
                        f.write(str(0) + '\t')
                f.write('\n')
        # phi: word-topic distribution, one word per row
        with codecs.open(self.phifile, 'w') as f:
            for y in xrange(self.words_count):
                for x in xrange(self.K):
                    if self.phi[x][y] != None:
                        f.write(str(self.phi[x][y]) + '\t')
                    else:
                        f.write(str(0) + '\t')
                f.write('\n')
        return 'save end'
--------------------------------------------------------------------------------
/text_test1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renpengcheng-github/LDA/97a153e94dc17048d3150339dbb9f8cd644ebc16/text_test1.zip
--------------------------------------------------------------------------------