├── README.md
├── cn_lda_text.py
├── lda_logfile.zip
├── lda_model.py
└── text_test1.zip

/README.md:
--------------------------------------------------------------------------------
# LDA
# LDA algorithm with a usage example
# 1. Create a MySQL database named ldasql with the utf8 character set
import cn_lda_text

content = cn_lda_text.lda('ldasql')
# 2. Create the tables
content.create_indextables()
# 3. Load the training set into the database. Unzip the two zip archives and
#    put the extracted folders in the project directory.
#    The corpus path and the stop-word path are set inside addtodb().
content.addtodb()
# 4. Run the LDA model
content.run_lda()
--------------------------------------------------------------------------------
/cn_lda_text.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
import os
import jieba
import re
import json
import lda_model
import MySQLdb
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: force utf-8 as the default encoding

class lda:
    def __init__(self, dbname):
        self.conn = MySQLdb.connect(user='root', passwd='mysql', host='localhost', db=dbname, charset="utf8")
        self.con = self.conn.cursor()  # con is used in place of the usual cur
        self.dbname = dbname  # remembered so run_lda can reuse the same database
        self.stop_word = []

    def __del__(self):
        self.conn.close()

    def dbcommit(self):
        self.conn.commit()

    def readfile(self, path):
        fp = open(path, 'rb')
        content = fp.read()
        fp.close()
        return content

    def create_indextables(self):
        self.con.execute(
            "CREATE TABLE IF NOT EXISTS urllist (rowid int(100) NOT NULL AUTO_INCREMENT PRIMARY KEY, url varchar(200),flag int(100),"
            "score_pagerank varchar(200),score_nn varchar(200),score_content varchar(200),score_lda varchar(200)) CHARACTER SET utf8")
        self.con.execute(
            'CREATE TABLE IF NOT EXISTS wordlist(rowid int(100) NOT NULL AUTO_INCREMENT PRIMARY KEY, word varchar(200)) CHARACTER SET utf8')
        self.con.execute(
            'CREATE TABLE IF NOT EXISTS wordlocation(urlid int(100) ,wordid int(100),location int(100),lda_flag int(100)) CHARACTER SET utf8')
        self.con.execute(
            'CREATE TABLE IF NOT EXISTS link(rowid int(100) NOT NULL AUTO_INCREMENT PRIMARY KEY,fromid int(100),toid int(100)) CHARACTER SET utf8')
        # Note: MySQL has no CREATE INDEX IF NOT EXISTS, so these statements
        # fail on a second run; call create_indextables only once per database.
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()

    def separatewords(self, text):
        # Keep only runs of CJK characters and filter stop words during
        # segmentation, so callers always get back a list (possibly empty).
        words = re.findall(ur"[\u4e00-\u9fa5]+", text)
        return [w for w in words if w not in self.stop_word]

    def addtodb(self):
        # Load the stop-word list before indexing anything, so stop words can
        # be ignored when documents are written to the database. Called once
        # before all processing starts.
        path_stopword = 'lda_logfile/'
        stopwordfile = path_stopword + 'stopwords.txt'
        with codecs.open(stopwordfile, 'rb', 'utf-8') as f:
            stop_words = f.readlines()
        for sword in stop_words:
            strword = sword.replace('\r\n', '').replace(' ', '').strip()
            if strword != '':
                self.stop_word.append(strword)  # shared state: self.stop_word

        path = 'text_test1/'  # training corpus, one sub-directory per category
        catlist = os.listdir(path)
        for dir in catlist:
            class_path = path + dir + '/'
            file_list = os.listdir(class_path)
            for file in file_list:
                fullname = class_path + file
                if fullname != '':
                    self.addtoindex(fullname)
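    # Illustrative sketch, not part of the original pipeline: how one raw
    # string flows through jieba segmentation and separatewords(). The sample
    # sentence is hypothetical.
    def demo_segment(self, text=u'今天天气很好'):
        words = []
        for token in jieba.cut(text, cut_all=False):
            words.extend(self.separatewords(token))
        return words  # CJK tokens with stop words already removed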
    def isindexed(self, url):
        # A url counts as indexed only if its flag is 1.
        self.con.execute("select flag from urllist where url='%s' " % url)
        u = self.con.fetchone()
        if u != None:
            if u[0] == 1:
                return True
            else:
                return False
        else:
            return False

    def getentryid(self, table, field, value, flag=0, createnew=True):
        # flag=1: mark the url as indexed
        if flag == 1:
            self.con.execute("update %s set %s=1 where url='%s' " % (table, field, value))
            self.dbcommit()
        # flag=0: look the value up, inserting it first if it is new
        elif flag == 0:
            self.con.execute("select rowid from %s where %s='%s'" % (table, field, value))
            res = self.con.fetchone()
            if res == None:
                self.con.execute("insert into %s (%s) values ('%s')" % (table, field, value))
                v = self.con.lastrowid
                self.dbcommit()
                return v
            else:
                return res[0]

    def addtoindex(self, url):
        words = []
        if self.isindexed(url): return
        print 'Indexing ' + url
        content = self.readfile(url).strip()
        content = content.replace("\r\n", "").strip()
        content_seg = jieba.cut(content, cut_all=False)

        for word in content_seg:
            if word:  # skip empty tokens
                words.extend(self.separatewords(word))

        # rowid of this URL
        urlid = self.getentryid('urllist', 'url', url)
        # flag=1 records that this url has been indexed
        self.getentryid('urllist', 'flag', url, 1)

        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word.replace(' ', '') == '' or word in self.stop_word: continue
            # Stop words are skipped, which leaves gaps in location; an
            # alternative would be to filter them before building words.
            wordid = self.getentryid('wordlist', 'word', word)  # dictionary id (deduplicated)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid, wordid, i))
        self.dbcommit()

#-----------------------------------------------------------------------------------------------------------------------
    # LDA driver
    def run_lda(self):
        doc = {}
        i = 0
        self.con.execute('select max(rowid) from wordlist')
        words_count = self.con.fetchone()[0]  # vocabulary size (distinct words)
        self.con.execute('select DISTINCT (urlid) from wordlocation')
        cur = self.con.fetchall()
        for (urlid,) in cur:
            doc[i] = urlid  # dict doc maps dense index 0..M-1 to the real docid
            i += 1
        docs_count = len(doc)
        url_location = {}
        print 'preparing url_wordlocation.....'
        for docid in doc.values():  # cache wordid/location in memory to avoid a MySQL bottleneck
            self.con.execute('select wordid,location from wordlocation where urlid=%d' % docid)
            res = self.con.fetchall()
            a = {}  # must be a fresh dict per document; a single shared dict would make every entry identical
            j = 0
            for (wordid, location) in res:
                a[j] = wordid
                j += 1
            url_location[docid] = a  # keyed by the true urlid
        mylda = lda_model.lda_model(self.dbname, words_count + 1, docs_count, doc, url_location)
        mylda.runlda()
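    # Shape of the data handed to lda_model (illustrative, hypothetical ids):
    #   doc          = {0: 17, 1: 23, ...}            dense index -> urlid
    #   url_location = {17: {0: 5, 1: 42, ...}, ...}  urlid -> {position: wordid}
    # lda_model iterates documents by dense index m and looks word ids up via
    # url_location[doc[m]].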
#---------------------------------------------------------------------------------------------------------------------
    # demo: load the stop-word file and print it
    def set_stopwords(self):
        w = []
        path = 'lda_logfile/'
        trainfile = path + 'stopwords.txt'
        with codecs.open(trainfile, 'rb', 'utf-8') as f:
            words = f.readlines()
        for word in words:
            strword = word.replace('\r\n', '').strip()
            if strword != '':
                w.append(strword)
        print json.dumps(w, encoding="UTF-8", ensure_ascii=False) + '\t'
        print ("和" in w)  # membership test for a common stop word
--------------------------------------------------------------------------------
/lda_logfile.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renpengcheng-github/LDA/97a153e94dc17048d3150339dbb9f8cd644ebc16/lda_logfile.zip
--------------------------------------------------------------------------------
/lda_model.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import numpy as np
import MySQLdb
import random
import codecs
import logging.config
logger = logging.getLogger()

class lda_model:
    def __init__(self, dbname, words_count, docs_count, doc, url_location):
        self.conn = MySQLdb.connect(user='root', passwd='mysql', host='localhost', db=dbname, charset="utf8")
        self.con = self.conn.cursor()  # con is used in place of the usual cur
        #---------------------------------------------------------------------------------------------------------------
        # model parameters
        self.K = 100            # number of topics
        self.beta = 0.1
        self.alpha = 0.1
        self.iter_times = 300   # Gibbs sampling iterations

        self.docs = doc  # dict: dense index -> real urlid; always iterated by index
        self.docs_count = docs_count    # len(self.docs)
        self.words_count = words_count  # vocabulary size; word ids run 0..N, i.e. N+1 rows
        # nd[M][K]    number of words in doc m assigned to topic k
        # ndsum[M]    number of words in doc m
        # nw[V][K]    number of times word w is assigned to topic k
        # nwsum[K]    number of words assigned to topic k

        self.nd = np.zeros((docs_count, self.K), dtype="int")
        self.ndsum = np.zeros(docs_count, dtype="int")
        self.nw = np.zeros((words_count, self.K), dtype="int")
        self.nwsum = np.zeros(self.K, dtype="int")

        self.p = np.zeros(self.K, dtype=float)
        # p: probability vector of doubles, scratch space for sampling
        self.z = [[0 for y in xrange(len(url_location[doc[m]]))] for m in xrange(docs_count)]
        # z: M rows, one topic id per word position in each document

        self.theta = np.array([[0.0 for y in xrange(self.K)] for x in xrange(docs_count)])
        self.phi = np.array([[0.0 for y in xrange(words_count)] for x in xrange(self.K)])
        self.url_location = url_location
        # Output files:
        #   thetafile      document-topic distribution
        #   phifile        word-topic distribution
        #   topicwordfile  top-N words of each document's dominant topic
        #   test_topic     dominant topic id per document
        #   test_word      top-N word ids per document
        self.path = 'lda_logfile/'
        self.phifile = self.path + 'model_phi.dat'
        self.thetafile = self.path + 'model_theta.dat'
        self.topicwordfile = self.path + 'model_topicword.dat'
        self.test_topic = self.path + 'test_topic.dat'
        self.test_word = self.path + 'test_word.dat'
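    # Sanity-check sketch (an addition, not in the original code): after
    # data_preparing and after every full Gibbs sweep, each topic assignment
    # is counted exactly once in nd, nw, nwsum and ndsum, so all four totals
    # must agree.
    def check_counts(self):
        assert self.nd.sum() == self.nw.sum() == self.nwsum.sum() == self.ndsum.sum()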
    def __del__(self):
        self.conn.close()

    def dbcommit(self):
        self.conn.commit()

    # ------------------------------------------------------------------------------------------------------------------
    # 1. Data input and random topic initialization
    def data_preparing(self):
        # Initialization: assign every word position a topic uniformly at random
        print 'preparing data.....'
        for m in xrange(self.docs_count):  # self.docs maps m to the real urlid
            true_urlid = self.docs[m]
            res = len(self.url_location[true_urlid])
            self.ndsum[m] = res  # number of words in this document
            for i in xrange(res):
                topicid = random.randint(0, self.K - 1)  # topics are numbered 0..K-1
                dict_wordid = self.url_location[true_urlid][i]
                self.z[m][i] = topicid  # record the topic of every word position
                self.nw[dict_wordid][topicid] += 1
                self.nwsum[topicid] += 1
                self.nd[m][topicid] += 1
        return 'prepare end!'
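    # With the counts above, the point estimates computed by calc_theta and
    # calc_phi below are the standard collapsed-Gibbs LDA estimators:
    #   theta[m][k] = (nd[m][k] + alpha) / (ndsum[m] + K * alpha)
    #   phi[k][w]   = (nw[w][k] + beta)  / (nwsum[k] + V * beta)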
    # ------------------------------------------------------------------------------------------------------------------
    # Compute theta from nd and ndsum
    def calc_theta(self):
        print 'calculating theta.....'
        nd = self.nd
        nd_sum = self.ndsum
        alpha = self.alpha
        topic_number = self.K
        doc_num = self.docs_count
        topic_alpha = topic_number * alpha  # K*alpha
        theta = [[0 for t in range(topic_number)] for d in range(doc_num)]
        for m in range(doc_num):
            for k in range(topic_number):
                theta[m][k] = (nd[m][k] + alpha) / (nd_sum[m] + topic_alpha)
        return theta

    # ------------------------------------------------------------------------------------------------------------------
    # Compute phi from nw and nwsum, one topic row at a time (vectorized with numpy)
    def calc_phi(self):
        print 'calculating phi.....'
        nw = self.nw
        nw_sum = self.nwsum
        beta = self.beta
        topic_number = self.K
        word_num = self.words_count
        word_beta = word_num * beta  # V*beta
        phi = [[0 for t in range(word_num)] for d in range(topic_number)]
        for k in xrange(topic_number):
            phi[k] = (nw.T[k] + beta) / (nw_sum[k] + word_beta)
        return phi

    # ------------------------------------------------------------------------------------------------------------------
    # Gibbs sampling step for one word position
    def sampling(self, m, dict_wordid, i):
        topic = self.z[m][i]
        # remove the current assignment from all counts
        self.nw[dict_wordid][topic] -= 1
        self.nd[m][topic] -= 1
        self.nwsum[topic] -= 1
        self.ndsum[m] -= 1

        Vbeta = self.words_count * self.beta
        Kalpha = self.K * self.alpha
        # full conditional topic distribution, computed as a length-K vector;
        # p is refreshed on every call
        self.p = (self.nw[dict_wordid] + self.beta) / (self.nwsum + Vbeta) * \
                 (self.nd[m] + self.alpha) / (self.ndsum[m] + Kalpha)

        # Roulette-wheel draw: turn p into a cumulative distribution, throw a
        # uniform u, and take the first topic whose cumulative mass exceeds u
        # (cf. LDA 漫游指南, p.56).
        for k in xrange(1, self.K):
            self.p[k] += self.p[k - 1]
        u = random.uniform(0, self.p[self.K - 1])
        new_topicid = 0
        for k in xrange(self.K):
            if self.p[k] > u:
                new_topicid = k
                break
        self.nw[dict_wordid][new_topicid] += 1
        self.nwsum[new_topicid] += 1
        self.nd[m][new_topicid] += 1
        self.ndsum[m] += 1
        return new_topicid
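    # Worked sketch of the roulette-wheel step in sampling(), with K = 3 and
    # hypothetical probabilities:
    #   p = [0.2, 0.5, 0.3]  ->  cumulative p = [0.2, 0.7, 1.0]
    #   u is drawn uniformly from [0, 1.0); u = 0.64 falls in (0.2, 0.7],
    #   so the first index with p[k] > u is k = 1 and topic 1 is returned.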
    # ------------------------------------------------------------------------------------------------------------------
    # Run LDA training
    def runlda(self):
        self.data_preparing()
        print 'prepare data end,starting simple iteration.....'
        for x in xrange(self.iter_times):
            print 'iteration: ' + str(x + 1)
            for m in xrange(self.docs_count):  # for every document
                true_urlid = self.docs[m]
                res = len(self.url_location[true_urlid])
                for i in xrange(res):
                    dict_wordid = self.url_location[true_urlid][i]
                    # resample the topic of word position i in document m
                    topic = self.sampling(m, dict_wordid, i)
                    self.z[m][i] = topic
        self.theta = self.calc_theta()
        self.phi = self.calc_phi()
        print 'calculate end.....'
        print 'starting sign lda flag in mysql.....'
        for m in range(self.docs_count):
            self.updateandsave_topicword(m)
        self.save()
        return 'lda process end!'

    # ------------------------------------------------------------------------------------------------------------------
    # Update the lda_flag state and save each document's top topic words to disk
    def updateandsave_topicword(self, m):
        true_docid = self.docs[m]
        topk_word = 0
        top_topicid = self.theta[m].index(max(self.theta[m]))  # most probable topic for this document
        temp = list(enumerate(self.phi[top_topicid]))
        sorted_tmp = sorted(temp, key=lambda x: x[1], reverse=True)  # topic words, highest probability first
        with codecs.open(self.topicwordfile, 'a') as f:
            f.write('true docid: ' + str(true_docid) + '\n')
            for (wordid, p) in sorted_tmp:  # the index into phi is the dictionary word id
                if wordid != 0:  # rowid 0 never occurs (AUTO_INCREMENT starts at 1)
                    self.con.execute('select word from wordlist where rowid=%d' % wordid)
                    word = self.con.fetchone()[0]
                    if topk_word < 10:
                        f.write('topic word ' + str(topk_word + 1) + ': ' + str(word) + '\t' + 'probability: ' + str(p) + '\n')
                        # self.con.execute(
                        #     'update wordlocation set lda_flag=1 where urlid=%d and wordid=%d ' % (true_docid, wordid))
                        topk_word += 1
                    else:
                        break
        # ------------------------------------------------------------------------------------
        # test output: the ten highest-probability word ids
        topk_word = 0
        with codecs.open(self.test_word, 'a') as f:
            f.write('true docid: ' + str(true_docid) + '\n')
            for (wordid, p) in sorted_tmp:
                if topk_word < 10:
                    f.write('top word ' + str(topk_word + 1) + ': ' + str(wordid) + '\t' + 'probability: ' + str(p) + '\n')
                    topk_word += 1
                else:
                    break
        # ------------------------------------------------------------------------------------
        # test output: the dominant topic id
        with codecs.open(self.test_topic, 'a') as f:
            f.write('true docid: ' + str(true_docid) + '\n')
            f.write('document topic id: ' + str(top_topicid) + '\n')
        self.dbcommit()
        return 'update and save done!'
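    # Convenience sketch (an addition, not in the original code): read the
    # saved document-topic matrix back for inspection. save() below writes
    # thetafile as tab-separated floats, one document per row, which
    # np.loadtxt parses directly.
    def load_theta(self):
        return np.loadtxt(self.thetafile)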
    # ------------------------------------------------------------------------------------------------------------------
    # Persist the results
    def save(self):
        print 'starting save.....'
        # theta: document-topic distribution, one document per row
        with codecs.open(self.thetafile, 'w') as f:
            for x in xrange(self.docs_count):
                for y in xrange(self.K):
                    if self.theta[x][y] != None:
                        f.write(str(self.theta[x][y]) + '\t')
                    else:
                        f.write(str(0) + '\t')
                f.write('\n')
        # phi: word-topic distribution, one word per row
        with codecs.open(self.phifile, 'w') as f:
            for y in xrange(self.words_count):
                for x in xrange(self.K):
                    if self.phi[x][y] != None:
                        f.write(str(self.phi[x][y]) + '\t')
                    else:
                        f.write(str(0) + '\t')
                f.write('\n')
        return 'save end'
--------------------------------------------------------------------------------
/text_test1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/renpengcheng-github/LDA/97a153e94dc17048d3150339dbb9f8cd644ebc16/text_test1.zip
--------------------------------------------------------------------------------