├── README.md
└── SinglePass.py

/README.md:
--------------------------------------------------------------------------------
# Sing-pass-
A Python implementation of a single-pass news-text clustering algorithm for discovering trending (hot-topic) events.
--------------------------------------------------------------------------------
/SinglePass.py:
--------------------------------------------------------------------------------
# encoding=utf-8
# Single-pass clustering of news documents stored in a SQL Server table.
# Each unprocessed item is compared, via cosine similarity over TF-IDF
# vectors, against all already-clustered documents; it joins the best-matching
# cluster if the similarity exceeds the threshold theta, otherwise it starts
# a new cluster.
import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from mssqlconn import MSSQL

ms = MSSQL(host='xxx.xxx.xxx.xxx', user='xxx', pwd='xxx', db='TextMining')

with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f if line.strip())

theta = 0.5     # similarity threshold for joining an existing cluster
xClusterID = 1  # ID for the next newly created cluster
                # (note: restarts at 1 on every run; recover the current
                # maximum ClusterID from the database in production use)


def fenci(inTxtLst):
    """Segment each text with jieba (accurate mode), drop stopwords, and
    return space-joined token strings suitable for TfidfVectorizer."""
    retTxtLst = []
    for line in inTxtLst:
        strline = ''.join(ln for ln in line.splitlines() if ln.strip())
        seglist = jieba.cut(strline, cut_all=False)  # accurate mode
        tokens = [seg for seg in seglist if seg not in stopwords]
        retTxtLst.append(' '.join(tokens))
    return retTxtLst


def getTfidfMat(lst):
    """Test helper: build the TF-IDF weight matrix of a corpus.
    Element weight[i][j] is the TF-IDF weight of term j in document i."""
    # CountVectorizer builds the term-frequency matrix; TfidfTransformer
    # then converts the counts to TF-IDF weights.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(lst))
    word = vectorizer.get_feature_names_out()  # all terms in the vocabulary
    weight = tfidf.toarray()
    return weight


if __name__ == "__main__":
    # ---- single-pass clustering ----
    try:
        newsList = ms.ExecQuery(
            "SELECT ID,content FROM corpora WHERE isProcessed=0 AND SourceType='News'")
        for (ID0, content0) in newsList:  # each not-yet-clustered news item
            # Rebuild weightMat: the TF-IDF matrix of all clustered documents.
            corpus = []
            trClusterID = []
            try:
                procList = ms.ExecQuery(
                    "SELECT ID,content,ClusterID,isProcessed FROM corpora WHERE isProcessed=1")
                for (ID1, content1, ClusterID, isProcessed) in procList:
                    corpus.append(content1)
                    trClusterID.append(ClusterID)
            except Exception as e:
                print('\nError while reading processed documents:', e)

            if not corpus:
                # Nothing clustered yet: the first document opens a new cluster.
                ms.ExecNonQuery("UPDATE corpora SET ClusterID='%s',isProcessed=1 WHERE ID=%s"
                                % (xClusterID, ID0))
                xClusterID += 1
                continue

            segedTxtlst = fenci(corpus)
            vectorizer = TfidfVectorizer()
            trainTfidf = vectorizer.fit_transform(segedTxtlst)
            weightMat = trainTfidf.toarray()  # VSM of the clustered corpus

            # TF-IDF vector of the incoming document, in the same vocabulary.
            segedInLst = fenci([content0])
            testVec = vectorizer.transform(segedInLst).toarray()

            # Cosine similarity between the new document and every clustered one.
            sims = cosine_similarity(testVec, weightMat)[0]
            maxSim = sims.max()
            if maxSim > theta:
                # Assign the document to the cluster of its most similar neighbour.
                indxx = int(np.argmax(sims))
                ms.ExecNonQuery("UPDATE corpora SET ClusterID='%s',isProcessed=1 WHERE ID=%s"
                                % (trClusterID[indxx], ID0))
            else:
                # No cluster is similar enough: open a new one.
                ms.ExecNonQuery("UPDATE corpora SET ClusterID='%s',isProcessed=1 WHERE ID=%s"
                                % (xClusterID, ID0))
                xClusterID += 1
            # The item is now clustered; weightMat is rebuilt at the top of
            # the loop before the next item is processed.
    except Exception as e:
        print('\nError in the single-pass loop:', e)
    # ---- end single-pass clustering ----
--------------------------------------------------------------------------------
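
For quick experimentation without the SQL Server backend, here is a minimal in-memory sketch of the same single-pass logic. The three sample headlines are hypothetical, stopword filtering is omitted for brevity, and the `theta=0.5` threshold and jieba segmentation are reused from `SinglePass.py`:

```python
# Minimal, database-free sketch of the single-pass loop (sample texts are
# hypothetical; stopword removal is omitted for brevity).
import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

theta = 0.5  # same joining threshold as SinglePass.py

def seg(text):
    return ' '.join(jieba.cut(text, cut_all=False))  # accurate mode

docs = ['北京暴雨导致多个航班延误', '首都机场因暴雨出现大面积航班延误', '世界杯决赛今晚打响']

clustered, cluster_ids, next_id = [], [], 1
for doc in docs:
    if clustered:
        # Fit TF-IDF on the already-clustered docs, project the new doc
        # into the same space, and find its nearest neighbour.
        vec = TfidfVectorizer()
        mat = vec.fit_transform([seg(d) for d in clustered]).toarray()
        sims = cosine_similarity(vec.transform([seg(doc)]).toarray(), mat)[0]
        if sims.max() > theta:
            cluster_ids.append(cluster_ids[int(np.argmax(sims))])
            clustered.append(doc)
            continue
    # First document, or no existing cluster is similar enough: new cluster.
    clustered.append(doc)
    cluster_ids.append(next_id)
    next_id += 1

for doc, cid in zip(clustered, cluster_ids):
    print(cid, doc)
```

Each document either joins the cluster of its most similar predecessor (cosine similarity above `theta`) or opens a new cluster, mirroring the database-backed version above.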