├── .gitignore
├── preW2V.py
├── processContent.py
├── processWord2vec.py
├── query.py
└── queryw2v.py

/.gitignore:
--------------------------------------------------------------------------------
*.o
*.swp
*github.com*
*bin/
*data/
*old/
*output/
*pkg/
*index/
*.log*
*.json*
*.dat*
*.sql*
*jzlservice/
*DB/
.DS_Store
*.txt
nohup.out

--------------------------------------------------------------------------------
/preW2V.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Read the raw documents and print the segmented corpus
# (one document per line, space-separated tokens).
#
import re
import logging
import os.path
import sys
import multiprocessing
import pickle

import jieba

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


reload(sys)
sys.setdefaultencoding('utf8')
output = "./output/"

def saveObject(filename, obj):
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()
    return True

# Keep only CJK characters (U+4E00-U+9FA5) and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa50-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content


# Raw corpus
train_set = []
docinfos = []
# Read the text file and segment each document
f = open("./data/all.txt")
lines = f.readlines()
for line in lines:
    content = (line.lower()).split("\t")[2] + (line.lower()).split("\t")[1]
    word_list = filter(lambda x: len(x) > 0, map(etl, jieba.cut(content, cut_all=False)))
    for w in word_list:
        print w + " ",
    print ""
f.close()

--------------------------------------------------------------------------------
/processContent.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8

from gensim import corpora, models, similarities, utils
import jieba
import jieba.posseg as pseg
import sys
import os
import re
import gc
import pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

reload(sys)
sys.setdefaultencoding('utf8')
output = "./output/"

jieba.load_userdict("user_dic.txt")

def saveObject(filename, obj):
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()
    return True

# Keep only CJK characters (U+4E00-U+9FA5) and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa50-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content


# Raw corpus: one token list per document
train_set = []
docinfos = []
# Read the text file (id<TAB>title<TAB>content per line) and segment each document
f = open("./data/all.txt")
lines = f.readlines()
for line in lines:
    content = (line.lower()).split("\t")[2] + (line.lower()).split("\t")[1]
    #print content
    # Segment the text; etl() strips punctuation and other useless symbols
    word_list = filter(lambda x: len(x) > 0, map(etl, jieba.cut(content, cut_all=False)))
    train_set.append(word_list)
    detail = {}
    detail["id"] = (line.lower()).split("\t")[0]
    detail["title"] = (line.lower()).split("\t")[1]
    detail["content"] = (line.lower()).split("\t")[2]
    docinfos.append(detail)
f.close()
# For very large corpora an explicit GC pass can reclaim memory here
#gc.collect()
# Build the dictionary
dictionary = corpora.Dictionary(train_set)
# Filter extreme-frequency noise words (note: with no_below=1 and no_above=1 nothing is
# actually removed; tighten these thresholds to prune very rare or ubiquitous terms)
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=None)
# Save the dictionary, convert the corpus to bag-of-words form and save it for later reuse
dictionary.save(output + "all.dic")
corpus = [dictionary.doc2bow(text) for text in train_set]
saveObject(output + "all.cps", corpus)
# Save the original document metadata
saveObject(output + "all.info", docinfos)

# Build the TF-IDF model from the bag-of-words corpus
tfidfModel = models.TfidfModel(corpus)
# Convert the corpus into TF-IDF vectors
tfidfVectors = tfidfModel[corpus]
# Save the TF-IDF model and its similarity index
tfidfModel.save(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity(tfidfVectors)
indexTfidf.save(output + "allTFIDF.idx")


# LDA model: 50 topics, trained on the TF-IDF-weighted corpus
lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=50)
lda.save(output + "allLDA50Topic.mdl")
corpus_lda = lda[tfidfVectors]
indexLDA = similarities.MatrixSimilarity(corpus_lda)
indexLDA.save(output + "allLDA50Topic.idx")

--------------------------------------------------------------------------------
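A quick way to sanity-check the 50 LDA topics built above is to reload the saved model and print its topics. This is only an inspection sketch that reads the artifacts processContent.py writes to ./output/; it is not part of the original pipeline:

    # Sketch: reload the saved LDA model and print its topics as weighted word lists.
    from gensim import models

    lda = models.LdaModel.load("./output/allLDA50Topic.mdl")
    for topic in lda.show_topics():
        print topic
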
/processWord2vec.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8
import re
import logging
import os.path
import sys
import multiprocessing

import jieba

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


reload(sys)
sys.setdefaultencoding('utf8')

# Keep only CJK characters (U+4E00-U+9FA5), Latin letters and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content


# Train word2vec on the pre-segmented corpus (one document per line, space-separated
# tokens). size is the embedding dimensionality (10000 is unusually large; 100-400 is
# more typical), window the context size, min_count the vocabulary frequency cut-off.
model = Word2Vec(LineSentence('./data/allw2v.txt'), size=10000, window=5, min_count=5,
                 workers=multiprocessing.cpu_count())

model.save('./output/allw2v.w2v')
#model.save_word2vec_format('segmentfaultw2v.txt', binary=False)

--------------------------------------------------------------------------------
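The repository does not include a script that writes ./data/allw2v.txt itself. Judging from preW2V.py, which prints exactly the space-separated, one-document-per-line format LineSentence expects, the corpus can be produced by redirecting its output (python preW2V.py > ./data/allw2v.txt) or with a small script along the following lines; the id/title/content column layout is the same assumption the other scripts make:

    # Sketch: build ./data/allw2v.txt in LineSentence format, mirroring preW2V.py's segmentation.
    import re
    import jieba

    etlregex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")

    fin = open("./data/all.txt")
    fout = open("./data/allw2v.txt", "w")
    for line in fin:
        parts = line.lower().split("\t")      # expected layout: id <TAB> title <TAB> content
        content = parts[2] + parts[1]         # content + title, as in the other scripts
        words = filter(lambda w: len(w) > 0,
                       map(lambda w: etlregex.sub('', w), jieba.cut(content, cut_all=False)))
        fout.write(" ".join(words).encode("utf-8") + "\n")
    fin.close()
    fout.close()
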
/query.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8

from gensim import corpora, models, similarities, utils
import jieba
import jieba.posseg as pseg
import sys
import os
import re
import gc
import pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

reload(sys)
sys.setdefaultencoding('utf8')
output = "./output/"

def saveObject(filename, obj):
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()
    return True


def loadObject(filename):
    f = open(filename, 'rb')
    obj = pickle.load(f)
    f.close()
    return obj

# Keep only CJK characters (U+4E00-U+9FA5), Latin letters and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content

# Load the document metadata
docinfos = loadObject(output + "all.info")
# Load the dictionary
dictionary = corpora.Dictionary.load(output + "all.dic")

# Load the TF-IDF model and its similarity index
tfidfModel = models.TfidfModel.load(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity.load(output + "allTFIDF.idx")

# Load the LDA model and its similarity index
# (file names must match what processContent.py saved: 50 topics)
ldaModel = models.LdaModel.load(output + "allLDA50Topic.mdl")
indexLDA = similarities.MatrixSimilarity.load(output + "allLDA50Topic.idx")


# First sample query: a Chinese car-review article
query = """
取舍之道,说起来容易,做起来却很难。有时候它意味着你要放弃自己擅长的、甚至给你带来成功的东西,而去在全新的、前景未知的领域做出尝试。如果柯达能早点从胶片中走出来、诺基亚能少造点塞班机,也许就不会有今天的佳能、苹果、三星。如果保时捷没有冒天下之大不韪造出卡宴,也很难预料它如今的境遇会如何。
看开头这应该是一篇为全新宝马X1 Li(下文简称新X1)洗地的文章,我想很多宝马死忠、车神也已经准备移步评论区,敲下神圣、激扬的文字。其实这年月已经不流行道德绑架了,所以对于车迷而言,也请适当的放下你对这个品牌的喜爱,试着从一个兜里揣着三十来万、想买辆豪华品牌SUV的消费者的角度来看待新X1,这也是本文作者的角度。
一切的争议其实都是由UKL前驱平台而起,宝马做了一个艰难的决定,给几个入门的车系换上了前驱平台。在空间、成本相对有限的入门车型里,放弃后驱传统以换取更大空间。我想这也能看出宝马对于未来趋势的判断,就是在难以兼顾空间、操控的入门豪华车市场,消费者会重视空间实用性多过操控性,宝马也将宝押在了空间实用性上。
宝马做出了取舍,消费者该怎么选?以我自己为例吧,在买车的时候也考虑过老款宝马X1(下文简称老X1),优惠后25万左右,价格没比途观、奇骏贵多少,但品牌、动力的提升都可谓巨大。但为什么最后放弃?正是因为空间。如果家里只有这一辆车,老X1的空间确实有些力不从心。
新X1的空间绝对没有问题,而且看起来也比老款要更大气、更有面子。我们测试的这台xDrive25Li 豪华型同样采用2.0T和8AT变速箱,只是改为基于前驱的四驱系统。究竟值不值得选择,读完文章希望你能有答案。
更大气、更阳刚新X1在外观上的变化翻天覆地,与之前的老X1是完全性格的两种产物。老X1低调内敛,有着一种含蓄之美;而新X1则阳刚帅气,骨子里透着一种坚韧的性格,有着更符合男性消费者需求的阳刚之美。
"""
# Segment and clean the query, then convert it to bag-of-words
query_bow = dictionary.doc2bow(filter(lambda x: len(x) > 0, map(etl, jieba.cut(query, cut_all=False))))
# Rank documents by TF-IDF cosine similarity
tfidfvect = tfidfModel[query_bow]
simstfidf = indexTfidf[tfidfvect]
sort_sims = sorted(enumerate(simstfidf), key=lambda item: -item[1])
print "TFIDF similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])


# Rank documents by similarity in LDA topic space
ldavec = ldaModel[tfidfvect]
simlda = indexLDA[ldavec]
sort_sims = sorted(enumerate(simlda), key=lambda item: -item[1])
print "LDA similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])


# Second sample query: a paragraph about full vs. incremental search-engine indexes
query = """
一般情况下,搜索引擎默认会认为索引是不会有太大的变化的,所以把索引分为全量索引和增量索引两部分,全量索引一般是以天甚至是周,月为单位构建的,构建完了以后就导入到引擎中进行检索,而增量索引是实时的进入搜索引擎的,很多就是保存在内存中,搜索的时候分别从全量索引和增量索引中检索数据,然后把两部分数据合并起来返回给请求方,所以增量索引不是我们这一篇的主要内容,在最后我的索引构建部分我会说一下我的增量索引构建方式。现在先看看全量索引
"""
query_bow = dictionary.doc2bow(filter(lambda x: len(x) > 0, map(etl, jieba.cut(query, cut_all=False))))
tfidfvect = tfidfModel[query_bow]
simstfidf = indexTfidf[tfidfvect]
sort_sims = sorted(enumerate(simstfidf), key=lambda item: -item[1])
print "TFIDF similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])


ldavec = ldaModel[tfidfvect]
simlda = indexLDA[ldavec]
sort_sims = sorted(enumerate(simlda), key=lambda item: -item[1])
print "LDA similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])

--------------------------------------------------------------------------------
/queryw2v.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8

import re
import logging
import os.path
import sys
import multiprocessing

import jieba

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import gensim


reload(sys)
sys.setdefaultencoding('utf8')


# Load the trained word2vec model and print the nearest neighbours of a probe word
model = gensim.models.Word2Vec.load("./output/allw2v.w2v")
sims = model.most_similar(u'全栈')
for sim in sims:
    print sim[0] + "\t" + str(sim[1])

--------------------------------------------------------------------------------
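One caveat with queryw2v.py: most_similar raises a KeyError when the probe word never made it into the vocabulary (for example because min_count=5 filtered it out during training). A slightly more defensive version of the lookup, using the same old-style gensim attributes this repository already relies on, could look like the following sketch; the probe word is just the example from above:

    # -*- coding: utf-8 -*-
    # Sketch: guard the nearest-neighbour lookup against out-of-vocabulary words.
    import gensim

    model = gensim.models.Word2Vec.load("./output/allw2v.w2v")
    word = u'全栈'
    if word in model.vocab:
        for sim_word, score in model.most_similar(word, topn=10):
            print sim_word + "\t" + str(score)
    else:
        print word + " is not in the vocabulary (possibly removed by min_count)"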