├── .gitignore
├── preW2V.py
├── processContent.py
├── processWord2vec.py
├── query.py
└── queryw2v.py

/.gitignore:
--------------------------------------------------------------------------------
*.o
*.swp
*github.com*
*bin/
*data/
*old/
*output/
*pkg/
*index/
*.log*
*.json*
*.dat*
*.sql*
*jzlservice/
*DB/
.DS_Store
*.txt
nohup.out

--------------------------------------------------------------------------------
/preW2V.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Read the raw documents and print the segmented corpus
# (one document per line, space-separated tokens).
#
import re
import logging
import os.path
import sys
import multiprocessing
import pickle

import jieba

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


reload(sys)
sys.setdefaultencoding('utf8')
output = "./output/"

def saveObject(filename, obj):
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()
    return True

# Keep only CJK characters (U+4E00-U+9FA5) and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa50-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content


# Raw corpus
train_set = []
docinfos = []
# Read the text file and segment each document
f = open("./data/all.txt")
lines = f.readlines()
for line in lines:
    content = (line.lower()).split("\t")[2] + (line.lower()).split("\t")[1]
    word_list = filter(lambda x: len(x) > 0, map(etl, jieba.cut(content, cut_all=False)))
    for w in word_list:
        print w + " ",
    print ""
f.close()

--------------------------------------------------------------------------------
/processContent.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8

from gensim import corpora, models, similarities, utils
import jieba
import jieba.posseg as pseg
import sys
import os
import re
import gc
import pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

reload(sys)
sys.setdefaultencoding('utf8')
output = "./output/"

jieba.load_userdict("user_dic.txt")

def saveObject(filename, obj):
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()
    return True

# Keep only CJK characters (U+4E00-U+9FA5) and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa50-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content


# Raw corpus: one token list per document
train_set = []
docinfos = []
# Read the text file (id<TAB>title<TAB>content per line) and segment each document
f = open("./data/all.txt")
lines = f.readlines()
for line in lines:
    content = (line.lower()).split("\t")[2] + (line.lower()).split("\t")[1]
    #print content
    # Segment the text; etl() strips punctuation and other useless symbols
    word_list = filter(lambda x: len(x) > 0, map(etl, jieba.cut(content, cut_all=False)))
    train_set.append(word_list)
    detail = {}
    detail["id"] = (line.lower()).split("\t")[0]
    detail["title"] = (line.lower()).split("\t")[1]
    detail["content"] = (line.lower()).split("\t")[2]
    docinfos.append(detail)
f.close()
# For very large corpora an explicit GC pass can reclaim memory here
#gc.collect()
# Build the dictionary
dictionary = corpora.Dictionary(train_set)
# Filter extreme-frequency noise words (note: with no_below=1 and no_above=1 nothing is
# actually removed; tighten these thresholds to prune very rare or ubiquitous terms)
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=None)
# Save the dictionary, convert the corpus to bag-of-words form and save it for later reuse
dictionary.save(output + "all.dic")
corpus = [dictionary.doc2bow(text) for text in train_set]
saveObject(output + "all.cps", corpus)
# Save the original document metadata
saveObject(output + "all.info", docinfos)

# Build the TF-IDF model from the bag-of-words corpus
tfidfModel = models.TfidfModel(corpus)
# Convert the corpus into TF-IDF vectors
tfidfVectors = tfidfModel[corpus]
# Save the TF-IDF model and its similarity index
tfidfModel.save(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity(tfidfVectors)
indexTfidf.save(output + "allTFIDF.idx")


# LDA model: 50 topics, trained on the TF-IDF-weighted corpus
lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=50)
lda.save(output + "allLDA50Topic.mdl")
corpus_lda = lda[tfidfVectors]
indexLDA = similarities.MatrixSimilarity(corpus_lda)
indexLDA.save(output + "allLDA50Topic.idx")

--------------------------------------------------------------------------------
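A quick way to sanity-check the 50 LDA topics built above is to reload the saved model and print its topics. This is only an inspection sketch that reads the artifacts processContent.py writes to ./output/; it is not part of the original pipeline:

    # Sketch: reload the saved LDA model and print its topics as weighted word lists.
    from gensim import models

    lda = models.LdaModel.load("./output/allLDA50Topic.mdl")
    for topic in lda.show_topics():
        print topic
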
/processWord2vec.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8
import re
import logging
import os.path
import sys
import multiprocessing

import jieba

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


reload(sys)
sys.setdefaultencoding('utf8')

# Keep only CJK characters (U+4E00-U+9FA5), Latin letters and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content


# Train word2vec on the pre-segmented corpus (one document per line, space-separated
# tokens). size is the embedding dimensionality (10000 is unusually large; 100-400 is
# more typical), window the context size, min_count the vocabulary frequency cut-off.
model = Word2Vec(LineSentence('./data/allw2v.txt'), size=10000, window=5, min_count=5,
                 workers=multiprocessing.cpu_count())

model.save('./output/allw2v.w2v')
#model.save_word2vec_format('segmentfaultw2v.txt', binary=False)

--------------------------------------------------------------------------------
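The repository does not include a script that writes ./data/allw2v.txt itself. Judging from preW2V.py, which prints exactly the space-separated, one-document-per-line format LineSentence expects, the corpus can be produced by redirecting its output (python preW2V.py > ./data/allw2v.txt) or with a small script along the following lines; the id/title/content column layout is the same assumption the other scripts make:

    # Sketch: build ./data/allw2v.txt in LineSentence format, mirroring preW2V.py's segmentation.
    import re
    import jieba

    etlregex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")

    fin = open("./data/all.txt")
    fout = open("./data/allw2v.txt", "w")
    for line in fin:
        parts = line.lower().split("\t")      # expected layout: id <TAB> title <TAB> content
        content = parts[2] + parts[1]         # content + title, as in the other scripts
        words = filter(lambda w: len(w) > 0,
                       map(lambda w: etlregex.sub('', w), jieba.cut(content, cut_all=False)))
        fout.write(" ".join(words).encode("utf-8") + "\n")
    fin.close()
    fout.close()
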
/query.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8

from gensim import corpora, models, similarities, utils
import jieba
import jieba.posseg as pseg
import sys
import os
import re
import gc
import pickle
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

reload(sys)
sys.setdefaultencoding('utf8')
output = "./output/"

def saveObject(filename, obj):
    f = open(filename, 'wb')
    pickle.dump(obj, f)
    f.close()
    return True


def loadObject(filename):
    f = open(filename, 'rb')
    obj = pickle.load(f)
    f.close()
    return obj

# Keep only CJK characters (U+4E00-U+9FA5), Latin letters and digits
etlregex = re.compile(ur"[^\u4e00-\u9fa5a-zA-Z0-9]")
def etl(content):
    content = etlregex.sub('', content)
    return content

# Load the document metadata
docinfos = loadObject(output + "all.info")
# Load the dictionary
dictionary = corpora.Dictionary.load(output + "all.dic")

# Load the TF-IDF model and its similarity index
tfidfModel = models.TfidfModel.load(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity.load(output + "allTFIDF.idx")

# Load the LDA model and its similarity index
# (file names must match what processContent.py saved: 50 topics)
ldaModel = models.LdaModel.load(output + "allLDA50Topic.mdl")
indexLDA = similarities.MatrixSimilarity.load(output + "allLDA50Topic.idx")


# First sample query: a Chinese car-review article
query = """
取舍之道,说起来容易,做起来却很难。有时候它意味着你要放弃自己擅长的、甚至给你带来成功的东西,而去在全新的、前景未知的领域做出尝试。如果柯达能早点从胶片中走出来、诺基亚能少造点塞班机,也许就不会有今天的佳能、苹果、三星。如果保时捷没有冒天下之大不韪造出卡宴,也很难预料它如今的境遇会如何。
看开头这应该是一篇为全新宝马X1 Li(下文简称新X1)洗地的文章,我想很多宝马死忠、车神也已经准备移步评论区,敲下神圣、激扬的文字。其实这年月已经不流行道德绑架了,所以对于车迷而言,也请适当的放下你对这个品牌的喜爱,试着从一个兜里揣着三十来万、想买辆豪华品牌SUV的消费者的角度来看待新X1,这也是本文作者的角度。
一切的争议其实都是由UKL前驱平台而起,宝马做了一个艰难的决定,给几个入门的车系换上了前驱平台。在空间、成本相对有限的入门车型里,放弃后驱传统以换取更大空间。我想这也能看出宝马对于未来趋势的判断,就是在难以兼顾空间、操控的入门豪华车市场,消费者会重视空间实用性多过操控性,宝马也将宝押在了空间实用性上。
宝马做出了取舍,消费者该怎么选?以我自己为例吧,在买车的时候也考虑过老款宝马X1(下文简称老X1),优惠后25万左右,价格没比途观、奇骏贵多少,但品牌、动力的提升都可谓巨大。但为什么最后放弃?正是因为空间。如果家里只有这一辆车,老X1的空间确实有些力不从心。
新X1的空间绝对没有问题,而且看起来也比老款要更大气、更有面子。我们测试的这台xDrive25Li 豪华型同样采用2.0T和8AT变速箱,只是改为基于前驱的四驱系统。究竟值不值得选择,读完文章希望你能有答案。
更大气、更阳刚新X1在外观上的变化翻天覆地,与之前的老X1是完全性格的两种产物。老X1低调内敛,有着一种含蓄之美;而新X1则阳刚帅气,骨子里透着一种坚韧的性格,有着更符合男性消费者需求的阳刚之美。
"""
# Segment and clean the query, then convert it to bag-of-words
query_bow = dictionary.doc2bow(filter(lambda x: len(x) > 0, map(etl, jieba.cut(query, cut_all=False))))
# Rank documents by TF-IDF cosine similarity
tfidfvect = tfidfModel[query_bow]
simstfidf = indexTfidf[tfidfvect]
sort_sims = sorted(enumerate(simstfidf), key=lambda item: -item[1])
print "TFIDF similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])


# Rank documents by similarity in LDA topic space
ldavec = ldaModel[tfidfvect]
simlda = indexLDA[ldavec]
sort_sims = sorted(enumerate(simlda), key=lambda item: -item[1])
print "LDA similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])


# Second sample query: a paragraph about full vs. incremental search-engine indexes
query = """
一般情况下,搜索引擎默认会认为索引是不会有太大的变化的,所以把索引分为全量索引和增量索引两部分,全量索引一般是以天甚至是周,月为单位构建的,构建完了以后就导入到引擎中进行检索,而增量索引是实时的进入搜索引擎的,很多就是保存在内存中,搜索的时候分别从全量索引和增量索引中检索数据,然后把两部分数据合并起来返回给请求方,所以增量索引不是我们这一篇的主要内容,在最后我的索引构建部分我会说一下我的增量索引构建方式。现在先看看全量索引
"""
query_bow = dictionary.doc2bow(filter(lambda x: len(x) > 0, map(etl, jieba.cut(query, cut_all=False))))
tfidfvect = tfidfModel[query_bow]
simstfidf = indexTfidf[tfidfvect]
sort_sims = sorted(enumerate(simstfidf), key=lambda item: -item[1])
print "TFIDF similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])


ldavec = ldaModel[tfidfvect]
simlda = indexLDA[ldavec]
sort_sims = sorted(enumerate(simlda), key=lambda item: -item[1])
print "LDA similarity Top 10:::"
for sim in sort_sims[:10]:
    print "ID : " + docinfos[sim[0]]["id"] + "\t" + docinfos[sim[0]]["title"] + "\tsimilarity:::" + str(sim[1])

--------------------------------------------------------------------------------
/queryw2v.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#encoding=utf8

import re
import logging
import os.path
import sys
import multiprocessing

import jieba

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import gensim


reload(sys)
sys.setdefaultencoding('utf8')


# Load the trained word2vec model and print the nearest neighbours of a probe word
model = gensim.models.Word2Vec.load("./output/allw2v.w2v")
sims = model.most_similar(u'全栈')
for sim in sims:
    print sim[0] + "\t" + str(sim[1])

--------------------------------------------------------------------------------
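One caveat with queryw2v.py: most_similar raises a KeyError when the probe word never made it into the vocabulary (for example because min_count=5 filtered it out during training). A slightly more defensive version of the lookup, using the same old-style gensim attributes this repository already relies on, could look like the following sketch; the probe word is just the example from above:

    # -*- coding: utf-8 -*-
    # Sketch: guard the nearest-neighbour lookup against out-of-vocabulary words.
    import gensim

    model = gensim.models.Word2Vec.load("./output/allw2v.w2v")
    word = u'全栈'
    if word in model.vocab:
        for sim_word, score in model.most_similar(word, topn=10):
            print sim_word + "\t" + str(score)
    else:
        print word + " is not in the vocabulary (possibly removed by min_count)"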