# coding: utf-8
"""Judge document similarity with doc2vec (doc2vct.py).

From the repository README: the example of doc2vec to calculate the
similarity of docs; ``10.txt`` is an example of an input file
(https://raw.githubusercontent.com/iamxiaomu/doc2vec/23e31e6bbb59a1057d41502a1839d810846dda84/10.txt).
Repository layout: ``10.txt``, ``README.md``, ``doc2vct.py``.

The script reads every file below a corpus directory, segments the text
with jieba, removes stop words, trains a Doc2Vec model on the result and
prints the documents most similar to a sample query.
"""
import os

# Defaults taken from the original script; pass other values to ``main``.
CORPUS_DIR = "/Users/muhongfen/sougou"
STOPWORD_PATH = 'stopword.txt'
TOP_N = 3

# Sample query whose nearest documents are printed.
QUERY_TEXT = (
    '本报讯 全球最大个人电脑制造商戴尔公司8日说,由于市场竞争激烈,以及定价策略不当,该公司今年第一季度盈利预计有所下降。'
    '消息发布之后,戴尔股价一度下跌近6%,创下一年来的新低。戴尔公司估计,其第一季度收入约为142亿美元,每股收益33美分。此前公司预测当季收入为142亿至146亿美元,'
    '每股收益36至38美分,而分析师平均预测戴尔同期收入为145.2亿美元,每股收益38美分。为抢夺失去的市场份额,戴尔公司一些产品打折力度很大。戴尔公司首席执行官凯文·罗林斯在一份声明中说,'
    '公司在售后服务和产品质量方面一直在投资,同时不断下调价格。戴尔公司将于5月18日公布第一季度的财报。'
)


def a_sub_b(a, b):
    """Return the elements of *a* that are not in *b*, keeping order.

    Duplicates in *a* are preserved. When *b* is a list, tuple, set or
    frozenset of hashable values, membership is tested against a frozenset
    so filtering is linear instead of quadratic. Any other *b* (for example
    a string, where ``in`` means substring) is tested with ``in`` directly,
    exactly as before.
    """
    items = list(a)
    lookup = b
    if isinstance(b, (list, tuple, set, frozenset)):
        try:
            lookup = frozenset(b)
        except TypeError:
            # *b* holds unhashable values: keep the plain container scan.
            lookup = b
    try:
        return [el for el in items if el not in lookup]
    except TypeError:
        # An element of *a* is unhashable, so it cannot be looked up in a
        # frozenset; fall back to scanning *b* itself.
        return [el for el in items if el not in b]


def _to_text(data):
    """Return *data* as text, decoding byte strings.

    UTF-8 is tried first, then GBK with undecodable bytes ignored. Values
    that are not byte strings are returned unchanged.
    """
    if not isinstance(data, bytes):
        return data
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return data.decode('gbk', 'ignore')


def load_stopwords(path=STOPWORD_PATH):
    """Read one stop word per line from *path* and return them as a set.

    Surrounding whitespace is stripped and blank lines are skipped. The
    file is closed as soon as it has been read.
    """
    with open(path, 'rb') as handle:
        text = _to_text(handle.read())
    stripped = (line.strip() for line in text.split(u'\n'))
    return set(word for word in stripped if word)


def read_documents(corpus_dir):
    """Return one text per file found below *corpus_dir*.

    Each text is the file path, a space, then the file content, which is
    how the original script assembled its raw documents.
    """
    documents = []
    for root, _dirs, files in os.walk(os.path.realpath(corpus_dir)):
        for name in files:
            path = os.path.join(root, name)
            # Binary read plus explicit decoding behaves the same on
            # Python 2 and 3; ``with`` closes every handle.
            with open(path, 'rb') as handle:
                content = handle.read()
            documents.append(_to_text(path) + u' ' + _to_text(content))
    return documents


def tokenize(text, stopwords=()):
    """Segment *text* with jieba and drop every token in *stopwords*."""
    # Imported here so the helpers above work without jieba installed.
    import jieba.posseg as pseg

    words = [pair.word for pair in pseg.cut(_to_text(text))]
    return a_sub_b(words, stopwords)


def main(corpus_dir=CORPUS_DIR, stopword_path=STOPWORD_PATH,
         query=QUERY_TEXT, topn=TOP_N):
    """Train Doc2Vec on *corpus_dir* and print the *topn* matches for *query*."""
    # Third-party imports are deferred so that importing this module has no
    # side effects and does not require gensim.
    from gensim import models, corpora, similarities  # noqa: F401 (kept from the original import list)
    from gensim.models.doc2vec import TaggedDocument, Doc2Vec

    stopwords = load_stopwords(stopword_path)

    # Read the files.
    raw_documents = read_documents(corpus_dir)
    if not raw_documents:
        raise SystemExit('no documents found under %s' % corpus_dir)

    # Build the corpus. ``doc`` keeps the plain token lists (not wrapped in
    # TaggedDocument) so they can be printed later.
    doc = [tokenize(text, stopwords) for text in raw_documents]
    corpora_documents = [
        TaggedDocument(words=words, tags=[index])
        for index, words in enumerate(doc)
    ]

    # Create and train the model.
    # NOTE(review): ``size``/``iter``, the argument-less ``train`` call and
    # ``docvecs`` are kept exactly as the original wrote them; newer gensim
    # releases appear to have changed these -- confirm against the
    # installed version.
    model = Doc2Vec(size=50, min_count=1, iter=10)
    model.build_vocab(corpora_documents)
    model.train(corpora_documents)
    # Single-argument print renders the same on Python 2 and 3.
    print('######### %s' % model.vector_size)

    # Query: preprocess the sample text the same way as the corpus.
    inferred_vector = model.infer_vector(tokenize(query, stopwords))
    sims = model.docvecs.most_similar([inferred_vector], topn=topn)
    print(sims)  # sims holds (index_of_document, similarity) tuples
    for tag, _similarity in sims:
        print('################################')
        print(tag)
        print(''.join(doc[tag]))


if __name__ == "__main__":
    main()