├── README.MD
├── data
│   ├── question.csv
│   └── stopword.txt
└── src
    ├── dataset.py
    ├── doc2vec.py
    ├── gensim_model_main.py
    ├── gensim_similarity_main.py
    └── utility.py
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
# Sentence similarity example #
## Features ##
* Sentence-similarity modelling
* Querying for similar sentences with different algorithms
* LDA, LSI and RP models built on gensim
* Similarity indexes built with gensim

## Program description ##
* dataset.py builds the dictionary and the corpus (stored as bag-of-words vectors)
* gensim_model_main.py builds the sentence models (LDA, LSI, RP) from the corpus
  * RP (random projections) compresses the vectors
* gensim_similarity_main.py computes a vector for every sentence in the corpus under each model, then runs similarity queries against the resulting matrix
* doc2vec.py trains a doc2vec model and uses it to find similar sentences

For more discussion of AI (machine learning, deep learning), join QQ group 707403483.
--------------------------------------------------------------------------------
/data/question.csv:
--------------------------------------------------------------------------------
"关于婴儿补钙"
"是不是不能让婴儿经常晒太阳?"
"关于补钙"
"鱼肝油的补充"
"新生儿的健康"
"宝宝从3个月开始厌食不喝奶到现在该怎么办?"
"不知怎么改善宝宝的睡眠呢?"
"孩子烂牙挺厉害,怎么办呀?"
"奶水变少是因为之前没有及时让宝宝吸允奶水的原因吗?怎样让奶水变多啊?"
"十一个月宝宝发烧嗓子疼嘴里有味,如何是好啊?"
"上环后同房就有血,怎么办?"
"33天宝宝吐舌头是怎么回事啊?"
"新生儿一哭,脸发红,皮肤也发红正常吗?"
"奶是不是越挤就会越多啊?"
"男人就应该不做家务吗?"
"请问怎样给宝宝断掉夜奶?"
"5个半月宝宝体重正常是多重?"
"我现在吃的也不好,是不是得补点叶酸呀?"
"生完孩子多久可以用收腹带?"
--------------------------------------------------------------------------------
/data/stopword.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/goodskillprogramer/SentenceSimilarity/fb90401318dd2a34b15e50335ac2001f1b0d3262/data/stopword.txt
--------------------------------------------------------------------------------
/src/dataset.py:
--------------------------------------------------------------------------------
# coding:utf-8

import csv
import jieba
from gensim import corpora

def build_dictionary(corppath, dicsavepath, stoplist=None):
    # Build a gensim dictionary from the jieba-tokenised corpus file.
    stoplist = stoplist or []
    dictionary = corpora.Dictionary(
        jieba.lcut(line) for line in open(corppath, 'r', encoding='utf-8'))
    print(dictionary)
    # Drop stopwords and tokens that occur in only one document.
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    print('stop id and once id len', len(stop_ids), len(once_ids))
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()  # reassign ids to fill the gaps left by filtering
    dictionary.save(dicsavepath)
    print(dictionary)

def load_dictionary(dicsavepath):
    return corpora.Dictionary.load(dicsavepath)

class MyCorpus(object):
    """Streams the raw corpus file as bag-of-words vectors, one document per line."""
    def __init__(self, dic, corp_path):
        self.dict = dic
        self.corp_path = corp_path

    def __iter__(self):
        for line in open(self.corp_path, 'r', encoding='utf-8'):
            yield self.dict.doc2bow(jieba.lcut(line))

def get_question(qpath=r'../data/question.csv'):
    rows = csv.reader(open(qpath, 'r', encoding='utf-8'))
    return [r[0] for r in rows]

def get_stop_words(spath='../data/stopword.txt'):
    stopwords = []
    with open(spath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip('\n')
            if line:
                stopwords.append(line)
    return stopwords

def build_corpora(dicsavepath, raw_corps_path, corps_path):
    dictionary = load_dictionary(dicsavepath)
    corps = MyCorpus(dictionary, raw_corps_path)
    corpus = [corp for corp in corps]
    corpora.MmCorpus.serialize(corps_path, corpus)

def load_corpora(corps_path):
    return corpora.MmCorpus(corps_path)

if __name__ == '__main__':
    dicsavepath = '../data/yuer.dict'
    raw_corps_path = '../data/question.csv'
    corps_path = '../data/yuer.mm'
    stopwords = get_stop_words()
    print('load stop words', len(stopwords))
    build_dictionary(raw_corps_path, dicsavepath, stopwords)
    build_corpora(dicsavepath, raw_corps_path, corps_path)
    print(load_corpora(corps_path))
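
A minimal usage sketch (not from the repo) of the artifacts dataset.py produces, assuming it has been run once so that ../data/yuer.dict and ../data/yuer.mm exist; the query string is made up:

# coding:utf-8
# Sketch only: load the saved dictionary and corpus, then map a new
# sentence to a sparse bag-of-words vector.
import jieba
from dataset import load_dictionary, load_corpora

dictionary = load_dictionary('../data/yuer.dict')
corpus = load_corpora('../data/yuer.mm')

# doc2bow returns (token_id, count) pairs; tokens unknown to the
# dictionary are silently dropped.
bow = dictionary.doc2bow(jieba.lcut('宝宝要怎么补钙?'))
print(bow)
print(corpus)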
--------------------------------------------------------------------------------
/src/doc2vec.py:
--------------------------------------------------------------------------------
# coding:utf-8
import os
import time
import jieba
from gensim.models import doc2vec
from dataset import get_question
from utility import write_to_file

def gen_d2v_corpus(lines, savemodel, istrain=False):
    total_examples = len(lines)
    # Write one jieba-tokenised question per line; TaggedLineDocument tags
    # each line with its line number.
    with open("../data/ques2_result.txt", "wb") as fw:
        for line in lines:
            txt = " ".join(jieba.lcut(line)) + "\n"
            fw.write(txt.encode('utf-8'))

    sents = doc2vec.TaggedLineDocument("../data/ques2_result.txt")

    if os.path.exists(savemodel):
        print('loading model', savemodel, time.asctime())
        model = doc2vec.Doc2Vec.load(savemodel)
        print('loaded model', savemodel, time.asctime())
        if istrain:
            # Keep training indefinitely, checkpointing every 10 rounds;
            # stop with Ctrl-C.
            count = 0
            while True:
                count += 1
                epochs = 20
                model.train(sents, total_examples=total_examples, epochs=epochs)
                if count % 10 == 0:
                    model.save(savemodel + "." + str(count))
                model.save(savemodel)
                print('trained', count * epochs)
    else:
        print('train new model')
        # gensim < 4 API: `size` became `vector_size` in gensim 4.
        model = doc2vec.Doc2Vec(sents, size=300, window=12, min_count=2,
                                workers=4, dm=0)
        print('train', time.asctime())
        model.train(sents, total_examples=total_examples, epochs=200)
        print('train', time.asctime())
        model.save(savemodel)

    # For each question, write its 10 nearest neighbours to a report file.
    save_path = '../data/query.doc2vec.txt'
    write_to_file(save_path, "".encode('utf-8'), mode='wb+')
    for i in range(min(100, total_examples)):
        vs = model.docvecs.most_similar(i)
        for result_indx, distance in vs[:10]:
            txt = '{} {} {} {} {} {}\n'.format(
                i, lines[i], "->", result_indx, lines[result_indx], distance)
            write_to_file(save_path, txt.encode('utf-8'))
        write_to_file(save_path, "\n".encode('utf-8'))

def main():
    questions = get_question()
    savemodel = '../data/gensim.model'
    gen_d2v_corpus(questions, savemodel, istrain=False)

if __name__ == "__main__":
    main()
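
A short sketch (not from the repo) of querying the trained doc2vec model with an unseen sentence, assuming the same gensim < 4 API as above; the query string is made up:

# coding:utf-8
# Sketch only: fold a new sentence into the doc2vec vector space and
# look up its nearest training documents.
import jieba
from gensim.models import doc2vec
from dataset import get_question

model = doc2vec.Doc2Vec.load('../data/gensim.model')
questions = get_question()

vec = model.infer_vector(jieba.lcut('婴儿晒太阳好不好?'))
for doc_id, score in model.docvecs.most_similar(positive=[vec], topn=5):
    print(doc_id, questions[doc_id], score)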
--------------------------------------------------------------------------------
/src/gensim_model_main.py:
--------------------------------------------------------------------------------
# coding:utf-8

import logging

from dataset import load_corpora
from gensim import corpora, models

def tfidf_model(corpus, tfidf_save_path):
    tfidf = models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    tfidf.save(tfidf_save_path)
    return corpus_tfidf

def load_tfidf_model(tfidf_save_path):
    return models.TfidfModel.load(tfidf_save_path)

def lsi_model(corpus_tfidf, dictionary, lsi_save_path):
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
    lsi.print_topics(10)
    lsi.save(lsi_save_path)

def load_lsi_model(lsi_save_path):
    return models.LsiModel.load(lsi_save_path)

def lda_model(corpus, dictionary, lda_save_path, num_topics=300):
    model = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    model.save(lda_save_path)

def load_lda_model(lda_save_path):
    return models.LdaModel.load(lda_save_path)

def build_lsi_model(corpus, dictionary, tfidf_save_path, lsi_save_path):
    corpus_tfidf = tfidf_model(corpus, tfidf_save_path)
    lsi_model(corpus_tfidf, dictionary, lsi_save_path)

def build_rp_model(corpus, dictionary, tfidf_save_path, rp_save_path):
    # Assumes the tf-idf model has already been saved (e.g. by build_lsi_model).
    tfidfmodel = load_tfidf_model(tfidf_save_path)
    corpus_tfidf = tfidfmodel[corpus]
    rp = models.RpModel(corpus_tfidf, num_topics=500)
    rp.save(rp_save_path)

def load_rp_model(rp_save_path):
    return models.RpModel.load(rp_save_path)

def main(corpora_path, dicsavepath, tfidf_save_path, lsi_save_path, lda_save_path, rp_save_path):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    dictionary = corpora.Dictionary.load(dicsavepath)
    corpus = load_corpora(corpora_path)
    build_rp_model(corpus, dictionary, tfidf_save_path, rp_save_path)
    # build_lsi_model(corpus, dictionary, tfidf_save_path, lsi_save_path)
    # lda_model(corpus, dictionary, lda_save_path)

if __name__ == '__main__':
    corpora_path = '../data/yuer.mm'
    dicsavepath = '../data/yuer.dict'
    lsi_save_path = '../data/yuer.lsi'
    lda_save_path = '../data/yuer.lda'
    tfidf_save_path = '../data/yuer.tfidf'
    rp_save_path = '../data/yuer.rp'
    main(corpora_path, dicsavepath, tfidf_save_path, lsi_save_path, lda_save_path, rp_save_path)
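
A minimal sketch (not from the repo) of how the saved models chain together as transformations, assuming the tf-idf and LSI models above have been built; the query string is made up:

# coding:utf-8
# Sketch only: bag-of-words -> tf-idf -> LSI, the same chain the
# similarity queries below rely on.
import jieba
from gensim import corpora
from gensim_model_main import load_tfidf_model, load_lsi_model

dictionary = corpora.Dictionary.load('../data/yuer.dict')
tfidf = load_tfidf_model('../data/yuer.tfidf')
lsi = load_lsi_model('../data/yuer.lsi')

bow = dictionary.doc2bow(jieba.lcut('宝宝睡眠不好怎么办?'))
vec_tfidf = tfidf[bow]    # sparse tf-idf weights
vec_lsi = lsi[vec_tfidf]  # coordinates in the 300-topic LSI space
print(vec_lsi)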
--------------------------------------------------------------------------------
/src/gensim_similarity_main.py:
--------------------------------------------------------------------------------
# coding:utf-8

import os
import logging
import jieba

from gensim import corpora, similarities
from gensim_model_main import load_lda_model, load_lsi_model, load_tfidf_model, load_rp_model
from dataset import load_corpora, get_question
from utility import write_to_file

def lsi_similarity_main(dictionary, tfidf_save_path, lsi_save_path, corpora_path, index_path):
    questions = get_question()
    corpus = load_corpora(corpora_path)

    tfidfmodel = load_tfidf_model(tfidf_save_path)
    lsimodel = load_lsi_model(lsi_save_path)

    corpus_tfidf = tfidfmodel[corpus]

    # Build the dense similarity index once and reuse it afterwards.
    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        index_sim = similarities.MatrixSimilarity(lsimodel[corpus_tfidf])
        index_sim.save(index_path)

    write_to_file('../data/query.lsi.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(len(questions)):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vectfidf = tfidfmodel[vec_bow]
        vec_lsi = lsimodel[vectfidf]
        sims = index_sim[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        for index, distance in sims[:10]:
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index, questions[index], distance)
            write_to_file('../data/query.lsi.txt', txt.encode('utf-8'))
        write_to_file('../data/query.lsi.txt', '\n'.encode('utf-8'))

def lda_similarity_main(dictionary, lda_save_path, corpora_path, index_path):
    questions = get_question()
    corpus = load_corpora(corpora_path)

    ldamodel = load_lda_model(lda_save_path)

    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        index_sim = similarities.MatrixSimilarity(ldamodel[corpus])
        index_sim.save(index_path)

    write_to_file('../data/query.lda.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(len(questions)):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vec_lda = ldamodel[vec_bow]

        sims = index_sim[vec_lda]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        for index, distance in sims[:10]:
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index, questions[index], distance)
            write_to_file('../data/query.lda.txt', txt.encode('utf-8'))
        write_to_file('../data/query.lda.txt', '\n'.encode('utf-8'))

def rp_similarity_main(dictionary, tfidf_save_path, rp_save_path, corpora_path, index_path):
    questions = get_question()
    corpus = load_corpora(corpora_path)

    rpmodel = load_rp_model(rp_save_path)
    tfidfmodel = load_tfidf_model(tfidf_save_path)

    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        print('build matrix similarity')
        corpus_tfidf = tfidfmodel[corpus]
        index_sim = similarities.MatrixSimilarity(rpmodel[corpus_tfidf])
        index_sim.save(index_path)

    write_to_file('../data/query.rp.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(len(questions)):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vectfidf = tfidfmodel[vec_bow]
        vec_rp = rpmodel[vectfidf]

        sims = index_sim[vec_rp]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        for index, distance in sims[:10]:
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index, questions[index], distance)
            write_to_file('../data/query.rp.txt', txt.encode('utf-8'))
        write_to_file('../data/query.rp.txt', '\n'.encode('utf-8'))

if __name__ == '__main__':
    lsi_save_path = '../data/yuer.lsi'
    lda_save_path = '../data/yuer.lda'
    rp_save_path = '../data/yuer.rp'
    tfidf_save_path = '../data/yuer.tfidf'
    dicsavepath = '../data/yuer.dict'
    corpora_path = '../data/yuer.mm'
    index_lsi_path = '../data/yuer.lsi.index'
    index_lda_path = '../data/yuer.lda.index'
    index_rp_path = '../data/yuer.rp.index'
    dictionary = corpora.Dictionary.load(dicsavepath)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    rp_similarity_main(dictionary, tfidf_save_path, rp_save_path, corpora_path, index_rp_path)
    # lsi_similarity_main(dictionary, tfidf_save_path, lsi_save_path, corpora_path, index_lsi_path)
    # lda_similarity_main(dictionary, lda_save_path, corpora_path, index_lda_path)
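
MatrixSimilarity keeps the whole index matrix in RAM, which is fine for this small question set. A sketch (not from the repo) of swapping in gensim's disk-sharded similarities.Similarity for corpora too large for memory; the shard prefix and index path here are made up:

# coding:utf-8
# Sketch only: a sharded index as a drop-in for MatrixSimilarity,
# assuming the tf-idf and LSI models above have been built.
from gensim import corpora, similarities
from gensim_model_main import load_tfidf_model, load_lsi_model
from dataset import load_corpora

tfidf = load_tfidf_model('../data/yuer.tfidf')
lsi = load_lsi_model('../data/yuer.lsi')
corpus = load_corpora('../data/yuer.mm')

# output_prefix names the on-disk shard files; num_features must match
# the dimensionality of the vectors being indexed.
index = similarities.Similarity('../data/yuer.shard', lsi[tfidf[corpus]],
                                num_features=lsi.num_topics)
index.save('../data/yuer.sharded.index')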
--------------------------------------------------------------------------------
/src/utility.py:
--------------------------------------------------------------------------------
try:
    import cPickle  # Python 2
except ImportError:
    import _pickle as cPickle  # Python 3

def save_model(clf, modelpath):
    with open(modelpath, 'wb') as f:
        cPickle.dump(clf, f)

def load_model(modelpath):
    # Returns None if the file is missing or unreadable.
    try:
        with open(modelpath, 'rb') as f:
            return cPickle.load(f)
    except Exception:
        return None

def write_to_file(path, txt, mode='ab+'):
    with open(path, mode=mode) as f:
        f.write(txt)
--------------------------------------------------------------------------------
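
A quick usage sketch (not from the repo) of these helpers; the paths are hypothetical:

# Sketch only: round-trip any picklable object through the pickle helpers
# and append bytes to a log file.
from utility import save_model, load_model, write_to_file

save_model({'num_topics': 300}, '../data/demo.pkl')
print(load_model('../data/demo.pkl'))  # {'num_topics': 300}
write_to_file('../data/demo.log', 'hello\n'.encode('utf-8'))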