├── .deepsource.toml
├── .gitignore
├── .travis.yml
├── DocSim.py
├── LICENSE
├── README.md
├── data
│   ├── stopwords_en.txt
│   └── test_data.txt
├── example.py
├── requirements.txt
└── test_DocSim.py

/.deepsource.toml:
--------------------------------------------------------------------------------
1 | version = 1
2 | 
3 | [[analyzers]]
4 | name = "python"
5 | enabled = true
6 | runtime_version = "3.x.x"
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.pyc
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | 
7 | data/GoogleNews-vectors-negative300.bin
8 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "3.6"
4 | install: "pip install -r requirements.txt"
5 | script: python -c "from DocSim import DocSim"
--------------------------------------------------------------------------------
/DocSim.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class DocSim:
5 |     def __init__(self, w2v_model, stopwords=None):
6 |         self.w2v_model = w2v_model
7 |         self.stopwords = stopwords if stopwords is not None else []
8 | 
9 |     def vectorize(self, doc: str) -> np.ndarray:
10 |         """
11 |         Convert a document into a vector by averaging its word vectors.
12 |         :param doc: the document to vectorize
13 |         :return: the document vector as a numpy array
14 |         """
15 |         doc = doc.lower()
16 |         words = [w for w in doc.split(" ") if w not in self.stopwords]
17 |         word_vecs = []
18 |         for word in words:
19 |             try:
20 |                 vec = self.w2v_model[word]
21 |                 word_vecs.append(vec)
22 |             except KeyError:
23 |                 # Ignore words that don't exist in the vocabulary
24 |                 pass
25 | 
26 |         # Use the mean of all the word vectors as the document vector.
27 |         # Note: there are other, and arguably better, ways to do this.
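28 |         # If every word was a stop word or missing from the vocabulary, fall
29 |         # back to a zero vector (a conservative default) instead of letting
30 |         # np.mean() warn on an empty list and return NaN.
31 |         if not word_vecs:
32 |             return np.zeros(self.w2v_model.vector_size)
33 | 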
34 |         vector = np.mean(word_vecs, axis=0)
35 |         return vector
36 | 
37 |     def _cosine_sim(self, vecA, vecB):
38 |         """Compute the cosine similarity between two vectors."""
39 |         csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
40 |         if np.isnan(np.sum(csim)):
41 |             return 0
42 |         return csim
43 | 
44 |     def calculate_similarity(self, source_doc, target_docs=None, threshold=0):
45 |         """Calculate and return similarity scores between the given source document
46 |         and all the target documents."""
47 |         if not target_docs:
48 |             return []
49 | 
50 |         if isinstance(target_docs, str):
51 |             target_docs = [target_docs]
52 | 
53 |         source_vec = self.vectorize(source_doc)
54 |         results = []
55 |         for doc in target_docs:
56 |             target_vec = self.vectorize(doc)
57 |             sim_score = self._cosine_sim(source_vec, target_vec)
58 |             if sim_score > threshold:
59 |                 results.append({"score": sim_score, "doc": doc})
60 |         # Sort results by score in descending order
61 |         results.sort(key=lambda k: k["score"], reverse=True)
62 | 
63 |         return results
64 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 | 
3 | Copyright (c) 2017 Vishwa Datta, http://vishwa.be
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Document Similarity using Word2Vec
2 | 
3 | Calculate the similarity between documents using a pre-trained word2vec model.
4 | 
5 | ### Usage
6 | 
7 | - Load a pre-trained word2vec model. _Note_: You can use [Google's pre-trained word2vec model](https://bit.ly/w2vgdrive) if you don't have one.
8 | 
9 | ```python
10 | from gensim.models.keyedvectors import KeyedVectors
11 | model_path = './data/GoogleNews-vectors-negative300.bin'
12 | w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
13 | ```
14 | 
15 | - Once the model is loaded, it can be passed to the `DocSim` class to calculate document similarities.
16 | 
17 | ```python
18 | from DocSim import DocSim
19 | ds = DocSim(w2v_model)
20 | ```
21 | 
22 | - Calculate the similarity scores between a source document & a list of target documents.
23 | 
24 | ```python
25 | source_doc = 'how to delete an invoice'
26 | target_docs = ['delete a invoice', 'how do i remove an invoice', 'purge an invoice']
27 | 
28 | # This will return 3 target docs with similarity scores
29 | sim_scores = ds.calculate_similarity(source_doc, target_docs)
30 | 
31 | print(sim_scores)
32 | ```
33 | - Output is as follows:
34 | ```python
35 | [ {'score': 0.99999994, 'doc': 'delete a invoice'},
36 |   {'score': 0.79869318, 'doc': 'how do i remove an invoice'},
37 |   {'score': 0.71488398, 'doc': 'purge an invoice'} ]
38 | ```
39 | 
40 | - _Note_: You can optionally pass a `threshold` argument to the `calculate_similarity()` method to return only the target documents with a similarity score above the threshold.
41 | 
42 | ```python
43 | sim_scores = ds.calculate_similarity(source_doc, target_docs, threshold=0.7)
44 | ```
45 | 
46 | 
47 | ### Requirements
48 | - Python 3 only
49 | - **_gensim_** : to load the word2vec model
50 | - **_numpy_** : to calculate similarity scores
51 | 
52 | ### License
53 | [The MIT License](./LICENSE)
--------------------------------------------------------------------------------
/data/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | a,about,above,across,after,again,against,all,almost,alone,along,already,also,although,always,am,among,an,and,another,any,anybody,anyone,anything,anywhere,are,area,areas,aren't,around,as,ask,asked,asking,asks,at,away,b,back,backed,backing,backs,be,became,because,become,becomes,been,before,began,behind,being,beings,below,best,better,between,big,both,but,by,c,came,can,cannot,can't,case,cases,certain,certainly,clear,clearly,come,could,couldn't,d,did,didn't,differ,different,differently,do,does,doesn't,doing,done,don't,down,downed,downing,downs,during,e,each,early,either,end,ended,ending,ends,enough,even,evenly,ever,every,everybody,everyone,everything,everywhere,f,face,faces,fact,facts,far,felt,few,find,finds,first,for,four,from,full,fully,further,furthered,furthering,furthers,g,gave,general,generally,get,gets,give,given,gives,go,going,good,goods,got,great,greater,greatest,group,grouped,grouping,groups,h,had,hadn't,has,hasn't,have,haven't,having,he,he'd,he'll,her,here,here's,hers,herself,he's,high,higher,highest,him,himself,his,how,however,how's,i,i'd,if,i'll,i'm,important,in,interest,interested,interesting,interests,into,is,isn't,it,its,it's,itself,i've,j,just,k,keep,keeps,kind,knew,know,known,knows,l,large,largely,last,later,latest,least,less,let,lets,let's,like,likely,long,longer,longest,m,made,make,making,man,many,may,me,member,members,men,might,more,most,mostly,mr,mrs,much,must,mustn't,my,myself,n,necessary,need,needed,needing,needs,never,new,newer,newest,next,no,nobody,non,noone,nor,not,nothing,now,nowhere,number,numbers,o,of,off,often,old,older,oldest,on,once,one,only,open,opened,opening,opens,or,order,ordered,ordering,orders,other,others,ought,our,ours,ourselves,out,over,own,p,part,parted,parting,parts,per,perhaps,place,places,point,pointed,pointing,points,possible,present,presented,presenting,presents,problem,problems,put,puts,q,quite,r,rather,really,right,room,rooms,s,said,same,saw,say,says,second,seconds,see,seem,seemed,seeming,seems,sees,several,shall,shan't,she,she'd,she'll,she's,should,shouldn't,show,showed,showing,shows,side,sides,since,small,smaller,smallest,so,some,somebody,someone,something,somewhere,state,states,still,such,sure,t,take,taken,than,that,that's,the,their,theirs,them,themselves,then,there,therefore,there's,these,they,they'd,they'll,they're
,they've,thing,things,think,thinks,this,those,though,thought,thoughts,three,through,thus,to,today,together,too,took,toward,turn,turned,turning,turns,two,u,under,until,up,upon,use,used,uses,v,very,w,want,wanted,wanting,wants,was,wasn't,way,ways,we,we'd,well,we'll,wells,went,were,we're,weren't,we've,what,what's,when,when's,where,where's,whether,which,while,who,whole,whom,who's,whose,why,why's,will,with,within,without,won't,work,worked,working,works,would,wouldn't,x,y,year,years,yes,yet,you,you'd,you'll,young,younger,youngest,your,you're,yours,yourself,yourselves,you've,z
--------------------------------------------------------------------------------
/data/test_data.txt:
--------------------------------------------------------------------------------
1 | 3 10
2 | how 0.5 0.5 0.6 0.3 0.2 0.1 0.4 0.6 0.5 0.5
3 | invoice 0.5 0.5 0.6 0.3 0.2 0.1 0.4 0.6 0.5 0.5
4 | delete 0.5 0.5 0.6 0.3 0.2 0.1 0.4 0.6 0.5 0.5
5 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | from gensim.models.keyedvectors import KeyedVectors
2 | from DocSim import DocSim
3 | 
4 | # Use the pre-trained word2vec model trained on the Google News corpus of 3 billion running words.
5 | # The model can be downloaded here: https://bit.ly/w2vgdrive (~1.4GB)
6 | # Feel free to use your own model.
7 | googlenews_model_path = './data/GoogleNews-vectors-negative300.bin'
8 | stopwords_path = "./data/stopwords_en.txt"
9 | 
10 | model = KeyedVectors.load_word2vec_format(googlenews_model_path, binary=True)
11 | with open(stopwords_path, 'r') as fh:
12 |     stopwords = fh.read().split(",")
13 | ds = DocSim(model, stopwords=stopwords)
14 | 
15 | source_doc = "how to delete an invoice"
16 | target_docs = ['delete a invoice', 'how do i remove an invoice', "purge an invoice"]
17 | 
18 | sim_scores = ds.calculate_similarity(source_doc, target_docs)
19 | 
20 | print(sim_scores)
21 | 
22 | # Prints:
23 | ## [ {'score': 0.99999994, 'doc': 'delete a invoice'},
24 | ##   {'score': 0.79869318, 'doc': 'how do i remove an invoice'},
25 | ##   {'score': 0.71488398, 'doc': 'purge an invoice'} ]
26 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.2
2 | gensim==3.8.1
3 | 
--------------------------------------------------------------------------------
/test_DocSim.py:
--------------------------------------------------------------------------------
1 | from gensim.models.keyedvectors import KeyedVectors
2 | import numpy as np
3 | from DocSim import DocSim
4 | import unittest
5 | 
6 | 
7 | class DocSimTest(unittest.TestCase):
8 |     @classmethod
9 |     def setUpClass(cls):
10 |         test_model_path = './data/test_data.txt'
11 |         cls.w2v_model = KeyedVectors.load_word2vec_format(test_model_path, binary=False)
12 |         cls.stopwords = ['to', 'an', 'a']
13 |         cls.doc_sim = DocSim(cls.w2v_model, cls.stopwords)
14 | 
15 |     def test_vectorize_with_valid_words(self):
16 |         source_doc = 'how to delete an invoice'
17 |         # All dummy word vectors share the same values, so the mean equals each vector
18 |         expected = np.array([0.5, 0.5, 0.6, 0.3, 0.2, 0.1, 0.4, 0.6, 0.5, 0.5])
19 |         actual = self.doc_sim.vectorize(source_doc)
20 |         np.testing.assert_array_almost_equal(actual, expected)
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     unittest.main()
25 | 
--------------------------------------------------------------------------------