├── .deepsource.toml
├── .gitignore
├── .travis.yml
├── DocSim.py
├── LICENSE
├── README.md
├── data
│   ├── stopwords_en.txt
│   └── test_data.txt
├── example.py
├── requirements.txt
└── test_DocSim.py

/.deepsource.toml:
--------------------------------------------------------------------------------
1 | version = 1
2 | 
3 | [[analyzers]]
4 | name = "python"
5 | enabled = true
6 | runtime_version = "3.x.x"
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.pyc
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | 
7 | data/GoogleNews-vectors-negative300.bin
8 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "3.6"
4 | install: "pip install -r requirements.txt"
5 | script: python -c "from DocSim import DocSim"
--------------------------------------------------------------------------------
/DocSim.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class DocSim:
5 |     def __init__(self, w2v_model, stopwords=None):
6 |         self.w2v_model = w2v_model
7 |         self.stopwords = stopwords if stopwords is not None else []
8 | 
9 |     def vectorize(self, doc: str) -> np.ndarray:
10 |         """
11 |         Convert a document into a vector by averaging its word vectors.
12 |         :param doc: the document to vectorize
13 |         :return: the document vector as a numpy array
14 |         """
15 |         doc = doc.lower()
16 |         words = [w for w in doc.split(" ") if w not in self.stopwords]
17 |         word_vecs = []
18 |         for word in words:
19 |             try:
20 |                 vec = self.w2v_model[word]
21 |                 word_vecs.append(vec)
22 |             except KeyError:
23 |                 # Ignore words that don't exist in the vocabulary
24 |                 pass
25 | 
26 |         # Use the mean of all the word vectors as the document vector.
27 |         # Note: there are other, and arguably better, ways to do this.
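28 |         # If every word was a stop word or missing from the vocabulary, fall
29 |         # back to a zero vector (a conservative default) instead of letting
30 |         # np.mean() warn on an empty list and return NaN.
31 |         if not word_vecs:
32 |             return np.zeros(self.w2v_model.vector_size)
33 | 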
34 |         vector = np.mean(word_vecs, axis=0)
35 |         return vector
36 | 
37 |     def _cosine_sim(self, vecA, vecB):
38 |         """Compute the cosine similarity between two vectors."""
39 |         csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
40 |         if np.isnan(np.sum(csim)):
41 |             return 0
42 |         return csim
43 | 
44 |     def calculate_similarity(self, source_doc, target_docs=None, threshold=0):
45 |         """Calculate and return similarity scores between the given source document
46 |         and all the target documents."""
47 |         if not target_docs:
48 |             return []
49 | 
50 |         if isinstance(target_docs, str):
51 |             target_docs = [target_docs]
52 | 
53 |         source_vec = self.vectorize(source_doc)
54 |         results = []
55 |         for doc in target_docs:
56 |             target_vec = self.vectorize(doc)
57 |             sim_score = self._cosine_sim(source_vec, target_vec)
58 |             if sim_score > threshold:
59 |                 results.append({"score": sim_score, "doc": doc})
60 |         # Sort results by score in descending order
61 |         results.sort(key=lambda k: k["score"], reverse=True)
62 | 
63 |         return results
64 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 | 
3 | Copyright (c) 2017 Vishwa Datta, http://vishwa.be
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Document Similarity using Word2Vec
2 | 
3 | Calculate the similarity between documents using a pre-trained word2vec model.
4 | 
5 | ### Usage
6 | 
7 | - Load a pre-trained word2vec model. _Note_: You can use [Google's pre-trained word2vec model](https://bit.ly/w2vgdrive) if you don't have one.
8 | 
9 | ```python
10 | from gensim.models.keyedvectors import KeyedVectors
11 | model_path = './data/GoogleNews-vectors-negative300.bin'
12 | w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
13 | ```
14 | 
15 | - Once the model is loaded, it can be passed to the `DocSim` class to calculate document similarities.
16 | 
17 | ```python
18 | from DocSim import DocSim
19 | ds = DocSim(w2v_model)
20 | ```
21 | 
22 | - Calculate the similarity scores between a source document & a list of target documents.
23 | 
24 | ```python
25 | source_doc = 'how to delete an invoice'
26 | target_docs = ['delete a invoice', 'how do i remove an invoice', 'purge an invoice']
27 | 
28 | # This will return 3 target docs with similarity scores
29 | sim_scores = ds.calculate_similarity(source_doc, target_docs)
30 | 
31 | print(sim_scores)
32 | ```
33 | - Output is as follows:
34 | ```python
35 | [ {'score': 0.99999994, 'doc': 'delete a invoice'},
36 |   {'score': 0.79869318, 'doc': 'how do i remove an invoice'},
37 |   {'score': 0.71488398, 'doc': 'purge an invoice'} ]
38 | ```
39 | 
40 | - _Note_: You can optionally pass a `threshold` argument to the `calculate_similarity()` method to return only the target documents with a similarity score above the threshold.
41 | 
42 | ```python
43 | sim_scores = ds.calculate_similarity(source_doc, target_docs, threshold=0.7)
44 | ```
45 | 
46 | 
47 | ### Requirements
48 | - Python 3 only
49 | - **_gensim_** : to load the word2vec model
50 | - **_numpy_** : to calculate similarity scores
51 | 
52 | ### License
53 | [The MIT License](./LICENSE)
--------------------------------------------------------------------------------
/data/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | a,about,above,across,after,again,against,all,almost,alone,along,already,also,although,always,am,among,an,and,another,any,anybody,anyone,anything,anywhere,are,area,areas,aren't,around,as,ask,asked,asking,asks,at,away,b,back,backed,backing,backs,be,became,because,become,becomes,been,before,began,behind,being,beings,below,best,better,between,big,both,but,by,c,came,can,cannot,can't,case,cases,certain,certainly,clear,clearly,come,could,couldn't,d,did,didn't,differ,different,differently,do,does,doesn't,doing,done,don't,down,downed,downing,downs,during,e,each,early,either,end,ended,ending,ends,enough,even,evenly,ever,every,everybody,everyone,everything,everywhere,f,face,faces,fact,facts,far,felt,few,find,finds,first,for,four,from,full,fully,further,furthered,furthering,furthers,g,gave,general,generally,get,gets,give,given,gives,go,going,good,goods,got,great,greater,greatest,group,grouped,grouping,groups,h,had,hadn't,has,hasn't,have,haven't,having,he,he'd,he'll,her,here,here's,hers,herself,he's,high,higher,highest,him,himself,his,how,however,how's,i,i'd,if,i'll,i'm,important,in,interest,interested,interesting,interests,into,is,isn't,it,its,it's,itself,i've,j,just,k,keep,keeps,kind,knew,know,known,knows,l,large,largely,last,later,latest,least,less,let,lets,let's,like,likely,long,longer,longest,m,made,make,making,man,many,may,me,member,members,men,might,more,most,mostly,mr,mrs,much,must,mustn't,my,myself,n,necessary,need,needed,needing,needs,never,new,newer,newest,next,no,nobody,non,noone,nor,not,nothing,now,nowhere,number,numbers,o,of,off,often,old,older,oldest,on,once,one,only,open,opened,opening,opens,or,order,ordered,ordering,orders,other,others,ought,our,ours,ourselves,out,over,own,p,part,parted,parting,parts,per,perhaps,place,places,point,pointed,pointing,points,possible,present,presented,presenting,presents,problem,problems,put,puts,q,quite,r,rather,really,right,room,rooms,s,said,same,saw,say,says,second,seconds,see,seem,seemed,seeming,seems,sees,several,shall,shan't,she,she'd,she'll,she's,should,shouldn't,show,showed,showing,shows,side,sides,since,small,smaller,smallest,so,some,somebody,someone,something,somewhere,state,states,still,such,sure,t,take,taken,than,that,that's,the,their,theirs,them,themselves,then,there,therefore,there's,these,they,they'd,they'll,they're
,they've,thing,things,think,thinks,this,those,though,thought,thoughts,three,through,thus,to,today,together,too,took,toward,turn,turned,turning,turns,two,u,under,until,up,upon,use,used,uses,v,very,w,want,wanted,wanting,wants,was,wasn't,way,ways,we,we'd,well,we'll,wells,went,were,we're,weren't,we've,what,what's,when,when's,where,where's,whether,which,while,who,whole,whom,who's,whose,why,why's,will,with,within,without,won't,work,worked,working,works,would,wouldn't,x,y,year,years,yes,yet,you,you'd,you'll,young,younger,youngest,your,you're,yours,yourself,yourselves,you've,z
--------------------------------------------------------------------------------
/data/test_data.txt:
--------------------------------------------------------------------------------
1 | 3 10
2 | how 0.5 0.5 0.6 0.3 0.2 0.1 0.4 0.6 0.5 0.5
3 | invoice 0.5 0.5 0.6 0.3 0.2 0.1 0.4 0.6 0.5 0.5
4 | delete 0.5 0.5 0.6 0.3 0.2 0.1 0.4 0.6 0.5 0.5
5 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | from gensim.models.keyedvectors import KeyedVectors
2 | from DocSim import DocSim
3 | 
4 | # Use the pre-trained word2vec model trained on the Google News corpus of 3 billion running words.
5 | # The model can be downloaded here: https://bit.ly/w2vgdrive (~1.4GB)
6 | # Feel free to use your own model.
7 | googlenews_model_path = './data/GoogleNews-vectors-negative300.bin'
8 | stopwords_path = "./data/stopwords_en.txt"
9 | 
10 | model = KeyedVectors.load_word2vec_format(googlenews_model_path, binary=True)
11 | with open(stopwords_path, 'r') as fh:
12 |     stopwords = fh.read().split(",")
13 | ds = DocSim(model, stopwords=stopwords)
14 | 
15 | source_doc = "how to delete an invoice"
16 | target_docs = ['delete a invoice', 'how do i remove an invoice', "purge an invoice"]
17 | 
18 | sim_scores = ds.calculate_similarity(source_doc, target_docs)
19 | 
20 | print(sim_scores)
21 | 
22 | # Prints:
23 | ## [ {'score': 0.99999994, 'doc': 'delete a invoice'},
24 | ##   {'score': 0.79869318, 'doc': 'how do i remove an invoice'},
25 | ##   {'score': 0.71488398, 'doc': 'purge an invoice'} ]
26 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.2
2 | gensim==3.8.1
3 | 
--------------------------------------------------------------------------------
/test_DocSim.py:
--------------------------------------------------------------------------------
1 | from gensim.models.keyedvectors import KeyedVectors
2 | import numpy as np
3 | from DocSim import DocSim
4 | import unittest
5 | 
6 | 
7 | class DocSimTest(unittest.TestCase):
8 |     @classmethod
9 |     def setUpClass(cls):
10 |         test_model_path = './data/test_data.txt'
11 |         cls.w2v_model = KeyedVectors.load_word2vec_format(test_model_path, binary=False)
12 |         cls.stopwords = ['to', 'an', 'a']
13 |         cls.doc_sim = DocSim(cls.w2v_model, cls.stopwords)
14 | 
15 |     def test_vectorize_with_valid_words(self):
16 |         source_doc = 'how to delete an invoice'
17 |         # All dummy word vectors share the same values, so the mean equals each vector
18 |         expected = np.array([0.5, 0.5, 0.6, 0.3, 0.2, 0.1, 0.4, 0.6, 0.5, 0.5])
19 |         actual = self.doc_sim.vectorize(source_doc)
20 |         np.testing.assert_array_almost_equal(actual, expected)
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     unittest.main()
25 | 
--------------------------------------------------------------------------------