├── data ├── asdf.txt ├── etaoinShrdlu.txt ├── quickBrownFox.txt ├── nowIsTheTime.txt └── loremIpsum.txt ├── README.md ├── src ├── tfidf_cos_dist.py └── cos_dist.py └── .gitignore /data/asdf.txt: -------------------------------------------------------------------------------- 1 | asdf -------------------------------------------------------------------------------- /data/etaoinShrdlu.txt: -------------------------------------------------------------------------------- 1 | ETAOIN SHRDLU -------------------------------------------------------------------------------- /data/quickBrownFox.txt: -------------------------------------------------------------------------------- 1 | The quick brown fox jumps over the lazy dog -------------------------------------------------------------------------------- /data/nowIsTheTime.txt: -------------------------------------------------------------------------------- 1 | Now is the time for all good men to come to the aid of the party -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # document-similarity 2 | 3 | Attempts to quantify the similarity between two documents. 4 | 5 | ### cosine similarity 6 | 7 | Converts two documents to vectors and computes the similarity between those vectors. 8 | Similarity is calculated by taking the inner product space that measures the cosine angle between them. 9 | -------------------------------------------------------------------------------- /data/loremIpsum.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. -------------------------------------------------------------------------------- /src/tfidf_cos_dist.py: -------------------------------------------------------------------------------- 1 | # Based on the implementation found at https://stackoverflow.com/a/8897648 2 | 3 | from sklearn.feature_extraction.text import TfidfVectorizer 4 | 5 | text_files = ['../data/nowIsTheTime.txt', '../data/quickBrownFox.txt'] 6 | documents = [open(f, encoding="utf8").read() for f in text_files] 7 | tfidf = TfidfVectorizer().fit_transform(documents) 8 | # no need to normalize, since Vectorizer will return normalized tf-idf 9 | pairwise_similarity = (tfidf * tfidf.T).A 10 | 11 | print(pairwise_similarity) 12 | -------------------------------------------------------------------------------- /src/cos_dist.py: -------------------------------------------------------------------------------- 1 | # Calculate document distance given two files 2 | # Uses cosine formula described on Wikipedia: https://en.wikipedia.org/wiki/Cosine_similarity 3 | # Based on implementation by vpekar: http://stackoverflow.com/q/15173225 4 | 5 | import math 6 | import re 7 | from collections import Counter 8 | 9 | # regular expression 10 | # \w matches any alphanumeric character and the underscore 11 | # + causes the RE to match 1 or more repetitions of the preceding RE 12 | WORD = re.compile(r'\w+') 13 | 14 | 15 | def textToVector(text): 16 | words = WORD.findall(text) 17 | # unordered collection where elements are stored as dict keys, and counts are stored as dict vals 18 | return Counter(words) 19 | 20 | 21 | def cosDistance(vector1, vector2): 22 | # set of unordered collection of unique items 23 | intersection = set(vector1.keys()) & set(vector2.keys()) # return set with elements in intersection 24 | numerator = sum([vector1[x] * vector2[x] for x in intersection]) 25 | 26 | sum1 = sum([vector1[x] ** 2 for x in vector1.keys()]) 27 | sum2 = sum([vector2[x] ** 2 for x in vector2.keys()]) 28 | denominator = math.sqrt(sum1) * math.sqrt(sum2) 29 | 30 | if not denominator: 31 | return 0.0 32 | else: 33 | return float(numerator) / denominator 34 | 35 | 36 | def readFile(fileName): 37 | return open("../data/" + fileName, 'r').read() 38 | 39 | 40 | text1 = readFile("nowIsTheTime.txt") 41 | text2 = readFile("quickBrownFox.txt") 42 | 43 | vector1 = textToVector(text1) 44 | vector2 = textToVector(text2) 45 | 46 | cosine = cosDistance(vector1, vector2) 47 | 48 | print("Cosine Distance:\t", cosine) 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/osx,linux,python,windows,pycharm+all 3 | 4 | ### Linux ### 5 | *~ 6 | 7 | # temporary files which can be created if a process still has a handle open of a deleted file 8 | .fuse_hidden* 9 | 10 | # KDE directory preferences 11 | .directory 12 | 13 | # Linux trash folder which might appear on any partition or disk 14 | .Trash-* 15 | 16 | # .nfs files are created when an open file is removed but is still being accessed 17 | .nfs* 18 | 19 | ### OSX ### 20 | # General 21 | .DS_Store 22 | .AppleDouble 23 | .LSOverride 24 | 25 | # Icon must end with two \r 26 | Icon 27 | 28 | # Thumbnails 29 | ._* 30 | 31 | # Files that might appear in the root of a volume 32 | .DocumentRevisions-V100 33 | .fseventsd 34 | .Spotlight-V100 35 | .TemporaryItems 36 | .Trashes 37 | .VolumeIcon.icns 38 | .com.apple.timemachine.donotpresent 39 | 40 | # Directories potentially created on remote AFP share 41 | .AppleDB 42 | .AppleDesktop 43 | Network Trash Folder 44 | Temporary Items 45 | .apdisk 46 | 47 | ### PyCharm+all ### 48 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 49 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 50 | 51 | # User-specific stuff 52 | .idea/**/workspace.xml 53 | .idea/**/tasks.xml 54 | .idea/**/usage.statistics.xml 55 | .idea/**/dictionaries 56 | .idea/**/shelf 57 | 58 | # Sensitive or high-churn files 59 | .idea/**/dataSources/ 60 | .idea/**/dataSources.ids 61 | .idea/**/dataSources.local.xml 62 | .idea/**/sqlDataSources.xml 63 | .idea/**/dynamic.xml 64 | .idea/**/uiDesigner.xml 65 | .idea/**/dbnavigator.xml 66 | 67 | # Gradle 68 | .idea/**/gradle.xml 69 | .idea/**/libraries 70 | 71 | # Gradle and Maven with auto-import 72 | # When using Gradle or Maven with auto-import, you should exclude module files, 73 | # since they will be recreated, and may cause churn. Uncomment if using 74 | # auto-import. 75 | # .idea/modules.xml 76 | # .idea/*.iml 77 | # .idea/modules 78 | 79 | # CMake 80 | cmake-build-*/ 81 | 82 | # Mongo Explorer plugin 83 | .idea/**/mongoSettings.xml 84 | 85 | # File-based project format 86 | *.iws 87 | 88 | # IntelliJ 89 | out/ 90 | 91 | # mpeltonen/sbt-idea plugin 92 | .idea_modules/ 93 | 94 | # JIRA plugin 95 | atlassian-ide-plugin.xml 96 | 97 | # Cursive Clojure plugin 98 | .idea/replstate.xml 99 | 100 | # Crashlytics plugin (for Android Studio and IntelliJ) 101 | com_crashlytics_export_strings.xml 102 | crashlytics.properties 103 | crashlytics-build.properties 104 | fabric.properties 105 | 106 | # Editor-based Rest Client 107 | .idea/httpRequests 108 | 109 | ### PyCharm+all Patch ### 110 | # Ignores the whole .idea folder and all .iml files 111 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 112 | 113 | .idea/ 114 | 115 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 116 | 117 | *.iml 118 | modules.xml 119 | .idea/misc.xml 120 | *.ipr 121 | 122 | ### Python ### 123 | # Byte-compiled / optimized / DLL files 124 | __pycache__/ 125 | *.py[cod] 126 | *$py.class 127 | 128 | # C extensions 129 | *.so 130 | 131 | # Distribution / packaging 132 | .Python 133 | build/ 134 | develop-eggs/ 135 | dist/ 136 | downloads/ 137 | eggs/ 138 | .eggs/ 139 | lib/ 140 | lib64/ 141 | parts/ 142 | sdist/ 143 | var/ 144 | wheels/ 145 | *.egg-info/ 146 | .installed.cfg 147 | *.egg 148 | MANIFEST 149 | 150 | # PyInstaller 151 | # Usually these files are written by a python script from a template 152 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 153 | *.manifest 154 | *.spec 155 | 156 | # Installer logs 157 | pip-log.txt 158 | pip-delete-this-directory.txt 159 | 160 | # Unit test / coverage reports 161 | htmlcov/ 162 | .tox/ 163 | .coverage 164 | .coverage.* 165 | .cache 166 | nosetests.xml 167 | coverage.xml 168 | *.cover 169 | .hypothesis/ 170 | .pytest_cache/ 171 | 172 | # Translations 173 | *.mo 174 | *.pot 175 | 176 | # Django stuff: 177 | *.log 178 | local_settings.py 179 | db.sqlite3 180 | 181 | # Flask stuff: 182 | instance/ 183 | .webassets-cache 184 | 185 | # Scrapy stuff: 186 | .scrapy 187 | 188 | # Sphinx documentation 189 | docs/_build/ 190 | 191 | # PyBuilder 192 | target/ 193 | 194 | # Jupyter Notebook 195 | .ipynb_checkpoints 196 | 197 | # pyenv 198 | .python-version 199 | 200 | # celery beat schedule file 201 | celerybeat-schedule 202 | 203 | # SageMath parsed files 204 | *.sage.py 205 | 206 | # Environments 207 | .env 208 | .venv 209 | env/ 210 | venv/ 211 | ENV/ 212 | env.bak/ 213 | venv.bak/ 214 | 215 | # Spyder project settings 216 | .spyderproject 217 | .spyproject 218 | 219 | # Rope project settings 220 | .ropeproject 221 | 222 | # mkdocs documentation 223 | /site 224 | 225 | # mypy 226 | .mypy_cache/ 227 | 228 | ### Python Patch ### 229 | .venv/ 230 | 231 | ### Python.VirtualEnv Stack ### 232 | # Virtualenv 233 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 234 | [Bb]in 235 | [Ii]nclude 236 | [Ll]ib 237 | [Ll]ib64 238 | [Ll]ocal 239 | [Ss]cripts 240 | pyvenv.cfg 241 | pip-selfcheck.json 242 | 243 | ### Windows ### 244 | # Windows thumbnail cache files 245 | Thumbs.db 246 | ehthumbs.db 247 | ehthumbs_vista.db 248 | 249 | # Dump file 250 | *.stackdump 251 | 252 | # Folder config file 253 | [Dd]esktop.ini 254 | 255 | # Recycle Bin used on file shares 256 | $RECYCLE.BIN/ 257 | 258 | # Windows Installer files 259 | *.cab 260 | *.msi 261 | *.msix 262 | *.msm 263 | *.msp 264 | 265 | # Windows shortcuts 266 | *.lnk 267 | 268 | 269 | # End of https://www.gitignore.io/api/osx,linux,python,windows,pycharm+all 270 | --------------------------------------------------------------------------------