├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.rst ├── docs ├── Makefile ├── conf.py └── index.rst ├── setup.cfg ├── setup.py ├── synonym ├── __init__.py └── extractor.py └── test ├── __init__.py ├── sample_synonyms.txt └── test_extractor.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | *.ipynb 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 
| how_to_test_setup_deploy.txt 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Vikash Singh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst LICENSE 2 | recursive-include tests *.py -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | This project has moved to `Flash Text `_. 
2 | ----------------------------------------- 3 | 4 | synonym-extractor 5 | ================= 6 | 7 | Synonym Extractor is a python library that is loosely based on `Aho-Corasick algorithm 8 | `_. 9 | 10 | The idea is to extract words that we care about from a given sentence in one pass. 11 | 12 | Basically say I have a vocabulary of 10K words and I want to get all the words from that set present in a sentence. A simple regex match will take a lot of time to loop over the 10K documents. 13 | 14 | Hence we use a simpler yet much faster algorithm to get the desired result. 15 | 16 | Installation 17 | ------- 18 | :: 19 | 20 | pip install synonym-extractor 21 | 22 | Usage 23 | ------ 24 | :: 25 | 26 | # import module 27 | from synonym.extractor import SynonymExtractor 28 | 29 | # Create an object of SynonymExtractor 30 | synonym_extractor = SynonymExtractor() 31 | 32 | # add synonyms 33 | synonym_names = ['NY', 'new-york', 'SF'] 34 | clean_names = ['new york', 'new york', 'san francisco'] 35 | 36 | for synonym_name, clean_name in zip(synonym_names, clean_names): 37 | synonym_extractor.add_to_synonym(synonym_name, clean_name) 38 | 39 | synonyms_found = synonym_extractor.get_synonyms_from_sentence('I love SF and NY. new-york is the best.') 40 | 41 | synonyms_found 42 | >> ['san francisco', 'new york', 'new york'] 43 | 44 | 45 | Algorithm 46 | ---------- 47 | 48 | synonym-extractor is based on `Aho-Corasick algorithm 49 | `_. 50 | 51 | Documentation 52 | ---------- 53 | 54 | Documentation can be found at `Read the Docs 55 | `_. 56 | 57 | 58 | Why 59 | ------ 60 | 61 | :: 62 | 63 | Say you have a corpus where similar words appear frequently. 64 | 65 | eg: Last weekened I was in NY. 66 | I am traveling to new york next weekend. 67 | 68 | If you train a word2vec model on this or do any sort of NLP it will treat NY and new york as 2 different words. 
69 | 70 | Instead if you create a synonym dictionary like: 71 | 72 | eg: NY=>new york 73 | new york=>new york 74 | 75 | Then you can extract NY and new york as the same text. 76 | 77 | To do the same with regex it will take a lot of time: 78 | 79 | ============ ========== = ========= ============ 80 | Docs count # Synonyms : Regex synonym-extractor 81 | ============ ========== = ========= ============ 82 | 1.5 million 2K : 16 hours NA 83 | 2.5 million 10K : 15 days 15 mins 84 | ============ ========== = ========= ============ 85 | 86 | The idea for this library came from the following `StackOverflow question 87 | `_. 88 | 89 | 90 | License 91 | ------- 92 | 93 | The project is licensed under the MIT license. 94 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/SynonymExtractor.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/SynonymExtractor.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 
112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/SynonymExtractor" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/SynonymExtractor" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 
162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 
214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Synonym Extractor documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Jul 2 16:54:26 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 
38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | # 48 | # source_encoding = 'utf-8-sig' 49 | 50 | # The master toctree document. 51 | master_doc = 'index' 52 | 53 | # General information about the project. 54 | project = 'Synonym Extractor' 55 | copyright = '2017, Vikash Singh' 56 | author = 'Vikash Singh' 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = '0.1.0' 64 | # The full version, including alpha/beta/rc tags. 65 | release = '0.1.0' 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 69 | # 70 | # This is also used if you do content translation via gettext catalogs. 71 | # Usually you set "language" from the command line for these cases. 72 | language = None 73 | 74 | # There are two options for replacing |today|: either, you set today to some 75 | # non-false value, then it is used: 76 | # 77 | # today = '' 78 | # 79 | # Else, today_fmt is used as the format for a strftime call. 80 | # 81 | # today_fmt = '%B %d, %Y' 82 | 83 | # List of patterns, relative to source directory, that match files and 84 | # directories to ignore when looking for source files. 85 | # This patterns also effect to html_static_path and html_extra_path 86 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 87 | 88 | # The reST default role (used for this markup: `text`) to use for all 89 | # documents. 90 | # 91 | # default_role = None 92 | 93 | # If true, '()' will be appended to :func: etc. cross-reference text. 
94 | # 95 | # add_function_parentheses = True 96 | 97 | # If true, the current module name will be prepended to all description 98 | # unit titles (such as .. function::). 99 | # 100 | # add_module_names = True 101 | 102 | # If true, sectionauthor and moduleauthor directives will be shown in the 103 | # output. They are ignored by default. 104 | # 105 | # show_authors = False 106 | 107 | # The name of the Pygments (syntax highlighting) style to use. 108 | pygments_style = 'sphinx' 109 | 110 | # A list of ignored prefixes for module index sorting. 111 | # modindex_common_prefix = [] 112 | 113 | # If true, keep warnings as "system message" paragraphs in the built documents. 114 | # keep_warnings = False 115 | 116 | # If true, `todo` and `todoList` produce output, else they produce nothing. 117 | todo_include_todos = False 118 | 119 | 120 | # -- Options for HTML output ---------------------------------------------- 121 | 122 | # The theme to use for HTML and HTML Help pages. See the documentation for 123 | # a list of builtin themes. 124 | # 125 | html_theme = 'alabaster' 126 | 127 | # Theme options are theme-specific and customize the look and feel of a theme 128 | # further. For a list of options available for each theme, see the 129 | # documentation. 130 | # 131 | # html_theme_options = {} 132 | 133 | # Add any paths that contain custom themes here, relative to this directory. 134 | # html_theme_path = [] 135 | 136 | # The name for this set of Sphinx documents. 137 | # " v documentation" by default. 138 | # 139 | # html_title = 'Synonym Extractor v0.1.0' 140 | 141 | # A shorter title for the navigation bar. Default is the same as html_title. 142 | # 143 | # html_short_title = None 144 | 145 | # The name of an image file (relative to this directory) to place at the top 146 | # of the sidebar. 147 | # 148 | # html_logo = None 149 | 150 | # The name of an image file (relative to this directory) to use as a favicon of 151 | # the docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 152 | # pixels large. 153 | # 154 | # html_favicon = None 155 | 156 | # Add any paths that contain custom static files (such as style sheets) here, 157 | # relative to this directory. They are copied after the builtin static files, 158 | # so a file named "default.css" will overwrite the builtin "default.css". 159 | html_static_path = ['_static'] 160 | 161 | # Add any extra paths that contain custom files (such as robots.txt or 162 | # .htaccess) here, relative to this directory. These files are copied 163 | # directly to the root of the documentation. 164 | # 165 | # html_extra_path = [] 166 | 167 | # If not None, a 'Last updated on:' timestamp is inserted at every page 168 | # bottom, using the given strftime format. 169 | # The empty string is equivalent to '%b %d, %Y'. 170 | # 171 | # html_last_updated_fmt = None 172 | 173 | # If true, SmartyPants will be used to convert quotes and dashes to 174 | # typographically correct entities. 175 | # 176 | # html_use_smartypants = True 177 | 178 | # Custom sidebar templates, maps document names to template names. 179 | # 180 | # html_sidebars = {} 181 | 182 | # Additional templates that should be rendered to pages, maps page names to 183 | # template names. 184 | # 185 | # html_additional_pages = {} 186 | 187 | # If false, no module index is generated. 188 | # 189 | # html_domain_indices = True 190 | 191 | # If false, no index is generated. 192 | # 193 | # html_use_index = True 194 | 195 | # If true, the index is split into individual pages for each letter. 196 | # 197 | # html_split_index = False 198 | 199 | # If true, links to the reST sources are added to the pages. 200 | # 201 | # html_show_sourcelink = True 202 | 203 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 204 | # 205 | # html_show_sphinx = True 206 | 207 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
208 | # 209 | # html_show_copyright = True 210 | 211 | # If true, an OpenSearch description file will be output, and all pages will 212 | # contain a tag referring to it. The value of this option must be the 213 | # base URL from which the finished HTML is served. 214 | # 215 | # html_use_opensearch = '' 216 | 217 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 218 | # html_file_suffix = None 219 | 220 | # Language to be used for generating the HTML full-text search index. 221 | # Sphinx supports the following languages: 222 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 223 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 224 | # 225 | # html_search_language = 'en' 226 | 227 | # A dictionary with options for the search language support, empty by default. 228 | # 'ja' uses this config value. 229 | # 'zh' user can custom change `jieba` dictionary path. 230 | # 231 | # html_search_options = {'type': 'default'} 232 | 233 | # The name of a javascript file (relative to the configuration directory) that 234 | # implements a search results scorer. If empty, the default will be used. 235 | # 236 | # html_search_scorer = 'scorer.js' 237 | 238 | # Output file base name for HTML help builder. 239 | htmlhelp_basename = 'SynonymExtractordoc' 240 | 241 | # -- Options for LaTeX output --------------------------------------------- 242 | 243 | latex_elements = { 244 | # The paper size ('letterpaper' or 'a4paper'). 245 | # 246 | # 'papersize': 'letterpaper', 247 | 248 | # The font size ('10pt', '11pt' or '12pt'). 249 | # 250 | # 'pointsize': '10pt', 251 | 252 | # Additional stuff for the LaTeX preamble. 253 | # 254 | # 'preamble': '', 255 | 256 | # Latex figure (float) alignment 257 | # 258 | # 'figure_align': 'htbp', 259 | } 260 | 261 | # Grouping the document tree into LaTeX files. List of tuples 262 | # (source start file, target name, title, 263 | # author, documentclass [howto, manual, or own class]). 
264 | latex_documents = [ 265 | (master_doc, 'SynonymExtractor.tex', 'Synonym Extractor Documentation', 266 | 'Vikash Singh', 'manual'), 267 | ] 268 | 269 | # The name of an image file (relative to this directory) to place at the top of 270 | # the title page. 271 | # 272 | # latex_logo = None 273 | 274 | # For "manual" documents, if this is true, then toplevel headings are parts, 275 | # not chapters. 276 | # 277 | # latex_use_parts = False 278 | 279 | # If true, show page references after internal links. 280 | # 281 | # latex_show_pagerefs = False 282 | 283 | # If true, show URL addresses after external links. 284 | # 285 | # latex_show_urls = False 286 | 287 | # Documents to append as an appendix to all manuals. 288 | # 289 | # latex_appendices = [] 290 | 291 | # It false, will not define \strong, \code, itleref, \crossref ... but only 292 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 293 | # packages. 294 | # 295 | # latex_keep_old_macro_names = True 296 | 297 | # If false, no module index is generated. 298 | # 299 | # latex_domain_indices = True 300 | 301 | 302 | # -- Options for manual page output --------------------------------------- 303 | 304 | # One entry per manual page. List of tuples 305 | # (source start file, name, description, authors, manual section). 306 | man_pages = [ 307 | (master_doc, 'synonymextractor', 'Synonym Extractor Documentation', 308 | [author], 1) 309 | ] 310 | 311 | # If true, show URL addresses after external links. 312 | # 313 | # man_show_urls = False 314 | 315 | 316 | # -- Options for Texinfo output ------------------------------------------- 317 | 318 | # Grouping the document tree into Texinfo files. 
List of tuples 319 | # (source start file, target name, title, author, 320 | # dir menu entry, description, category) 321 | texinfo_documents = [ 322 | (master_doc, 'SynonymExtractor', 'Synonym Extractor Documentation', 323 | author, 'SynonymExtractor', 'One line description of project.', 324 | 'Miscellaneous'), 325 | ] 326 | 327 | # Documents to append as an appendix to all manuals. 328 | # 329 | # texinfo_appendices = [] 330 | 331 | # If false, no module index is generated. 332 | # 333 | # texinfo_domain_indices = True 334 | 335 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 336 | # 337 | # texinfo_show_urls = 'footnote' 338 | 339 | # If true, do not generate a @detailmenu in the "Top" node's menu. 340 | # 341 | # texinfo_no_detailmenu = False 342 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | This project has moved to `Flash Text `_. 2 | ----------------------------------------- 3 | 4 | Welcome to Synonym Extractor's documentation! 5 | ============================================= 6 | 7 | This project has moved to `Flash Text 8 | `_. 9 | 10 | Synonym Extractor is a python library that is loosely based on `Aho-Corasick algorithm `_. 11 | 12 | The idea is to extract words that we care about from a given sentence in one pass. 13 | 14 | Basically say I have a vocabulary of 10K words and I want to get all the words from that set present in a sentence. A simple regex match will take a lot of time to loop over the 10K documents. 15 | 16 | Hence we use a simpler yet much faster algorithm to get the desired result. 17 | 18 | Why 19 | ------ 20 | 21 | :: 22 | 23 | Say you have a corpus where similar words appear frequently. 24 | 25 | eg: Last weekened I was in NY. 26 | I am traveling to new york next weekend. 
27 | 28 | If you train a word2vec model on this or do any sort of NLP it will treat NY and new york as 2 different words. 29 | 30 | Instead if you create a synonym dictionary like: 31 | 32 | eg: NY=>new york 33 | new york=>new york 34 | 35 | Then you can extract NY and new york as the same text. 36 | 37 | To do the same with regex it will take a lot of time: 38 | 39 | ============ ========== = ========= ============ 40 | Docs count # Synonyms : Regex synonym-extractor 41 | ============ ========== = ========= ============ 42 | 1.5 million 2K : 16 hours NA 43 | 2.5 million 10K : 15 days 15 mins 44 | ============ ========== = ========= ============ 45 | 46 | The idea for this library came from the following `StackOverflow question 47 | `_. 48 | 49 | 50 | Installation 51 | ------------ 52 | :: 53 | 54 | pip install synonym-extractor 55 | 56 | API Reference 57 | ------------- 58 | 59 | Begin by importing the module:: 60 | 61 | >>> from synonym.extractor import SynonymExtractor 62 | 63 | Create object:: 64 | 65 | >>> synonym_extractor = SynonymExtractor() 66 | >>> # by default SynonymExtractor is case insensitive. 67 | >>> # for case_sensitive use SynonymExtractor(case_sensitive=True) 68 | 69 | Add synonyms to the class:: 70 | 71 | >>> synonym_names = ['NY', 'new-york', 'SF'] 72 | >>> clean_names = ['New York', 'New York', 'san francisco'] 73 | 74 | >>> for synonym_name, clean_name in zip(synonym_names, clean_names): 75 | >>> synonym_extractor.add_to_synonym(synonym_name, clean_name) 76 | 77 | Get synonyms present in sentence:: 78 | 79 | >>> synonyms_found = synonym_extractor.get_synonyms_from_sentence('I love SF and NY. 
new-york is the best.') 80 | >>> synonyms_found 81 | ['san francisco', 'New York', 'New York'] 82 | 83 | Define Synonyms 84 | ~~~~~~~~~~~~~~~~~ 85 | 86 | There are 3 ways to define synonyms 87 | 88 | * Build iteratively:: 89 | 90 | >>> synonym_extractor.add_to_synonym('madras', 'chennai') 91 | 92 | * Build with a dict:: 93 | 94 | >>> synonyms_dict = { 95 | >>> "java":["java_2e","java programing"], 96 | >>> "product management":["PM", "product manager"] 97 | >>> } 98 | >>> synonym_extractor.add_to_synonyms_from_dict(synonyms_dict) 99 | 100 | * Pass a file path:: 101 | 102 | >>> # Format supported is 103 | >>> # madras=>chennai 104 | >>> # SF=>san francisco 105 | >>> # NY=>New York 106 | >>> # new-york=>New York 107 | 108 | >>> synonym_extractor.build_synonym('/file_path_to_synonyms.txt') 109 | 110 | 111 | Extract Synonyms 112 | ~~~~~~~~~~~~~~~~~ 113 | :: 114 | 115 | >>> # This method extracts all matching synonyms in the sentence and returns a list 116 | 117 | >>> synonym_extractor.get_synonyms_from_sentence('I love SF and NY. New-york is the best.') 118 | ['san francisco', 'New York', 'New York'] 119 | 120 | Replace Internal White Space Characters 121 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 122 | :: 123 | 124 | >>> # change the internal white space characters 125 | 126 | >>> synonym_extractor = SynonymExtractor() 127 | >>> synonym_extractor._set_white_space_chars(set(['.', ' '])) 128 | 129 | 130 | Contribute 131 | ---------- 132 | 133 | - Issue Tracker: https://github.com/vi3k6i5/synonym-extractor/issues 134 | - Source Code: https://github.com/vi3k6i5/synonym-extractor/ 135 | 136 | 137 | License 138 | ------- 139 | 140 | The project is licensed under the MIT license. 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Command 2 | import subprocess 3 | 4 | 5 | class PyTest(Command): 6 | user_options = [] 7 | 8 | def initialize_options(self): 9 | pass 10 | 11 | def finalize_options(self): 12 | pass 13 | 14 | def run(self): 15 | errno = subprocess.call(['py.test']) 16 | raise SystemExit(errno) 17 | 18 | setup( 19 | name='synonym-extractor', 20 | version='1.0', 21 | url='http://github.com/vi3k6i5/synonym-extractor', 22 | author='Vikash Singh', 23 | author_email='vikash.duliajan@gmail.com', 24 | description='Extract synonyms from sentences using Aho Corasick algorithm', 25 | long_description=open('README.rst').read(), 26 | packages=['synonym'], 27 | install_requires=[], 28 | platforms='any', 29 | cmdclass={'test': PyTest}, 30 | classifiers=[ 31 | 'Programming Language :: Python', 32 | 'Programming Language :: Python :: 2', 33 | 'Programming Language :: Python :: 2.6', 34 | 'Programming Language :: Python :: 2.7', 35 | 'Programming Language :: Python :: 3.5', 36 | 'Operating System :: OS Independent', 37 | 'Intended Audience :: Developers', 38 | 'License :: OSI Approved :: MIT License', 39 | ] 40 | ) 41 | -------------------------------------------------------------------------------- /synonym/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vi3k6i5/synonym-extractor/18019914ad1ab3cdf0dd50f3d2fcd8bbbb5045d1/synonym/__init__.py -------------------------------------------------------------------------------- /synonym/extractor.py: 
import os
import warnings

# Emit the deprecation warning on every import, not only the first time.
warnings.simplefilter('always', DeprecationWarning)
warnings.warn("This project has been depricated. " +
              "Please use FlashText https://github.com/vi3k6i5/flashtext instead.", DeprecationWarning)


class SynonymExtractor(object):
    """Extract known synonyms from sentences using a character trie.

    Parameters
    ----------
    case_sensitive : boolean, default False
        If you want the module to be case sensitive set it to True

    Attributes
    ----------
    `_end` : string, default '_end_'
        used to denote end of word in synonym_trie_dict
    `_synonym` : string, default '_synonym_'
        key in dict. used to store cleaned synonym name which will be returned
    `_white_space_chars` : set, default set(['.', '\t', '\n', '\a', ' '])
        values which will be used to identify if we have reached end of term
    `synonym_trie_dict` : dict, default {}
        trie dict built character by character, that is used for lookup
    `case_sensitive` : boolean, default False
        if the algorithm should be case sensitive or not

    Examples
    --------
    >>> # import module
    >>> from synonym.extractor import SynonymExtractor

    >>> # Create an object of SynonymExtractor
    >>> synonym_extractor = SynonymExtractor()

    >>> # add synonyms
    >>> synonym_names = ['NY', 'new-york', 'SF']
    >>> clean_names = ['new york', 'new york', 'san francisco']

    >>> for synonym_name, clean_name in zip(synonym_names, clean_names):
    >>>     synonym_extractor.add_to_synonym(synonym_name, clean_name)

    >>> synonyms_found = synonym_extractor.get_synonyms_from_sentence('I love SF and NY. new-york is the best.')

    >>> synonyms_found
    >>> ['san francisco', 'new york', 'new york']

    References
    ----------
    loosely based on https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm.
    Idea came from this question https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster
    """

    def __init__(self, case_sensitive=False):
        # sentinel key marking "a synonym ends at this trie node"
        self._end = '_end_'
        # key under which the clean (returned) form of the synonym is stored
        self._synonym = '_synonym_'
        # characters treated as term boundaries while scanning a sentence
        self._white_space_chars = set(['.', '\t', '\n', '\a', ' '])
        # trie: nested dicts keyed character by character
        self.synonym_trie_dict = dict()
        self.case_sensitive = case_sensitive

    def _set_white_space_chars(self, white_space_chars):
        """Replace the built-in set of word-boundary characters.

        Parameters
        ----------
        white_space_chars: set
            Set of characters that will be considered as whitespaces.
            This will denote that the term has ended.
        """
        self._white_space_chars = white_space_chars

    def add_to_synonym(self, synonym_name, clean_name):
        """Add one synonym mapping to the trie.

        Parameters
        ----------
        synonym_name : string
            keyword that you want to identify
        clean_name : string
            clean term for that keyword that you would want to get back
            in return

        Notes
        -----
        Silently does nothing if either argument is empty/None.
        """
        if synonym_name and clean_name:
            if not self.case_sensitive:
                synonym_name = synonym_name.lower()
            current_dict = self.synonym_trie_dict
            # descend/extend the trie one character at a time
            for letter in synonym_name:
                current_dict = current_dict.setdefault(letter, {})
            # mark the terminal node and remember the clean form to return
            current_dict[self._synonym] = clean_name
            current_dict[self._end] = self._end

    def build_synonym(self, synonym_file):
        """Load synonym mappings from a file, one per line.

        Expected file format::

            java_2e=>java
            java programing=>java
            product management=>product management
            product management techniques=>product management

        Parameters
        ----------
        synonym_file : str
            Path to the synonym file.

        Raises
        ------
        IOError
            If `synonym_file` is not an existing file.
        """
        if not os.path.isfile(synonym_file):
            # Bug fix: the original did `raise("... %s".format(...))`, which
            # raises a TypeError (can't raise a str) and never interpolated
            # the path ('%s' is printf-style, not str.format-style).
            raise IOError("Invalid file path {}".format(synonym_file))
        with open(synonym_file) as f:
            for line in f:
                # skip blank or malformed lines instead of crashing on split
                if '=>' not in line:
                    continue
                # split only on the first '=>' so the clean name may contain it
                synonym_name, clean_name = line.split('=>', 1)
                # Bug fix: the original passed the undefined name
                # `unclean_name`, raising NameError on every call.
                # Lower-casing is handled inside add_to_synonym.
                self.add_to_synonym(synonym_name, clean_name.strip())

    def add_to_synonyms_from_dict(self, synonym_dict):
        """Add synonym mappings from a dictionary.

        Dict format should be like::

            {
                "java": ["java_2e", "java programing"],
                "product management": ["PM", "product manager"]
            }

        Parameters
        ----------
        synonym_dict : dict
            Maps each clean name to a list of synonyms for it.
        """
        for clean_name, synonym_names in synonym_dict.items():
            for synonym_name in synonym_names:
                # add_to_synonym lower-cases when case-insensitive,
                # so no pre-processing is needed here
                self.add_to_synonym(synonym_name, clean_name)

    def get_synonyms_from_sentence(self, sentence):
        """Get synonyms from the input sentence.

        Parameters
        ----------
        sentence : string
            Line of text that you want to extract all terms from

        Returns
        -------
        synonyms_extracted : list
            Clean names of all terms found in sentence, in order of match.
        """
        if not self.case_sensitive:
            sentence = sentence.lower()
        synonyms_extracted = []
        current_dict = self.synonym_trie_dict
        idx = 0
        sentence_len = len(sentence)
        while idx < sentence_len:
            char = sentence[idx]
            # word boundary reached
            if char in self._white_space_chars:
                # either a synonym just ended here, or the boundary char
                # itself is part of a longer synonym (e.g. '.' inside one)
                if self._end in current_dict or char in current_dict:
                    sequence_found = None
                    longest_sequence_found = None
                    if self._end in current_dict:
                        # a complete synonym ends exactly at this boundary
                        sequence_found = current_dict[self._synonym]
                        longest_sequence_found = current_dict[self._synonym]

                    # greedily look ahead for a longer synonym that
                    # continues through this boundary character
                    if char in current_dict:
                        current_dict_continued = current_dict[char]

                        idy = idx + 1
                        while idy < sentence_len:
                            inner_char = sentence[idy]
                            if inner_char in current_dict_continued:
                                current_dict_continued = current_dict_continued[inner_char]
                            else:
                                break
                            if self._end in current_dict_continued:
                                # found a longer complete synonym
                                longest_sequence_found = current_dict_continued[self._synonym]
                            idy += 1
                        if longest_sequence_found != sequence_found:
                            # a longer match won; resume after it
                            idx = idy
                    current_dict = self.synonym_trie_dict
                    if longest_sequence_found:
                        synonyms_extracted.append(longest_sequence_found)

                else:
                    # no match in progress; restart from the trie root
                    current_dict = self.synonym_trie_dict
            elif char in current_dict:
                # match continues with this character
                current_dict = current_dict[char]
            else:
                # dead end: restart and skip to the end of the current word,
                # since no synonym can start mid-word
                current_dict = self.synonym_trie_dict
                idy = idx + 1
                while idy < sentence_len:
                    char = sentence[idy]
                    if char in self._white_space_chars:
                        break
                    idy += 1
                idx = idy
            # at end of sentence, flush any synonym that ends on the last char
            if idx + 1 >= sentence_len:
                if self._end in current_dict:
                    sequence_found = current_dict[self._synonym]
                    synonyms_extracted.append(sequence_found)
            idx += 1
        return synonyms_extracted
from synonym.extractor import SynonymExtractor


class TestSynonymExtractor(object):
    """Exercises SynonymExtractor against a small fixture of synonyms."""

    def setup_method(self, method):
        # Build a fresh extractor for every test so cases stay independent.
        print("start testing")
        self.synonym_extractor = SynonymExtractor()
        fixtures = [
            ('NY', 'New York'),
            ('new-york', 'New York'),
            ('SF', 'san francisco'),
        ]
        for synonym_name, clean_name in fixtures:
            self.synonym_extractor.add_to_synonym(synonym_name, clean_name)

    def teardown_method(self, method):
        print("end testing")

    def test_extract_synonyms(self):
        # Both clean names must be reported for this mixed-case sentence.
        sentence = 'I love SF and Ny. New-york is the best.'
        synonyms_found = self.synonym_extractor.get_synonyms_from_sentence(sentence)
        assert all(x in synonyms_found for x in ['New York', 'san francisco'])