├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── inverted_index ├── __init__.py └── core.py ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── context.py └── test_inverted_index.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .hypothesis/ 92 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Will Fitzgerald 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init: 2 | pip install -r requirements.txt 3 | 4 | test: 5 | nosetests tests 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Inverted Index 2 | ============== 3 | 4 | A simple in-memory inverted index system, with a modest query language. 
5 | 6 | 7 | i = inverted_index.Index() 8 | i.index(1, "this is the day they give babies away with half a pound of tea") 9 | i.index(1, "if you know any ladies who need any babies just send them round to ") 10 | i.index(2, "babies are born in the circle of the sun") 11 | results, err = i.query("babies") 12 | print(results) 13 | {1,2} 14 | results, err = i.query("babies AND ladies") 15 | print(results) 16 | {1} 17 | i.index(3, "WHERE ARE THE BABIES", tokenizer=lambda s:s.lower().split()) 18 | results, err = i.query("babies") 19 | print(results) 20 | {1,2,3} 21 | i.unindex(3) 22 | results, err = i.query("babies") 23 | print(results) 24 | {1,2} 25 | 26 | Any hashable object can be the "document", and a tokenizer can be specified to tokenize the 27 | text to index. There are also `index_token` and `index_tokens` methods to directly index on individual 28 | tokens. 29 | 30 | The query language is very simple: it understands AND, OR, NOT, and parentheses. For example: 31 | 32 | term OR term 33 | term AND term OR term 34 | (term AND term) OR term 35 | NOT term 36 | NOT term AND (term OR term) 37 | 38 | `AND`, `OR`, and `NOT` have equal precedence, so use parentheses to disambiguate. 39 | 40 | I'm pretty sure you don't want to use this in production code :) 41 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/inverted_index.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/inverted_index.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/inverted_index" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/inverted_index" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # inverted_index documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Apr 16 21:22:43 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import os 15 | import sys 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | #sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | #needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = [] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | #source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 
44 | project = u'inverted_index' 45 | copyright = u'2016, Will Fitzgerald' 46 | 47 | # The version info for the project you're documenting, acts as replacement for 48 | # |version| and |release|, also used in various other places throughout the 49 | # built documents. 50 | # 51 | # The short X.Y version. 52 | version = 'v0.0.1' 53 | # The full version, including alpha/beta/rc tags. 54 | release = 'v0.0.1' 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | #language = None 59 | 60 | # There are two options for replacing |today|: either, you set today to some 61 | # non-false value, then it is used: 62 | #today = '' 63 | # Else, today_fmt is used as the format for a strftime call. 64 | #today_fmt = '%B %d, %Y' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | exclude_patterns = ['_build'] 69 | 70 | # The reST default role (used for this markup: `text`) to use for all documents. 71 | #default_role = None 72 | 73 | # If true, '()' will be appended to :func: etc. cross-reference text. 74 | #add_function_parentheses = True 75 | 76 | # If true, the current module name will be prepended to all description 77 | # unit titles (such as .. function::). 78 | #add_module_names = True 79 | 80 | # If true, sectionauthor and moduleauthor directives will be shown in the 81 | # output. They are ignored by default. 82 | #show_authors = False 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | #modindex_common_prefix = [] 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 
94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 
144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'inverted_indexdoc' 168 | 169 | # -- Options for LaTeX output -------------------------------------------------- 170 | 171 | latex_elements = { 172 | # The paper size ('letterpaper' or 'a4paper'). 173 | #'papersize': 'letterpaper', 174 | 175 | # The font size ('10pt', '11pt' or '12pt'). 176 | #'pointsize': '10pt', 177 | 178 | # Additional stuff for the LaTeX preamble. 179 | #'preamble': '', 180 | } 181 | 182 | # Grouping the document tree into LaTeX files. List of tuples 183 | # (source start file, target name, title, author, documentclass [howto/manual]). 184 | latex_documents = [ 185 | ('index', 'inverted_index.tex', u'inverted_index Documentation', 186 | u'Kenneth Reitz', 'manual'), 187 | ] 188 | 189 | # The name of an image file (relative to this directory) to place at the top of 190 | # the title page. 191 | #latex_logo = None 192 | 193 | # For "manual" documents, if this is true, then toplevel headings are parts, 194 | # not chapters. 195 | #latex_use_parts = False 196 | 197 | # If true, show page references after internal links. 
198 | #latex_show_pagerefs = False 199 | 200 | # If true, show URL addresses after external links. 201 | #latex_show_urls = False 202 | 203 | # Documents to append as an appendix to all manuals. 204 | #latex_appendices = [] 205 | 206 | # If false, no module index is generated. 207 | #latex_domain_indices = True 208 | 209 | # -- Options for manual page output -------------------------------------------- 210 | 211 | # One entry per manual page. List of tuples 212 | # (source start file, name, description, authors, manual section). 213 | man_pages = [ 214 | ('index', 'inverted_index', u'inverted_index Documentation', 215 | [u'Kenneth Reitz'], 1) 216 | ] 217 | 218 | # If true, show URL addresses after external links. 219 | #man_show_urls = False 220 | 221 | # -- Options for Texinfo output ------------------------------------------------ 222 | 223 | # Grouping the document tree into Texinfo files. List of tuples 224 | # (source start file, target name, title, author, 225 | # dir menu entry, description, category) 226 | texinfo_documents = [ 227 | ('index', 'inverted_index', u'Inverted Index Documentation', 228 | u'Will Fitzgerald Reitz', 'inverted_index', 229 | 'Simple in memory inverted index.', 'Miscellaneous'), 230 | ] 231 | 232 | # Documents to append as an appendix to all manuals. 233 | #texinfo_appendices = [] 234 | 235 | # If false, no module index is generated. 236 | #texinfo_domain_indices = True 237 | 238 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 239 | #texinfo_show_urls = 'footnote' 240 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. sample documentation master file, created by 2 | sphinx-quickstart on Mon Apr 16 21:22:43 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to sample's documentation! 
7 | ================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. 
doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 
100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sample.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sample.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /inverted_index/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import Index, reduce_by_intersection 2 | -------------------------------------------------------------------------------- /inverted_index/core.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import collections 3 | import warnings 4 | from functools import reduce 5 | 6 | 7 | class Index(object): 8 | def __init__(self): 9 | self.inverted_index = dict() 10 | self.reserved = {'AND': 2, 'OR': 2, '(': None, ')': None, 'NOT': 1} 11 | self.document_counts = collections.Counter() 12 | self.token_counts = collections.Counter() 13 | self.documents = dict() 14 | self.operations = { 15 | "AND": reduce_by_intersection, 16 | "OR": 17 | lambda args: reduce(lambda s1, s2: s1.union(s2), args, set()), 18 | "NOT": lambda args: self.document_ids().difference(args[0]) 19 | } 20 | 21 | def cardinality(self, operator): 22 | return self.reserved[operator] 23 | 24 | def document_ids(self): 
25 | return set(self.document_counts.keys()) 26 | 27 | def document(self, document_id): 28 | try: 29 | return (self.documents[document_id], None) 30 | except KeyError as e: 31 | return (None, e) 32 | 33 | def index_token(self, document_id, token): 34 | self.document_counts[document_id] += 1 35 | self.token_counts[token] += 1 36 | if token not in self.inverted_index: 37 | self.inverted_index[token] = collections.Counter() 38 | self.inverted_index[token][document_id] += 1 39 | 40 | def index_tokens(self, document_id, tokens): 41 | for token in tokens: 42 | self.index_token(document_id, token) 43 | 44 | def index(self, document_id, sentence, tokenizer=lambda s: s.split()): 45 | self.index_tokens(document_id, tokenizer(sentence)) 46 | 47 | def index_document(self, 48 | document_id, 49 | document, 50 | tokenizer=lambda s: s.split()): 51 | for key, value in document.items(): 52 | self.index_field(document_id, key, to_list(value), tokenizer) 53 | self.documents[document_id] = document 54 | 55 | def index_field(self, 56 | document_id, 57 | field_name, 58 | field_values, 59 | tokenizer=lambda s: s.split()): 60 | for value in to_list(field_values): 61 | self.index(document_id, value, tokenizer) 62 | tokens = ["{0}:{1}".format(field_name, token) 63 | for token in tokenizer(value)] 64 | self.index_tokens(document_id, tokens) 65 | 66 | def unindex_field(self, 67 | document_id, 68 | field_name, 69 | field_values=None, 70 | tokenizer=lambda s: s.split()): 71 | if not field_values: 72 | document, err = self.document(document_id) 73 | if err: 74 | field_values = [] 75 | else: 76 | field_values = to_list(document.get(field_name, [])) 77 | for value in field_values: 78 | self.unindex_string(document_id, value, tokenizer) 79 | tokens = ["{0}:{1}".format(field_name, token) 80 | for token in tokenizer(value)] 81 | self.unindex_tokens(document_id, tokens) 82 | 83 | def unindex_string(self, 84 | document_id, 85 | sentence, 86 | tokenizer=lambda s: s.split()): 87 | 
self.unindex_tokens(document_id, tokenizer(sentence)) 88 | 89 | def unindex_tokens(self, document_id, tokens): 90 | removes = [] 91 | for token in tokens: 92 | if document_id in self.inverted_index[token]: 93 | # decrease inverted_index count 94 | token_count = self.inverted_index[token][document_id] 95 | del self.inverted_index[token][document_id] 96 | count = self.inverted_index[token][document_id] 97 | # decrease doc count 98 | self.document_counts[document_id] -= token_count 99 | count = self.document_counts[document_id] 100 | if count == 0: 101 | del self.document_counts[document_id] 102 | self.token_counts[token] -= token_count 103 | count = self.token_counts[token] 104 | 105 | if count == 0: 106 | del self.token_counts[token] 107 | if len(self.inverted_index[token]) == 0: 108 | removes.append(token) 109 | for token in removes: 110 | del self.inverted_index[token] 111 | 112 | def unindex_document(self, document_id, tokenizer=lambda s: s.split()): 113 | document, err = self.document(document_id) 114 | if document: 115 | for key, value in document.items(): 116 | self.unindex_field(document_id, key, to_list(value), tokenizer) 117 | if document_id in self.documents: 118 | del self.documents[document_id] 119 | 120 | def unindex(self, document_id): 121 | self.unindex_tokens(document_id, self.inverted_index.keys()) 122 | 123 | def query_token(self, token): 124 | return set( 125 | self.inverted_index.get(token, collections.Counter()).keys()) 126 | 127 | def query(self, q): 128 | try: 129 | return (self.process_query( 130 | q.replace('(', ' ( ').replace(')', ' ) ').split()), None) 131 | except Exception as e: 132 | return (set(), e) 133 | 134 | def process_query(self, expr): 135 | def is_term(token): 136 | return token not in self.reserved 137 | 138 | def is_op(token): 139 | return token in self.operations 140 | 141 | def is_lp(token): 142 | return token == '(' 143 | 144 | def is_rp(token): 145 | return token == ')' 146 | 147 | def apply_operator(op, args): 148 | fn = 
def to_list(item):
    """Coerce ``item`` to a list.

    * a ``str`` (or ``bytes``) is treated as one value and wrapped:
      ``"abc"`` -> ``["abc"]``;
    * a ``list`` is returned unchanged (same object, no copy);
    * any other iterable (tuple, set, generator, ...) is materialised
      with ``list(item)``;
    * anything else (numbers, ``None``, ...) is wrapped in a
      single-element list.

    :param item: the value to normalise.
    :returns: a list as described above.
    """
    # Strings are iterable, but callers want them as one value, not
    # split into characters.
    if isinstance(item, (str, bytes)):
        return [item]
    if isinstance(item, list):
        return item
    try:
        iterator = iter(item)
    except TypeError:
        # Not iterable: a scalar value.
        return [item]
    # The earlier code built this list and then discarded it, so
    # iterables came back wrapped as a single element.
    return list(iterator)
# -*- coding: utf-8 -*-
"""Packaging script for the in-memory inverted index library."""

import io
import os

from setuptools import find_packages, setup

# Resolve data files relative to this script so the build does not depend
# on the current working directory.
HERE = os.path.abspath(os.path.dirname(__file__))

# The repository ships README.md; opening 'README.rst' fails because no
# such file exists in the tree.
with io.open(os.path.join(HERE, 'README.md'), encoding='utf-8') as f:
    readme = f.read()

# Named ``license_text`` so the builtin ``license`` is not shadowed.
with io.open(os.path.join(HERE, 'LICENSE'), encoding='utf-8') as f:
    license_text = f.read()

# NOTE(review): name='sample' looks like a leftover from a project
# template; it is kept unchanged here because renaming a distribution is
# a release decision -- confirm the intended name.
setup(
    name='sample',
    version='0.0.1',
    description='Simple in-memory inverted index code',
    long_description=readme,
    # README.md is Markdown; tell the package index how to render it.
    long_description_content_type='text/markdown',
    author='Will Fitzgerald',
    author_email='will.fitzgerald@pobox.com',
    url='https://github.com/willf/inverted_index',
    license=license_text,
    packages=find_packages(exclude=('tests', 'docs')))
def test_index_with_tokenizer(self):
    """A custom tokenizer replaces the default whitespace split."""

    def whole_sentence(sentence):
        # Treat the entire sentence as one token.
        return [sentence]

    idx = inverted_index.Index()
    idx.index(
        1,
        "this is the day they give babies away",
        tokenizer=whole_sentence)
    distinct_tokens = len(idx.inverted_index)
    assert distinct_tokens == 1
def test_query_simple_NOT(self):
    """``NOT term`` returns the indexed documents that lack ``term``."""
    i = inverted_index.Index()
    i.index(1, "i love bess")
    i.index(2, "i love liz")
    i.index(3, "i love mark")

    # Check this query's own error before issuing another query; the
    # earlier version overwrote ``err`` first, so a failure here could
    # go unnoticed.
    s, err = i.query('NOT bess')
    assert err is None
    assert s == set([2, 3])

    # Cross-check against the complement computed by hand (this value
    # used to be computed but never asserted).
    a, err = i.query("bess")
    assert err is None
    assert s == i.document_ids().difference(a)

    # Every document contains "i", so its negation is empty.
    s, err = i.query('NOT i')
    assert err is None
    assert s == set()
def test_unindex(self):
    """Unindexing the only document leaves every table empty."""
    idx = inverted_index.Index()
    idx.index(1, "i love bess")
    idx.unindex(1)
    # Same checks, same order: postings, per-token counts, per-document
    # counts.
    tables = (idx.inverted_index, idx.token_counts, idx.document_counts)
    for table in tables:
        assert len(table) == 0
def test_doc_not_found(self):
    """Looking up an unknown document id reports an error."""
    idx = inverted_index.Index()
    lookup = idx.document('notfound')
    # ``document`` returns a (document, error) pair; only the error is
    # checked here.
    _document, error = lookup
    assert error is not None