├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── doc ├── Makefile ├── conf.py ├── doc_reader.rst ├── freq_tools.rst ├── index.rst ├── query_scorer.rst ├── sample.py ├── sample.rst ├── sim_index.rst ├── sim_index │ ├── concurrent_sim_index.rst │ ├── map_sim_index.rst │ ├── memory_sim_index.rst │ ├── remote_sim_index.rst │ ├── shelf_sim_index.rst │ ├── sim_index.rst │ └── sim_index_collection.rst ├── sim_server.rst ├── similarity.rst ├── static │ └── custom.css └── term_vec.rst ├── pysimsearch ├── __init__.py ├── doc_reader.py ├── exceptions.py ├── freq_tools.py ├── query_scorer.py ├── sim_index │ ├── __init__.py │ ├── concurrent_sim_index.py │ ├── map_sim_index.py │ ├── memory_sim_index.py │ ├── remote_sim_index.py │ ├── shelf_sim_index.py │ ├── sim_index.py │ └── sim_index_collection.py ├── sim_server.py ├── similarity.py ├── term_vec.py └── test │ ├── __init__.py │ ├── freq_tools_test.py │ ├── sim_index_test.py │ ├── similarity_test.py │ └── term_vec_test.py ├── sample.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.idx 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010, Taher Haveliwala 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 
11 | * The names of project contributors may not be used to endorse or 12 | promote products derived from this software without specific 13 | prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 | HOLDER OR CONTRIBUTOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include sample.py 4 | recursive-include docs *.html *.css *.png *.gif 5 | recursive-include doc * 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pysimsearch 2 | =========== 3 | 4 | Python library for indexing and similarity-search. 5 | 6 | Full documentation is at http://taherh.github.com/pysimsearch/ 7 | 8 | This library is primarily meant to illustrate the basic workings of similarity 9 | and indexing engines, without focusing heavily on optimization. Certain 10 | patterns used for scaling indexes (e.g., distributed indexes) are included. 
11 | 12 | Although the code is currently for Python 2.7 series, we use ``__future__`` 13 | imports to match Python 3 as closely as possible. 14 | 15 | If you are interested in learning more about search and information retrieval, 16 | I highly recommend the following two books: 17 | 18 | * [Managing Gigabytes](http://amzn.to/qg6Zhe), by Witten, Moffat, and Bell 19 | * [Introduction to Information Retrieval](http://amzn.to/oz2O27), by Manning, Schütze, and Raghavan 20 | 21 | Sample command-line usage 22 | ------------------------- 23 | 24 | Compute pair-wise similarity of 3 webpages: 25 | 26 | bash$ python pysimsearch/similarity.py http://www.stanford.edu/ http://www.berkeley.edu/ http://www.mit.edu/ 27 | Comparing files ['http://www.stanford.edu/', 'http://www.berkeley.edu/', 'http://www.mit.edu/'] 28 | sim(http://www.stanford.edu/,http://www.berkeley.edu/)=0.322771960247 29 | sim(http://www.stanford.edu/,http://www.mit.edu/)=0.142787018368 30 | sim(http://www.berkeley.edu/,http://www.mit.edu/)=0.248877629741 31 | 32 | Sample API usage 33 | ---------------- 34 | 35 | from __future__ import(division, absolute_import, print_function, 36 | unicode_literals) 37 | 38 | from pprint import pprint 39 | from pysimsearch.sim_index import MemorySimIndex 40 | from pysimsearch import doc_reader 41 | from pysimsearch import similarity 42 | 43 | # Compare web-page similarities 44 | print("Printing pairwise similarities of university homepages") 45 | pprint(similarity.pairwise_compare(urls=['http://www.stanford.edu/', 46 | 'http://www.berkeley.edu/', 47 | 'http://www.ucla.edu', 48 | 'http://www.mit.edu/'])) 49 | 50 | # Create an in-memory index and query it 51 | print("Creating in-memory index of university homepages") 52 | sim_index = MemorySimIndex() 53 | sim_index.index_urls('http://www.stanford.edu/', 54 | 'http://www.berkeley.edu', 55 | 'http://www.ucla.edu', 56 | 'http://www.mit.edu') 57 | 58 | print("Postings list for 'university':") 59 | 
pprint(sim_index.postings_list('university')) 60 | print("Pages containing terms 'university' and 'california'") 61 | pprint(list(sim_index.docnames_with_terms('university', 'california'))) 62 | 63 | # Issue some similarity queries 64 | print("Similarity search for query 'stanford university'") 65 | sim_index.set_query_scorer('simple_count') 66 | pprint(list(sim_index.query('stanford university'))) 67 | 68 | 69 | Sample Client/Server Usage via JSON api 70 | --------------------------------------- 71 | 72 | *Server* 73 | 74 | bash$ ./sim_server.py sim_index -p 9001 75 | Use Control-C to exit 76 | 77 | *Client* 78 | 79 | >>> from pprint import pprint 80 | >>> import jsonrpclib 81 | >>> server = jsonrpclib.Server('http://localhost:9001/RPC2') 82 | >>> server.sim_index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu') 83 | >>> pprint(server.sim_index.query('stanford university')) 84 | [[u'http://www.stanford.edu', 0.4396892551666724], 85 | [u'http://www.berkeley.edu', 0.0], 86 | [u'http://www.ucla.edu', 0.0]] 87 | 88 | 89 | Sample SimIndexCollection Usage 90 | ------------------------------- 91 | 92 | *Server* 93 | 94 | bash$ ./sim_server.py sim_index -p 9001 & 95 | bash$ ./sim_server.py sim_index -p 9002 & 96 | 97 | *SimIndexCollection* 98 | 99 | >>> from pprint import pprint 100 | >>> from pysimsearch.sim_index import SimIndexCollection 101 | >>> from pysimsearch.sim_index import RemoteSimIndex 102 | >>> servers = [ 103 | RemoteSimIndex('http://localhost:9001/RPC2'), 104 | RemoteSimIndex('http://localhost:9002/RPC2') 105 | ] 106 | >>> index_coll = SimIndexCollection() 107 | >>> index_coll.add_shards(*servers) 108 | >>> index_coll.set_query_scorer('tfidf') 109 | >>> index_coll.index_urls('http://www.stanford.edu/', 110 | 'http://www.berkeley.edu', 111 | 'http://www.ucla.edu', 112 | 'http://www.mit.edu') 113 | >>> pprint(index_coll.query("stanford university")) 114 | [[u'http://www.stanford.edu/', 0.5836102697341475], 115 | 
[u'http://www.ucla.edu', 0.012839879268194701], 116 | [u'http://www.berkeley.edu', 0.005337522642134812]] 117 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PySimSearch.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PySimSearch.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PySimSearch" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PySimSearch" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." 131 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PySimSearch documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jul 25 22:06:33 2011. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath('..')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinxtogithub'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'pysimsearch' 44 | copyright = u'2011, Taher Haveliwala' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.32' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.32' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 
57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | html_style = 'custom.css' 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | #html_theme_options = {} 102 | 103 | # Add any paths that contain custom themes here, relative to this directory. 104 | #html_theme_path = [] 105 | 106 | # The name for this set of Sphinx documents. If None, it defaults to 107 | # " v documentation". 108 | #html_title = None 109 | 110 | # A shorter title for the navigation bar. 
Default is the same as html_title. 111 | #html_short_title = None 112 | 113 | # The name of an image file (relative to this directory) to place at the top 114 | # of the sidebar. 115 | #html_logo = None 116 | 117 | # The name of an image file (within the static path) to use as favicon of the 118 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 119 | # pixels large. 120 | #html_favicon = None 121 | 122 | # Add any paths that contain custom static files (such as style sheets) here, 123 | # relative to this directory. They are copied after the builtin static files, 124 | # so a file named "default.css" will overwrite the builtin "default.css". 125 | html_static_path = ['static'] 126 | 127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 128 | # using the given strftime format. 129 | #html_last_updated_fmt = '%b %d, %Y' 130 | 131 | # If true, SmartyPants will be used to convert quotes and dashes to 132 | # typographically correct entities. 133 | #html_use_smartypants = True 134 | 135 | # Custom sidebar templates, maps document names to template names. 136 | #html_sidebars = {} 137 | 138 | # Additional templates that should be rendered to pages, maps page names to 139 | # template names. 140 | #html_additional_pages = {} 141 | 142 | # If false, no module index is generated. 143 | #html_domain_indices = True 144 | 145 | # If false, no index is generated. 146 | #html_use_index = True 147 | 148 | # If true, the index is split into individual pages for each letter. 149 | #html_split_index = False 150 | 151 | # If true, links to the reST sources are added to the pages. 152 | html_show_sourcelink = True 153 | 154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 155 | #html_show_sphinx = True 156 | 157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
158 | #html_show_copyright = True 159 | 160 | # If true, an OpenSearch description file will be output, and all pages will 161 | # contain a tag referring to it. The value of this option must be the 162 | # base URL from which the finished HTML is served. 163 | #html_use_opensearch = '' 164 | 165 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 166 | #html_file_suffix = None 167 | 168 | # Output file base name for HTML help builder. 169 | htmlhelp_basename = 'pysimsearchdoc' 170 | 171 | 172 | # -- Options for LaTeX output -------------------------------------------------- 173 | 174 | # The paper size ('letter' or 'a4'). 175 | #latex_paper_size = 'letter' 176 | 177 | # The font size ('10pt', '11pt' or '12pt'). 178 | #latex_font_size = '10pt' 179 | 180 | # Grouping the document tree into LaTeX files. List of tuples 181 | # (source start file, target name, title, author, documentclass [howto/manual]). 182 | latex_documents = [ 183 | ('index', 'pysimsearch.tex', u'pysimsearch Documentation', 184 | u'Taher Haveliwala', 'manual'), 185 | ] 186 | 187 | # The name of an image file (relative to this directory) to place at the top of 188 | # the title page. 189 | #latex_logo = None 190 | 191 | # For "manual" documents, if this is true, then toplevel headings are parts, 192 | # not chapters. 193 | #latex_use_parts = False 194 | 195 | # If true, show page references after internal links. 196 | #latex_show_pagerefs = False 197 | 198 | # If true, show URL addresses after external links. 199 | #latex_show_urls = False 200 | 201 | # Additional stuff for the LaTeX preamble. 202 | #latex_preamble = '' 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. 
List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'pysimsearch', u'pysimsearch Documentation', 217 | [u'Taher Haveliwala'], 1) 218 | ] 219 | -------------------------------------------------------------------------------- /doc/doc_reader.rst: -------------------------------------------------------------------------------- 1 | The :mod:`doc_reader` Module 2 | ---------------------------- 3 | 4 | .. automodule:: pysimsearch.doc_reader 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/freq_tools.rst: -------------------------------------------------------------------------------- 1 | The :mod:`freq_tools` Module 2 | ---------------------------- 3 | 4 | .. automodule:: pysimsearch.freq_tools 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. PySimSearch documentation master file, created by 2 | sphinx-quickstart on Mon Jul 25 22:06:33 2011. 3 | 4 | pysimsearch |version| documentation 5 | ======================================= 6 | 7 | Python library for indexing and similarity-search. 8 | 9 | Download from `GitHub `_ 10 | 11 | This library is primarily meant to illustrate the basic workings of similarity 12 | and indexing engines, without focusing heavily on optimization. Certain 13 | patterns used for scaling indexes (e.g., distributed indexes) are included. 14 | 15 | Although the code is currently for Python 2.7 series, we use ``__future__`` 16 | imports to match Python 3 as closely as possible. 17 | 18 | If you are interested in learning more about search and information retrieval, 19 | I highly recommend the following two books: 20 | 21 | * `Managing Gigabytes`_, by Witten, Moffat, and Bell 22 | .. 
_Managing Gigabytes: http://amzn.to/qg6Zhe 23 | * `Introduction to Information Retrieval`_, by Manning, Schütze, and Raghavan 24 | .. _Introduction to Information Retrieval: http://amzn.to/oz2O27 25 | 26 | 27 | Quickstart: 28 | ----------- 29 | 30 | *Quick sample:* 31 | 32 | >>> from pprint import pprint 33 | >>> from pysimsearch import sim_index, doc_reader 34 | >>> index = sim_index.MemorySimIndex() 35 | >>> index.index_urls('http://www.stanford.edu/', 36 | 'http://www.berkeley.edu/', 37 | 'http://www.ucla.edu', 38 | 'http://www.mit.edu') 39 | >>> pprint(index.postings_list('university')) 40 | [(0, 3), (1, 1), (2, 1)] 41 | >>> pprint(list(index.docnames_with_terms('university', 'california'))) 42 | ['http://www.stanford.edu/', 'http://www.ucla.edu'] 43 | >>> index.set_query_scorer('tfidf') 44 | >>> pprint(list(index.query("stanford university"))) 45 | [('http://www.stanford.edu/', 0.5827172819606118), 46 | ('http://www.ucla.edu', 0.05801461340864149), 47 | ('http://www.berkeley.edu/', 0.025725104682131295)] 48 | 49 | View a larger :doc:`sample` 50 | 51 | API: 52 | ---- 53 | 54 | .. toctree:: 55 | :maxdepth: 2 56 | 57 | sim_index 58 | similarity 59 | doc_reader 60 | freq_tools 61 | sim_server 62 | query_scorer 63 | term_vec 64 | 65 | .. automodule:: pysimsearch 66 | :members: 67 | 68 | 69 | Indices and tables 70 | ================== 71 | 72 | * :ref:`genindex` 73 | * :ref:`modindex` 74 | * :ref:`search` 75 | 76 | -------------------------------------------------------------------------------- /doc/query_scorer.rst: -------------------------------------------------------------------------------- 1 | The :mod:`query_scorer` Module 2 | ------------------------------ 3 | 4 | .. 
automodule:: pysimsearch.query_scorer 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/sample.py: -------------------------------------------------------------------------------- 1 | ../sample.py -------------------------------------------------------------------------------- /doc/sample.rst: -------------------------------------------------------------------------------- 1 | Example 2 | ------- 3 | 4 | .. literalinclude:: sample.py 5 | -------------------------------------------------------------------------------- /doc/sim_index.rst: -------------------------------------------------------------------------------- 1 | The :mod:`sim_index` Module 2 | --------------------------- 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | sim_index/sim_index 8 | sim_index/map_sim_index 9 | sim_index/memory_sim_index 10 | sim_index/shelf_sim_index 11 | sim_index/concurrent_sim_index 12 | sim_index/remote_sim_index 13 | sim_index/sim_index_collection 14 | -------------------------------------------------------------------------------- /doc/sim_index/concurrent_sim_index.rst: -------------------------------------------------------------------------------- 1 | The :class:`ConcurrentSimIndex` Class 2 | ------------------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.concurrent_sim_index 5 | 6 | .. autoclass:: pysimsearch.sim_index.ConcurrentSimIndex 7 | :members: 8 | :inherited-members: 9 | -------------------------------------------------------------------------------- /doc/sim_index/map_sim_index.rst: -------------------------------------------------------------------------------- 1 | The :class:`MapSimIndex` Class 2 | -------------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.map_sim_index 5 | 6 | .. 
autoclass:: pysimsearch.sim_index.MapSimIndex 7 | :members: 8 | :inherited-members: 9 | -------------------------------------------------------------------------------- /doc/sim_index/memory_sim_index.rst: -------------------------------------------------------------------------------- 1 | The :class:`MemorySimIndex` Class 2 | --------------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.memory_sim_index 5 | 6 | .. autoclass:: pysimsearch.sim_index.MemorySimIndex 7 | :members: 8 | :inherited-members: 9 | -------------------------------------------------------------------------------- /doc/sim_index/remote_sim_index.rst: -------------------------------------------------------------------------------- 1 | The :class:`RemoteSimIndex` Class 2 | ------------------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.remote_sim_index 5 | 6 | .. autoclass:: pysimsearch.sim_index.RemoteSimIndex 7 | :members: 8 | :inherited-members: 9 | -------------------------------------------------------------------------------- /doc/sim_index/shelf_sim_index.rst: -------------------------------------------------------------------------------- 1 | The :class:`ShelfSimIndex` Class 2 | -------------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.shelf_sim_index 5 | 6 | .. autoclass:: pysimsearch.sim_index.ShelfSimIndex 7 | :members: 8 | :inherited-members: 9 | -------------------------------------------------------------------------------- /doc/sim_index/sim_index.rst: -------------------------------------------------------------------------------- 1 | The :class:`SimIndex` Class 2 | --------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.sim_index 5 | 6 | .. 
autoclass:: pysimsearch.sim_index.SimIndex 7 | :members: 8 | -------------------------------------------------------------------------------- /doc/sim_index/sim_index_collection.rst: -------------------------------------------------------------------------------- 1 | The :class:`SimIndexCollection` Class 2 | ------------------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_index.sim_index_collection 5 | 6 | .. autoclass:: pysimsearch.sim_index.SimIndexCollection 7 | :members: 8 | :inherited-members: 9 | -------------------------------------------------------------------------------- /doc/sim_server.rst: -------------------------------------------------------------------------------- 1 | The :mod:`sim_server` Module 2 | ---------------------------- 3 | 4 | .. automodule:: pysimsearch.sim_server 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/similarity.rst: -------------------------------------------------------------------------------- 1 | The :mod:`similarity` Module 2 | ---------------------------- 3 | 4 | .. automodule:: pysimsearch.similarity 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /doc/static/custom.css: -------------------------------------------------------------------------------- 1 | @import url("default.css"); 2 | 3 | dl.class { 4 | background-color: #F2F2F2; 5 | border-radius: 5px; 6 | padding: 20px; 7 | } 8 | 9 | dl.class>dt { 10 | font-weight: bold; 11 | margin-bottom: 10px; 12 | } -------------------------------------------------------------------------------- /doc/term_vec.rst: -------------------------------------------------------------------------------- 1 | The :mod:`term_vec` Module 2 | ---------------------------- 3 | 4 | .. 
automodule:: pysimsearch.term_vec 5 | :members: 6 | :undoc-members: 7 | -------------------------------------------------------------------------------- /pysimsearch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taherh/pysimsearch/57796f7175565a8481fe80a56c7815bf0264d4fb/pysimsearch/__init__.py -------------------------------------------------------------------------------- /pysimsearch/doc_reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2010, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
Utilities for creating term vectors from data
'''

import codecs
import io
import re
import urllib

from concurrent import futures


def get_text_file(filename):
    '''Return a utf-8 decoding file object for ``filename``.

    Params:
        filename: path of the text file to open

    TODO: detect html and parse
    '''
    return codecs.open(filename, encoding='utf-8')


def get_url(url):
    '''Fetch ``url`` and return a file-like object of its cleaned text.

    The page's html markup is stripped so only visible text remains.

    Params:
        url: address to fetch; must start with ``http://``

    Raises:
        Exception: if ``url`` is not an http url
    '''
    http_pattern = '^http://'
    if re.search(http_pattern, url):
        # lxml is imported lazily so the rest of this module is usable
        # even when lxml is not installed.
        import lxml.html
        from lxml.html.clean import clean_html

        urlfh = urllib.urlopen(url)
        try:
            content = urlfh.read()
        finally:
            # fix: the original leaked the url handle
            urlfh.close()
        html_tree = lxml.html.fromstring(content)
        clean_html(html_tree)  # removes crud from html (in place)
        clean_html_string = lxml.html.tostring(html_tree,
                                               encoding=unicode,
                                               method='text')
        return io.StringIO(clean_html_string)
    else:
        raise Exception("Bad url: {}".format(url))


def get_text_files(filenames=None):
    '''
    Returns an iterator of (name, file) tuples for filenames

    Params:
        filenames: list of filenames

    Returns ``None`` if ``filenames`` is ``None``.
    '''
    if filenames is not None:
        return ((name, get_text_file(name)) for name in filenames)


# Shared thread pool, created lazily on the first get_urls() call.
_executor = None


def get_urls(urls=None):
    '''
    Returns an iterator of (name, file) tuples for urls

    Params:
        urls: list of urls to fetch

    Returns ``None`` if ``urls`` is ``None`` (mirrors get_text_files()).
    '''
    # The below effectively implements
    #
    #   return ((url, get_url(url)) for url in urls)
    #
    # but uses futures to allow parallel fetching/processing of urls

    # Initialize the executor if necessary
    global _executor
    if _executor is None:
        _executor = futures.ThreadPoolExecutor(max_workers=10)

    if urls is None:
        return None

    # submit the get_url() requests
    future_to_url = {
        _executor.submit(get_url, url): url
        for url in urls
    }

    # generator that lazily iterates over futures and yields
    # (url, file) tuples
    def _gen_result():
        for future in futures.as_completed(future_to_url, timeout=60):
            url = future_to_url[future]
            if future.exception() is not None:
                # fix: the original format string had one placeholder but
                # two arguments, silently dropping the exception detail
                raise Exception("failed to fetch {}: e={}".format(
                    url, future.exception()))
            else:
                yield (url, future.result())

    # return iterator
    return _gen_result()
# * The names of project contributors may not be used to endorse or
#   promote products derived from this software without specific
#   prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
Exception classes
'''

class Error(Exception):
    '''Base class for Exception types used in this module

    Catching ``Error`` catches every package-specific exception.
    '''
    pass

class FileFormatException(Error):
    '''Exception for invalid input file

    Raised by freq_tools.read_df() when a document-frequency file line does
    not contain exactly two whitespace-separated fields.
    '''
    pass
--------------------------------------------------------------------------------
/pysimsearch/freq_tools.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# Copyright (c) 2010, Taher Haveliwala
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | ''' 30 | Sample usage as a script:: 31 | 32 | $ python freq_tools --list doc_list -o output.df 33 | Processing... 34 | ''' 35 | 36 | from __future__ import (division, absolute_import, print_function, 37 | unicode_literals) 38 | 39 | # boilerplate to allow running as script 40 | if __name__ == "__main__" and __package__ is None: 41 | import sys, os 42 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 43 | sys.path.insert(0, parent_dir) 44 | import pysimsearch 45 | __package__ = str("pysimsearch") 46 | del sys, os 47 | 48 | # external modules 49 | import argparse 50 | import sys 51 | 52 | # our modules 53 | from .exceptions import * 54 | from . 
import doc_reader 55 | 56 | def read_df(df_file): 57 | ''' 58 | Reads a document frequency file for use in applying df term weighting 59 | Returns a dictionary of the form {term: doc_freq} 60 | ''' 61 | df_dict = {} 62 | for line in df_file: 63 | ln_list = line.split() 64 | if len(ln_list) == 0: 65 | continue # skip blank lines without warning 66 | if len(ln_list) != 2: # raise exception if there were not exactly 67 | # two entries in the line 68 | raise FileFormatException( 69 | 'Bad line in doc freq file ({0} entries, expecting 2): {1}'. 70 | format(len(ln_list), line)) 71 | (term, df) = ln_list 72 | df_dict[term] = int(df) 73 | return df_dict 74 | 75 | def write_df(df_dict, df_file): 76 | ''' 77 | Writes the document frequency data structure to file 78 | df_dict is a dictionary of the form {term: doc_freq} 79 | 80 | TODO: sort order? 81 | ''' 82 | for (term, df) in df_dict.items(): 83 | df_file.write(u'{0}\t{1}\n'.format(term, df)) 84 | 85 | def compute_df(files): 86 | ''' 87 | Computes document frequency counts by processing a collection of files 88 | Returns a dictionary of the form {term: doc_freq} 89 | ''' 90 | df_dict = {} 91 | for file in files: 92 | term_seen = set() 93 | for line in file: 94 | for term in line.split(): 95 | if term not in term_seen: 96 | if term not in df_dict: 97 | df_dict[term] = 0 98 | df_dict[term] += 1 99 | term_seen.add(term) 100 | 101 | return df_dict 102 | 103 | # --- main() --- 104 | 105 | def main(): 106 | '''Commandline interface for generating document frequency indexes''' 107 | parser = argparse.ArgumentParser( 108 | description='Compute document frequencies of terms in of input ' 109 | 'documents') 110 | parser.add_argument('doc', nargs='*', help='a document filename') 111 | parser.add_argument('-l', '--list', nargs='?', 112 | help='file containing list of input documents') 113 | parser.add_argument('-o', '--output', nargs='?', 114 | help='output file (default: stdout)') 115 | 116 | args = parser.parse_args() 117 | 118 | 
output_file = sys.stdout 119 | if args.output != None: 120 | output_file = open(args.output, "w") 121 | 122 | doc_list = [] 123 | if args.list != None: 124 | try: 125 | with open(args.list) as input_docnames_file: 126 | doc_list = [line.strip() for line in 127 | input_docnames_file.readlines()] 128 | except IOError: 129 | print("Sorry, could not open " + args.list) 130 | 131 | doc_list.extend(args.doc) 132 | 133 | print("Processing {}".format(str(doc_list))) 134 | 135 | if len(doc_list) == 0: 136 | raise Error("Sorry, you must specify at least one document.") 137 | 138 | df_dict = compute_df(doc_reader.get_text_files(*doc_list)) 139 | for key in df_dict: 140 | print('{}\t{:>20}'.format(key, df_dict[key]), file=output_file) 141 | 142 | if output_file != sys.stdout: 143 | output_file.close() 144 | 145 | if __name__ == '__main__': 146 | main() 147 | 148 | -------------------------------------------------------------------------------- /pysimsearch/query_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 
16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | ''' 30 | Scoring algorithms for finding similar documents 31 | ''' 32 | 33 | from __future__ import (division, absolute_import, print_function, 34 | unicode_literals) 35 | 36 | import abc 37 | from collections import defaultdict 38 | import operator 39 | from math import log 40 | 41 | class QueryScorer(object): 42 | ''' 43 | Interface for query scorers which score similarity search results 44 | 45 | QueryScorers are used by the SimIndex.query() method to handle the 46 | scoring of similarity search results. 
47 | ''' 48 | 49 | __metaclass__ = abc.ABCMeta 50 | 51 | # name->scorer mapping 52 | _scorers = { } 53 | 54 | @staticmethod 55 | def make_scorer(scorer_type): 56 | '''Returns a new scorer object''' 57 | return QueryScorer._scorers[scorer_type]() 58 | 59 | @staticmethod 60 | def register_scorers(scorer_map): 61 | QueryScorer._scorers.update(scorer_map) 62 | 63 | @abc.abstractmethod 64 | def score_docs(self, query_vec, postings_lists, **extra): 65 | '''Scores documents' similarities to query 66 | 67 | Scans postings_lists to compute similarity scores for docs for the 68 | query term vector 69 | 70 | Params: 71 | query: the query document 72 | postings_lists: a list of postings lists for terms in query 73 | 74 | Returns: 75 | A sorted iterable of (docid, score) tuples 76 | ''' 77 | return 78 | 79 | 80 | class SimpleCountQueryScorer(QueryScorer): 81 | ''' 82 | QueryScorer that uses simple term frequencies for scoring. 83 | ''' 84 | 85 | def score_docs(self, query_vec, postings_lists, **extra): 86 | ''' 87 | Scores query-document similarity using number of occurrences 88 | of query terms in document. Multiple occurrences of a term 89 | in the query are ignored. 90 | ''' 91 | 92 | doc_hit_map = defaultdict(int) 93 | for (term, postings_list) in postings_lists: 94 | assert(query_vec[term] >= 1) 95 | for (docid, freq) in postings_list: 96 | doc_hit_map[docid] += freq 97 | 98 | # construct list of tuples sorted by value 99 | return sorted(doc_hit_map.iteritems(), 100 | key=operator.itemgetter(1), 101 | reverse=True) 102 | 103 | class TFIDFQueryScorer(QueryScorer): 104 | ''' 105 | QueryScorer that uses TFIDF weighting with the cosine similarity measure. 106 | 107 | This implementation is actually an approximation to the true 108 | cosine, because of the way we normalize by document length. 109 | When computing document length, we assume a term weight of 1 for 110 | each document term. 
E.g., we do not factor in term weights 111 | when computing the "document length", since that would require 112 | choosing the weighting strategy at index time. 113 | 114 | Query length is ignored, as it has no effect on relative ordering 115 | ''' 116 | 117 | @staticmethod 118 | def tf_weight_raw(tf): 119 | '''Returns unscaled tf''' 120 | return tf 121 | 122 | @staticmethod 123 | def tf_weight_log(tf): 124 | '''Returns sublinear scaling of tf: 1+log(tf)''' 125 | assert(tf > 0) 126 | return 1 + log(tf) 127 | 128 | @staticmethod 129 | def idf_weight_log(N, df): 130 | '''Returns idf weight''' 131 | assert(df > 0) 132 | return log(N/df) 133 | 134 | def __init__(self, tf_weight_type = 'raw'): 135 | if tf_weight_type == 'log': 136 | self.tf_weight = self.tf_weight_log 137 | else: 138 | self.tf_weight = self.tf_weight_raw 139 | 140 | self.idf_weight = self.idf_weight_log 141 | 142 | def score_docs(self, query_vec, postings_lists, N, get_doc_freq, get_doc_len, **extra): 143 | ''' 144 | Scores documents' similarities to query using cosine similarity 145 | in a vector space model. Uses tf.idf weighting. 
146 | 147 | An individual term hit is scored as:: 148 | 149 | idf * self.tf_weight(q_tf) * self.tf_weight(d_tf) 150 | 151 | The overall score for a doc is given by the sum of the term-hit scores 152 | ''' 153 | 154 | if N == 0: return () 155 | doc_hit_map = defaultdict(int) 156 | for (term, postings_list) in postings_lists: 157 | idf = self.idf_weight(N, get_doc_freq(term)) 158 | query_term_wt = self.tf_weight(query_vec[term]) * idf 159 | for (docid, freq) in postings_list: 160 | doc_hit_map[docid] += self.tf_weight(freq) * query_term_wt 161 | for (docid, weight) in doc_hit_map.iteritems(): 162 | doc_len = get_doc_len(docid) 163 | doc_hit_map[docid] = weight / doc_len 164 | 165 | # construct list of tuples sorted by value 166 | return sorted(doc_hit_map.iteritems(), 167 | key=operator.itemgetter(1), 168 | reverse=True) 169 | 170 | # Register scorers by name 171 | QueryScorer.register_scorers({ 172 | 'simple_count': SimpleCountQueryScorer, 173 | 'tfidf': TFIDFQueryScorer 174 | }) 175 | 176 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/__init__.py: -------------------------------------------------------------------------------- 1 | from .sim_index import SimIndex 2 | from .map_sim_index import MapSimIndex 3 | from .memory_sim_index import MemorySimIndex 4 | from .shelf_sim_index import ShelfSimIndex 5 | from .remote_sim_index import RemoteSimIndex 6 | from .sim_index_collection import SimIndexCollection 7 | from .concurrent_sim_index import ConcurrentSimIndex 8 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/concurrent_sim_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | 30 | ''' 31 | ConcurrentSimIndex 32 | 33 | Wrapper to allow concurrent SimIndex access 34 | 35 | Sample usage:: 36 | 37 | from pysimsearch.sim_index import MemorySimIndex, ConcurrentSimIndex 38 | 39 | index = ConcurrentSimIndex(MemorySimIndex()) 40 | index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu') 41 | print(list(index.query('stanford'))) 42 | 43 | ''' 44 | 45 | from __future__ import (division, absolute_import, print_function, 46 | unicode_literals) 47 | 48 | from concurrent import futures 49 | import threading 50 | 51 | from . import SimIndex 52 | 53 | class ConcurrentSimIndex(object): 54 | '''Proxy to a :class:`pysimsearch.sim_index.SimIndex` that allows 55 | concurrent access. 56 | 57 | ``ConcurrentSimIndex`` is compatible with the :class:`SimIndex` interface. 58 | We use ``concurrent.futures`` to allow some basic concurrency for indexing 59 | and querying. In particular, calls to ``index_urls()`` are executed in a 60 | nonblocking manner. 61 | ''' 62 | 63 | READ_METHODS = {'name_to_docid', 64 | 'docid_to_name', 65 | 'postings_list', 66 | 'docids_with_terms', 67 | 'docnames_with_terms', 68 | 'query', 69 | 'get_local_N', 70 | 'get_local_df_map', 71 | 'get_name_to_docid_map', 72 | 'config'} 73 | 74 | WRITE_METHODS = {'set_query_scorer', 75 | 'set_global_N', 76 | 'set_global_df_map', 77 | 'load_stoplist', 78 | 'set_config', 79 | 'update_config', 80 | 'index_string_buffers', 81 | 'index_files', 82 | 'del_docids', 83 | } 84 | 85 | # Note: assume that index_urls() is implemented by calling index_files() 86 | # so that the write-lock will be acquired at the time index_files() 87 | # is called. We don't want to acquire a lock on index_urls() 88 | # directly, as we'd like allow at least the url fetches to occur 89 | # concurrently. 90 | # 91 | # TODO: re-implement index_urls() here to ensure the assumption is true? 
92 | NONBLOCKING_METHODS = { 'index_urls' } 93 | 94 | 95 | def __init__(self, sim_index): 96 | '''Initialize with ``sim_index`` 97 | 98 | Params: 99 | sim_index: A :class:`SimIndex` instance. 100 | ''' 101 | self._sim_index = sim_index 102 | self._executor = futures.ThreadPoolExecutor(max_workers=10) 103 | self._lock = threading.RLock() # TODO: use a Read-Write Lock 104 | self._futures = set() 105 | 106 | def acquire_read_lock(self): 107 | '''Acquire read lock''' 108 | self._lock.acquire() 109 | 110 | def release_read_lock(self): 111 | '''Release read lock''' 112 | self._lock.release() 113 | 114 | def acquire_write_lock(self): 115 | '''Acquire write lock''' 116 | self._lock.acquire() 117 | 118 | def release_write_lock(self): 119 | '''Release write lock''' 120 | self._lock.release() 121 | 122 | def _read_decorator(self, func): 123 | '''Wrap func with read_lock protection''' 124 | def wrapper(*args, **kwargs): 125 | self.acquire_read_lock() 126 | try: 127 | return func(*args, **kwargs) 128 | finally: 129 | self.release_read_lock() 130 | return wrapper 131 | 132 | def _write_decorator(self, func): 133 | '''Wrap func with write_lock protection''' 134 | def wrapper(*args, **kwargs): 135 | self.acquire_write_lock() 136 | try: 137 | return func(*args, **kwargs) 138 | finally: 139 | self.release_write_lock() 140 | return wrapper 141 | 142 | def _nonblocking_decorator(self, func): 143 | ''' 144 | Wrap func with non-blocking futures call. 145 | Return value of ``func`` is ignored. 
146 | ''' 147 | def wrapper(*args, **kwargs): 148 | future = self._executor.submit(func, *args, **kwargs) 149 | self._futures.add(future) 150 | return wrapper 151 | 152 | def _futures_wait(self): 153 | if len(self._futures) > 0: 154 | r = futures.wait(self._futures) 155 | for future in r.done: 156 | if future.exception() is not None: 157 | raise future.exception() 158 | self._futures = set() 159 | 160 | def __getattr__(self, name): 161 | func = getattr(self._sim_index, name) 162 | 163 | if name in self.READ_METHODS: 164 | # wait for any outstanding non-blocking calls to complete 165 | self._futures_wait() 166 | return self._read_decorator(func) 167 | elif name in self.WRITE_METHODS: 168 | # wait for any outstanding non-blocking calls to complete 169 | return self._write_decorator(func) 170 | elif name in self.NONBLOCKING_METHODS: 171 | return self._nonblocking_decorator(func) 172 | else: 173 | raise Exception("Unsupported method: {}".format(name)) 174 | 175 | # ConcurrentSimIndex is a subtype of SimIndex 176 | SimIndex.register(ConcurrentSimIndex) 177 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/map_sim_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 
13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ''' 31 | MapSimIndex 32 | 33 | See :mod:`pysimsearch.sim_index.memory_sim_index` for sample usage 34 | 35 | ''' 36 | 37 | from __future__ import (division, absolute_import, print_function, 38 | unicode_literals) 39 | 40 | from collections import defaultdict 41 | import sys 42 | 43 | from . import SimIndex 44 | from .. import term_vec 45 | from ..exceptions import * 46 | 47 | class MapSimIndex(SimIndex): 48 | ''' 49 | Inherits from :class:`pysimsearch.sim_index.SimIndex`. 50 | 51 | Simple implementation of the :class:`SimIndex` interface backed with dict-like 52 | objects (MutableMapping). By default, uses `dict`, in which case the 53 | indexes are in-memory. 54 | 55 | NOTE: to ensure proper compatibility with arbitrary dict-like objects, 56 | including persistent shelves, any mutations must be done using assignment. 
57 | E.g., do not do:: 58 | 59 | map[key].extend([a, b]) 60 | 61 | Instead, do the equivalent of:: 62 | 63 | map[key] += [a,b] # same as: map[key] = map[key].__iadd__([a,b]) 64 | ''' 65 | 66 | 67 | def __init__(self, 68 | name_to_docid_map=None, 69 | docid_to_name_map=None, 70 | docid_to_feature_map=None, 71 | term_index=None, 72 | doc_vectors=None, 73 | df_map=None, 74 | doc_len_map=None): 75 | 76 | super(MapSimIndex, self).__init__() 77 | 78 | # index metadata 79 | self._name_to_docid_map = name_to_docid_map 80 | self._docid_to_name_map = docid_to_name_map 81 | self._docid_to_feature_map = docid_to_feature_map 82 | 83 | # term index 84 | self._term_index = term_index 85 | 86 | # document vectors (useful for deletions and certain scoring algorithms) 87 | self._doc_vectors = doc_vectors 88 | 89 | # additional stats used for scoring 90 | self._df_map = df_map 91 | self._doc_len_map = doc_len_map 92 | 93 | # global stats, which if present, are used instead 94 | # of the local stats 95 | self._global_df_map = None 96 | 97 | # set a default scorer 98 | self.set_query_scorer('tfidf') 99 | 100 | def set_global_df_map(self, df_map): 101 | self._global_df_map = df_map 102 | 103 | def get_local_df_map(self): 104 | return self._df_map 105 | 106 | def get_name_to_docid_map(self): 107 | return self._name_to_docid_map 108 | 109 | def get_doc_freq(self, term): 110 | df_map = self._global_df_map or self._df_map 111 | return df_map.get(term, 1) 112 | 113 | def get_doc_len(self, docid): 114 | return self._doc_len_map.get(docid, 0) 115 | 116 | def index_files(self, named_files): 117 | ''' 118 | Build a similarity index over collection given in named_files 119 | named_files is a list iterable of (filename, file) pairs 120 | ''' 121 | for (name, file) in named_files: 122 | with file: 123 | t_vec = term_vec.term_vec( 124 | file, 125 | stoplist=self.config('stoplist'), 126 | lowercase=self.config('lowercase'), 127 | ) 128 | docid = self._next_docid 129 | self._name_to_docid_map[name] = 
docid 130 | self._docid_to_name_map[docid] = name 131 | for term in t_vec: 132 | if term not in self._df_map: self._df_map[term] = 0 133 | self._df_map[term] += 1 134 | self._add_vec(docid, t_vec) 135 | self._doc_len_map[docid] = term_vec.l2_norm(t_vec) 136 | self._doc_vectors[docid] = t_vec 137 | self._N += 1 138 | self._next_docid += 1 139 | 140 | def _add_vec(self, docid, term_vec): 141 | '''Add term_vec to the index''' 142 | # build up a dictionary of batched updates for the index 143 | term_index = defaultdict(list) 144 | for (term, freq) in term_vec.iteritems(): 145 | term_index[term].append((docid, freq)) 146 | 147 | # apply the updates to the term index 148 | for (term, new_postings) in term_index.items(): 149 | self._term_index[term] = self.postings_list(term) + new_postings 150 | 151 | def del_docids(self, *docids): 152 | '''Delete docids from index''' 153 | 154 | def _del_helper(map, key): 155 | try: 156 | del map[key] 157 | except KeyError: 158 | # sys.stderr.write("Unkown docid: {}\n".format(docid)) 159 | pass 160 | 161 | # TODO: optimize for batch deletion 162 | for docid in docids: 163 | for (term, freq) in self._doc_vectors[docid].iteritems(): 164 | # decr df count 165 | self._df_map[term] -= 1 166 | # filter out docid from term index 167 | self._term_index[term] = [ 168 | (_docid, freq) 169 | for (_docid, freq) in self._term_index.get(term, []) 170 | if _docid != docid 171 | ] 172 | if len(self._term_index[term]) == 0: 173 | del self._term_index[term] 174 | 175 | name = self.docid_to_name(docid) 176 | _del_helper(self._docid_to_name_map, docid) 177 | _del_helper(self._docid_to_feature_map, docid) 178 | _del_helper(self._name_to_docid_map, name) 179 | _del_helper(self._doc_len_map, docid) 180 | _del_helper(self._doc_vectors, docid) 181 | 182 | self._N -= 1 183 | 184 | def docid_to_name(self, docid): 185 | return self._docid_to_name_map[docid] 186 | 187 | def name_to_docid(self, name): 188 | return self._name_to_docid_map[name] 189 | 190 | def 
postings_list(self, term): 191 | ''' 192 | Returns list of (docid, freq) tuples for documents containing term 193 | ''' 194 | if self.config('lowercase'): 195 | term = term.lower() 196 | 197 | return self._term_index.get(term, []) 198 | 199 | def _query(self, query_vec): 200 | '''Finds documents similar to query_vec 201 | 202 | Params: 203 | query_vec: term vector representing query document 204 | 205 | Returns: 206 | A iterable of (docname, score) tuples sorted by score 207 | ''' 208 | postings_lists = [] 209 | for term in query_vec: 210 | postings_lists.append((term, self.postings_list(term))) 211 | 212 | 213 | N = self._global_N or self._N 214 | hits = self.query_scorer.score_docs(query_vec=query_vec, 215 | postings_lists=postings_lists, 216 | N=N, 217 | get_doc_freq=self.get_doc_freq, 218 | get_doc_len=self.get_doc_len) 219 | 220 | return ((self.docid_to_name(docid), score) for (docid, score) in hits) 221 | 222 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/memory_sim_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 
16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ''' 31 | MemorySimIndex 32 | 33 | Sample usage:: 34 | 35 | from pprint import pprint 36 | from pysimsearch.sim_index import MemorySimIndex 37 | from pysimsearch import doc_reader 38 | 39 | sim_index = MemorySimIndex() 40 | sim_index.index_urls('http://www.stanford.edu/', 41 | 'http://www.berkeley.edu', 42 | 'http://www.ucla.edu', 43 | 'http://www.mit.edu') 44 | pprint(sim_index.postings_list('university')) 45 | pprint(list(sim_index.docnames_with_terms('university', 'california'))) 46 | 47 | sim_index.set_query_scorer('simple_count') 48 | pprint(list(sim_index.query("stanford university"))) 49 | 50 | ''' 51 | 52 | from __future__ import (division, absolute_import, print_function, 53 | unicode_literals) 54 | 55 | import cPickle as pickle 56 | from collections import defaultdict 57 | 58 | from . import MapSimIndex 59 | from pysimsearch.exceptions import * 60 | 61 | class MemorySimIndex(MapSimIndex): 62 | ''' 63 | Inherits from :class:`pysimsearch.sim_index.MapSimIndex`. 64 | 65 | Memory-based implementation of :class:`SimIndex`. Indexes are backed with 66 | ``dict``. 
67 | ''' 68 | 69 | def __init__(self): 70 | 71 | # index metadata 72 | name_to_docid_map = dict() 73 | docid_to_name_map = dict() 74 | docid_to_feature_map = dict() 75 | 76 | # term index 77 | term_index = dict() 78 | 79 | # document vectors 80 | doc_vectors = dict() 81 | 82 | # additional stats used for scoring 83 | df_map = dict() 84 | doc_len_map = dict() 85 | 86 | self._maps = dict(name_to_docid_map=name_to_docid_map, 87 | docid_to_name_map=docid_to_name_map, 88 | docid_to_feature_map=docid_to_feature_map, 89 | term_index=term_index, 90 | doc_vectors=doc_vectors, 91 | df_map=df_map, 92 | doc_len_map=doc_len_map) 93 | 94 | super(MemorySimIndex, self).__init__(**self._maps) 95 | 96 | def save(self, file): 97 | '''Saved index to file''' 98 | # pickle won't let us save query_scorer 99 | qs = self.query_scorer 100 | self.query_scorer = None 101 | pickle.dump(self, file) 102 | self.query_scorer = qs 103 | 104 | @staticmethod 105 | def load(file): 106 | '''Returns a ``MemorySimIndex`` loaded from pickle file''' 107 | return pickle.load(file) 108 | 109 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/remote_sim_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 
13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ''' 31 | RemoteSimIndex 32 | 33 | Sample usage: 34 | 35 | **Server** 36 | :: 37 | 38 | bash$ pysimsearch/sim_server.py sim_index -p 9001 39 | Use Control-C to exit 40 | 41 | 42 | ** pysimsearch Client ** 43 | 44 | >>> from pprint import pprint 45 | >>> from pysimsearch import sim_index 46 | >>> index = sim_index.RemoteSimIndex('http://localhost:9001/RPC2') 47 | >>> index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu') 48 | >>> pprint(index.query('university')) 49 | [[u'http://www.stanford.edu/', 0.10469570845856098], 50 | [u'http://www.ucla.edu', 0.04485065887313478], 51 | [u'http://www.berkeley.edu', 0.020464326883958977]] 52 | 53 | ''' 54 | 55 | from __future__ import (division, absolute_import, print_function, 56 | unicode_literals) 57 | 58 | import jsonrpclib as rpclib 59 | #import xmlrpclib as rpclib 60 | 61 | from . 
import SimIndex 62 | 63 | class RemoteSimIndex(object): 64 | '''Proxy to a remote :class:`pysimsearch.sim_index.SimIndex` 65 | 66 | ``RemoteSimIndex`` is compatible with the :class:`SimIndex` interface, 67 | and provides access to a remote index. We use this in place of 68 | directly using a jsonrpclib.Server() object because we need an object 69 | that acts like type :class:`SimIndex`. 70 | 71 | Instantiate a ``RemoteSimIndex`` as follows: 72 | 73 | >>> remote_index = RemoteSimIndex('http://localhost:9001/RPC2') 74 | >>> remote_index.query('university') 75 | ... 76 | 77 | ''' 78 | 79 | def __init__(self, server_url): 80 | '''Initialize with server_url 81 | 82 | Params: 83 | server_url: url for remote ``SimIndex`` server 84 | ''' 85 | from .. import sim_server 86 | self.PREFIX = sim_server.SimIndexService.PREFIX 87 | self.EXPORTED_METHODS = sim_server.SimIndexService.EXPORTED_METHODS 88 | self._server = rpclib.Server(server_url) 89 | 90 | def __getattr__(self, name): 91 | if name in self.EXPORTED_METHODS: 92 | func = getattr(self._server, 93 | self.PREFIX + '.' + name) 94 | return func 95 | else: 96 | raise Exception("Unsupported method: {}".format(name)) 97 | 98 | # RemoteSimIndex is a subtype of SimIndex 99 | SimIndex.register(RemoteSimIndex) 100 | 101 | 102 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/shelf_sim_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 
10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | 30 | ''' 31 | ShelfSimIndex 32 | 33 | Sample usage:: 34 | 35 | from pprint import pprint 36 | from pysimsearch.sim_index import ShelfSimIndex 37 | from pysimsearch import doc_reader 38 | 39 | sim_index = ShelfSimIndex() 40 | sim_index.index_urls('http://www.stanford.edu/', 41 | 'http://www.berkeley.edu', 42 | 'http://www.ucla.edu', 43 | 'http://www.mit.edu') 44 | pprint(sim_index.postings_list('university')) 45 | pprint(list(sim_index.docnames_with_terms('university', 'california'))) 46 | 47 | sim_index.set_query_scorer('simple_count') 48 | pprint(list(sim_index.query("stanford university"))) 49 | 50 | ''' 51 | 52 | from __future__ import (division, absolute_import, print_function, 53 | unicode_literals) 54 | 55 | from collections import defaultdict, MutableMapping 56 | from shelve import DbfilenameShelf as DBShelf 57 | 58 | from . import MapSimIndex 59 | from ..exceptions import * 60 | 61 | class ShelfSimIndex(MapSimIndex): 62 | ''' 63 | Inherits from :class:`pysimsearch.sim_index.MapSimIndex`. 64 | 65 | Shelf-based implementation of :class:`SimIndex`. Indexes are backed with 66 | persistent :class:`shelve.DbfilenameShelf` objects. 
67 | ''' 68 | 69 | 70 | def __init__(self, filename, flag): 71 | name_to_docid_map = StrKeyMap(DBShelf(filename + '_n2d', flag)) 72 | docid_to_name_map = StrKeyMap(DBShelf(filename + '_d2n', flag)) 73 | docid_to_feature_map = StrKeyMap(DBShelf(filename + '_feat', flag)) 74 | 75 | # term index 76 | term_index = StrKeyMap(DBShelf(filename + '_term', flag)) 77 | 78 | # document vectors 79 | doc_vectors = StrKeyMap(DBShelf(filename + '_doc_vec', flag)) 80 | 81 | # additional stats used for scoring 82 | df_map = StrKeyMap(DBShelf(filename + '_df', flag)) 83 | doc_len_map = StrKeyMap(DBShelf(filename + '_dl', flag)) 84 | 85 | self._maps = dict(name_to_docid_map=name_to_docid_map, 86 | docid_to_name_map=docid_to_name_map, 87 | docid_to_feature_map=docid_to_feature_map, 88 | term_index=term_index, 89 | doc_vectors=doc_vectors, 90 | df_map=df_map, 91 | doc_len_map=doc_len_map) 92 | 93 | super(ShelfSimIndex, self).__init__(**self._maps) 94 | 95 | self._N = len(docid_to_name_map) 96 | 97 | def close(self): 98 | for (mapname, map) in self._maps.items(): 99 | map.close() 100 | 101 | class StrKeyMap(MutableMapping): 102 | ''' 103 | Ensure that key is converted to str type that is compatible with keys 104 | for underlying map. 
105 | ''' 106 | def __init__(self, map): 107 | self._map = map 108 | 109 | def __getitem__(self, key): 110 | return self._map[str(key)] 111 | 112 | def __setitem__(self, key, value): 113 | self._map[str(key)] = value 114 | 115 | def __delitem__(self, key): 116 | del self._map[str(key)] 117 | 118 | def __iter__(self): 119 | raise Exception('Unsupported') 120 | # return iter(self._map) 121 | 122 | def __len__(self): 123 | return len(self._map) 124 | 125 | def close(self): 126 | return self._map.close() 127 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/sim_index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ''' 31 | SimIndex 32 | 33 | See :mod:`pysimsearch.sim_index.memory_sim_index` for sample usage 34 | 35 | ''' 36 | 37 | from __future__ import (division, absolute_import, print_function, 38 | unicode_literals) 39 | 40 | import abc 41 | import io 42 | import itertools 43 | 44 | from .. import doc_reader 45 | from .. import term_vec 46 | from ..exceptions import * 47 | from ..query_scorer import QueryScorer 48 | 49 | class SimIndex(object): 50 | ''' 51 | Base class for similarity indexes 52 | 53 | Defines interface as well as provides default implementation for 54 | several methods. 
55 | 56 | Instance Attributes: 57 | config: dictionary of configuration variables 58 | 59 | ''' 60 | 61 | __metaclass__ = abc.ABCMeta 62 | 63 | def __init__(self): 64 | self._config = { 65 | 'lowercase': True, 66 | 'stoplist': {} # using dict instead of set, for rpc support 67 | } 68 | self.query_scorer = None 69 | self._N = 0 70 | self._global_N = None 71 | self._next_docid = 0 72 | 73 | def config(self, key): 74 | return self._config[key] 75 | 76 | def set_config(self, key, value): 77 | self._config[key] = value 78 | 79 | def update_config(self, **d): 80 | self._config.update(d) 81 | 82 | def load_stoplist(self, stopfile): 83 | stoplist = {} 84 | for line in stopfile: 85 | stoplist.update(zip(line.split(), itertools.repeat(1))) 86 | self.set_config('stoplist', stoplist) 87 | 88 | @abc.abstractmethod 89 | def set_global_df_map(self, df_map): 90 | '''Set global df stats''' 91 | return 92 | 93 | @abc.abstractmethod 94 | def get_local_df_map(self): 95 | '''Get local df stats''' 96 | return 97 | 98 | @abc.abstractmethod 99 | def get_name_to_docid_map(self): 100 | '''Return local mapping of name to docids''' 101 | return 102 | 103 | def set_global_N(self, N): 104 | '''Set global number of documents''' 105 | self._global_N = N 106 | 107 | def get_local_N(self): 108 | '''Return local number of documents''' 109 | return self._N 110 | 111 | def set_query_scorer(self, query_scorer): 112 | '''Set the query_scorer 113 | 114 | Params: 115 | query_scorer: if string type, we assume it is a scorer name, 116 | else we assume it is itself a scoring object 117 | of base type :class:`query_scorer.QueryScorer`. 118 | ''' 119 | if isinstance(query_scorer, basestring): 120 | self.query_scorer = QueryScorer.make_scorer(query_scorer) 121 | else: 122 | self.query_scorer = query_scorer 123 | 124 | @abc.abstractmethod 125 | def index_files(self, named_files): 126 | '''Add ``named_files`` to the index 127 | 128 | Params: 129 | named_files: iterable of (filename, file) pairs. 
130 | Takes ownership of (and consumes) the files. 131 | ''' 132 | return 133 | 134 | def index_filenames(self, *filenames): 135 | '''Add ``filenames`` to the index 136 | 137 | Convenience method that wraps :meth:`index_files()` 138 | 139 | Params: 140 | ``filenames``: list of filenames to add to the index. 141 | ''' 142 | return self.index_files(doc_reader.get_text_files(filenames)) 143 | 144 | def index_urls(self, *urls): 145 | '''Add ``urls`` to the index 146 | 147 | Convenience method that wraps :meth:`index_files()` 148 | 149 | Params: 150 | ``urls``: list of urls of web pages to add to the index. 151 | ''' 152 | return self.index_files(doc_reader.get_urls(urls)) 153 | 154 | def index_string_buffers(self, named_string_buffers): 155 | '''Add ``named_string_buffers`` to the index 156 | 157 | Params: 158 | named_string_buffers: iterable of (name, string) tuples, where 159 | the string contains the data to index. 160 | 161 | ''' 162 | named_files = [] 163 | for (name, string_buffer) in named_string_buffers: 164 | if isinstance(string_buffer, str): 165 | string_buffer = unicode(string_buffer) 166 | named_files.append((name, io.StringIO(string_buffer))) 167 | self.index_files(named_files) 168 | 169 | @abc.abstractmethod 170 | def del_docids(self, *docids): 171 | '''Deletes documents corresponding to docids from the index''' 172 | return 173 | 174 | @abc.abstractmethod 175 | def docid_to_name(self, docid): 176 | '''Returns document name for a given docid''' 177 | return 178 | 179 | @abc.abstractmethod 180 | def name_to_docid(self, name): 181 | '''Returns docid for a given document name''' 182 | return 183 | 184 | @abc.abstractmethod 185 | def postings_list(self, term): 186 | ''' 187 | Return list of (docid, frequency) tuples for docs that contain term 188 | ''' 189 | return 190 | 191 | def docids_with_terms(self, terms): 192 | '''Returns a list of docids of docs containing all terms''' 193 | docs = None # will hold a set of matching docids 194 | for term in terms: 
195 | if docs is None: 196 | docs = set((x[0] for x in self.postings_list(term))) 197 | else: 198 | docs.intersection_update( 199 | (x[0] for x in self.postings_list(term))) 200 | 201 | # return sorted list 202 | if docs is None: docs = [] 203 | return sorted(docs) 204 | 205 | def docnames_with_terms(self, *terms): 206 | '''Returns an iterable of docnames containing terms''' 207 | if self.config('lowercase'): 208 | terms = [term.lower() for term in terms] 209 | return (self.docid_to_name(docid) for docid in self.docids_with_terms(terms)) 210 | 211 | def query(self, q): 212 | '''Finds documents similar to q. 213 | 214 | Params: 215 | query: the query given as either a string or query vector 216 | 217 | Returns: 218 | A iterable of (docname, score) tuples sorted by score 219 | ''' 220 | if isinstance(q, basestring): 221 | if isinstance(q, str): 222 | q = unicode(q) 223 | return self._query( 224 | term_vec.term_vec(q, 225 | stoplist=self.config('stoplist'), 226 | lowercase=self.config('lowercase'))) 227 | else: 228 | return self._query(q) 229 | 230 | @abc.abstractmethod 231 | def _query(self, query_vec): 232 | '''Finds documents similar to query_vec 233 | 234 | Params: 235 | query_vec: term vector representing query document 236 | 237 | Returns: 238 | A iterable of (docname, score) tuples sorted by score 239 | ''' 240 | return 241 | 242 | -------------------------------------------------------------------------------- /pysimsearch/sim_index/sim_index_collection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 
10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | 30 | ''' 31 | SimIndexCollection 32 | 33 | Sample usage:: 34 | 35 | from pprint import pprint 36 | from pysimsearch.sim_index import MemorySimIndex, SimIndexCollection 37 | 38 | indexes = (MemorySimIndex(), MemorySimIndex()) 39 | index_coll = SimIndexCollection() 40 | index_coll.add_shards(*indexes) 41 | index_coll.set_query_scorer('tfidf') 42 | index_coll.index_urls('http://www.stanford.edu/', 43 | 'http://www.berkeley.edu', 44 | 'http://www.ucla.edu', 45 | 'http://www.mit.edu') 46 | 47 | pprint(index_coll.query('stanford university')) 48 | 49 | ''' 50 | 51 | from __future__ import (division, absolute_import, print_function, 52 | unicode_literals) 53 | 54 | from collections import defaultdict 55 | import operator 56 | import os 57 | 58 | from . import SimIndex 59 | from ..exceptions import * 60 | 61 | class SimIndexCollection(SimIndex): 62 | ''' 63 | Inherits from :class:`pysimsearch.sim_index.SimIndex`. 64 | 65 | Provides a :class:`SimIndex` view over a sharded collection of SimIndexes. 66 | 67 | Useful with collections of remote SimIndexes to provide a 68 | distributed indexing and serving architecture. 69 | 70 | Assumes document-level sharding: 71 | 72 | - ``query()`` requests are routed to all shards in collection. 73 | - ``index_files()`` requests are routed according to a sharding function 74 | 75 | Note that if we had used query-sharding, then instead, queries would 76 | be routed using a sharding function, and index-requests would be 77 | routed to all shards. The two sharding approaches correspond to either 78 | partitioning the postings matrix by columns (doc-sharding), 79 | or rows (query-sharding). 80 | 81 | The shard-function is only used for ``index_*()`` operations. If you 82 | have a read-only collection, you don't need a sharding function. 
83 | ''' 84 | 85 | def __init__(self, shards=(), root=True): 86 | super(SimIndexCollection, self).__init__() 87 | 88 | self._shards = [] 89 | self.shard_func = self.default_shard_func 90 | self._name_to_docid_map = {} 91 | self._docid_to_name_map = {} 92 | self._df_map = {} 93 | 94 | self._dirty = False 95 | 96 | self.set_config('root', root, passthrough=False) 97 | 98 | if shards: 99 | self.add_shards(*shards) 100 | 101 | def set_config(self, key, value, passthrough=True): 102 | '''Update config var for shards''' 103 | super(SimIndexCollection, self).set_config(key, value) 104 | if passthrough: 105 | for shard in self._shards: 106 | shard.set_config(key, value) 107 | 108 | def update_config(self, passthrough=True, **d): 109 | '''Update config for shards''' 110 | super(SimIndexCollection, self).update_config(**d) 111 | if passthrough: 112 | for shard in self._shards: 113 | shard.update_config(**d) 114 | 115 | def clear_shards(self): 116 | self._shards = [] 117 | 118 | def add_shards(self, *sim_index_shards): 119 | for shard in sim_index_shards: 120 | shard.update_config(**self._config) 121 | self._shards.extend(sim_index_shards) 122 | self.update_trigger_helper() 123 | 124 | _salt = None 125 | def default_shard_func(self, shard_key): 126 | '''implements the default sharding function''' 127 | if self._salt is None: 128 | self._salt = os.urandom(4) 129 | return hash(str(shard_key)+self._salt) % len(self._shards) 130 | 131 | def set_shard_func(self, func): 132 | self._shard_func = func 133 | 134 | def set_global_N(self, N): 135 | for shard in self._shards: 136 | shard.set_global_N(N) 137 | 138 | def set_global_df_map(self, df_map): 139 | for shard in self._shards: 140 | shard.set_global_df_map(df_map) 141 | 142 | def get_local_df_map(self): 143 | return self._df_map 144 | 145 | def get_name_to_docid_map(self): 146 | return self._name_to_docid_map 147 | 148 | def update_trigger(method): 149 | ''' 150 | Decorator for methods that update the index. 
Used as a post-update 151 | trigger that gathers new term stats, and propagates them back down (if 152 | we're the root node) 153 | ''' 154 | def wrapper(self, *args, **kwargs): 155 | self._dirty = True 156 | val = method(self, *args, **kwargs) 157 | if self._dirty: 158 | self.update_trigger_helper() 159 | self._dirty = False 160 | 161 | return wrapper 162 | 163 | @update_trigger 164 | def index_files(self, named_files): 165 | ''' 166 | Translate to index_string_buffers() call, since file objects 167 | can't be serialized for rpcs to backends. Note: we 168 | currently read in all files in memory, and make one call to 169 | index_string_buffers() -- this can be memory-intesive 170 | if named_files represents a large number of files. 171 | 172 | TODO: read in files in smaller batches, and then make mutiple 173 | calls to index_string_buffers(). 174 | ''' 175 | named_string_buffers = [(name, file.read()) 176 | for (name, file) in named_files] 177 | self.index_string_buffers(named_string_buffers) 178 | 179 | @update_trigger 180 | def index_string_buffers(self, named_string_buffers): 181 | '''Routes index_string_buffers() call to appropriate shard.''' 182 | # minimize rpcs by collecting (name, buffer) tuples for 183 | # different shards up-front 184 | sharded_input_map = defaultdict(list) 185 | for (name, buffer) in named_string_buffers: 186 | sharded_input_map[self.shard_func(name)].append((name, buffer)) 187 | 188 | # issue an indexing rpc to each sharded backend that has some input 189 | # TODO: use non-blocking rpc's 190 | for shard_id in sharded_input_map: 191 | self._shards[shard_id].index_string_buffers( 192 | sharded_input_map[shard_id] 193 | ) 194 | 195 | @update_trigger 196 | def index_urls(self, *urls): 197 | '''Index web pages given by urls''' 198 | # minimize rpcs by collecting (name, buffer) tuples for 199 | # different shards up-front 200 | sharded_input_map = defaultdict(list) 201 | for url in urls: 202 | 
sharded_input_map[self.shard_func(url)].append(url) 203 | 204 | # Issue an indexing call to each sharded backend that has some input 205 | # Generally the sharded servers should be backed with 206 | # ConcurrentSimIndexes so that the index_urls() call will generally 207 | # be non-blocking. 208 | for shard_id in sharded_input_map: 209 | self._shards[shard_id].index_urls( 210 | *sharded_input_map[shard_id] 211 | ) 212 | 213 | @update_trigger 214 | def del_docids(self, *docids): 215 | '''Delete docid from index collection''' 216 | 217 | sharded_del_map = defaultdict(list) 218 | for docid in docids: 219 | # make sure we have a compound docid 220 | assert '-' in docid 221 | (shard_id, sep, remote_docid) = docid.partition('-') 222 | shard_id = int(shard_id) 223 | # if the remote shard is expected to be a leaf, then cast 224 | # remote docid to int 225 | if '-' not in remote_docid: 226 | remote_docid = int(remote_docid) 227 | sharded_del_map[shard_id].append(remote_docid) 228 | 229 | # propagate the requests the appropriate shard 230 | for (shard_id, remote_docids) in sharded_del_map.items(): 231 | self._shards[shard_id].del_docids(*remote_docids) 232 | 233 | @staticmethod 234 | def make_node_docid(shard_id, docid): 235 | return "{}-{}".format(shard_id, docid) 236 | 237 | def docid_to_name(self, docid): 238 | '''Translates node docid to name''' 239 | return self._docid_to_name_map[docid] 240 | 241 | def name_to_docid(self, name): 242 | '''Translates name to node docid''' 243 | return self._name_to_docid_map[name] 244 | 245 | def postings_list(self, term): 246 | '''Returns aggregated postings list in terms of global docids''' 247 | 248 | merged_postings_list = [] 249 | for shard_id in range(len(self._shards)): 250 | merged_postings_list.extend( 251 | [(self.make_node_docid(shard_id, docid), freq) for 252 | (docid, freq) in self._shards[shard_id].postings_list(term)] 253 | ) 254 | 255 | return merged_postings_list 256 | 257 | def set_query_scorer(self, query_scorer): 258 | 
'''Passes ``set_query_scorer()`` request to all shards. 259 | 260 | Params: 261 | query_scorer: scorer object or name. If any backends are remote, 262 | query_scorer needs to be a scorer name, rather than 263 | a scorer object (which we currently don't serialize 264 | for rpcs) 265 | ''' 266 | for shard in self._shards: 267 | shard.set_query_scorer(query_scorer) 268 | 269 | def _query(self, query_vec): 270 | '''Issues query to collection and returns merged results 271 | 272 | TODO: use a merge alg. (heapq.merge doesn't have a key= arg yet) 273 | TODO: add support for rank-aggregation in the case of heterogenous 274 | collections where ir scores are not directly comparable 275 | ''' 276 | results = [] 277 | for shard in self._shards: 278 | results.extend(shard.query(query_vec)) 279 | results.sort(key=operator.itemgetter(1), reverse=True) 280 | return results 281 | 282 | def update_trigger_helper(self): 283 | self.update_node_stats() 284 | 285 | # If we're the root of the collection, then propogate back node 286 | # stats (which are global stats) to children. Else some ancestor 287 | # node will have that responsibility. 288 | if self.config('root'): 289 | self.broadcast_node_stats() 290 | 291 | def update_node_stats(self): 292 | ''' 293 | Fetches local stats from all shards, aggregates them, and 294 | rebroadcasts global stats back to shards. Currently uses 295 | "brute-force"; incremental updating (in either direction) 296 | is not supported. 297 | ''' 298 | 299 | def merge_df_map(target, source): 300 | ''' 301 | Helper function to merge df_maps. 
302 | ''' 303 | for (term, df) in source.items(): 304 | if term not in target: target[term] = 0 305 | target[term] += df 306 | 307 | # Collect global stats 308 | self._N = 0 309 | self._df_map = {} 310 | name_to_docid_maps = {} 311 | for shard_id in range(len(self._shards)): 312 | shard = self._shards[shard_id] 313 | self._N += shard.get_local_N() 314 | merge_df_map(self._df_map, shard.get_local_df_map()) 315 | name_to_docid_maps[shard_id] = shard.get_name_to_docid_map() 316 | 317 | # Update our name <-> node_docid mapping 318 | for (shard_id, name_to_docid_map) in name_to_docid_maps.iteritems(): 319 | for (name, docid) in name_to_docid_map.iteritems(): 320 | gdocid = self.make_node_docid(shard_id, docid) 321 | self._name_to_docid_map[name] = gdocid 322 | self._docid_to_name_map[gdocid] = name 323 | 324 | def broadcast_node_stats(self): 325 | # Broadcast global stats. Only called by collection root node. 326 | for shard in self._shards: 327 | shard.set_global_N(self._N) 328 | shard.set_global_df_map(self._df_map) 329 | 330 | -------------------------------------------------------------------------------- /pysimsearch/sim_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2011, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 
13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ''' 31 | SimServer 32 | 33 | Server wrapper for pysimsearch modules. Currently, only provides access 34 | to sim_index. 
35 | 36 | *Sample session:* 37 | 38 | **Server** 39 | :: 40 | 41 | bash$ ./sim_server.py sim_index -p 9001 42 | Use Control-C to exit 43 | 44 | **jsonrpclib Client** 45 | 46 | >>> from pprint import pprint 47 | >>> import jsonrpclib 48 | >>> server = jsonrpclib.Server('http://localhost:9001/RPC2') 49 | >>> server.sim_index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu') 50 | >>> pprint(server.sim_index.query('university')) 51 | [[u'http://www.stanford.edu/', 0.10469570845856098], 52 | [u'http://www.ucla.edu', 0.04485065887313478], 53 | [u'http://www.berkeley.edu', 0.020464326883958977]] 54 | 55 | ** pysimsearch Client ** 56 | >>> from pprint import pprint 57 | >>> from pysimsearch import sim_index 58 | >>> index = sim_index.RemoteSimIndex('http://localhost:9001/RPC2') 59 | >>> index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu') 60 | >>> pprint(index.query('stanford')) 61 | [[u'http://www.stanford.edu/', 0.3612214953965162]] 62 | 63 | ''' 64 | 65 | from __future__ import (division, absolute_import, print_function, 66 | unicode_literals) 67 | 68 | # boilerplate to allow running as script 69 | if __name__ == "__main__" and __package__ is None: 70 | import sys, os 71 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 72 | sys.path.insert(0, parent_dir) 73 | import pysimsearch 74 | __package__ = str("pysimsearch") 75 | del sys, os 76 | 77 | # external modules 78 | import argparse 79 | import logging 80 | import traceback 81 | import types 82 | 83 | from pprint import pprint 84 | from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer as SimpleRPCServer 85 | from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCRequestHandler as SimpleRPCRequestHandler 86 | 87 | #from SimpleXMLRPCServer import SimpleXMLRPCServer as SimpleRPCServer 88 | #from SimpleXMLRPCServer import SimpleXMLRPCRequestHandler as SimpleRPCRequestHandler 89 | 90 | # our modules 91 | from 
# Guarded so this module can also be imported standalone (outside the
# package) without failing on the relative import; inside the package
# this behaves exactly as before.
try:
    from . import query_scorer
except ImportError:
    query_scorer = None


class SimIndexService(object):
    '''Expose a SimIndex instance as an RPC service.

    Only the whitelisted methods in :attr:`EXPORTED_METHODS` are
    callable, and every RPC method name must carry the
    ``PREFIX + '.'`` prefix (e.g. ``sim_index.query``).
    '''

    PREFIX = 'sim_index'

    # Whitelist of SimIndex methods reachable over RPC.
    # (Fixed: 'docid_to_name' was listed twice; harmless in a set,
    # but redundant.)
    EXPORTED_METHODS = {'index_urls',
                        'index_string_buffers',
                        'del_docids',
                        'docid_to_name',
                        'name_to_docid',
                        'postings_list',
                        'docids_with_terms',
                        'docnames_with_terms',
                        'set_query_scorer',
                        'query',
                        'set_global_N',
                        'get_local_N',
                        'set_global_df_map',
                        'get_local_df_map',
                        'get_name_to_docid_map',
                        'config',
                        'set_config',
                        'update_config'}

    def __init__(self, index):
        '''Wrap ``index`` (a SimIndex implementation) for RPC dispatch.'''
        self._sim_index = index

    def _dispatch(self, method, params):
        '''Dispatch an RPC call to the wrapped index.

        ``method`` must look like ``sim_index.<name>`` with ``<name>``
        in :attr:`EXPORTED_METHODS`.  ``params`` is either a positional
        argument list or a keyword argument dict, per JSON-RPC.

        Raises ``Exception`` for unknown or non-exported methods;
        any error raised by the underlying index method is logged with
        a traceback and re-raised.
        '''
        if not method.startswith(self.PREFIX + '.'):
            raise Exception(
                'method "{}" is not supported: bad prefix'.format(method))

        method_name = method.partition('.')[2]

        logging.info('_dispatch: {}'.format(method))

        if method_name not in self.EXPORTED_METHODS:
            raise Exception(
                'method "{}" is not supported'.format(method_name))

        func = getattr(self._sim_index, method_name)
        try:
            # Fixed: ``types.ListType`` exists only in Python 2; an
            # isinstance() check against ``list`` works on 2 and 3.
            if isinstance(params, list):
                r = func(*params)
            else:
                r = func(**params)
            # If we got back a generator, materialize a list so the
            # result can be serialized by the RPC layer.
            if isinstance(r, types.GeneratorType):
                r = list(r)
            return r
        except Exception as e:
            logging.error(traceback.format_exc())
            raise e

# Restrict to a particular path.
class RequestHandler(SimpleRPCRequestHandler):
    # Only answer RPC requests addressed to the /RPC2 path.
    rpc_paths = ('/RPC2',)

def start_sim_index_server(port,
                           backends=(),
                           remote_urls=(),
                           root=True,
                           logRequests=True):
    '''Start a JSON-RPC server exposing a SimIndex on localhost:``port``.

    ``backends`` are SimIndex instances to serve (sharded if more than
    one); ``remote_urls`` name additional remote SimIndex servers to
    wrap as shards via RemoteSimIndex.  If neither is given, a local
    in-memory index with a tfidf scorer is served.  ``root`` marks this
    node as the root of a SimIndexCollection tree.  Blocks in
    serve_forever() until interrupted with Control-C.
    '''
    server = SimpleRPCServer(('localhost', port),
                             logRequests=logRequests,
                             requestHandler=RequestHandler)

    backend_list = list(backends)
    if remote_urls:
        backend_list.extend(
            [RemoteSimIndex(url) for url in remote_urls])

    if backend_list:
        # One backend: serve it directly; several: shard across them
        # with a SimIndexCollection.  ConcurrentSimIndex wraps either
        # for thread-safe access.
        if len(backend_list) == 1:
            index = ConcurrentSimIndex(backend_list[0])
        else:
            index = ConcurrentSimIndex(
                SimIndexCollection(
                    shards=backend_list, root=root))
    else:
        # No backends specified: serve a fresh in-memory index.
        # NOTE(review): the scorer is only set on this branch —
        # presumably remote/explicit backends configure their own;
        # confirm before changing.
        index = ConcurrentSimIndex(MemorySimIndex())
        index.set_query_scorer('tfidf')

    server.register_instance(SimIndexService(index))

    try:
        print('Use Control-C to exit')
        server.serve_forever()
    except KeyboardInterrupt:
        print('Exiting')


# --- main() ---

def main():
    '''Command-line entry point: parse arguments and start the service.'''
    parser = argparse.ArgumentParser(
        description='Start a pysimsearch server')
    subparsers = parser.add_subparsers(title='services',
                                       description='valid services',
                                       dest='command',
                                       help='services help',)

    parser_sim_index = subparsers.add_parser('sim_index',
                                             help='Start a SimIndex')
    parser_sim_index.add_argument(
        '-p', '--port', nargs='?',
        default=9001, type=int,
        help='Specify server port'
    )

    parser_sim_index.add_argument(
        '-r', '--remote_shards', nargs='*',
        help='Specify remote backends to use, instead of local index'
    )

    parser_sim_index.add_argument(
        '--noroot', action='store_false',
        dest='root', default=True,
        help='True if this is the root index node'
    )

    args = parser.parse_args()
    if args.command == 'sim_index':
        start_sim_index_server(port=args.port,
                               remote_urls=args.remote_shards,
                               root=args.root)
    else:
        raise Exception('Unknown command: {}'.format(args.command))

if __name__ == '__main__':
    main()

#!/usr/bin/env python

# Copyright (c) 2010, Taher Haveliwala
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The names of project contributors may not be used to endorse or
#       promote products derived from this software without specific
#       prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | r''' 30 | Sample usage as a script:: 31 | 32 | $ python similarity.py http://www.stanford.edu/ http://www.berkeley.edu/ http://www.mit.edu/ 33 | Comparing files ['http://www.stanford.edu/', 'http://www.berkeley.edu/', 'http://www.mit.edu/'] 34 | sim(http://www.stanford.edu/,http://www.berkeley.edu/)=0.322771960247 35 | sim(http://www.stanford.edu/,http://www.mit.edu/)=0.142787018368 36 | sim(http://www.berkeley.edu/,http://www.mit.edu/)=0.248877629741 37 | 38 | ''' 39 | 40 | from __future__ import (division, absolute_import, print_function, 41 | unicode_literals) 42 | 43 | # boilerplate to allow running as script 44 | if __name__ == "__main__" and __package__ is None: 45 | import sys, os 46 | parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 47 | sys.path.insert(0, parent_dir) 48 | import pysimsearch 49 | __package__ = str("pysimsearch") 50 | del sys, os 51 | 52 | import argparse 53 | from itertools import chain, repeat 54 | import re 55 | 56 | # our modules 57 | from . 
from . import doc_reader
from .exceptions import *
from .term_vec import *


# --- top-level functions ---
def measure_similarity(file_a, file_b, sim_func=None):
    r'''Return the textual similarity of two documents.

    ``file_a`` and ``file_b`` are file-like objects; ``sim_func`` maps
    two term vectors to a similarity score and defaults to
    :func:`cosine_sim`.
    '''
    if sim_func is None:  # fixed: identity test for None, not '=='
        sim_func = cosine_sim  # default to cosine_sim

    return sim_func(term_vec(file_a), term_vec(file_b))

def pairwise_compare(filenames=None, urls=None):
    r'''
    Do a pairwise comparison of all documents specified by ``filenames``
    and ``urls`` and return a list of (name_a, name_b, similarity)
    tuples.
    '''
    # Pair each document name with the reader used to open it
    # (renamed from 'input', which shadowed the builtin).
    inputs = []
    if filenames is not None:
        inputs.extend(zip(filenames, repeat(doc_reader.get_text_file)))
    if urls is not None:
        inputs.extend(zip(urls, repeat(doc_reader.get_url)))

    similarities = []
    for i in range(0, len(inputs)):
        for j in range(i + 1, len(inputs)):
            (name_a, get_input_a) = inputs[i]
            (name_b, get_input_b) = inputs[j]
            print("comparing {} and {}".format(name_a, name_b))
            with get_input_a(name_a) as file_a:
                with get_input_b(name_b) as file_b:
                    similarities.append((name_a,
                                         name_b,
                                         measure_similarity(file_a, file_b)))
    return similarities

# --- Similarity measures ---

def cosine_sim(u, v):
    r'''
    Returns the cosine similarity of u,v: ``<u,v>/(|u||v|)``
    where ``|u|`` is the L2 norm
    '''
    return dot_product(u, v) / (l2_norm(u) * l2_norm(v))

def jaccard_sim(A, B):
    r'''
    Returns the Jaccard similarity of A,B: ``|A \cap B| / |A \cup B|``
    We treat A and B as multi-sets (The Jaccard coefficient is technically
    meant for sets, although it is easily extended to multi-sets)
    '''
    return mag_intersect(A, B) / mag_union(A, B)


# --- main() ---

def main():
    '''Command-line interface for measuring pairwise similarities of files.'''
    parser = argparse.ArgumentParser(
        description='List pairwise similarities of input documents')
    parser.add_argument('doc', nargs='*',
                        help='a document in the comparison list')
    parser.add_argument('-f', '--filename_list', nargs='?',
                        help='file containing list of filenames to compare')
    parser.add_argument('-u', '--url_list', nargs='?',
                        help='file containing list of urls to compare')

    args = parser.parse_args()

    def get_list(input_fname):
        '''Read one name per line from ``input_fname``; [] on error.'''
        # renamed from 'list', which shadowed the builtin
        names = []
        if input_fname is not None:
            try:
                with open(input_fname) as input_file:
                    names = [line.strip() for line in
                             input_file.readlines()]
            except IOError:
                print("Sorry, could not open " + input_fname)
        return names

    filenames = get_list(args.filename_list)
    urls = get_list(args.url_list)

    for doc in args.doc:
        # generalized to accept https:// URLs as well
        if re.search('^https?://', doc):
            urls.append(doc)
        else:
            # fixed: extend() added each *character* of the filename
            # as a separate entry; append() adds the whole name.
            filenames.append(doc)

    if len(filenames) + len(urls) < 2:
        raise Error("Sorry, you must specify at least two documents "
                    "to compare.")

    print('Comparing files {}'.format(list(chain(filenames, urls))))
    similarities = pairwise_compare(filenames=filenames, urls=urls)
    for (name_a, name_b, sim) in similarities:
        print('sim({0},{1})={2}'.format(name_a, name_b, sim))


if __name__ == '__main__':
    main()
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
'''
Term-vector operations

A term vector is a dict mapping term -> numeric weight (frequency).
'''

import io
import math

# Py2/py3-safe string type for isinstance checks: ``basestring`` only
# exists on Python 2 (the original check raised NameError on Python 3).
try:
    _string_types = basestring
except NameError:
    _string_types = str


def dot_product(v1, v2):
    '''Return the dot product of two term vectors.'''
    val = 0.0
    for term in v1:
        if term in v2:
            val += v1[term] * v2[term]
    return val

def l2_norm(v):
    '''Return the L2 (Euclidean) norm of term vector ``v``.'''
    return math.sqrt(sum(weight ** 2 for weight in v.values()))

def mag_union(A, B):
    '''
    Return the magnitude of the multiset-union of A and B
    (i.e., the sum of all weights in both vectors).
    '''
    return sum(A.values()) + sum(B.values())

def mag_intersect(A, B):
    '''
    Return the magnitude of the multiset-intersection of A and B
    (sum of min(A[t], B[t]) over shared terms).
    '''
    val = 0
    for term in A:
        if term in B:
            val += min(A[term], B[term])
    return val

def magnitude(v):
    '''Return the L2 norm of term vector v (identical to l2_norm()).'''
    return l2_norm(v)

def term_vec(input, stoplist=None, lowercase=False):
    '''
    Return a term vector for ``input`` as a {term: frequency} dict.

    ``input`` can be either a string or a file-like object.  Terms in
    ``stoplist`` are excluded (matched against the original case); if
    ``lowercase`` is true, surviving terms are lowercased.
    '''
    if isinstance(input, _string_types):
        # Fixed: forward stoplist/lowercase through the recursive call;
        # they were previously dropped whenever input was a string.
        with io.StringIO(input) as string_buffer:
            return term_vec(string_buffer,
                            stoplist=stoplist,
                            lowercase=lowercase)
    else:
        # default args:
        if stoplist is None:
            stoplist = set()

        tf_dict = {}
        for line in input:
            for term in line.split():
                # stoplist test intentionally precedes lowercasing,
                # matching the original behavior
                if term not in stoplist:
                    if lowercase:
                        term = term.lower()
                    tf_dict[term] = tf_dict.get(term, 0) + 1
        return tf_dict
-------------------------------------------------------------------------------- /pysimsearch/test/freq_tools_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2010, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | ''' 30 | Unittests for pysimsearch.freq_tools package 31 | 32 | To run unittests, run 'nosetests' from the test directory 33 | ''' 34 | from __future__ import(division, absolute_import, print_function, 35 | unicode_literals) 36 | 37 | import unittest 38 | 39 | import io 40 | import pprint 41 | 42 | from pysimsearch import freq_tools 43 | from pysimsearch import doc_reader 44 | 45 | class FreqToolsTest(unittest.TestCase): 46 | longMessage = True 47 | 48 | def test_read_df(self): 49 | '''read_df() test''' 50 | df_dict = {'a':5, 'b':3, 'c':1} 51 | df_file_str =\ 52 | ''' 53 | a 5 54 | b 3 55 | c 1 56 | ''' 57 | df_file = io.StringIO(df_file_str) 58 | self.assertEqual(freq_tools.read_df(df_file), df_dict) 59 | 60 | def test_write_df(self): 61 | '''write_df() test''' 62 | df_dict = {'a':5, 'b':3, 'c':1} 63 | df_file = io.StringIO() 64 | freq_tools.write_df(df_dict, df_file) 65 | 66 | df_file.seek(0) 67 | self.assertEqual(freq_tools.read_df(df_file), df_dict) 68 | 69 | def test_compute_df(self): 70 | doc1 = 'a b b c d e e e e f' 71 | doc2 = ' b e g g g h i' 72 | doc3 = ' b b b b c d h ' 73 | 74 | df_dict = {'a':1, 'b':3, 'c':2, 'd':2, 'e':2, 'f':1, 'g':1, 'h':2, 75 | 'i':1} 76 | 77 | files = (io.StringIO(doc1), io.StringIO(doc2), io.StringIO(doc3)) 78 | self.assertEqual(freq_tools.compute_df(files), df_dict) 79 | 80 | 81 | if __name__ == "__main__": 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /pysimsearch/test/sim_index_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2010, Taher Haveliwala 4 | # All rights reserved. 
5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
'''
Unittests for pysimsearch.sim_index package

To run unittests, run 'nosetests' from the test directory
'''
from __future__ import(division, absolute_import, print_function,
                       unicode_literals)

import unittest

import io
import math
import sys
import time
from multiprocessing import Process
from pprint import pprint

from pysimsearch import term_vec
from pysimsearch.sim_index import MemorySimIndex
from pysimsearch.sim_index import ShelfSimIndex
from pysimsearch.sim_index import ConcurrentSimIndex
from pysimsearch.sim_index import SimIndexCollection
from pysimsearch.sim_index import RemoteSimIndex
from pysimsearch import sim_server

class SimIndexTest(object):
    '''
    Provides common tests for different implementations of the SimIndex
    interface.

    To test a concrete implementation of SimIndex, must sublcass SimIndexTest,
    and also inherit unittest.TestCase. SimIndexTest intentionally does not
    inherit unittest.TestCase, as it is only an abstract class that cannot be
    instantiated and tested separately from an implementation.
    '''
    longMessage = True

    # Concrete subclasses assign their SimIndex implementation here.
    sim_index = None

    def setUp(self):
        # Load the stoplist and index the shared test corpus below.
        with io.StringIO(self.stopfile_buffer) as stopfile:
            self.sim_index.load_stoplist(stopfile)

        self.sim_index.index_string_buffers(self.docs)

    # Stopword list
    stopfile_buffer = "stopword1 stopword2"

    # Test documents
    docs = ( ('doc1', "hello there world hello stopword1"),
             ('doc2', "hello world stopword2"),
             ('doc3', "hello there bob ") )

    # Postings that correspond to test documents
    golden_postings = { 'hello': {'doc1': 2, 'doc2': 1, 'doc3': 1},
                        'there': {'doc1': 1, 'doc3': 1},
                        'world': {'doc1': 1, 'doc2': 1},
                        'bob': {'doc3': 1},
                        'nobody': {},
                        '': {}}

    # Golden hits data (Conjunctive: Requires presence of all terms)
    #
    # We can reuse golden_postings to provide some test input here
    golden_conj_hits = { term: set(postings.keys())
                         for (term, postings) in golden_postings.items() }
    # and of course throw in some multiword queries as well
    golden_conj_hits.update({ "hello there": {'doc1', 'doc3'},
                              "there world": {'doc1'},
                              "hello world": {'doc1', 'doc2'} })

    # Golden hits data for SimpleCountQueryScorer (frequencies are simple
    # match-counts between query terms and document terms).
    # (Disjunctive: requires any term to be present)
    #
    # We can reuse golden_postings to provide some test input here
    golden_scored_hits = { term: docnames
                           for (term, docnames) in golden_postings.items() }
    # and of course throw in some multiword queries as well
    golden_scored_hits.update({ "hello there": {'doc1': 3, 'doc2': 1, 'doc3': 2},
                                "there world": {'doc1': 2, 'doc2': 1, 'doc3': 1},
                                "hello world": {'doc1': 3, 'doc2': 2, 'doc3': 1} })

    def get_golden_hits_cos(self):
        '''Manually computes cosine scores for test set to create golden results'''
        # NOTE(review): '^' is bitwise XOR in Python, so 2^2 == 0 and
        # d1_len is sqrt(2) here, even though doc1 contains 'hello'
        # twice (2**2 would give sqrt(6)).  Whether this matches the
        # tfidf scorer's document-length normalization cannot be
        # confirmed from this file -- verify against the scorer before
        # "fixing".
        d1_len = math.sqrt(2^2 + 1 + 1)
        d2_len = math.sqrt(1 + 1)
        d3_len = math.sqrt(1 + 1 + 1)
        N = 3
        # idf values; note hello appears in all 3 docs, so its idf is 0.
        hello_idf = math.log(N/3)
        there_idf = math.log(N/2)
        world_idf = math.log(N/2)
        bob_idf = math.log(N/1)
        r = ({ "hello there": {'doc1': hello_idf * 2 / d1_len + there_idf / d1_len,
                               'doc2': hello_idf / d2_len,
                               'doc3': hello_idf / d3_len + there_idf / d3_len},
               "there world": {'doc1': there_idf / d1_len + world_idf / d1_len,
                               'doc2': world_idf / d2_len,
                               'doc3': there_idf / d3_len},
               "hello world": {'doc1': hello_idf * 2 / d1_len + world_idf / d1_len,
                               'doc2': hello_idf / d2_len + world_idf / d2_len,
                               'doc3': hello_idf / d3_len} })
        pprint(r)
        return r

    def test_docname_docid_translation(self):
        '''Test docname_to_docid()/docid_to_docname() using known data'''

        for (docname, doc) in self.docs:
            self.assertEqual(docname,
                             self.sim_index.docid_to_name(
                                 self.sim_index.name_to_docid(docname)))

    def test_postings_list(self):
        '''Test postings_list() using known data

        We use sets instead of lists to more easily allow equality
        comparison with golden data.
        '''

        for term in self.golden_postings:
            # translate docids back to docnames before comparing
            translated_postings = {
                self.sim_index.docid_to_name(docid): freq
                for (docid, freq) in
                    self.sim_index.postings_list(term)
            }
            self.assertEqual(translated_postings,
                             self.golden_postings[term])

    def test_stoplist(self):
        '''Test stoplist functionality'''
        for term in self.stopfile_buffer.split():
            print("stopword={}".format(term))
            # stopwords must never appear in the index
            self.assertEqual(list(self.sim_index.postings_list(term)), [])

    def test_docnames_with_terms(self):
        '''Test docnames_with_terms() using known data

        We use sets instead of lists to more easily allow equality
        comparison with golden data.
        '''

        # We unpack the golden hit lists, construct a golden set of docnames
        # for the hits, and compare with sim_index.docnames_with_terms()
        for (query, golden_doc_hits) in self.golden_conj_hits.items():
            query_vec = term_vec.term_vec(query)
            terms = [term for (term, freq) in query_vec.items()]

            self.assertEqual(golden_doc_hits,
                             set(self.sim_index.docnames_with_terms(*terms)))

    def test_query_simple_scorer(self):
        '''Test query() with simple_scorer using known data.

        Uses SimpleCountQueryScorer for scoring.
        '''
        self.sim_index.set_query_scorer('simple_count')
        for (query, golden_doc_hits) in self.golden_scored_hits.items():
            self.assertEqual(golden_doc_hits,
                             dict(self.sim_index.query(query)),
                             msg = "query={}".format(query))

    def test_query_tfidf_scorer(self):
        '''Test query() with tfidf using known data.

        Uses TFIDFQueryScorer for scoring.
        '''
        self.sim_index.set_query_scorer('tfidf')
        for (query, golden_doc_hits_cos) in self.get_golden_hits_cos().items():
            results = self.sim_index.query(query)
            # float scores: compare with assertAlmostEqual
            for (docname, score) in results:
                self.assertAlmostEqual(score,
                                       golden_doc_hits_cos[docname],
                                       msg="results={}".format(str(results)))

    def test_del_docids(self):
        '''Test del_docids()'''
        # Tests whose results change if the corpus changes.
        retest_list = (self.test_docnames_with_terms,
                       self.test_query_simple_scorer,
                       self.test_query_tfidf_scorer,)

        # Make sure that the selected tests already pass (just for clarity)
        for test in retest_list:
            test()

        # Add an extra doc to the index
        self.sim_index.index_string_buffers( (('extra_doc', "hello world"),) )

        # Make sure that selected tests fail when we add an extra 'unexpected'
        # doc to the index
        for test in retest_list:
            self.assertRaises(AssertionError, test)

        # Delete the extra doc
        docid = self.sim_index.name_to_docid('extra_doc')
        print('extra docid={}'.format(docid))
        self.sim_index.del_docids(docid)

        # Now make sure that the selected tests pass again
        for test in retest_list:
            test()

    def test_config(self):
        '''Ensure that various config params are properly handled'''

        ### Test 'lowercase' param

        def _check_lc(index, golden_results):
            '''helper that checks index against golden_results'''
            for (term, golden_docs) in golden_results:
                self.assertEqual(
                    set(index.docnames_with_terms(term)), golden_docs)
                self.assertEqual(
                    set([doc for (doc, score) in index.query(term)]), golden_docs)

        # test data
        test_docs = (('doc1', 'Hello There'),
                     ('doc2', 'hello there'))

        # lowercase=True: queries match case-insensitively
        index = MemorySimIndex()
        index.set_config('lowercase', True)
        index.index_string_buffers(test_docs)
        golden_results = (('hello', {'doc1', 'doc2'}),
                          ('Hello', {'doc1', 'doc2'}),
                          ('HELLO', {'doc1', 'doc2'}))
        _check_lc(index, golden_results)

        # lowercase=False: queries are case-sensitive
        index = MemorySimIndex()
        index.set_config('lowercase', False)
        index.index_string_buffers(test_docs)
        golden_results = (('hello', {'doc2'}),
                          ('Hello', {'doc1'}),
                          ('HELLO', set()))
        _check_lc(index, golden_results)

class MemorySimIndexTest(SimIndexTest, unittest.TestCase):
    '''
    All tests hitting the SimIndex interface are in the parent class, SimIndexTest

    Tests for api's not in parent class are tested separately here. This is
    so we can reuse test code across all implementations of SimIndex.
    '''

    def setUp(self):
        print("MemorySimIndexTest")
        self.sim_index = MemorySimIndex()
        super(MemorySimIndexTest, self).setUp()

    def tearDown(self):
        pass

    def test_save_load(self):
        '''Test save()/load() functionality'''
        # round-trip the index through an in-memory byte stream
        with io.BytesIO() as output:
            self.sim_index.save(output)
            output.seek(0)
            loaded_sim_index = MemorySimIndex.load(output)
        self.sim_index = loaded_sim_index
        self.test_query_simple_scorer()  # make sure test_query() still works

class ShelfSimIndexTest(SimIndexTest, unittest.TestCase):
    '''
    All tests hitting the SimIndex interface are in the parent class, SimIndexTest

    Tests for api's not in parent class are tested separately here. This is
    so we can reuse test code across all implementations of SimIndex.
    '''

    def setUp(self):
        print("ShelfSimIndexTest")
        # 'n' flag: always create a new, empty database
        self.sim_index = ShelfSimIndex("/tmp/test_dbm", 'n')
        super(ShelfSimIndexTest, self).setUp()

    def tearDown(self):
        self.sim_index.close()

class ConcurrentSimIndexTest(SimIndexTest, unittest.TestCase):
    '''
    All tests hitting the SimIndex interface are in the parent class, SimIndexTest

    Tests for api's not in parent class are tested separately here. This is
    so we can reuse test code across all implementations of SimIndex.
    '''

    def setUp(self):
        print("ConcurrentSimIndexTest")
        self.sim_index = ConcurrentSimIndex(MemorySimIndex())
        super(ConcurrentSimIndexTest, self).setUp()

    def tearDown(self):
        pass

class SimIndexCollectionTest(SimIndexTest, unittest.TestCase):
    '''
    All tests hitting the SimIndex interface are in the parent class, SimIndexTest

    Tests for api's not in parent class are tested separately here. This is
    so we can reuse test code across all implementations of SimIndex.
    '''

    def setUp(self):
        print("SimIndexCollectionTest")
        # two local in-memory shards
        self.sim_index = SimIndexCollection()
        for i in range(2):
            self.sim_index.add_shards(MemorySimIndex())

        super(SimIndexCollectionTest, self).setUp()

    def tearDown(self):
        pass


class SimIndexRemoteCollectionTest(SimIndexTest, unittest.TestCase):
    '''
    All tests hitting the SimIndex interface are in the parent class, SimIndexTest

    Tests for api's not in parent class are tested separately here. This is
    so we can reuse test code across all implementations of SimIndex.
    '''

    # child server processes spawned by setUp(), terminated in tearDown()
    processes = None

    def setUp(self):
        # setUpClass() may be more efficient for spinning up the servers,
        # but this way is more robust (since we'll start each test from a
        # clean slate). Otherwise we'd need clear() functionality added.

        print("SimIndexRemoteCollectionTest")

        # We will create a collection tree of the form:
        #
        #       Root
        #       /  \
        #      A    B
        #     /\    /\
        #    1  2  3  4
        self.processes = []

        # start leaves
        for i in range(4):
            port = 9100 + i
            process = Process(target=sim_server.start_sim_index_server,
                              kwargs={'port': port, 'logRequests': False})
            process.daemon = True
            process.start()
            self.processes.append(process)

        print("Waiting for leaf servers to start")
        time.sleep(0.1)

        leaf_nodes = [[],[]]
        for i in range(4):
            port = 9100 + i
            # leaves 0,1 feed interior node A; leaves 2,3 feed node B
            leaf_nodes[i//2].append(RemoteSimIndex(
                "http://localhost:{}/RPC2".format(port)))

        # start interior nodes (A, B)
        for i in range(2):
            port = 9200 + i
            process = Process(
                target=sim_server.start_sim_index_server,
                kwargs={ 'port': port,
                         'backends': leaf_nodes[i],
                         'root': False,
                         'logRequests': False
                       }
                )
            process.daemon = True
            process.start()
            self.processes.append(process)

        print("Waiting for intermediate servers to start")
        time.sleep(0.1)

        interior_nodes = []
        for i in range(2):
            port = 9200 + i
            interior_nodes.append(
                RemoteSimIndex("http://localhost:{}/RPC2".format(port)))

        # root node
        self.sim_index = SimIndexCollection(root=True)
        self.sim_index.add_shards(*interior_nodes)

        super(SimIndexRemoteCollectionTest, self).setUp()

    def tearDown(self):
        # kill the spawned server processes
        for process in self.processes:
            process.terminate()
        time.sleep(0.1)


if __name__ == "__main__":
    unittest.main()
| #!/usr/bin/env python 2 | 3 | # Copyright (c) 2010, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | 29 | ''' 30 | Unittests for pysimsearch.similarity package 31 | 32 | To run unittests, run 'nosetests' from the test directory 33 | ''' 34 | from __future__ import(division, absolute_import, print_function, 35 | unicode_literals) 36 | 37 | import unittest 38 | 39 | import io 40 | from itertools import combinations 41 | import math 42 | 43 | from pysimsearch import similarity 44 | 45 | class SimilarityTest(unittest.TestCase): 46 | longMessage = True 47 | 48 | def setUp(self): 49 | pass 50 | 51 | def tearDown(self): 52 | pass 53 | 54 | def test_measure_similarity(self): 55 | ''' 56 | measure_similarity() should give known results for known inputs 57 | ''' 58 | 59 | testdata = { 60 | 'testdata_1': "hello", 61 | 'testdata_2': "hello", 62 | 'testdata_3': "world", 63 | 'testdata_4': "hello world", 64 | } 65 | expected_sims = { 66 | ('testdata_1', 'testdata_2'): 1, 67 | ('testdata_1', 'testdata_3'): 0, 68 | ('testdata_1', 'testdata_4'): (1 / math.sqrt(2)), 69 | ('testdata_2', 'testdata_3'): 0, 70 | ('testdata_2', 'testdata_4'): (1 / math.sqrt(2)), 71 | ('testdata_3', 'testdata_4'): (1 / math.sqrt(2)), 72 | } 73 | 74 | for (fname_a, fname_b) in combinations(sorted(testdata.keys()), 2): 75 | print('Comparing {0},{1}'.format(fname_a, fname_b)) 76 | with io.StringIO(testdata[fname_a]) as file_a: 77 | with io.StringIO(testdata[fname_b]) as file_b: 78 | sim = similarity.measure_similarity(file_a, file_b) 79 | self.assertAlmostEqual( 80 | sim, expected_sims[(fname_a, fname_b)], 81 | places = 5, 82 | msg = 'Mismatch for pair {0}: got {1}, expected {2}'. 
83 | format((fname_a, fname_b), sim, 84 | expected_sims[(fname_a, fname_b)])) 85 | 86 | def test_cosine_sim(self): 87 | '''cosine_sim() test using known inputs''' 88 | u = {'a':1, 'b':2, 'c':5} 89 | v = {'a':1, 'c':2, 'd':3} 90 | 91 | self.assertEqual(similarity.cosine_sim(u, v), 11 / (math.sqrt(30) * math.sqrt(14))) 92 | 93 | def test_jaccard_sim(self): 94 | '''jaccard_sim() test using known inputs''' 95 | A = {'a':1, 'b':2, 'c':5} 96 | B = {'a':1, 'c':2, 'd':3} 97 | 98 | self.assertEqual(similarity.jaccard_sim(A, B), 3 / 14) 99 | 100 | 101 | if __name__ == "__main__": 102 | unittest.main() 103 | -------------------------------------------------------------------------------- /pysimsearch/test/term_vec_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2010, Taher Haveliwala 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * The names of project contributors may not be used to endorse or 14 | # promote products derived from this software without specific 15 | # prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | ''' 30 | Unittests for pysimsearch.term_vec package 31 | 32 | To run unittests, run 'nosetests' from the test directory 33 | ''' 34 | from __future__ import(division, absolute_import, print_function, 35 | unicode_literals) 36 | 37 | import unittest 38 | 39 | import math 40 | import os 41 | import pprint 42 | 43 | from pysimsearch import term_vec 44 | 45 | class TermVecTest(unittest.TestCase): 46 | longMessage = True 47 | 48 | def test_dot_product(self): 49 | '''dot_product() test using known inputs''' 50 | v1 = {'a':1, 'b':2, 'c':0.5} 51 | v2 = {'a':2, 'c':2, 'd':100} 52 | 53 | self.assertEqual(term_vec.dot_product(v1, v2), 3) 54 | 55 | def test_l2_norm(self): 56 | '''l2_norm() test using known inputs''' 57 | v = {'a':1, 'b':2, 'c':5} 58 | 59 | self.assertEqual(term_vec.l2_norm(v), math.sqrt(1 + 2**2 + 5**2)) 60 | 61 | def test_magnitude(self): 62 | '''magnitude() test using known inputs''' 63 | v = {'a':1, 'b':2, 'c':5} 64 | 65 | self.assertEqual(term_vec.l2_norm(v), math.sqrt(1 + 2**2 + 5**2)) 66 | 67 | def test_mag_union(self): 68 | '''mag_union() test using known inputs''' 69 | A = {'a':1, 'b':2, 'c':5} 70 | B = {'a':1, 'c':2, 'd':3} 71 | 72 | self.assertEqual(term_vec.mag_union(A, B), 14) 73 | 74 | def test_mag_intersect(self): 75 | '''mag_intersect() test using known inputs''' 76 | A = {'a':1, 'b':2, 'c':5} 77 | B = {'a':1, 'c':2, 'd':3} 78 | 79 | self.assertEqual(term_vec.mag_intersect(A, B), 3) 
80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /sample.py: -------------------------------------------------------------------------------- 1 | from __future__ import(division, absolute_import, print_function, 2 | unicode_literals) 3 | 4 | from multiprocessing import Process 5 | import time 6 | 7 | from pprint import pprint 8 | 9 | from pysimsearch.sim_index import MemorySimIndex 10 | from pysimsearch.sim_index import RemoteSimIndex 11 | from pysimsearch.sim_index import SimIndexCollection 12 | from pysimsearch import similarity 13 | from pysimsearch import sim_server 14 | 15 | def sample_similarity(): 16 | # Compare web-page similarities 17 | print() 18 | print("Printing pairwise similarities of university homepages") 19 | similarities = similarity.pairwise_compare( 20 | urls=['http://www.stanford.edu/', 21 | 'http://www.berkeley.edu/', 22 | 'http://www.ucla.edu', 23 | 'http://www.mit.edu/']) 24 | pprint(similarities) 25 | 26 | def sample_sim_index(): 27 | # Create an in-memory index and query it 28 | print() 29 | print("Creating in-memory index of university homepages") 30 | sim_index = MemorySimIndex() 31 | sim_index.index_urls('http://www.stanford.edu/', 32 | 'http://www.berkeley.edu', 33 | 'http://www.ucla.edu', 34 | 'http://www.mit.edu') 35 | 36 | print("Postings list for 'university':") 37 | pprint(sim_index.postings_list('university')) 38 | print("Pages containing terms 'university' and 'california'") 39 | pprint(list(sim_index.docnames_with_terms('university', 'california'))) 40 | 41 | # Issue some similarity queries 42 | print() 43 | print("Similarity search for query 'stanford university' (simple scorer)") 44 | sim_index.set_query_scorer('simple_count') 45 | pprint(list(sim_index.query("stanford university"))) 46 | 47 | print() 48 | print("Similarity search for query 'stanford university' (tf.idf scorer)") 49 | sim_index.set_query_scorer('tfidf') 50 | 
pprint(list(sim_index.query("stanford university"))) 51 | 52 | # Save the index to disk, then load it back in 53 | print() 54 | print("Saving index to disk") 55 | with open("myindex.idx", "w") as index_file: 56 | sim_index.save(index_file) 57 | 58 | print() 59 | print("Loading index from disk") 60 | with open("myindex.idx", "r") as index_file: 61 | sim_index2 = MemorySimIndex.load(index_file) 62 | 63 | print() 64 | print("Pages containing terms 'university' and 'california' in loaded index") 65 | pprint(list(sim_index2.docnames_with_terms('university', 'california'))) 66 | 67 | def sample_sim_index_collection(): 68 | # SimIndexCollection 69 | print() 70 | print("SimIndexCollection: build a collection, index some urls, and query it") 71 | indexes = (MemorySimIndex(), MemorySimIndex()) 72 | index_coll = SimIndexCollection() 73 | index_coll.add_shards(*indexes) 74 | index_coll.set_query_scorer('tfidf') 75 | index_coll.index_urls('http://www.stanford.edu/', 76 | 'http://www.berkeley.edu', 77 | 'http://www.ucla.edu', 78 | 'http://www.mit.edu') 79 | 80 | pprint(index_coll.query('stanford university')) 81 | 82 | def sample_remote_indexes(): 83 | print() 84 | print("SimIndexCollection with remote backend indexes") 85 | 86 | processes = [] 87 | for i in range(2): 88 | port = 9000 + i 89 | process = Process(target=sim_server.start_sim_index_server, 90 | kwargs={'port': port, 'logRequests': False}) 91 | process.daemon = True 92 | processes.append(process) 93 | 94 | for process in processes: 95 | process.start() 96 | 97 | print("Waiting for servers to start") 98 | time.sleep(1) 99 | 100 | remote_index_coll = SimIndexCollection() 101 | for i in range(2): 102 | port = 9000 + i 103 | remote_index_coll.add_shards( 104 | RemoteSimIndex("http://localhost:{}/RPC2".format(port))) 105 | 106 | remote_index_coll.set_query_scorer('tfidf') 107 | 108 | remote_index_coll.index_urls('http://www.stanford.edu/', 109 | 'http://www.berkeley.edu', 110 | 'http://www.ucla.edu', 111 | 
'http://www.mit.edu') 112 | 113 | pprint(remote_index_coll.query('stanford university')) 114 | 115 | for process in processes: 116 | process.terminate() 117 | 118 | if __name__ == '__main__': 119 | sample_similarity() 120 | sample_sim_index() 121 | sample_sim_index_collection() 122 | sample_remote_indexes() 123 | pprint('done!') 124 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup( 3 | name = "pysimsearch", 4 | packages = ["pysimsearch", "pysimsearch.sim_index", "pysimsearch.test"], 5 | version = "0.32", 6 | description = "Similarity-search library", 7 | author = "Taher Haveliwala", 8 | author_email = "oss@taherh.org", 9 | url = "https://github.com/taherh/pysimsearch", 10 | download_url = "https://github.com/downloads/taherh/pysimsearch/pysimsearch-0.32.tar.gz", 11 | keywords = ["similarity"], 12 | requires = ["httplib2", "lxml", "jsonrpclib", "futures"], 13 | license = "BSD License", 14 | classifiers = [ 15 | "Programming Language :: Python", 16 | "License :: OSI Approved :: BSD License", 17 | "Operating System :: OS Independent" 18 | ], 19 | long_description = '''\ 20 | Similarity-Search Library 21 | ------------------------- 22 | 23 | Requires Python v2.7.1 or higher 24 | Library for measuring textual similarity of files and web pages and 25 | building similarity indexes. Primarily for pedagogical purposes. 26 | ''' 27 | ) 28 | --------------------------------------------------------------------------------