├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── doc
    ├── Makefile
    ├── conf.py
    ├── doc_reader.rst
    ├── freq_tools.rst
    ├── index.rst
    ├── query_scorer.rst
    ├── sample.py
    ├── sample.rst
    ├── sim_index.rst
    ├── sim_index
    │   ├── concurrent_sim_index.rst
    │   ├── map_sim_index.rst
    │   ├── memory_sim_index.rst
    │   ├── remote_sim_index.rst
    │   ├── shelf_sim_index.rst
    │   ├── sim_index.rst
    │   └── sim_index_collection.rst
    ├── sim_server.rst
    ├── similarity.rst
    ├── static
    │   └── custom.css
    └── term_vec.rst
├── pysimsearch
    ├── __init__.py
    ├── doc_reader.py
    ├── exceptions.py
    ├── freq_tools.py
    ├── query_scorer.py
    ├── sim_index
    │   ├── __init__.py
    │   ├── concurrent_sim_index.py
    │   ├── map_sim_index.py
    │   ├── memory_sim_index.py
    │   ├── remote_sim_index.py
    │   ├── shelf_sim_index.py
    │   ├── sim_index.py
    │   └── sim_index_collection.py
    ├── sim_server.py
    ├── similarity.py
    ├── term_vec.py
    └── test
    │   ├── __init__.py
    │   ├── freq_tools_test.py
    │   ├── sim_index_test.py
    │   ├── similarity_test.py
    │   └── term_vec_test.py
├── sample.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.idx
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010, Taher Haveliwala
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 |     * Redistributions of source code must retain the above copyright
 7 |       notice, this list of conditions and the following disclaimer.
 8 |     * Redistributions in binary form must reproduce the above copyright
 9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |     * The names of project contributors may not be used to endorse or 
12 |       promote products derived from this software without specific
13 |       prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 | HOLDER OR CONTRIBUTOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include sample.py
4 | recursive-include docs *.html *.css *.png *.gif
5 | recursive-include doc *
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | pysimsearch
  2 | ===========
  3 | 
  4 | Python library for indexing and similarity-search.
  5 | 
  6 | Full documentation is at http://taherh.github.com/pysimsearch/
  7 | 
  8 | This library is primarily meant to illustrate the basic workings of similarity
  9 | and indexing engines, without focusing heavily on optimization.  Certain
 10 | patterns used for scaling indexes (e.g., distributed indexes) are included.
 11 | 
 12 | Although the code is currently for Python 2.7 series, we use ``__future__``
 13 | imports to match Python 3 as closely as possible.
 14 | 
 15 | If you are interested in learning more about search and information retrieval,
 16 | I highly recommend the following two books:
 17 | 
 18 | * [Managing Gigabytes](http://amzn.to/qg6Zhe), by Witten, Moffat, and Bell
 19 | * [Introduction to Information Retrieval](http://amzn.to/oz2O27), by Manning, Schütze, and Raghavan
 20 | 
 21 | Sample command-line usage
 22 | -------------------------
 23 | 
 24 | Compute pair-wise similarity of 3 webpages:
 25 | 
 26 |     bash$ python pysimsearch/similarity.py http://www.stanford.edu/ http://www.berkeley.edu/ http://www.mit.edu/
 27 |     Comparing files ['http://www.stanford.edu/', 'http://www.berkeley.edu/', 'http://www.mit.edu/']
 28 |     sim(http://www.stanford.edu/,http://www.berkeley.edu/)=0.322771960247
 29 |     sim(http://www.stanford.edu/,http://www.mit.edu/)=0.142787018368
 30 |     sim(http://www.berkeley.edu/,http://www.mit.edu/)=0.248877629741
 31 | 
 32 | Sample API usage
 33 | ----------------
 34 | 
 35 |     from __future__ import(division, absolute_import, print_function,
 36 |                            unicode_literals)
 37 |     
 38 |     from pprint import pprint
 39 |     from pysimsearch.sim_index import MemorySimIndex
 40 |     from pysimsearch import doc_reader
 41 |     from pysimsearch import similarity
 42 |         
 43 |     # Compare web-page similarities
 44 |     print("Printing pairwise similarities of university homepages")
 45 |     pprint(similarity.pairwise_compare(urls=['http://www.stanford.edu/',
 46 |                                              'http://www.berkeley.edu/',
 47 |                                              'http://www.ucla.edu',
 48 |                                              'http://www.mit.edu/']))
 49 |     
 50 |     # Create an in-memory index and query it
 51 |     print("Creating in-memory index of university homepages")
 52 |     sim_index = MemorySimIndex()
 53 |     sim_index.index_urls('http://www.stanford.edu/',
 54 |                          'http://www.berkeley.edu',
 55 |                          'http://www.ucla.edu',
 56 |                          'http://www.mit.edu')
 57 |                               
 58 |     print("Postings list for 'university':")
 59 |     pprint(sim_index.postings_list('university'))
 60 |     print("Pages containing terms 'university' and 'california'")
 61 |     pprint(list(sim_index.docnames_with_terms('university', 'california')))
 62 |        
 63 |     # Issue some similarity queries
 64 |     print("Similarity search for query 'stanford university'")
 65 |     sim_index.set_query_scorer('simple_count')
 66 |     pprint(list(sim_index.query('stanford university')))
 67 | 
 68 | 
 69 | Sample Client/Server Usage via JSON api
 70 | ---------------------------------------
 71 | 
 72 | *Server*
 73 | 
 74 |     bash$ ./sim_server.py sim_index -p 9001
 75 |     Use Control-C to exit
 76 | 
 77 | *Client*
 78 | 
 79 |     >>> from pprint import pprint
 80 |     >>> import jsonrpclib
 81 |     >>> server = jsonrpclib.Server('http://localhost:9001/RPC2')
 82 |     >>> server.sim_index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu')
 83 |     >>> pprint(server.sim_index.query('stanford university'))
 84 |     [[u'http://www.stanford.edu', 0.4396892551666724],
 85 |      [u'http://www.berkeley.edu', 0.0],
 86 |      [u'http://www.ucla.edu', 0.0]]
 87 | 
 88 | 
 89 | Sample SimIndexCollection Usage
 90 | -------------------------------
 91 | 
 92 | *Server*
 93 | 
 94 |     bash$ ./sim_server.py sim_index -p 9001 &
 95 |     bash$ ./sim_server.py sim_index -p 9002 &
 96 | 
 97 | *SimIndexCollection*
 98 | 
 99 |     >>> from pprint import pprint
100 |     >>> from pysimsearch.sim_index import SimIndexCollection
101 |     >>> from pysimsearch.sim_index import RemoteSimIndex
102 |     >>> servers = [
103 |                     RemoteSimIndex('http://localhost:9001/RPC2'),
104 |                     RemoteSimIndex('http://localhost:9002/RPC2')
105 |                   ]
106 |     >>> index_coll = SimIndexCollection()
107 |     >>> index_coll.add_shards(*servers)
108 |     >>> index_coll.set_query_scorer('tfidf')
109 |     >>> index_coll.index_urls('http://www.stanford.edu/',
110 |                               'http://www.berkeley.edu',
111 |                               'http://www.ucla.edu',
112 |                               'http://www.mit.edu')
113 |     >>> pprint(index_coll.query("stanford university"))
114 |     [[u'http://www.stanford.edu/', 0.5836102697341475],
115 |      [u'http://www.ucla.edu', 0.012839879268194701],
116 |      [u'http://www.berkeley.edu', 0.005337522642134812]]
117 | 


--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | 
 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
 16 | 
 17 | help:
 18 | 	@echo "Please use \`make <target>' where <target> is one of"
 19 | 	@echo "  html       to make standalone HTML files"
 20 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 21 | 	@echo "  singlehtml to make a single large HTML file"
 22 | 	@echo "  pickle     to make pickle files"
 23 | 	@echo "  json       to make JSON files"
 24 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 25 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 26 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 27 | 	@echo "  epub       to make an epub"
 28 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 29 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 30 | 	@echo "  text       to make text files"
 31 | 	@echo "  man        to make manual pages"
 32 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 33 | 	@echo "  linkcheck  to check all external links for integrity"
 34 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 35 | 
 36 | clean:
 37 | 	-rm -rf $(BUILDDIR)/*
 38 | 
 39 | html:
 40 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 41 | 	@echo
 42 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 43 | 
 44 | dirhtml:
 45 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 48 | 
 49 | singlehtml:
 50 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 53 | 
 54 | pickle:
 55 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 56 | 	@echo
 57 | 	@echo "Build finished; now you can process the pickle files."
 58 | 
 59 | json:
 60 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the JSON files."
 63 | 
 64 | htmlhelp:
 65 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 66 | 	@echo
 67 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 68 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 69 | 
 70 | qthelp:
 71 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 72 | 	@echo
 73 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 74 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 75 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PySimSearch.qhcp"
 76 | 	@echo "To view the help file:"
 77 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PySimSearch.qhc"
 78 | 
 79 | devhelp:
 80 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 81 | 	@echo
 82 | 	@echo "Build finished."
 83 | 	@echo "To view the help file:"
 84 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/PySimSearch"
 85 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PySimSearch"
 86 | 	@echo "# devhelp"
 87 | 
 88 | epub:
 89 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 90 | 	@echo
 91 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 92 | 
 93 | latex:
 94 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
 95 | 	@echo
 96 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
 97 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 98 | 	      "(use \`make latexpdf' here to do that automatically)."
 99 | 
100 | latexpdf:
101 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
102 | 	@echo "Running LaTeX files through pdflatex..."
103 | 	make -C $(BUILDDIR)/latex all-pdf
104 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
105 | 
106 | text:
107 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
108 | 	@echo
109 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
110 | 
111 | man:
112 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
113 | 	@echo
114 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
115 | 
116 | changes:
117 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
118 | 	@echo
119 | 	@echo "The overview file is in $(BUILDDIR)/changes."
120 | 
121 | linkcheck:
122 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
123 | 	@echo
124 | 	@echo "Link check complete; look for any errors in the above output " \
125 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
126 | 
127 | doctest:
128 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
129 | 	@echo "Testing of doctests in the sources finished, look at the " \
130 | 	      "results in $(BUILDDIR)/doctest/output.txt."
131 | 


--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # PySimSearch documentation build configuration file, created by
  4 | # sphinx-quickstart on Mon Jul 25 22:06:33 2011.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing dir.
  7 | #
  8 | # Note that not all possible configuration values are present in this
  9 | # autogenerated file.
 10 | #
 11 | # All configuration values have a default; values that are commented out
 12 | # serve to show the default.
 13 | 
 14 | import sys, os
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | sys.path.insert(0, os.path.abspath('..'))
 20 | 
 21 | # -- General configuration -----------------------------------------------------
 22 | 
 23 | # If your documentation needs a minimal Sphinx version, state it here.
 24 | #needs_sphinx = '1.0'
 25 | 
 26 | # Add any Sphinx extension module names here, as strings. They can be extensions
 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinxtogithub']
 29 | 
 30 | # Add any paths that contain templates here, relative to this directory.
 31 | templates_path = ['templates']
 32 | 
 33 | # The suffix of source filenames.
 34 | source_suffix = '.rst'
 35 | 
 36 | # The encoding of source files.
 37 | #source_encoding = 'utf-8-sig'
 38 | 
 39 | # The master toctree document.
 40 | master_doc = 'index'
 41 | 
 42 | # General information about the project.
 43 | project = u'pysimsearch'
 44 | copyright = u'2011, Taher Haveliwala'
 45 | 
 46 | # The version info for the project you're documenting, acts as replacement for
 47 | # |version| and |release|, also used in various other places throughout the
 48 | # built documents.
 49 | #
 50 | # The short X.Y version.
 51 | version = '0.32'
 52 | # The full version, including alpha/beta/rc tags.
 53 | release = '0.32'
 54 | 
 55 | # The language for content autogenerated by Sphinx. Refer to documentation
 56 | # for a list of supported languages.
 57 | #language = None
 58 | 
 59 | # There are two options for replacing |today|: either, you set today to some
 60 | # non-false value, then it is used:
 61 | #today = ''
 62 | # Else, today_fmt is used as the format for a strftime call.
 63 | #today_fmt = '%B %d, %Y'
 64 | 
 65 | # List of patterns, relative to source directory, that match files and
 66 | # directories to ignore when looking for source files.
 67 | exclude_patterns = ['build']
 68 | 
 69 | # The reST default role (used for this markup: `text`) to use for all documents.
 70 | #default_role = None
 71 | 
 72 | # If true, '()' will be appended to :func: etc. cross-reference text.
 73 | #add_function_parentheses = True
 74 | 
 75 | # If true, the current module name will be prepended to all description
 76 | # unit titles (such as .. function::).
 77 | #add_module_names = True
 78 | 
 79 | # If true, sectionauthor and moduleauthor directives will be shown in the
 80 | # output. They are ignored by default.
 81 | #show_authors = False
 82 | 
 83 | # The name of the Pygments (syntax highlighting) style to use.
 84 | pygments_style = 'sphinx'
 85 | 
 86 | # A list of ignored prefixes for module index sorting.
 87 | #modindex_common_prefix = []
 88 | 
 89 | 
 90 | # -- Options for HTML output ---------------------------------------------------
 91 | 
 92 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 93 | # a list of builtin themes.
 94 | html_theme = 'default'
 95 | 
 96 | html_style = 'custom.css'
 97 | 
 98 | # Theme options are theme-specific and customize the look and feel of a theme
 99 | # further.  For a list of options available for each theme, see the
100 | # documentation.
101 | #html_theme_options = {}
102 | 
103 | # Add any paths that contain custom themes here, relative to this directory.
104 | #html_theme_path = []
105 | 
106 | # The name for this set of Sphinx documents.  If None, it defaults to
107 | # "<project> v<release> documentation".
108 | #html_title = None
109 | 
110 | # A shorter title for the navigation bar.  Default is the same as html_title.
111 | #html_short_title = None
112 | 
113 | # The name of an image file (relative to this directory) to place at the top
114 | # of the sidebar.
115 | #html_logo = None
116 | 
117 | # The name of an image file (within the static path) to use as favicon of the
118 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
119 | # pixels large.
120 | #html_favicon = None
121 | 
122 | # Add any paths that contain custom static files (such as style sheets) here,
123 | # relative to this directory. They are copied after the builtin static files,
124 | # so a file named "default.css" will overwrite the builtin "default.css".
125 | html_static_path = ['static']
126 | 
127 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
128 | # using the given strftime format.
129 | #html_last_updated_fmt = '%b %d, %Y'
130 | 
131 | # If true, SmartyPants will be used to convert quotes and dashes to
132 | # typographically correct entities.
133 | #html_use_smartypants = True
134 | 
135 | # Custom sidebar templates, maps document names to template names.
136 | #html_sidebars = {}
137 | 
138 | # Additional templates that should be rendered to pages, maps page names to
139 | # template names.
140 | #html_additional_pages = {}
141 | 
142 | # If false, no module index is generated.
143 | #html_domain_indices = True
144 | 
145 | # If false, no index is generated.
146 | #html_use_index = True
147 | 
148 | # If true, the index is split into individual pages for each letter.
149 | #html_split_index = False
150 | 
151 | # If true, links to the reST sources are added to the pages.
152 | html_show_sourcelink = True
153 | 
154 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
155 | #html_show_sphinx = True
156 | 
157 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
158 | #html_show_copyright = True
159 | 
160 | # If true, an OpenSearch description file will be output, and all pages will
161 | # contain a <link> tag referring to it.  The value of this option must be the
162 | # base URL from which the finished HTML is served.
163 | #html_use_opensearch = ''
164 | 
165 | # This is the file name suffix for HTML files (e.g. ".xhtml").
166 | #html_file_suffix = None
167 | 
168 | # Output file base name for HTML help builder.
169 | htmlhelp_basename = 'pysimsearchdoc'
170 | 
171 | 
172 | # -- Options for LaTeX output --------------------------------------------------
173 | 
174 | # The paper size ('letter' or 'a4').
175 | #latex_paper_size = 'letter'
176 | 
177 | # The font size ('10pt', '11pt' or '12pt').
178 | #latex_font_size = '10pt'
179 | 
180 | # Grouping the document tree into LaTeX files. List of tuples
181 | # (source start file, target name, title, author, documentclass [howto/manual]).
182 | latex_documents = [
183 |   ('index', 'pysimsearch.tex', u'pysimsearch Documentation',
184 |    u'Taher Haveliwala', 'manual'),
185 | ]
186 | 
187 | # The name of an image file (relative to this directory) to place at the top of
188 | # the title page.
189 | #latex_logo = None
190 | 
191 | # For "manual" documents, if this is true, then toplevel headings are parts,
192 | # not chapters.
193 | #latex_use_parts = False
194 | 
195 | # If true, show page references after internal links.
196 | #latex_show_pagerefs = False
197 | 
198 | # If true, show URL addresses after external links.
199 | #latex_show_urls = False
200 | 
201 | # Additional stuff for the LaTeX preamble.
202 | #latex_preamble = ''
203 | 
204 | # Documents to append as an appendix to all manuals.
205 | #latex_appendices = []
206 | 
207 | # If false, no module index is generated.
208 | #latex_domain_indices = True
209 | 
210 | 
211 | # -- Options for manual page output --------------------------------------------
212 | 
213 | # One entry per manual page. List of tuples
214 | # (source start file, name, description, authors, manual section).
215 | man_pages = [
216 |     ('index', 'pysimsearch', u'pysimsearch Documentation',
217 |      [u'Taher Haveliwala'], 1)
218 | ]
219 | 


--------------------------------------------------------------------------------
/doc/doc_reader.rst:
--------------------------------------------------------------------------------
1 | The :mod:`doc_reader` Module
2 | ----------------------------
3 | 
4 | .. automodule:: pysimsearch.doc_reader
5 |    :members:
6 |    :undoc-members:
7 | 


--------------------------------------------------------------------------------
/doc/freq_tools.rst:
--------------------------------------------------------------------------------
1 | The :mod:`freq_tools` Module
2 | ----------------------------
3 | 
4 | .. automodule:: pysimsearch.freq_tools
5 |    :members:
6 |    :undoc-members:
7 | 


--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
 1 | .. PySimSearch documentation master file, created by
 2 |    sphinx-quickstart on Mon Jul 25 22:06:33 2011.
 3 | 
 4 | pysimsearch |version| documentation
 5 | =======================================
 6 | 
 7 | Python library for indexing and similarity-search.
 8 | 
 9 | Download from `GitHub <http://github.com/taherh/pysimsearch>`_
10 | 
11 | This library is primarily meant to illustrate the basic workings of similarity
12 | and indexing engines, without focusing heavily on optimization.  Certain
13 | patterns used for scaling indexes (e.g., distributed indexes) are included.
14 | 
15 | Although the code is currently for Python 2.7 series, we use ``__future__``
16 | imports to match Python 3 as closely as possible.
17 | 
18 | If you are interested in learning more about search and information retrieval,
19 | I highly recommend the following two books:
20 | 
21 | * `Managing Gigabytes`_, by Witten, Moffat, and Bell
22 | .. _Managing Gigabytes: http://amzn.to/qg6Zhe
23 | * `Introduction to Information Retrieval`_, by Manning, Schütze, and Raghavan
24 | .. _Introduction to Information Retrieval: http://amzn.to/oz2O27
25 | 
26 | 
27 | Quickstart:
28 | -----------
29 | 
30 | *Quick sample:*
31 | 
32 | >>> from pprint import pprint
33 | >>> from pysimsearch import sim_index, doc_reader
34 | >>> index = sim_index.MemorySimIndex()
35 | >>> index.index_urls('http://www.stanford.edu/',
36 | 		     'http://www.berkeley.edu/',
37 | 		     'http://www.ucla.edu',
38 | 		     'http://www.mit.edu')
39 | >>> pprint(index.postings_list('university'))
40 | [(0, 3), (1, 1), (2, 1)]
41 | >>> pprint(list(index.docnames_with_terms('university', 'california')))
42 | ['http://www.stanford.edu/', 'http://www.ucla.edu']
43 | >>> index.set_query_scorer('tfidf')
44 | >>> pprint(list(index.query("stanford university")))
45 | [('http://www.stanford.edu/', 0.5827172819606118),
46 |  ('http://www.ucla.edu', 0.05801461340864149),
47 |  ('http://www.berkeley.edu/', 0.025725104682131295)]
48 | 
49 | View a larger :doc:`sample`
50 |     
51 | API:
52 | ----
53 | 
54 | .. toctree::
55 |    :maxdepth: 2
56 | 
57 |    sim_index
58 |    similarity
59 |    doc_reader
60 |    freq_tools
61 |    sim_server
62 |    query_scorer
63 |    term_vec
64 | 
65 | .. automodule:: pysimsearch
66 |    :members:
67 | 
68 | 
69 | Indices and tables
70 | ==================
71 | 
72 | * :ref:`genindex`
73 | * :ref:`modindex`
74 | * :ref:`search`
75 | 
76 | 


--------------------------------------------------------------------------------
/doc/query_scorer.rst:
--------------------------------------------------------------------------------
1 | The :mod:`query_scorer` Module
2 | ------------------------------
3 | 
4 | .. automodule:: pysimsearch.query_scorer
5 |    :members:
6 |    :undoc-members:
7 | 


--------------------------------------------------------------------------------
/doc/sample.py:
--------------------------------------------------------------------------------
1 | ../sample.py


--------------------------------------------------------------------------------
/doc/sample.rst:
--------------------------------------------------------------------------------
1 | Example
2 | -------
3 | 
4 | .. literalinclude:: sample.py
5 | 


--------------------------------------------------------------------------------
/doc/sim_index.rst:
--------------------------------------------------------------------------------
 1 | The :mod:`sim_index` Module
 2 | ---------------------------
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    sim_index/sim_index
 8 |    sim_index/map_sim_index
 9 |    sim_index/memory_sim_index
10 |    sim_index/shelf_sim_index
11 |    sim_index/concurrent_sim_index
12 |    sim_index/remote_sim_index
13 |    sim_index/sim_index_collection
14 | 


--------------------------------------------------------------------------------
/doc/sim_index/concurrent_sim_index.rst:
--------------------------------------------------------------------------------
1 | The :class:`ConcurrentSimIndex` Class
2 | -------------------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.concurrent_sim_index
5 | 
6 | .. autoclass:: pysimsearch.sim_index.ConcurrentSimIndex
7 |    :members:
8 |    :inherited-members:
9 | 


--------------------------------------------------------------------------------
/doc/sim_index/map_sim_index.rst:
--------------------------------------------------------------------------------
1 | The :class:`MapSimIndex` Class
2 | --------------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.map_sim_index
5 | 
6 | .. autoclass:: pysimsearch.sim_index.MapSimIndex
7 |    :members:
8 |    :inherited-members:
9 | 


--------------------------------------------------------------------------------
/doc/sim_index/memory_sim_index.rst:
--------------------------------------------------------------------------------
1 | The :class:`MemorySimIndex` Class
2 | ---------------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.memory_sim_index
5 | 
6 | .. autoclass:: pysimsearch.sim_index.MemorySimIndex
7 |    :members:
8 |    :inherited-members:
9 | 


--------------------------------------------------------------------------------
/doc/sim_index/remote_sim_index.rst:
--------------------------------------------------------------------------------
1 | The :class:`RemoteSimIndex` Class
2 | -------------------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.remote_sim_index
5 | 
6 | .. autoclass:: pysimsearch.sim_index.RemoteSimIndex
7 |    :members:
8 |    :inherited-members:
9 | 


--------------------------------------------------------------------------------
/doc/sim_index/shelf_sim_index.rst:
--------------------------------------------------------------------------------
1 | The :class:`ShelfSimIndex` Class
2 | --------------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.shelf_sim_index
5 | 
6 | .. autoclass:: pysimsearch.sim_index.ShelfSimIndex
7 |    :members:
8 |    :inherited-members:
9 | 


--------------------------------------------------------------------------------
/doc/sim_index/sim_index.rst:
--------------------------------------------------------------------------------
1 | The :class:`SimIndex` Class
2 | ---------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.sim_index
5 | 
6 | .. autoclass:: pysimsearch.sim_index.SimIndex
7 |    :members:
8 | 


--------------------------------------------------------------------------------
/doc/sim_index/sim_index_collection.rst:
--------------------------------------------------------------------------------
1 | The :class:`SimIndexCollection` Class
2 | -------------------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_index.sim_index_collection
5 | 
6 | .. autoclass:: pysimsearch.sim_index.SimIndexCollection
7 |    :members:
8 |    :inherited-members:
9 | 


--------------------------------------------------------------------------------
/doc/sim_server.rst:
--------------------------------------------------------------------------------
1 | The :mod:`sim_server` Module
2 | ----------------------------
3 | 
4 | .. automodule:: pysimsearch.sim_server
5 |    :members:
6 |    :undoc-members:
7 | 


--------------------------------------------------------------------------------
/doc/similarity.rst:
--------------------------------------------------------------------------------
1 | The :mod:`similarity` Module
2 | ----------------------------
3 | 
4 | .. automodule:: pysimsearch.similarity
5 |    :members:
6 |    :undoc-members:
7 | 


--------------------------------------------------------------------------------
/doc/static/custom.css:
--------------------------------------------------------------------------------
 1 | @import url("default.css");
 2 | 
 3 | dl.class {
 4 |     background-color: #F2F2F2;
 5 |     border-radius: 5px;
 6 |     padding: 20px;
 7 | }
 8 | 
 9 | dl.class>dt {
10 |     font-weight: bold;
11 |     margin-bottom: 10px;
12 | }


--------------------------------------------------------------------------------
/doc/term_vec.rst:
--------------------------------------------------------------------------------
1 | The :mod:`term_vec` Module
2 | ----------------------------
3 | 
4 | .. automodule:: pysimsearch.term_vec
5 |    :members:
6 |    :undoc-members:
7 | 


--------------------------------------------------------------------------------
/pysimsearch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taherh/pysimsearch/57796f7175565a8481fe80a56c7815bf0264d4fb/pysimsearch/__init__.py


--------------------------------------------------------------------------------
/pysimsearch/doc_reader.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #     * Redistributions of source code must retain the above copyright
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above copyright
 11 | #       notice, this list of conditions and the following disclaimer in the
 12 | #       documentation and/or other materials provided with the distribution.
 13 | #     * The names of project contributors may not be used to endorse or
 14 | #       promote products derived from this software without specific
 15 | #       prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | '''
 30 | Utilities for creating term vectors from data
 31 | '''
 32 | 
 33 | from __future__ import(division, absolute_import, print_function,
 34 |                        unicode_literals)
 35 | 
 36 | import codecs
 37 | from concurrent import futures
 38 | import io
 39 | from itertools import chain
 40 | import re
 41 | import urllib
 42 | 
 43 | import lxml.html
 44 | from lxml.html.clean import clean_html
 45 | 
 46 | def get_text_file(filename):
 47 |     '''Returns file for filename
 48 |     
 49 |     TODO: detect html and parse
 50 |     '''
 51 |     return codecs.open(filename, encoding='utf-8')
 52 | 
 53 | def get_url(url):
 54 |     http_pattern = '^http://'
 55 |     if re.search(http_pattern, url):
 56 |         urlfh = urllib.urlopen(url)
 57 |         content = urlfh.read()
 58 |         html_tree = lxml.html.fromstring(content)
 59 |         clean_html(html_tree)  # removes crud from html
 60 |         clean_html_string = lxml.html.tostring(html_tree, 
 61 |                                                encoding=unicode,
 62 |                                                method='text')
 63 |         return io.StringIO(clean_html_string)
 64 |     else:
 65 |         raise Exception("Bad url: {}".format(url))
 66 | 
 67 | def get_text_files(filenames=None):
 68 |     '''
 69 |     Returns an iterator of (name, file) tuples for filenames
 70 |     
 71 |     Params:
 72 |         filenames: list of filenames
 73 |     '''
 74 |     if filenames is not None:
 75 |         return ((name, get_text_file(name)) for name in filenames)
 76 | 
 77 | _executor = None
 78 | 
 79 | def get_urls(urls=None):
 80 |     '''
 81 |     Returns an iterator of (name, file) tuples for urls
 82 |     
 83 |     Params:
 84 |         urls: list of urls to fetch
 85 |     '''
 86 |     # The below effectively implements
 87 |     #
 88 |     #    return ((url, get_url(url)) for url in urls)
 89 |     #
 90 |     # but uses futures to allow parallel fetching/processing of urls
 91 |     
 92 |     # Initialize the executor if necessary
 93 |     global _executor
 94 |     if _executor is None:
 95 |         _executor = futures.ThreadPoolExecutor(max_workers=10)
 96 | 
 97 |     if urls is not None:
 98 |         # submit the get_url() requests
 99 |         future_to_url = {
100 |             _executor.submit(get_url, url): url
101 |             for url in urls
102 |         }
103 |         
104 |         # generator that lazily iterates over futures and yields
105 |         # (url, file) tuples
106 |         def _gen_result():
107 |             named_files = []
108 |             for future in futures.as_completed(future_to_url, timeout=60):
109 |                 url = future_to_url[future]
110 |                 if future.exception() is not None:
111 |                     raise Exception("failed to fetch {}: e=".format(url, future.exception()))
112 |                 else:
113 |                     yield (url, future.result())
114 | 
115 |         # return iterator
116 |         return _gen_result()
117 | 


--------------------------------------------------------------------------------
/pysimsearch/exceptions.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
 4 | # All rights reserved.
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #     * Redistributions of source code must retain the above copyright
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above copyright
11 | #       notice, this list of conditions and the following disclaimer in the
12 | #       documentation and/or other materials provided with the distribution.
13 | #     * The names of project contributors may not be used to endorse or
14 | #       promote products derived from this software without specific
15 | #       prior written permission.
16 | #
17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | '''
30 | Exception classes
31 | '''
32 | 
33 | class Error(Exception):
34 |     '''Base class for Exception types used in this module'''
35 |     pass
36 | 
37 | class FileFormatException(Error):
38 |     '''Exception for invalid input file'''
39 |     pass
40 | 


--------------------------------------------------------------------------------
/pysimsearch/freq_tools.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #     * Redistributions of source code must retain the above copyright
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above copyright
 11 | #       notice, this list of conditions and the following disclaimer in the
 12 | #       documentation and/or other materials provided with the distribution.
 13 | #     * The names of project contributors may not be used to endorse or
 14 | #       promote products derived from this software without specific
 15 | #       prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | '''
 30 | Sample usage as a script::
 31 | 
 32 |     $ python freq_tools --list doc_list -o output.df
 33 |     Processing...
 34 | '''
 35 | 
 36 | from __future__ import (division, absolute_import, print_function,
 37 |     unicode_literals)
 38 | 
 39 | # boilerplate to allow running as script
 40 | if __name__ == "__main__" and __package__ is None:
 41 |     import sys, os
 42 |     parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 43 |     sys.path.insert(0, parent_dir)
 44 |     import pysimsearch
 45 |     __package__ = str("pysimsearch")
 46 |     del sys, os
 47 | 
 48 | # external modules
 49 | import argparse
 50 | import sys
 51 | 
 52 | # our modules
 53 | from .exceptions import *
 54 | from . import doc_reader
 55 | 
 56 | def read_df(df_file):
 57 |     '''
 58 |     Reads a document frequency file for use in applying df term weighting
 59 |     Returns a dictionary of the form {term: doc_freq}
 60 |     '''
 61 |     df_dict = {}
 62 |     for line in df_file:
 63 |         ln_list = line.split()
 64 |         if len(ln_list) == 0:
 65 |             continue  # skip blank lines without warning
 66 |         if len(ln_list) != 2:   # raise exception if there were not exactly
 67 |                                 # two entries in the line
 68 |             raise FileFormatException(
 69 |                 'Bad line in doc freq file ({0} entries, expecting 2): {1}'.
 70 |                 format(len(ln_list), line))
 71 |         (term, df) = ln_list
 72 |         df_dict[term] = int(df)
 73 |     return df_dict
 74 | 
 75 | def write_df(df_dict, df_file):
 76 |     '''
 77 |     Writes the document frequency data structure to file
 78 |     df_dict is a dictionary of the form {term: doc_freq}
 79 |     
 80 |     TODO: sort order?
 81 |     '''
 82 |     for (term, df) in df_dict.items():
 83 |         df_file.write(u'{0}\t{1}\n'.format(term, df))
 84 |     
 85 | def compute_df(files):
 86 |     '''
 87 |     Computes document frequency counts by processing a collection of files
 88 |     Returns a dictionary of the form {term: doc_freq}
 89 |     '''
 90 |     df_dict = {}
 91 |     for file in files:
 92 |         term_seen = set()
 93 |         for line in file:
 94 |             for term in line.split():
 95 |                 if term not in term_seen:
 96 |                     if term not in df_dict:
 97 |                         df_dict[term] = 0
 98 |                     df_dict[term] += 1
 99 |                     term_seen.add(term)
100 |                     
101 |     return df_dict
102 |     
103 | # --- main() ---
104 | 
105 | def main():
106 |     '''Commandline interface for generating document frequency indexes'''
107 |     parser = argparse.ArgumentParser(
108 |         description='Compute document frequencies of terms in of input '
109 |                     'documents')
110 |     parser.add_argument('doc', nargs='*', help='a document filename')
111 |     parser.add_argument('-l', '--list', nargs='?',
112 |                         help='file containing list of input documents')
113 |     parser.add_argument('-o', '--output', nargs='?',
114 |                         help='output file (default: stdout)')
115 | 
116 |     args = parser.parse_args()
117 | 
118 |     output_file = sys.stdout
119 |     if args.output != None:
120 |         output_file = open(args.output, "w")
121 |         
122 |     doc_list = []
123 |     if args.list != None:
124 |         try:
125 |             with open(args.list) as input_docnames_file:
126 |                 doc_list = [line.strip() for line in
127 |                             input_docnames_file.readlines()]
128 |         except IOError:
129 |             print("Sorry, could not open " + args.list)
130 | 
131 |     doc_list.extend(args.doc)
132 | 
133 |     print("Processing {}".format(str(doc_list)))
134 |     
135 |     if len(doc_list) == 0:
136 |         raise Error("Sorry, you must specify at least one document.")  
137 | 
138 |     df_dict = compute_df(doc_reader.get_text_files(*doc_list))
139 |     for key in df_dict:
140 |         print('{}\t{:>20}'.format(key, df_dict[key]), file=output_file)
141 | 
142 |     if output_file != sys.stdout:
143 |         output_file.close()
144 | 
145 | if __name__ == '__main__':
146 |     main()
147 | 
148 | 


--------------------------------------------------------------------------------
/pysimsearch/query_scorer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #     * Redistributions of source code must retain the above copyright
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above copyright
 11 | #       notice, this list of conditions and the following disclaimer in the
 12 | #       documentation and/or other materials provided with the distribution.
 13 | #     * The names of project contributors may not be used to endorse or
 14 | #       promote products derived from this software without specific
 15 | #       prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | '''
 30 | Scoring algorithms for finding similar documents
 31 | '''
 32 | 
 33 | from __future__ import (division, absolute_import, print_function,
 34 |         unicode_literals)
 35 | 
 36 | import abc
 37 | from collections import defaultdict
 38 | import operator
 39 | from math import log
 40 | 
 41 | class QueryScorer(object):
 42 |     '''
 43 |     Interface for query scorers which score similarity search results
 44 |     
 45 |     QueryScorers are used by the SimIndex.query() method to handle the
 46 |     scoring of similarity search results.    
 47 |     '''
 48 |     
 49 |     __metaclass__ = abc.ABCMeta
 50 |     
 51 |     # name->scorer mapping
 52 |     _scorers = { }
 53 |     
 54 |     @staticmethod
 55 |     def make_scorer(scorer_type):
 56 |         '''Returns a new scorer object'''
 57 |         return QueryScorer._scorers[scorer_type]()
 58 |     
 59 |     @staticmethod
 60 |     def register_scorers(scorer_map):
 61 |         QueryScorer._scorers.update(scorer_map)
 62 |         
 63 |     @abc.abstractmethod
 64 |     def score_docs(self, query_vec, postings_lists, **extra):
 65 |         '''Scores documents' similarities to query
 66 |         
 67 |         Scans postings_lists to compute similarity scores for docs for the
 68 |         query term vector
 69 |         
 70 |         Params:
 71 |             query: the query document
 72 |             postings_lists: a list of postings lists for terms in query
 73 |         
 74 |         Returns:
 75 |             A sorted iterable of (docid, score) tuples
 76 |         '''
 77 |         return
 78 | 
 79 | 
 80 | class SimpleCountQueryScorer(QueryScorer):
 81 |     '''
 82 |     QueryScorer that uses simple term frequencies for scoring.
 83 |     '''
 84 | 
 85 |     def score_docs(self, query_vec, postings_lists, **extra):
 86 |         '''
 87 |         Scores query-document similarity using number of occurrences
 88 |         of query terms in document.  Multiple occurrences of a term
 89 |         in the query are ignored.
 90 |         '''
 91 |         
 92 |         doc_hit_map = defaultdict(int)
 93 |         for (term, postings_list) in postings_lists:
 94 |             assert(query_vec[term] >= 1)
 95 |             for (docid, freq) in postings_list:
 96 |                 doc_hit_map[docid] += freq
 97 |         
 98 |         # construct list of tuples sorted by value
 99 |         return sorted(doc_hit_map.iteritems(),
100 |                       key=operator.itemgetter(1),
101 |                       reverse=True)
102 | 
103 | class TFIDFQueryScorer(QueryScorer):
104 |     '''
105 |     QueryScorer that uses TFIDF weighting with the cosine similarity measure.
106 |     
107 |     This implementation is actually an approximation to the true
108 |     cosine, because of the way we normalize by document length.
109 |     When computing document length, we assume a term weight of 1 for
110 |     each document term. E.g., we do not factor in term weights
111 |     when computing the "document length", since that would require
112 |     choosing the weighting strategy at index time.
113 |     
114 |     Query length is ignored, as it has no effect on relative ordering
115 |     '''
116 |     
117 |     @staticmethod
118 |     def tf_weight_raw(tf):
119 |         '''Returns unscaled tf'''
120 |         return tf
121 |     
122 |     @staticmethod
123 |     def tf_weight_log(tf):
124 |         '''Returns sublinear scaling of tf: 1+log(tf)'''
125 |         assert(tf > 0)
126 |         return 1 + log(tf)
127 |     
128 |     @staticmethod
129 |     def idf_weight_log(N, df):
130 |         '''Returns idf weight'''
131 |         assert(df > 0)
132 |         return log(N/df)
133 |     
134 |     def __init__(self, tf_weight_type = 'raw'):
135 |         if tf_weight_type == 'log':
136 |             self.tf_weight = self.tf_weight_log
137 |         else:
138 |             self.tf_weight = self.tf_weight_raw
139 |         
140 |         self.idf_weight = self.idf_weight_log
141 |         
142 |     def score_docs(self, query_vec, postings_lists, N, get_doc_freq, get_doc_len, **extra):
143 |         '''
144 |         Scores documents' similarities to query using cosine similarity
145 |         in a vector space model.  Uses tf.idf weighting.
146 |         
147 |         An individual term hit is scored as::
148 |         
149 |             idf * self.tf_weight(q_tf) * self.tf_weight(d_tf)
150 |         
151 |         The overall score for a doc is given by the sum of the term-hit scores
152 |         '''
153 |         
154 |         if N == 0: return ()
155 |         doc_hit_map = defaultdict(int)
156 |         for (term, postings_list) in postings_lists:
157 |             idf = self.idf_weight(N, get_doc_freq(term))
158 |             query_term_wt = self.tf_weight(query_vec[term]) * idf
159 |             for (docid, freq) in postings_list:
160 |                 doc_hit_map[docid] += self.tf_weight(freq) * query_term_wt
161 |         for (docid, weight) in doc_hit_map.iteritems():
162 |             doc_len = get_doc_len(docid)
163 |             doc_hit_map[docid] = weight / doc_len
164 |             
165 |         # construct list of tuples sorted by value
166 |         return sorted(doc_hit_map.iteritems(),
167 |                       key=operator.itemgetter(1),
168 |                       reverse=True)
169 | 
170 | # Register scorers by name
171 | QueryScorer.register_scorers({
172 |     'simple_count': SimpleCountQueryScorer,
173 |     'tfidf': TFIDFQueryScorer
174 | })
175 | 
176 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/__init__.py:
--------------------------------------------------------------------------------
1 | from .sim_index import SimIndex
2 | from .map_sim_index import MapSimIndex
3 | from .memory_sim_index import MemorySimIndex
4 | from .shelf_sim_index import ShelfSimIndex
5 | from .remote_sim_index import RemoteSimIndex
6 | from .sim_index_collection import SimIndexCollection
7 | from .concurrent_sim_index import ConcurrentSimIndex
8 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/concurrent_sim_index.py:
--------------------------------------------------------------------------------
  1 | ﻿#!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | ConcurrentSimIndex
 32 | 
 33 | Wrapper to allow concurrent SimIndex access
 34 | 
 35 | Sample usage::
 36 | 
 37 |     from pysimsearch.sim_index import MemorySimIndex, ConcurrentSimIndex
 38 | 
 39 |     index = ConcurrentSimIndex(MemorySimIndex())
 40 |     index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu')
 41 |     print(list(index.query('stanford')))
 42 | 
 43 | '''
 44 | 
 45 | from __future__ import (division, absolute_import, print_function,
 46 |                         unicode_literals)
 47 | 
 48 | from concurrent import futures
 49 | import threading
 50 | 
 51 | from . import SimIndex
 52 | 
 53 | class ConcurrentSimIndex(object):
 54 |     '''Proxy to a :class:`pysimsearch.sim_index.SimIndex` that allows
 55 |     concurrent access.
 56 |     
 57 |     ``ConcurrentSimIndex`` is compatible with the :class:`SimIndex` interface.
 58 |     We use ``concurrent.futures`` to allow some basic concurrency for indexing
 59 |     and querying.  In particular, calls to ``index_urls()`` are executed in a
 60 |     nonblocking manner.
 61 |     '''
 62 | 
 63 |     READ_METHODS = {'name_to_docid',
 64 |                     'docid_to_name',
 65 |                     'postings_list',
 66 |                     'docids_with_terms',
 67 |                     'docnames_with_terms',
 68 |                     'query',
 69 |                     'get_local_N',
 70 |                     'get_local_df_map',
 71 |                     'get_name_to_docid_map',
 72 |                     'config'}
 73 |     
 74 |     WRITE_METHODS = {'set_query_scorer',
 75 |                      'set_global_N',
 76 |                      'set_global_df_map',
 77 |                      'load_stoplist',
 78 |                      'set_config',
 79 |                      'update_config',
 80 |                      'index_string_buffers',
 81 |                      'index_files',
 82 |                      'del_docids',
 83 |                      }
 84 |     
 85 |     # Note:  assume that index_urls() is implemented by calling index_files()
 86 |     #        so that the write-lock will be acquired at the time index_files()
 87 |     #        is called.  We don't want to acquire a lock on index_urls()
 88 |     #        directly, as we'd like allow at least the url fetches to occur
 89 |     #        concurrently.
 90 |     #
 91 |     # TODO: re-implement index_urls() here to ensure the assumption is true?
 92 |     NONBLOCKING_METHODS = { 'index_urls' }
 93 | 
 94 |     
 95 |     def __init__(self, sim_index):
 96 |         '''Initialize with ``sim_index``
 97 |         
 98 |         Params:
 99 |             sim_index: A :class:`SimIndex` instance.
100 |         '''
101 |         self._sim_index = sim_index
102 |         self._executor = futures.ThreadPoolExecutor(max_workers=10)
103 |         self._lock = threading.RLock()  # TODO: use a Read-Write Lock
104 |         self._futures = set()
105 |     
106 |     def acquire_read_lock(self):
107 |         '''Acquire read lock'''
108 |         self._lock.acquire()
109 |     
110 |     def release_read_lock(self):
111 |         '''Release read lock'''
112 |         self._lock.release()
113 |     
114 |     def acquire_write_lock(self):
115 |         '''Acquire write lock'''
116 |         self._lock.acquire()
117 |         
118 |     def release_write_lock(self):
119 |         '''Release write lock'''
120 |         self._lock.release()
121 |         
122 |     def _read_decorator(self, func):
123 |         '''Wrap func with read_lock protection'''
124 |         def wrapper(*args, **kwargs):
125 |             self.acquire_read_lock()
126 |             try:
127 |                 return func(*args, **kwargs)
128 |             finally:
129 |                 self.release_read_lock()
130 |         return wrapper
131 | 
132 |     def _write_decorator(self, func):
133 |         '''Wrap func with write_lock protection'''
134 |         def wrapper(*args, **kwargs):
135 |             self.acquire_write_lock()
136 |             try:
137 |                 return func(*args, **kwargs)
138 |             finally:
139 |                 self.release_write_lock()
140 |         return wrapper
141 |     
142 |     def _nonblocking_decorator(self, func):
143 |         '''
144 |         Wrap func with non-blocking futures call.
145 |         Return value of ``func`` is ignored.
146 |         '''
147 |         def wrapper(*args, **kwargs):
148 |             future = self._executor.submit(func, *args, **kwargs)
149 |             self._futures.add(future)
150 |         return wrapper
151 | 
152 |     def _futures_wait(self):
153 |         if len(self._futures) > 0:
154 |             r = futures.wait(self._futures)
155 |             for future in r.done:
156 |                 if future.exception() is not None:
157 |                     raise future.exception()
158 |         self._futures = set()
159 | 
160 |     def __getattr__(self, name):
161 |         func = getattr(self._sim_index, name)
162 |         
163 |         if name in self.READ_METHODS:
164 |             # wait for any outstanding non-blocking calls to complete
165 |             self._futures_wait()
166 |             return self._read_decorator(func)
167 |         elif name in self.WRITE_METHODS:
168 |             # wait for any outstanding  non-blocking calls to complete
169 |             return self._write_decorator(func)
170 |         elif name in self.NONBLOCKING_METHODS:
171 |             return self._nonblocking_decorator(func)
172 |         else:
173 |             raise Exception("Unsupported method: {}".format(name))
174 | 
175 | # ConcurrentSimIndex is a subtype of SimIndex    
176 | SimIndex.register(ConcurrentSimIndex)
177 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/map_sim_index.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | MapSimIndex
 32 | 
 33 | See :mod:`pysimsearch.sim_index.memory_sim_index` for sample usage
 34 | 
 35 | '''
 36 | 
 37 | from __future__ import (division, absolute_import, print_function,
 38 |                         unicode_literals)
 39 | 
 40 | from collections import defaultdict
 41 | import sys
 42 | 
 43 | from . import SimIndex
 44 | from .. import term_vec
 45 | from ..exceptions import *
 46 | 
 47 | class MapSimIndex(SimIndex):
 48 |     '''
 49 |     Inherits from :class:`pysimsearch.sim_index.SimIndex`.
 50 |     
 51 |     Simple implementation of the :class:`SimIndex` interface backed with dict-like
 52 |     objects (MutableMapping).  By default, uses `dict`, in which case the
 53 |     indexes are in-memory.
 54 |     
 55 |     NOTE: to ensure proper compatibility with arbitrary dict-like objects,
 56 |     including persistent shelves, any mutations must be done using assignment.
 57 |     E.g., do not do::
 58 |     
 59 |         map[key].extend([a, b])
 60 |         
 61 |     Instead, do the equivalent of::
 62 |     
 63 |         map[key] += [a,b]  # same as: map[key] = map[key].__iadd__([a,b])
 64 |     '''
 65 | 
 66 |     
 67 |     def __init__(self,
 68 |                  name_to_docid_map=None,
 69 |                  docid_to_name_map=None,
 70 |                  docid_to_feature_map=None,
 71 |                  term_index=None,
 72 |                  doc_vectors=None,
 73 |                  df_map=None,
 74 |                  doc_len_map=None):
 75 | 
 76 |         super(MapSimIndex, self).__init__()
 77 | 
 78 |         # index metadata
 79 |         self._name_to_docid_map = name_to_docid_map
 80 |         self._docid_to_name_map = docid_to_name_map
 81 |         self._docid_to_feature_map = docid_to_feature_map
 82 | 
 83 |         # term index
 84 |         self._term_index = term_index
 85 |         
 86 |         # document vectors (useful for deletions and certain scoring algorithms)
 87 |         self._doc_vectors = doc_vectors
 88 |         
 89 |         # additional stats used for scoring
 90 |         self._df_map = df_map
 91 |         self._doc_len_map = doc_len_map
 92 |         
 93 |         # global stats, which if present, are used instead
 94 |         # of the local stats
 95 |         self._global_df_map = None
 96 |         
 97 |         # set a default scorer
 98 |         self.set_query_scorer('tfidf')
 99 | 
100 |     def set_global_df_map(self, df_map):
101 |         self._global_df_map = df_map
102 |         
103 |     def get_local_df_map(self):
104 |         return self._df_map
105 |     
106 |     def get_name_to_docid_map(self):
107 |         return self._name_to_docid_map
108 |     
109 |     def get_doc_freq(self, term):
110 |         df_map = self._global_df_map or self._df_map
111 |         return df_map.get(term, 1)
112 |         
113 |     def get_doc_len(self, docid):
114 |         return self._doc_len_map.get(docid, 0)
115 |         
116 |     def index_files(self, named_files):
117 |         '''
118 |         Build a similarity index over collection given in named_files
119 |         named_files is a list iterable of (filename, file) pairs
120 |         '''
121 |         for (name, file) in named_files:
122 |             with file:
123 |                 t_vec = term_vec.term_vec(
124 |                     file,
125 |                     stoplist=self.config('stoplist'),
126 |                     lowercase=self.config('lowercase'),
127 |                 )
128 |             docid = self._next_docid
129 |             self._name_to_docid_map[name] = docid
130 |             self._docid_to_name_map[docid] = name
131 |             for term in t_vec:
132 |                 if term not in self._df_map: self._df_map[term] = 0
133 |                 self._df_map[term] += 1
134 |             self._add_vec(docid, t_vec)
135 |             self._doc_len_map[docid] = term_vec.l2_norm(t_vec)
136 |             self._doc_vectors[docid] = t_vec
137 |             self._N += 1
138 |             self._next_docid += 1
139 | 
140 |     def _add_vec(self, docid, term_vec):
141 |         '''Add term_vec to the index'''
142 |         # build up a dictionary of batched updates for the index
143 |         term_index = defaultdict(list)
144 |         for (term, freq) in term_vec.iteritems():
145 |             term_index[term].append((docid, freq))
146 | 
147 |         # apply the updates to the term index
148 |         for (term, new_postings) in term_index.items():
149 |             self._term_index[term] = self.postings_list(term) + new_postings
150 | 
151 |     def del_docids(self, *docids):
152 |         '''Delete docids from index'''
153 | 
154 |         def _del_helper(map, key):
155 |             try:
156 |                 del map[key]
157 |             except KeyError:
158 | #                sys.stderr.write("Unkown docid: {}\n".format(docid))
159 |                 pass
160 |                 
161 |         # TODO: optimize for batch deletion
162 |         for docid in docids:
163 |             for (term, freq) in self._doc_vectors[docid].iteritems():
164 |                 # decr df count
165 |                 self._df_map[term] -= 1
166 |                 # filter out docid from term index
167 |                 self._term_index[term] = [
168 |                     (_docid, freq)
169 |                     for (_docid, freq) in self._term_index.get(term, [])
170 |                     if _docid != docid
171 |                 ]
172 |                 if len(self._term_index[term]) == 0:
173 |                     del self._term_index[term]
174 |             
175 |             name = self.docid_to_name(docid)
176 |             _del_helper(self._docid_to_name_map, docid)
177 |             _del_helper(self._docid_to_feature_map, docid)
178 |             _del_helper(self._name_to_docid_map, name)
179 |             _del_helper(self._doc_len_map, docid)
180 |             _del_helper(self._doc_vectors, docid)
181 |             
182 |             self._N -= 1
183 |         
184 |     def docid_to_name(self, docid):
185 |         return self._docid_to_name_map[docid]
186 |         
187 |     def name_to_docid(self, name):
188 |         return self._name_to_docid_map[name]
189 | 
190 |     def postings_list(self, term):
191 |         '''
192 |         Returns list of (docid, freq) tuples for documents containing term
193 |         '''
194 |         if self.config('lowercase'):
195 |             term = term.lower()
196 | 
197 |         return self._term_index.get(term, [])
198 |         
199 |     def _query(self, query_vec):
200 |         '''Finds documents similar to query_vec
201 |         
202 |         Params:
203 |             query_vec: term vector representing query document
204 |         
205 |         Returns:
206 |             A iterable of (docname, score) tuples sorted by score
207 |         '''
208 |         postings_lists = []
209 |         for term in query_vec:
210 |             postings_lists.append((term, self.postings_list(term)))
211 | 
212 |         
213 |         N = self._global_N or self._N
214 |         hits = self.query_scorer.score_docs(query_vec=query_vec,
215 |                                             postings_lists=postings_lists,
216 |                                             N=N,
217 |                                             get_doc_freq=self.get_doc_freq,
218 |                                             get_doc_len=self.get_doc_len)
219 |         
220 |         return ((self.docid_to_name(docid), score) for (docid, score) in hits)
221 | 
222 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/memory_sim_index.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | MemorySimIndex
 32 | 
 33 | Sample usage::
 34 | 
 35 |     from pprint import pprint
 36 |     from pysimsearch.sim_index import MemorySimIndex
 37 |     from pysimsearch import doc_reader
 38 | 
 39 |     sim_index = MemorySimIndex()
 40 |     sim_index.index_urls('http://www.stanford.edu/',
 41 |                          'http://www.berkeley.edu',
 42 |                          'http://www.ucla.edu',
 43 |                          'http://www.mit.edu')
 44 |     pprint(sim_index.postings_list('university'))
 45 |     pprint(list(sim_index.docnames_with_terms('university', 'california')))
 46 |     
 47 |     sim_index.set_query_scorer('simple_count')
 48 |     pprint(list(sim_index.query("stanford university")))
 49 | 
 50 | '''
 51 | 
 52 | from __future__ import (division, absolute_import, print_function,
 53 |                         unicode_literals)
 54 | 
 55 | import cPickle as pickle
 56 | from collections import defaultdict
 57 | 
 58 | from . import MapSimIndex
 59 | from pysimsearch.exceptions import *
 60 | 
 61 | class MemorySimIndex(MapSimIndex):
 62 |     '''
 63 |     Inherits from :class:`pysimsearch.sim_index.MapSimIndex`.
 64 |     
 65 |     Memory-based implementation of :class:`SimIndex`.  Indexes are backed with
 66 |     ``dict``.
 67 |     '''
 68 |     
 69 |     def __init__(self):
 70 |         
 71 |         # index metadata
 72 |         name_to_docid_map = dict()
 73 |         docid_to_name_map = dict()
 74 |         docid_to_feature_map = dict()
 75 | 
 76 |         # term index
 77 |         term_index = dict()
 78 |         
 79 |         # document vectors
 80 |         doc_vectors = dict()
 81 |         
 82 |         # additional stats used for scoring
 83 |         df_map = dict()
 84 |         doc_len_map = dict()
 85 | 
 86 |         self._maps = dict(name_to_docid_map=name_to_docid_map,
 87 |                           docid_to_name_map=docid_to_name_map,
 88 |                           docid_to_feature_map=docid_to_feature_map,
 89 |                           term_index=term_index,
 90 |                           doc_vectors=doc_vectors,
 91 |                           df_map=df_map,
 92 |                           doc_len_map=doc_len_map)
 93 |         
 94 |         super(MemorySimIndex, self).__init__(**self._maps)
 95 |         
 96 |     def save(self, file):
 97 |         '''Saved index to file'''
 98 |         # pickle won't let us save query_scorer
 99 |         qs = self.query_scorer
100 |         self.query_scorer = None
101 |         pickle.dump(self, file)
102 |         self.query_scorer = qs
103 |         
104 |     @staticmethod
105 |     def load(file):
106 |         '''Returns a ``MemorySimIndex`` loaded from pickle file'''
107 |         return pickle.load(file)
108 | 
109 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/remote_sim_index.py:
--------------------------------------------------------------------------------
  1 | ﻿#!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | RemoteSimIndex
 32 | 
 33 | Sample usage:
 34 | 
 35 | **Server**
 36 | ::
 37 | 
 38 |     bash$ pysimsearch/sim_server.py sim_index -p 9001
 39 |     Use Control-C to exit
 40 | 
 41 | 
 42 | ** pysimsearch Client **
 43 | 
 44 | >>> from pprint import pprint
 45 | >>> from pysimsearch import sim_index
 46 | >>> index = sim_index.RemoteSimIndex('http://localhost:9001/RPC2')
 47 | >>> index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu')
 48 | >>> pprint(index.query('university'))
 49 | [[u'http://www.stanford.edu/', 0.10469570845856098],
 50 |  [u'http://www.ucla.edu', 0.04485065887313478],
 51 |  [u'http://www.berkeley.edu', 0.020464326883958977]]
 52 | 
 53 | '''
 54 | 
 55 | from __future__ import (division, absolute_import, print_function,
 56 |                         unicode_literals)
 57 | 
 58 | import jsonrpclib as rpclib
 59 | #import xmlrpclib as rpclib
 60 | 
 61 | from . import SimIndex
 62 | 
 63 | class RemoteSimIndex(object):
 64 |     '''Proxy to a remote :class:`pysimsearch.sim_index.SimIndex`
 65 |     
 66 |     ``RemoteSimIndex`` is compatible with the :class:`SimIndex` interface,
 67 |     and provides access to a remote index.  We use this in place of
 68 |     directly using a jsonrpclib.Server() object because we need an object
 69 |     that acts like type :class:`SimIndex`.
 70 |     
 71 |     Instantiate a ``RemoteSimIndex`` as follows:
 72 |     
 73 |     >>> remote_index = RemoteSimIndex('http://localhost:9001/RPC2')
 74 |     >>> remote_index.query('university')
 75 |     ...
 76 |     
 77 |     '''
 78 |     
 79 |     def __init__(self, server_url):
 80 |         '''Initialize with server_url
 81 |         
 82 |         Params:
 83 |             server_url: url for remote ``SimIndex`` server
 84 |         '''
 85 |         from .. import sim_server
 86 |         self.PREFIX = sim_server.SimIndexService.PREFIX
 87 |         self.EXPORTED_METHODS = sim_server.SimIndexService.EXPORTED_METHODS
 88 |         self._server = rpclib.Server(server_url)
 89 |         
 90 |     def __getattr__(self, name):
 91 |         if name in self.EXPORTED_METHODS:
 92 |             func = getattr(self._server,
 93 |                            self.PREFIX + '.' + name)
 94 |             return func
 95 |         else:
 96 |             raise Exception("Unsupported method: {}".format(name))
 97 | 
 98 | # RemoteSimIndex is a subtype of SimIndex    
 99 | SimIndex.register(RemoteSimIndex)
100 | 
101 | 
102 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/shelf_sim_index.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | ShelfSimIndex
 32 | 
 33 | Sample usage::
 34 | 
 35 |     from pprint import pprint
 36 |     from pysimsearch.sim_index import ShelfSimIndex
 37 |     from pysimsearch import doc_reader
 38 | 
 39 |     sim_index = ShelfSimIndex()
 40 |     sim_index.index_urls('http://www.stanford.edu/',
 41 |                          'http://www.berkeley.edu',
 42 |                          'http://www.ucla.edu',
 43 |                          'http://www.mit.edu')
 44 |     pprint(sim_index.postings_list('university'))
 45 |     pprint(list(sim_index.docnames_with_terms('university', 'california')))
 46 |     
 47 |     sim_index.set_query_scorer('simple_count')
 48 |     pprint(list(sim_index.query("stanford university")))
 49 | 
 50 | '''
 51 | 
 52 | from __future__ import (division, absolute_import, print_function,
 53 |                         unicode_literals)
 54 | 
 55 | from collections import defaultdict, MutableMapping
 56 | from shelve import DbfilenameShelf as DBShelf
 57 | 
 58 | from . import MapSimIndex
 59 | from ..exceptions import *
 60 | 
 61 | class ShelfSimIndex(MapSimIndex):
 62 |     '''
 63 |     Inherits from :class:`pysimsearch.sim_index.MapSimIndex`.
 64 |     
 65 |     Shelf-based implementation of :class:`SimIndex`.  Indexes are backed with
 66 |     persistent :class:`shelve.DbfilenameShelf` objects.
 67 |     '''
 68 |     
 69 |     
 70 |     def __init__(self, filename, flag):
 71 |         name_to_docid_map = StrKeyMap(DBShelf(filename + '_n2d', flag))
 72 |         docid_to_name_map = StrKeyMap(DBShelf(filename + '_d2n', flag))
 73 |         docid_to_feature_map = StrKeyMap(DBShelf(filename + '_feat', flag))
 74 | 
 75 |         # term index
 76 |         term_index = StrKeyMap(DBShelf(filename + '_term', flag))
 77 | 
 78 |         # document vectors
 79 |         doc_vectors = StrKeyMap(DBShelf(filename + '_doc_vec', flag))
 80 | 
 81 |         # additional stats used for scoring
 82 |         df_map = StrKeyMap(DBShelf(filename + '_df', flag))
 83 |         doc_len_map = StrKeyMap(DBShelf(filename + '_dl', flag))
 84 | 
 85 |         self._maps = dict(name_to_docid_map=name_to_docid_map,
 86 |                           docid_to_name_map=docid_to_name_map,
 87 |                           docid_to_feature_map=docid_to_feature_map,
 88 |                           term_index=term_index,
 89 |                           doc_vectors=doc_vectors,
 90 |                           df_map=df_map,
 91 |                           doc_len_map=doc_len_map)
 92 |         
 93 |         super(ShelfSimIndex, self).__init__(**self._maps)
 94 |         
 95 |         self._N = len(docid_to_name_map)
 96 | 
 97 |     def close(self):
 98 |         for (mapname, map) in self._maps.items():
 99 |             map.close()
100 | 
101 | class StrKeyMap(MutableMapping):
102 |     '''
103 |     Ensure that key is converted to str type that is compatible with keys
104 |     for underlying map.
105 |     '''
106 |     def __init__(self, map):
107 |         self._map = map
108 |         
109 |     def __getitem__(self, key):
110 |         return self._map[str(key)]
111 |         
112 |     def __setitem__(self, key, value):
113 |         self._map[str(key)] = value
114 |         
115 |     def __delitem__(self, key):
116 |         del self._map[str(key)]
117 |         
118 |     def __iter__(self):
119 |         raise Exception('Unsupported')
120 |         # return iter(self._map)
121 |         
122 |     def __len__(self):
123 |         return len(self._map)
124 |         
125 |     def close(self):
126 |         return self._map.close()
127 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/sim_index.py:
--------------------------------------------------------------------------------
  1 | ﻿#!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | SimIndex
 32 | 
 33 | See :mod:`pysimsearch.sim_index.memory_sim_index` for sample usage
 34 | 
 35 | '''
 36 | 
 37 | from __future__ import (division, absolute_import, print_function,
 38 |                         unicode_literals)
 39 | 
 40 | import abc
 41 | import io
 42 | import itertools
 43 | 
 44 | from .. import doc_reader
 45 | from .. import term_vec
 46 | from ..exceptions import *
 47 | from ..query_scorer import QueryScorer
 48 | 
 49 | class SimIndex(object):
 50 |     '''
 51 |     Base class for similarity indexes
 52 |     
 53 |     Defines interface as well as provides default implementation for
 54 |     several methods.
 55 |     
 56 |     Instance Attributes:
 57 |         config: dictionary of configuration variables
 58 |         
 59 |     '''
 60 | 
 61 |     __metaclass__ = abc.ABCMeta
 62 |     
 63 |     def __init__(self):
 64 |         self._config = {
 65 |             'lowercase': True,
 66 |             'stoplist': {}  # using dict instead of set, for rpc support
 67 |         }
 68 |         self.query_scorer = None
 69 |         self._N = 0
 70 |         self._global_N = None
 71 |         self._next_docid = 0
 72 | 
 73 |     def config(self, key):
 74 |         return self._config[key]
 75 | 
 76 |     def set_config(self, key, value):
 77 |         self._config[key] = value
 78 | 
 79 |     def update_config(self, **d):
 80 |         self._config.update(d)
 81 |         
 82 |     def load_stoplist(self, stopfile):
 83 |         stoplist = {}
 84 |         for line in stopfile:
 85 |             stoplist.update(zip(line.split(), itertools.repeat(1)))
 86 |         self.set_config('stoplist', stoplist)
 87 | 
 88 |     @abc.abstractmethod
 89 |     def set_global_df_map(self, df_map):
 90 |         '''Set global df stats'''
 91 |         return
 92 |     
 93 |     @abc.abstractmethod
 94 |     def get_local_df_map(self):
 95 |         '''Get local df stats'''
 96 |         return
 97 |     
 98 |     @abc.abstractmethod
 99 |     def get_name_to_docid_map(self):
100 |         '''Return local mapping of name to docids'''
101 |         return
102 | 
103 |     def set_global_N(self, N):
104 |         '''Set global number of documents'''
105 |         self._global_N = N
106 |     
107 |     def get_local_N(self):
108 |         '''Return local number of documents'''
109 |         return self._N
110 |         
111 |     def set_query_scorer(self, query_scorer):
112 |         '''Set the query_scorer
113 |         
114 |         Params:
115 |             query_scorer: if string type, we assume it is a scorer name,
116 |                           else we assume it is itself a scoring object
117 |                           of base type :class:`query_scorer.QueryScorer`.
118 |         '''
119 |         if isinstance(query_scorer, basestring):
120 |             self.query_scorer = QueryScorer.make_scorer(query_scorer)
121 |         else:
122 |             self.query_scorer = query_scorer
123 | 
124 |     @abc.abstractmethod
125 |     def index_files(self, named_files):
126 |         '''Add ``named_files`` to the index
127 |         
128 |         Params:
129 |             named_files: iterable of (filename, file) pairs.
130 |                          Takes ownership of (and consumes) the files.
131 |         '''
132 |         return
133 | 
134 |     def index_filenames(self, *filenames):
135 |         '''Add ``filenames`` to the index
136 |         
137 |         Convenience method that wraps :meth:`index_files()`
138 |         
139 |         Params:
140 |             ``filenames``: list of filenames to add to the index.
141 |         '''
142 |         return self.index_files(doc_reader.get_text_files(filenames))
143 |         
144 |     def index_urls(self, *urls):
145 |         '''Add ``urls`` to the index
146 |         
147 |         Convenience method that wraps :meth:`index_files()`
148 |         
149 |         Params:
150 |             ``urls``: list of urls of web pages to add to the index.
151 |         '''
152 |         return self.index_files(doc_reader.get_urls(urls))
153 | 
154 |     def index_string_buffers(self, named_string_buffers):
155 |         '''Add ``named_string_buffers`` to the index
156 |         
157 |         Params:
158 |             named_string_buffers: iterable of (name, string) tuples, where
159 |                                   the string contains the data to index.
160 |             
161 |         '''
162 |         named_files = []
163 |         for (name, string_buffer) in named_string_buffers:
164 |             if isinstance(string_buffer, str):
165 |                 string_buffer = unicode(string_buffer)
166 |             named_files.append((name, io.StringIO(string_buffer)))
167 |         self.index_files(named_files)
168 |         
169 |     @abc.abstractmethod
170 |     def del_docids(self, *docids):
171 |         '''Deletes documents corresponding to docids from the index'''
172 |         return
173 |     
174 |     @abc.abstractmethod
175 |     def docid_to_name(self, docid):
176 |         '''Returns document name for a given docid'''
177 |         return
178 |         
179 |     @abc.abstractmethod
180 |     def name_to_docid(self, name):
181 |         '''Returns docid for a given document name'''
182 |         return
183 | 
184 |     @abc.abstractmethod
185 |     def postings_list(self, term):
186 |         '''
187 |         Return list of (docid, frequency) tuples for docs that contain term
188 |         '''
189 |         return
190 |     
191 |     def docids_with_terms(self, terms):
192 |         '''Returns a list of docids of docs containing all terms'''
193 |         docs = None  # will hold a set of matching docids
194 |         for term in terms:
195 |             if docs is None:
196 |                 docs = set((x[0] for x in self.postings_list(term)))
197 |             else:
198 |                 docs.intersection_update(
199 |                     (x[0] for x in self.postings_list(term)))
200 |                 
201 |         # return sorted list
202 |         if docs is None: docs = []
203 |         return sorted(docs)
204 |     
205 |     def docnames_with_terms(self, *terms):
206 |         '''Returns an iterable of docnames containing terms'''
207 |         if self.config('lowercase'):
208 |             terms = [term.lower() for term in terms]
209 |         return (self.docid_to_name(docid) for docid in self.docids_with_terms(terms))
210 |         
211 |     def query(self, q):
212 |         '''Finds documents similar to q.
213 |         
214 |         Params:
215 |             query: the query given as either a string or query vector
216 |             
217 |         Returns:
218 |             A iterable of (docname, score) tuples sorted by score
219 |         '''
220 |         if isinstance(q, basestring):
221 |             if isinstance(q, str):
222 |                 q = unicode(q)
223 |             return self._query(
224 |                 term_vec.term_vec(q,
225 |                                   stoplist=self.config('stoplist'),
226 |                                   lowercase=self.config('lowercase')))
227 |         else:
228 |             return self._query(q)
229 |         
230 |     @abc.abstractmethod
231 |     def _query(self, query_vec):
232 |         '''Finds documents similar to query_vec
233 |         
234 |         Params:
235 |             query_vec: term vector representing query document
236 |         
237 |         Returns:
238 |             A iterable of (docname, score) tuples sorted by score
239 |         '''
240 |         return
241 |     
242 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_index/sim_index_collection.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | SimIndexCollection
 32 | 
 33 | Sample usage::
 34 | 
 35 |     from pprint import pprint
 36 |     from pysimsearch.sim_index import MemorySimIndex, SimIndexCollection
 37 | 
 38 |     indexes = (MemorySimIndex(), MemorySimIndex())
 39 |     index_coll = SimIndexCollection()
 40 |     index_coll.add_shards(*indexes)
 41 |     index_coll.set_query_scorer('tfidf')
 42 |     index_coll.index_urls('http://www.stanford.edu/',
 43 |                           'http://www.berkeley.edu',
 44 |                           'http://www.ucla.edu',
 45 |                           'http://www.mit.edu')
 46 |     
 47 |     pprint(index_coll.query('stanford university'))
 48 | 
 49 | '''
 50 | 
 51 | from __future__ import (division, absolute_import, print_function,
 52 |                         unicode_literals)
 53 | 
 54 | from collections import defaultdict
 55 | import operator
 56 | import os
 57 | 
 58 | from . import SimIndex
 59 | from ..exceptions import *
 60 | 
 61 | class SimIndexCollection(SimIndex):
 62 |     '''
 63 |     Inherits from :class:`pysimsearch.sim_index.SimIndex`.
 64 |     
 65 |     Provides a :class:`SimIndex` view over a sharded collection of SimIndexes.
 66 |     
 67 |     Useful with collections of remote SimIndexes to provide a
 68 |     distributed indexing and serving architecture.
 69 |     
 70 |     Assumes document-level sharding:
 71 |     
 72 |       - ``query()`` requests are routed to all shards in collection.
 73 |       - ``index_files()`` requests are routed according to a sharding function
 74 |     
 75 |     Note that if we had used query-sharding, then instead, queries would
 76 |     be routed using a sharding function, and index-requests would be
 77 |     routed to all shards.  The two sharding approaches correspond to either
 78 |     partitioning the postings matrix by columns (doc-sharding),
 79 |     or rows (query-sharding).
 80 |     
 81 |     The shard-function is only used for ``index_*()`` operations.  If you
 82 |     have a read-only collection, you don't need a sharding function.
 83 |     '''
 84 |     
 85 |     def __init__(self, shards=(), root=True):
 86 |         super(SimIndexCollection, self).__init__()
 87 | 
 88 |         self._shards = []
 89 |         self.shard_func = self.default_shard_func
 90 |         self._name_to_docid_map = {}
 91 |         self._docid_to_name_map = {}
 92 |         self._df_map = {}
 93 |         
 94 |         self._dirty = False
 95 |         
 96 |         self.set_config('root', root, passthrough=False)
 97 |         
 98 |         if shards:
 99 |             self.add_shards(*shards)
100 | 
101 |     def set_config(self, key, value, passthrough=True):
102 |         '''Update config var for shards'''
103 |         super(SimIndexCollection, self).set_config(key, value)
104 |         if passthrough:
105 |             for shard in self._shards:
106 |                 shard.set_config(key, value)
107 |             
108 |     def update_config(self, passthrough=True, **d):
109 |         '''Update config for shards'''
110 |         super(SimIndexCollection, self).update_config(**d)
111 |         if passthrough:
112 |             for shard in self._shards:
113 |                 shard.update_config(**d)
114 | 
115 |     def clear_shards(self):
116 |         self._shards = []
117 |         
118 |     def add_shards(self, *sim_index_shards):
119 |         for shard in sim_index_shards:
120 |             shard.update_config(**self._config)
121 |         self._shards.extend(sim_index_shards)
122 |         self.update_trigger_helper()
123 |     
124 |     _salt = None
125 |     def default_shard_func(self, shard_key):
126 |         '''implements the default sharding function'''
127 |         if self._salt is None:
128 |             self._salt = os.urandom(4)
129 |         return hash(str(shard_key)+self._salt) % len(self._shards)
130 |         
131 |     def set_shard_func(self, func):
132 |         self._shard_func = func
133 | 
134 |     def set_global_N(self, N):
135 |         for shard in self._shards:
136 |             shard.set_global_N(N)
137 | 
138 |     def set_global_df_map(self, df_map):
139 |         for shard in self._shards:
140 |             shard.set_global_df_map(df_map)
141 |         
142 |     def get_local_df_map(self):
143 |         return self._df_map
144 |     
145 |     def get_name_to_docid_map(self):
146 |         return self._name_to_docid_map
147 | 
148 |     def update_trigger(method):
149 |         '''
150 |         Decorator for methods that update the index.  Used as a post-update
151 |         trigger that gathers new term stats, and propagates them back down (if
152 |         we're the root node)
153 |         '''
154 |         def wrapper(self, *args, **kwargs):
155 |             self._dirty = True
156 |             val = method(self, *args, **kwargs)
157 |             if self._dirty:
158 |                 self.update_trigger_helper()
159 |                 self._dirty = False
160 |         
161 |         return wrapper
162 | 
163 |     @update_trigger
164 |     def index_files(self, named_files):
165 |         '''
166 |         Translate to index_string_buffers() call, since file objects
167 |         can't be serialized for rpcs to backends.  Note: we
168 |         currently read in all files in memory, and make one call to
169 |         index_string_buffers() -- this can be memory-intesive
170 |         if named_files represents a large number of files.
171 |         
172 |         TODO: read in files in smaller batches, and then make mutiple
173 |         calls to index_string_buffers().
174 |         '''
175 |         named_string_buffers = [(name, file.read())
176 |             for (name, file) in named_files]
177 |         self.index_string_buffers(named_string_buffers)
178 | 
179 |     @update_trigger
180 |     def index_string_buffers(self, named_string_buffers):
181 |         '''Routes index_string_buffers() call to appropriate shard.'''
182 |         # minimize rpcs by collecting (name, buffer) tuples for
183 |         # different shards up-front
184 |         sharded_input_map = defaultdict(list)
185 |         for (name, buffer) in named_string_buffers:
186 |             sharded_input_map[self.shard_func(name)].append((name, buffer))
187 | 
188 |         # issue an indexing rpc to each sharded backend that has some input
189 |         # TODO: use non-blocking rpc's
190 |         for shard_id in sharded_input_map:
191 |             self._shards[shard_id].index_string_buffers(
192 |                 sharded_input_map[shard_id]
193 |             )
194 | 
195 |     @update_trigger
196 |     def index_urls(self, *urls):
197 |         '''Index web pages given by urls'''
198 |         # minimize rpcs by collecting (name, buffer) tuples for
199 |         # different shards up-front
200 |         sharded_input_map = defaultdict(list)
201 |         for url in urls:
202 |             sharded_input_map[self.shard_func(url)].append(url)
203 | 
204 |         # Issue an indexing call to each sharded backend that has some input
205 |         # Generally the sharded servers should be backed with
206 |         # ConcurrentSimIndexes so that the index_urls() call will generally
207 |         # be non-blocking.
208 |         for shard_id in sharded_input_map:
209 |             self._shards[shard_id].index_urls(
210 |                 *sharded_input_map[shard_id]
211 |             )
212 | 
213 |     @update_trigger
214 |     def del_docids(self, *docids):
215 |         '''Delete docid from index collection'''
216 | 
217 |         sharded_del_map = defaultdict(list)
218 |         for docid in docids:
219 |             # make sure we have a compound docid
220 |             assert '-' in docid
221 |             (shard_id, sep, remote_docid) = docid.partition('-')
222 |             shard_id = int(shard_id)
223 |             # if the remote shard is expected to be a leaf, then cast
224 |             # remote docid to int
225 |             if '-' not in remote_docid:
226 |                 remote_docid = int(remote_docid)
227 |             sharded_del_map[shard_id].append(remote_docid)
228 |         
229 |         # propagate the requests the appropriate shard
230 |         for (shard_id, remote_docids) in sharded_del_map.items():
231 |             self._shards[shard_id].del_docids(*remote_docids)
232 |     
233 |     @staticmethod
234 |     def make_node_docid(shard_id, docid):
235 |         return "{}-{}".format(shard_id, docid)
236 |     
237 |     def docid_to_name(self, docid):
238 |         '''Translates node docid to name'''
239 |         return self._docid_to_name_map[docid]
240 |     
241 |     def name_to_docid(self, name):
242 |         '''Translates name to node docid'''
243 |         return self._name_to_docid_map[name]
244 |     
245 |     def postings_list(self, term):
246 |         '''Returns aggregated postings list in terms of global docids'''
247 | 
248 |         merged_postings_list = []
249 |         for shard_id in range(len(self._shards)):
250 |             merged_postings_list.extend(
251 |                  [(self.make_node_docid(shard_id, docid), freq) for
252 |                   (docid, freq) in self._shards[shard_id].postings_list(term)]
253 |                 )
254 |         
255 |         return merged_postings_list
256 |     
257 |     def set_query_scorer(self, query_scorer):
258 |         '''Passes ``set_query_scorer()`` request to all shards.
259 |         
260 |         Params:
261 |             query_scorer: scorer object or name. If any backends are remote,
262 |                           query_scorer needs to be a scorer name, rather than
263 |                           a scorer object (which we currently don't serialize
264 |                           for rpcs)
265 |         '''
266 |         for shard in self._shards:
267 |             shard.set_query_scorer(query_scorer)
268 |             
269 |     def _query(self, query_vec):
270 |         '''Issues query to collection and returns merged results
271 |         
272 |         TODO: use a merge alg. (heapq.merge doesn't have a key= arg yet)
273 |         TODO: add support for rank-aggregation in the case of heterogenous
274 |               collections where ir scores are not directly comparable
275 |         '''
276 |         results = []
277 |         for shard in self._shards:
278 |             results.extend(shard.query(query_vec))
279 |         results.sort(key=operator.itemgetter(1), reverse=True)
280 |         return results
281 | 
282 |     def update_trigger_helper(self):
283 |         self.update_node_stats()
284 | 
285 |         # If we're the root of the collection, then propogate back node
286 |         # stats (which are global stats) to children.  Else some ancestor
287 |         # node will have that responsibility.
288 |         if self.config('root'):
289 |             self.broadcast_node_stats()
290 | 
291 |     def update_node_stats(self):
292 |         '''
293 |         Fetches local stats from all shards, aggregates them, and
294 |         rebroadcasts global stats back to shards.  Currently uses
295 |         "brute-force"; incremental updating (in either direction)
296 |         is not supported.
297 |         '''
298 | 
299 |         def merge_df_map(target, source):
300 |             '''
301 |             Helper function to merge df_maps.
302 |             '''
303 |             for (term, df) in source.items():
304 |                 if term not in target: target[term] = 0
305 |                 target[term] += df
306 | 
307 |         # Collect global stats
308 |         self._N = 0
309 |         self._df_map = {}
310 |         name_to_docid_maps = {}
311 |         for shard_id in range(len(self._shards)):
312 |             shard = self._shards[shard_id]
313 |             self._N += shard.get_local_N()
314 |             merge_df_map(self._df_map, shard.get_local_df_map())
315 |             name_to_docid_maps[shard_id] = shard.get_name_to_docid_map()
316 | 
317 |         # Update our name <-> node_docid mapping
318 |         for (shard_id, name_to_docid_map) in name_to_docid_maps.iteritems():
319 |             for (name, docid) in name_to_docid_map.iteritems():
320 |                 gdocid = self.make_node_docid(shard_id, docid)
321 |                 self._name_to_docid_map[name] = gdocid
322 |                 self._docid_to_name_map[gdocid] = name
323 | 
324 |     def broadcast_node_stats(self):  
325 |         # Broadcast global stats.  Only called by collection root node.
326 |         for shard in self._shards:
327 |             shard.set_global_N(self._N)
328 |             shard.set_global_df_map(self._df_map)
329 | 
330 | 


--------------------------------------------------------------------------------
/pysimsearch/sim_server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2011, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #         * Redistributions of source code must retain the above copyright
  9 | #           notice, this list of conditions and the following disclaimer.
 10 | #         * Redistributions in binary form must reproduce the above copyright
 11 | #           notice, this list of conditions and the following disclaimer in the
 12 | #           documentation and/or other materials provided with the distribution.
 13 | #         * The names of project contributors may not be used to endorse or
 14 | #           promote products derived from this software without specific
 15 | #           prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | 
 30 | '''
 31 | SimServer
 32 | 
 33 | Server wrapper for pysimsearch modules.  Currently, only provides access
 34 | to sim_index.
 35 | 
 36 | *Sample session:*
 37 | 
 38 | **Server**
 39 | ::
 40 | 
 41 |     bash$ ./sim_server.py sim_index -p 9001
 42 |     Use Control-C to exit
 43 | 
 44 | **jsonrpclib Client**
 45 | 
 46 | >>> from pprint import pprint
 47 | >>> import jsonrpclib
 48 | >>> server = jsonrpclib.Server('http://localhost:9001/RPC2')
 49 | >>> server.sim_index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu')
 50 | >>> pprint(server.sim_index.query('university'))
 51 | [[u'http://www.stanford.edu/', 0.10469570845856098],
 52 |  [u'http://www.ucla.edu', 0.04485065887313478],
 53 |  [u'http://www.berkeley.edu', 0.020464326883958977]]
 54 | 
 55 | ** pysimsearch Client **
 56 | >>> from pprint import pprint
 57 | >>> from pysimsearch import sim_index
 58 | >>> index = sim_index.RemoteSimIndex('http://localhost:9001/RPC2')
 59 | >>> index.index_urls('http://www.stanford.edu/', 'http://www.berkeley.edu', 'http://www.ucla.edu')
 60 | >>> pprint(index.query('stanford'))
 61 | [[u'http://www.stanford.edu/', 0.3612214953965162]]
 62 | 
 63 | '''
 64 | 
 65 | from __future__ import (division, absolute_import, print_function,
 66 |         unicode_literals)
 67 | 
 68 | # boilerplate to allow running as script
 69 | if __name__ == "__main__" and __package__ is None:
 70 |     import sys, os
 71 |     parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 72 |     sys.path.insert(0, parent_dir)
 73 |     import pysimsearch
 74 |     __package__ = str("pysimsearch")
 75 |     del sys, os
 76 |     
 77 | # external modules
 78 | import argparse
 79 | import logging
 80 | import traceback
 81 | import types
 82 | 
 83 | from pprint import pprint
 84 | from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCServer as SimpleRPCServer
 85 | from jsonrpclib.SimpleJSONRPCServer import SimpleJSONRPCRequestHandler as SimpleRPCRequestHandler
 86 | 
 87 | #from SimpleXMLRPCServer import SimpleXMLRPCServer as SimpleRPCServer
 88 | #from SimpleXMLRPCServer import SimpleXMLRPCRequestHandler as SimpleRPCRequestHandler
 89 | 
 90 | # our modules
 91 | from .sim_index import *
 92 | from . import query_scorer
 93 | 
 94 | class SimIndexService(object):
 95 |     '''Provide access to sim_index as an RPC service'''
 96 | 
 97 |     PREFIX = 'sim_index'
 98 |     EXPORTED_METHODS = {'index_urls',
 99 |                         'index_string_buffers',
100 |                         'del_docids',
101 |                         'docid_to_name',
102 |                         'name_to_docid',
103 |                         'docid_to_name',
104 |                         'postings_list',
105 |                         'docids_with_terms',
106 |                         'docnames_with_terms',
107 |                         'set_query_scorer',
108 |                         'query',
109 |                         'set_global_N',
110 |                         'get_local_N',
111 |                         'set_global_df_map',
112 |                         'get_local_df_map',
113 |                         'get_name_to_docid_map',
114 |                         'config',
115 |                         'set_config',
116 |                         'update_config'}
117 |     
118 |     def __init__(self, index):
119 |         self._sim_index = index
120 |     
121 |     def _dispatch(self, method, params):
122 |         if not method.startswith(self.PREFIX + '.'):
123 |             raise Exception('method "{}" is not supported: bad prefix'.format(method))
124 | 
125 |         method_name = method.partition('.')[2]
126 | 
127 |         logging.info('_dispatch: {}'.format(method))
128 |         
129 |         if method_name not in self.EXPORTED_METHODS:
130 |             raise Exception('method "{}" is not supported'.format(method_name))
131 |             
132 |         func = getattr(self._sim_index, method_name)
133 |         try:
134 |             if type(params) is types.ListType:
135 |                 r = func(*params)
136 |             else:
137 |                 r = func(**params)
138 |             # if we got back a generator, then let's materialize a list so it
139 |             # can serialize properly
140 |             if isinstance(r, types.GeneratorType):
141 |                 r = list(r)
142 |             return r
143 |         except Exception as e:
144 |             logging.error(traceback.format_exc())
145 |             raise e
146 | 
147 | # Restrict to a particular path.
148 | class RequestHandler(SimpleRPCRequestHandler):
149 |     rpc_paths = ('/RPC2',)
150 | 
151 | def start_sim_index_server(port,
152 |                            backends=(),
153 |                            remote_urls=(),
154 |                            root=True,
155 |                            logRequests=True):
156 | 
157 |     server = SimpleRPCServer(('localhost', port),
158 |                              logRequests=logRequests,
159 |                              requestHandler=RequestHandler)
160 |     
161 |     backend_list = list(backends)
162 |     if remote_urls:
163 |         backend_list.extend(
164 |             [RemoteSimIndex(url) for url in remote_urls])
165 | 
166 |     if backend_list:
167 |         if len(backend_list) == 1:
168 |             index = ConcurrentSimIndex(backend_list[0])
169 |         else:
170 |             index = ConcurrentSimIndex(
171 |                         SimIndexCollection(
172 |                             shards=backend_list, root=root))
173 |     else:
174 |         index = ConcurrentSimIndex(MemorySimIndex())
175 |         index.set_query_scorer('tfidf')
176 | 
177 |     server.register_instance(SimIndexService(index))
178 | 
179 |     try:
180 |         print('Use Control-C to exit')
181 |         server.serve_forever()
182 |     except KeyboardInterrupt:
183 |         print('Exiting')
184 | 
185 | 
186 | # --- main() ---
187 | 
188 | def main():
189 |     parser = argparse.ArgumentParser(
190 |         description='Start a pysimsearch server')
191 |     subparsers = parser.add_subparsers(title='services',
192 |                                        description='valid services',
193 |                                        dest='command',
194 |                                        help='services help',)
195 |     
196 |     parser_sim_index = subparsers.add_parser('sim_index',
197 |                                              help='Start a SimIndex')
198 |     parser_sim_index.add_argument(
199 |             '-p', '--port', nargs='?',
200 |             default=9001, type=int,
201 |             help='Specify server port'
202 |         )
203 | 
204 |     parser_sim_index.add_argument(
205 |             '-r', '--remote_shards', nargs='*',
206 |             help='Specify remote backends to use, instead of local index'
207 |         )
208 |     
209 |     parser_sim_index.add_argument(
210 |             '--noroot', action='store_false',
211 |             dest='root', default=True,
212 |             help='True if this is the root index node'
213 |     )
214 | 
215 |     args = parser.parse_args()
216 |     if args.command == 'sim_index':
217 |         start_sim_index_server(port=args.port,
218 |                                remote_urls=args.remote_shards,
219 |                                root=args.root)
220 |     else:
221 |         raise Exception('Unknown command: {}'.format(args.command))
222 |         
223 | if __name__ == '__main__':
224 |     main()
225 |     
226 | 


--------------------------------------------------------------------------------
/pysimsearch/similarity.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #     * Redistributions of source code must retain the above copyright
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above copyright
 11 | #       notice, this list of conditions and the following disclaimer in the
 12 | #       documentation and/or other materials provided with the distribution.
 13 | #     * The names of project contributors may not be used to endorse or
 14 | #       promote products derived from this software without specific
 15 | #       prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | r'''
 30 | Sample usage as a script::
 31 | 
 32 |     $ python similarity.py http://www.stanford.edu/ http://www.berkeley.edu/ http://www.mit.edu/
 33 |     Comparing files ['http://www.stanford.edu/', 'http://www.berkeley.edu/', 'http://www.mit.edu/']
 34 |     sim(http://www.stanford.edu/,http://www.berkeley.edu/)=0.322771960247
 35 |     sim(http://www.stanford.edu/,http://www.mit.edu/)=0.142787018368
 36 |     sim(http://www.berkeley.edu/,http://www.mit.edu/)=0.248877629741
 37 | 
 38 | '''
 39 | 
 40 | from __future__ import (division, absolute_import, print_function,
 41 |     unicode_literals)
 42 | 
 43 | # boilerplate to allow running as script
 44 | if __name__ == "__main__" and __package__ is None:
 45 |     import sys, os
 46 |     parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 47 |     sys.path.insert(0, parent_dir)
 48 |     import pysimsearch
 49 |     __package__ = str("pysimsearch")
 50 |     del sys, os
 51 |     
 52 | import argparse
 53 | from itertools import chain, repeat
 54 | import re
 55 | 
 56 | # our modules
 57 | from . import doc_reader
 58 | from .exceptions import *
 59 | from .term_vec import *
 60 | 
 61 | 
 62 | # --- top-level functions ---
 63 | def measure_similarity(file_a, file_b, sim_func = None):
 64 |     r'''
 65 |     Returns the textual similarity of term_vec_a and term_vec_b using chosen
 66 |     similarity metric
 67 |     
 68 |     'sim_func' defaults to cosine_sim if not specified
 69 |     '''
 70 |     if sim_func == None:
 71 |         sim_func = cosine_sim  # default to cosine_sim
 72 |     
 73 |     return sim_func(term_vec(file_a), term_vec(file_b))
 74 | 
 75 | def pairwise_compare(filenames=None, urls=None):
 76 |     r'''
 77 |     Do a pairwise comparison of all documents specified by ``filenames``
 78 |     and ``urls`` and return their pairwise similarities
 79 |     '''
 80 |     input = []
 81 |     if filenames is not None:
 82 |         input.extend(zip(filenames, repeat(doc_reader.get_text_file)))
 83 |     if urls is not None:
 84 |         input.extend(zip(urls, repeat(doc_reader.get_url)))
 85 |         
 86 |     similarities = []
 87 |     for i in range(0, len(input)):
 88 |         for j in range(i+1, len(input)):
 89 |             (name_a, get_input) = input[i]
 90 |             (name_b, get_input) = input[j]
 91 |             print("comparing {} and {}".format(name_a, name_b))
 92 |             with get_input(name_a) as file_a:
 93 |                 with get_input(name_b) as file_b:
 94 |                     similarities.append((name_a,
 95 |                                          name_b,
 96 |                                          measure_similarity(file_a, file_b)))
 97 |     return similarities
 98 |   
 99 | # --- Similarity measures ---
100 |     
101 | def cosine_sim(u, v):
102 |     r'''
103 |     Returns the cosine similarity of u,v: ``<u,v>/(|u||v|)``
104 |     where ``|u|`` is the L2 norm
105 |     '''
106 |     return dot_product(u, v) / (l2_norm(u) * l2_norm(v))
107 | 
108 | def jaccard_sim(A, B):
109 |     r'''
110 |     Returns the Jaccard similarity of A,B: ``|A \cap B| / |A \cup B|``
111 |     We treat A and B as multi-sets (The Jaccard coefficient is technically
112 |     meant for sets, although it is easily extended to multi-sets)
113 |     '''
114 |     return mag_intersect(A, B) / mag_union(A, B)
115 | 
116 | 
117 | # --- main() ---
118 | 
119 | def main():
120 |     '''Commandline interface for measure pairwise similarities of files'''
121 |     parser = argparse.ArgumentParser(
122 |         description='List pairwise similarities of input documents')
123 |     parser.add_argument('doc', nargs='*',
124 |                         help='a document in the comparison list')
125 |     parser.add_argument('-f', '--filename_list', nargs='?',
126 |                         help='file containing list of filenames to compare')
127 |     parser.add_argument('-u', '--url_list', nargs='?',
128 |                         help='file containing list of urls to compare')
129 | 
130 |     args = parser.parse_args()
131 | 
132 |     def get_list(input_fname):
133 |         list = []
134 |         if input_fname is not None:
135 |             try:
136 |                 with open(input_fname) as input_file:
137 |                     list = [line.strip() for line in
138 |                                   input_file.readlines()]
139 |             except IOError:
140 |                 print("Sorry, could not open " + input_fname)
141 |         return list
142 | 
143 |     filenames = get_list(args.filename_list)
144 |     urls = get_list(args.url_list)
145 | 
146 |     for doc in args.doc:
147 |         if re.search('^http://', doc):
148 |             urls.append(doc)
149 |         else:
150 |             filenames.extend(doc)
151 | 
152 |     if len(filenames) + len(urls) < 2:
153 |         raise Error("Sorry, you must specify at least two documents "
154 |                     "to compare.")  
155 | 
156 |     print('Comparing files {}'.format(list(chain(filenames, urls))))
157 |     similarities = pairwise_compare(filenames=filenames, urls=urls)
158 |     for (name_a, name_b, sim) in similarities:
159 |         print('sim({0},{1})={2}'.format(name_a, name_b, sim))
160 | 
161 | 
162 | if __name__ == '__main__':
163 |     main()
164 | 


--------------------------------------------------------------------------------
/pysimsearch/term_vec.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
 4 | # All rights reserved.
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #     * Redistributions of source code must retain the above copyright
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above copyright
11 | #       notice, this list of conditions and the following disclaimer in the
12 | #       documentation and/or other materials provided with the distribution.
13 | #     * The names of project contributors may not be used to endorse or
14 | #       promote products derived from this software without specific
15 | #       prior written permission.
16 | #
17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | '''
30 | Term-vector operations
31 | '''
32 | 
33 | import io
34 | import math
35 | 
36 | def dot_product(v1, v2):
37 |     '''Returns dot product of two term vectors'''
38 |     val = 0.0
39 |     for term in v1:
40 |         if term in v2: val += v1[term] * v2[term]
41 |     return val
42 | 
43 | def l2_norm(v):
44 |     '''Returns L2 norm of term vector v'''
45 |     val = 0.0
46 |     for term in v:
47 |         val += v[term]**2
48 |     val = math.sqrt(val)
49 |     return val
50 | 
51 | def mag_union(A, B):
52 |     '''
53 |     Returns magnitude of multiset-union of A and B
54 |     '''
55 |     val = 0
56 |     for term in A: val += A[term]
57 |     for term in B: val += B[term]
58 |     return val
59 | 
60 | def mag_intersect(A, B):
61 |     '''
62 |     Returns magnitude of multiset-intersection of A and B
63 |     '''
64 |     val = 0
65 |     for term in A:
66 |         if term in B: val += min(A[term], B[term])
67 |     return val
68 | 
69 | def magnitude(v):
70 |     '''Returns L2 norm of term vector v (identical to l2_norm())'''
71 |     return l2_norm(v)
72 | 
73 | def term_vec(input, stoplist=None, lowercase=False):
74 |     '''
75 |     Returns a term vector for ``input``, represented as a dictionary
76 |     of the form {term: frequency}
77 |     
78 |     ``input`` can be either a string or a file
79 |     '''
80 |     if isinstance(input, basestring):
81 |         with io.StringIO(input) as string_buffer:
82 |             return term_vec(string_buffer)
83 |     else:
84 |         # default args:
85 |         if stoplist is None:
86 |             stoplist = set()
87 |         
88 |         tf_dict = {}
89 |         for line in input:
90 |             for term in line.split():
91 |                 if term not in stoplist:
92 |                     if lowercase: term = term.lower()
93 |                     if term not in tf_dict: tf_dict[term] = 0
94 |                     tf_dict[term] += 1
95 |         return tf_dict
96 | 


--------------------------------------------------------------------------------
/pysimsearch/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taherh/pysimsearch/57796f7175565a8481fe80a56c7815bf0264d4fb/pysimsearch/test/__init__.py


--------------------------------------------------------------------------------
/pysimsearch/test/freq_tools_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
 4 | # All rights reserved.
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #     * Redistributions of source code must retain the above copyright
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above copyright
11 | #       notice, this list of conditions and the following disclaimer in the
12 | #       documentation and/or other materials provided with the distribution.
13 | #     * The names of project contributors may not be used to endorse or
14 | #       promote products derived from this software without specific
15 | #       prior written permission.
16 | #
17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | '''
30 | Unittests for pysimsearch.freq_tools package
31 | 
32 | To run unittests, run 'nosetests' from the test directory
33 | '''
34 | from __future__ import(division, absolute_import, print_function,
35 |                        unicode_literals)
36 | 
37 | import unittest
38 | 
39 | import io
40 | import pprint
41 | 
42 | from pysimsearch import freq_tools
43 | from pysimsearch import doc_reader
44 |     
45 | class FreqToolsTest(unittest.TestCase):
46 |     longMessage = True
47 | 
48 |     def test_read_df(self):
49 |         '''read_df() test'''
50 |         df_dict = {'a':5, 'b':3, 'c':1}
51 |         df_file_str =\
52 |         '''
53 |         a    5
54 |         b    3
55 |         c    1
56 |         '''
57 |         df_file = io.StringIO(df_file_str)
58 |         self.assertEqual(freq_tools.read_df(df_file), df_dict)
59 |         
60 |     def test_write_df(self):
61 |         '''write_df() test'''
62 |         df_dict = {'a':5, 'b':3, 'c':1}
63 |         df_file = io.StringIO()
64 |         freq_tools.write_df(df_dict, df_file)
65 |         
66 |         df_file.seek(0)
67 |         self.assertEqual(freq_tools.read_df(df_file), df_dict)
68 |             
69 |     def test_compute_df(self):
70 |         doc1 = 'a b b     c d e e e e f'
71 |         doc2 = '  b           e         g g g h i'
72 |         doc3 = '  b b b b c d                 h  '
73 |         
74 |         df_dict = {'a':1, 'b':3, 'c':2, 'd':2, 'e':2, 'f':1, 'g':1, 'h':2,
75 |                    'i':1}
76 |         
77 |         files = (io.StringIO(doc1), io.StringIO(doc2), io.StringIO(doc3))
78 |         self.assertEqual(freq_tools.compute_df(files), df_dict)
79 |     
80 | 
81 | if __name__ == "__main__":
82 |     unittest.main()
83 | 


--------------------------------------------------------------------------------
/pysimsearch/test/sim_index_test.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #     * Redistributions of source code must retain the above copyright
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above copyright
 11 | #       notice, this list of conditions and the following disclaimer in the
 12 | #       documentation and/or other materials provided with the distribution.
 13 | #     * The names of project contributors may not be used to endorse or
 14 | #       promote products derived from this software without specific
 15 | #       prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | '''
 30 | Unittests for pysimsearch.sim_index package
 31 | 
 32 | To run unittests, run 'nosetests' from the test directory
 33 | '''
 34 | from __future__ import(division, absolute_import, print_function,
 35 |                        unicode_literals)
 36 | 
 37 | import unittest
 38 | 
 39 | import io
 40 | import math
 41 | import sys
 42 | import time
 43 | from multiprocessing import Process
 44 | from pprint import pprint
 45 | 
 46 | from pysimsearch import term_vec
 47 | from pysimsearch.sim_index import MemorySimIndex
 48 | from pysimsearch.sim_index import ShelfSimIndex
 49 | from pysimsearch.sim_index import ConcurrentSimIndex
 50 | from pysimsearch.sim_index import SimIndexCollection
 51 | from pysimsearch.sim_index import RemoteSimIndex
 52 | from pysimsearch import sim_server
 53 | 
 54 | class SimIndexTest(object):
 55 |     '''
 56 |     Provides common tests for different implementations of the SimIndex
 57 |     interface.
 58 |     
 59 |     To test a concrete implementation of SimIndex, must sublcass SimIndexTest,
 60 |     and also inherit unittest.TestCase.  SimIndexTest intentionally does not
 61 |     inherit unittest.TestCase, as it is only an abstract class that cannot be
 62 |     instantiated and tested separately from an implementation.
 63 |     '''
 64 |     longMessage = True
 65 | 
 66 |     sim_index = None
 67 | 
 68 |     def setUp(self):
 69 |         with io.StringIO(self.stopfile_buffer) as stopfile:
 70 |             self.sim_index.load_stoplist(stopfile)
 71 |             
 72 |         self.sim_index.index_string_buffers(self.docs)
 73 | 
 74 |     # Stopword list
 75 |     stopfile_buffer = "stopword1 stopword2"
 76 | 
 77 |     # Test documents
 78 |     docs = ( ('doc1', "hello there world     hello stopword1"),
 79 |              ('doc2', "hello       world           stopword2"),
 80 |              ('doc3', "hello there       bob      ") )
 81 |     
 82 |     # Postings that correspond to test documents
 83 |     golden_postings = { 'hello': {'doc1': 2, 'doc2': 1, 'doc3': 1},
 84 |                         'there': {'doc1': 1, 'doc3': 1},
 85 |                         'world': {'doc1': 1, 'doc2': 1},
 86 |                         'bob': {'doc3': 1},
 87 |                         'nobody': {},
 88 |                         '': {}}
 89 | 
 90 |     # Golden hits data (Conjunctive: Requires presence of all terms)
 91 |     #
 92 |     # We can reuse golden_postings to provide some test input here
 93 |     golden_conj_hits = { term: set(postings.keys())
 94 |         for (term, postings) in golden_postings.items() }
 95 |     # and of course throw in some multiword queries as well
 96 |     golden_conj_hits.update({ "hello there": {'doc1', 'doc3'},
 97 |                               "there world": {'doc1'},
 98 |                               "hello world": {'doc1', 'doc2'} })
 99 |     
100 |     # Golden hits data for SimpleCountQueryScorer (frequencies are simple
101 |     # match-counts between query terms and document terms).
102 |     #   (Disjunctive: requires any term to be present)
103 |     #
104 |     # We can reuse golden_postings to provide some test input here
105 |     golden_scored_hits = { term: docnames
106 |         for (term, docnames) in golden_postings.items() }
107 |     # and of course throw in some multiword queries as well
108 |     golden_scored_hits.update({ "hello there": {'doc1': 3, 'doc2': 1, 'doc3': 2},
109 |                                 "there world": {'doc1': 2, 'doc2': 1, 'doc3': 1},
110 |                                 "hello world": {'doc1': 3, 'doc2': 2, 'doc3': 1} })
111 | 
112 |     def get_golden_hits_cos(self):
113 |         '''Manually computes cosine scores for test set to create golden results'''
114 |         d1_len = math.sqrt(2^2 + 1 + 1)
115 |         d2_len = math.sqrt(1 + 1)
116 |         d3_len = math.sqrt(1 + 1 + 1)
117 |         N = 3
118 |         hello_idf = math.log(N/3)
119 |         there_idf = math.log(N/2)
120 |         world_idf = math.log(N/2)
121 |         bob_idf = math.log(N/1)
122 |         r = ({ "hello there": {'doc1': hello_idf * 2 / d1_len + there_idf / d1_len,
123 |                                'doc2': hello_idf / d2_len,
124 |                                'doc3': hello_idf / d3_len + there_idf / d3_len},
125 |                "there world": {'doc1': there_idf / d1_len + world_idf / d1_len,
126 |                                'doc2': world_idf / d2_len,
127 |                                'doc3': there_idf / d3_len},
128 |                "hello world": {'doc1': hello_idf * 2 / d1_len + world_idf / d1_len,
129 |                                'doc2': hello_idf / d2_len + world_idf / d2_len,
130 |                                'doc3': hello_idf / d3_len} })
131 |         pprint(r)
132 |         return r
133 |     
134 |     def test_docname_docid_translation(self):
135 |         '''Test docname_to_docid()/docid_to_docname() using known data'''
136 | 
137 |         for (docname, doc) in self.docs:
138 |                 self.assertEqual(docname,
139 |                              self.sim_index.docid_to_name(
140 |                                 self.sim_index.name_to_docid(docname)))
141 | 
142 |     def test_postings_list(self):
143 |         '''Test postings_list() using known data
144 |         
145 |         We use sets instead of lists to more easily allow equality
146 |         comparison with golden data.
147 |         '''
148 | 
149 |         for term in self.golden_postings:    
150 |             translated_postings = {
151 |                 self.sim_index.docid_to_name(docid): freq
152 |                     for (docid, freq) in
153 |                             self.sim_index.postings_list(term)
154 |                 }
155 |             self.assertEqual(translated_postings,
156 |                              self.golden_postings[term])
157 | 
158 |     def test_stoplist(self):
159 |         '''Test stoplist functionality'''
160 |         for term in self.stopfile_buffer.split():
161 |             print("stopword={}".format(term))
162 |             self.assertEqual(list(self.sim_index.postings_list(term)), [])
163 | 
164 |     def test_docnames_with_terms(self):
165 |         '''Test docnames_with_terms() using known data
166 |         
167 |         We use sets instead of lists to more easily allow equality
168 |         comparison with golden data.
169 |         '''
170 | 
171 |         # We unpack the golden hit lists, construct a golden set of docnames
172 |         # for the hits, and compare with sim_index.docnames_with_terms()
173 |         for (query, golden_doc_hits) in self.golden_conj_hits.items():
174 |             query_vec = term_vec.term_vec(query)
175 |             terms = [term for (term, freq) in query_vec.items()]
176 |             
177 |             self.assertEqual(golden_doc_hits,
178 |                              set(self.sim_index.docnames_with_terms(*terms)))
179 | 
180 |     def test_query_simple_scorer(self):
181 |         '''Test query() with simple_scorer using known data.
182 |         
183 |         Uses SimpleCountQueryScorer for scoring.
184 |         '''
185 |         self.sim_index.set_query_scorer('simple_count')
186 |         for (query, golden_doc_hits) in self.golden_scored_hits.items():
187 |             self.assertEqual(golden_doc_hits,
188 |                              dict(self.sim_index.query(query)),
189 |                              msg = "query={}".format(query))
190 | 
191 |     def test_query_tfidf_scorer(self):
192 |         '''Test query() with tfidf using known data.
193 |         
194 |         Uses TFIDFQueryScorer for scoring.
195 |         '''
196 |         self.sim_index.set_query_scorer('tfidf')
197 |         for (query, golden_doc_hits_cos) in self.get_golden_hits_cos().items():
198 |             results = self.sim_index.query(query)
199 |             for (docname, score) in results:
200 |                 self.assertAlmostEqual(score,
201 |                                        golden_doc_hits_cos[docname],
202 |                                        msg="results={}".format(str(results)))
203 |     
204 |     def test_del_docids(self):
205 |         '''Test del_docids()'''
206 |         retest_list = (self.test_docnames_with_terms,
207 |                        self.test_query_simple_scorer,
208 |                        self.test_query_tfidf_scorer,)
209 |         
210 |         # Make sure that the selected tests already pass (just for clarity)
211 |         for test in retest_list:
212 |             test()
213 | 
214 |         # Add an extra doc to the index
215 |         self.sim_index.index_string_buffers( (('extra_doc', "hello world"),) )
216 | 
217 |         # Make sure that selected tests fail when we add an extra 'unexpected'
218 |         # doc to the index
219 |         for test in retest_list:
220 |             self.assertRaises(AssertionError, test)
221 |         
222 |         # Delete the extra doc
223 |         docid = self.sim_index.name_to_docid('extra_doc')
224 |         print('extra docid={}'.format(docid))
225 |         self.sim_index.del_docids(docid)
226 | 
227 |         # Now make sure that the selected tests pass again
228 |         for test in retest_list:
229 |             test()
230 |     
231 |     def test_config(self):
232 |         '''Ensure that various config params are properly handled'''
233 | 
234 |         ### Test 'lowercase' param
235 |         
236 |         def _check_lc(index, golden_results):
237 |             '''helper that checks index against golden_results'''
238 |             for (term, golden_docs) in golden_results:
239 |                 self.assertEqual(
240 |                     set(index.docnames_with_terms(term)), golden_docs)
241 |                 self.assertEqual(
242 |                     set([doc for (doc, score) in index.query(term)]), golden_docs)
243 |                 
244 |         # test data
245 |         test_docs = (('doc1', 'Hello There'),
246 |                      ('doc2', 'hello there'))
247 | 
248 |         # lowercase=True
249 |         index = MemorySimIndex()
250 |         index.set_config('lowercase', True)
251 |         index.index_string_buffers(test_docs)
252 |         golden_results = (('hello', {'doc1', 'doc2'}),
253 |                           ('Hello', {'doc1', 'doc2'}),
254 |                           ('HELLO', {'doc1', 'doc2'}))
255 |         _check_lc(index, golden_results)
256 |         
257 |         # lowercase=False
258 |         index = MemorySimIndex()
259 |         index.set_config('lowercase', False)
260 |         index.index_string_buffers(test_docs)
261 |         golden_results = (('hello', {'doc2'}),
262 |                           ('Hello', {'doc1'}),
263 |                           ('HELLO', set()))
264 |         _check_lc(index, golden_results)
265 | 
266 | class MemorySimIndexTest(SimIndexTest, unittest.TestCase):
267 |     '''
268 |     All tests hitting the SimIndex interface are in the parent class, SimIndexTest
269 |     
270 |     Tests for api's not in parent class are tested separately here.  This is
271 |     so we can reuse test code across all implementations of SimIndex.
272 |     '''
273 |     
274 |     def setUp(self):
275 |         print("MemorySimIndexTest")
276 |         self.sim_index = MemorySimIndex()
277 |         super(MemorySimIndexTest, self).setUp()
278 | 
279 |     def tearDown(self):
280 |         pass
281 |         
282 |     def test_save_load(self):
283 |         '''Test save()/load() functionality'''
284 |         with io.BytesIO() as output:
285 |             self.sim_index.save(output)
286 |             output.seek(0)
287 |             loaded_sim_index = MemorySimIndex.load(output)
288 |         self.sim_index = loaded_sim_index
289 |         self.test_query_simple_scorer()  # make sure test_query() still works
290 | 
291 | class ShelfSimIndexTest(SimIndexTest, unittest.TestCase):
292 |     '''
293 |     All tests hitting the SimIndex interface are in the parent class, SimIndexTest
294 |     
295 |     Tests for api's not in parent class are tested separately here.  This is
296 |     so we can reuse test code across all implementations of SimIndex.
297 |     '''
298 |     
299 |     def setUp(self):
300 |         print("ShelfSimIndexTest")
301 |         self.sim_index = ShelfSimIndex("/tmp/test_dbm", 'n')
302 |         super(ShelfSimIndexTest, self).setUp()
303 | 
304 |     def tearDown(self):
305 |         self.sim_index.close()
306 | 
307 | class ConcurrentSimIndexTest(SimIndexTest, unittest.TestCase):
308 |     '''
309 |     All tests hitting the SimIndex interface are in the parent class, SimIndexTest
310 |     
311 |     Tests for api's not in parent class are tested separately here.  This is
312 |     so we can reuse test code across all implementations of SimIndex.
313 |     '''
314 |     
315 |     def setUp(self):
316 |         print("ConcurrentSimIndexTest")
317 |         self.sim_index = ConcurrentSimIndex(MemorySimIndex())
318 |         super(ConcurrentSimIndexTest, self).setUp()
319 | 
320 |     def tearDown(self):
321 |         pass
322 | 
323 | class SimIndexCollectionTest(SimIndexTest, unittest.TestCase):
324 |     '''
325 |     All tests hitting the SimIndex interface are in the parent class, SimIndexTest
326 |     
327 |     Tests for api's not in parent class are tested separately here.  This is
328 |     so we can reuse test code across all implementations of SimIndex.    
329 |     '''
330 | 
331 |     def setUp(self):
332 |         print("SimIndexCollectionTest")
333 |         self.sim_index = SimIndexCollection()
334 |         for i in range(2):
335 |             self.sim_index.add_shards(MemorySimIndex())
336 | 
337 |         super(SimIndexCollectionTest, self).setUp()
338 |     
339 |     def tearDown(self):
340 |         pass
341 |     
342 | 
343 | class SimIndexRemoteCollectionTest(SimIndexTest, unittest.TestCase):
344 |     '''
345 |     All tests hitting the SimIndex interface are in the parent class, SimIndexTest
346 |     
347 |     Tests for api's not in parent class are tested separately here.  This is
348 |     so we can reuse test code across all implementations of SimIndex.    
349 |     '''
350 | 
351 |     processes = None
352 |     
353 |     def setUp(self):
354 |         # setUpClass() may be more efficient for spinning up the servers,
355 |         # but this way is more robust (since we'll start each test from a
356 |         # clean slate). Otherwise we'd need clear() functionality added.
357 | 
358 |         print("SimIndexRemoteCollectionTest")
359 |         
360 |         # We will create a collection tree of the form:
361 |         #
362 |         #      Root
363 |         #     /   \
364 |         #    A     B
365 |         #   /\     /\
366 |         #  1  2   3  4
367 |         self.processes = []
368 | 
369 |         # start leaves
370 |         for i in range(4):
371 |             port = 9100 + i
372 |             process = Process(target=sim_server.start_sim_index_server,
373 |                               kwargs={'port': port, 'logRequests': False})
374 |             process.daemon = True
375 |             process.start()
376 |             self.processes.append(process)
377 |             
378 |         print("Waiting for leaf servers to start")
379 |         time.sleep(0.1)
380 |         
381 |         leaf_nodes = [[],[]]
382 |         for i in range(4):
383 |             port = 9100 + i
384 |             leaf_nodes[i//2].append(RemoteSimIndex(
385 |                 "http://localhost:{}/RPC2".format(port)))
386 | 
387 |         # start interior nodes (A, B)
388 |         for i in range(2):
389 |             port = 9200 + i
390 |             process = Process(
391 |                 target=sim_server.start_sim_index_server,
392 |                 kwargs={ 'port': port,
393 |                          'backends': leaf_nodes[i],
394 |                          'root': False,
395 |                          'logRequests': False
396 |                         }
397 |             )
398 |             process.daemon = True
399 |             process.start()
400 |             self.processes.append(process)
401 | 
402 |         print("Waiting for intermediate servers to start")
403 |         time.sleep(0.1)        
404 | 
405 |         interior_nodes = []
406 |         for i in range(2):
407 |             port = 9200 + i
408 |             interior_nodes.append(
409 |                 RemoteSimIndex("http://localhost:{}/RPC2".format(port)))
410 | 
411 |         # root node
412 |         self.sim_index = SimIndexCollection(root=True)
413 |         self.sim_index.add_shards(*interior_nodes)
414 |         
415 |         super(SimIndexRemoteCollectionTest, self).setUp()
416 |     
417 |     def tearDown(self):
418 |         for process in self.processes:
419 |             process.terminate()
420 |         time.sleep(0.1)
421 | 
422 | 
423 | if __name__ == "__main__":
424 |     unittest.main()
425 | 


--------------------------------------------------------------------------------
/pysimsearch/test/similarity_test.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
  4 | # All rights reserved.
  5 | #
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are met:
  8 | #     * Redistributions of source code must retain the above copyright
  9 | #       notice, this list of conditions and the following disclaimer.
 10 | #     * Redistributions in binary form must reproduce the above copyright
 11 | #       notice, this list of conditions and the following disclaimer in the
 12 | #       documentation and/or other materials provided with the distribution.
 13 | #     * The names of project contributors may not be used to endorse or
 14 | #       promote products derived from this software without specific
 15 | #       prior written permission.
 16 | #
 17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | 
 29 | '''
 30 | Unittests for pysimsearch.similarity package
 31 | 
 32 | To run unittests, run 'nosetests' from the test directory
 33 | '''
 34 | from __future__ import(division, absolute_import, print_function,
 35 |                        unicode_literals)
 36 | 
 37 | import unittest
 38 | 
 39 | import io
 40 | from itertools import combinations
 41 | import math
 42 | 
 43 | from pysimsearch import similarity
 44 | 
 45 | class SimilarityTest(unittest.TestCase):
 46 |     longMessage = True
 47 | 
 48 |     def setUp(self):
 49 |         pass
 50 | 
 51 |     def tearDown(self):
 52 |         pass
 53 | 
 54 |     def test_measure_similarity(self):
 55 |         '''
 56 |         measure_similarity() should give known results for known inputs
 57 |         '''
 58 |         
 59 |         testdata = {
 60 |             'testdata_1': "hello",
 61 |             'testdata_2': "hello",
 62 |             'testdata_3': "world",
 63 |             'testdata_4': "hello world",
 64 |         }
 65 |         expected_sims = {
 66 |             ('testdata_1', 'testdata_2'): 1,
 67 |             ('testdata_1', 'testdata_3'): 0,
 68 |             ('testdata_1', 'testdata_4'): (1 / math.sqrt(2)),
 69 |             ('testdata_2', 'testdata_3'): 0,
 70 |             ('testdata_2', 'testdata_4'): (1 / math.sqrt(2)),
 71 |             ('testdata_3', 'testdata_4'): (1 / math.sqrt(2)),
 72 |         }
 73 |         
 74 |         for (fname_a, fname_b) in combinations(sorted(testdata.keys()), 2):
 75 |             print('Comparing {0},{1}'.format(fname_a, fname_b))
 76 |             with io.StringIO(testdata[fname_a]) as file_a:
 77 |                 with io.StringIO(testdata[fname_b]) as file_b:
 78 |                     sim = similarity.measure_similarity(file_a, file_b)
 79 |                     self.assertAlmostEqual(
 80 |                             sim, expected_sims[(fname_a, fname_b)], 
 81 |                             places = 5,
 82 |                             msg = 'Mismatch for pair {0}: got {1}, expected {2}'.
 83 |                             format((fname_a, fname_b), sim, 
 84 |                                    expected_sims[(fname_a, fname_b)]))
 85 | 
 86 |     def test_cosine_sim(self):
 87 |         '''cosine_sim() test using known inputs'''
 88 |         u = {'a':1, 'b':2, 'c':5}
 89 |         v = {'a':1,        'c':2, 'd':3}
 90 |     
 91 |         self.assertEqual(similarity.cosine_sim(u, v), 11 / (math.sqrt(30) * math.sqrt(14)))
 92 |         
 93 |     def test_jaccard_sim(self):
 94 |         '''jaccard_sim() test using known inputs'''
 95 |         A = {'a':1, 'b':2, 'c':5}
 96 |         B = {'a':1,        'c':2, 'd':3}
 97 |     
 98 |         self.assertEqual(similarity.jaccard_sim(A, B), 3 / 14)
 99 | 
100 | 
101 | if __name__ == "__main__":
102 |     unittest.main()
103 | 


--------------------------------------------------------------------------------
/pysimsearch/test/term_vec_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | # Copyright (c) 2010, Taher Haveliwala <oss@taherh.org>
 4 | # All rights reserved.
 5 | #
 6 | # Redistribution and use in source and binary forms, with or without
 7 | # modification, are permitted provided that the following conditions are met:
 8 | #     * Redistributions of source code must retain the above copyright
 9 | #       notice, this list of conditions and the following disclaimer.
10 | #     * Redistributions in binary form must reproduce the above copyright
11 | #       notice, this list of conditions and the following disclaimer in the
12 | #       documentation and/or other materials provided with the distribution.
13 | #     * The names of project contributors may not be used to endorse or
14 | #       promote products derived from this software without specific
15 | #       prior written permission.
16 | #
17 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
29 | '''
30 | Unittests for pysimsearch.term_vec package
31 | 
32 | To run unittests, run 'nosetests' from the test directory
33 | '''
34 | from __future__ import(division, absolute_import, print_function,
35 |                        unicode_literals)
36 | 
37 | import unittest
38 | 
39 | import math
40 | import os
41 | import pprint
42 | 
43 | from pysimsearch import term_vec
44 | 
45 | class TermVecTest(unittest.TestCase):
46 |     longMessage = True
47 | 
48 |     def test_dot_product(self):
49 |         '''dot_product() test using known inputs'''
50 |         v1 = {'a':1, 'b':2, 'c':0.5}
51 |         v2 = {'a':2,        'c':2, 'd':100}
52 |         
53 |         self.assertEqual(term_vec.dot_product(v1, v2), 3)
54 | 
55 |     def test_l2_norm(self):
56 |         '''l2_norm() test using known inputs'''
57 |         v = {'a':1, 'b':2, 'c':5}
58 |         
59 |         self.assertEqual(term_vec.l2_norm(v), math.sqrt(1 + 2**2 + 5**2))
60 | 
61 |     def test_magnitude(self):
62 |         '''magnitude() test using known inputs'''
63 |         v = {'a':1, 'b':2, 'c':5}
64 |         
65 |         self.assertEqual(term_vec.l2_norm(v), math.sqrt(1 + 2**2 + 5**2))
66 |         
67 |     def test_mag_union(self):
68 |         '''mag_union() test using known inputs'''
69 |         A = {'a':1, 'b':2, 'c':5}
70 |         B = {'a':1,        'c':2, 'd':3}
71 |         
72 |         self.assertEqual(term_vec.mag_union(A, B), 14)
73 |     
74 |     def test_mag_intersect(self):
75 |         '''mag_intersect() test using known inputs'''
76 |         A = {'a':1, 'b':2, 'c':5}
77 |         B = {'a':1,        'c':2, 'd':3}
78 |         
79 |         self.assertEqual(term_vec.mag_intersect(A, B), 3)
80 | 
81 | 
82 | if __name__ == "__main__":
83 |     unittest.main()
84 | 


--------------------------------------------------------------------------------
/sample.py:
--------------------------------------------------------------------------------
  1 | from __future__ import(division, absolute_import, print_function,
  2 |                        unicode_literals)
  3 | 
  4 | from multiprocessing import Process
  5 | import time
  6 | 
  7 | from pprint import pprint
  8 | 
  9 | from pysimsearch.sim_index import MemorySimIndex
 10 | from pysimsearch.sim_index import RemoteSimIndex
 11 | from pysimsearch.sim_index import SimIndexCollection
 12 | from pysimsearch import similarity
 13 | from pysimsearch import sim_server
 14 | 
 15 | def sample_similarity():
 16 |     # Compare web-page similarities
 17 |     print()
 18 |     print("Printing pairwise similarities of university homepages")
 19 |     similarities = similarity.pairwise_compare(
 20 |         urls=['http://www.stanford.edu/',
 21 |               'http://www.berkeley.edu/',
 22 |               'http://www.ucla.edu',
 23 |               'http://www.mit.edu/'])
 24 |     pprint(similarities)
 25 | 
 26 | def sample_sim_index():            
 27 |     # Create an in-memory index and query it
 28 |     print()
 29 |     print("Creating in-memory index of university homepages")
 30 |     sim_index = MemorySimIndex()
 31 |     sim_index.index_urls('http://www.stanford.edu/',
 32 |                          'http://www.berkeley.edu',
 33 |                          'http://www.ucla.edu',
 34 |                          'http://www.mit.edu')
 35 |     
 36 |     print("Postings list for 'university':")
 37 |     pprint(sim_index.postings_list('university'))
 38 |     print("Pages containing terms 'university' and 'california'")
 39 |     pprint(list(sim_index.docnames_with_terms('university', 'california')))
 40 |     
 41 |     # Issue some similarity queries
 42 |     print()
 43 |     print("Similarity search for query 'stanford university' (simple scorer)")
 44 |     sim_index.set_query_scorer('simple_count')
 45 |     pprint(list(sim_index.query("stanford university")))
 46 |     
 47 |     print()
 48 |     print("Similarity search for query 'stanford university' (tf.idf scorer)")
 49 |     sim_index.set_query_scorer('tfidf')
 50 |     pprint(list(sim_index.query("stanford university")))
 51 |     
 52 |     # Save the index to disk, then load it back in
 53 |     print()
 54 |     print("Saving index to disk")
 55 |     with open("myindex.idx", "w") as index_file:
 56 |         sim_index.save(index_file)
 57 |     
 58 |     print()
 59 |     print("Loading index from disk")
 60 |     with open("myindex.idx", "r") as index_file:
 61 |         sim_index2 = MemorySimIndex.load(index_file)
 62 |     
 63 |     print()
 64 |     print("Pages containing terms 'university' and 'california' in loaded index")
 65 |     pprint(list(sim_index2.docnames_with_terms('university', 'california')))
 66 | 
 67 | def sample_sim_index_collection():
 68 |     # SimIndexCollection
 69 |     print()
 70 |     print("SimIndexCollection: build a collection, index some urls, and query it")
 71 |     indexes = (MemorySimIndex(), MemorySimIndex())
 72 |     index_coll = SimIndexCollection()
 73 |     index_coll.add_shards(*indexes)
 74 |     index_coll.set_query_scorer('tfidf')
 75 |     index_coll.index_urls('http://www.stanford.edu/',
 76 |                           'http://www.berkeley.edu',
 77 |                           'http://www.ucla.edu',
 78 |                           'http://www.mit.edu')
 79 |     
 80 |     pprint(index_coll.query('stanford university'))
 81 | 
 82 | def sample_remote_indexes():    
 83 |     print()
 84 |     print("SimIndexCollection with remote backend indexes")
 85 |     
 86 |     processes = []
 87 |     for i in range(2):
 88 |         port = 9000 + i
 89 |         process = Process(target=sim_server.start_sim_index_server,
 90 |                           kwargs={'port': port, 'logRequests': False})
 91 |         process.daemon = True
 92 |         processes.append(process)
 93 |         
 94 |     for process in processes:
 95 |         process.start()
 96 |         
 97 |     print("Waiting for servers to start")
 98 |     time.sleep(1)
 99 | 
100 |     remote_index_coll = SimIndexCollection()        
101 |     for i in range(2):
102 |         port = 9000 + i
103 |         remote_index_coll.add_shards(
104 |             RemoteSimIndex("http://localhost:{}/RPC2".format(port)))
105 |         
106 |     remote_index_coll.set_query_scorer('tfidf')
107 | 
108 |     remote_index_coll.index_urls('http://www.stanford.edu/',
109 |                                  'http://www.berkeley.edu',
110 |                                  'http://www.ucla.edu',
111 |                                  'http://www.mit.edu')
112 |     
113 |     pprint(remote_index_coll.query('stanford university'))
114 |         
115 |     for process in processes:
116 |         process.terminate()
117 | 
118 | if __name__ == '__main__':
119 |     sample_similarity()
120 |     sample_sim_index()
121 |     sample_sim_index_collection()
122 |     sample_remote_indexes()
123 |     pprint('done!')
124 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | setup(
 3 |       name = "pysimsearch",
 4 |       packages = ["pysimsearch", "pysimsearch.sim_index", "pysimsearch.test"],
 5 |       version = "0.32",
 6 |       description = "Similarity-search library",
 7 |       author = "Taher Haveliwala",
 8 |       author_email = "oss@taherh.org",
 9 |       url = "https://github.com/taherh/pysimsearch",
10 |       download_url = "https://github.com/downloads/taherh/pysimsearch/pysimsearch-0.32.tar.gz",
11 |       keywords = ["similarity"],
12 |       requires = ["httplib2", "lxml", "jsonrpclib", "futures"],
13 |       license = "BSD License",
14 |       classifiers = [
15 |                      "Programming Language :: Python",
16 |                      "License :: OSI Approved :: BSD License",
17 |                      "Operating System :: OS Independent"
18 |                      ],
19 |       long_description = '''\
20 | Similarity-Search Library
21 | -------------------------
22 | 
23 | Requires Python v2.7.1 or higher
24 | Library for measuring textual similarity of files and web pages and
25 | building similarity indexes.  Primarily for pedagogical purposes.
26 | '''
27 |       )
28 | 


--------------------------------------------------------------------------------