├── .gitignore ├── .travis.yml ├── data.tar ├── data ├── hn.askstories.db ├── hn.jobstories.db ├── hn.showstories.db ├── reddit.books.db ├── reddit.democrat.db ├── reddit.linux.db ├── reddit.literature.db ├── reddit.movies.db ├── reddit.music.db ├── reddit.python.db └── reddit.republican.db ├── docs ├── .gitignore ├── Makefile ├── python.db ├── requests.db ├── requests.yaml └── source │ ├── 1ftcd5.jpg │ ├── code │ └── social_media_output.txt │ ├── conf.py │ ├── hat.jpg │ ├── hat.xcf │ ├── http_diff.png │ ├── https_get.png │ ├── index.rst │ ├── ioLogo.svg │ └── iologo.png ├── examples ├── hn.py ├── hnvcrpy.py ├── readthedocs.py ├── readthedocsvcrpy.py └── reddit.py ├── hoverpy_scikitlearn.py ├── makefile ├── requirements.txt ├── requirements_dev.txt └── test_hoverpy_scikitlearn.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.pyc 3 | .idea 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | script: 5 | - sudo apt-get install python-numpy python-scipy 6 | - python test_hoverpy_scikitlearn.py 7 | 8 | -------------------------------------------------------------------------------- /data.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data.tar -------------------------------------------------------------------------------- /data/hn.askstories.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/hn.askstories.db -------------------------------------------------------------------------------- /data/hn.jobstories.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/hn.jobstories.db -------------------------------------------------------------------------------- /data/hn.showstories.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/hn.showstories.db -------------------------------------------------------------------------------- /data/reddit.books.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.books.db -------------------------------------------------------------------------------- /data/reddit.democrat.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.democrat.db -------------------------------------------------------------------------------- /data/reddit.linux.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.linux.db -------------------------------------------------------------------------------- /data/reddit.literature.db: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.literature.db -------------------------------------------------------------------------------- /data/reddit.movies.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.movies.db -------------------------------------------------------------------------------- /data/reddit.music.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.music.db -------------------------------------------------------------------------------- /data/reddit.python.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.python.db -------------------------------------------------------------------------------- /data/reddit.republican.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/data/reddit.republican.db -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | rsync -azv $(BUILDDIR) root@shyal.com:/var/www/static/ 59 | 60 | dirhtml: 61 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 62 | @echo 63 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 64 | 65 | singlehtml: 66 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 67 | @echo 68 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 69 | 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | json: 76 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 77 | @echo 78 | @echo "Build finished; now you can process the JSON files." 79 | 80 | htmlhelp: 81 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 82 | @echo 83 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 84 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
85 | 86 | qthelp: 87 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 88 | @echo 89 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 90 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 91 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/social_media.qhcp" 92 | @echo "To view the help file:" 93 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/social_media.qhc" 94 | 95 | applehelp: 96 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 97 | @echo 98 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 99 | @echo "N.B. You won't be able to view it unless you put it in" \ 100 | "~/Library/Documentation/Help or install it in your application" \ 101 | "bundle." 102 | 103 | devhelp: 104 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 105 | @echo 106 | @echo "Build finished." 107 | @echo "To view the help file:" 108 | @echo "# mkdir -p $$HOME/.local/share/devhelp/social_media" 109 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/social_media" 110 | @echo "# devhelp" 111 | 112 | epub: 113 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 114 | @echo 115 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 116 | 117 | latex: 118 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 119 | @echo 120 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 121 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 122 | "(use \`make latexpdf' here to do that automatically)." 123 | 124 | latexpdf: 125 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 126 | @echo "Running LaTeX files through pdflatex..." 127 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 128 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 129 | 130 | latexpdfja: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo "Running LaTeX files through platex and dvipdfmx..." 133 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 134 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 135 | 136 | text: 137 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 138 | @echo 139 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 140 | 141 | man: 142 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 143 | @echo 144 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 145 | 146 | texinfo: 147 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 148 | @echo 149 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 150 | @echo "Run \`make' in that directory to run these through makeinfo" \ 151 | "(use \`make info' here to do that automatically)." 152 | 153 | info: 154 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 155 | @echo "Running Texinfo files through makeinfo..." 156 | make -C $(BUILDDIR)/texinfo info 157 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 158 | 159 | gettext: 160 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 161 | @echo 162 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 163 | 164 | changes: 165 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 166 | @echo 167 | @echo "The overview file is in $(BUILDDIR)/changes." 
168 | 169 | linkcheck: 170 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 171 | @echo 172 | @echo "Link check complete; look for any errors in the above output " \ 173 | "or in $(BUILDDIR)/linkcheck/output.txt." 174 | 175 | doctest: 176 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 177 | @echo "Testing of doctests in the sources finished, look at the " \ 178 | "results in $(BUILDDIR)/doctest/output.txt." 179 | 180 | coverage: 181 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 182 | @echo "Testing of coverage in the sources finished, look at the " \ 183 | "results in $(BUILDDIR)/coverage/python.txt." 184 | 185 | xml: 186 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 187 | @echo 188 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 189 | 190 | pseudoxml: 191 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 192 | @echo 193 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 194 | -------------------------------------------------------------------------------- /docs/python.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/python.db -------------------------------------------------------------------------------- /docs/requests.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/requests.db -------------------------------------------------------------------------------- /docs/source/1ftcd5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/source/1ftcd5.jpg -------------------------------------------------------------------------------- /docs/source/code/social_media_output.txt: -------------------------------------------------------------------------------- 1 | getting hn showstories - this may take a while! 2 | got 63 titles in 1.221211 seconds 3 | getting hn askstories - this may take a while! 4 | got 80 titles in 5.147536 seconds 5 | getting hn jobstories - this may take a while! 
6 | got 22 titles in 0.046783 seconds 7 | getting reddit data for republican 8 | getting reddit data for democrat 9 | getting reddit data for linux 10 | getting reddit data for music 11 | getting reddit data for movies 12 | getting reddit data for literature 13 | getting reddit data for books 14 | ****************************** 15 | TEST CLASSIFIER 16 | ****************************** 17 | 'powershell and openssl compatability testing' => ('reddit', 'linux') 18 | 'compiling source code on ubuntu' => ('reddit', 'linux') 19 | 'wifi drivers keep crashing' => ('reddit', 'linux') 20 | 'cron jobs' => ('reddit', 'republican') 21 | 'training day was a great movie with a legendary director' => ('reddit', 'movies') 22 | 'michael bay should remake lord of the rings, set in the future' => ('reddit', 'books') 23 | "hilary clinton may win voters' hearts" => ('reddit', 'republican') 24 | 'donald trump may donimate the presidency' => ('reddit', 'republican') 25 | 'reading dead wood gives me far more pleasure than using kindles' => ('reddit', 'books') 26 | 'hiring a back end engineer' => ('hn', 'jobstories') 27 | 'guitar is louder than the piano although electronic is best' => ('reddit', 'music') 28 | 'drum solo and singer from the rolling stones' => ('reddit', 'music') 29 | 'hiring a back end engineer' => ('hn', 'jobstories') 30 | 'javascript loader' => ('hn', 'showstories') 31 | "dostoevsky's existentialism" => ('reddit', 'literature') 32 | Enter title: -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # social_media documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Dec 12 23:00:41 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import shlex 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | #sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # The suffix(es) of source filenames. 38 | # You can specify multiple suffix as a list of string: 39 | # source_suffix = ['.rst', '.md'] 40 | source_suffix = '.rst' 41 | 42 | # The encoding of source files. 43 | #source_encoding = 'utf-8-sig' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 
49 | project = u'social_media' 50 | copyright = u'2016, Shyal Beardsley' 51 | author = u'Shyal Beardsley' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '1.0' 59 | # The full version, including alpha/beta/rc tags. 60 | release = '1.0' 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # There are two options for replacing |today|: either, you set today to some 70 | # non-false value, then it is used: 71 | #today = '' 72 | # Else, today_fmt is used as the format for a strftime call. 73 | #today_fmt = '%B %d, %Y' 74 | 75 | # List of patterns, relative to source directory, that match files and 76 | # directories to ignore when looking for source files. 77 | exclude_patterns = [] 78 | 79 | # The reST default role (used for this markup: `text`) to use for all 80 | # documents. 81 | #default_role = None 82 | 83 | # If true, '()' will be appended to :func: etc. cross-reference text. 84 | #add_function_parentheses = True 85 | 86 | # If true, the current module name will be prepended to all description 87 | # unit titles (such as .. function::). 88 | #add_module_names = True 89 | 90 | # If true, sectionauthor and moduleauthor directives will be shown in the 91 | # output. They are ignored by default. 92 | #show_authors = False 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = 'sphinx' 96 | 97 | # A list of ignored prefixes for module index sorting. 98 | #modindex_common_prefix = [] 99 | 100 | # If true, keep warnings as "system message" paragraphs in the built documents. 101 | #keep_warnings = False 102 | 103 | # If true, `todo` and `todoList` produce output, else they produce nothing. 104 | todo_include_todos = False 105 | 106 | 107 | # -- Options for HTML output ---------------------------------------------- 108 | 109 | # The theme to use for HTML and HTML Help pages. See the documentation for 110 | # a list of builtin themes. 111 | html_theme = 'alabaster' 112 | 113 | # Theme options are theme-specific and customize the look and feel of a theme 114 | # further. For a list of options available for each theme, see the 115 | # documentation. 116 | #html_theme_options = {} 117 | 118 | # Add any paths that contain custom themes here, relative to this directory. 119 | #html_theme_path = [] 120 | 121 | # The name for this set of Sphinx documents. If None, it defaults to 122 | # " v documentation". 123 | #html_title = None 124 | 125 | # A shorter title for the navigation bar. Default is the same as html_title. 126 | #html_short_title = None 127 | 128 | # The name of an image file (relative to this directory) to place at the top 129 | # of the sidebar. 130 | html_logo = "ioLogo.svg" 131 | 132 | # The name of an image file (within the static path) to use as favicon of the 133 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 134 | # pixels large. 135 | #html_favicon = None 136 | 137 | # Add any paths that contain custom static files (such as style sheets) here, 138 | # relative to this directory. 
They are copied after the builtin static files, 139 | # so a file named "default.css" will overwrite the builtin "default.css". 140 | html_static_path = ['_static'] 141 | 142 | # Add any extra paths that contain custom files (such as robots.txt or 143 | # .htaccess) here, relative to this directory. These files are copied 144 | # directly to the root of the documentation. 145 | #html_extra_path = [] 146 | 147 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 148 | # using the given strftime format. 149 | #html_last_updated_fmt = '%b %d, %Y' 150 | 151 | # If true, SmartyPants will be used to convert quotes and dashes to 152 | # typographically correct entities. 153 | #html_use_smartypants = True 154 | 155 | # Custom sidebar templates, maps document names to template names. 156 | #html_sidebars = {} 157 | 158 | # Additional templates that should be rendered to pages, maps page names to 159 | # template names. 160 | #html_additional_pages = {} 161 | 162 | # If false, no module index is generated. 163 | #html_domain_indices = True 164 | 165 | # If false, no index is generated. 166 | #html_use_index = True 167 | 168 | # If true, the index is split into individual pages for each letter. 169 | #html_split_index = False 170 | 171 | # If true, links to the reST sources are added to the pages. 172 | html_show_sourcelink = False 173 | 174 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 175 | html_show_sphinx = False 176 | 177 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 178 | #html_show_copyright = True 179 | 180 | # If true, an OpenSearch description file will be output, and all pages will 181 | # contain a tag referring to it. The value of this option must be the 182 | # base URL from which the finished HTML is served. 183 | #html_use_opensearch = '' 184 | 185 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 186 | #html_file_suffix = None 187 | 188 | # Language to be used for generating the HTML full-text search index. 189 | # Sphinx supports the following languages: 190 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 191 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 192 | #html_search_language = 'en' 193 | 194 | # A dictionary with options for the search language support, empty by default. 195 | # Now only 'ja' uses this config value 196 | #html_search_options = {'type': 'default'} 197 | 198 | # The name of a javascript file (relative to the configuration directory) that 199 | # implements a search results scorer. If empty, the default will be used. 200 | #html_search_scorer = 'scorer.js' 201 | 202 | # Output file base name for HTML help builder. 203 | htmlhelp_basename = 'social_mediadoc' 204 | 205 | # -- Options for LaTeX output --------------------------------------------- 206 | 207 | latex_elements = { 208 | # The paper size ('letterpaper' or 'a4paper'). 209 | #'papersize': 'letterpaper', 210 | 211 | # The font size ('10pt', '11pt' or '12pt'). 212 | #'pointsize': '10pt', 213 | 214 | # Additional stuff for the LaTeX preamble. 215 | #'preamble': '', 216 | 217 | # Latex figure (float) alignment 218 | #'figure_align': 'htbp', 219 | } 220 | 221 | # Grouping the document tree into LaTeX files. List of tuples 222 | # (source start file, target name, title, 223 | # author, documentclass [howto, manual, or own class]). 
224 | latex_documents = [ 225 | (master_doc, 'social_media.tex', u'social\\_media Documentation', 226 | u'Shyal Beardsley', 'manual'), 227 | ] 228 | 229 | # The name of an image file (relative to this directory) to place at the top of 230 | # the title page. 231 | #latex_logo = None 232 | 233 | # For "manual" documents, if this is true, then toplevel headings are parts, 234 | # not chapters. 235 | #latex_use_parts = False 236 | 237 | # If true, show page references after internal links. 238 | #latex_show_pagerefs = False 239 | 240 | # If true, show URL addresses after external links. 241 | #latex_show_urls = False 242 | 243 | # Documents to append as an appendix to all manuals. 244 | #latex_appendices = [] 245 | 246 | # If false, no module index is generated. 247 | #latex_domain_indices = True 248 | 249 | 250 | # -- Options for manual page output --------------------------------------- 251 | 252 | # One entry per manual page. List of tuples 253 | # (source start file, name, description, authors, manual section). 254 | man_pages = [ 255 | (master_doc, 'social_media', u'social_media Documentation', 256 | [author], 1) 257 | ] 258 | 259 | # If true, show URL addresses after external links. 260 | #man_show_urls = False 261 | 262 | 263 | # -- Options for Texinfo output ------------------------------------------- 264 | 265 | # Grouping the document tree into Texinfo files. List of tuples 266 | # (source start file, target name, title, author, 267 | # dir menu entry, description, category) 268 | texinfo_documents = [ 269 | (master_doc, 'social_media', u'social_media Documentation', 270 | author, 'social_media', 'One line description of project.', 271 | 'Miscellaneous'), 272 | ] 273 | 274 | # Documents to append as an appendix to all manuals. 275 | #texinfo_appendices = [] 276 | 277 | # If false, no module index is generated. 278 | #texinfo_domain_indices = True 279 | 280 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 281 | #texinfo_show_urls = 'footnote' 282 | 283 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
284 | #texinfo_no_detailmenu = False
285 |
--------------------------------------------------------------------------------
/docs/source/hat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/source/hat.jpg
--------------------------------------------------------------------------------
/docs/source/hat.xcf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/source/hat.xcf
--------------------------------------------------------------------------------
/docs/source/http_diff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/source/http_diff.png
--------------------------------------------------------------------------------
/docs/source/https_get.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/source/https_get.png
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | Speeding up scikit-learn workflow using a high-performance Go proxy.
2 | ====================================================================
3 |
4 | .. raw:: html
5 |
6 |
7 |
8 | .. toctree::
9 |    :maxdepth: 2
10 |
11 | Up until now I've been using vcrpy to cache my requests during the data mining phase of my scikit-learn work, but I was recently introduced to an ultra-high-performance Go caching proxy, and wanted to see if I could use it for more speed-ups. I was so impressed that I wrote a Python wrapper for it.
12 |
13 | .. code-block:: bash
14 |
15 |    pip install hoverpy --user --upgrade
16 |
17 | Offlining readthedocs:
18 |
19 | .. code-block:: python
20 |
21 |    import time
22 |    import hoverpy
23 |    import requests
24 |
25 |    rtd = "http://readthedocs.org/api/v1/project/?limit=50&offset=0&format=json"
26 |
27 |    with hoverpy.HoverPy(recordMode='once'):
28 |        start = time.time()
29 |        objects = requests.get(rtd).json()['objects']
30 |        links = ["http://readthedocs.org" + x['resource_uri'] for x in objects]
31 |        for link in links:
32 |            response = requests.get(link)
33 |            print("url: %s, status code: %s" % (link, response.status_code))
34 |        print("Time taken: %f" % (time.time() - start))
35 |
36 | Output:
37 |
38 | .. code-block:: bash
39 |
40 |    [...]
41 |    Time taken: 9.418862
42 |
43 | Upon second invocation:
44 |
45 | .. code-block:: bash
46 |
47 |    [...]
48 |    Time taken: 0.093463
49 |
50 | That's much better: *100.78x* faster than hitting the real endpoint.
51 |
52 | .. figure:: http_diff.png
53 |
54 | Not surprising, really. My issue with caching proxies, however, is that it's the https handshaking that takes the time, not fetching the data, and one of my many annoyances with vcrpy is that it won't let me remap https requests to http.
55 |
56 | Therefore I was very pleased to see remapping work perfectly in hoverpy (code provided below the next graph), with hoverpy wiping the floor with vcrpy, coming in over 13x faster:
57 |
58 | .. figure:: https_get.png
59 |
60 | .. literalinclude:: ../../examples/hn.py
61 |    :language: python
62 |
63 | Once again, on the second run, hoverfly steps in with a very significant speedup. I'm very impressed with hoverfly's performance.
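The example above remaps the protocol by hand (switching ``https`` to ``http`` once the database exists); hoverpy also exposes an ``httpsToHttp`` flag for this, which is what ``getRedditData`` relies on further down. Here is a minimal sketch of that variant using plain ``requests``; the ``hn.remap.db`` name is only illustrative, and certificate verification is switched off for the proxied https call in the same spirit as the reddit code's ``validate_certs="off"``:

.. code-block:: python

    import time
    import hoverpy
    import requests

    firebase = "https://hacker-news.firebaseio.com/v0/topstories.json"

    # Keep the https URL in the client, point both proxies at hoverpy's
    # http proxy, and let httpsToHttp handle the remapping so that the
    # replay pass skips the https handshake.
    with hoverpy.HoverPy(recordMode="once", dbpath="hn.remap.db", httpsToHttp=True) as hp:
        proxies = {"http": hp.httpProxy(), "https": hp.httpProxy()}
        start = time.time()
        ids = requests.get(firebase, proxies=proxies, verify=False).json()
        print("got %i story ids in %f seconds" % (len(ids), time.time() - start))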
64 |
65 | Data mining HN
66 | --------------
67 |
68 | Before we start, please note you can find the final script `here `_. You'll also need the `data `_.
69 |
70 | What I also really like about Hoverfly is how quickly it starts, and how quickly it loads up its BoltDB database. I also like the fact it's configuration-free. Here's a function you can use to offline titles for the various HN sections:
71 |
72 | .. literalinclude:: ../../hoverpy_scikitlearn.py
73 |    :language: python
74 |    :lines: 1-28
75 |
76 |
77 | ------------
78 |
79 | Data mining Reddit
80 | -------------------
81 |
82 | While we're at it, let's put a function here for offlining subreddits. This one also includes comments:
83 |
84 | .. literalinclude:: ../../hoverpy_scikitlearn.py
85 |    :language: python
86 |    :lines: 29-48
87 |
88 |
89 | Organising our datamines
90 | ------------------------
91 |
92 | Rather than sitting around hitting these endpoints, you may as well download the datasets to save yourself the time.
93 |
94 | .. code-block:: bash
95 |
96 |    wget https://github.com/shyal/hoverpy-scikitlearn/raw/master/data.tar
97 |    tar xvf data.tar
98 |
99 | And the code:
100 |
101 | .. literalinclude:: ../../hoverpy_scikitlearn.py
102 |    :language: python
103 |    :lines: 50-71
104 |
105 | Calling ``doMining()`` caches everything, which takes a while, although if you've downloaded and extracted ``data.tar`` it shouldn't take more than a few seconds. That's all our data mining done. I think this is a good time to remind ourselves that a big part of machine learning is, in fact, data sanitisation and mining.
106 |
107 | .. code::
108 |
109 |    GETTING HACKERNEWS showstories DATA
110 |    got 54 hackernews titles in 0.099983 seconds
111 |    GETTING HACKERNEWS askstories DATA
112 |    got 92 hackernews titles in 0.160661 seconds
113 |    GETTING HACKERNEWS jobstories DATA
114 |    got 12 hackernews titles in 0.024908 seconds
115 |    GETTING REDDIT r/republican DATA
116 |    GETTING REDDIT r/democrat DATA
117 |    GETTING REDDIT r/linux DATA
118 |    GETTING REDDIT r/python DATA
119 |    GETTING REDDIT r/music DATA
120 |    GETTING REDDIT r/movies DATA
121 |    GETTING REDDIT r/literature DATA
122 |    GETTING REDDIT r/books DATA
123 |
124 |    real 0m9.425s
125 |
126 | -----------------------------------
127 |
128 |
129 | Building an HN or Reddit classifier
130 | -----------------------------------
131 |
132 | .. .. raw:: html
133 |
134 |
135 |
136 | OK, time to play. Let's build a naive Bayesian text classifier. You'll be able to type in some text, and it'll tell you which subreddit it thinks the text could have originated from.
137 |
138 | For this part, you'll need scikit-learn.
139 |
140 | .. code-block:: bash
141 |
142 |    pip install numpy
143 |
144 |    pip install scikit-learn
145 |
146 | Test sentences:
147 |
148 | .. literalinclude:: ../../hoverpy_scikitlearn.py
149 |    :language: python
150 |    :lines: 73-88
151 |
152 |
153 | Running the classifier:
154 |
155 | .. literalinclude:: ../../hoverpy_scikitlearn.py
156 |    :language: python
157 |    :lines: 90-119
158 |
159 | In case you are not familiar with tokenizing, tf-idf, classification and so on, I've provided a link at the end of this tutorial that'll demystify the block above.
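If it helps, the block above boils down to three steps: count word occurrences, reweight them with tf-idf, then fit a multinomial naive Bayes model on the weighted vectors. Purely as an illustrative sketch (not part of the script itself), the same recipe can be condensed into a scikit-learn ``Pipeline``, reusing ``titles``, ``target``, ``sentences`` and ``subs`` from ``hoverpy_scikitlearn.py``:

.. code-block:: python

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    # titles/target come from doMining(): one text blob per document, plus an
    # integer index into `subs` recording which source the document came from.
    clf = Pipeline([
        ("counts", CountVectorizer()),   # tokenize and count words
        ("tfidf", TfidfTransformer()),   # downweight words that appear everywhere
        ("bayes", MultinomialNB()),      # naive Bayes over the tf-idf vectors
    ])
    clf.fit(titles, target)

    for doc, category in zip(sentences, clf.predict(sentences)):
        print('%r => %s' % (doc, subs[category]))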
160 |
161 | -----------------------------------
162 |
163 |
164 | Wrapping things up
165 | ------------------
166 |
167 | You can find hoverpy's and hoverfly's extensive documentation `here `_ and `here `_. This excellent and lightweight proxy was developed by the very smart guys at `SpectoLabs `_, so I strongly suggest you show them some love (I could not, however, find a donations link).
168 |
169 | Repository for this post, with code: https://github.com/shyal/hoverpy-scikitlearn
170 |
171 | http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
172 |
173 | Taking it one step further
174 | --------------------------
175 |
176 | The premise of hoverfly is, in fact, testing, CI and CD. The intent is to commit your requests database and test against it. This makes your code completely hermetic to external dependencies.
177 |
178 | .. image:: https://travis-ci.org/shyal/hoverpy-scikitlearn.svg?branch=master
179 |    :target: https://travis-ci.org/shyal/hoverpy-scikitlearn
180 |
181 | .. raw:: html
182 |
183 |
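That is exactly what ``test_hoverpy_scikitlearn.py`` at the bottom of this repository does: because the ``data/hn.*.db`` and ``data/reddit.*.db`` captures are committed, the tests replay the recorded traffic through hoverpy rather than hitting Hacker News or Reddit, so they behave the same on Travis as they do locally. A trimmed sketch of the idea:

.. code-block:: python

    import unittest
    from hoverpy_scikitlearn import getHNData

    class TestOfflineHN(unittest.TestCase):
        def test_jobstories_mention_hiring(self):
            # Replays data/hn.jobstories.db through hoverpy: no network required.
            stories = getHNData(sub="jobstories")
            self.assertTrue(any("hiring" in story for story in stories))

    if __name__ == "__main__":
        unittest.main()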
184 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /docs/source/ioLogo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 22 | 24 | 31 | 32 | 55 | 57 | 58 | 60 | image/svg+xml 61 | 63 | 64 | 65 | 66 | 67 | 72 | 81 | 90 | 99 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /docs/source/iologo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shyal/hoverpy-scikitlearn/34ca179fd454139b8bf9f406b399313c7c400bb8/docs/source/iologo.png -------------------------------------------------------------------------------- /examples/hn.py: -------------------------------------------------------------------------------- 1 | import time 2 | import hoverpy 3 | import requests 4 | import os 5 | 6 | prot = "http" if os.path.isfile("hn.db") else "https" 7 | 8 | with hoverpy.HoverPy(recordMode='once', dbpath='hn.db') as hp: 9 | print("started hoverpy in %s mode" % hp.mode()) 10 | start = time.time() 11 | r = requests.get( 12 | "%s://hacker-news.firebaseio.com/v0/topstories.json" % (prot)) 13 | for item in r.json(): 14 | print( 15 | requests.get( 16 | "%s://hacker-news.firebaseio.com/v0/item/%i.json" % 17 | (prot, item)).json()["title"]) 18 | print("got articles in %f seconds" % (time.time() - start)) 19 | -------------------------------------------------------------------------------- /examples/hnvcrpy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import vcr 3 | import requests 4 | 5 | with vcr.use_cassette('hn.yaml'): 6 | start = time.time() 7 | r = requests.get( 8 | "https://hacker-news.firebaseio.com/v0/topstories.json") 9 | for item in r.json(): 10 | print( 11 | requests.get( 12 | "https://hacker-news.firebaseio.com/v0/item/%i.json" % 13 | item).json()["title"]) 14 | print("got articles in %f seconds" % (time.time() - start)) 15 | -------------------------------------------------------------------------------- /examples/readthedocs.py: -------------------------------------------------------------------------------- 1 | import time 2 | import hoverpy 3 | import requests 4 | 5 | rtd = "http://readthedocs.org/api/v1/project/?limit=50&offset=0&format=json" 6 | 7 | with hoverpy.HoverPy(recordMode='once'): 8 | start = time.time() 9 | objects = requests.get(rtd).json()['objects'] 10 | links = ["http://readthedocs.org" + x['resource_uri'] for x in objects] 11 | for link in links: 12 | response = requests.get(link) 13 | print("url: %s, status code: %s" % (link, response.status_code)) 14 | print("Time taken: %f" % (time.time() - start)) 15 | -------------------------------------------------------------------------------- /examples/readthedocsvcrpy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import vcr 3 | import requests 4 | 5 | rtd = "https://readthedocs.org/api/v1/project/?limit=50&offset=0&format=json" 6 | 7 | with vcr.use_cassette('requests.yaml'): 8 | start = time.time() 9 | objects = requests.get(rtd).json()['objects'] 10 | links = ["http://readthedocs.org" + x['resource_uri'] for x in objects] 11 | for link in links: 12 | response = requests.get(link) 13 | print("url: %s, status code: %s" % (link, response.status_code)) 14 | print("Time taken: %f" % (time.time() - start)) 15 | -------------------------------------------------------------------------------- 
/examples/reddit.py: -------------------------------------------------------------------------------- 1 | import hoverpy 2 | import praw 3 | import os 4 | import time 5 | 6 | sub = "python" 7 | db = ("%s.db" % sub) 8 | capture = not os.path.isfile(db) 9 | 10 | with hoverpy.HoverPy(dbpath=db, recordMode='once') as hp: 11 | start = time.time() 12 | titles = [] 13 | print "GETTING REDDIT r/%s DATA" % sub 14 | r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_", 15 | http_proxy=hp.httpProxy(), 16 | https_proxy=hp.httpsProxy(), 17 | validate_certs="off") 18 | if not capture: 19 | r.config.api_request_delay = 0 20 | subreddit = r.get_subreddit(sub) 21 | for submission in subreddit.get_hot(limit=100): 22 | text = submission.title.lower() 23 | print(text) 24 | for comment in praw.helpers.flatten_tree(submission.comments): 25 | if hasattr(comment, 'body'): 26 | text += comment.body + " " 27 | titles.append(text) 28 | print("got %i %s in %f" % (len(titles), sub, time.time() - start)) 29 | -------------------------------------------------------------------------------- /hoverpy_scikitlearn.py: -------------------------------------------------------------------------------- 1 | def getHNData(verbose=False, limit=100, sub="showstories"): 2 | from hackernews import HackerNews 3 | from hackernews import settings 4 | import hoverpy, time, os 5 | dbpath = "data/hn.%s.db" % sub 6 | with hoverpy.HoverPy(recordMode="once", dbpath=dbpath) as hp: 7 | if not hp.mode() == "capture": 8 | settings.supported_api_versions[ 9 | "v0"] = "http://hacker-news.firebaseio.com/v0/" 10 | hn = HackerNews() 11 | titles = [] 12 | print("GETTING HACKERNEWS %s DATA" % sub) 13 | subs = {"showstories": hn.show_stories, 14 | "askstories": hn.ask_stories, 15 | "jobstories": hn.job_stories, 16 | "topstories": hn.top_stories} 17 | start = time.time() 18 | for story_id in subs[sub](limit=limit): 19 | story = hn.get_item(story_id) 20 | if verbose: 21 | print(story.title.lower()) 22 | titles.append(story.title.lower()) 23 | print( 24 | "got %i hackernews titles in %f seconds" % 25 | (len(titles), time.time() - start)) 26 | return titles 27 | 28 | 29 | def getRedditData(verbose=False, comments=True, limit=100, sub="all"): 30 | import hoverpy, praw, time 31 | dbpath = ("data/reddit.%s.db" % sub) 32 | with hoverpy.HoverPy(recordMode='once', dbpath=dbpath, httpsToHttp=True) as hp: 33 | titles = [] 34 | print "GETTING REDDIT r/%s DATA" % sub 35 | r = praw.Reddit(user_agent="Karma breakdown 1.0 by /u/_Daimon_", http_proxy=hp.httpProxy(), https_proxy=hp.httpProxy(), validate_certs="off") 36 | if not hp.mode() == "capture": 37 | r.config.api_request_delay = 0 38 | subreddit = r.get_subreddit(sub) 39 | for submission in subreddit.get_hot(limit=limit): 40 | text = submission.title.lower() 41 | if comments: 42 | flat_comments = praw.helpers.flatten_tree(submission.comments) 43 | for comment in flat_comments: 44 | text += comment.body + " " if hasattr(comment, 'body') else '' 45 | if verbose: 46 | print text 47 | titles.append(text) 48 | return titles 49 | 50 | subs = [('hn', 'showstories'), 51 | ('hn', 'askstories'), 52 | ('hn', 'jobstories'), 53 | ('reddit', 'republican'), 54 | ('reddit', 'democrat'), 55 | ('reddit', 'linux'), 56 | ('reddit', 'python'), 57 | ('reddit', 'music'), 58 | ('reddit', 'movies'), 59 | ('reddit', 'literature'), 60 | ('reddit', 'books')] 61 | 62 | def doMining(): 63 | titles = [] 64 | target = [] 65 | getter = {'hn': getHNData, 'reddit': getRedditData} 66 | for i in range(len(subs)): 67 | subTitles = 
getter[subs[i][0]]( 68 | sub=subs[i][1]) 69 | titles += subTitles 70 | target += [i] * len(subTitles) 71 | return (titles, target) 72 | 73 | sentences = ["powershell and openssl compatability testing", 74 | "compiling source code on ubuntu", 75 | "wifi drivers keep crashing", 76 | "cron jobs", 77 | "training day was a great movie with a legendary director", 78 | "michael bay should remake lord of the rings, set in the future", 79 | "hilary clinton may win voters' hearts", 80 | "donald trump may donimate the presidency", 81 | "reading dead wood gives me far more pleasure than using kindles", 82 | "hiring a back end engineer", 83 | "guitar is louder than the piano although electronic is best", 84 | "drum solo and singer from the rolling stones", 85 | "hiring a back end engineer", 86 | "javascript loader", 87 | "dostoevsky's existentialis"] 88 | 89 | 90 | def main(): 91 | titles, target = doMining() 92 | from sklearn.feature_extraction.text import CountVectorizer 93 | from sklearn.feature_extraction.text import TfidfTransformer 94 | from sklearn.naive_bayes import MultinomialNB 95 | # build our count vectoriser 96 | # 97 | count_vect = CountVectorizer() 98 | X_train_counts = count_vect.fit_transform(titles) 99 | # build tfidf transformer 100 | # 101 | tfidf_transformer = TfidfTransformer() 102 | X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) 103 | # classifier 104 | # 105 | clf = MultinomialNB().fit(X_train_tfidf, target) 106 | print "*"*30+"\nTEST CLASSIFIER\n"+"*"*30 107 | # predict function 108 | # 109 | def predict(sentences): 110 | X_new_counts = count_vect.transform(sentences) 111 | X_new_tfidf = tfidf_transformer.transform(X_new_counts) 112 | predicted = clf.predict(X_new_tfidf) 113 | for doc, category in zip(sentences, predicted): 114 | print('%r => %s' % (doc, subs[category])) 115 | # 116 | predict(sentences) 117 | # 118 | while True: 119 | predict([raw_input("Enter title: ").strip()]) 120 | 121 | if __name__ == "__main__": 122 | main() 123 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | all: 2 | echo "done" 3 | 4 | .PHONY: 5 | 6 | docs: .PHONY 7 | cd docs; make clean; make html; 8 | 9 | clean: 10 | rm -f *.pyc *.log 11 | 12 | run: 13 | python hoverpy_scikitlearn.py 14 | 15 | commit_and_push: 16 | git add . 
17 | git commit -am "added"
18 | git push
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | hoverpy==0.1.20
2 | haxor==0.3.1
3 | praw==3.5.0
4 | scikit-learn==0.15.2
5 |
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | hoverpy==0.1.20
2 | haxor==0.3.1
3 | praw==3.5.0
4 | numpy
5 | scipy
6 | scikit-learn==0.15.2
7 |
--------------------------------------------------------------------------------
/test_hoverpy_scikitlearn.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from hoverpy_scikitlearn import *
4 |
5 | class test_hn(unittest.TestCase):
6 |
7 |     def test_jobs(self):
8 |         stories = getHNData(sub="jobstories")
9 |         for story in stories:
10 |             if "hiring" in story:
11 |                 self.assertTrue(True)
12 |                 return
13 |         self.assertTrue(False)
14 |
15 |     def test_show(self):
16 |         stories = getHNData(sub="showstories")
17 |         for story in stories:
18 |             if "show" in story:
19 |                 self.assertTrue(True)
20 |                 return
21 |         self.assertTrue(False)
22 |
23 |     def test_ask(self):
24 |         stories = getHNData(sub="askstories")
25 |         for story in stories:
26 |             if "ask" in story:
27 |                 self.assertTrue(True)
28 |                 return
29 |         self.assertTrue(False)
30 |
31 | class test_reddit(unittest.TestCase):
32 |
33 |     def generic_sub_tester(self, sub):
34 |         stories = getRedditData(sub=sub)
35 |         for story in stories:
36 |             if sub in story:
37 |                 return True
38 |
39 |     def test_linux(self):
40 |         self.assertTrue(self.generic_sub_tester("linux"))
41 |
42 |     def test_python(self):
43 |         self.assertTrue(self.generic_sub_tester("python"))
44 |
45 |     def test_music(self):
46 |         self.assertTrue(self.generic_sub_tester("music"))
47 |
48 |
49 | class test_classifier(unittest.TestCase):
50 |
51 |     def test_classifier(self):
52 |         try:
53 |             import scipy
54 |         except ImportError:
55 |             print("scipy module not installed - quitting")
56 |             return
57 |         titles, target = doMining()
58 |
59 |         from sklearn.feature_extraction.text import CountVectorizer
60 |         from sklearn.feature_extraction.text import TfidfTransformer
61 |         from sklearn.naive_bayes import MultinomialNB
62 |
63 |         count_vect = CountVectorizer()
64 |         X_train_counts = count_vect.fit_transform(titles)
65 |
66 |         tfidf_transformer = TfidfTransformer()
67 |         X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
68 |
69 |         clf = MultinomialNB().fit(X_train_tfidf, target)
70 |
71 |         def predict(sentences, answers):
72 |             X_new_counts = count_vect.transform(sentences)
73 |             X_new_tfidf = tfidf_transformer.transform(X_new_counts)
74 |
75 |             predicted = clf.predict(X_new_tfidf)
76 |
77 |             for doc, category, answer in zip(sentences, predicted, answers):
78 |                 self.assertEqual(subs[category], answer)
79 |
80 |         tests = [
81 |             "powershell and openssl compatability testing",
82 |             "compiling source code on ubuntu",
83 |             "wifi drivers keep crashing",
84 |             "training day was a great movie with a legendary director"
85 |         ]
86 |
87 |         answers = [
88 |             ("reddit", "linux"),
89 |             ("reddit", "linux"),
90 |             ("reddit", "linux"),
91 |             ("reddit", "movies"),
92 |         ]
93 |
94 |         predict(tests, answers)
95 |
96 | if __name__ == "__main__":
97 |     unittest.main()
98 |
--------------------------------------------------------------------------------