├── .gitignore ├── CONTRIBUTORS.rst ├── INSTALL ├── LICENSE.rst ├── MANIFEST.in ├── README.rst ├── doc ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── extending.rst │ ├── features │ ├── index.rst │ └── keyword-loading.rst │ ├── index.rst │ ├── install.rst │ └── intro.rst ├── example ├── config.ini └── config.py ├── pastycake ├── __init__.py ├── cli_notifier.py ├── config.py ├── gather.py ├── keywords.py ├── mailer.py ├── mongodb_backend.py ├── notifier.py ├── pastebin_source.py ├── pastesource.py ├── pastie_source.py ├── sqlite_backend.py ├── storage_backend.py └── text_backend.py ├── setup.py └── tests ├── __init__.py ├── test_entry_points.py └── test_sqlite_backend.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | MANIFEST 4 | dist/ 5 | build/ 6 | kwords 7 | __pycache__/ 8 | .tox/ 9 | tracker.txt 10 | urls.db 11 | tox.ini 12 | -------------------------------------------------------------------------------- /CONTRIBUTORS.rst: -------------------------------------------------------------------------------- 1 | Contributors 2 | ============ 3 | 4 | * 9b+ (founder) 5 | * coh (resident code monkey) 6 | 7 | 8 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installing pastycake 2 | ==================== 3 | 4 | python setup.py build sdist 5 | pip install dist/pastycake-<version>.tar.gz 6 | 7 | You can also use ``easy_install`` instead of ``pip``, but you should prefer 8 | ``pip`` as it offers some features that ``easy_install`` is lacking, such as 9 | the ability to uninstall. 10 | 11 | Important 12 | --------- 13 | 14 | ``python setup.py install`` will *not* work due to idiosyncrasies with python's 15 | packaging system(s). 
16 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, 9b+ and Contributors 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the 9b+ nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.rst 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Purpose 2 | ======= 3 | Search through Paste services such as pastebin.com for interesting pastes 4 | based on keywords 5 | 6 | Supported Pastes 7 | ================ 8 | * pastebin.com 9 | * pastie.org 10 | 11 | License 12 | ======= 13 | `3-Clause "New" BSD License`__ , see the file LICENSE.rst for details. 14 | 15 | .. __: http://www.opensource.org/licenses/BSD-3-Clause 16 | 17 | Files 18 | ===== 19 | * ``pastycake-snatch.py`` - 20 | outputs to the command line and uses the ``tracker.txt`` file to monitor 21 | previously seen URLs 22 | * ``pastycake-harvest.py`` - 23 | stores data inside of SQLite instead of a text file 24 | 25 | Plans 26 | ===== 27 | * Create easy way to add in keywords (done) 28 | * Replace nesting with generators (done) 29 | * Add in emailer (done) 30 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 
11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pastycake.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pastycake.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pastycake" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pastycake" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 
104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 
154 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. 
The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pastycake.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pastycake.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 
113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 
178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pastycake documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Jan 25 17:57:28 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.mathjax'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 
37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'pastycake' 44 | copyright = u'2012, R9+, coh' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = [] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 
87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 
134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'pastycakedoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 
185 | latex_documents = [ 186 | ('index', 'pastycake.tex', u'pastycake Documentation', 187 | u'R9+, coh', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'pastycake', u'pastycake Documentation', 217 | [u'R9+, coh'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'pastycake', u'pastycake Documentation', 231 | u'R9+, coh', 'pastycake', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
242 | #texinfo_show_urls = 'footnote' 243 | 244 | 245 | # Example configuration for intersphinx: refer to the Python standard library. 246 | intersphinx_mapping = {'http://docs.python.org/': None} 247 | -------------------------------------------------------------------------------- /doc/source/extending.rst: -------------------------------------------------------------------------------- 1 | Extending pastycake 2 | =================== 3 | 4 | Adding a new Paste Source 5 | ------------------------- 6 | 7 | Adding support for a paste source is simply a matter of following these steps: 8 | 9 | * create a new file with the pattern ``<name>_source.py`` where 10 | ``<name>`` is the name of the paste that you want to support. 11 | 12 | * add ``from pastesource import PasteSource`` to the top of the file. 13 | * create a class ``<Name>Source`` that inherits ``PasteSource``. 14 | * implement all methods of the ``PasteSource`` abc. 15 | * (optional) write a setup.py that specifies the proper entry point for the 16 | ``<Name>Source``. The namespace for this is 17 | 18 | - ``pastycake`` if you want to submit it for inclusion into the core, or 19 | - ``pastycake.ext`` otherwise. 20 | 21 | 22 | Adding a new Storage Backend 23 | ---------------------------- 24 | 25 | .. important:: 26 | 27 | This section reflects the latest development but the requirements still 28 | change quite often as new functionality is added. 29 | 30 | Supporting a new storage backend is a bit more complicated: 31 | 32 | * create a new file with the pattern ``<name>_backend.py`` where 33 | ``<name>`` is the name of the backend that you want to support. 34 | 35 | * add ``from storage_backend import StorageBackend`` to the top of the file. 36 | * create a class ``<Name>Backend`` that inherits ``StorageBackend``. 37 | * implement all methods of the ``StorageBackend`` abc. 38 | 39 | * (optional) write a setup.py that specifies the proper entry point for the 40 | ``<Name>Backend``. The namespace for this is the same as above. 
41 | 42 | 43 | Adding a new Storage Backend that supports Keyword refreshing 44 | ------------------------------------------------------------- 45 | 46 | combine the above steps with the following: 47 | 48 | * also add ``from keywords import KeywordStorage`` 49 | * also inherit from ``KeywordStorage`` 50 | * also implement all methods/params of the ``KeywordStorage`` abc. 51 | .. important:: 52 | 53 | You have to re-decorate the ``@abc.abstractproperty`` methods with 54 | ``@property`` 55 | -------------------------------------------------------------------------------- /doc/source/features/index.rst: -------------------------------------------------------------------------------- 1 | Pastycake Features 2 | ================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | keyword-loading 8 | -------------------------------------------------------------------------------- /doc/source/features/keyword-loading.rst: -------------------------------------------------------------------------------- 1 | Keyword (Re-)loading 2 | -------------------- 3 | 4 | pastycake can query an instance of the ``KeywordStorage`` abc for a(n updated) 5 | list of keywords to use. 6 | 7 | It checks the (in SQL lingo) table ``matchers`` for rows where the ``enabled`` 8 | field is equal to ``True``. For each matching row, it'll return the 9 | ``match_expression`` field. 10 | 11 | En-/Dis-abling a Keyword 12 | ~~~~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | The abc offers the methods ``enable_keyword`` and ``disable_keyword`` to do 15 | that from within python in a backend-neutral manner (as in you don't have to 16 | write custom SQL instructions). 17 | 18 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. pastycake documentation master file, created by 2 | sphinx-quickstart on Wed Jan 25 17:57:28 2012. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pastycake's documentation! 7 | ===================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | intro 15 | install 16 | features/index 17 | extending 18 | 19 | 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | 28 | -------------------------------------------------------------------------------- /doc/source/install.rst: -------------------------------------------------------------------------------- 1 | Installing pastycake 2 | ==================== 3 | 4 | Installation of pastycake is not (yet) totally straightforward. 5 | 6 | Requirements 7 | ------------ 8 | 9 | .. important:: 10 | 11 | At least lxml requires the python development files (headers) as well as 12 | a C compiler to be present. 13 | 14 | Required packages, besides Python :math:`>= 2.6`, are: 15 | 16 | * Louie 17 | * httplib2 18 | * lxml 19 | 20 | In case of Python 2.6 and Python 3.x :math:`< 3.2`, it also needs the 21 | ``argparse`` module. 22 | 23 | Optional packages that enable certain features/extensions are: 24 | 25 | * pymongo (for MongoDB backend support) 26 | 27 | 28 | Python 3 Compatibility 29 | ---------------------- 30 | 31 | Pastycake by itself is Python 3 compatible through the use of 2to3. 32 | However, some libraries that it needs to work are not. 
33 | 34 | 35 | Installing pastycake 36 | -------------------- 37 | 38 | Except for the above Python 3 compatibility issue, the install is fairly easy:: 39 | 40 | python setup.py sdist 41 | pip install dist/pastycake-<version>.tar.gz 42 | 43 | or simply:: 44 | 45 | python setup.py install 46 | -------------------------------------------------------------------------------- /doc/source/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction to Pastycake 2 | ========================= 3 | 4 | Pastycake by itself is a package that allows one to collect data from paste 5 | services such as pastebin.com. 6 | 7 | It provides 3 scripts, ``pastycake``, ``pastycake-harvest`` and 8 | ``pastycake-snatch``, to make it easy to use right out of the box. The latter 9 | two scripts are merely frontends to ``pastycake`` as they define the gather 10 | mode. 11 | 12 | Gather modes 13 | ------------ 14 | 15 | There are two gather modes so far: 16 | 17 | * ``harvest`` Continue forever with the gathering while taking small naps in 18 | between. 19 | * ``snatch`` Check for the latest & greatest, then stop. 20 | 21 | Program Options 22 | --------------- 23 | 24 | See the output of ``pastycake -h`` for details. 
25 | 26 | -------------------------------------------------------------------------------- /example/config.ini: -------------------------------------------------------------------------------- 1 | ;[backend] 2 | ;type = pastycake.ext.storage:MongoDB 3 | ;host = 127.0.0.3 4 | ;username = abc 5 | ;password = changeme 6 | 7 | [backend] 8 | type = storage:Sqlite 9 | 10 | [keywords] 11 | file = kw.txt 12 | add = false 13 | 14 | [listeners] 15 | listener1 = notify:Cli 16 | listener2 = notify:Mail 17 | 18 | [listener2] 19 | recv_addr = foo@bar 20 | 21 | [sources] 22 | source1 = sources:Pastebin 23 | -------------------------------------------------------------------------------- /example/config.py: -------------------------------------------------------------------------------- 1 | from pastycake.mongodb_backend import MongoBackend 2 | from pastycake.mailer import Mailer 3 | from pastycake.pastie_source import PastieSource 4 | 5 | 6 | backend = MongoBackend(host='127.0.0.3', username='abc', 7 | password='chngeme') 8 | 9 | 10 | def _my_awesome_kw_filter(text): 11 | #TODO: investigate why the file-level re import doesn't seem to work 12 | import re 13 | 14 | res = [] 15 | tmp = re.search('username', text, re.I) 16 | if tmp: 17 | res.append(tmp.group()) 18 | 19 | tmp = re.search('password', text, re.I) 20 | if tmp: 21 | res.append(tmp.group()) 22 | 23 | tmp = re.search('p[0Oo]rn', text, re.I) 24 | if tmp: 25 | res = [] 26 | 27 | return None if 2 != len(res) else ','.join(res) 28 | 29 | 30 | keywords = ['foo', 'bar', _my_awesome_kw_filter] 31 | 32 | listeners = [Mailer('foo@bar')] 33 | 34 | sources = [PastieSource()] 35 | -------------------------------------------------------------------------------- /pastycake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/9b/pastycake/f02363d822dae7111ecc70a1ad435d88d57be939/pastycake/__init__.py 
-------------------------------------------------------------------------------- /pastycake/cli_notifier.py: -------------------------------------------------------------------------------- 1 | import louie as L 2 | 3 | from .notifier import Notifier 4 | 5 | 6 | class CliNotifier(Notifier): 7 | def __init__(self): 8 | L.connect(self._handle_match, signal='match', sender=L.Any) 9 | 10 | def _handle_match(self, *args, **kwargs): 11 | print '%s matched %s' % (kwargs.get('url', ''), 12 | kwargs.get('match', '')) 13 | -------------------------------------------------------------------------------- /pastycake/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import imp 3 | import os.path 4 | import sys 5 | 6 | from ConfigParser import ConfigParser 7 | 8 | from pkg_resources import load_entry_point 9 | 10 | from .mailer import Mailer 11 | from .pastebin_source import PastebinSource 12 | from .sqlite_backend import SqliteBackend 13 | from .text_backend import TextBackend 14 | from .cli_notifier import CliNotifier 15 | 16 | from .pastesource import PasteSource 17 | from .storage_backend import StorageBackend 18 | from .notifier import Notifier 19 | 20 | 21 | def _read_keywords(fhandle): 22 | return [_.rstrip() for _ in fhandle] 23 | 24 | 25 | def load_ep_object(epname, section_name=None): 26 | def _do_load(package, section_name, epname): 27 | try: 28 | return load_entry_point(package, section_name, epname) 29 | except ImportError as e: 30 | #print >> sys.stderr, 'failed to load entry point %s: %s' % ( 31 | # epname, e) 32 | return None 33 | 34 | if not section_name: 35 | return _do_load('pastycake', 'pastycake', epname) or \ 36 | _do_load('pastycake', 'pastycake.ext', epname) 37 | else: 38 | return _do_load('pastycake', section_name, epname) 39 | 40 | 41 | def _create_arg_parser(): 42 | opts = argparse.ArgumentParser(description='harvest or snatch pastes') 43 | opts.add_argument('-a', '--alert_email', 
metavar='EMAIL', type=str, 44 | dest='alert_email', help='email to send alerts to', 45 | default=None, action='store' 46 | ) 47 | opts.add_argument('-c', '--config', metavar='CFG', 48 | dest='config_fname', action='store', default=None, 49 | help='load the config from file CFG. a file ending in \ 50 | .py(co)? will be treated as python source \ 51 | whereas a file ending in .ini or .cfg will \ 52 | be treated as ini-style.' 53 | ) 54 | opts.add_argument('-k', '--use_keyfile', metavar='KWFILE', 55 | dest='kwfile', type=argparse.FileType('r'), 56 | help='read the keywords from KWFILE. if not given \ 57 | as an argument, then the built-in \ 58 | DEFAULT_KEYWORDS will be used.' 59 | ) 60 | opts.add_argument('-o', '--output', metavar='FILENAME', 61 | dest='filename', action='store', default=None, 62 | type=str, 63 | help='specify a different output filename' 64 | ) 65 | opts.add_argument('gather_mode', metavar='MODE', type=str, 66 | choices=('harvest', 'snatch'), 67 | help="the mode to use. must be one of 'harvest' \ 68 | or 'snatch'" 69 | ) 70 | opts.add_argument('add_keywords', metavar='KEYWORDS', nargs='*', 71 | help='additional keywords to search for' 72 | ) 73 | return opts 74 | 75 | 76 | class Config(dict): 77 | _DEFAULT_KEYWORDS = [ 78 | 'password', 79 | 'hack', 80 | ] 81 | 82 | def __init__(self, defaults=None): 83 | super(Config, self).__init__(defaults or dict()) 84 | self._set_default_options() 85 | 86 | def _set_default_options(self): 87 | self['backend'] = SqliteBackend() 88 | self['keywords'] = self._DEFAULT_KEYWORDS 89 | self['notifiers'] = [CliNotifier()] 90 | self['modefunc'] = load_entry_point('pastycake', 'console_scripts', 91 | 'pastycake-harvest') 92 | self['sources'] = [PastebinSource()] 93 | 94 | def _load_python_config(self, filename): 95 | m = imp.new_module('pastycake_config') 96 | m.__file__ = filename 97 | 98 | try: 99 | execfile(filename, m.__dict__) 100 | except IOError as e: 101 | print >> sys.stderr, "Failed to parse config file %s: %s" % 
( 102 | filename, e) 103 | return 104 | 105 | self.update(m.__dict__) 106 | 107 | def _load_ini_config(self, filename): 108 | def _map_section(conf, sectname): 109 | return dict([(opt, val) for opt, val in conf.items(sectname)]) 110 | 111 | p = ConfigParser() 112 | p.read(filename) 113 | 114 | if p.has_section('backend'): 115 | tmp = _map_section(p, 'backend') 116 | 117 | if 'type' not in tmp.keys(): 118 | raise LookupError('backend without type specified') 119 | 120 | tmp_obj = load_ep_object(tmp['type']) 121 | assert(issubclass(tmp_obj, StorageBackend)) 122 | 123 | del tmp['type'] 124 | 125 | self['backend'] = tmp_obj(tmp) 126 | 127 | if p.has_section('keywords'): 128 | kws = [] 129 | for _ in filter(lambda x: x.startswith('file'), 130 | p.options('keywords')): 131 | with open(p.get('keywords', _), 'r') as inkws: 132 | kws += _read_keywords(inkws) 133 | 134 | kws = list(set(kws)) 135 | if p.has_option('keywords', 'add') and \ 136 | p.getboolean('keywords', 'add'): 137 | self['keywords'] += kws 138 | else: 139 | self['keywords'] = kws 140 | 141 | for _, _class in (('notifiers', Notifier), ('sources', PasteSource)): 142 | if p.has_section(_): 143 | tmp = [] 144 | for opt in p.options(_): 145 | tmp_obj = load_ep_object(p.get(_, opt)) 146 | assert(issubclass(tmp_obj, _class)) 147 | 148 | obj_opts = _map_section(p, opt) if p.has_section(opt) \ 149 | else {} 150 | tmp.append(tmp_obj(obj_opts)) 151 | 152 | assert(len(tmp)) 153 | self[_] = tmp 154 | 155 | def parse_file(self, filename, format='py'): 156 | if format not in ('py', 'ini'): 157 | raise ValueError("invalid file format") 158 | if format == 'py': 159 | self._load_python_config(filename) 160 | elif format == 'ini': 161 | self._load_ini_config(filename) 162 | 163 | def parse_cli(self, arguments=None): 164 | opts = _create_arg_parser() 165 | 166 | try: 167 | vals = opts.parse_args(arguments) 168 | except IOError as e: 169 | print >> sys.stderr, "failed to parse options: %s" % e 170 | sys.exit(1) 171 | 172 | if 
vals.config_fname: 173 | extension = os.path.splitext(vals.config_fname)[1] 174 | 175 | if extension.startswith('.py'): 176 | extension = 'py' 177 | elif extension in ('.ini', '.cfg'): 178 | extension = 'ini' 179 | else: 180 | extension = 'py' 181 | self.parse_file(vals.config_fname, extension) 182 | 183 | if vals.kwfile: 184 | self['keywords'].update(_read_keywords(vals.kwfile[0])) 185 | 186 | if vals.alert_email: 187 | self['notifiers'].append(Mailer(opts.alert_email)) 188 | 189 | if vals.gather_mode not in ('harvest', 'snatch'): 190 | print >> sys.stderr, "unknown gathering mode %s" % vals.gather_mode 191 | elif vals.gather_mode == 'harvest': 192 | self['modefunc'] = load_entry_point('pastycake', 'console_scripts', 193 | 'pastycake-%s' % 194 | vals.gather_mode) 195 | else: 196 | self['modefunc'] = load_entry_point('pastycake', 'console_scripts', 197 | 'pastycake-snatch') 198 | self['backend'] = TextBackend() 199 | 200 | self['output.filename'] = vals.filename 201 | -------------------------------------------------------------------------------- /pastycake/gather.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | import sys 4 | import time 5 | 6 | import louie as L 7 | 8 | from pastycake.config import Config 9 | from pastycake.keywords import KeywordStorage 10 | 11 | 12 | def _fetch_one(generator, path, keywords, storage, store_match): 13 | status, data = generator.get_paste(path) 14 | 15 | #if 5xx or 4xx 16 | if status['status'][0] in ('4', '5'): 17 | #TODO better handling of timeouts 18 | print >> sys.stderr, "%s: %s. 
skipping" % (path, 19 | status['status']) 20 | return 21 | 22 | full_url = generator.full_url(path) 23 | match = None 24 | text = None 25 | name = None 26 | 27 | for kw in keywords: 28 | if hasattr(kw, '__call__'): 29 | text = kw(data) 30 | name = getattr(kw, '__name__') 31 | else: 32 | match = re.search(kw, data) 33 | name = kw 34 | text = match.group() if match else None 35 | 36 | if match: 37 | L.send('match', generator, storage, match=match.group(), 38 | url=full_url, data=data) 39 | if store_match: 40 | storage.save_url(full_url, [(name, text), ]) 41 | # stop after the first match 42 | break 43 | if not match and store_match: 44 | storage.save_url(full_url, None) 45 | 46 | 47 | #def fetch(storage, sources, keywords, store_match): 48 | def fetch(conf_obj, store_match=True): 49 | keywords = conf_obj['keywords'] 50 | storage = conf_obj['backend'] 51 | 52 | for src in conf_obj['sources']: 53 | for generator, path in src.new_urls(storage): 54 | _fetch_one(generator, path, keywords, storage, store_match) 55 | 56 | if isinstance(storage, KeywordStorage): 57 | conf_obj['keywords'] = storage.current_keywords 58 | 59 | 60 | def main(args=None): 61 | def _backend_or_exit(storage): 62 | storage.connect() 63 | if not storage.connected(): 64 | print >> sys.stderr, "failed to open storage backend" 65 | sys.exit(1) 66 | return storage 67 | 68 | def _load_conf(args=None): 69 | c = Config() 70 | c.parse_cli(args) 71 | return c 72 | 73 | conf = _load_conf(args) 74 | conf['backend'] = _backend_or_exit(conf['backend']) 75 | 76 | backend = conf['backend'] 77 | if isinstance(backend, KeywordStorage): 78 | for _ in conf['keywords']: 79 | backend.enable_keyword(_ if not hasattr(_, '__call__') 80 | else _.__name__) 81 | conf['modefunc'](conf) 82 | 83 | 84 | def harvest(conf=None): 85 | if not conf: 86 | sys.argv = [sys.argv[0]] + ['harvest'] + sys.argv[1:] 87 | return main() 88 | while True: 89 | fetch(conf) 90 | time.sleep(random.randint(5, 15)) 91 | 92 | 93 | def snatch(conf=None): 
94 | if not conf: 95 | sys.argv = [sys.argv[0]] + ['snatch'] + sys.argv[1:] 96 | return main() 97 | fetch(conf, False) 98 | -------------------------------------------------------------------------------- /pastycake/keywords.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class KeywordStorage(object): 5 | __metaclass__ = abc.ABCMeta 6 | 7 | @abc.abstractmethod 8 | def enable_keyword(self, kw): 9 | '''enable the keyword kw. 10 | 11 | If kw isn't one of the available keywords already, it'll be added 12 | first. 13 | 14 | ''' 15 | raise NotImplemented 16 | 17 | @abc.abstractmethod 18 | def disable_keyword(self, kw): 19 | '''disable the keyword kw. 20 | 21 | If kw isn't one of the available keywords already, it'll be added 22 | first. 23 | 24 | ''' 25 | raise NotImplemented 26 | 27 | @abc.abstractproperty 28 | def available_keywords(self): 29 | '''return a list of all known (but perhaps disabled) keywords.''' 30 | raise NotImplemented 31 | 32 | @abc.abstractproperty 33 | def current_keywords(self): 34 | '''return a list of all enabled keywords.''' 35 | raise NotImplemented 36 | 37 | @property 38 | def enabled_keywords(self): 39 | return self.current_keywords 40 | -------------------------------------------------------------------------------- /pastycake/mailer.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import socket 3 | import sys 4 | 5 | import louie as L 6 | 7 | from email.mime.text import MIMEText 8 | 9 | from .notifier import Notifier 10 | 11 | 12 | class Mailer(Notifier): 13 | def __init__(self, recv, sender=None): 14 | self._recv = recv 15 | self._sender = sender 16 | L.connect(self._handle_match, signal='match', sender=L.Any) 17 | 18 | def _handle_match(self, *args, **kwargs): 19 | self.sendmail(url=kwargs.get('url', ''), 20 | matcher=kwargs.get('match', ''), 21 | data=kwargs.get('data', '')) 22 | 23 | def sendmail(self, url, matcher, 
data): 24 | sender = self._sender or "pastycake@" + socket.gethostname() 25 | recv = self._recv 26 | subject = "[PastyCake] " + url + " matched " + matcher 27 | 28 | try: 29 | text = str(data) 30 | except: 31 | text = '' 32 | 33 | msg = MIMEText(text) 34 | msg['subject'] = subject 35 | msg['From'] = sender 36 | msg['To'] = recv 37 | 38 | server = smtplib.SMTP("localhost") 39 | try: 40 | server.sendmail(sender, recv, msg.as_string()) 41 | server.close() 42 | except smtplib.SMTPException as e: 43 | print >> sys.stderr, "Unable to send email. Error: %s" % e 44 | -------------------------------------------------------------------------------- /pastycake/mongodb_backend.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from datetime import datetime as dt 4 | 5 | try: 6 | import pymongo as M 7 | except ImportError: 8 | print >> sys.stderr, "This backend requires pymongo to be installed" 9 | 10 | from .storage_backend import StorageBackend 11 | 12 | 13 | class MongoBackend(StorageBackend): 14 | DEFAULT_HOST = 'localhost' 15 | DEFAULT_PORT = 27017 16 | DEFAULT_DB = 'pastycake' 17 | 18 | def __init__(self, kwargs): 19 | self._mongo_kwargs = kwargs 20 | self._db_name = kwargs.get('db_name', self.DEFAULT_DB) 21 | self._connected = False 22 | 23 | def already_visited_url(self, url): 24 | return bool(self._db.posts.find({'url': url}).count()) 25 | 26 | def save_url(self, url, match_text=None, rec=0): 27 | def _do_save_url(self, url, match_text): 28 | self._db.posts.insert( 29 | { 30 | 'url': url, 31 | 'matches': match_text, 32 | 'visited': dt.utcnow(), 33 | } 34 | ) 35 | try: 36 | _do_save_url(self, url, match_text) 37 | return 38 | except M.errors.PyMongoError as e: 39 | print >> sys.stderr, 'eror saving url: %s' % e 40 | 41 | # let's try again in case that the cursor timed out 42 | if not rec: 43 | self.connect() 44 | if self.connected(): 45 | self.save_url(url, match_text, rec + 1) 46 | 47 | def connect(self): 48 | 
try: 49 | self._con = M.Connection(**self._mongo_kwargs) 50 | self._db = self._con[self._db_name] 51 | self._connected = True 52 | except M.errors.PyMongoError as e: 53 | print >> sys.stderr, "failed to connect: %s" % e 54 | self._connected = False 55 | 56 | def connected(self): 57 | return self._connected 58 | -------------------------------------------------------------------------------- /pastycake/notifier.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class Notifier(object): 5 | __metaclass__ = abc.ABCMeta 6 | pass 7 | -------------------------------------------------------------------------------- /pastycake/pastebin_source.py: -------------------------------------------------------------------------------- 1 | import httplib2 2 | 3 | from lxml.html import parse 4 | 5 | from .pastesource import PasteSource 6 | 7 | 8 | class PastebinSource(PasteSource): 9 | baseurl = 'http://pastebin.com' 10 | 11 | def __init__(self, *args, **kwargs): 12 | pass 13 | 14 | def new_urls(self, backend): 15 | doc = parse('http://pastebin.com/archive').getroot() 16 | 17 | for link in doc.cssselect('.maintable tr td a'): 18 | app = link.get('href') 19 | if app.startswith('/archive/'): 20 | continue 21 | if not backend.already_visited_url(self.full_url(app)): 22 | yield self, app 23 | 24 | def get_paste(self, path): 25 | url = 'http://pastebin.com/raw.php?i=' + path[1:] 26 | http = httplib2.Http() 27 | try: 28 | res = http.request(url) 29 | except AttributeError as e: 30 | res = ({'status': '503'}, '') 31 | return res 32 | 33 | def full_url(self, path): 34 | return self.baseurl + path 35 | -------------------------------------------------------------------------------- /pastycake/pastesource.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class PasteSource: 5 | __metaclass__ = abc.ABCMeta 6 | 7 | @abc.abstractmethod 8 | def new_urls(self, backend): 9 | ''' 
10 | @param backend an instance of an class that is an implementation 11 | of the StorageBackend abc. 12 | 13 | Must be implemented as generator yielding new, unvisited URLs 14 | as pairs of (self, path) 15 | 16 | IMPORTANT: requests to db.already_visited_url MUST be made with 17 | the full url (base_url + path) in order to avoid random 18 | inter-domain collisions. 19 | ''' 20 | raise NotImplemented() 21 | 22 | def get_paste(self, path): 23 | ''' 24 | must return a pair containing (http_status_code, paste_data) 25 | ''' 26 | raise NotImplemented() 27 | 28 | def full_url(self, path): 29 | ''' 30 | must return the full uri (base_url + path) for the given path 31 | ''' 32 | raise NotImplemented() 33 | -------------------------------------------------------------------------------- /pastycake/pastie_source.py: -------------------------------------------------------------------------------- 1 | import httplib2 2 | 3 | from lxml.html import parse 4 | 5 | from .pastesource import PasteSource 6 | 7 | 8 | class PastieSource(PasteSource): 9 | baseurl = 'http://pastie.org' 10 | 11 | def __init__(self, *args, **kwargs): 12 | pass 13 | 14 | def new_urls(self, backend): 15 | doc = parse('http://pastie.org/pastes').getroot() 16 | 17 | for link in doc.cssselect('div.pastePreview a'): 18 | app = link.get('href') 19 | 20 | if not backend.already_visited_url(app): 21 | yield self, app 22 | 23 | def get_paste(self, path): 24 | http = httplib2.Http() 25 | try: 26 | res = http.request(path + '/text') 27 | except AttributeError as e: 28 | res = ({'status': '503'}, '') 29 | return res 30 | 31 | def full_url(self, path): 32 | return path 33 | -------------------------------------------------------------------------------- /pastycake/sqlite_backend.py: -------------------------------------------------------------------------------- 1 | import sqlite3 as S 2 | import traceback 3 | 4 | from sys import stderr 5 | 6 | from .storage_backend import StorageBackend 7 | from .keywords import 
KeywordStorage 8 | 9 | 10 | class SqliteBackend(StorageBackend, KeywordStorage): 11 | DEFAULT_DB = 'urls.db' 12 | _DB_TABLES = ''' 13 | CREATE TABLE matchers ( 14 | id INTEGER PRIMARY KEY AUTOINCREMENT, 15 | match_expression TEXT UNIQUE, 16 | enabled BOOL DEFAULT 1 17 | ); 18 | CREATE TABLE urls ( 19 | id INTEGER PRIMARY KEY AUTOINCREMENT, 20 | url TEXT UNIQUE, 21 | viewed TIMESTAMP DEFAULT CURRENT_TIMESTAMP 22 | ); 23 | CREATE TABLE url_matches ( 24 | id INTEGER PRIMARY KEY AUTOINCREMENT, 25 | url INTEGER REFERENCES urls(id) ON DELETE RESTRICT, 26 | matcher REFERENCES matchers(id) ON DELETE RESTRICT, 27 | matched TEXT 28 | ); 29 | ''' 30 | 31 | def __init__(self, filename=None): 32 | self._con = None 33 | self._filename = filename or self.DEFAULT_DB 34 | 35 | def _save_url(self, url): 36 | try: 37 | curs = self._con.cursor() 38 | curs.execute('INSERT OR IGNORE INTO urls(url) VALUES(?)', (url,)) 39 | self._con.commit() 40 | res = curs.execute('SELECT id FROM urls WHERE url=?', 41 | (url,)).fetchone() 42 | return res[0] if res else None 43 | except S.IntegrityError as e: 44 | print >> stderr, 'save url: %s' % e 45 | return None 46 | except S.Error as e: 47 | print >> stderr, 'save url: %s' % e 48 | return None 49 | 50 | def _save_matcher(self, matchname): 51 | try: 52 | curs = self._con.cursor() 53 | curs.execute('''INSERT OR IGNORE INTO matchers(match_expression) 54 | VALUES(?)''', (matchname,)) 55 | self._con.commit() 56 | res = curs.execute('''SELECT id FROM matchers WHERE 57 | match_expression=?''', (matchname,)).fetchone() 58 | return res[0] if res else None 59 | except S.IntegrityError as e: 60 | print >> stderr, 'save matcher: %s' % e 61 | return None 62 | except S.Error as e: 63 | print >> stderr, 'save matcher: %s' % e 64 | return None 65 | 66 | def _save_urlmatch(self, urlid, matchid, text): 67 | curs = self._con.cursor() 68 | curs.execute('''INSERT OR IGNORE 69 | INTO url_matches(url, matcher, matched) 70 | VALUES(?, ?, ?)''', (urlid, matchid, text)) 71 | 
self._con.commit() 72 | 73 | def already_visited_url(self, url): 74 | try: 75 | curs = self._con.cursor() 76 | res = curs.execute('SELECT * FROM urls WHERE url=?', 77 | (url,)) 78 | val = res.fetchone() 79 | return bool(val) 80 | except S.Error as e: 81 | print >> stderr, "failed to check url: %s" % e 82 | return False 83 | 84 | def save_url(self, url, matches=None): 85 | try: 86 | url_id = self._save_url(url) 87 | if not url_id: 88 | raise RuntimeError('failed to save or read url id (%s)' % 89 | url) 90 | if matches: 91 | for name, text in matches: 92 | match_id = self._save_matcher(name) 93 | if match_id: 94 | self._save_urlmatch(url_id, match_id, text) 95 | 96 | except S.IntegrityError as e: 97 | print >> stderr, "issue 12 on github %r %s" % (url, e) 98 | traceback.print_exc(file=stderr) 99 | except S.Error as e: 100 | print >> stderr, "failed to save url: %s" % e 101 | raise 102 | return False 103 | 104 | return True 105 | 106 | def connect(self): 107 | try: 108 | self._con = S.connect(self._filename) 109 | self._create_tables() 110 | except S.Error as e: 111 | print >> stderr, "failed to connect to db: %s" % e 112 | self._con = None 113 | 114 | def connected(self): 115 | return bool(self._con) 116 | 117 | def _create_tables(self): 118 | try: 119 | self._con.executescript(self._DB_TABLES) 120 | self._con.commit() 121 | except S.OperationalError: 122 | #table already exists or we failed to lock the db 123 | pass 124 | 125 | # KeywordStorage implementation 126 | def _set_keyword_status(self, kwid, boolean): 127 | c = self._con.cursor() 128 | c.execute('UPDATE matchers SET enabled=? 
WHERE id=?', (boolean, kwid)) 129 | self._con.commit() 130 | 131 | def enable_keyword(self, kw): 132 | kw_id = self._save_matcher(kw) 133 | self._set_keyword_status(kw_id, True) 134 | 135 | def disable_keyword(self, kw): 136 | kw_id = self._save_matcher(kw) 137 | self._set_keyword_status(kw_id, False) 138 | 139 | @property 140 | def available_keywords(self): 141 | c = self._con.cursor() 142 | return [_[0] for _ in 143 | c.execute('SELECT match_expression FROM matchers' 144 | ).fetchall() 145 | ] 146 | 147 | @property 148 | def current_keywords(self): 149 | c = self._con.cursor() 150 | return [_[0] for _ in c.execute( 151 | 'SELECT match_expression FROM matchers WHERE enabled=?', 152 | (True,) 153 | ).fetchall() 154 | ] 155 | -------------------------------------------------------------------------------- /pastycake/storage_backend.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class StorageBackend: 5 | __metaclass__ = abc.ABCMeta 6 | 7 | @abc.abstractmethod 8 | def already_visited_url(self, url): 9 | ''' 10 | @return True if the url has been visited already, False otherwise 11 | ''' 12 | raise NotImplemented() 13 | 14 | @abc.abstractmethod 15 | def save_url(self, url, match_text=None): 16 | ''' 17 | store the url (together with match_text) as having been visited. 18 | ''' 19 | raise NotImplemented() 20 | 21 | @abc.abstractmethod 22 | def connect(self, *condetails, **kwargs): 23 | ''' 24 | connect with the given condetails (and/or kwargs) to the backend. 
25 | ''' 26 | raise NotImplemented() 27 | 28 | @abc.abstractmethod 29 | def connected(self): 30 | ''' 31 | @return True if a valid connection was established, False otherwise 32 | ''' 33 | -------------------------------------------------------------------------------- /pastycake/text_backend.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | from .storage_backend import StorageBackend 4 | 5 | 6 | class TextBackend(StorageBackend): 7 | DEFAULT_FILE = 'tracker.txt' 8 | 9 | def __init__(self, filename=None): 10 | self._connected = False 11 | self._tracked_urls = set() 12 | self._filename = filename or self.DEFAULT_FILE 13 | 14 | def already_visited_url(self, url): 15 | if not self._tracked_urls: 16 | with open(self._filename, 'r+') as tracker: 17 | self._tracked_urls = set([_.rstrip() for _ in 18 | tracker.readlines()]) 19 | 20 | return url in self._tracked_urls 21 | 22 | def save_url(self, url, *args): 23 | self._tracked_urls.add(url) 24 | 25 | with open(self._filename, 'a') as tracker: 26 | tracker.write(str(url) + "\n") 27 | 28 | def connect(self): 29 | fh = None 30 | try: 31 | fh = open(self._filename, 'a+') 32 | self._connected = True 33 | except Exception as e: 34 | self._connected = False 35 | finally: 36 | if fh: 37 | fh.close() 38 | 39 | def connected(self): 40 | return self._connected 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from sys import version_info 2 | 3 | from setuptools import setup 4 | 5 | 6 | _ALWAYS_REQUIRED_PACKS = [ 7 | 'distribute', 8 | 'httplib2', 9 | 'Louie', 10 | 'lxml', 11 | ] 12 | 13 | 14 | def _required_packages(): 15 | res = _ALWAYS_REQUIRED_PACKS 16 | vers = version_info[:2] 17 | 18 | # argparse is part of the stdlib in Python2.x >= 2.7 and Python3.x >= 3.2 19 | if vers < (2, 7) or (3, 0) <= vers < (3, 2): 20 | 
res.append('argparse') 21 | 22 | return res 23 | 24 | 25 | setup( 26 | author='b9', 27 | author_email='brandon@b9plus.com', 28 | classifiers=[ 29 | "Development Status :: 3 - Alpha", 30 | "Environment :: Console", 31 | "Intended Audience :: Developers", 32 | "Intended Audience :: End Users/Desktop", 33 | "Intended Audience :: Information Technology", 34 | "Intended Audience :: Science/Research", 35 | "Natural Language :: English", 36 | "Operating System :: OS Independent", 37 | "Programming Language :: Python :: 2.6", 38 | "Programming Language :: Python :: 2.7", 39 | "Topic :: Communications", 40 | "Topic :: Internet", 41 | "Topic :: Security", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | "Topic :: Text Processing :: Filters" 44 | ], 45 | description='scan pastes for interesting stuff', 46 | entry_points={ 47 | 'console_scripts': [ 48 | 'pastycake-snatch = pastycake.gather:snatch', 49 | 'pastycake-harvest = pastycake.gather:harvest', 50 | 'pastycake = pastycake.gather:main', 51 | ], 52 | 'pastycake': [ 53 | 'sources:Pastebin = pastycake.pastebin_source:PastebinSource', 54 | 'sources:Pastie = pastycake.pastie_source:PastieSource', 55 | 'storage:Sqlite = pastycake.sqlite_backend:SqliteBackend', 56 | 'storage:Text = pastycake.text_backend:TextBackend', 57 | 'notify:Cli = pastycake.cli_notifier:CliNotifier', 58 | 'notify:Mail = pastycake.mailer:Mailer', 59 | ], 60 | 'pastycake.ext': [ 61 | 'storage:MongoDB = pastycake.mongodb_backend:MongoBackend [MongoDB]', 62 | ] 63 | }, 64 | extras_require={ 65 | 'MongoDB': ['pymongo'], 66 | }, 67 | name='pastycake', 68 | packages=['pastycake'], 69 | install_requires=_required_packages(), 70 | url='http://www.gihub.com/9b/pastycake', 71 | use_2to3=True, 72 | version='0.1', 73 | ) 74 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/9b/pastycake/f02363d822dae7111ecc70a1ad435d88d57be939/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_entry_points.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import pkg_resources as P 4 | 5 | 6 | from pastycake.storage_backend import StorageBackend 7 | 8 | 9 | class TestEntryPoints(TestCase): 10 | def test_entry_points_avail(self): 11 | self.assertTrue(0 < len([_ for _ in P.iter_entry_points('pastycake')])) 12 | 13 | def test_entry_point_load(self): 14 | s = P.load_entry_point('pastycake', 'pastycake', 'storage:Sqlite') 15 | self.assertTrue(issubclass(s, StorageBackend)) 16 | -------------------------------------------------------------------------------- /tests/test_sqlite_backend.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from pastycake.sqlite_backend import SqliteBackend 4 | 5 | 6 | class TestSqliteBackend(TestCase): 7 | def setUp(self): 8 | self.db = SqliteBackend(':memory:') 9 | 10 | def test_connected(self): 11 | self.assertFalse(self.db.connected()) 12 | 13 | def test_connect(self): 14 | self.assertFalse(self.db.connected()) 15 | self.db.connect() 16 | self.assertTrue(self.db.connected()) 17 | 18 | def test_already_visited_url(self): 19 | self.db.connect() 20 | self.assertTrue(self.db.connected()) 21 | 22 | self.assertFalse(self.db.already_visited_url('abc')) 23 | 24 | def test_save_url(self): 25 | self.db.connect() 26 | self.assertTrue(self.db.connected()) 27 | 28 | self.db.save_url('abc', [('\d+', '123'), ]) 29 | self.assertTrue(self.db.already_visited_url('abc')) 30 | 31 | def test_keywords(self): 32 | self.db.connect() 33 | self.db.enable_keyword('foobar') 34 | self.db.disable_keyword('zoo') 35 | self.assertEqual(['foobar', 'zoo'], 36 | self.db.available_keywords) 37 | 
self.assertEqual(['foobar'], self.db.current_keywords) 38 | self.assertEqual(self.db.enabled_keywords, self.db.current_keywords) 39 | self.db.enable_keyword('zoo') 40 | self.db.disable_keyword('foobar') 41 | self.assertEqual(['zoo'], self.db.enabled_keywords) 42 | --------------------------------------------------------------------------------