├── .gitignore
├── .travis.yml
├── DESCRIPTION.txt
├── Dockerfile-app
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── baleen
    ├── __init__.py
    ├── config.py
    ├── console
    │   ├── __init__.py
    │   ├── app.py
    │   ├── commands
    │   │   ├── __init__.py
    │   │   ├── export.py
    │   │   ├── ingest.py
    │   │   ├── load.py
    │   │   ├── run.py
    │   │   ├── serve.py
    │   │   └── summary.py
    │   └── utils.py
    ├── exceptions.py
    ├── export.py
    ├── feed.py
    ├── ingest.py
    ├── models.py
    ├── opml.py
    ├── utils
    │   ├── __init__.py
    │   ├── decorators.py
    │   ├── logger.py
    │   ├── mongolog.py
    │   └── timez.py
    ├── version.py
    ├── wrangle.py
    └── www
    │   ├── __init__.py
    │   ├── app.py
    │   ├── static
    │       ├── css
    │       │   └── baleen.css
    │       └── favicon.png
    │   └── templates
    │       ├── base.html
    │       ├── components
    │           ├── footer.html
    │           └── navbar.html
    │       ├── index.html
    │       ├── logs.html
    │       └── status.html
├── bin
    ├── baleen
    ├── doctimes.py
    └── ldoc.py
├── conf
    ├── baleen-example.yaml
    ├── upstart
    │   └── baleen.conf
    └── uwsgi
    │   ├── baleen.ini
    │   └── baleen.nginx
├── docker-compose.yml
├── docs
    ├── about.md
    ├── components.md
    ├── images
    │   ├── component_architecture.png
    │   ├── service_architecture.png
    │   ├── spacewhale.jpg
    │   └── whaleship.jpg
    ├── index.md
    └── service.md
├── fixtures
    └── fields.json
├── mkdocs.yml
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── fixtures
        ├── feedly.opml
        └── feedparser_result.pickle
    ├── test_export.py
    ├── test_feed.py
    ├── test_ingest.py
    ├── test_models.py
    ├── test_opml.py
    ├── test_wrangle.py
    └── utils_tests
        ├── __init__.py
        ├── test_decorators.py
        ├── test_logger.py
        ├── test_mongolog.py
        └── test_timez.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | venv/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .cache
41 | nosetests.xml
42 | coverage.xml
43 | 
44 | # Translations
45 | *.mo
46 | *.pot
47 | 
48 | # Django stuff:
49 | *.log
50 | 
51 | # Sphinx documentation
52 | docs/_build/
53 | 
54 | # PyBuilder
55 | target/
56 | 
57 | # Local configurations
58 | conf/baleen.yaml
59 | fixtures/corpus
60 | fixtures/feedly/*
61 | notebook
62 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | language: python
 3 | 
 4 | python:
 5 |   - '2.7'
 6 | 
 7 | before_install:
 8 |   - pip install nose
 9 |   - pip install coverage
10 |   - pip install coveralls
11 |   - pip install mock
12 |   - pip install mongomock
13 | 
14 | install: pip install -r requirements.txt
15 | 
16 | script: make test
17 | 
18 | after_script: coveralls
19 | 
20 | notifications:
21 |   email:
22 |     recipients:
23 |       - benjamin@bengfort.com
24 | 
25 |     on_success: change
26 |     on_failure: always
27 | 


--------------------------------------------------------------------------------
/DESCRIPTION.txt:
--------------------------------------------------------------------------------
1 | Baleen is a tool for ingesting formal natural language data from the discourse of professional and amateur writers: e.g. bloggers and news outlets. Rather than performing web scraping, Baleen focuses on data ingestion through the use of RSS feeds. It performs as much raw data collection as it can, saving data into a Mongo document store.
2 | 
3 | For more, please see the full documentation at: http://baleen-ingest.readthedocs.org/en/latest/
4 | 


--------------------------------------------------------------------------------
/Dockerfile-app:
--------------------------------------------------------------------------------
 1 | FROM python:2.7
 2 | # things we like
 3 | RUN apt-get update && apt-get install -y \
 4 |       git \
 5 |       vim
 6 | # set up volume we will share our codebase with
 7 | VOLUME /baleen
 8 | WORKDIR /baleen
 9 | # add baleen package to our python path
10 | RUN echo $(pwd) > /usr/local/lib/python2.7/site-packages/baleen.pth
11 | # install requirements
12 | COPY requirements.txt requirements.txt
13 | RUN pip install -r requirements.txt
14 | EXPOSE 5000
15 | # until we get the baleen daemon set, just sleep for now
16 | CMD /bin/sleep Inf
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Benjamin Bengfort
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md
2 | include *.txt
3 | include *.yml
4 | include Makefile
5 | recursive-include docs *.md
6 | recursive-include docs *.jpg
7 | recursive-include tests *.py
8 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Shell to use with Make
 2 | SHELL := /bin/bash
 3 | 
 4 | # Set important Paths
 5 | PROJECT := baleen
 6 | LOCALPATH := $(CURDIR)/$(PROJECT)
 7 | PYTHONPATH := $(LOCALPATH)/
 8 | PYTHON_BIN := $(VIRTUAL_ENV)/bin
 9 | 
10 | # Export targets not associated with files
11 | .PHONY: test coverage pip virtualenv clean publish
12 | 
13 | # Clean build files
14 | clean:
15 | 	find . -name "*.pyc" -print0 | xargs -0 rm -rf
16 | 	-rm -rf htmlcov
17 | 	-rm -rf .coverage
18 | 	-rm -rf build
19 | 	-rm -rf dist
20 | 	-rm -rf $(PROJECT).egg-info
21 | 
22 | # Targets for Baleen testing
23 | test:
24 | 	$(PYTHON_BIN)/nosetests -v --with-coverage --cover-package=$(PROJECT) --cover-inclusive --cover-erase tests
25 | 
26 | # Publish to gh-pages
27 | publish:
28 | 	git subtree push --prefix=deploy origin gh-pages
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Baleen
  2 | **An automated ingestion service for blogs to construct a corpus for NLP research.**
  3 | 
  4 | [![PyPI version][pypi_img]][pypi_href]
  5 | [![Build Status][travis_img]][travis_href]
  6 | [![Coverage Status][coveralls_img]][coverals_href]
  7 | [![Code Health][health_img]][health_href]
  8 | [![Documentation Status][rtfd_img]][rtfd_href]
  9 | [![Stories in Ready][waffle_img]][waffle_href]
 10 | 
 11 | [![Space Whale](docs/images/spacewhale.jpg)][spacewhale.jpg]
 12 | 
 13 | ## Quick Start
 14 | 
 15 | This quick start is intended to get you set up with Baleen in development mode (since the project is still under development). If you'd like to run Baleen in production, please see the [documentation][rtfd_href].
 16 | 
 17 | 1. Clone the repository
 18 | 
 19 | ```
 20 | $ git clone git@github.com:bbengfort/baleen.git
 21 | $ cd baleen
 22 | ```
 23 | 
 24 | 2. Create a virtualenv and install the dependencies
 25 | 
 26 | ```
 27 | $ virtualenv venv
 28 | $ source venv/bin/activate
 29 | $ pip install -r requirements.txt
 30 | ```
 31 | 
 32 | 3. Add the `baleen` module to your `$PYTHONPATH` via the virtualenv.
 33 | 
 34 | ```
 35 | $ echo $(pwd) > venv/lib/python2.7/site-packages/baleen.pth
 36 | ```
 37 | 
 38 | 4. Create your local configuration file. Edit it with the connection details to your local MongoDB server.  This is also a good time to check and make sure that you can create a database called Baleen on Mongo.
 39 | 
 40 | ```
 41 | $ cp conf/baleen-example.yaml conf/baleen.yaml
 42 | ```
 43 | 
 44 | ```yaml
 45 | debug: true
 46 | testing: false
 47 | database:
 48 |     host: localhost
 49 |     port: 27017
 50 |     name: baleen
 51 | server:
 52 |     host: 127.0.0.1
 53 |     port: 5000
 54 | 
 55 | ```
 56 | 
 57 | 5. Run the tests to make sure everything is ok.
 58 | 
 59 | ```
 60 | $ make test
 61 | ```
 62 | 
 63 | 6. Make sure that the command line utility is ready to go:
 64 | 
 65 | ```
 66 | $ bin/baleen --help
 67 | ```
 68 | 
 69 | 7. Import the feeds from the `feedly.opml` file in the fixtures.
 70 | 
 71 | ```
 72 | $ bin/baleen load tests/fixtures/feedly.opml
 73 | Ingested 36 feeds from 1 OPML files
 74 | ```
 75 | 
 76 | 8. Perform an ingestion of the feeds that were imported from the `feedly.opml` file.
 77 | 
 78 | ```
 79 | $ bin/baleen ingest
 80 | ```
 81 | 
 82 | Your Mongo database collections should be created as you add new documents to them, and at this point you're ready to develop!
 83 | 
 84 | ## Docker Setup
 85 | 
 86 | Included in this repository are files related to setting up the development environment using docker if you wish.
 87 | 
 88 | 1. Install Docker Machine and Docker Compose e.g. with [Docker Toolbox](https://www.docker.com/products/docker-toolbox).
 89 | 
 90 | 2. Clone the repository
 91 | 
 92 | ```
 93 | $ git clone git@github.com:bbengfort/baleen.git
 94 | $ cd baleen
 95 | ```
 96 | 
 97 | 3. Create your local configuration file. Edit it with your configuration details; your MongoDB server will be at host `mongo`.
 98 | 
 99 | ```
100 | $ cp conf/baleen-example.yaml conf/baleen.yaml
101 | ```
102 | 
103 | ```yaml
104 | debug: true
105 | testing: false
106 | database:
107 |     host: mongo
108 |     port: 27017
109 |     name: baleen
110 | server:
111 |     host: 127.0.0.1
112 |     port: 5000
113 | ```
114 | 
115 | 4. Exec interactively into the `app` container to interact with baleen as described in the above setup directions 5-8.
116 | 
117 | ```
118 |     docker exec -it baleen_app_1 /bin/bash
119 | ```
120 | 
121 | ## Web Admin
122 | 
123 | There is a simple Flask application that ships with Baleen that provides information about the current status of the Baleen ingestion. This app can be run locally in development with the following command:
124 | 
125 |     $ bin/baleen serve
126 | 
127 | You can then reach the website at [http://127.0.0.1:5000/](http://127.0.0.1:5000/). Note that the host and port can be configured in the YAML configuration file or as command line arguments to the serve command.
128 | 
129 | ### Deployment
130 | 
131 | The web application is deployed in production as an Nginx + uWSGI + Flask application that is managed by upstart.
132 | 
133 | ## About
134 | 
135 | Baleen is a tool for ingesting _formal_ natural language data from the discourse of professional and amateur writers: e.g. bloggers and news outlets. Rather than performing web scraping, Baleen focuses on data ingestion through the use of RSS feeds. It performs as much raw data collection as it can, saving data into a Mongo document store.
136 | 
137 | ### Throughput
138 | 
139 | [![Throughput Graph](https://graphs.waffle.io/bbengfort/baleen/throughput.svg)](https://waffle.io/bbengfort/baleen/metrics)
140 | 
141 | ### Attribution
142 | 
143 | The image used in this README, ["Space Whale"][spacewhale.jpg] by [hbitik](http://hbitik.deviantart.com/) is licensed under [CC BY-NC-ND 3.0](http://creativecommons.org/licenses/by-nc-nd/3.0/)
144 | 
145 | 
146 | <!-- References -->
147 | [pypi_img]: https://badge.fury.io/py/baleen.svg
148 | [pypi_href]: https://badge.fury.io/py/baleen
149 | [travis_img]: https://travis-ci.org/bbengfort/baleen.svg?branch=master
150 | [travis_href]: https://travis-ci.org/bbengfort/baleen/
151 | [coveralls_img]: https://coveralls.io/repos/github/bbengfort/baleen/badge.svg?branch=master
152 | [coverals_href]: https://coveralls.io/github/bbengfort/baleen?branch=master
153 | [health_img]: https://landscape.io/github/bbengfort/baleen/master/landscape.svg?style=flat
154 | [health_href]: https://landscape.io/github/bbengfort/baleen/master
155 | [waffle_img]: https://badge.waffle.io/bbengfort/baleen.png?label=ready&title=Ready
156 | [waffle_href]: https://waffle.io/bbengfort/baleen
157 | [rtfd_img]: https://readthedocs.org/projects/baleen-ingest/badge/?version=latest
158 | [rtfd_href]: http://baleen-ingest.readthedocs.org/
159 | [spacewhale.jpg]: http://fav.me/d4736q3
160 | 


--------------------------------------------------------------------------------
/baleen/__init__.py:
--------------------------------------------------------------------------------
 1 | # baleen
 2 | # An automated ingestion service for blogs to construct a corpus.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Fri Sep 19 10:55:58 2014 -0400
 6 | #
 7 | # Copyright (C) 2014 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [5ad94d7] benjamin@bengfort.com $
11 | 
12 | """
13 | An automated ingestion service for blogs to construct a corpus for NLP
14 | research.
15 | """
16 | 
17 | ##########################################################################
18 | ## Imports
19 | ##########################################################################
20 | 
21 | from .version import get_version
22 | 
23 | ##########################################################################
24 | ## Package Version
25 | ##########################################################################
26 | 
27 | __version__ = get_version()    # Computed once, when the package is imported
28 | 


--------------------------------------------------------------------------------
/baleen/config.py:
--------------------------------------------------------------------------------
 1 | # baleen.config
 2 | # Uses confire to get meaningful configurations from a yaml file
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Fri Sep 19 11:14:33 2014 -0400
 6 | #
 7 | # Copyright (C) 2014 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: config.py [5b443de] benjamin@bengfort.com $
11 | 
12 | """
13 | Uses confire to get meaningful configurations from a yaml file
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import os
21 | import confire
22 | 
23 | ##########################################################################
24 | ## Configuration
25 | ##########################################################################
26 | 
27 | class MongoConfiguration(confire.Configuration):
28 |     """
29 |     Default connection settings for the Mongo database.
30 |     """
31 | 
32 |     host = "localhost"    # Hostname of the Mongo server
33 |     port = 27017          # Port of the Mongo server
34 |     name = "baleen"       # Name of the database to use
35 | 
36 | 
37 | class ServerConfiguration(confire.Configuration):
38 |     """
39 |     Default address of the web server that runs the admin UI.
40 |     """
41 | 
42 |     host = "127.0.0.1"    # Address to serve on (default of serve --host)
43 |     port = 5000           # Port to serve on (default of serve --port)
44 | 
45 | 
46 | class BaleenConfiguration(confire.Configuration):
47 |     """
48 |     Meaningful defaults and required configurations.
49 | 
50 |     debug:    the app will print or log debug statements
51 |     database: connection information for mongo
52 |     """
53 | 
54 |     CONF_PATHS = [        # YAML files for confire (precedence: TODO confirm)
55 |         "/etc/baleen.yaml",                      # System configuration
56 |         os.path.expanduser("~/.baleen.yaml"),    # User specific config
57 |         os.path.abspath("conf/baleen.yaml"),     # Local, relative to the CWD
58 |     ]
59 | 
60 |     debug      = True                            # Print or log debug output
61 |     database   = MongoConfiguration()            # Mongo connection defaults
62 |     server     = ServerConfiguration()           # Admin web server defaults
63 |     logfile    = 'baleen.log'                    # Location to write log
64 |     loglevel   = 'DEBUG'                         # Log messages to record
65 |     fetch_html = True                            # Actually fetch HTML link
66 |     timeout    = 180                             # Timeout for fetching posts/feeds
67 | 
68 | ## Load settings immediately for import
69 | settings = BaleenConfiguration.load()
70 | 
71 | if __name__ == '__main__':
72 |     print settings
73 | 


--------------------------------------------------------------------------------
/baleen/console/__init__.py:
--------------------------------------------------------------------------------
 1 | # baleen.console
 2 | # Implements the baleen console utility.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 10:52:36 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Implements the baleen console utility.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | from .app import COMMANDS
21 | from .app import BaleenUtility
22 | 


--------------------------------------------------------------------------------
/baleen/console/app.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.app
 2 | # Definition of the Baleen Utility app and commands
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 10:54:51 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: app.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Definition of the Baleen Utility app and commands
14 | http://bbengfort.github.io/tutorials/2016/01/23/console-utility-commis.html
15 | """
16 | 
17 | ##########################################################################
18 | ## Imports
19 | ##########################################################################
20 | 
21 | from commis import color
22 | from commis import ConsoleProgram
23 | 
24 | from baleen.console.commands import *
25 | from baleen.version import get_version
26 | 
27 | ##########################################################################
28 | ## Utility Definition
29 | ##########################################################################
30 | 
31 | DESCRIPTION = "Management and administration commands for Baleen"
32 | EPILOG      = "If there are any bugs or concerns, submit an issue on Github"
33 | COMMANDS    = (                  # Default commands for BaleenUtility.load
34 |     IngestCommand,               # ingest
35 |     ExportCommand,               # export
36 |     LoadOPMLCommand,             # load
37 |     SummaryCommand,              # (defined in commands/summary.py)
38 |     RunCommand,                  # run
39 |     ServeCommand,                # serve
40 | )
41 | 
42 | 
43 | ##########################################################################
44 | ## The Baleen CLI Utility
45 | ##########################################################################
46 | 
47 | class BaleenUtility(ConsoleProgram):
48 | 
49 |     description = color.format(DESCRIPTION, color.CYAN)
50 |     epilog      = color.format(EPILOG, color.MAGENTA)
51 |     version     = color.format("baleen v{}", color.CYAN, get_version())
52 | 
53 |     @classmethod
54 |     def load(klass, commands=COMMANDS):
55 |         utility = klass()
56 |         for command in commands:
57 |             utility.register(command)
58 |         return utility
59 | 


--------------------------------------------------------------------------------
/baleen/console/commands/__init__.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.commands
 2 | # Commands for the Baleen CLI utility.
 3 | #
 4 | # Author:   Benjamin Bengfort <bengfort@cs.umd.edu>
 5 | # Created:  Wed Mar 02 10:54:07 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Commands for the Baleen CLI utility.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | from .ingest import IngestCommand
21 | from .export import ExportCommand
22 | from .load import LoadOPMLCommand
23 | from .summary import SummaryCommand
24 | from .run import RunCommand
25 | from .serve import ServeCommand
26 | 


--------------------------------------------------------------------------------
/baleen/console/commands/export.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.commands.export
 2 | # Export utility to dump an HTML corpus to disk from the database.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 11:12:50 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: export.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Export utility to dump an HTML corpus to disk from the database.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import os
21 | import baleen.models as db
22 | 
23 | from commis import Command
24 | from baleen.console.utils import csv
25 | from baleen.export import MongoExporter, SCHEMES
26 | from baleen.utils.timez import Timer
27 | 
28 | ##########################################################################
29 | ## Command
30 | ##########################################################################
31 | 
32 | class ExportCommand(Command):
33 | 
34 |     name = 'export'
35 |     help = 'export the raw HTML corpus for doing NLP'
36 |     args = {
37 |         '--list-categories': {
38 |             'action': 'store_true',
39 |             'default': False,
40 |             'help': 'show the available categories and exit',
41 |         },
42 |         ('-C', '--categories'): {
43 |             'type': csv(str),
44 |             'default': None,
45 |             'metavar': 'csv',
46 |             'help': 'specify a list of categories to export',
47 |         },
48 |         ('-S', '--scheme'): {
49 |             'type': str,
50 |             'default': 'json',
51 |             'choices': SCHEMES,
52 |             'help': 'specify the output format for the corpus',
53 |         },
54 |         'location': {
55 |             'nargs': 1,
56 |             'type': str,
57 |             'metavar': 'corpus directory',
58 |             'help': 'location to write the corpus out to'
59 |         },
60 |     }
61 | 
62 |     def handle(self, args):
63 |         # Connect to database
64 |         db.connect()
65 | 
66 |         # Expand vars and user on the location passed
67 |         root = os.path.expanduser(args.location[0])
68 |         root = os.path.expandvars(root)
69 | 
70 |         # Create the exporter object
71 |         exporter = MongoExporter(
72 |             root, categories=args.categories, scheme=args.scheme
73 |         )
74 | 
75 |         # If list categories is true, list them and exit.
76 |         if args.list_categories:
77 |             return "\n".join(sorted(exporter.categories))
78 | 
79 |         with Timer() as t:
80 |             exporter.export()
81 | 
82 |         return (
83 |             "Baleen corpus export complete in {}\n"
84 |             "Exported {} posts in {} categories\n"
85 |             "More information is in README in {}"
86 |         ).format(
87 |             t, sum(exporter.counts.values()),
88 |             len(exporter.categories), root
89 |         )
90 | 


--------------------------------------------------------------------------------
/baleen/console/commands/ingest.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.commands.ingest
 2 | # Handles the ingestion utility both for OPML and feeds.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 10:58:56 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: ingest.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Handles the ingestion utility both for OPML and feeds.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import baleen.models as db
21 | 
22 | from commis import Command
23 | from commis.exceptions import ConsoleError
24 | from baleen.ingest import Ingestor, MongoIngestor, OPMLIngestor
25 | 
26 | ##########################################################################
27 | ## Command
28 | ##########################################################################
29 | 
30 | class IngestCommand(Command):
31 | 
32 |     name = 'ingest'
33 |     help = 'ingests the RSS feeds to MongoDB'
34 |     args = {
35 |         '--opml': {
36 |             'type': str,
37 |             'default': None,
38 |             'help': 'Ingest directly from an OPML file',
39 |         },
40 |         'feeds': {
41 |             'type': str,
42 |             'nargs': "*",
43 |             'default': None,
44 |             'metavar': 'URL',
45 |             'help': 'Specify a list of feeds as urls'
46 |         }
47 |     }
48 | 
49 |     def handle(self, args):
50 | 
51 |         ingestor = MongoIngestor()
52 | 
53 |         if args.opml:
54 |             ingestor = OPMLIngestor(args.opml)
55 |             raise ConsoleError("opml ingestion is an untested utility!")
56 | 
57 |         if args.feeds:
58 |             ingestor = Ingestor(args.feeds)
59 |             raise ConsoleError("feed ingestion is an untested utility!")
60 | 
61 |         db.connect()
62 |         ingestor.ingest()
63 |         return (
64 |             "Processed {feeds} feeds ({timer}): "
65 |             "{posts} posts with {errors} errors"
66 |         ).format(
67 |             timer=ingestor.timer, **ingestor.counts
68 |         )
69 | 


--------------------------------------------------------------------------------
/baleen/console/commands/load.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.commands.load
 2 | # Loads an OPML file from disk into the database.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 11:05:57 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: load.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Loads an OPML file from disk into the database.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | from commis import Command
21 | from baleen import models as db
22 | from baleen.opml import load_opml
23 | 
24 | ##########################################################################
25 | ## Command
26 | ##########################################################################
27 | 
28 | class LoadOPMLCommand(Command):
29 | 
30 |     name = 'load'
31 |     help = 'loads an OPML file from disk into the database'
32 |     args = {
33 |         'opml': {
34 |             'nargs': "+",
35 |             'type': str,
36 |             'help': 'OPML file(s) to import to the database'
37 |         }
38 |     }
39 | 
40 |     def handle(self, args):
41 |         # Connect to the database
42 |         db.connect()
43 | 
44 |         # Load the OPML files into the database
45 |         count = sum(load_opml(path) for path in args.opml)
46 |         return "Ingested {} feeds from {} OPML files".format(count, len(args.opml))
47 | 


--------------------------------------------------------------------------------
/baleen/console/commands/run.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.commands.run
 2 | # Runs the ingestor in the background every hour.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 11:14:25 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: run.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Runs the ingestor in the background every hour.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import time
21 | import baleen
22 | import schedule
23 | import baleen.models as db
24 | 
25 | from commis import Command
26 | from functools import partial
27 | from baleen.ingest import MongoIngestor
28 | from baleen.utils.logger import IngestLogger
29 | 
30 | ##########################################################################
31 | ## Command
32 | ##########################################################################
33 | 
34 | class RunCommand(Command):
35 | 
36 |     name = 'run'
37 |     help = 'runs the ingest command every hour'
38 |     args = {}
39 | 
40 |     def ingest(self, args):
41 |         db.connect()
42 |         ingestor = MongoIngestor()
43 |         ingestor.ingest()
44 | 
45 |     def handle(self, args):
46 |         logger = IngestLogger()
47 |         logger.info(
48 |             "Starting Baleen v{} ingestion service every hour.".format(baleen.get_version())
49 |         )
50 | 
51 |         schedule.every().hour.do(partial(self.ingest, args))
52 | 
53 |         while True:
54 |             try:
55 |                 schedule.run_pending()
56 |                 time.sleep(1)
57 |             except (KeyboardInterrupt, SystemExit):
58 |                 logger.info("Graceful shutdown of Baleen ingestion service.")
59 |                 return ""
60 |             except Exception as e:
61 |                 logger.critical(str(e))
62 |                 return str(e)
63 | 


--------------------------------------------------------------------------------
/baleen/console/commands/serve.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.commands.serve
 2 | # Run a local development version of the Baleen Flask app.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Thu Apr 07 08:05:34 2016 -0400
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: serve.py [] benjamin@bengfort.com $
11 | 
12 | """
13 | Run a local development version of the Baleen Flask app.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | from commis import Command
21 | from baleen.www.app import app
22 | from baleen.config import settings
23 | 
24 | ##########################################################################
25 | ## Command
26 | ##########################################################################
27 | 
class ServeCommand(Command):
    """
    Console command that starts the Baleen Flask application using the
    host, port and debug flag given on the command line.
    """

    name = 'serve'
    help = 'serve the Flask administration application'
    args = {
        '--host': dict(
            metavar='ADDR',
            default=settings.server.host,
            help='set the host to run the app on',
        ),
        '--port': dict(
            metavar='PORT',
            type=int,
            default=settings.server.port,
            help='set the port to run the app on',
        ),
        '--debug': dict(
            action='store_true',
            help='force debug mode in Flask',
        ),
    }

    def handle(self, args):
        """
        Runs the Flask app and returns a message once app.run returns.
        Debug mode is on if either the --debug flag or settings.debug is set.
        """
        debug = args.debug or settings.debug
        app.run(host=args.host, port=args.port, debug=debug)
        return " * Web application stopped"
62 | 


--------------------------------------------------------------------------------
/baleen/console/commands/summary.py:
--------------------------------------------------------------------------------
  1 | # baleen.console.commands.summary
  2 | # A utility to print out information about the Baleen state.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 11:08:57 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: summary.py [da54aa8] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | A utility to print out information about the Baleen state.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import baleen
 21 | import baleen.models as db
 22 | 
 23 | from commis import Command
 24 | from baleen.config import settings
 25 | from baleen.utils.timez import HUMAN_DATETIME
 26 | 
 27 | ##########################################################################
 28 | ## Command
 29 | ##########################################################################
 30 | 
 31 | class SummaryCommand(Command):
 32 | 
 33 |     name = 'info'
 34 |     help = 'print info about Baleen from the database'
 35 |     args = {
 36 |         ('-c', '--config'): {
 37 |             'action': 'store_true',
 38 |             'default': False,
 39 |             'help': 'Also print the configuration',
 40 |         }
 41 |     }
 42 | 
 43 |     def handle(self, args):
 44 |         # Setup output and connect to database.
 45 |         output = []
 46 |         db.connect()
 47 | 
 48 |         # Printout configuration details as necessary.
 49 |         if args.config:
 50 |             output.append(u"Configuration:")
 51 |             output.append(unicode(settings))
 52 |             output.append(u"")
 53 | 
 54 |         output.append(u"Baleen v{} Status:".format(baleen.get_version()))
 55 |         output.append(
 56 |             u"{} Feeds and {} Posts after {} Jobs".format(
 57 |                 db.Feed.objects.count(),
 58 |                 db.Post.objects.count(),
 59 |                 db.Job.objects.count(),
 60 |             )
 61 |         )
 62 | 
 63 |         latest = db.Job.objects.order_by('-started').first()
 64 |         output.extend([
 65 |             u"",
 66 |             u"Latest Job: ",
 67 |             u"    Type: {} v{}".format(latest.name, latest.version),
 68 |             u"    Job ID: {}".format(latest.jobid),
 69 |             u"    Started: {}".format(latest.started.strftime(HUMAN_DATETIME))
 70 |         ])
 71 | 
 72 |         if latest.finished:
 73 |             if latest.failed:
 74 |                 output.append(u"    Failed: {}".format(latest.reason))
 75 |             else:
 76 |                 output.append(u"    Finished: {}".format(latest.finished.strftime(HUMAN_DATETIME)))
 77 |                 output.append(u"    Counts:")
 78 |                 output.append(u"      " + u"\n      ".join([u"{}: {}".format(*item) for item in latest.counts.items()]))
 79 |                 output.append(u"    Errors:")
 80 |                 output.append(u"      " + u"\n      ".join([u"{}: {}".format(*item) for item in latest.errors.items()]))
 81 |         else:
 82 |             output.append(u"    Currently Running")
 83 | 
 84 |         latest = db.Feed.objects.order_by('-updated').first()
 85 |         output.extend([
 86 |             u"",
 87 |             u"Latest Feed: ",
 88 |             u"    Title: \"{}\"".format(latest.title),
 89 |             u"    eTag: \"{}\"".format(latest.etag),
 90 |             u"    Modified: {}".format(latest.modified),
 91 |             u"    Updated: {}".format(latest.updated.strftime(HUMAN_DATETIME)),
 92 |             # u"    Posts: {}".format(latest.count_posts()), # This is very slow need to fix.
 93 |         ])
 94 | 
 95 |         latest = db.Post.objects.order_by('-id').first()
 96 |         output.extend([
 97 |             u"",
 98 |             u"Latest Post: ",
 99 |             u"    Title: \"{}\"".format(latest.title),
100 |             u"    Feed: \"{}\"".format(latest.feed.title),
101 |             u"    Fetched: {}".format(latest.created.strftime(HUMAN_DATETIME)),
102 |         ])
103 | 
104 |         return u"\n".join(output).encode('utf-8', errors='replace')
105 | 


--------------------------------------------------------------------------------
/baleen/console/utils.py:
--------------------------------------------------------------------------------
 1 | # baleen.console.utils
 2 | # Argparse extensions and utilities.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 11:01:35 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: utils.py [da54aa8] benjamin@bengfort.com $
11 | 
12 | """
13 | Argparse extensions and utilities.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import argparse
21 | 
22 | 
23 | ##########################################################################
24 | ## Console Parsers
25 | ##########################################################################
26 | 
def csv(ptype=int):
    """
    Argparse type for comma separated values. Also parses the type, e.g. int.

    Returns a parser function that splits its string argument on commas,
    strips whitespace from each piece and converts it with ptype. The parser
    returns a list and raises argparse.ArgumentTypeError if any piece cannot
    be converted.
    """
    def parser(s):
        try:
            # Build the list eagerly: a lazy map would defer the conversion
            # (and any error it raises) until after this try block has exited.
            return [ptype(part.strip()) for part in s.split(",")]
        except Exception:
            raise argparse.ArgumentTypeError(
                "Could not parse CSV value to type {}: {!r}".format(
                    getattr(ptype, '__name__', repr(ptype)), s
                )
            )

    return parser
41 | 


--------------------------------------------------------------------------------
/baleen/exceptions.py:
--------------------------------------------------------------------------------
 1 | # baleen.exceptions
 2 | # Exceptions hierarchy for the Baleen module.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Wed Mar 02 13:59:03 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: exceptions.py [538b33d] benjamin@bengfort.com $
11 | 
12 | """
13 | Exceptions hierarchy for the Baleen module.
14 | """
15 | 
16 | ##########################################################################
17 | ## Exceptions Hierarchy
18 | ##########################################################################
19 | 
class BaleenError(Exception):
    """
    Base class from which the Baleen specific errors in this module derive.
    """
25 | 
26 | 
class FeedTypeError(BaleenError):
    """
    Raised when the type of a feed cannot be detected for synchronization.
    """
32 | 
33 | 
class IngestionError(BaleenError):
    """
    Raised for problems that are specific to the ingestion process.
    """
39 | 
40 | 
class SynchronizationError(IngestionError):
    """
    Raised when the synchronization of a feed goes wrong.
    """
46 | 
47 | 
class WranglingError(IngestionError):
    """
    Raised when the wrangling of a post goes wrong.
    """
53 | 
54 | 
class FetchError(WranglingError):
    """
    Raised when the webpage for a post could not be fetched.
    """
60 | 
61 | 
class ExportError(BaleenError):
    """
    Raised when the export of the corpus goes wrong.
    """
67 | 
68 | 
class TimeoutError(Exception):
    """
    Raised when an operation times out. Note that this class derives
    directly from Exception rather than from BaleenError.
    """
74 | 


--------------------------------------------------------------------------------
/baleen/export.py:
--------------------------------------------------------------------------------
  1 | # baleen.export
  2 | # Export an HTML corpus for analyses with NLTK
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Fri Oct 03 16:49:20 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: export.py [eb962e7] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Export an HTML corpus for analyses with NLTK
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import os
 21 | import codecs
 22 | 
 23 | from enum import Enum
 24 | from datetime import datetime
 25 | from baleen.models import Feed, Post
 26 | from baleen.exceptions import ExportError
 27 | from collections import Counter
 28 | from operator import itemgetter
 29 | 
 30 | ##########################################################################
 31 | ## Module Constants
 32 | ##########################################################################
 33 | 
 34 | DTFMT   = "%b %d, %Y at %H:%M"
 35 | SCHEMES = ('json', 'html')
 36 | State   = Enum('State', 'Init, Started, Finished')
 37 | 
 38 | ##########################################################################
 39 | ## Exporter
 40 | ##########################################################################
 41 | 
 42 | class MongoExporter(object):
 43 |     """
 44 |     The exporter attempts to read the MongoDB as efficiently as possible,
 45 |     writing posts to disk in either HTML or JSON format.
 46 |     """
 47 | 
 48 |     def __init__(self, root, categories=None, scheme='json'):
 49 |         self.root   = root              # Location on disk to write to
 50 |         self.scheme = scheme.lower()    # Output format of the data
 51 |         self.state  = State.Init        # Current state of the export
 52 |         self.counts = Counter()         # Counts of posts per category
 53 |         self.categories = categories    # Specific categories to export
 54 | 
 55 |         if self.scheme not in SCHEMES:
 56 |             raise ExportError(
 57 |                 "Unknown export scheme: '{}' - use one of {}.".format(
 58 |                     self.scheme, ", ".join(SCHEMES)
 59 |                 )
 60 |             )
 61 | 
 62 |     @property
 63 |     def categories(self):
 64 |         if self._categories is None:
 65 |             self._categories = Feed.objects.distinct('category')
 66 |         return self._categories
 67 | 
 68 |     @categories.setter
 69 |     def categories(self, value):
 70 |         self._categories = value
 71 | 
 72 |     def feeds(self, categories=None):
 73 |         """
 74 |         Returns a list of feeds for the specified categories.
 75 |         During export, this list is used to construct a feed-category mapping
 76 |         that is used to perform checking of sequential reads of Posts.
 77 |         """
 78 |         if isinstance(categories, basestring):
 79 |             categories = [categories]
 80 |         elif categories is None:
 81 |             categories = self.categories
 82 |         else:
 83 |             categories = list(categories)
 84 | 
 85 |         return Feed.objects(category__in=categories)
 86 | 
 87 |     def posts(self, categories=None):
 88 |         """
 89 |         This method first creates a mapping of feeds to categories, then
 90 |         iterates through the Posts collection, finding only posts with those
 91 |         given feeds (and not dereferencing the related object). This will
 92 |         speed up the post fetch process and give us more information, quickly.
 93 | 
 94 |         The generator therefore yields post, category tuples to provide for
 95 |         the single pass across the posts.
 96 | 
 97 |         This method also counts the number of posts per category.
 98 | 
 99 |         This method raises an exception if not in the correct state.
100 |         """
101 |         if self.state != State.Started:
102 |             raise ExportError((
103 |                 "Calling the posts method when not in the started state "
104 |                 "could cause double counting or multiple database reads."
105 |             ))
106 | 
107 |         # Create a mapping of feed id to category
108 |         feeds = {
109 |             feed.id: feed.category
110 |             for feed in self.feeds(categories)
111 |         }
112 | 
113 |         # Iterate through all posts that have the given feed ids without
114 |         # dereferencing the related object. Yield (post, category) tuples.
115 |         # This method also counts the number of posts per category.
116 |         for post in Post.objects(feed__in=feeds.keys()).no_dereference().no_cache():
117 |             category = feeds[post.feed.id]
118 |             self.counts[category] += 1
119 | 
120 |             yield post, category
121 | 
122 |     def readme(self, path):
123 |         """
124 |         Writes README information about the state of the export to disk at
125 |         the specified path. The writing requires the export to be finished,
126 |         otherwise, the method will raise an exception.
127 | 
128 |         This method raises an exception if not in the correct state.
129 |         """
130 |         if self.state != State.Finished:
131 |             raise ExportError((
132 |                 "Calling the readme method when not in the finished state "
133 |                 "could lead to writing misleading or incorrect meta data."
134 |             ))
135 | 
136 |         # Create the output lines with the header information.
137 |         output = [
138 |             "Baleen RSS Export",
139 |             "=================", "",
140 |             "Exported on: {}".format(datetime.now().strftime(DTFMT)),
141 |             "{} feeds containing {} posts in {} categories.".format(
142 |                 self.feeds().count(),
143 |                 sum(self.counts.values()),
144 |                 len(self.categories),
145 |             ), "",
146 |             "Category Counts",
147 |             "---------------", "",
148 |         ]
149 | 
150 |         # Append category counts list to the README
151 |         for item in sorted(self.counts.items(), key=itemgetter(0)):
152 |             output.append("- {}: {}".format(*item))
153 | 
154 |         # Add a newline at the end of the README
155 |         output.append("")
156 | 
157 |         # Write out the output to the file as utf-8.
158 |         with codecs.open(path, 'w', encoding='utf-8') as f:
159 |             f.write("\n".join(output))
160 | 
161 |     def feedinfo(self, path):
162 |         """
163 |         Writes information about the feeds to disk for performing lookups on
164 |         the feeds themselves from the object id in each individual post.
165 |         """
166 |         fields = ('id', 'title', 'link', 'category', 'active')
167 |         feeds = Feed.objects(category__in=self.categories).only(*fields)
168 |         with open(path, 'w') as f:
169 |             f.write(feeds.to_json(indent=2))
170 | 
171 |     def export(self):
172 |         """
173 |         Runs the export of the posts to disk.
174 |         """
175 | 
176 |         # Reset the counts object and mark export as started.
177 |         self.counts = Counter()
178 |         self.state  = State.Started
179 | 
180 |         # Make the directory to export if it doesn't exist.
181 |         if not os.path.exists(self.root):
182 |             os.mkdir(self.root)
183 | 
184 |         # If the root is not a directory, then we can't write there.
185 |         if not os.path.isdir(self.root):
186 |             raise ExportError(
187 |                 "'{}' is not a directory!".format(self.root)
188 |             )
189 | 
190 |         # Create the directories for each category on disk and map paths.
191 |         catdir = {}
192 |         for category in self.categories:
193 |             path = os.path.join(self.root, category)
194 | 
195 |             if not os.path.exists(path):
196 |                 os.mkdir(path)
197 | 
198 |             if not os.path.isdir(path):
199 |                 raise ExportError(
200 |                     "'{}' is not a directory!".format(path)
201 |                 )
202 | 
203 |             catdir[category] = path
204 | 
205 |         # Iterate through all posts, writing them to disk correctly.
206 |         # Right now we will simply write them based on their object id.
207 |         for post, category in self.posts():
208 |             path = os.path.join(
209 |                 self.root, catdir[category], "{}.{}".format(post.id, self.scheme)
210 |             )
211 | 
212 |             with codecs.open(path, 'w', encoding='utf-8') as f:
213 |                 action = {
214 |                     'json': lambda: post.to_json(indent=2),
215 |                     'html': post.htmlize,
216 |                 }[self.scheme]
217 | 
218 |                 f.write(action())
219 | 
220 |         # Mark the export as finished and write the README to the corpus.
221 |         self.state = State.Finished
222 |         self.readme(os.path.join(self.root, "README"))
223 |         self.feedinfo(os.path.join(self.root, "feeds.json"))
224 | 
225 | 
if __name__ == '__main__':
    # Ad hoc entry point: connect to the database and export the corpus
    # to the fixtures directory with the default settings.
    from baleen.models import connect

    connect()
    MongoExporter('fixtures/corpus').export()
232 | 


--------------------------------------------------------------------------------
/baleen/feed.py:
--------------------------------------------------------------------------------
  1 | # baleen.feed
  2 | # Handles the synchronization of an RSS feed.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Sun Sep 21 09:58:44 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: feed.py [e4baa55] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Handles the synchronization of documents from RSS feeds.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import feedparser
 21 | 
 22 | from baleen.config import settings
 23 | from baleen.models import Feed
 24 | from baleen.utils.timez import localnow
 25 | from baleen.exceptions import FeedTypeError
 26 | from baleen.exceptions import SynchronizationError
 27 | from baleen.utils.decorators import memoized, reraise, timeout
 28 | 
 29 | 
 30 | ##########################################################################
 31 | ## Module Constants
 32 | ##########################################################################
 33 | 
 34 | FEEDPARSER_IGNORABLE_FIELDS = {
 35 |     'updated', 'updated_parsed', 'id',
 36 |     'published', 'published_parsed', 'category',
 37 | }
 38 | 
 39 | 
 40 | ##########################################################################
 41 | ## Feed Synchronization
 42 | ##########################################################################
 43 | 
 44 | class FeedSync(object):
 45 |     """
 46 |     A utility that wraps both a Feed object and the feedparser library.
 47 |     The feed that is passed into the FeedSync can be one of the following:
 48 | 
 49 |         - a string representing the url to the RSS feed
 50 |         - a dictionary with an xmlUrl key (from OPML)
 51 |         - a Feed object loaded from MongoDB.
 52 | 
 53 |     The feed synchronization utility is smart enough to access what it needs.
 54 |     """
 55 | 
 56 |     URL   = "FEED_URL"
 57 |     DICT  = "FEED_DICT"
 58 |     MODEL = "FEED_MODEL"
 59 | 
 60 |     @classmethod
 61 |     def factory(klass, feeds):
 62 |         """
 63 |         Yields a feed synchronizer for each feed in the feeds.
 64 |         """
 65 |         for feed in feeds:
 66 |             yield klass(feed)
 67 | 
 68 |     def __init__(self, feed):
 69 |         """
 70 |         Feed can be a string (url), a dictionary with an `xmlUrl` or a Feed.
 71 |         """
 72 |         self.feed = feed
 73 | 
 74 |     @memoized
 75 |     def type(self):
 76 |         """
 77 |         Returns the type of the feed.
 78 |         """
 79 |         if isinstance(self.feed, basestring):
 80 |             return self.URL
 81 | 
 82 |         if isinstance(self.feed, Feed):
 83 |             return self.MODEL
 84 | 
 85 |         if isinstance(self.feed, dict):
 86 |             if 'xmlUrl' not in self.feed:
 87 |                 raise FeedTypeError(
 88 |                     "Dictionary object does not contain 'xmlUrl' key!"
 89 |                 )
 90 |             return self.DICT
 91 | 
 92 |         raise FeedTypeError(
 93 |             "Could not determine feed type from '{}'".format(type(self.feed))
 94 |         )
 95 | 
 96 |     @memoized
 97 |     def url(self):
 98 |         """
 99 |         Extracts the url from the feed based on the type.
100 |         """
101 |         return {
102 |             self.URL: lambda: self.feed,
103 |             self.DICT: lambda: self.feed.get('xmlUrl', None),
104 |             self.MODEL: lambda: self.feed.link,
105 |         }[self.type]()
106 | 
107 |     @timeout(settings.timeout)
108 |     def parse(self):
109 |         """
110 |         Wraps the feedparser.parse function such that if the feed is an model,
111 |         it uses the etag or modified to prevent duplicating the download.
112 | 
113 |         NOTE: Calling this function will NOT update the feed use sync instead!
114 |         NOTE: Exceptions in this function will not be handled by Baleen!
115 |         """
116 |         # Only models contain the etag/modified saved information.
117 |         if self.type == self.MODEL:
118 |             # If there is an etag use it (even if there is also modified)
119 |             if self.feed.etag:
120 |                 return feedparser.parse(self.url, etag=self.feed.etag)
121 | 
122 |             # If there is a modified date, then use it
123 |             if self.feed.modified:
124 |                 return feedparser.parse(self.url, modified=self.feed.modified)
125 | 
126 |         # Otherwise just return the parse of the URL
127 |         return feedparser.parse(self.url)
128 | 
129 |     @reraise(klass=SynchronizationError)
130 |     def sync(self, save=True):
131 |         """
132 |         Calls the feedparser.parse function correctly but also synchronizes
133 |         the state of the feed (e.g. last modified, etag, etc.) to MongoDB.
134 | 
135 |         Note: If the feed isn't a model, it just does the same as parse.
136 | 
137 |         If save is True (default) will save the Feed back to MongoDB.
138 |         """
139 |         # Get the result from the parse function.
140 |         result = self.parse()
141 | 
142 |         # If this is not a model, bail out and return the result.
143 |         if not self.type == self.MODEL:
144 |             return result
145 | 
146 |         # Otherwise update the model in MongoDB with synchronization info.
147 |         # Set the last fetched timestamp on the model.
148 |         self.feed.fetched = localnow()
149 | 
150 |         # Update the feed properties from the result.
151 |         for key in ('etag', 'modified', 'version'):
152 |             if key in result and getattr(result, key):
153 |                 setattr(self.feed, key, getattr(result, key))
154 | 
155 |         # Update the link via the href
156 |         if 'href' in result and result.href:
157 |             self.feed.link = result.href
158 | 
159 |         # Update the feed items from the result.
160 |         for key, val in result.feed.items():
161 |             if key in FEEDPARSER_IGNORABLE_FIELDS:
162 |                 # Ignore these generated or protected fields.
163 |                 continue
164 | 
165 |             if key == 'link':
166 |                 self.feed.urls['htmlUrl'] = val
167 | 
168 |             elif key == 'links':
169 |                 for idx, link in enumerate(val):
170 |                     if 'rel' in link:
171 |                         self.feed.urls[link['rel'] + str(idx)] = link['href']
172 |                     else:
173 |                         self.feed.urls["link{}".format(idx)] = link['href']
174 | 
175 |             else:
176 |                 setattr(self.feed, key, val)
177 | 
178 |         if save:
179 |             self.feed.save()
180 | 
181 |         return result
182 | 
183 |     def entries(self, save=True):
184 |         """
185 |         A helper function to simultaneously call sync and iterate over the
186 |         entries from the feed. This is the usual method of interacting with
187 |         the feed sync object. Note that this just returns raw dicts not Posts.
188 |         """
189 |         result = self.sync(save=save)
190 |         return result.entries
191 | 


--------------------------------------------------------------------------------
/baleen/ingest.py:
--------------------------------------------------------------------------------
  1 | # baleen.ingest
  2 | # The ingestion runner that implements ingestion for a collection of feeds.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 23:23:06 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: ingest.py [4ee79a0] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | The ingestion runner that implements ingestion for a collection of feeds.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import uuid
 21 | 
 22 | from baleen.opml import OPML
 23 | from baleen.exceptions import *
 24 | from baleen import models as db
 25 | from baleen.feed import FeedSync
 26 | from baleen.config import settings
 27 | from baleen.utils.timez import Timer
 28 | from baleen.wrangle import PostWrangler
 29 | from baleen.utils.logger import LoggingMixin
 30 | from baleen.utils.decorators import memoized
 31 | 
 32 | from datetime import datetime
 33 | from collections import Counter
 34 | 
 35 | 
 36 | ##########################################################################
 37 | ## Helper Functions
 38 | ##########################################################################
 39 | 
 40 | def stype(obj):
 41 |     """
 42 |     Returns the string of the type. Used to count exception types.
 43 |     """
 44 |     if isinstance(obj, BaleenError):
 45 |         if hasattr(obj, "original"):
 46 |             return "{} ({})".format(
 47 |                 type(obj).__name__, type(obj.original).__name__
 48 |             )
 49 |     return type(obj).__name__
 50 | 
 51 | 
 52 | ##########################################################################
 53 | ## Base Ingestion Class
 54 | ##########################################################################
 55 | 
 56 | class Ingestor(LoggingMixin):
 57 |     """
 58 |     Base class for the ingestors.
 59 | 
 60 |     Ingestors manage the synchronization of feeds, wrangling of posts, and
 61 |     fetching of web pages to store to the Mongo database. Ingestors can
 62 |     either get feeds from a list of strings, an OPML file or a Mongo query.
 63 | 
 64 |     Ingestors also perform logging and exception handling.
 65 |     """
 66 | 
 67 |     def __init__(self, feeds=None, **options):
 68 |         self.timer   = None         # Processing timer
 69 |         self.jobid   = None         # Unique job id for every run
 70 |         self.options = options      # Any other options passed in
 71 |         self._feeds  = feeds        # Allows pass in feed collection
 72 |         self.errors  = Counter()    # Count the number of error types
 73 | 
 74 |     @property
 75 |     def name(self):
 76 |         return self.__class__.__name__
 77 | 
 78 |     @memoized
 79 |     def counts(self):
 80 |         """
 81 |         Keep track of counts and ensure zero keys exist.
 82 |         """
 83 |         counts = Counter()
 84 |         for key in ('feeds', 'posts', 'errors', 'feed_error'):
 85 |             counts[key] = 0
 86 |         return counts
 87 | 
 88 |     def feeds(self):
 89 |         """
 90 |         This is the primary entry point for subclasses, they must specificy
 91 |         how to get access to a collection of feeds to syncrhonize.
 92 |         """
 93 |         if self._feeds is not None:
 94 |             return self._feeds
 95 | 
 96 |         raise IngestionError(
 97 |             "No feeds specified for {} ingestion!".format(self.name)
 98 |         )
 99 | 
100 |     def started(self):
101 |         """
102 |         Run when the ingestor is started and used for logging. Subclasses can
103 |         use it as a hook to perform extra work right before kick off.
104 |         """
105 |         message = "{} job {} started".format(self.name, self.jobid)
106 |         self.logger.info(message)
107 | 
108 |     def failed(self, exception):
109 |         """
110 |         Executed when a complete ingestion run has failed (very bad). Used
111 |         to log the exception or clean up before Baleen crashes!
112 |         """
113 |         message = "{} job {} failed!".format(self.name, self.jobid)
114 |         self.logger.error("Ingestion Error: {}".format(exception))
115 |         self.logger.critical(message)
116 | 
117 |     def finished(self):
118 |         """
119 |         Run when the ingestor has finished and used for logging. Subclasses
120 |         can use it as a hook to perform any completion work.
121 |         """
122 |         # Notify the results
123 |         results = (
124 |             "Processed {feeds} feeds ({timer}) "
125 |             "{posts} posts with {errors} errors"
126 |         ).format(
127 |             timer=self.timer, **self.counts
128 |         )
129 |         self.logger.info(results)
130 | 
131 |         # Notify job finished
132 |         message  = "{} job {} finished".format(self.name, self.jobid)
133 |         self.logger.info(message)
134 | 
135 |     def process(self):
136 |         """
137 |         Runs the ingestion process by iterating over the feeds, synchronizing
138 |         and then wrangling posts into the database as well as fetching pages.
139 |         """
140 |         for idx, fsync in enumerate(FeedSync.factory(self.feeds())):
141 |             try:
142 |                 self.process_feed(fsync)
143 |                 self.counts['feeds'] += 1
144 |             except SynchronizationError as e:
145 |                 self.counts['feed_error'] += 1
146 |                 self.errors[stype(e)] += 1
147 |                 self.logger.error(
148 |                     u"Error on Feed {} ({}): {}".format(
149 |                         idx+1, fsync.feed, unicode(e)
150 |                     )
151 |                 )
152 | 
153 |     def process_feed(self, fsync):
154 |         """
155 |         Synchronizes a feed and catches exceptions
156 |         """
157 |         factory = PostWrangler.factory(fsync.entries(), fsync.feed)
158 |         for idx, post in enumerate(factory):
159 |             try:
160 |                 self.process_post(post)
161 |                 self.counts["posts"] += 1
162 |             except WranglingError as e:
163 |                 self.counts["errors"] += 1
164 |                 self.errors[stype(e)] += 1
165 |                 self.logger.error(
166 |                     u"Post Error for feed {} on entry {}: {}".format(
167 |                         fsync.feed, idx, unicode(e)
168 |                     )
169 |                 )
170 | 
171 |     def process_post(self, post):
172 |         """
173 |         Wrangles a post from a single feed and catches exceptions
174 |         """
175 |         post.wrangle()
176 |         if settings.fetch_html:
177 |             try:
178 |                 post.fetch()
179 |             except FetchError as e:
180 |                 self.counts["fetch_error"] += 1
181 |                 self.errors[stype(e)] += 1
182 |                 self.logger.error(
183 |                     u"Fetch Error for post \"{}\" ({}): {}".format(
184 |                         post.post.title, post.post.url, unicode(e)
185 |                     )
186 |                 )
187 | 
188 |     def ingest(self):
189 |         """
190 |         Subclasses do not typically override the ingest method. Instead they
191 |         will override the process hooks for start, failed, and finish,  or the
192 |         process method directly.
193 |         """
194 |         # Set a unique job id for every time run is called.
195 |         # The job id is based on the hostname and a time sequence.
196 |         self.jobid = uuid.uuid1()
197 | 
198 |         # Call the started hook for logging and notification.
199 |         self.started()
200 | 
201 |         # Time how long it takes to perform the processing
202 |         with Timer() as self.timer:
203 |             try:
204 |                 self.process()
205 |             except Exception as e:
206 |                 # If something goes wrong, call the failed hook, then raise.
207 |                 self.failed(e)
208 |                 raise
209 | 
210 |         # Call the finished hook for logging and notification.
211 |         self.finished()
212 | 
213 | 
214 | ##########################################################################
215 | ## Mongo Ingestion Class
216 | ##########################################################################
217 | 
class MongoIngestor(Ingestor):
    """
    Ingests feeds that are stored in the database.
    This type of ingestor also tracks information into the database by
    saving a Job record when the run starts and updating it when the run
    fails or finishes.
    """

    def feeds(self):
        """
        Returns an iterator of all active feeds from the database
        """
        for feed in db.Feed.objects(active=True):
            yield feed

    def started(self):
        """
        Save a record about the job start to the database.
        """
        super(MongoIngestor, self).started()
        self.job = db.Job(jobid=self.jobid, name=self.name)
        self.job.save()

    def failed(self, exception):
        """
        Save information about the failure to the database.

        The reason is truncated to the maximum length of the Job.reason
        field; otherwise a long exception message would make the save fail
        validation here and hide the exception that caused the failure.
        """
        super(MongoIngestor, self).failed(exception)

        reason = unicode(exception)
        limit = db.Job.reason.max_length
        if limit is not None and len(reason) > limit:
            reason = reason[:limit]

        self.job.failed = True
        self.job.reason = reason
        self.job.finished = datetime.now()
        self.job.save()

    def finished(self):
        """
        Update the job record in the database with the counts and errors of
        this run as well as the current totals of feeds, posts, and jobs.
        """
        super(MongoIngestor, self).finished()
        self.job.reason = u"OK"
        self.job.finished = datetime.now()
        self.job.counts = self.counts
        self.job.errors = self.errors
        self.job.totals = {
            "feeds": db.Feed.objects.count(),
            "posts": db.Post.objects.count(),
            "jobs": db.Job.objects.count(),
        }
        self.job.save()
264 | 
265 | ##########################################################################
266 | ## OPML Ingestion Class
267 | ##########################################################################
268 | 
class OPMLIngestor(Ingestor):
    """
    Ingests the feeds listed in an OPML file rather than a fixed collection.
    """

    def __init__(self, path, **options):
        # The reader is attached before the base initializer runs.
        self.opml = OPML(path)
        super(OPMLIngestor, self).__init__(**options)

    def feeds(self):
        """
        Returns an iterator over the feeds read from the OPML file.
        """
        for outline in self.opml:
            yield outline
284 | 


--------------------------------------------------------------------------------
/baleen/models.py:
--------------------------------------------------------------------------------
  1 | # baleen.models
  2 | # Object Document Models for use with Mongo and mongoengine
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Fri Sep 19 11:30:53 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: models.py [5b443de] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Object Document Models for use with Mongo and mongoengine
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import baleen
 21 | import hashlib
 22 | import mongoengine as me
 23 | 
 24 | from datetime import datetime,timedelta
 25 | from baleen.config import settings
 26 | from baleen.utils.timez import humanizedelta
 27 | 
 28 | 
 29 | ##########################################################################
 30 | ## Module Constants
 31 | ##########################################################################
 32 | 
 33 | FEEDTYPES = (
 34 |     'atom',
 35 |     'atom01',
 36 |     'atom02',
 37 |     'atom03',
 38 |     'atom10',
 39 |     'cdf',
 40 |     'rss',
 41 |     'rss090',
 42 |     'rss091n',
 43 |     'rss092',
 44 |     'rss093',
 45 |     'rss094',
 46 |     'rss10',
 47 |     'rss20',
 48 | )
 49 | 
 50 | ##########################################################################
 51 | ## Helper Functions
 52 | ##########################################################################
 53 | 
 54 | def connect(**kwargs):
 55 |     """
 56 |     Wrapper for mongoengine connect - connects with configuration details.
 57 |     """
 58 |     name = kwargs.pop('name', settings.database.name)
 59 |     host = kwargs.pop('host', settings.database.host)
 60 |     port = kwargs.pop('port', settings.database.port)
 61 | 
 62 |     return me.connect(name, host=host, port=port, **kwargs)
 63 | 
 64 | ##########################################################################
 65 | ## Models
 66 | ##########################################################################
 67 | 
 68 | class Feed(me.DynamicDocument):
 69 | 
 70 |     version   = me.StringField(choices=FEEDTYPES)
 71 |     etag      = me.StringField()
 72 |     modified  = me.StringField()
 73 |     title     = me.StringField(max_length=256)
 74 |     link      = me.URLField(required=True, unique=True)
 75 |     urls      = me.DictField()
 76 |     category  = me.StringField(required=True)
 77 |     active    = me.BooleanField(default=True)
 78 |     fetched   = me.DateTimeField(default=None)
 79 |     created   = me.DateTimeField(default=datetime.now, required=True)
 80 |     updated   = me.DateTimeField(default=datetime.now, required=True)
 81 | 
 82 |     @classmethod
 83 |     def pre_save(cls, sender, document, **kwargs):
 84 |         document.updated = datetime.now()
 85 | 
 86 |     meta      = {
 87 |         'collection': 'feeds',
 88 |     }
 89 | 
 90 |     @property
 91 |     def xmlurl(self):
 92 |         return self.link
 93 | 
 94 |     @property
 95 |     def htmlurl(self):
 96 |         return self.urls.get('htmlUrl')
 97 | 
 98 |     def count_posts(self):
 99 |         """
100 |         Count the number of associated posts
101 | 
102 |         TODO: This is very, very slow on Mongo (fix and make better).
103 |         """
104 |         return Post.objects(feed=self).count()
105 | 
106 |     def __unicode__(self):
107 |         return self.title if self.title else self.link
108 | 
class Post(me.DynamicDocument):
    """
    A post document stored in the ``posts`` collection. Both the url and
    the signature (the SHA256 hex digest of the content) are unique.
    """

    feed      = me.ReferenceField(Feed)
    title     = me.StringField(max_length=512)
    url       = me.URLField(required=True, unique=True)
    pubdate   = me.DateTimeField()
    content   = me.StringField(required=True)
    tags      = me.ListField(me.StringField(max_length=256))
    signature = me.StringField(
        required=True, max_length=64, min_length=64, unique=True
    )
    created   = me.DateTimeField(default=datetime.now, required=True)
    updated   = me.DateTimeField(default=datetime.now, required=True)

    meta      = {
        'collection': 'posts',
    }

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        """
        Signal handler that stamps the document with the current time and
        recomputes its signature from the content.
        """
        document.updated   = datetime.now()
        document.signature = document.hash()

    def hash(self):
        """
        Returns the SHA256 hex digest of the UTF-8 encoded content.
        """
        encoded = self.content.encode('UTF-8')
        return hashlib.sha256(encoded).hexdigest()

    def htmlize(self):
        """
        Returns an HTML string of the content of the Post; currently the
        content itself, unchanged. The method exists as a stub so that
        sanitization (e.g. with bleach) or other sanity checks can be
        added in the future.
        """
        return self.content

    def __unicode__(self):
        # Prefer the title, fall back to the url when it is empty
        return self.title or self.url
149 | 
150 | 
class Job(me.DynamicDocument):
    """
    A record of a single ingestion job stored in the ``jobs`` collection:
    when it started and finished, whether it failed and why, and the
    counts, errors and totals recorded for the run.
    """

    jobid     = me.UUIDField(binary=False, required=True)
    name      = me.StringField(max_length=128, default="Unknown")
    failed    = me.BooleanField(default=False)
    reason    = me.StringField(max_length=512)
    version   = me.StringField(max_length=10, default=baleen.get_version)
    started   = me.DateTimeField(default=datetime.now, required=True)
    finished  = me.DateTimeField(default=None)
    updated   = me.DateTimeField(default=datetime.now, required=True)
    errors    = me.MapField(field=me.IntField())
    counts    = me.MapField(field=me.IntField())
    totals    = me.MapField(field=me.IntField())

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        """
        Signal handler that stamps the document with the current time.
        """
        document.updated = datetime.now()

    meta      = {
        'collection': 'jobs',
    }

    def duration(self, humanize=False):
        """
        Returns the timedelta between the start of the job and its finish,
        or between the start and now if the job has not finished. When
        humanize is True the delta is passed through humanizedelta and
        that result is returned instead.
        """
        finished = self.finished or datetime.now()
        delta = finished - self.started

        if humanize:
            return humanizedelta(
                days=delta.days,
                seconds=delta.seconds,
                microseconds=delta.microseconds
            )
        return delta

    @property
    def bootstrap_class(self):
        """
        Uses the state and duration of the job to determine its colorization.

        Finished jobs are "danger" if they failed, "warning" if they took
        more than 30 minutes and "success" otherwise. Running jobs are
        "success" up to 30 minutes, "warning" up to 2 hours, and "danger"
        from 2 hours on.
        """
        if self.finished and self.failed:
            return "danger"

        # Measure once so that every comparison sees the same value; for
        # a running job duration() depends on the current time.
        elapsed = self.duration()
        slow = timedelta(minutes=30)
        stuck = timedelta(hours=2)

        if self.finished:
            return "warning" if elapsed > slow else "success"

        # A job at exactly 30 minutes is still "success", matching the
        # threshold used for finished jobs.
        if elapsed <= slow:
            return "success"

        if elapsed < stuck:
            return "warning"

        return "danger"

    def __unicode__(self):
        return "{} Job {}".format(self.name, self.jobid)
216 | 
217 | 
class Log(me.DynamicDocument):
    """
    A log record stored in the ``logs`` collection.
    """

    level     = me.DictField()
    message   = me.StringField(max_length=4096)
    host      = me.StringField(max_length=255)
    user      = me.StringField(max_length=255)
    error     = me.DictField()
    logger    = me.StringField(max_length=255)
    asctime   = me.StringField(max_length=64)
    timestamp = me.DateTimeField()

    meta      = {
        'collection': 'logs',
    }

    @property
    def bootstrap_class(self):
        """
        Maps the ``name`` entry of the level dictionary to a bootstrap
        class; unknown or missing level names give an empty string.
        """
        styles = {
            "DEBUG": "success",
            "INFO": "info",
            "WARNING": "warning",
            "WARN": "warning",
            "ERROR": "danger",
            "CRITICAL": "danger",
        }

        level_name = self.level.get('name')
        if not level_name:
            return ""
        return styles.get(level_name, "")

    def __unicode__(self):
        return self.message
254 | 
255 | ##########################################################################
256 | ## Signals
257 | ##########################################################################
258 | 
# Each model's pre_save handler is connected exactly once for its own
# sender so that the handler runs before that kind of document is saved.
me.signals.pre_save.connect(Feed.pre_save, sender=Feed)
me.signals.pre_save.connect(Post.pre_save, sender=Post)
me.signals.pre_save.connect(Job.pre_save, sender=Job)
262 | 


--------------------------------------------------------------------------------
/baleen/opml.py:
--------------------------------------------------------------------------------
  1 | # baleen.opml
  2 | # Reads opml files and gives back outline data
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Sat Sep 20 23:12:07 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: opml.py [b2f890b] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Reads opml files and gives back outline data
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import baleen.models as db
 21 | from bs4 import BeautifulSoup
 22 | from collections import Counter
 23 | from mongoengine.errors import *
 24 | 
 25 | ##########################################################################
 26 | ## Load Database function
 27 | ##########################################################################
 28 | 
 29 | def load_opml(path):
 30 |     """
 31 |     Loads an OPML file into the Mongo database; returns the count of the
 32 |     number of documents added to the database.
 33 |     """
 34 | 
 35 |     opml = OPML(path)
 36 |     rows = 0
 37 |     for feed in opml:
 38 |         feed.pop('type')                    # Unneeded for database
 39 |         feed.pop('text')                    # Unneeded for database
 40 |         feed['link'] = feed.pop('xmlUrl')   # Rename the XML URL
 41 |         feed['urls'] = {
 42 |             'xmlUrl':  feed['link'],        # Add xmlUrl to urls
 43 |             'htmlUrl': feed.pop('htmlUrl'), # Add htmlUrl to urls
 44 |         }
 45 |         feed = db.Feed(**feed)              # Construct without an ObjectId
 46 | 
 47 |         try:
 48 |             feed.save()
 49 |             rows += 1
 50 |         except NotUniqueError:
 51 |             continue
 52 |     return rows
 53 | 
 54 | ##########################################################################
 55 | ## OPMLReader
 56 | ##########################################################################
 57 | 
 58 | class OPML(object):
 59 | 
 60 |     def __init__(self, path):
 61 |         """
 62 |         Reader for OPML XML files.
 63 |         """
 64 |         self.path = path
 65 | 
 66 |     def categories(self):
 67 |         """
 68 |         Reads the file to capture all the categories
 69 |         """
 70 |         with open(self.path, 'r') as data:
 71 |             soup = BeautifulSoup(data, 'xml')
 72 |             for topic in soup.select('body > outline'):
 73 |                 yield topic['title']
 74 | 
 75 |     def counts(self):
 76 |         """
 77 |         Returns the counts of feeds in each category
 78 |         """
 79 |         counts = Counter()
 80 |         for item in self:
 81 |             counts[item['category']] += 1
 82 |         return counts
 83 | 
 84 |     def __iter__(self):
 85 |         """
 86 |         Yields a dictionary representing the attributes of the RSS feed
 87 |         from the OPML file; also captures category data.
 88 |         """
 89 |         with open(self.path, 'r') as data:
 90 |             soup = BeautifulSoup(data, 'xml')
 91 |             for topic in soup.select('body > outline'):
 92 |                 for feed in topic.find_all('outline'):
 93 |                     data = feed.attrs.copy()
 94 |                     data['category'] = topic['title']
 95 |                     yield data
 96 | 
 97 |     def __len__(self):
 98 |         return sum(1 for item in self)
 99 | 
100 |     def __str__(self):
101 |         counts = self.counts()
102 |         return "OPML with {} categories and {} feeds".format(
103 |             len(counts), sum(counts.values())
104 |         )
105 | 
106 |     def __repr__(self):
107 |         return "<{} at {}>".format(self.__class__.__name__, self.path)
108 | 


--------------------------------------------------------------------------------
/baleen/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # baleen.utils
 2 | # Utilities and helpers functions for the Baleen project.
 3 | #
 4 | # Author:   Benjamin Bengfort <bengfort@cs.umd.edu>
 5 | # Created:  Sun Feb 21 15:00:06 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 University of Maryland
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [caaaaca] benjamin@bengfort.com $
11 | 
12 | """
13 | Utilities and helpers functions for the Baleen project.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | # For the log configuration to work
21 | from . import mongolog
22 | 


--------------------------------------------------------------------------------
/baleen/utils/decorators.py:
--------------------------------------------------------------------------------
  1 | # baleen.utils.decorators
  2 | # Decorators and function utilities for Baleen.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 19:03:43 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: decorators.py [538b33d] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Decorators and function utilities for Baleen.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import signal
 21 | from functools import wraps
 22 | from baleen.utils.timez import Timer
 23 | from baleen.exceptions import BaleenError, TimeoutError
 24 | 
 25 | ##########################################################################
 26 | ## Memoization
 27 | ##########################################################################
 28 | 
 29 | def memoized(fget):
 30 |     """
 31 |     Return a property attribute for new-style classes that only calls its
 32 |     getter on the first access. The result is stored and on subsequent
 33 |     accesses is returned, preventing the need to call the getter any more.
 34 |     https://github.com/estebistec/python-memoized-property
 35 |     """
 36 |     attr_name = '_{0}'.format(fget.__name__)
 37 | 
 38 |     @wraps(fget)
 39 |     def fget_memoized(self):
 40 |         if not hasattr(self, attr_name):
 41 |             setattr(self, attr_name, fget(self))
 42 |         return getattr(self, attr_name)
 43 | 
 44 |     return property(fget_memoized)
 45 | 
 46 | 
 47 | ##########################################################################
 48 | ## Timer functions
 49 | ##########################################################################
 50 | 
 51 | def timeit(func):
 52 |     """
 53 |     Returns the number of seconds that a function took along with the result
 54 |     """
 55 | 
 56 |     @wraps(func)
 57 |     def timer_wrapper(*args, **kwargs):
 58 |         """
 59 |         Inner function that uses the Timer context object
 60 |         """
 61 |         with Timer() as timer:
 62 |             result = func(*args, **kwargs)
 63 | 
 64 |         return result, timer
 65 | 
 66 |     return timer_wrapper
 67 | 
 68 | 
 69 | def timeout(seconds):
 70 |     """
 71 |     Raises a TimeoutError if a function does not terminate within
 72 |     specified seconds.
 73 |     """
 74 |     def _timeout_error(signal, frame):
 75 |         raise TimeoutError("Operation did not finish within \
 76 |         {} seconds".format(seconds))
 77 | 
 78 |     def timeout_decorator(func):
 79 | 
 80 |         @wraps(func)
 81 |         def timeout_wrapper(*args, **kwargs):
 82 |             signal.signal(signal.SIGALRM, _timeout_error)
 83 |             signal.alarm(seconds)
 84 |             try:
 85 |                 return func(*args, **kwargs)
 86 |             finally:
 87 |                 signal.alarm(0)
 88 | 
 89 |         return timeout_wrapper
 90 | 
 91 |     return timeout_decorator
 92 | 
 93 | ##########################################################################
 94 | ## Exception Handling
 95 | ##########################################################################
 96 | 
 97 | def reraise(klass=BaleenError, message=None, trap=Exception):
 98 |     """
 99 |     Catches exceptions (those specified by trap) and then reraises the
100 |     exception type specified by class. Also embeds the original exception as
101 |     a property of the new exception: `error.original`. Finally you can
102 |     specify another message to raise, otherwise the error string is used.
103 |     """
104 | 
105 |     def reraise_decorator(func):
106 | 
107 |         @wraps(func)
108 |         def reraise_wrapper(*args, **kwargs):
109 |             """
110 |             Capture Wrapper
111 |             """
112 |             try:
113 |                 return func(*args, **kwargs)
114 |             except trap as e:
115 |                 error = klass(message or e.message)
116 |                 error.original = e
117 |                 raise error
118 | 
119 |         return reraise_wrapper
120 | 
121 |     return reraise_decorator
122 | 


--------------------------------------------------------------------------------
/baleen/utils/logger.py:
--------------------------------------------------------------------------------
  1 | # baleen.utils.logger
  2 | # Logging utility for Baleen
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Mon Sep 22 15:47:34 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: logger.py [caaaaca] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Logging utility for Baleen
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import logging
 21 | import getpass
 22 | import warnings
 23 | import logging.config
 24 | 
 25 | from baleen.config import settings
 26 | from baleen.utils.timez import COMMON_DATETIME
 27 | 
 28 | ##########################################################################
 29 | ## Logging configuration
 30 | ##########################################################################
 31 | 
 32 | configuration = {
 33 |     'version': 1,
 34 |     'disable_existing_loggers': False,
 35 | 
 36 |     'formatters': {
 37 |         'simple': {
 38 |             'format': '%(name)s %(levelname)s [%(asctime)s] -- %(message)s',
 39 |             'datefmt': COMMON_DATETIME,
 40 |         }
 41 |     },
 42 | 
 43 |     'handlers': {
 44 |         'null': {
 45 |             'level': 'DEBUG',
 46 |             'class': 'logging.NullHandler',
 47 |         },
 48 | 
 49 |         'console': {
 50 |             'level': 'WARNING',
 51 |             'class': 'logging.StreamHandler',
 52 |             'formatter': 'simple',
 53 |         },
 54 | 
 55 |         'logfile': {
 56 |             'level': 'INFO',
 57 |             'class': 'logging.handlers.RotatingFileHandler',
 58 |             'filename': settings.logfile,
 59 |             'maxBytes': '536870912', # 512 MB
 60 |             'formatter': 'simple',
 61 |         },
 62 | 
 63 |         'mongolog': {
 64 |             'level': 'INFO',
 65 |             'class': 'baleen.utils.mongolog.MongoHandler',
 66 |         }
 67 |     },
 68 | 
 69 |     'loggers': {
 70 |         'baleen': {
 71 |             'level': settings.loglevel,
 72 |             'handlers': ['logfile'],
 73 |             'propagagte': True,
 74 |         },
 75 |         'baleen.ingest': {
 76 |             'level': 'INFO',
 77 |             'handlers': ['logfile', 'mongolog'],
 78 |             'propagate': False,
 79 |         }
 80 |     },
 81 | }
 82 | 
 83 | logging.config.dictConfigClass(configuration).configure()
 84 | if not settings.debug: logging.captureWarnings(True)
 85 | 
 86 | ##########################################################################
 87 | ## Logger utility
 88 | ##########################################################################
 89 | 
 90 | class WrappedLogger(object):
 91 |     """
 92 |     Wraps the Python logging module's logger object to ensure that all baleen
 93 |     logging happens with the correct configuration as well as any extra
 94 |     information that might be required by the log file (for example, the user
 95 |     on the machine, hostname, IP address lookup, etc).
 96 | 
 97 |     Subclasses must specify their logger as a class variable so all instances
 98 |     have access to the same logging object.
 99 |     """
100 | 
101 |     logger = None
102 | 
103 |     def __init__(self, **kwargs):
104 |         self.raise_warnings = kwargs.pop('raise_warnings', settings.debug)
105 |         self.logger = kwargs.pop('logger', self.logger)
106 | 
107 |         if not self.logger or not hasattr(self.logger, 'log'):
108 |             raise TypeError(
109 |                 "Subclasses must specify a logger, not {}"
110 |                 .format(type(self.logger))
111 |             )
112 | 
113 |         self.extras = kwargs
114 | 
115 |     def log(self, level, message, *args, **kwargs):
116 |         """
117 |         This is the primary method to override to ensure logging with extra
118 |         options gets correctly specified.
119 |         """
120 |         extra = self.extras.copy()
121 |         extra.update(kwargs.pop('extra', {}))
122 | 
123 |         kwargs['extra'] = extra
124 |         self.logger.log(level, message, *args, **kwargs)
125 | 
126 |     def debug(self, message, *args, **kwargs):
127 |         return self.log(logging.DEBUG, message, *args, **kwargs)
128 | 
129 |     def info(self, message, *args, **kwargs):
130 |         return self.log(logging.INFO, message, *args, **kwargs)
131 | 
132 |     def warning(self, message, *args, **kwargs):
133 |         """
134 |         Specialized warnings system. If a warning subclass is passed into
135 |         the keyword arguments and raise_warnings is True - the warnning will
136 |         be passed to the warnings module.
137 |         """
138 |         warncls = kwargs.pop('warning', None)
139 |         if warncls and self.raise_warnings:
140 |             warnings.warn(message, warncls)
141 | 
142 |         return self.log(logging.WARNING, message, *args, **kwargs)
143 | 
144 |     # Alias warn to warning
145 |     warn = warning
146 | 
147 |     def error(self, message, *args, **kwargs):
148 |         return self.log(logging.ERROR, message, *args, **kwargs)
149 | 
150 |     def critical(self, message, *args, **kwargs):
151 |         return self.log(logging.CRITICAL, message, *args, **kwargs)
152 | 
153 | 
154 | ##########################################################################
155 | ## The Ingestion Logger Class
156 | ##########################################################################
157 | 
class IngestLogger(WrappedLogger):
    """
    Performs logging for the baleen process on the 'baleen.ingest' logger
    and adds the current user to the extras of every message.
    """

    logger = logging.getLogger('baleen.ingest')

    def __init__(self, **kwargs):
        self._user = kwargs.pop('user', None)
        super(IngestLogger, self).__init__(**kwargs)

    @property
    def user(self):
        """
        The user passed to the constructor, or the user reported by
        getpass if none was given (looked up on first access).
        """
        if not self._user:
            self._user = getpass.getuser()
        return self._user

    def log(self, level, message, *args, **kwargs):
        """
        Provide current user as extra context to the logger

        The ``extra`` dictionary of the caller is copied before the user is
        added so that the caller's own dictionary is left untouched.
        """
        extra = dict(kwargs.pop('extra', None) or {})
        extra['user'] = self.user

        kwargs['extra'] = extra
        super(IngestLogger, self).log(level, message, *args, **kwargs)
186 | 
187 | 
188 | ##########################################################################
189 | ## Logging Mixin
190 | ##########################################################################
191 | 
class LoggingMixin(object):
    """
    Mix in to classes that need their own logging object!
    """

    @property
    def logger(self):
        """
        Returns this object's IngestLogger, creating it on first access
        """
        existing = getattr(self, '_logger', None)
        if existing:
            return existing

        # Nothing cached yet (or a falsy placeholder): build a new logger.
        self._logger = IngestLogger()
        return self._logger
205 | 


--------------------------------------------------------------------------------
/baleen/utils/mongolog.py:
--------------------------------------------------------------------------------
  1 | # baleen.utils.mongolog
  2 | # Handlers and formatters for logging to Mongo
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Tue Sep 23 09:11:52 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: mongolog.py [caaaaca] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Handlers and formatters for logging to Mongo
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import getpass
 21 | import logging
 22 | import logging.config
 23 | from baleen.utils.timez import *
 24 | from baleen.config import settings
 25 | 
 26 | from datetime import datetime
 27 | from socket import gethostname
 28 | from pymongo import MongoClient
 29 | from pymongo.errors import PyMongoError
 30 | 
 31 | ##########################################################################
 32 | ## Mongo Formatter/Handler
 33 | ##########################################################################
 34 | 
 35 | class MongoFormatter(logging.Formatter):
 36 | 
 37 |     def __init__(self, fmt='%(name)s %(levelname)s [%(asctime)s] -- %(message)s', datefmt=COMMON_DATETIME):
 38 |         super(MongoFormatter, self).__init__(fmt, datefmt)
 39 | 
 40 |     def format(self, record):
 41 |         """
 42 |         Formats LogRecord into a Python dictionary
 43 |         """
 44 | 
 45 |         ## Get the dictionary ready for Mongo
 46 |         data    = record.__dict__.copy()
 47 | 
 48 |         ## Get the log message as intended via super
 49 |         message   = super(MongoFormatter, self).format(record)
 50 |         timestamp = datetime.fromtimestamp(data.pop('created'))
 51 |         location  = {
 52 |             'module': data.pop('module'),
 53 |             'file': data.pop('pathname'),
 54 |             'filename': data.pop('filename'),
 55 |             'lineno': data.pop('lineno'),
 56 |             'method': data.pop('funcName')
 57 |         }
 58 |         error     = {
 59 |             'info': data.pop('exc_info'),
 60 |             'text': data.pop('exc_text'),
 61 |         }
 62 |         process   = {
 63 |             'process': data.pop('process'),
 64 |             'processName': data.pop('processName'),
 65 |             'thread': data.pop('thread'),
 66 |             'threadName': data.pop('threadName'),
 67 |         }
 68 |         logger    = data.pop('name')
 69 |         level     = {
 70 |             'number': data.pop('levelno'),
 71 |             'name': data.pop('levelname'),
 72 |         }
 73 |         info      = tuple(unicode(arg) for arg in data.pop('args'))
 74 | 
 75 |         for key in ('relativeCreated', 'msecs', 'msg'):
 76 |             del data[key]
 77 | 
 78 |         data.update({
 79 |             'logger': logger,
 80 |             # 'process': process,
 81 |             'message': message,
 82 |             'timestamp': timestamp,
 83 |             'level': level,
 84 |             # 'location': location,
 85 |             'error': error,
 86 |             'user': getpass.getuser(),
 87 |             'host': gethostname(),
 88 |             # 'info': info,
 89 |         })
 90 | 
 91 |         return data
 92 | 
 93 | class MongoHandler(logging.Handler):
 94 | 
 95 |     def __init__(self, level=logging.NOTSET, **kwargs):
 96 |         super(MongoHandler, self).__init__(level)
 97 |         self.host            = kwargs.get('host', settings.database.host)
 98 |         self.port            = kwargs.get('port', settings.database.port)
 99 |         self.database_name   = kwargs.get('database', settings.database.name)
100 |         self.collection_name = kwargs.get('collection', 'logs')
101 |         self.fail_silently   = kwargs.get('fail_silently', False)
102 |         self.formatter       = kwargs.get('formatter', MongoFormatter())
103 | 
104 |         self.connection      = None
105 |         self.database        = None
106 |         self.collection      = None
107 |         self.connect()
108 | 
109 |     def connect(self):
110 |         """
111 |         Connect to the Mongo database.
112 |         """
113 |         try:
114 |             self.connection = MongoClient(host=self.host, port=self.port)
115 |         except PyMongoError:
116 |             if self.fail_silently:
117 |                 return
118 |             else:
119 |                 raise
120 | 
121 |         self.database   = self.connection[self.database_name]
122 |         self.collection = self.database[self.collection_name]
123 | 
124 |     def close(self):
125 |         """
126 |         Close the connection to the Mongo database.
127 |         """
128 |         if self.connection is not None:
129 |             self.connection = None
130 | 
131 |     def emit(self, record):
132 |         """
133 |         Insert log record into Mongo database
134 |         """
135 |         if self.collection is not None:
136 |             try:
137 |                 self.collection.insert(self.format(record))
138 |             except Exception:
139 |                 if not self.fail_silently:
140 |                     self.handleError(record)
141 | 
if __name__ == '__main__':
    # Manual smoke test: attach a MongoHandler (default settings) to a
    # throwaway 'demo' logger and emit a single INFO record through it.
    logger = logging.getLogger('demo')
    logger.setLevel(logging.INFO)
    logger.addHandler(MongoHandler())
    logger.info("This is a test of the logging system")
147 | 


--------------------------------------------------------------------------------
/baleen/utils/timez.py:
--------------------------------------------------------------------------------
  1 | # baleen.utils.timez
  2 | # Utility functions for Baleen
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Mon Sep 22 10:14:57 2014 -0400
  6 | #
  7 | # Copyright (C) 2014 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: timez.py [caaaaca] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Utility functions for Baleen
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import re
 21 | import time
 22 | 
 23 | from dateutil.tz import tzlocal, tzutc
 24 | from datetime import datetime, timedelta
 25 | from dateutil.relativedelta import relativedelta
 26 | 
 27 | ##########################################################################
 28 | ## Format constants
 29 | ##########################################################################
 30 | 
# strftime/strptime patterns used across Baleen. Patterns that contain %z
# should be parsed with strptimez() in this module, whose docstring notes
# that %z is not handled by default in Python 2.7.
HUMAN_DATETIME   = "%a %b %d %H:%M:%S %Y %z"
HUMAN_DATE       = "%b %d, %Y"
HUMAN_TIME       = "%I:%M:%S %p"
JSON_DATETIME    = "%Y-%m-%dT%H:%M:%S.%fZ" # Must be UTC
ISO8601_DATETIME = "%Y-%m-%dT%H:%M:%S%z"
ISO8601_DATE     = "%Y-%m-%d"
ISO8601_TIME     = "%H:%M:%S"
COMMON_DATETIME  = "%d/%b/%Y:%H:%M:%S %z"
WEB_UTC_DATETIME = "%a, %b %d, %Y at %H:%M UTC"
 40 | 
 41 | ##########################################################################
 42 | ## Module helper function
 43 | ##########################################################################
 44 | 
 45 | def localnow():
 46 |     return datetime.now(tzlocal())
 47 | 
 48 | 
 49 | def utcnow():
 50 |     now = datetime.utcnow()
 51 |     now = now.replace(tzinfo=tzutc())
 52 |     return now
 53 | 
 54 | 
 55 | zre = re.compile(r'([\-\+]\d{4})')
 56 | def strptimez(dtstr, dtfmt):
 57 |     """
 58 |     Helper function that performs the timezone calculation to correctly
 59 |     compute the '%z' format that is not added by default in Python 2.7.
 60 |     """
 61 |     if '%z' not in dtfmt:
 62 |         return datetime.strptime(dtstr, dtfmt)
 63 | 
 64 |     dtfmt  = dtfmt.replace('%z', '')
 65 |     offset = int(zre.search(dtstr).group(1))
 66 |     dtstr  = zre.sub('', dtstr)
 67 |     delta  = timedelta(hours = offset/100)
 68 |     utctsp = datetime.strptime(dtstr, dtfmt) - delta
 69 |     return utctsp.replace(tzinfo=tzutc())
 70 | 
 71 | 
 72 | def humanizedelta(*args, **kwargs):
 73 |     """
 74 |     Wrapper around dateutil.relativedelta (same construtor args) and returns
 75 |     a humanized string representing the detla in a meaningful way.
 76 |     """
 77 |     if 'milliseconds' in kwargs:
 78 |         sec  = kwargs.get('seconds', 0)
 79 |         msec = kwargs.pop('milliseconds')
 80 |         kwargs['seconds'] = sec + (float(msec) / 1000.0)
 81 | 
 82 |     delta = relativedelta(*args, **kwargs)
 83 |     attrs = ('years', 'months', 'days', 'hours', 'minutes', 'seconds')
 84 |     parts = [
 85 |         '%d %s' % (getattr(delta, attr), getattr(delta, attr) > 1 and attr or attr[:-1])
 86 |         for attr in attrs if getattr(delta, attr)
 87 |     ]
 88 | 
 89 |     return " ".join(parts)
 90 | 
 91 | 
 92 | ##########################################################################
 93 | ## Timer functions
 94 | ##########################################################################
 95 | 
 96 | 
 97 | class Timer(object):
 98 |     """
 99 |     A context object timer. Usage:
100 |         >>> with Timer() as timer:
101 |         ...     do_something()
102 |         >>> print timer.elapsed
103 |     """
104 | 
105 |     def __init__(self, wall_clock=True):
106 |         """
107 |         If wall_clock is True then use time.time() to get the number of
108 |         actually elapsed seconds. If wall_clock is False, use time.clock to
109 |         get the process time instead.
110 |         """
111 |         self.wall_clock = wall_clock
112 |         self.time = time.time if wall_clock else time.clock
113 | 
114 |         # Stubs for serializing an empty timer.
115 |         self.started  = None
116 |         self.finished = None
117 |         self.elapsed  = 0.0
118 | 
119 |     def __enter__(self):
120 |         self.started  = self.time()
121 |         return self
122 | 
123 |     def __exit__(self, typ, value, tb):
124 |         self.finished = self.time()
125 |         self.elapsed  = self.finished - self.started
126 | 
127 |     def __str__(self):
128 |         return humanizedelta(seconds=self.elapsed)
129 | 


--------------------------------------------------------------------------------
/baleen/version.py:
--------------------------------------------------------------------------------
 1 | # baleen.version
 2 | # Stores version information such that it can be read by setuptools.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Thu Feb 18 20:14:16 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: version.py [edff1dd] benjamin@bengfort.com $
11 | 
12 | """
13 | Stores version information such that it can be read by setuptools.
14 | """
15 | 
16 | ##########################################################################
17 | ## Version Information
18 | ##########################################################################
19 | 
# Components of the package version, combined by get_version() below.
__version_info__ = {
    'major': 0,
    'minor': 3,
    'micro': 3,
    'releaselevel': 'final',
    'serial': 0,
}


def get_version(short=False):
    """
    Builds the version string from __version_info__: "major.minor", then
    ".micro" when micro is non-zero, then (unless short is set or the
    release is final) the first letter of the release level followed by
    the serial number.
    """
    info = __version_info__
    assert info['releaselevel'] in ('alpha', 'beta', 'final')

    version = "%(major)i.%(minor)i" % info

    if info['micro']:
        version += ".%(micro)i" % info

    if not short and info['releaselevel'] != 'final':
        version += '%s%i' % (info['releaselevel'][0], info['serial'])

    return version
41 | 


--------------------------------------------------------------------------------
/baleen/wrangle.py:
--------------------------------------------------------------------------------
  1 | # baleen.wrangle
  2 | # Wrangles the post objects from a synchronized feed.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 21:52:49 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: wrangle.py [568d540] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Wrangles the post objects from a synchronized feed.
 14 | 
 15 | Feed objects don't require a lot of wrangling, and are handled primarily by
 16 | the FeedSync object. However Posts do require some hoop jumping, which this
 17 | module provides.
 18 | """
 19 | 
 20 | ##########################################################################
 21 | ## Imports
 22 | ##########################################################################
 23 | 
 24 | import requests
 25 | 
 26 | from copy import deepcopy
 27 | from dateutil import parser as dtparser
 28 | 
 29 | from baleen.config import settings
 30 | from baleen.models import Post
 31 | from baleen.utils.decorators import reraise
 32 | from baleen.exceptions import WranglingError, FetchError
 33 | 
 34 | ##########################################################################
 35 | ## Module Constants
 36 | ##########################################################################
 37 | 
# Entry keys that PostWrangler.wrangle deletes from its working copy of
# the entry before the Post object is constructed.
FEEDPARSER_REMOVABLE_FIELDS = (
    'id', 'published_parsed', 'expired_parsed',
    'updated', 'updated_parsed',  'created', 'created_parsed',
)
 42 | 
 43 | ##########################################################################
 44 | ## Post Wrangling Object
 45 | ##########################################################################
 46 | 
 47 | class PostWrangler(object):
 48 |     """
 49 |     As FeedSync wraps Feed to do work, so to does PostWrangler wrap an entry
 50 |     to create a Post object, to ensure that data is of a high quality, and to
 51 |     do extra things like fetch the full webpage from the URL provided.
 52 | 
 53 |     This object directly converts its input (a dict) to a models.Post object.
 54 |     """
 55 | 
 56 |     @classmethod
 57 |     def factory(klass, entries, feed=None):
 58 |         """
 59 |         Yields a post wrangler for each entry in the entries.
 60 |         """
 61 |         for entry in entries:
 62 |             yield klass(deepcopy(entry), feed=feed)
 63 | 
 64 |     def __init__(self, entry, feed=None):
 65 |         """
 66 |         Entry is expected to be the dictionary object from a FeedSync
 67 |         After wrangling, it will become a models.Post object.
 68 |         """
 69 |         self.feed = feed
 70 |         self.post = entry
 71 | 
 72 |     def is_wrangled(self):
 73 |         """
 74 |         Checks the class of the post to see if wrangling has occurred.
 75 |         """
 76 |         return isinstance(self.post, Post)
 77 | 
 78 |     @reraise(klass=WranglingError)
 79 |     def wrangle(self, save=True):
 80 |         """
 81 |         Converts the raw entry to standard data. If save, saves to database.
 82 | 
 83 |         Metholodolgy of wrangling is as follows:
 84 | 
 85 |             - all fields are kept in the entry except `published` and
 86 |               `published_parsed` since these many not contain TZ data -
 87 |               instead these two fields are replaced by `pubdate`. If there
 88 |               is no publication date, `pubdate` is set to None.
 89 | 
 90 |             - the tags field, if it exists, is converted to a list of
 91 |               strings. Although this may cause some data loss; it will
 92 |               make tagging of all posts simpler for the application.
 93 | 
 94 |             - link will be renamed url
 95 | 
 96 |             - content will be populated with summary, if content does not
 97 |               exist in the feed. Supposedly feedparser was already doing
 98 |               this, but it appears to not be regular.
 99 | 
100 |             - title, url, content, and tags will all be encoded UTF-8.
101 | 
102 |             - removes the id field so a Mongo generated ObjectID is stored.
103 | 
104 |         See the models.Post for more information on the data structure.
105 | 
106 |         NOTE: This method is destructive, the raw entry will be converted.
107 |         """
108 |         ## Don't rewrangle an already wrangled post
109 |         if self.is_wrangled():
110 |             return self.post
111 | 
112 |         ## Saves typing self.post everywhere
113 |         post = self.post.copy()
114 | 
115 |         ## Remove unwanted fields
116 |         for field in FEEDPARSER_REMOVABLE_FIELDS:
117 |             if field in post: del post[field]
118 | 
119 |         ## Handle the pubdate and published strings
120 |         post['pubdate'] = dtparser.parse(post.pop('published')) if 'published' in post else None
121 | 
122 |         ## Handle the tags in the entry
123 |         post['tags'] = [tag['term'] for tag in self.post.tags] if 'tags' in post else []
124 | 
125 |         ## Rename the link field to url
126 |         post['url'] = self.post.link or post.get('href', None) or self.post.id
127 |         if 'link' in post: del post['link']
128 | 
129 |         ## Handle the content
130 |         if 'content' not in post:
131 |             post['content'] = post.get('summary')
132 |         else:
133 |             selected = None
134 |             for idx, item in enumerate(post['content']):
135 |                 if idx == 0:
136 |                     # Take the first item
137 |                     selected = item
138 |                 elif item['type'] == 'text/html':
139 |                     # Unless we find another item that is html
140 |                     selected = item
141 | 
142 |             # Update the post with the content info
143 |             post['language'] = selected.get('language')
144 |             post['mimetype'] = selected.get('type')
145 |             post['content']  = selected.get('value')
146 | 
147 |         ## Create the post object
148 |         ## Start using self.post here!
149 |         self.post = Post(feed=self.feed, **post)
150 |         if save:
151 |             self.post.save()
152 | 
153 |         return self.post
154 | 
155 |     @reraise(klass=FetchError)
156 |     def fetch(self, save=True):
157 |         """
158 |         Fetches the entire webpage for the post. If save, adds the page to
159 |         the content of the post and saves it back to the database.
160 | 
161 |         Raises an exception if not wrangled yet.
162 |         Raises exceptions if there is a problem with the fetch.
163 |         """
164 |         if not self.is_wrangled():
165 |             raise ValueError("Entry not yet wrangled, cannot fetch.")
166 | 
167 |         response = requests.get(self.post.url, timeout=settings.timeout)
168 |         response.raise_for_status()
169 | 
170 |         if response.text:
171 |             self.post.content = response.text
172 | 
173 |         if save:
174 |             self.post.save()
175 | 
176 |         return self.post
177 | 


--------------------------------------------------------------------------------
/baleen/www/__init__.py:
--------------------------------------------------------------------------------
 1 | # baleen.www
 2 | # A small web application that will allow us to manage the Baleen app.
 3 | #
 4 | # Author:   Laura Lorenz <lalorenz6@gmail.com>
 5 | # Created:  Sun Apr 3 12:59:42 2016 -0400
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [] benjamin@bengfort.com $
11 | 
12 | """
13 | A small web application that will allow us to manage the Baleen app.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 


--------------------------------------------------------------------------------
/baleen/www/app.py:
--------------------------------------------------------------------------------
  1 | # baleen.www.app
  2 | # Flask application definition in Baleen.
  3 | #
  4 | # Author:   Laura Lorenz <lalorenz6@gmail.com>
  5 | # Created:  Sun Apr 3 12:59:42 2016 -0400
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: app.py [] lalorenz6@gmail.com $
 11 | 
 12 | """
 13 | Flask application definition in Baleen.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import baleen
 21 | 
 22 | from baleen.config import settings
 23 | from baleen.models import Feed, Post, Job, Log
 24 | from baleen.utils.timez import WEB_UTC_DATETIME
 25 | 
 26 | from flask import Flask, render_template, request
 27 | from flask.ext.mongoengine import MongoEngine
 28 | from flask_humanize import Humanize
 29 | 
 30 | ##########################################################################
 31 | ## Flask Application
 32 | ##########################################################################
 33 | 
# set up the Flask application instance that the routes below attach to
app = Flask(__name__)

# debug mode follows the Baleen settings (debug pages on errors when true)
app.debug = settings.debug

# point the MongoEngine extension at the database named in the settings
app.config['MONGODB_SETTINGS'] = {
    'db':   settings.database.name,
    'host': settings.database.host,
    'port': settings.database.port,
}

# connect to the database using the Flask extension
db = MongoEngine(app)

# add the humanize extension
humanize = Humanize(app)
 52 | 
 53 | ##########################################################################
 54 | ## Routes
 55 | ##########################################################################
 56 | 
 57 | @app.route("/")
 58 | def index():
 59 |     """
 60 |     Displays an index page with the feed listing
 61 |     """
 62 |     # get all the stuff we want
 63 |     feeds = Feed.objects()
 64 |     feed_count = feeds.count()
 65 |     topics = set([feed.category for feed in Feed.objects.only('category')])
 66 |     feeds_topics_counts = len(topics)
 67 | 
 68 |     # TODO: probably should put this in the database along with the feed.
 69 |     feed_icons = {'gaming':'fa fa-gamepad',
 70 |                   'design':'fa fa-building-o',
 71 |                   'business':'fa fa-briefcase',
 72 |                   'cinema':'fa fa-video-camera',
 73 |                   'data-science':'fa fa-area-chart',
 74 |                   'cooking':'fa fa-cutlery',
 75 |                   'sports':'fa fa-futbol-o',
 76 |                   'books':'fa fa-book',
 77 |                   'tech':'fa fa-cogs',
 78 |                   'politics':'fa fa-university',
 79 |                   'news':'fa fa-newspaper-o',
 80 |                   'essays':'fa fa-pencil-square-o',
 81 |                   'do-it-yourself':'fa fa-wrench'
 82 |                  }
 83 |     feeds_topics = {
 84 |         topic: Feed.objects(category=topic)
 85 |         for topic in topics
 86 |     }
 87 | 
 88 |     # load all the data into the templates/feed_list.html template
 89 |     return render_template('index.html',
 90 |                            feeds=feeds,
 91 |                            feeds_topics=feeds_topics,
 92 |                            feed_count=feed_count,
 93 |                            topic_count=feeds_topics_counts,
 94 |                            feed_icons=feed_icons)
 95 | 
 96 | @app.route("/status/")
 97 | def status():
 98 |     """
 99 |     Displays the current Baleen status and job listing
100 |     """
101 |     version = baleen.get_version()
102 |     counts = {
103 |         'feeds': Feed.objects.count(),
104 |         'posts': Post.objects.count(),
105 |         'jobs':  Job.objects.count(),
106 |     }
107 |     latest_job = Job.objects.order_by('-started').first()
108 |     latest_feed = Feed.objects.order_by('-updated').first()
109 |     latest_post = Post.objects.order_by('-id').first()
110 |     recent_jobs = Job.objects.order_by('-started').limit(10)
111 | 
112 |     # load all data into job_status template
113 |     return render_template(
114 |         'status.html',
115 |         latest_job=latest_job,
116 |         latest_feed=latest_feed,
117 |         latest_post=latest_post,
118 |         version=version,
119 |         counts=counts,
120 |         dtfmt=WEB_UTC_DATETIME,
121 |         recent_jobs=recent_jobs
122 |     )
123 | 
124 | 
@app.route("/logs/")
def logs():
    """
    Displays log records from the Mongo Database.
    This is paginated and allows flexible per-page counts (max 200 record).

    Query parameters: ``page`` (default 1, minimum 1) and ``per_page``
    (default 50, clamped to 1..200). Values that are not integers fall
    back to the defaults instead of producing a server error.
    """
    # Get pagination information for request; type=int makes the lookup
    # return the default when the value cannot be converted.
    page     = max(request.args.get('page', 1, type=int), 1)
    per_page = request.args.get('per_page', 50, type=int)
    per_page = max(1, min(per_page, 200))

    # Compute the pagination variables. The page count must round up so
    # that a final, partially filled page is still reachable.
    n_logs   = Log.objects.count()
    n_pages  = (n_logs + per_page - 1) // per_page
    nextp    = page + 1 if page < n_pages else None
    prevp    = page - 1 if page > 1 else None

    # Perform query (newest records first)
    offset   = (page - 1) * per_page
    logs     = Log.objects.order_by('-id').skip(offset).limit(per_page)

    return render_template(
        'logs.html',
        page = page,
        num_pages = n_pages,
        per_page  = per_page,
        logs = logs,
        num_logs = n_logs,
        next = nextp,
        prev = prevp,
    )
155 | 
156 | 
157 | ##########################################################################
158 | ## Main Method
159 | ##########################################################################
160 | 
if __name__ == "__main__":
    # if you run this file as a script, it will start the flask server
    # on the host and port given in the server section of the settings
    app.run(host=settings.server.host, port=settings.server.port)
164 | 


--------------------------------------------------------------------------------
/baleen/www/static/css/baleen.css:
--------------------------------------------------------------------------------
 1 | /* Baleen specific styles for various things. */
 2 | 
 3 | html,
 4 | body {
 5 |   height: 100%;
 6 | }
 7 | 
 8 | /*body {
 9 |   padding-top: 70px;
10 | }*/
11 | 
12 | /* Wrapper for page content to push down footer */
13 | #wrap {
14 |   min-height: 100%;
15 |   height: auto;
16 |   /* Negative indent footer by its height */
17 |   margin: 0 auto -76px;
18 |   /* Pad bottom by footer height */
19 |   padding: 0 0 106px;
20 | }
21 | 
22 | .navbar-brand-img {
23 |     width: 22px;
24 |     height: 22px;
25 |     float: left;
26 |     margin: -3px 4px 0 0;
27 | }
28 | 
29 | /* Set the fixed height of the footer here */
30 | #footer {
31 |     background-color: #fff;
32 |     border-top: 1px solid #eee;
33 |     height: 76px;
34 |     padding: 30px 15px;
35 | }
36 | 
37 | .app-tabs .tab-pane {
38 |     margin-top: 20px;
39 | }
40 | 
41 | #loading img {
42 |     margin: 200px auto;
43 | }
44 | 
45 | th {
46 |     background-color: #008CBA;
47 |     color: white;
48 | } 
49 | 


--------------------------------------------------------------------------------
/baleen/www/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/baleen/www/static/favicon.png


--------------------------------------------------------------------------------
/baleen/www/templates/base.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html class="no-js" lang="">
 3 |   <head>
 4 |     {% block meta %}
 5 |     <meta charset="utf-8">
 6 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
 7 |     <meta name="author" content="{% block author %}Sasan Bahadaran{% endblock %}">
 8 |     <meta name="keywords" content="{% block keywords %}baleen, ingestion, district data labs, nlp{% endblock %}">
 9 |     <meta name="description" content="{% block description %}An automated ingestion service for blogs to construct a corpus for NLP research.{% endblock %}">
10 |     <meta name="viewport" content="width=device-width, initial-scale=1">
11 |     {% endblock %}
12 | 
13 |     <title>{% block title %}Baleen Status{% endblock %}</title>
14 | 
15 |     <!-- Shortcut Icons -->
16 |     <link rel="shortcut icon" href="{{ url_for('static', filename='favicon.png') }}">
17 | 
18 |     {% block stylesheets %}
19 |     <!-- CDN CSS Files -->
20 |     <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" crossorigin="anonymous">
21 |     <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet" integrity="sha384-XdYbMnZ/QjLh6iI4ogqCTaIjrFk87ip+ekIjefZch0Y+PvJ8CDYtEs1ipDmPorQ+" crossorigin="anonymous">
22 |     <link href="https://maxcdn.bootstrapcdn.com/bootswatch/3.3.6/yeti/bootstrap.min.css" rel="stylesheet" integrity="sha384-yxFy3Tt84CcGRj9UI7RA25hoUMpUPoFzcdPtK3hBdNgEGnh9FdKgMVM+lbAZTKN2" crossorigin="anonymous">
23 | 
24 |     <!-- Static CSS Files -->
25 |     <link href="{{ url_for('static', filename='css/baleen.css') }}" rel="stylesheet">
26 |     {% endblock %}
27 |   </head>
28 |   <body>
29 | 
30 |     <div id="wrap">
31 | 
32 |       {% block navbar %}
33 |         {% include 'components/navbar.html' %}
34 |       {% endblock %}
35 | 
36 |       <!-- Begin page content -->
37 |       <div id="content">
38 |       {% block body %}
39 |       {% endblock %}
40 |       </div>
41 | 
42 |     </div>
43 | 
44 | 
45 |     {% block footer %}
46 |       {% include 'components/footer.html' %}
47 |     {% endblock %}
48 | 
49 |     {% block modals %}{% endblock %}
50 | 
51 |     {% block javascripts %}
52 |     <!-- CDN JS Files -->
53 |     <script src="https://code.jquery.com/jquery-2.2.3.min.js" integrity="sha256-a23g1Nt4dtEYOj7bR+vTu7+T8VP13humZFBJNIYoEJo=" crossorigin="anonymous"></script>
54 |     <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" crossorigin="anonymous"></script>
55 |     {% endblock %}
56 | 
57 |   </body>
58 | </html>
59 | 


--------------------------------------------------------------------------------
/baleen/www/templates/components/footer.html:
--------------------------------------------------------------------------------
 1 | <div id="footer">
 2 |   <div class="container">
 3 |     <div class="row">
 4 |       <div class="col-xs-6">
 5 |         <ul class="list-unstyled list-inline">
 6 |           <li><a href="http://baleen-ingest.readthedocs.org/en/latest/"><i class="fa fa-book"></i></a></li>
 7 |           <li><a href="https://github.com/bbengfort/baleen/"><i class="fa fa-github-square"></i></a></li>
 8 |           <li><a href="https://waffle.io/bbengfort/baleen/"><i class="fa fa-trello"></i></a></li>
 9 |         </ul>
10 |       </div>
11 |       <div class="col-xs-6">
12 |         <p class="text-right text-muted">
13 |           Created by <a href="https://github.com/bahadasx/">@bahadasx</a>, <a href="https://github.com/lauralorenz/">@lauralorenz</a>, and <a href="https://github.com/bbengfort/">@bbengfort</a>
14 |         </p>
15 |       </div>
16 |     </div>
17 |   </div>
18 | </div>
19 | 


--------------------------------------------------------------------------------
/baleen/www/templates/components/navbar.html:
--------------------------------------------------------------------------------
 1 | <nav class="navbar navbar-default">
 2 |  <div class="container">
 3 |    <!-- Brand and toggle get grouped for better mobile display -->
 4 |    <div class="navbar-header">
 5 |      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-collapse-left">
 6 |        <span class="sr-only">Toggle navigation</span>
 7 |        <span class="icon-bar"></span>
 8 |        <span class="icon-bar"></span>
 9 |        <span class="icon-bar"></span>
10 |      </button>
11 |      <a class="navbar-brand" href="/">Baleen</a>
12 |    </div>
13 | 
14 |    <!-- Collect the nav links, forms, and other content for toggling -->
15 |    <div class="collapse navbar-collapse" id="bs-collapse-left">
16 |      <ul class="nav navbar-nav">
17 |       <li>
18 |          <a href="/">
19 |            <i class="fa fa-rss"></i>
20 |            Feeds
21 |          </a>
22 |       </li>
23 |       <li>
24 |         <a href="/status/">
25 |           <i class="fa fa-heartbeat"></i>
26 |           Status
27 |         </a>
28 |       </li>
29 |       <li>
30 |         <a href="/logs/">
31 |           <i class="fa fa-book"></i>
32 |           Logs
33 |         </a>
34 |       </li>
35 |      </ul>
36 | 
37 |      <ul class="nav navbar-nav navbar-right">
38 |        <li class="dropdown">
39 |          <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">
40 |            <i class="fa fa-cogs"></i> Resources <span class="caret"></span></a>
41 |          <ul class="dropdown-menu" role="menu">
42 |            <li>
43 |              <a href="http://baleen-ingest.readthedocs.org/en/latest/">
44 |                <i class="fa fa-book"></i>
45 |                Documentation
46 |              </a>
47 |            </li>
48 |            <li>
49 |              <a href="https://github.com/bbengfort/baleen/">
50 |                <i class="fa fa-github-square"></i>
51 |                Repository
52 |              </a>
53 |            </li>
54 |            <li>
55 |              <a href="https://waffle.io/bbengfort/baleen/">
56 |                <i class="fa fa-trello"></i>
57 |                Agile Board
58 |              </a>
59 |            </li>
60 |          </ul>
61 |        </li>
62 |      </ul>
63 |    </div><!-- /.navbar-collapse -->
64 |  </div><!-- /.container -->
65 | </nav>
66 | 


--------------------------------------------------------------------------------
/baleen/www/templates/index.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | {% block title %}RSS Feed List{% endblock %}
 3 | 
 4 | {% block body %}
 5 | <div class="container">
 6 | 
 7 |   <!-- Page Title -->
 8 |   <div id="top" class="row">
 9 |     <div class="col-xs-12">
10 |       <div class="page-header">
11 |         <h1>Feeds <small>{{ feed_count }} feeds in {{topic_count}} topics</small></h1>
12 |       </div>
13 |     </div>
14 |   </div>
15 | 
16 |   <!-- Feed Listing -->
17 |   <div class="row">
18 |     <div class="col-xs-12">
19 |       <!-- Feeds navigation -->
20 |       <div class="list-group">
21 |         {% for topic, feeds in feeds_topics|dictsort %}
22 |         <a class="list-group-item" href="#{{ topic|replace(" ", "-")|lower }}">
23 |         <i class="{{ feed_icons[topic|replace(" ", "-")|lower] }}" aria-hidden="true"></i>{{ feed_icons.topic }}
24 |         <span class="badge">{{ feeds.count() }}</span>
25 |         {{ topic|title }}
26 |         </a>
27 |         {% endfor %}
28 |       </div>
29 | 
30 |       <!-- Converted the feeds list into a table -->
31 |         {% for topic, feeds in feeds_topics|dictsort %}
32 |           <h3 id="{{ topic|replace(" ", "-")|lower }}"><i class="{{ feed_icons[topic|replace(" ", "-")|lower] }}" aria-hidden="true"></i>
33 |           {{ topic|title }} <small>({{ feeds.count() }} feeds)</small></h3>
34 |           <table class="table table-bordered table-striped table-hover">
35 |             <thead><tr>
36 |               <th>Active</th>
37 |               <th>Title</th>
38 |               <th>Link</th>
39 |             </tr></thead>
40 |             <tbody>
41 |             {% for feed in feeds %}
42 |             <tr>
43 |               <td><i class="fa fa{% if feed.active %}-check{% endif %}-square-o"></i></td>
44 |               <td>{{ feed.title }}</td>
45 |               <td><a href="{{ feed.link }}">{{ feed.link }}</a></td>
46 |             </tr>
47 |             {% endfor %}
48 |             </tbody>
49 |           </table>
50 | 
51 |           <a href="#top">Back to Top</a>
52 |         {% endfor %}
53 |     </div><!-- feed listing column ends -->
54 |   </div><!-- feed listing row ends -->
55 | 
56 | </div><!-- container ends -->
57 | {% endblock%}
58 | 


--------------------------------------------------------------------------------
/baleen/www/templates/logs.html:
--------------------------------------------------------------------------------
 1 | {% extends 'base.html' %}
 2 | {% block title %}Baleen Log Records{% endblock %}
 3 | 
 4 | {% block body %}
 5 | <div class="container">
 6 | 
 7 |   <!-- Page Title -->
 8 |   <div id="top" class="row">
 9 |     <div class="col-xs-12">
10 |       <div class="page-header">
11 |         <h1>Log Records</h1>
12 |         <p class="text-muted">Page {{ page }} of {{ num_pages }} &middot; {{ num_logs|humanize('intcomma') }} log records</p>
13 |       </div>
14 |     </div>
15 |   </div>
16 | 
17 |   <!-- Log table -->
18 |   <div class="row">
19 |     <div class="col-md-12">
20 | 
21 |       <table class="table table-hover table-bordered">
22 |         <tbody>
23 |           {% for log in logs %}
24 |           <tr>
25 |             <td class="{{ log.bootstrap_class }}">{{ log.message }}</td>
26 |           </tr>
27 |           {% else %}
28 |           <tr>
29 |             <td>No log records yet</td>
30 |           </tr>
31 |           {% endfor %}
32 |         </tbody>
33 |       </table>
34 | 
35 |       <nav>
36 |         <ul class="pager">
37 |           <li class="previous{% if not prev %} disabled{% endif %}"><a href="{% if prev %}/logs/?page={{ prev }}&per_page={{ per_page }}{% else %}#{% endif %}">
38 |             <span aria-hidden="true">&larr;</span> Newer
39 |           </a></li>
40 |           <li class="next{% if not next %} disabled{% endif %}"><a href="{% if next %}/logs/?page={{ next }}&per_page={{ per_page }}{% else %}#{% endif %}">
41 |             Older <span aria-hidden="true">&rarr;</span>
42 |           </a></li>
43 |         </ul>
44 |       </nav>
45 |     </div>
46 |   </div>
47 | </div><!-- container ends -->
48 | {% endblock %}
49 | 


--------------------------------------------------------------------------------
/baleen/www/templates/status.html:
--------------------------------------------------------------------------------
  1 | {% extends 'base.html' %}
  2 | {% block title %}Job Status Page{% endblock %}
  3 | 
  4 | {% block body %}
  5 | <div class="container">
  6 | 
  7 |   <!-- Page Title -->
  8 |   <div id="top" class="row">
  9 |     <div class="col-xs-12">
 10 |       <div class="page-header">
 11 |         <h1>Status <small>{{ counts['posts']|humanize('intcomma') }} posts for {{ counts['feeds']|humanize('intcomma') }} feeds after {{ counts['jobs']|humanize('intcomma') }} jobs</small></h1>
 12 |         <!--p class="lead">{{ counts['posts']|humanize('intcomma') }} posts for {{ counts['feeds']|humanize('intcomma') }} feeds after {{ counts['jobs']|humanize('intcomma') }} jobs</p-->
 13 |       </div>
 14 | 
 15 |       <h2>Latest Job</h2>
 16 |     </div>
 17 |   </div>
 18 | 
 19 |   <div class="row">
 20 |     <div class="col-md-6">
 21 | 
 22 |       <!-- Info Panel -->
 23 |       <div class="panel panel-primary">
 24 |         <!-- Default panel contents -->
 25 |         <div class="panel-heading panel-info">
 26 |           Job Info
 27 |         </div>
 28 | 
 29 |         <!-- Table -->
 30 |         <table class="table table-hover">
 31 |           <tr>
 32 |             {% if not latest_job.finished %}
 33 |             <td><i class="fa fa-play"></i> Status</td>
 34 |             <td>Job Running</td>
 35 |             {% elif latest_job.failed %}
 36 |             <td><i class="fa fa-exclamation-circle" style="color:red" ></i> Status</td>
 37 |             <td>Job Failed</td>
 38 |             <td>{{ latest_job.reason }}</td>
 39 |             {% else  %}
 40 |             <td><i class="fa fa-check-circle" style="color:green" ></i> Status</td>
 41 |             <td>Job Complete</td>
 42 |             {% endif  %}
 43 |           </tr>
 44 |           <tr>
 45 |             <td><i class="fa fa-gears"></i>  Type</td>
 46 |             <td>{{ latest_job.name }} v{{ latest_job.version }}</td>
 47 |           </tr>
 48 |           <tr>
 49 |             <td><i class="fa fa-info-circle"></i> Job ID</td>
 50 |             <td>{{ latest_job.jobid }}</td>
 51 |           </tr>
 52 |           <tr>
 53 |             <td><i class="fa fa-clock-o"></i>  Started</td>
 54 |             <td>{{ latest_job.started.strftime(dtfmt) }} ({{ latest_job.started|humanize('naturaltime') }})</td>
 55 |           </tr>
 56 |           <tr>
 57 |             <td><i class="fa fa-flag-checkered"></i> Finished</td>
 58 |             <td>{% if latest_job.finished %}{{ latest_job.finished.strftime(dtfmt) }} ({{ latest_job.finished|humanize('naturaltime') }}){% endif %}</td>
 59 |           </tr>
 60 |           <tr>
 61 |             <td class="{{ latest_job.bootstrap_class }}"><i class="fa fa-hourglass-end"></i> Duration</td>
 62 |             <td class="{{ latest_job.bootstrap_class }}">{{ latest_job.duration(humanize=True) }}</td>
 63 |           </tr>
 64 |         </table>
 65 |       </div>
 66 | 
 67 |       <!-- Post Panel -->
 68 |       <div class="panel panel-primary">
 69 |         <!-- Default panel contents -->
 70 |         <div class="panel-heading">Latest Post</div>
 71 | 
 72 |         <!-- Table -->
 73 |         <table class="table table-hover">
 74 |           <tr>
 75 |             <td><i class="fa fa-quote-left"></i> Title</td>
 76 |             <td><a href="{{ latest_post.url }}" target="_blank">{{ latest_post.title }}</a></td>
 77 |           </tr>
 78 |           <tr>
 79 |             <td><i class="fa fa-rss"></i> Feed</td>
 80 |             <td>{{ latest_post.feed.title }}</td>
 81 |           </tr>
 82 |           <tr>
 83 |             <td><i class="fa fa-calendar"></i> Published</td>
 84 |             <td>{{ latest_post.pubdate.strftime(dtfmt) }}</td>
 85 |           </tr>
 86 |           <tr>
 87 |             <td><i class="fa fa-download"></i> Fetched</td>
 88 |             <td>{{ latest_post.updated.strftime(dtfmt) }} ({{ latest_post.updated|humanize('naturaltime') }})</td>
 89 |           </tr>
 90 |         </table>
 91 |       </div>
 92 | 
 93 |       <!-- Feed Panel -->
 94 |       <div class="panel panel-primary">
 95 |         <!-- Default panel contents -->
 96 |         <div class="panel-heading">Latest Feed</div>
 97 | 
 98 |         <!-- Table -->
 99 |         <table class="table table-hover">
100 |           <tr>
101 |             <td><i class="fa fa-quote-left"></i> Title</td>
102 |             <td><a href="{{ latest_feed.link }}" target="_blank">{{ latest_feed.title }}</a></td>
103 |           </tr>
104 |           <tr>
105 |             <td><i class="fa fa-tag"></i> eTag</td>
106 |             <td>{{ latest_feed.etag }}</td>
107 |           </tr>
108 |           <tr>
109 |             <td><i class="fa fa-calendar-plus-o"></i> Modified</td>
110 |             <td>{{ latest_feed.modified }}</td>
111 |           </tr>
112 |           <tr>
113 |             <td><i class="fa fa-calendar-check-o"></i> Updated</td>
114 |             <td>{{ latest_feed.updated.strftime(dtfmt) }} ({{ latest_feed.updated|humanize('naturaltime') }})</td>
115 |           </tr>
116 |         </table>
117 |       </div>
118 | 
119 |     </div><!-- left column ends -->
120 | 
121 |     <div class="col-md-6">
122 | 
123 |       <!-- Counts Panel -->
124 |       <div class="panel panel-primary">
125 |         <!-- Default panel contents -->
126 |         <div class="panel-heading">Counts</div>
127 | 
128 |         <!-- List group -->
129 |         <ul class="list-group">
130 |           {% for key, value in latest_job.counts.items() %}
131 |           <li class="list-group-item">
132 |             <span class="badge">{{ value|humanize('intcomma') }}</span>
133 |             {{ key.replace("_", " ").title() }}
134 |           </li>
135 |           {% endfor %}
136 |         </ul>
137 |       </div>
138 | 
139 |       <!-- Errors Panel -->
140 |       <div class="panel panel-primary">
141 |         <!-- Default panel contents -->
142 |         <div class="panel-heading">Errors</div>
143 | 
144 |         <!-- List group -->
145 |         <ul class="list-group">
146 |           {% for key, value in latest_job.errors.items() %}
147 |           <li class="list-group-item">
148 |             <span class="badge">{{ value|humanize('intcomma') }}</span>
149 |             {{ key }}
150 |           </li>
151 |           {% endfor %}
152 |         </ul>
153 |       </div>
154 | 
155 |     </div><!-- right column ends -->
156 |   </div><!-- row ends -->
157 | 
158 |   <!-- Jobs listing -->
159 |   <div class="row">
160 |     <div class="col-xs-12">
161 | 
162 |       <h2>Job History</h2>
163 | 
164 |       <table class="table table-hover table-striped table-bordered">
165 |         <thead><tr>
166 |           <th>Job</th>
167 |           <th>Job ID</th>
168 |           <th>Posts</th>
169 |           <th>Errors</th>
170 |           <th>Started</th>
171 |           <th>Duration</th>
172 |         </tr></thead>
173 |         <tbody>
174 |           {% for job in recent_jobs %}
175 |           <tr class="{% if job.failed %}danger{% endif %}">
176 |             <td>{{ job.name }} v{{ job.version }}</td>
177 |             <td>{{ job.jobid }}</td>
178 |             <td>{% if 'posts' in job.counts %}{{ job.counts['posts']|humanize('intcomma') }}{% else %}N/A{% endif %}</td>
179 |             <td>{% if 'errors' in job.counts %}{{ job.counts['errors']|humanize('intcomma') }}{% else %}N/A{% endif %}</td>
180 |             <td>{{ job.started|humanize('naturaltime') }}</td>
181 |             <td><span class="text-{{ job.bootstrap_class }}">{{ job.duration(humanize=True) }}</span></td>
182 |           </tr>
183 |           {% endfor %}
184 |         </tbody>
185 |       </table>
186 | 
187 |     </div>
188 |   </div>
189 | 
190 | </div><!-- container ends -->
191 | {% endblock %}
192 | 


--------------------------------------------------------------------------------
/bin/baleen:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # baleen
 3 | # Management and administration script for Baleen
 4 | #
 5 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 6 | # Created:  Fri Sep 19 10:56:44 2014 -0400
 7 | #
 8 | # Copyright (C) 2014 Bengfort.com
 9 | # For license information, see LICENSE.txt
10 | #
11 | # ID: baleen [5ad94d7] benjamin@bengfort.com $
12 | 
13 | """
14 | Management and administration script for Baleen
15 | """
16 | 
17 | ##########################################################################
18 | ## Imports
19 | ##########################################################################
20 | 
21 | from baleen.console import BaleenUtility
22 | 
23 | 
24 | ##########################################################################
25 | ## Load and execute the CLI utility
26 | ##########################################################################
27 | 
28 | if __name__ == '__main__':
29 |     app = BaleenUtility.load()
30 |     app.execute()
31 | 


--------------------------------------------------------------------------------
/bin/doctimes.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # export publish dates of documents in the corpus.
 3 | 
 4 | import os
 5 | import csv
 6 | import bson
 7 | import argparse
 8 | 
 9 | from datetime import datetime
10 | from pymongo import MongoClient
11 | 
12 | 
def main(args):
    """
    Export the ``_id`` and ``pubdate`` of every post in the ``baleen``
    database as rows of a CSV file.

    ``args`` is the parsed argparse namespace; ``args.outpath`` must be an
    open, writable file object (the command line parser creates it with
    ``argparse.FileType('w')``). A header row is written first, then one
    row per document, and finally a summary line is printed to stdout.
    """

    # Connect to the Database (MongoClient defaults) and grab the posts
    conn = MongoClient()
    db = conn.baleen
    posts = db.posts

    # Create a hook to the CSV file
    writer = csv.DictWriter(args.outpath, fieldnames=["_id", "pubdate"])
    writer.writeheader()

    # Collect the IDs and pubdates; the projection restricts each document
    # to exactly the two fields the writer knows about.
    count = 0
    for row in posts.find({}, {"_id": 1, "pubdate": 1}):
        count += 1
        writer.writerow(row)

    print("wrote {} rows to {}".format(count, args.outpath.name))
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     parser = argparse.ArgumentParser(
35 |         description="export pubdates for documents by id"
36 |     )
37 | 
38 |     parser.add_argument(
39 |         "-o", "--outpath", default="pubdates.csv", type=argparse.FileType('w'),
40 |         help="location to write out the results csv file to",
41 |     )
42 | 
43 |     args = parser.parse_args()
44 |     main(args)
45 | 


--------------------------------------------------------------------------------
/bin/ldoc.py:
--------------------------------------------------------------------------------
 1 | import bson
 2 | from pymongo import MongoClient
 3 | 
 4 | 
 5 | def main():
 6 | 	connection = MongoClient()
 7 | 	db = connection.baleen
 8 | 	collection = db.posts
 9 | 	col_size = collection.count()
10 | 	print("Found %d documents in baleen:posts", col_size)
11 | 	idx = 1
12 | 
13 | 	col_sizes = {}
14 | 	for  post in collection.find():
15 | 		print("Item {} of {}".format(idx, col_size))
16 | 		#print(post)
17 | 		#print(post['_id'])
18 | 		#print("{} - {}".format(len(post['content']), post['_id']))
19 | 		col_sizes[post['_id']] = len(post['content'])
20 | 		idx += 1
21 | 
22 | 	print(col_sizes)
23 | 
24 | 	for w in sorted(col_sizes, key=col_sizes.get, reverse=True):
25 |   		print w, col_sizes[w]
26 | 
27 | 
28 | if __name__ == '__main__':
29 |     main()
30 | 


--------------------------------------------------------------------------------
/conf/baleen-example.yaml:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Baleen application
 2 | # This file belongs in: /etc/baleen.yaml
 3 | 
 4 | # Basic Flags
 5 | debug: true
 6 | 
 7 | # Logging Information
 8 | logfile: 'baleen.log'
 9 | loglevel: 'DEBUG'
10 | 
11 | # Use Requests to fetch complete HTML
12 | fetch_html: True
13 | 
14 | # Database Information
15 | database:
16 |     host: localhost
17 |     port: 27017
18 |     name: baleen
19 | 
20 | # Web Admin Server
21 | server:
22 |     host: 127.0.0.1
23 |     port: 5000
24 | 


--------------------------------------------------------------------------------
/conf/upstart/baleen.conf:
--------------------------------------------------------------------------------
 1 | # baleen.conf
 2 | #
 3 | # Author:  Benjamin Bengfort
 4 | # Created: Tue Mar 01 08:18:40 2016 -0500
 5 | #
 6 | # Upstart configuration for Baleen
 7 | # For more details on this configuration see the Baleen deployment docs.
 8 | # This file belongs in: /etc/init/baleen.conf
 9 | 
10 | # Documentation
11 | author      "Benjamin Bengfort <bbengfort@districtdatalabs.com>"
12 | description "An automated ingestion service to construct an HTML corpus."
13 | version     "0.3"
14 | 
15 | # Event Definition
16 | start on runlevel [2345]
17 | stop on runlevel [016]
18 | 
19 | # Service Definition
20 | respawn
21 | respawn limit 10 5
22 | 
23 | # Make sure that the local user has access
24 | setuid ubuntu
25 | setgid ubuntu
26 | 
27 | # Environment
28 | env BALEEN=/var/apps/baleen/bin/baleen
29 | env ACTIVATE=/var/envs/baleen/bin/activate
30 | 
31 | # Script to execute
32 | script
33 |     . $ACTIVATE
34 |     exec $BALEEN run
35 | end script
36 | 


--------------------------------------------------------------------------------
/conf/uwsgi/baleen.ini:
--------------------------------------------------------------------------------
 1 | # baleen.ini
 2 | #
 3 | # Author:  Benjamin Bengfort
 4 | # Created: Thu Apr 07 12:12:53 2016 -0400
 5 | #
 6 | # uWSGI configuration for the Baleen web admin app.
 7 | # For more details on this configuration see the Baleen deployment docs.
 8 | # This file belongs in: /etc/uwsgi/apps-available/baleen.ini
 9 | 
10 | [uwsgi]
11 | # Flask Settings
12 | virtualenv     = /var/envs/baleen
13 | chdir          = /var/apps/baleen
14 | module         = baleen.www.app:app
15 | 
16 | # uWSGI Process Settings
17 | uid            = ubuntu
18 | gid            = ubuntu
19 | master         = true
20 | master-as-root = true
21 | workers        = 2
22 | no-orphans     = true
23 | socket         = 127.0.0.1:3264
24 | stats          = 127.0.0.1:1717
25 | vacuum         = true
26 | log-date       = true
27 | plugins        = python
28 | 


--------------------------------------------------------------------------------
/conf/uwsgi/baleen.nginx:
--------------------------------------------------------------------------------
 1 | # baleen.nginx
 2 | #
 3 | # Author:  Benjamin Bengfort
 4 | # Created: Thu Apr 07 12:14:03 2016 -0400
 5 | #
 6 | # Nginx configuration for the Baleen web admin app.
 7 | # For more details on this configuration see the Baleen deployment docs.
 8 | # This file belongs in: /etc/nginx/sites-available/baleen
 9 | 
10 | server {
11 |     # Deny illegal Host headers
12 |     server_name _;
13 |     return 444;
14 | }
15 | 
16 | server {
17 |     # The basics
18 |     listen 80;
19 |     server_tokens off;
20 |     server_name baleen.districtdatalabs.com;
21 |     client_max_body_size 3M;
22 | 
23 |     # Logging
24 |     access_log /var/log/nginx/access_baleen.log;
25 |     error_log  /var/log/nginx/error_baleen.log;
26 | 
27 |     # Locations and Root
28 |     root /var/www/baleen/;
29 | 
30 |     location /static {
31 |         # Allow static content served at /static/ url.
32 |         # Content continues to be available during downtime.
33 |         alias       /var/apps/baleen/baleen/www/static;
34 |         access_log  off;
35 |         expires     30d;
36 |     }
37 | 
38 |     location /favicon.ico {
39 |         # Allow less intelligent browsers to still get favicon.
40 |         alias       /var/apps/baleen/baleen/www/static/favicon.ico;
41 |         access_log  off;
42 |         expires     30d;
43 |     }
44 | 
45 |     location /humans.txt {
46 |         # Ensure that humans.txt is in the site root.
47 |         alias       /var/apps/baleen/baleen/www/static/humans.txt;
48 |         access_log  off;
49 |         expires     30d;
50 |     }
51 | 
52 |     location /robots.txt {
53 |         # Ensure that robots.txt is in the site root.
54 |         alias       /var/apps/baleen/baleen/www/static/robots.txt;
55 |         access_log  off;
56 |         expires     30d;
57 |     }
58 | 
59 |     # Maintenance Mode Settings
60 |     error_page 502 503 504 @maintenance;
61 | 
62 |     location @maintenance {
63 |         # Create a directory in the site static files called "downtime"
64 |         # which should contain a "maintenance.html" file.
65 |         root /var/apps/baleen/baleen/www/static/downtime;
66 |         rewrite ^(.*)$ /maintenance.html break;
67 |     }
68 | 
69 |     # Finally, the Flask app!
70 |     location / {
71 |         # If there is a file called downtime in the public directory, go
72 |         # into maintenance mode and return downtime page if exists.
73 |         if (-f /var/www/downtime) {
74 |             return 503;
75 |         }
76 | 
77 |         # uWSGI Settings
78 |         uwsgi_pass 127.0.0.1:3264;
79 |         proxy_set_header    Host                $host;
80 |         proxy_set_header    X-Real-IP           $remote_addr;
81 |         proxy_set_header    X-Forwarded-For     $proxy_add_x_forwarded_for;
82 |         proxy_set_header    X-Forwarded-Proto   $http_x_forwarded_proto;
83 |         add_header          X-Cache-Status      $upstream_cache_status;
84 | 
85 |         include uwsgi_params;
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | mongo:
 2 |    image: mongo:3.2.3
 3 | app:
 4 |    build: .
 5 |    dockerfile: Dockerfile-app
 6 |    environment:
 7 |       - VIRTUAL_ENV=/usr/local/
 8 |    volumes:
 9 |       - .:/baleen
10 |    links:
11 |       - mongo
12 |    ports:
13 |       - 5000:5000
14 |    restart: always


--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
  1 | # About
  2 | 
  3 | Baleen is a tool for ingesting _formal_ natural language data from the discourse of professional and amateur writers: e.g. bloggers and news outlets. Rather than performing web scraping, Baleen focuses on data ingestion through the use of RSS feeds. It performs as much raw data collection as it can, saving data into a Mongo document store.
  4 | 
  5 | ## Contributing
  6 | 
  7 | Baleen is open source, and I'd love your help. If you would like to contribute, you can do so in the following ways:
  8 | 
  9 | 1. Add issues or bugs to the bug tracker: [https://github.com/bbengfort/baleen/issues](https://github.com/bbengfort/baleen/issues)
 10 | 2. Work on a card on the dev board: [https://waffle.io/bbengfort/baleen](https://waffle.io/bbengfort/baleen)
 11 | 3. Create a pull request in Github: [https://github.com/bbengfort/baleen/pulls](https://github.com/bbengfort/baleen/pulls)
 12 | 
 13 | Note that labels in the Github issues are defined in the blog post: [How we use labels on GitHub Issues at Mediocre Laboratories](https://mediocre.com/forum/topics/how-we-use-labels-on-github-issues-at-mediocre-laboratories).
 14 | 
 15 | If you are a member of the District Data Labs Faculty group, you have direct access to the repository, which is set up in a typical production/release/development cycle as described in _[A Successful Git Branching Model](http://nvie.com/posts/a-successful-git-branching-model/)_. A typical workflow is as follows:
 16 | 
 17 | 1. Select a card from the [dev board](https://waffle.io/bbengfort/baleen) - preferably one that is "ready" then move it to "in-progress".
 18 | 
 19 | 2. Create a branch off of develop called "feature-[feature name]", work and commit into that branch.
 20 | 
 21 |         ~$ git checkout -b feature-myfeature develop
 22 | 
 23 | 3. Once you are done working (and everything is tested) merge your feature into develop.
 24 | 
 25 |         ~$ git checkout develop
 26 |         ~$ git merge --no-ff feature-myfeature
 27 |         ~$ git branch -d feature-myfeature
 28 |         ~$ git push origin develop
 29 | 
 30 | 4. Repeat. Releases will be routinely pushed into master via release branches, then deployed to the server.
 31 | 
 32 | ## Contributors
 33 | 
 34 | Thank you for all your help contributing to make Baleen a great project!
 35 | 
 36 | ### Maintainers
 37 | 
 38 | - Benjamin Bengfort: [@bbengfort](https://github.com/bbengfort/)
 39 | 
 40 | ### Contributors
 41 | 
 42 | - Laura Lorenz: [@lauralorenz](https://github.com/lauralorenz)
 43 | - Sasan Bahadaran: [@bahadasx](https://github.com/bahadasx)
 44 | 
 45 | ## Changelog
 46 | 
 47 | The release versions that are sent to the Python package index (PyPI) are also tagged in Github. You can see the tags through the Github web application and download the tarball of the version you'd like. Additionally PyPI will host the various releases of Baleen (eventually).
 48 | 
 49 | The versioning uses a three part version system, "a.b.c" - "a" represents a major release that may not be backwards compatible. "b" is incremented on minor releases that may contain extra features, but are backwards compatible. "c" releases are bug fixes or other micro changes that developers should feel free to immediately update to.
 50 | 
 51 | ### Version 0.3.3
 52 | 
 53 | * **tag**: [v0.3.3](https://github.com/bbengfort/baleen/releases/tag/v0.3.3)
 54 | * **deployment**: Monday, April 18, 2016
 55 | * **commit**: (see tag)
 56 | 
 57 | Extended the Baleen export functionality to dump either an HTML or JSON corpus to disk in a suitable format for NLP analysis, particularly using NLTK. The new export functionality is still single process, but does some smart things to reduce the amount of time the export takes, as well as the amount of memory required. Additionally, we have improved the visual interface to the web application, making status messages more noticeable as we monitor continued data ingestion.
 58 | 
 59 | The app can be found online at [http://baleen.districtdatalabs.com](http://baleen.districtdatalabs.com).
 60 | 
 61 | ### Version 0.3.2
 62 | 
 63 | * **tag**: [v0.3.2](https://github.com/bbengfort/baleen/releases/tag/v0.3.2)
 64 | * **deployment**: Wednesday, April 13, 2016
 65 | * **commit**: [642ad60](https://github.com/bbengfort/baleen/commit/642ad609dc6e97f052dff8458d0cc43e9721eed7)
 66 | 
 67 | Some changes to the web application to attempt to solve SEGFAULT errors and to make the status and the logs more readable. This is just a quick hotfix to make sure we have decent monitoring in the app.
 68 | 
 69 | The app can be found online at [http://baleen.districtdatalabs.com](http://baleen.districtdatalabs.com).
 70 | 
 71 | ### Version 0.3.1
 72 | 
 73 | * **tag**: [v0.3.1](https://github.com/bbengfort/baleen/releases/tag/v0.3.1)
 74 | * **deployment**: Thursday, April 7, 2016
 75 | * **commit**: [121de50](https://github.com/bbengfort/baleen/commit/121de50084aaa29b8098756630c3a7a0a14d8d78)
 76 | 
 77 | Very happy to have had [@lauralorenz](https://github.com/lauralorenz) and [@bahadasx](https://github.com/bahadasx) contribute to Baleen by building a web admin app. The app is a very simple Flask app that reads from the database and reports on the status, including the list of available feeds. It also reports information from the log file.
 78 | 
 79 | The app can be found online now at [http://baleen.districtdatalabs.com](http://baleen.districtdatalabs.com).
 80 | 
 81 | ### Version 0.3
 82 | 
 83 | * **tag**: [v0.3](https://github.com/bbengfort/baleen/releases/tag/v0.3)
 84 | * **deployment**: Thursday, March 3, 2016
 85 | * **commit**: [9e095bc](https://github.com/bbengfort/baleen/commit/9e095bc4cac584e906dfd6e38eb77b1ef5afe107)
 86 | 
 87 | Releases one day after another! The reason is that Baleen needs to be running in production to gather a large enough corpus for PyCon. Version 0.3 is a major release that implements the revised component architecture. It should hopefully be more stable, give more visibility into what's going on, be easier to update and fix, and have a few more features. Features include tracking ingestion jobs in the Mongo database (so we can add a web application) and decoupling the synchronization of feeds from the wrangling of posts. We also added Commis for easier console utility management, and finally added some other tools and tests.
 88 | 
 89 | ### Version 0.2.1
 90 | 
 91 | * **tag**: [v0.2.1](https://github.com/bbengfort/baleen/releases/tag/v0.2.1)
 92 | * **deployment**: Wednesday, March 2, 2016
 93 | * **commit**: [85aa949](https://github.com/bbengfort/baleen/commit/85aa949f8fae453a491b0129dcb1ad6d02832e3e)
 94 | 
 95 | Hotfix for an error that caused unicode strings to kill the ingestion in a try/except block (as it was being written to the logger)! This error was so serious it needed to be fixed right away, even in the middle of Version 0.3 updates.
 96 | 
 97 | ### Version 0.2
 98 | 
 99 | * **tag**: [v0.2](https://github.com/bbengfort/baleen/releases/tag/v0.2)
100 | * **deployment**: Tuesday, March 1, 2016
101 | * **commit**: [8e4e06e](https://github.com/bbengfort/baleen/commit/8e4e06e793b4ef949e83ab4c6d1715b03ae33957)
102 | 
103 | This update was a push to get Baleen running on EC2 on an hourly basis in preparation for PyCon. We updated all of Baleen's dependencies to their latest versions, added tests and other important fixtures, and organized the code a bit better. New functionality includes the ability to fetch the post webpage from the link, export the corpus to disk using the command line utility, and run in the background using the schedule library.
104 | 
105 | ### Version 0.1
106 | 
107 | * **tag**: [v0.1](https://github.com/bbengfort/baleen/releases/tag/v0.1)
108 | * **release**: Tuesday, September 23, 2014
109 | * **deployment**: Thursday, February 18, 2016
110 | * **commit**: [f5f15dd](https://github.com/bbengfort/baleen/commit/f5f15dda6da9c0fb680d7af43bb941c5086845a1)
111 | 
112 | This was the initial version of Baleen before the revamp occurred thanks to the PyCon tutorial. Baleen in this form was a command line utility that fetched RSS feeds on demand and stored them in a Mongo database. The input to Baleen is an OPML file that contains an RSS feed listing as well as their topics.
113 | 
114 | Baleen was originally used to produce a corpus for the [District Data Labs](https://www.districtdatalabs.com) _NLP with NLTK_ course. The corpus was then adapted for use in the [Statistics.com](http://www.statistics.com/) online course of the same name. The problem is that because Baleen had to be run manually, it was difficult to get a high quality corpus on demand.
115 | 


--------------------------------------------------------------------------------
/docs/components.md:
--------------------------------------------------------------------------------
 1 | # Baleen Components
 2 | 
 3 | Baleen's objective is simple: given an OPML file of RSS feeds, download all the posts from those feeds and save them to MongoDB storage. While this task seems like it could be easily completed with a single function, once you start integrating the parts of the program, things get more complex. The following component architecture describes how we've put together Baleen:
 4 | 
 5 | [![Baleen Component Architecture](/images/component_architecture.png)](/images/component_architecture.png)
 6 | 
 7 | There are three main parts to the component architecture:
 8 | 
 9 | - Interacting with the local disk: importing OPML and exporting a corpus.
10 | - Interacting with the MongoDB storage of posts.
11 | - Fetching data from both the RSS feeds as well as the complete web page.
12 | 
13 | Additionally there are utilities, configuration, and logging as well as the command line program that uses [commis](http://commis.readthedocs.org/en/latest/), but those are pretty standard and are not specific to Baleen. In the next sections we'll look at and describe the operation of each of these main blocks of code.
14 | 
15 | ## MongoDB Models
16 | 
17 | The central part of the operation of Baleen revolves around the interaction with MongoDB. Baleen uses [mongoengine](http://mongoengine.org/) as an ODM to provide models for inserting documents into collections. There are two primary models:
18 | 
19 | - `Feed`: maintains information about an RSS or Atom feed.
20 | - `Post`: a document that has been syndicated by a feed.
21 | 
22 | Hopefully the relationship is clear: a `Feed` is a listing of `Post` documents. Our collection objective is the HTML content of a `Post` and we use the `Feed` to obtain the `Post` rather than web scraping.
23 | 
24 | Note that these models do nothing except manipulate their data store and read and write to the database. Methods for ingestion, wrangling, or fetching the full web page _wrap_ their respective models. E.g. you wouldn't do `Feed.sync()` to collect the latest RSS feed, instead you would use some `Sync` object and pass it a feed: `Sync(feed)`.
25 | 
26 | ## Ingestion
27 | 
28 | The ingestion portion of the Baleen service is the most critical and the requirements are as follows:
29 | 
30 | 1. On a _routine_ basis, collect and ingest feeds from MongoDB or an OPML file.
31 | 2. Synchronize feeds by fetching the latest RSS/Atom from their `xmlUrl`.  
32 | 3. For each item in the synchronized feed, create and wrangle a post.
33 | 4. For each post fetch the full HTML from the `htmlUrl`.
34 | 5. Be able to track the start/stop/duration of the ingestion for a set of feeds.
35 | 6. Be able to track the number of errors, posts ingested.
36 | 7. _Allow no duplicate posts to be added to the database_.
37 | 
38 | In order to synchronize feeds, we use the [feedparser](https://pypi.python.org/pypi/feedparser) library and to fetch documents from the web, we use [Requests](http://docs.python-requests.org/en/master/). A single `Ingest` instance takes as input an iterable of feeds from either MongoDB or from an OPML file. When run it maintains two queues: a feed processing queue and a page processing queue (so that it can be threaded or multiprocessed).
39 | 
40 | Feed processing is performed by a `FeedSync` object which takes a single feed as input. The `FeedSync` object fetches the RSS via feedparser, and iterates through all posts, wrangling them and saving them to Mongo. The `PageWrangler` object takes a post as input, wrangles the data from a variety of feed types, then fetches the complete web page.
41 | 
42 | Once the `Ingest` instance has cleared its work queue, it logs various information and terminates. Note that the `Ingest` instance is responsible for error handling and logging, while the sync and fetch utilities must raise exceptions.
43 | 
44 | ## Import and Export
45 | 
46 | The import utility uses an `OPMLReader` to load and parse the OPML file from disk with [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/). The OPML file exposes a tree hierarchy or table of contents structure to the feeds where the first level is a "category" and the secondary level is each RSS/Atom feed item. On import, we simply read the OPML file and add any additional feeds to the MongoDB without duplication. This allows us to maintain a single master list of RSS from multiple OPML files.
47 | 
48 | Note, we've found that the best way to create OPML files is to use the [Feedly](https://feedly.com) app, which allows us to organize our feeds. Under their "organize feeds" section, they also have an Export OPML link (and an import OPML link).
49 | 
50 | The export utility creates a categorized corpus structure ready for NLTK using the `MongoExporter` class. Each category from the TOC structure of the OPML is a directory in the corpus on disk, then each post is written as an HTML file. The exporter also writes a README file with information about the contents of the corpus. Key concerns here involve HTML sanitization (removing scripts) and readability (extracting only the text we want to analyze).
51 | 


--------------------------------------------------------------------------------
/docs/images/component_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/docs/images/component_architecture.png


--------------------------------------------------------------------------------
/docs/images/service_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/docs/images/service_architecture.png


--------------------------------------------------------------------------------
/docs/images/spacewhale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/docs/images/spacewhale.jpg


--------------------------------------------------------------------------------
/docs/images/whaleship.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/docs/images/whaleship.jpg


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Welcome to Baleen
 2 | 
 3 | **Complete documentation coming soon!**
 4 | 
 5 | ## Quick Start
 6 | 
 7 | This quick start is intended to get you set up with Baleen in development mode (since the project is still under development). If you'd like to run Baleen in production, please see the [documentation][rtfd_href].
 8 | 
 9 | 1. Clone the repository
10 | 
11 |         $ git clone git@github.com:bbengfort/baleen.git
12 |         $ cd baleen
13 | 
14 | 
15 | 2. Create a virtualenv and install the dependencies
16 | 
17 |         $ virtualenv venv
18 |         $ source venv/bin/activate
19 |         $ pip install -r requirements.txt
20 | 
21 | 3. Add the `baleen` module to your `$PYTHONPATH` via the virtualenv.
22 | 
23 |         $ echo $(pwd) > venv/lib/python2.7/site-packages/baleen.pth
24 | 
25 | 4. Create your local configuration file. Edit it with the connection details to your local MongoDB server.  This is also a good time to check and make sure that you can create a database called Baleen on Mongo.
26 | 
27 |         $ cp conf/baleen-example.yaml conf/baleen.yaml
28 | 
29 |     The YAML file should look similar to:
30 | 
31 |         debug: true
32 |         testing: false
33 |         database:
34 |             host: localhost
35 |             port: 27017
36 |             name: baleen
37 | 
38 | 5. Run the tests to make sure everything is ok.
39 | 
40 |         $ make test
41 | 
42 | 6. Make sure that the command line utility is ready to go:
43 | 
44 |         $ bin/baleen --help
45 | 
46 | 7. Import the feeds from the `feedly.opml` file in the fixtures.
47 | 
48 |         $ bin/baleen import fixtures/feedly.opml
49 |         Ingested 101 feeds from 1 OPML files
50 | 
51 | 8. Perform an ingestion of the feeds that were imported from the `feedly.opml` file.
52 | 
53 |         $ bin/baleen ingest
54 | 
55 | Your Mongo database collections should be created as you add new documents to them, and at this point you're ready to develop!
56 | 


--------------------------------------------------------------------------------
/fixtures/fields.json:
--------------------------------------------------------------------------------
 1 | // A summary of the fields for many RSS feeds downloaded using Python feedparser
 2 | {
 3 |   "fields": {
 4 |     "dc_source": 7,
 5 |     "media_credit": 71,
 6 |     "updated_parsed": 277,
 7 |     "links": 2130,
 8 |     "twitter": 20,
 9 |     "media_text": 31,
10 |     "summary_detail": 1993,
11 |     "href": 386,
12 |     "wfw_commentrss": 896,
13 |     "id": 2100,
14 |     "slash_comments": 866,
15 |     "contributors": 3,
16 |     "published_parsed": 2070,
17 |     "title": 2130,
18 |     "comments": 1061,
19 |     "content": 1134,
20 |     "source": 40,
21 |     "title_detail": 2130,
22 |     "mash_thumbnail": 30,
23 |     "dc_identifier": 65,
24 |     "updated": 277,
25 |     "gd_image": 50,
26 |     "media_description": 18,
27 |     "tags": 1275,
28 |     "feedburner_origlink": 896,
29 |     "media_group": 10,
30 |     "media_content": 361,
31 |     "feedburner_origenclosurelink": 26,
32 |     "thr_total": 85,
33 |     "authors": 1691,
34 |     "author_detail": 1574,
35 |     "desceditca": 1,
36 |     "guidislink": 2100,
37 |     "titleeditca": 1,
38 |     "dc_type": 15,
39 |     "author": 1691,
40 |     "media_thumbnail": 287,
41 |     "summary": 2130,
42 |     "media_copyright": 25,
43 |     "published": 2070,
44 |     "link": 2130,
45 |     "postid": 40
46 |   },
47 |   "feeds": 101,
48 |   "entries": 2130,
49 |   "versions": {
50 |     "": 4,
51 |     "rss20": 82,
52 |     "rss10": 1,
53 |     "atom10": 14
54 |   },
55 |   "time": 73.437
56 | }
57 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: Baleen
 2 | repo_name: GitHub
 3 | repo_url: https://github.com/bbengfort/baleen
 4 | site_description: An automated ingestion service for blogs to construct a corpus for NLP research.
 5 | site_author: District Data Labs
 6 | copyright: Built by District Data Labs, licensed by <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/80x15.png" /></a>
 7 | theme: readthedocs
 8 | 
 9 | pages:
10 |     - "Introduction": index.md
11 |     - "Component Architecture": components.md
12 |     - "Service Architecture": service.md
13 |     - "About Baleen": about.md
14 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | ## Requests
 2 | feedparser==5.2.1
 3 | requests==2.9.1
 4 | 
 5 | ## Database
 6 | pymongo==3.2.1
 7 | mongoengine==0.10.6
 8 | blinker==1.4
 9 | 
10 | ## Parsing
11 | beautifulsoup4==4.4.1
12 | lxml==3.5.0
13 | 
14 | ## Configuration
15 | confire==0.2.0
16 | PyYAML==3.11
17 | 
18 | ## Command Line
19 | commis==0.2
20 | colorama==0.3.6
21 | 
22 | ## Utilities
23 | schedule==0.3.2
24 | python-dateutil==2.4.2
25 | enum34==1.1.3
26 | six==1.10.0
27 | 
28 | ## Web Admin
29 | Flask==0.10.1
30 | Flask-Admin==1.4.0
31 | Flask-WTF==0.12
32 | flask-mongoengine==0.7.5
33 | Flask-Humanize==0.3.0
34 | WTForms==2.1
35 | Jinja2==2.8
36 | humanize==0.5.1
37 | itsdangerous==0.24
38 | MarkupSafe==0.23
39 | Werkzeug==0.11.5
40 | 
41 | ## Testing
42 | ## Uncomment and install for development
43 | #nose==1.3.7
44 | #coverage==4.0.3
45 | #mock==1.3.0
46 | #funcsigs==0.4
47 | #pbr==1.8.1
48 | #mongomock==3.2.1
49 | #sentinels==0.0.6
50 | 
51 | ## Building
52 | ## Uncomment and install for deployment
53 | #wheel==0.29.0
54 | 
55 | ## Pip Freeze Stuff
56 | #Python==2.7.10
57 | #pip==8.1.1
58 | #setuptools==0.9.7
59 | #wsgiref==0.1.2
60 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # setup
  3 | # Setup script for installing baleen
  4 | #
  5 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  6 | # Created:  Fri Sep 19 10:59:24 2014 -0400
  7 | #
  8 | # Copyright (C) 2014 Bengfort.com
  9 | # For license information, see LICENSE.txt and NOTICE.md
 10 | #
 11 | # ID: setup.py [5ad94d7] benjamin@bengfort.com $
 12 | 
 13 | """
 14 | Setup script for installing baleen.
 15 | See http://bbengfort.github.io/programmer/2016/01/20/packaging-with-pypi.html
 16 | """
 17 | 
 18 | ##########################################################################
 19 | ## Imports
 20 | ##########################################################################
 21 | 
 22 | import os
 23 | import re
 24 | import codecs
 25 | 
 26 | from setuptools import setup
 27 | from setuptools import find_packages
 28 | 
 29 | ##########################################################################
 30 | ## Package Information
 31 | ##########################################################################
 32 | 
## Basic information
## These values are handed to setup() through the `config` dict defined
## further down in this script.
NAME         = "baleen"
DESCRIPTION  = "An automated ingestion service for blogs to construct a corpus for NLP research."
AUTHOR       = "Benjamin Bengfort"
EMAIL        = "benjamin@bengfort.com"
LICENSE      = "MIT"
REPOSITORY   = "https://github.com/bbengfort/baleen"  # also the base of download_url
PACKAGE      = "baleen"

## Define the keywords
KEYWORDS     = ('nlp', 'baleen', 'ingestion', 'blogs', 'rss')

## Define the classifiers
## See https://pypi.python.org/pypi?%3Aaction=list_classifiers
CLASSIFIERS  = (
    'Development Status :: 4 - Beta',
    'Environment :: Console',
    'Intended Audience :: Developers',
    'License :: OSI Approved :: MIT License',
    'Natural Language :: English',
    'Operating System :: OS Independent',
    'Programming Language :: Python',
    'Programming Language :: Python :: 2.7',
    'Topic :: Software Development',
    'Topic :: Software Development :: Libraries :: Python Modules',
    'Topic :: Utilities',
)

## Important Paths
## PROJECT is the directory that contains this script. The other three paths
## are relative; read() joins them onto PROJECT before opening the file.
PROJECT      = os.path.abspath(os.path.dirname(__file__))
REQUIRE_PATH = "requirements.txt"
VERSION_PATH = os.path.join(PACKAGE, "version.py")
PKG_DESCRIBE = "DESCRIPTION.txt"

## Directories to ignore in find_packages
## (passed as the `exclude` argument when the package list is built)
EXCLUDES     = (
    "tests", "bin", "docs", "fixtures", "register", "notebooks",
)
 71 | 
 72 | ##########################################################################
 73 | ## Helper Functions
 74 | ##########################################################################
 75 | 
 76 | def read(*parts):
 77 |     """
 78 |     Assume UTF-8 encoding and return the contents of the file located at the
 79 |     absolute path from the REPOSITORY joined with *parts.
 80 |     """
 81 |     with codecs.open(os.path.join(PROJECT, *parts), 'rb', 'utf-8') as f:
 82 |         return f.read()
 83 | 
 84 | 
 85 | def get_version(path=VERSION_PATH):
 86 |     """
 87 |     Reads the __init__.py defined in the VERSION_PATH to find the get_version
 88 |     function, and executes it to ensure that it is loaded correctly.
 89 |     """
 90 |     namespace = {}
 91 |     exec(read(path), namespace)
 92 |     return namespace['get_version']()
 93 | 
 94 | 
 95 | def get_requires(path=REQUIRE_PATH):
 96 |     """
 97 |     Yields a generator of requirements as defined by the REQUIRE_PATH which
 98 |     should point to a requirements.txt output by `pip freeze`.
 99 |     """
100 |     for line in read(path).splitlines():
101 |         line = line.strip()
102 |         if line and not line.startswith('#'):
103 |             yield line
104 | 
105 | ##########################################################################
106 | ## Define the configuration
107 | ##########################################################################
108 | 
## Keyword arguments for setup(). KEYWORDS and CLASSIFIERS are converted to
## lists here because distutils expects list values for these metadata
## fields and reports "should be a list" when it is handed a tuple.
config = {
    "name": NAME,
    "version": get_version(),
    "description": DESCRIPTION,
    "long_description": read(PKG_DESCRIBE),
    "license": LICENSE,
    "author": AUTHOR,
    "author_email": EMAIL,
    "maintainer": AUTHOR,
    "maintainer_email": EMAIL,
    "url": REPOSITORY,
    "download_url": "{}/tarball/v{}".format(REPOSITORY, get_version()),
    "packages": find_packages(where=PROJECT, exclude=EXCLUDES),
    # get_requires() is a generator; materialize it for setup()
    "install_requires": list(get_requires()),
    "classifiers": list(CLASSIFIERS),
    "keywords": list(KEYWORDS),
    "zip_safe": False,
    "scripts": ['bin/baleen'],
}

##########################################################################
## Run setup script
##########################################################################

## Only invoke setup() when executed as a script, so that importing this
## module just builds the `config` dict.
if __name__ == '__main__':
    setup(**config)
135 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # tests
 2 | # Testing for the baleen module
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Fri Sep 19 10:58:15 2014 -0400
 6 | #
 7 | # Copyright (C) 2014 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [5ad94d7] benjamin@bengfort.com $
11 | 
12 | """
13 | Testing for the baleen module
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import unittest
21 | 
22 | ##########################################################################
23 | ## Module Constants
24 | ##########################################################################
25 | 
26 | TEST_VERSION = "0.3.3" ## Also the expected version of the package
27 | 
28 | ##########################################################################
29 | ## Test Cases
30 | ##########################################################################
31 | 
class InitializationTest(unittest.TestCase):
    """
    Smoke tests: the test runner works, the baleen package can be imported,
    and its version matches TEST_VERSION.
    """

    def test_initialization(self):
        """
        Tests a simple world fact by asserting that 10*10 is 100
        """
        product = 10 * 10
        self.assertEqual(product, 100)

    def test_import(self):
        """
        Can import baleen
        """
        # The import statement itself is the thing under test here.
        try:
            import baleen
        except ImportError:
            self.fail("Unable to import the baleen module!")

    def test_version(self):
        """
        Assert that the version is sane
        """
        import baleen

        actual = baleen.__version__
        self.assertEqual(TEST_VERSION, actual)
55 | 


--------------------------------------------------------------------------------
/tests/fixtures/feedly.opml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <opml version="1.0">
 4 |     <head>
 5 |         <title>null subscriptions in feedly Cloud</title>
 6 |     </head>
 7 |     <body>
 8 |         <outline text="news" title="news">
 9 |             <outline type="rss" text="The New York Times" title="The New York Times" xmlUrl="http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml" htmlUrl="http://www.nytimes.com/pages/index.html?partner=rss&amp;emc=rss"/>
10 |             <outline type="rss" text="Washington Post: Breaking News, World, US, DC News &amp; Analysis" title="Washington Post: Breaking News, World, US, DC News &amp; Analysis" xmlUrl="http://www.washingtonpost.com/rss/homepage" htmlUrl="http://www.washingtonpost.com/pb/homepage/"/>
11 |         </outline>
12 |         <outline text="do it yourself" title="do it yourself">
13 |             <outline type="rss" text="Hack a Day" title="Hack a Day" xmlUrl="http://www.hackaday.com/rss.xml" htmlUrl="http://hackaday.com"/>
14 |             <outline type="rss" text="Instructables" title="Instructables" xmlUrl="http://www.instructables.com/tag/type:instructable/rss.xml" htmlUrl="http://www.instructables.com"/>
15 |         </outline>
16 |         <outline text="business" title="business">
17 |             <outline type="rss" text="Entrepreneur" title="Entrepreneur" xmlUrl="http://feeds.feedburner.com/entrepreneur/latest" htmlUrl="http://www.entrepreneur.com/latest?utm_source=Feedly&amp;utm_medium=related&amp;utm_campaign=syndication"/>
18 |             <outline type="rss" text="Freakonomics" title="Freakonomics" xmlUrl="http://freakonomics.blogs.nytimes.com/feed/" htmlUrl="http://freakonomics.com"/>
19 |             <outline type="rss" text="VentureBeat" title="VentureBeat" xmlUrl="http://feeds.feedburner.com/venturebeat" htmlUrl="http://venturebeat.com"/>
20 |         </outline>
21 |         <outline text="gaming" title="gaming">
22 |             <outline type="rss" text="TouchArcade" title="TouchArcade" xmlUrl="http://toucharcade.com/feed/" htmlUrl="http://toucharcade.com"/>
23 |             <outline type="rss" text="Opposable Thumbs" title="Opposable Thumbs" xmlUrl="http://feeds.arstechnica.com/arstechnica/gaming/" htmlUrl="http://arstechnica.com"/>
24 |             <outline type="rss" text="Joystiq" title="Joystiq" xmlUrl="http://www.joystiq.com/rss.xml" htmlUrl="http://www.joystiq.com"/>
25 |         </outline>
26 |         <outline text="data science" title="data science">
27 |             <outline type="rss" text="no free hunch" title="no free hunch" xmlUrl="http://blog.kaggle.com/feed/" htmlUrl="http://blog.kaggle.com"/>
28 |             <outline type="rss" text="KDnuggets" title="KDnuggets" xmlUrl="http://feeds.feedburner.com/kdnuggets-data-mining-analytics" htmlUrl="http://www.kdnuggets.com"/>
29 |             <outline type="rss" text="The Numbers" title="The Numbers" xmlUrl="http://blogs.wsj.com/numbersguy/feed/" htmlUrl="http://blogs.wsj.com/numbers"/>
30 |             <outline type="rss" text="FiveThirtyEight" title="FiveThirtyEight" xmlUrl="http://www.fivethirtyeight.com/feeds/posts/default" htmlUrl="http://fivethirtyeight.com"/>
31 |         </outline>
32 |         <outline text="essays" title="essays">
33 |             <outline type="rss" text="The Electric Typewriter" title="The Electric Typewriter" xmlUrl="http://tetw.tumblr.com/rss" htmlUrl="http://tetw.org/"/>
34 |             <outline type="rss" text="The Essayist" title="The Essayist" xmlUrl="http://essayist.tumblr.com/rss" htmlUrl="http://essayist.tumblr.com/"/>
35 |         </outline>
36 |         <outline text="politics" title="politics">
37 |             <outline type="rss" text="Political Mojo | Mother Jones" title="Political Mojo | Mother Jones" xmlUrl="http://motherjones.com/mojo/feed" htmlUrl="http://www.motherjones.com/Blogs"/>
38 |             <outline type="rss" text="The Foundry" title="The Foundry" xmlUrl="http://blog.heritage.org/feed/" htmlUrl="http://dailysignal.com/"/>
39 |         </outline>
40 |         <outline text="tech" title="tech">
41 |             <outline type="rss" text="Ars Technica" title="Ars Technica" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="http://arstechnica.com"/>
42 |             <outline type="rss" text="Gizmodo" title="Gizmodo" xmlUrl="http://feeds.gawker.com/gizmodo/full" htmlUrl="http://gizmodo.com"/>
43 |             <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
44 |         </outline>
45 |         <outline text="cinema" title="cinema">
46 |             <outline type="rss" text="Film School Rejects" title="Film School Rejects" xmlUrl="http://feeds.feedburner.com/FilmSchoolRejects" htmlUrl="http://filmschoolrejects.com"/>
47 |             <outline type="rss" text="Hollywood Reporter" title="Hollywood Reporter" xmlUrl="http://feeds.feedburner.com/thr/news" htmlUrl="http://www.hollywoodreporter.com"/>
48 |             <outline type="rss" text="ScreenCrave.com" title="ScreenCrave.com" xmlUrl="http://screencrave.com/feed/" htmlUrl="http://screencrave.com"/>
49 |         </outline>
50 |         <outline text="books" title="books">
51 |             <outline type="rss" text="The Rumpus.net" title="The Rumpus.net" xmlUrl="http://therumpus.net/feed/" htmlUrl="http://therumpus.net"/>
52 |             <outline type="rss" text="Granta Magazine" title="Granta Magazine" xmlUrl="http://www.granta.com/Online-Only/rss.xml" htmlUrl="http://granta.com"/>
53 |             <outline type="rss" text="The Millions" title="The Millions" xmlUrl="http://feeds.feedburner.com/themillionsblog/fedw" htmlUrl="http://www.themillions.com"/>
54 |         </outline>
55 |         <outline text="sports" title="sports">
56 |             <outline type="rss" text="ESPN.com" title="ESPN.com" xmlUrl="http://sports.espn.go.com/espn/rss/news" htmlUrl="http://espn.go.com"/>
57 |             <outline type="rss" text="NBA.com: News" title="NBA.com: News" xmlUrl="http://www.nba.com/rss/nba_rss.xml" htmlUrl="http://www.nba.com/news"/>
58 |             <outline type="rss" text="NFL.com" title="NFL.com" xmlUrl="http://www.nfl.com/rss/rsslanding?searchString=home" htmlUrl="http://www.nfl.com/rss/rsslanding"/>
59 |         </outline>
60 |         <outline text="cooking" title="cooking">
61 |             <outline type="rss" text="Love and Olive Oil" title="Love and Olive Oil" xmlUrl="http://www.loveandoliveoil.com/feed" htmlUrl="http://www.loveandoliveoil.com"/>
62 |             <outline type="rss" text="Cookie and Kate" title="Cookie and Kate" xmlUrl="http://feeds.feedburner.com/CookieAndKate" htmlUrl="http://cookieandkate.com"/>
63 |             <outline type="rss" text="Baking Bites" title="Baking Bites" xmlUrl="http://bakingbites.com/feed/" htmlUrl="http://bakingbites.com"/>
64 |             <outline type="rss" text="Serious Eats" title="Serious Eats" xmlUrl="http://feeds.seriouseats.com/seriouseatsfeaturesvideos" htmlUrl="http://www.seriouseats.com/"/>
65 |         </outline>
66 |         <outline text="design" title="design">
67 |             <outline type="rss" text="Cool Hunting" title="Cool Hunting" xmlUrl="http://www.coolhunting.com/atom.xml" htmlUrl="http://www.coolhunting.com"/>
68 |             <outline type="rss" text="Yatzer" title="Yatzer" xmlUrl="http://www.yatzer.com/feed/index.php" htmlUrl="http://www.yatzer.com/frontpage"/>
69 |         </outline>
70 |     </body>
71 | </opml>
72 | 


--------------------------------------------------------------------------------
/tests/fixtures/feedparser_result.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/tests/fixtures/feedparser_result.pickle


--------------------------------------------------------------------------------
/tests/test_export.py:
--------------------------------------------------------------------------------
 1 | # tests.test_export
 2 | # Test the export module - to generate a corpus for machine learning.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Sun Feb 21 15:49:18 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: test_export.py [2988c53] benjamin@bengfort.com $
11 | 
12 | """
13 | Test the export module - to generate a corpus for machine learning.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import unittest
21 | 
22 | try:
23 |     from unittest import mock
24 | except ImportError:
25 |     import mock
26 | 
27 | from baleen.export import *
28 | from baleen.exceptions import ExportError
29 | 
30 | 
31 | ##########################################################################
32 | ## Export Tests
33 | ##########################################################################
34 | 
class ExportTests(unittest.TestCase):
    """
    Checks how MongoExporter validates its `scheme` argument.
    """

    def test_scheme_specification(self):
        """
        Assert that only known schemes are allowed.
        """
        root = "/tmp/corpus"

        # Every scheme listed in SCHEMES must be accepted by the constructor
        for known in SCHEMES:
            try:
                MongoExporter(root, scheme=known)
            except ExportError:
                self.fail("Could not use expected scheme, {}".format(known))

        # These schemes are expected to be rejected with an ExportError
        rejected = ('text', 'txt', 'bson', 'xml', 'yaml')
        for unknown in rejected:
            with self.assertRaises(ExportError):
                MongoExporter(root, scheme=unknown)
53 | 


--------------------------------------------------------------------------------
/tests/test_feed.py:
--------------------------------------------------------------------------------
  1 | # tests.test_feed
  2 | # Test the feed module - the main entry point to Baleen
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Sun Feb 21 15:49:18 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_feed.py [2988c53] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Test the feed module - the main entry point to Baleen
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import os
 21 | import pickle
 22 | import unittest
 23 | 
 24 | from mongomock import MongoClient as MockMongoClient
 25 | 
 26 | try:
 27 |     from unittest import mock
 28 | except ImportError:
 29 |     import mock
 30 | 
 31 | from baleen.feed import *
 32 | from baleen.models import *
 33 | from urlparse import urlparse
 34 | from baleen.exceptions import FeedTypeError
 35 | 
 36 | ##########################################################################
 37 | ## Fixtures
 38 | ##########################################################################
 39 | 
 40 | FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")
 41 | FEEDLY   = os.path.join(FIXTURES, "feedly.opml")
 42 | RESULT   = os.path.join(FIXTURES, "feedparser_result.pickle")
 43 | 
 44 | # Feed Fixtures
 45 | STR_FEED     = 'http://freakonomics.blogs.nytimes.com/feed/'
 46 | UNICODE_FEED = u'http://blog.kaggle.com/feed/'
 47 | OPML_FEED    = {
 48 |     "type":"rss", "text":"The Daily Notebook", "title":"The Daily Notebook",
 49 |     "xmlUrl":"https://mubi.com/notebook/posts.atom", "htmlUrl":"https://mubi.com/notebook/posts",
 50 | }
 51 | MONGO_FEED   = Feed(
 52 |     title = u'The Rumpus.net',
 53 |     link = u'http://therumpus.net/feed/',
 54 |     urls = {u'htmlurl': u'http://therumpus.net'},
 55 |     category = u'books',
 56 | )
 57 | 
 58 | ##########################################################################
 59 | ## Feed Synchronization Tests
 60 | ##########################################################################
 61 | 
 62 | class FeedSyncTests(unittest.TestCase):
 63 | 
 64 |     def setUp(self):
 65 |         """
 66 |         Create the mongomock connection
 67 |         """
 68 |         self.conn = connect(host='mongomock://localhost')
 69 |         assert isinstance(self.conn, MockMongoClient)
 70 | 
 71 |         # Clear out the database
 72 |         for feed in Feed.objects(): feed.delete()
 73 |         for post in Post.objects(): post.delete()
 74 | 
 75 |     def tearDown(self):
 76 |         """
 77 |         Drop the mongomock connection
 78 |         """
 79 |         assert isinstance(self.conn, MockMongoClient)
 80 |         self.conn = None
 81 | 
 82 |     def test_fsync_factory(self):
 83 |         """
 84 |         Test multiple types in the feed sync factory
 85 |         """
 86 |         cases = (
 87 |             STR_FEED, UNICODE_FEED, OPML_FEED, MONGO_FEED
 88 |         )
 89 | 
 90 |         for fsync in FeedSync.factory(cases):
 91 |             self.assertIsInstance(fsync, FeedSync)
 92 | 
 93 |     def test_type_check(self):
 94 |         """
 95 |         Assert that strings, Feeds, and dicts can be sync'd
 96 |         """
 97 |         cases = (
 98 |             (STR_FEED, FeedSync.URL),
 99 |             (UNICODE_FEED, FeedSync.URL),
100 |             (OPML_FEED, FeedSync.DICT),
101 |             (MONGO_FEED, FeedSync.MODEL),
102 |         )
103 | 
104 |         for feed, ftype in cases:
105 |             fsync = FeedSync(feed)
106 |             self.assertEqual(fsync.type, ftype)
107 | 
108 |     def test_bad_type(self):
109 |         """
110 |         Test that bad types raise an exception in sync
111 |         """
112 |         cases = (
113 |             10, {u'htmlurl': u'https://mubi.com/notebook/posts'}, ['a','b','c']
114 |         )
115 | 
116 |         for case in cases:
117 |             fsync = FeedSync(case)
118 |             with self.assertRaises(FeedTypeError):
119 |                 fsync.type
120 | 
121 |     def test_url_extraction(self):
122 |         """
123 |         Test the feed sync multiple type url extraction
124 |         """
125 |         cases = (
126 |             (STR_FEED, STR_FEED),
127 |             (UNICODE_FEED, UNICODE_FEED),
128 |             (OPML_FEED, OPML_FEED['xmlUrl']),
129 |             (MONGO_FEED, MONGO_FEED.link),
130 |         )
131 | 
132 |         for feed, url in cases:
133 |             fsync = FeedSync(feed)
134 |             self.assertEqual(fsync.url, url)
135 | 
136 |     @mock.patch('baleen.feed.feedparser.parse')
137 |     def test_feedparser_wrapping(self, mock_feedparser):
138 |         """
139 |         Test the feedparser access by mocking feedparser calls
140 |         """
141 | 
142 |         # Ensure that the mocking worked out for us
143 |         assert mock_feedparser is feedparser.parse
144 | 
145 |         cases = (
146 |             (STR_FEED, STR_FEED),
147 |             (UNICODE_FEED, UNICODE_FEED),
148 |             (OPML_FEED, OPML_FEED['xmlUrl']),
149 |             (MONGO_FEED, MONGO_FEED.link),
150 |         )
151 | 
152 |         for feed, url in cases:
153 |             fsync  = FeedSync(feed)
154 |             result = fsync.parse()
155 |             mock_feedparser.assert_called_with(url)
156 | 
157 |     @mock.patch('baleen.feed.feedparser.parse')
158 |     def test_feedparser_wrapping(self, mock_feedparser):
159 |         """
160 |         Test etag and modified blocking on feedparser for Feed objects
161 |         """
162 | 
163 |         # Ensure that the mocking worked out for us
164 |         assert mock_feedparser is feedparser.parse
165 | 
166 |         feed = Feed(link = u'https://mubi.com/notebook/posts.atom')
167 |         feed.etag = 'abcdefg'
168 | 
169 |         # Test Case 1: etag but no modified
170 |         result = FeedSync(feed).parse()
171 |         mock_feedparser.assert_called_with(feed.link, etag=feed.etag)
172 | 
173 |         # Test Case 2: modified but no etag
174 |         feed.etag = None
175 |         feed.modified = "Fri, 11 Jun 2012 23:00:34 GMT"
176 |         result = FeedSync(feed).parse()
177 |         mock_feedparser.assert_called_with(feed.link, modified=feed.modified)
178 | 
179 |         # Test Case 3: modified and etag
180 |         feed.etag = 'hijklmnop'
181 |         result = FeedSync(feed).parse()
182 |         mock_feedparser.assert_called_with(feed.link, etag=feed.etag)
183 | 
184 |     @mock.patch('baleen.feed.feedparser.parse')
185 |     def test_feed_sync(self, mock_feedparser):
186 |         """
187 |         Test that sync updates the Feed object
188 |         """
189 |         # Ensure that the mocking worked out for us
190 |         assert mock_feedparser is feedparser.parse
191 | 
192 |         # Give the mock feedparser a result!
193 |         with open(RESULT, 'rb') as f:
194 |             mock_feedparser.return_value = pickle.load(f)
195 | 
196 |         fsync  = FeedSync(MONGO_FEED)
197 |         result = fsync.sync()
198 | 
199 |         # Fetch the feed from the database.
200 |         self.assertEqual(Feed.objects.count(), 1)
201 |         feed = Feed.objects.first()
202 | 
203 |         # Ensure that the various properties have been set.
204 |         self.assertEqual(feed.etag, u'W/"29e84abdc28e3fa87709d1f309b7c214-gzip"')
205 |         self.assertEqual(feed.modified, u'Wed, 02 Mar 2016 22:00:06 GMT')
206 |         self.assertEqual(feed.version, u'rss20')
207 |         self.assertEqual(feed.link, MONGO_FEED.link)
208 |         self.assertIsNotNone(feed.fetched)
209 | 
210 |     @mock.patch('baleen.feed.feedparser.parse')
211 |     def test_feed_sync_mongodb(self, mock_feedparser):
212 |         """
213 |         Test the sync MongoDB interaction
214 |         """
215 |         # Ensure that the mocking worked out for us
216 |         assert mock_feedparser is feedparser.parse
217 | 
218 |         # Give the mock feedparser a result!
219 |         with open(RESULT, 'rb') as f:
220 |             mock_feedparser.return_value = pickle.load(f)
221 | 
222 |         fsync  = FeedSync(MONGO_FEED)
223 | 
224 |         # Test sync without save
225 |         result = fsync.sync(save=False)
226 |         self.assertEqual(Feed.objects.count(), 0)
227 | 
228 |         # Test sync with save
229 |         result = fsync.sync()
230 |         self.assertEqual(Feed.objects.count(), 1)
231 | 
232 |     @mock.patch('baleen.feed.feedparser.parse')
233 |     def test_feed_sync_non_model(self, mock_feedparser):
234 |         """
235 |         Test the sync with a non-model feed.
236 |         """
237 |         # Ensure that the mocking worked out for us
238 |         assert mock_feedparser is feedparser.parse
239 | 
240 |         # Give the mock feedparser a result!
241 |         with open(RESULT, 'rb') as f:
242 |             mock_feedparser.return_value = pickle.load(f)
243 | 
244 |         fsync  = FeedSync(OPML_FEED)
245 | 
246 |         # Test sync without save
247 |         result = fsync.sync()
248 |         self.assertEqual(Feed.objects.count(), 0)
249 | 


--------------------------------------------------------------------------------
/tests/test_ingest.py:
--------------------------------------------------------------------------------
  1 | # tests.test_ingest
  2 | # Test the ingestor mechanism in an integration fashion.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Thu Mar 03 13:01:12 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_ingest.py [df0c71b] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Test the ingestor mechanism in an integration fashion.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import unittest
 21 | 
 22 | from .test_models import MongoTestMixin
 23 | 
 24 | try:
 25 |     from unittest import mock
 26 | except ImportError:
 27 |     import mock
 28 | 
 29 | import baleen.models as db
 30 | 
 31 | from baleen.ingest import stype
 32 | from baleen.ingest import Ingestor
 33 | from baleen.ingest import MongoIngestor
 34 | from baleen.ingest import OPMLIngestor
 35 | from baleen.utils.decorators import reraise
 36 | from baleen.exceptions import *
 37 | from baleen.utils.logger import IngestLogger
 38 | 
 39 | 
 40 | ##########################################################################
 41 | ## Helper Functions
 42 | ##########################################################################
 43 | 
 44 | ACTION_METHODS = ('started', 'finished', 'failed', 'process')
 45 | 
 46 | def get_ingest_mock(klass=Ingestor):
 47 |     """
 48 |     Mocks all functions of the ingestor that are called in ingest.
 49 |     This means there should be NO side effects when ingest is called.
 50 |     """
 51 |     # Verify and create ingestor class
 52 |     ingestor = klass()
 53 |     verify_ingest_mock(ingestor)
 54 | 
 55 |     # Remove action methods
 56 |     for method in ACTION_METHODS:
 57 |         setattr(ingestor, method, mock.MagicMock())
 58 | 
 59 |     return ingestor
 60 | 
 61 | 
 62 | def verify_ingest_mock(ingestor):
 63 |     """
 64 |     Ensures that no methods other than action methods are called
 65 |     """
 66 |     ingestor = mock.create_autospec(ingestor, instance=True)
 67 |     reset_mock_method(ingestor, 'ingest')
 68 |     ingestor.ingest()
 69 | 
 70 |     for method in ingestor._mock_methods:
 71 |         action = getattr(ingestor, method)
 72 |         if method not in ACTION_METHODS:
 73 |             if hasattr(action, 'assert_not_called'):
 74 |                 action.assert_not_called()
 75 | 
 76 | 
 77 | def reset_mock_method(obj, method):
 78 |     """
 79 |     Resets a mock object's method to the orignal
 80 |     """
 81 |     klass  = obj.__class__
 82 |     action = getattr(klass, method)
 83 | 
 84 |     setattr(obj, method, action.__get__(obj, klass))
 85 |     return obj
 86 | 
 87 | 
 88 | ##########################################################################
 89 | ## Test Ingestor
 90 | ##########################################################################
 91 | 
 92 | class IngestorTests(MongoTestMixin, unittest.TestCase):
 93 | 
 94 |     def test_stype_helper(self):
 95 |         """
 96 |         Test the stype helper function
 97 |         """
 98 |         self.assertEqual(stype(BaleenError("Bad things!")), BaleenError.__name__)
 99 | 
100 |     def test_stype_embed_helper(self):
101 |         """
102 |         Test stype on reraises decorators.
103 |         """
104 | 
105 |         @reraise(BaleenError)
106 |         def badfunc():
107 |             raise TypeError("This is clearly the wrong type!")
108 | 
109 |         try:
110 |             badfunc()
111 |         except BaleenError as e:
112 |             self.assertEqual(stype(e), "BaleenError (TypeError)")
113 | 
114 |     def test_ingestor_hooks(self):
115 |         """
116 |         Test the started and finished ingestor hooks
117 |         """
118 | 
119 |         # Create Ingestor and call the entry point method
120 |         ingestor = get_ingest_mock()
121 |         ingestor.ingest()
122 | 
123 |         # Assert that started and finished were called, and failed wasn't.
124 |         ingestor.started.assert_called_once_with()
125 |         ingestor.finished.assert_called_once_with()
126 |         ingestor.failed.assert_not_called()
127 | 
128 |     def test_ingestor_failed_hook(self):
129 |         """
130 |         Test the started and failed ingestor hooks
131 |         """
132 | 
133 |         ingestor = get_ingest_mock()
134 |         ingestor.process.side_effect = Exception("Things went wrong!")
135 | 
136 |         # Call the entry point method
137 |         with self.assertRaises(Exception) as cm:
138 |             ingestor.ingest()
139 | 
140 |         # Assert that started and finished were called, and failed wasn't.
141 |         ingestor.started.assert_called_once_with()
142 |         ingestor.finished.assert_not_called()
143 |         ingestor.failed.assert_called_once_with(cm.exception)
144 | 
145 |     def test_ingestor_state(self):
146 |         """
147 |         Ensure that the ingestor state is correctly modified
148 |         """
149 |         ingestor = get_ingest_mock()
150 | 
151 |         self.assertIsNone(ingestor.jobid)
152 |         self.assertIsNone(ingestor.timer)
153 | 
154 |         ingestor.ingest()
155 | 
156 |         self.assertIsNotNone(ingestor.jobid)
157 |         self.assertIsNotNone(ingestor.timer)
158 | 


--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
  1 | # tests.test_models
  2 | # Testing for the mongoengine models (basic stuff).
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 21:11:08 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_models.py [2930b9d] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Testing for the mongoengine models (basic stuff).
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import unittest
 21 | import mongoengine as me
 22 | 
 23 | from mongomock import MongoClient as MockMongoClient
 24 | 
 25 | try:
 26 |     from unittest import mock
 27 | except ImportError:
 28 |     import mock
 29 | 
 30 | from baleen.models import *
 31 | 
 32 | 
 33 | ##########################################################################
 34 | ## Mongo Test Mixin
 35 | ##########################################################################
 36 | 
 37 | class MongoTestMixin(object):
 38 | 
 39 |     def setUp(self):
 40 |         """
 41 |         Create the mongomock connection
 42 |         """
 43 |         self.conn = connect(host='mongomock://localhost')
 44 |         assert isinstance(self.conn, MockMongoClient)
 45 | 
 46 |         # Clear out the database
 47 |         for feed in Feed.objects(): feed.delete()
 48 |         for post in Post.objects(): post.delete()
 49 | 
 50 |     def tearDown(self):
 51 |         """
 52 |         Drop the mongomock connection
 53 |         """
 54 |         assert isinstance(self.conn, MockMongoClient)
 55 |         self.conn = None
 56 | 
 57 |     def assertDateTimeEqual(self, dta, dtb):
 58 |         """
 59 |         Assert that two datetimes are within 1 second of each other
 60 |         """
 61 |         dta = dta.replace(microsecond=0)
 62 |         dtb = dta.replace(microsecond=0)
 63 | 
 64 |         if dta.second != dtb.second:
 65 |             self.assertLessThanEqual(
 66 |                 abs(dta.second - dtb.second), 1, "datetimes are not one second apart!"
 67 |             )
 68 |             dta = dta.replace(second=0)
 69 |             dtb = dtb.replace(second=0)
 70 | 
 71 |         self.assertEqual(dta, dtb)
 72 | 
 73 | ##########################################################################
 74 | ## Feed Model Tests
 75 | ##########################################################################
 76 | 
 77 | class FeedModelTests(MongoTestMixin, unittest.TestCase):
 78 | 
 79 |     def test_link_requred(self):
 80 |         """
 81 |         Assert that the feed link is required
 82 |         """
 83 |         feed = Feed(title="My Awesome Feed", category="socks")
 84 |         with self.assertRaises(me.ValidationError):
 85 |             feed.save()
 86 | 
 87 |     def test_created_updated(self):
 88 |         """
 89 |         Ensure the feed updated timestamp is tracked
 90 |         """
 91 |         feed = Feed(title="A News Feed", category="news", link="https://example.com/feed.atom")
 92 |         feed.save()
 93 | 
 94 |         self.assertIsNotNone(feed.created)
 95 |         self.assertIsNotNone(feed.updated)
 96 |         self.assertDateTimeEqual(feed.created, feed.updated)
 97 | 
 98 |         feed.title = "An Olds Feed"
 99 |         feed.save()
100 |         self.assertNotEqual(feed.created, feed.updated)
101 | 
102 |     def test_properties(self):
103 |         """
104 |         Test the properties of the feed model
105 |         """
106 |         feed = Feed(title="A News Feed", category="news", link="https://example.com/feed.atom")
107 |         feed.save()
108 | 
109 |         self.assertEqual(feed.xmlurl, feed.link)
110 |         self.assertIsNone(feed.htmlurl)
111 | 
112 |         feed.urls = {'htmlUrl': 'https://example.com/'}
113 |         feed.save()
114 | 
115 |         self.assertEqual(feed.htmlurl, 'https://example.com/')
116 | 
117 |     def test_stringify(self):
118 |         """
119 |         Test the stringification of a feed
120 |         """
121 |         feed = Feed(category="news", link="https://example.com/feed.atom")
122 |         feed.save()
123 | 
124 |         self.assertEqual(str(feed), feed.link)
125 | 
126 |         feed.title = "A News Feed"
127 |         feed.save()
128 | 
129 |         self.assertEqual(str(feed), feed.title)
130 | 
131 | 
132 | ##########################################################################
133 | ## Post Model Tests
134 | ##########################################################################
135 | 
class PostModelTests(MongoTestMixin, unittest.TestCase):
    """
    Basic checks of the Post model: validation, timestamps, content
    signature and string conversion.
    """

    def test_url_requred(self):
        """
        Assert that the post url is required
        """
        urlless = Post(title="My Awesome Post", content="socks")
        self.assertRaises(me.ValidationError, urlless.save)

    def test_created_updated(self):
        """
        Ensure the post updated timestamp is tracked
        """
        doc = Post(
            title="My Awesome Post",
            content="socks",
            url="http://example.com/socks.html",
        )
        doc.save()

        # Both timestamps exist after the first save and are (nearly) equal
        for stamp in (doc.created, doc.updated):
            self.assertIsNotNone(stamp)
        self.assertDateTimeEqual(doc.created, doc.updated)

        # A second save is expected to move ``updated`` away from ``created``
        doc.title = "My even more awesome Post!"
        doc.save()
        self.assertNotEqual(doc.created, doc.updated)

    def test_content_hashing(self):
        """
        Test the automatic hashing of content
        """
        expected = '54f6d9fbe8ee576f82d6eb7e4d1d55691a1f0b7bd956246d3de56ee84bd1d333'

        doc = Post(content="socks", url="http://example.com/socks.html")
        self.assertIsNone(doc.signature)

        # The signature is only present once the post has been saved
        doc.save()
        self.assertIsNotNone(doc.signature)
        self.assertEqual(doc.signature, expected)

    def test_stringify(self):
        """
        Test the stringification of a post
        """
        doc = Post(
            content="socks",
            signature="abc",
            url="http://example.com/socks.html",
        )
        doc.save()

        # Without a title the url is the string form
        self.assertEqual(str(doc), doc.url)

        doc.title = "My Awesome Post"
        doc.save()

        # With a title the title is the string form
        self.assertEqual(str(doc), doc.title)
185 | 


--------------------------------------------------------------------------------
/tests/test_opml.py:
--------------------------------------------------------------------------------
  1 | # tests.test_opml
  2 | # Testing for the OPML reader and ingestion function.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Fri Feb 19 08:50:19 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_opml.py [a0d0da3] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Testing for the OPML reader and ingestion function.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import os
 21 | import unittest
 22 | 
 23 | from .test_models import MongoTestMixin
 24 | 
 25 | try:
 26 |     from unittest import mock
 27 | except ImportError:
 28 |     import mock
 29 | 
 30 | from baleen.opml import OPML, load_opml
 31 | from baleen.models import Feed
 32 | 
 33 | ##########################################################################
 34 | ## Fixtures
 35 | ##########################################################################
 36 | 
 37 | FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")
 38 | FEEDLY   = os.path.join(FIXTURES, "feedly.opml")
 39 | 
 40 | ##########################################################################
 41 | ## Test Load OPML command
 42 | ##########################################################################
 43 | 
 44 | class LoadOPMLTests(MongoTestMixin, unittest.TestCase):
 45 | 
 46 |     def test_load_opml_integrated(self):
 47 |         """
 48 |         Test the integration of the ingest helper function
 49 |         """
 50 |         self.assertEqual(Feed.objects.count(), 0)
 51 |         self.assertEqual(load_opml(FEEDLY), 36)
 52 |         self.assertEqual(Feed.objects.count(), 36)
 53 | 
 54 |         for feed in Feed.objects():
 55 |             self.assertIn('xmlUrl', feed.urls)
 56 |             self.assertIn('htmlUrl', feed.urls)
 57 | 
 58 |     def test_load_opml_no_duplicates(self):
 59 |         """
 60 |         Assert multiple calls to the load_opml creates no duplicates
 61 |         """
 62 |         self.assertEqual(Feed.objects.count(), 0)
 63 |         self.assertEqual(load_opml(FEEDLY), 36)
 64 |         self.assertEqual(Feed.objects.count(), 36)
 65 | 
 66 |         for _ in xrange(10):
 67 |             self.assertEqual(load_opml(FEEDLY), 0)
 68 |             self.assertEqual(Feed.objects.count(), 36)
 69 | 
 70 | ##########################################################################
 71 | ## OPML Reader Test
 72 | ##########################################################################
 73 | 
 74 | class OPMLTests(unittest.TestCase):
 75 | 
 76 |     def test_fixture(self):
 77 |         """
 78 |         Assert the required opml fixture is available
 79 |         """
 80 |         self.assertTrue(os.path.exists(FEEDLY))
 81 |         self.assertTrue(os.path.isfile(FEEDLY))
 82 | 
 83 |     def test_categories(self):
 84 |         """
 85 |         Test the OPML categories listing
 86 |         """
 87 |         opml = OPML(FEEDLY)
 88 |         expected = [
 89 |             u'news',
 90 |             u'do it yourself',
 91 |             u'business',
 92 |             u'gaming',
 93 |             u'data science',
 94 |             u'essays',
 95 |             u'politics',
 96 |             u'tech',
 97 |             u'cinema',
 98 |             u'books',
 99 |             u'sports',
100 |             u'cooking',
101 |             u'design'
102 |         ]
103 | 
104 |         print list(opml.categories())
105 | 
106 |         self.assertEqual(list(opml.categories()), expected)
107 | 
108 |     def test_length(self):
109 |         """
110 |         Test the OPML len built in
111 |         """
112 |         opml = OPML(FEEDLY)
113 |         self.assertEqual(len(opml), 36)
114 | 
115 |     def test_counts(self):
116 |         """
117 |         Test the OPML category counter and item iterator
118 |         """
119 |         opml = OPML(FEEDLY)
120 |         expected = {
121 |             'cooking': 4,
122 |             'cinema': 3,
123 |             'gaming': 3,
124 |             'tech': 3,
125 |             'essays': 2,
126 |             'business': 3,
127 |             'design': 2,
128 |             'sports': 3,
129 |             'books': 3,
130 |             'data science': 4,
131 |             'do it yourself': 2,
132 |             'news': 2,
133 |             'politics': 2,
134 |         }
135 |         counts = opml.counts()
136 | 
137 |         for key, val in expected.items():
138 |             self.assertIn(key, counts)
139 |             self.assertEqual(
140 |                 counts[key], val,
141 |                 "{} mismatch: {} vs {}".format(key, counts[key], val)
142 |             )
143 | 
144 |     def test_item_iterator_detail(self):
145 |         """
146 |         Test the XML result returned from OPML iteration
147 |         """
148 | 
149 |         opml  = OPML(FEEDLY)
150 |         attrs = ['category', 'title', 'text', 'htmlUrl', 'xmlUrl', 'type']
151 |         for item in opml:
152 |             self.assertTrue(isinstance(item, dict))
153 |             self.assertEqual(item.keys(), attrs)
154 | 


--------------------------------------------------------------------------------
/tests/test_wrangle.py:
--------------------------------------------------------------------------------
  1 | # tests.test_wrangle
  2 | # Test the post wrangling module and functionality.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 22:38:08 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_wrangle.py [568d540] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Test the post wrangling module and functionality.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import os
 21 | import pickle
 22 | import unittest
 23 | 
 24 | from .test_models import MongoTestMixin
 25 | 
 26 | try:
 27 |     from unittest import mock
 28 | except ImportError:
 29 |     import mock
 30 | 
 31 | from baleen.wrangle import *
 32 | from baleen.exceptions import *
 33 | from baleen.models import Feed, Post
 34 | 
 35 | ##########################################################################
 36 | ## Fixtures
 37 | ##########################################################################
 38 | 
 39 | FIXTURES  = os.path.join(os.path.dirname(__file__), "fixtures")
 40 | RESULT    = os.path.join(FIXTURES, "feedparser_result.pickle")
 41 | FEED      = Feed(
 42 |     title = u'The Rumpus.net',
 43 |     link  = u'http://therumpus.net/feed/',
 44 |     urls  = {u'htmlurl': u'http://therumpus.net'}, category = u'books',
 45 | )
 46 | 
 47 | 
 48 | def mocked_requests_get(*args, **kwargs):
 49 |     class MockResponse:
 50 |         def __init__(self, text, status_code):
 51 |             self.text = text
 52 |             self.status_code = status_code
 53 | 
 54 |         def raise_for_status(self):
 55 |             if self.status_code != 200:
 56 |                 raise Exception("HTTP {}".format(self.status_code))
 57 | 
 58 |     text = "Luke, I am your father!"
 59 | 
 60 |     if args[0] == 'http://example.com/vader/':
 61 |         return MockResponse(text, 200)
 62 | 
 63 |     return MockResponse("??", 404)
 64 | 
 65 | ##########################################################################
 66 | ## Test Wrangling Posts
 67 | ##########################################################################
 68 | 
 69 | class PostWranglerTests(MongoTestMixin, unittest.TestCase):
 70 | 
 71 |     def setUp(self):
 72 |         super(PostWranglerTests, self).setUp()
 73 |         self.feed = FEED
 74 |         self.feed.save()
 75 | 
 76 |         with open(RESULT, 'rb') as f:
 77 |             self.entries = pickle.load(f).entries
 78 | 
 79 |     def test_wrangle_factory(self):
 80 |         """
 81 |         Test multiple types in the feed sync factory
 82 |         """
 83 | 
 84 |         for wrangle in PostWrangler.factory(self.entries, feed=self.feed):
 85 |             self.assertIsInstance(wrangle, PostWrangler)
 86 | 
 87 |     def test_wrangle_integration(self):
 88 |         """
 89 |         Test wrangling of all entries in the result.
 90 |         """
 91 |         self.assertEqual(Post.objects.count(), 0)
 92 |         for wrangle in PostWrangler.factory(self.entries, feed=self.feed):
 93 |             wrangle.wrangle()
 94 |             wrangle.wrangle() # Make sure that double wrangle does nothing.
 95 | 
 96 |         self.assertEqual(Post.objects.count(), 10)
 97 | 
 98 |         # Ensure there are no duplicates
 99 |         for wrangle in PostWrangler.factory(self.entries, feed=self.feed):
100 |             with self.assertRaises(WranglingError) as cm:
101 |                 wrangle.wrangle()
102 |             self.assertEqual(Post.objects.count(), 10)
103 | 
104 |     def test_is_wrangled(self):
105 |         """
106 |         Test the wrangling detection
107 |         """
108 |         wrangle = PostWrangler(self.entries[0])
109 |         self.assertFalse(wrangle.is_wrangled())
110 |         wrangle.wrangle()
111 |         self.assertTrue(wrangle.is_wrangled())
112 | 
113 |     def test_save_not_save(self):
114 |         """
115 |         Test the wrangle interaction with the database
116 |         """
117 |         self.assertEqual(Post.objects.count(), 0)
118 |         wrangle = PostWrangler(self.entries[0])
119 | 
120 |         # Don't save the wrangle
121 |         wrangle.wrangle(False)
122 |         self.assertEqual(Post.objects.count(), 0)
123 | 
124 |         # We've already wrangled so nothing should happen!
125 |         wrangle.wrangle()
126 |         self.assertEqual(Post.objects.count(), 0)
127 | 
128 |         # Try making something happen directly
129 |         wrangle.wrangle().save()
130 |         self.assertEqual(Post.objects.count(), 1)
131 | 
132 |         # Toss in something else entirely
133 |         wrangle = PostWrangler(self.entries[1])
134 |         wrangle.wrangle()
135 |         self.assertEqual(Post.objects.count(), 2)
136 | 
137 |     def test_feed_or_not(self):
138 |         """
139 |         Test can be saved with or without a feed
140 |         """
141 |         withfeed = PostWrangler(self.entries[0], feed=self.feed)
142 |         nofeed   = PostWrangler(self.entries[1])
143 | 
144 |         post = withfeed.wrangle()
145 |         self.assertEqual(post.feed, self.feed)
146 | 
147 |         post = nofeed.wrangle()
148 |         self.assertIsNone(post.feed)
149 | 
150 |     @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
151 |     def test_fetch_not_wrangled(self, mock_requests):
152 |         """
153 |         Assert that fetch requires wrangling
154 |         """
155 |         assert mock_requests is requests.get
156 | 
157 |         wrangle = PostWrangler(self.entries[0], feed=self.feed)
158 |         with self.assertRaises(FetchError):
159 |             wrangle.fetch()
160 | 
161 |     @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
162 |     def test_fetch_overwrites_content(self, mock_requests):
163 |         """
164 |         Test that the fetch overwrites content.
165 |         """
166 |         assert mock_requests is requests.get
167 | 
168 |         wrangle = PostWrangler(self.entries[0], feed=self.feed)
169 |         wrangle.wrangle()
170 |         self.assertEqual(Post.objects.count(), 1)
171 | 
172 |         wrangle.post.url = 'http://example.com/vader/'
173 |         post = wrangle.fetch()
174 |         self.assertEqual(Post.objects.count(), 1)
175 |         self.assertNotEqual(post.created, post.updated)
176 | 
177 |         self.assertEqual(post.content, "Luke, I am your father!")
178 | 
179 |     @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
180 |     def test_fetch_no_save(self, mock_requests):
181 |         """
182 |         Test that the fetch does not save on demand.
183 |         """
184 |         assert mock_requests is requests.get
185 | 
186 |         wrangle = PostWrangler(self.entries[0], feed=self.feed)
187 |         wrangle.wrangle()
188 |         self.assertEqual(Post.objects.count(), 1)
189 | 
190 |         wrangle.post.url = 'http://example.com/vader/'
191 |         wrangle.fetch(save=False)
192 |         self.assertEqual(Post.objects.count(), 1)
193 | 
194 |         post = Post.objects.first()
195 |         self.assertDateTimeEqual(post.created, post.updated)
196 |         self.assertNotEqual(post.content, "Luke, I am your father!")
197 | 
198 |     @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
199 |     def test_fetch_raises_404(self, mock_requests):
200 |         """
201 |         Test that fetch raises exception on HTTP error
202 |         """
203 |         assert mock_requests is requests.get
204 | 
205 |         wrangle = PostWrangler(self.entries[0], feed=self.feed)
206 |         wrangle.wrangle()
207 |         self.assertEqual(Post.objects.count(), 1)
208 | 
209 |         with self.assertRaises(FetchError):
210 |             wrangle.post.url = 'http://example.com/obiwan/'
211 |             wrangle.fetch()
212 |         
213 | 


--------------------------------------------------------------------------------
/tests/utils_tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # tests.utils_tests
 2 | # Tests for the Baleen utilities package.
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Sun Feb 21 15:31:55 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: __init__.py [2988c53] benjamin@bengfort.com $
11 | 
12 | """
13 | Tests for the Baleen utilities package.
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 


--------------------------------------------------------------------------------
/tests/utils_tests/test_decorators.py:
--------------------------------------------------------------------------------
  1 | # tests.utils_tests.test_decorators
  2 | # Testing the decorators utility package.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Wed Mar 02 19:06:34 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_decorators.py [538b33d] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Testing the decorators utility package.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import time
 21 | import unittest
 22 | 
 23 | from baleen.utils.decorators import *
 24 | from baleen.utils.timez import Timer
 25 | from baleen.exceptions import *
 26 | 
 27 | try:
 28 |     from unittest import mock
 29 | except ImportError:
 30 |     import mock
 31 | 
 32 | 
 33 | ##########################################################################
 34 | ## Decorators Tests
 35 | ##########################################################################
 36 | 
 37 | class DecoratorsTests(unittest.TestCase):
 38 |     """
 39 |     Basic decorators utility tests.
 40 |     """
 41 | 
 42 |     def test_memoized(self):
 43 |         """
 44 |         Test the memoized property
 45 |         """
 46 | 
 47 |         class Thing(object):
 48 | 
 49 |             @memoized
 50 |             def attr(self):
 51 |                 return 42
 52 | 
 53 |         thing = Thing()
 54 |         self.assertFalse(hasattr(thing, '_attr'))
 55 |         self.assertEqual(thing.attr, 42)
 56 |         self.assertTrue(hasattr(thing, '_attr'))
 57 | 
 58 |     def test_timeit(self):
 59 |         """
 60 |         Test the timeit decorator
 61 |         """
 62 | 
 63 |         @timeit
 64 |         def myfunc():
 65 |             return 42
 66 | 
 67 |         output = myfunc()
 68 |         self.assertEqual(len(output), 2)
 69 |         result, timer = output
 70 |         self.assertEqual(result, 42)
 71 |         self.assertTrue(isinstance(timer, Timer))
 72 | 
 73 |     def test_reraise(self):
 74 |         """
 75 |         Test the reraise decorator
 76 |         """
 77 | 
 78 |         # Test 1: Regular old reraise
 79 | 
 80 |         @reraise()
 81 |         def alpha():
 82 |             raise Exception("Should be a BaleenError")
 83 | 
 84 |         with self.assertRaises(BaleenError) as cm:
 85 |             alpha()
 86 | 
 87 |         e = cm.exception
 88 |         self.assertEqual(str(e), "Should be a BaleenError")
 89 |         self.assertTrue(hasattr(e, "original"))
 90 |         self.assertIsInstance(e.original, Exception)
 91 |         self.assertEqual(str(e.original), "Should be a BaleenError")
 92 | 
 93 |     def test_reraise_message(self):
 94 |         """
 95 |         Test the reraise decorator with a message
 96 |         """
 97 | 
 98 |         # Test 2: Reraise with a new message
 99 | 
100 |         @reraise(message="I'm handling it!")
101 |         def bravo():
102 |             raise NotImplementedError("I'm not handling it!")
103 | 
104 |         with self.assertRaises(BaleenError) as cm:
105 |             bravo()
106 | 
107 |         e = cm.exception
108 |         self.assertEqual(str(e), "I'm handling it!")
109 |         self.assertTrue(hasattr(e, "original"))
110 |         self.assertIsInstance(e.original, NotImplementedError)
111 |         self.assertEqual(str(e.original), "I'm not handling it!")
112 | 
113 |     def test_reraise_arguments(self):
114 |         """
115 |         Test the reraise decorator with all possible arguments
116 |         """
117 | 
118 |         # Test 3: All possible arguments to reraise
119 | 
120 |         @reraise(klass=FeedTypeError, message="bad feed type", trap=TypeError)
121 |         def charlie():
122 |             raise TypeError("requires an integer")
123 | 
124 |         with self.assertRaises(FeedTypeError) as cm:
125 |             charlie()
126 | 
127 |         e = cm.exception
128 |         self.assertEqual(str(e), "bad feed type")
129 |         self.assertTrue(hasattr(e, "original"))
130 |         self.assertIsInstance(e.original, TypeError)
131 |         self.assertEqual(str(e.original), "requires an integer")
132 | 
133 |     def test_reraise_trap(self):
134 |         """
135 |         Test the reraise decorator by missing the trap
136 |         """
137 | 
138 |         # Test 4: Missing the trap
139 | 
140 |         @reraise(klass=FeedTypeError, message="bad feed type", trap=TypeError)
141 |         def delta():
142 |             raise ValueError("this should be the exception raised")
143 | 
144 |         with self.assertRaises(ValueError) as cm:
145 |             delta()
146 | 
147 |         e = cm.exception
148 |         self.assertEqual(str(e), "this should be the exception raised")
149 |         self.assertFalse(hasattr(e, "original"))
150 | 


--------------------------------------------------------------------------------
/tests/utils_tests/test_logger.py:
--------------------------------------------------------------------------------
  1 | # tests.utils_tests.test_logger
  2 | # Simple tests for the logger module.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Thu Mar 03 11:52:06 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_logger.py [df0c71b] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Simple tests for the logger module.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import getpass
 21 | import unittest
 22 | 
 23 | try:
 24 |     from unittest import mock
 25 | except ImportError:
 26 |     import mock
 27 | 
 28 | from baleen.utils.logger import *
 29 | 
 30 | 
 31 | ##########################################################################
 32 | ## Module Helpers
 33 | ##########################################################################
 34 | 
 35 | PREFIX = "TEST LOG"
 36 | IGNORE = "IGNORE: This should not be in a log file or database!"
 37 | 
 38 | def tmsgf(message, prefix=PREFIX, ignore=IGNORE):  # format a test log message
 39 |     return "{}: {} ({})".format(prefix, message, ignore)  # "<prefix>: <message> (<ignore>)"
 40 | 
 41 | ##########################################################################
 42 | ## Logger Test
 43 | ##########################################################################
 44 | 
 45 | class IngestLoggerTests(unittest.TestCase):
 46 |     """
 47 |     Simply exercises the methods of the logger.
 48 |     """
 49 | 
 50 |     @mock.patch('baleen.utils.logger.IngestLogger.logger')
 51 |     def test_log_extra(self, mock_logger):
 52 |         """
 53 |         Assert that extra (user) is passed to logger
 54 |         """
 55 | 
 56 |         logger  = IngestLogger()
 57 | 
 58 |         assert logger.logger is mock_logger
 59 | 
 60 |         message = tmsgf("Do not double space after a period!")
 61 |         logger.log(logging.DEBUG, message)
 62 | 
 63 |         mock_logger.log.assert_called_with(logging.DEBUG, message, extra={'user': getpass.getuser()})
 64 | 
 65 |     @mock.patch('baleen.utils.logger.IngestLogger.logger')
 66 |     def test_log_debug(self, mock_logger):
 67 |         """
 68 |         Test the debug logger
 69 |         """
 70 | 
 71 |         logger  = IngestLogger()
 72 | 
 73 |         assert logger.logger is mock_logger
 74 | 
 75 |         message = tmsgf("All CAPS is not shouting!")
 76 |         logger.debug(message)
 77 | 
 78 |         mock_logger.log.assert_called_with(logging.DEBUG, message, extra=mock.ANY)
 79 | 
 80 |     @mock.patch('baleen.utils.logger.IngestLogger.logger')
 81 |     def test_log_info(self, mock_logger):
 82 |         """
 83 |         Test the info logger
 84 |         """
 85 | 
 86 |         logger  = IngestLogger()
 87 | 
 88 |         assert logger.logger is mock_logger
 89 | 
 90 |         message = tmsgf("Birds and Bees Flock with Seas!")
 91 |         logger.info(message)
 92 | 
 93 |         mock_logger.log.assert_called_with(logging.INFO, message, extra=mock.ANY)
 94 | 
 95 |     @mock.patch('baleen.utils.logger.IngestLogger.logger')
 96 |     def test_log_warn(self, mock_logger):
 97 |         """
 98 |         Test the warn logger
 99 |         """
100 | 
101 |         logger  = IngestLogger()
102 | 
103 |         assert logger.logger is mock_logger
104 | 
105 |         message = tmsgf("You shouldn't touch that hot stove!")
106 |         logger.warn(message)
107 | 
108 |         mock_logger.log.assert_called_with(logging.WARNING, message, extra=mock.ANY)
109 | 
110 |     @mock.patch('baleen.utils.logger.IngestLogger.logger')
111 |     def test_log_error(self, mock_logger):
112 |         """
113 |         Test the error logger
114 |         """
115 | 
116 |         logger  = IngestLogger()
117 | 
118 |         assert logger.logger is mock_logger
119 | 
120 |         message = tmsgf("Someone let the rooster into the hen house!")
121 |         logger.error(message)
122 | 
123 |         mock_logger.log.assert_called_with(logging.ERROR, message, extra=mock.ANY)
124 | 
125 |     @mock.patch('baleen.utils.logger.IngestLogger.logger')
126 |     def test_log_critical(self, mock_logger):
127 |         """
128 |         Test the critical logger
129 |         """
130 | 
131 |         logger  = IngestLogger()
132 | 
133 |         assert logger.logger is mock_logger
134 | 
135 |         message = tmsgf("Someone let the fox into the hen house!")
136 |         logger.critical(message)
137 | 
138 |         mock_logger.log.assert_called_with(logging.CRITICAL, message, extra=mock.ANY)
139 | 


--------------------------------------------------------------------------------
/tests/utils_tests/test_mongolog.py:
--------------------------------------------------------------------------------
 1 | # tests.utils_tests.test_mongolog
 2 | # Simple tests for logging to MongoDB
 3 | #
 4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
 5 | # Created:  Thu Mar 03 11:53:46 2016 -0500
 6 | #
 7 | # Copyright (C) 2016 Bengfort.com
 8 | # For license information, see LICENSE.txt
 9 | #
10 | # ID: test_mongolog.py [df0c71b] benjamin@bengfort.com $
11 | 
12 | """
13 | Simple tests for logging to MongoDB
14 | """
15 | 
16 | ##########################################################################
17 | ## Imports
18 | ##########################################################################
19 | 
20 | import logging
21 | import unittest
22 | 
23 | from mongomock import MongoClient as MockMongoClient
24 | 
25 | try:
26 |     from unittest import mock
27 | except ImportError:
28 |     import mock
29 | 
30 | from baleen.utils import mongolog as ml
31 | from .test_logger import tmsgf
32 | 
33 | ##########################################################################
34 | ## Mongo Log Handler Tests
35 | ##########################################################################
36 | 
37 | class MongoLogHandlerTests(unittest.TestCase):
38 |     """
39 |     Simply exercises the methods of the logger.
40 |     """
41 | 
42 |     @mock.patch('baleen.utils.mongolog.MongoClient', MockMongoClient)
43 |     def test_logging_to_mongo(self):
44 |         """
45 |         Test the mongo log handler and logging to mongo
46 |         """
47 |         assert ml.MongoClient is MockMongoClient
48 | 
49 |         handler = ml.MongoHandler(level=logging.DEBUG)
50 |         self.assertIsInstance(handler.connection, MockMongoClient)
51 | 
52 |         # Ensure there is nothing in the database.
53 |         self.assertEqual(handler.collection.count(), 0)
54 | 
55 |         # Create the logging instance.
56 |         logger = logging.getLogger('test.mongo.logger.demo')
57 |         logger.setLevel(logging.INFO)
58 |         logger.addHandler(handler)
59 | 
60 |         # Log a message
61 |         logger.info(tmsgf("This is a test of the mongo logger"))
62 | 
63 |         # Ensure there is now a log message
64 |         self.assertEqual(handler.collection.count(), 1)
65 | 


--------------------------------------------------------------------------------
/tests/utils_tests/test_timez.py:
--------------------------------------------------------------------------------
  1 | # tests.utils_tests.test_timez
  2 | # Testing for the timez time helpers library.
  3 | #
  4 | # Author:   Benjamin Bengfort <benjamin@bengfort.com>
  5 | # Created:  Sun Feb 21 15:33:18 2016 -0500
  6 | #
  7 | # Copyright (C) 2016 Bengfort.com
  8 | # For license information, see LICENSE.txt
  9 | #
 10 | # ID: test_timez.py [df0c71b] benjamin@bengfort.com $
 11 | 
 12 | """
 13 | Testing for the timez time helpers library.
 14 | """
 15 | 
 16 | ##########################################################################
 17 | ## Imports
 18 | ##########################################################################
 19 | 
 20 | import time
 21 | import unittest
 22 | 
 23 | from datetime import datetime
 24 | from dateutil.tz import tzutc
 25 | from baleen.utils.timez import *
 26 | 
 27 | ##########################################################################
 28 | ## Helper Functions Test Cases
 29 | ##########################################################################
 30 | 
 31 | class TimezHelpersTests(unittest.TestCase):
 32 | 
 33 |     def setUp(self):
 34 |         self.localnow = datetime.now(tzlocal()).replace(microsecond=0)
 35 |         self.utcnow   = self.localnow.astimezone(tzutc())
 36 | 
 37 |     def tearDown(self):
 38 |         self.localnow = self.utcnow = None
 39 | 
 40 |     def test_non_naive_datetimes(self):
 41 |         """
 42 |         Assert that localnow and utcnow return non-naive datetimes
 43 |         """
 44 |         self.assertIsNotNone(localnow().tzinfo)
 45 |         self.assertIsNotNone(utcnow().tzinfo)
 46 | 
 47 |     def test_humanizedelta(self):
 48 |         """
 49 |         Test the humanize delta function to convert seconds
 50 |         """
 51 |         cases = (
 52 |             (12512334, "144 days 19 hours 38 minutes 54 seconds"),
 53 |             (34321, "9 hours 32 minutes 1 second"),
 54 |             (3428, "57 minutes 8 seconds"),
 55 |             (1, "1 second"),
 56 |             (0.21, "0 second"),
 57 |         )
 58 | 
 59 |         for seconds, expected in cases:
 60 |             self.assertEqual(humanizedelta(seconds=seconds), expected)
 61 | 
 62 |     def test_humanizedelta_milliseconds(self):
 63 |         """
 64 |         Test the humanize delta function to conver milliseconds
 65 |         """
 66 | 
 67 |         # Case with seconds already there
 68 |         self.assertEqual(humanizedelta(seconds=10, milliseconds=2000), '12 seconds')
 69 | 
 70 |         # Case without seconds present
 71 |         self.assertEqual(humanizedelta(milliseconds=456875), '7 minutes 36 seconds')
 72 | 
 73 |     def test_strptimez(self):
 74 |         """
 75 |         Test the parsing of timezone aware date strings
 76 |         """
 77 |         dtfmt = "%Y-%m-%dT%H:%M:%S%z"
 78 | 
 79 |         cases = (
 80 |             ('2012-12-27T12:53:12-0500', datetime(2012, 12, 27, 17, 53, 12, tzinfo=tzutc())),
 81 |             ('2012-12-27T12:53:12+0800', datetime(2012, 12, 27, 4, 53, 12, tzinfo=tzutc())),
 82 |         )
 83 | 
 84 |         for dtstr, dt in cases:
 85 |             self.assertEqual(dt, strptimez(dtstr, dtfmt))
 86 | 
 87 |         # Non-timezone case
 88 |         self.assertEqual(
 89 |             strptimez('2012-12-27T12:53:12', "%Y-%m-%dT%H:%M:%S"),
 90 |             datetime(2012, 12, 27, 12, 53, 12)
 91 |         )
 92 | 
 93 |     def test_strptimez_no_z(self):
 94 |         """
 95 |         Assert that strptimez works with no '%z'
 96 |         This should return a timezone naive datetime
 97 |         """
 98 |         dtfmt = "%a %b %d %H:%M:%S %Y"
 99 |         dtstr = self.localnow.strftime(dtfmt)
100 |         self.assertEqual(strptimez(dtstr, dtfmt), self.localnow.replace(tzinfo=None))
101 | 
102 | 
103 |     def test_strptimez_no_space(self):
104 |         """
105 |         Non-space delimited '%z' works
106 |         """
107 |         dtfmt = "%Y-%m-%dT%H:%M:%S%z"
108 |         dtstr = self.localnow.strftime(dtfmt)
109 |         self.assertEqual(strptimez(dtstr, dtfmt), self.utcnow)
110 | 
111 |     def test_begin_z(self):
112 |         """
113 |         Test fmt that begins with '%z'
114 |         """
115 |         dtfmt = "%z %H:%M:%S for %Y-%m-%d"
116 |         dtstr = self.localnow.strftime(dtfmt)
117 |         self.assertEqual(strptimez(dtstr, dtfmt), self.utcnow)
118 | 
119 |     def test_middle_z(self):
120 |         """
121 |         Test fmt that contains '%z'
122 |         """
123 |         dtfmt = "time is: %H:%M:%S %z on %Y-%m-%d "
124 |         dtstr = self.localnow.strftime(dtfmt)
125 |         self.assertEqual(strptimez(dtstr, dtfmt), self.utcnow)
126 | 
127 |     def test_timer(self):
128 |         """
129 |         Test the Timer context manager
130 |         """
131 |         with Timer() as t:
132 |             time.sleep(1)
133 | 
134 |         self.assertGreater(t.finished, t.started)
135 |         self.assertEqual(t.elapsed, t.finished-t.started)
136 |         self.assertEqual(str(t), '1 seconds')
137 | 


--------------------------------------------------------------------------------