├── .gitignore ├── .travis.yml ├── DESCRIPTION.txt ├── Dockerfile-app ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── baleen ├── __init__.py ├── config.py ├── console │ ├── __init__.py │ ├── app.py │ ├── commands │ │ ├── __init__.py │ │ ├── export.py │ │ ├── ingest.py │ │ ├── load.py │ │ ├── run.py │ │ ├── serve.py │ │ └── summary.py │ └── utils.py ├── exceptions.py ├── export.py ├── feed.py ├── ingest.py ├── models.py ├── opml.py ├── utils │ ├── __init__.py │ ├── decorators.py │ ├── logger.py │ ├── mongolog.py │ └── timez.py ├── version.py ├── wrangle.py └── www │ ├── __init__.py │ ├── app.py │ ├── static │ ├── css │ │ └── baleen.css │ └── favicon.png │ └── templates │ ├── base.html │ ├── components │ ├── footer.html │ └── navbar.html │ ├── index.html │ ├── logs.html │ └── status.html ├── bin ├── baleen ├── doctimes.py └── ldoc.py ├── conf ├── baleen-example.yaml ├── upstart │ └── baleen.conf └── uwsgi │ ├── baleen.ini │ └── baleen.nginx ├── docker-compose.yml ├── docs ├── about.md ├── components.md ├── images │ ├── component_architecture.png │ ├── service_architecture.png │ ├── spacewhale.jpg │ └── whaleship.jpg ├── index.md └── service.md ├── fixtures └── fields.json ├── mkdocs.yml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── fixtures ├── feedly.opml └── feedparser_result.pickle ├── test_export.py ├── test_feed.py ├── test_ingest.py ├── test_models.py ├── test_opml.py ├── test_wrangle.py └── utils_tests ├── __init__.py ├── test_decorators.py ├── test_logger.py ├── test_mongolog.py └── test_timez.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | venv/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | 
.installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | # Local configurations 58 | conf/baleen.yaml 59 | fixtures/corpus 60 | fixtures/feedly/* 61 | notebook 62 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | language: python 3 | 4 | python: 5 | - '2.7' 6 | 7 | before_install: 8 | - pip install nose 9 | - pip install coverage 10 | - pip install coveralls 11 | - pip install mock 12 | - pip install mongomock 13 | 14 | install: pip install -r requirements.txt 15 | 16 | script: make test 17 | 18 | after_script: coveralls 19 | 20 | notifications: 21 | email: 22 | recipients: 23 | - benjamin@bengfort.com 24 | 25 | on_success: change 26 | on_failure: always 27 | -------------------------------------------------------------------------------- /DESCRIPTION.txt: -------------------------------------------------------------------------------- 1 | Baleen is a tool for ingesting formal natural language data from the discourse of professional and amateur writers: e.g. bloggers and news outlets. Rather than performing web scraping, Baleen focuses on data ingestion through the use of RSS feeds. It performs as much raw data collection as it can, saving data into a Mongo document store. 
2 | 3 | For more, please see the full documentation at: http://baleen-ingest.readthedocs.org/en/latest/ 4 | -------------------------------------------------------------------------------- /Dockerfile-app: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | # things we like 3 | RUN apt-get update && apt-get install -y \ 4 | git \ 5 | vim 6 | # set up volume we will share our codebase with 7 | VOLUME /baleen 8 | WORKDIR /baleen 9 | # add baleen package to our python path 10 | RUN echo $(pwd) > /usr/local/lib/python2.7/site-packages/baleen.pth 11 | # install requirements 12 | COPY requirements.txt requirements.txt 13 | RUN pip install -r requirements.txt 14 | EXPOSE 5000 15 | # until we get the baleen daemon set, just sleep for now 16 | CMD /bin/sleep Inf 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Benjamin Bengfort 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.txt 3 | include *.yml 4 | include Makefile 5 | recursive-include docs *.md 6 | recursive-include docs *.jpg 7 | recursive-include tests *.py 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Shell to use with Make 2 | SHELL := /bin/bash 3 | 4 | # Set important Paths 5 | PROJECT := baleen 6 | LOCALPATH := $(CURDIR)/$(PROJECT) 7 | PYTHONPATH := $(LOCALPATH)/ 8 | PYTHON_BIN := $(VIRTUAL_ENV)/bin 9 | 10 | # Export targets not associated with files 11 | .PHONY: test coverage pip virtualenv clean publish 12 | 13 | # Clean build files 14 | clean: 15 | find . 
-name "*.pyc" -print0 | xargs -0 rm -rf 16 | -rm -rf htmlcov 17 | -rm -rf .coverage 18 | -rm -rf build 19 | -rm -rf dist 20 | -rm -rf $(PROJECT).egg-info 21 | 22 | # Targets for Coruscate testing 23 | test: 24 | $(PYTHON_BIN)/nosetests -v --with-coverage --cover-package=$(PROJECT) --cover-inclusive --cover-erase tests 25 | 26 | # Publish to gh-pages 27 | publish: 28 | git subtree push --prefix=deploy origin gh-pages 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Baleen 2 | **An automated ingestion service for blogs to construct a corpus for NLP research.** 3 | 4 | [![PyPI version][pypi_img]][pypi_href] 5 | [![Build Status][travis_img]][travis_href] 6 | [![Coverage Status][coveralls_img]][coverals_href] 7 | [![Code Health][health_img]][health_href] 8 | [![Documentation Status][rtfd_img]][rtfd_href] 9 | [![Stories in Ready][waffle_img]][waffle_href] 10 | 11 | [![Space Whale](docs/images/spacewhale.jpg)][spacewhale.jpg] 12 | 13 | ## Quick Start 14 | 15 | This quick start is intended to get you setup with Baleen in development mode (since the project is still under development). If you'd like to run Baleen in production, please see the [documentation][rtfd_href]. 16 | 17 | 1. Clone the repository 18 | 19 | ``` 20 | $ git clone git@github.com:bbengfort/baleen.git 21 | $ cd baleen 22 | ``` 23 | 24 | 2. Create a virtualenv and install the dependencies 25 | 26 | ``` 27 | $ virtualenv venv 28 | $ source venv/bin/activate 29 | $ pip install -r requirements.txt 30 | ``` 31 | 32 | 3. Add the `baleen` module to your `$PYTHONPATH` via the virtualenv. 33 | 34 | ``` 35 | $ echo $(pwd) > venv/lib/python2.7/site-packages/baleen.pth 36 | ``` 37 | 38 | 4. Create your local configuration file. Edit it with the connection details to your local MongoDB server. 
This is also a good time to check and make sure that you can create a database called Baleen on Mongo. 39 | 40 | ``` 41 | $ cp conf/baleen-example.yaml conf/baleen.yaml 42 | ``` 43 | 44 | ```yaml 45 | debug: true 46 | testing: false 47 | database: 48 | host: localhost 49 | port: 27017 50 | name: baleen 51 | server: 52 | host: 127.0.0.1 53 | port: 5000 54 | 55 | ``` 56 | 57 | 5. Run the tests to make sure everything is ok. 58 | 59 | ``` 60 | $ make test 61 | ``` 62 | 63 | 6. Make sure that the command line utility is ready to go: 64 | 65 | ``` 66 | $ bin/baleen --help 67 | ``` 68 | 69 | 7. Import the feeds from the `feedly.opml` file in the fixtures. 70 | 71 | ``` 72 | $ bin/baleen load tests/fixtures/feedly.opml 73 | Ingested 36 feeds from 1 OPML files 74 | ``` 75 | 76 | 8. Perform an ingestion of the feeds that were imported from the `feedly.opml` file. 77 | 78 | ``` 79 | $ bin/baleen ingest 80 | ``` 81 | 82 | Your Mongo database collections should be created as you add new documents to them, and at this point you're ready to develop! 83 | 84 | ## Docker Setup 85 | 86 | Included in this repository are files related to setting up the development environment using docker if you wish. 87 | 88 | 1. Install Docker Machine and Docker Compose e.g. with [Docker Toolbox](https://www.docker.com/products/docker-toolbox). 89 | 90 | 2. Clone the repository 91 | 92 | ``` 93 | $ git clone git@github.com:bbengfort/baleen.git 94 | $ cd baleen 95 | ``` 96 | 97 | 3. Create your local configuration file. Edit it with your configuration details; your MongoDB server will be at host `mongo`. 98 | 99 | ``` 100 | $ cp conf/baleen-example.yaml conf/baleen.yaml 101 | ``` 102 | 103 | ```yaml 104 | debug: true 105 | testing: false 106 | database: 107 | host: mongo 108 | port: 27017 109 | name: baleen 110 | server: 111 | host: 127.0.0.1 112 | port: 5000 113 | ``` 114 | 115 | 4. Exec interactively into the `app` container to interact with baleen as described in the above setup directions 5-8. 
116 | 117 | ``` 118 | docker exec -it baleen_app_1 /bin/bash 119 | ``` 120 | 121 | ## Web Admin 122 | 123 | There is a simple Flask application that ships with Baleen that provides information about the current status of the Baleen ingestion. This app can be run locally in development with the following command: 124 | 125 | $ bin/baleen serve 126 | 127 | You can then reach the website at [http://127.0.0.1:5000/](http://127.0.0.1:5000/). Note that the host and port can be configured in the YAML configuration file or as command line arguments to the serve command. 128 | 129 | ### Deployment 130 | 131 | The web application is deployed in production as an Nginx + uWSGI + Flask application that is managed by upstart. 132 | 133 | ## About 134 | 135 | Baleen is a tool for ingesting _formal_ natural language data from the discourse of professional and amateur writers: e.g. bloggers and news outlets. Rather than performing web scraping, Baleen focuses on data ingestion through the use of RSS feeds. It performs as much raw data collection as it can, saving data into a Mongo document store. 
136 | 137 | ### Throughput 138 | 139 | [![Throughput Graph](https://graphs.waffle.io/bbengfort/baleen/throughput.svg)](https://waffle.io/bbengfort/baleen/metrics) 140 | 141 | ### Attribution 142 | 143 | The image used in this README, ["Space Whale"][spacewhale.jpg] by [hbitik](http://hbitik.deviantart.com/) is licensed under [CC BY-NC-ND 3.0](http://creativecommons.org/licenses/by-nc-nd/3.0/) 144 | 145 | 146 | 147 | [pypi_img]: https://badge.fury.io/py/baleen.svg 148 | [pypi_href]: https://badge.fury.io/py/baleen 149 | [travis_img]: https://travis-ci.org/bbengfort/baleen.svg?branch=master 150 | [travis_href]: https://travis-ci.org/bbengfort/baleen/ 151 | [coveralls_img]: https://coveralls.io/repos/github/bbengfort/baleen/badge.svg?branch=master 152 | [coverals_href]: https://coveralls.io/github/bbengfort/baleen?branch=master 153 | [health_img]: https://landscape.io/github/bbengfort/baleen/master/landscape.svg?style=flat 154 | [health_href]: https://landscape.io/github/bbengfort/baleen/master 155 | [waffle_img]: https://badge.waffle.io/bbengfort/baleen.png?label=ready&title=Ready 156 | [waffle_href]: https://waffle.io/bbengfort/baleen 157 | [rtfd_img]: https://readthedocs.org/projects/baleen-ingest/badge/?version=latest 158 | [rtfd_href]: http://baleen-ingest.readthedocs.org/ 159 | [spacewhale.jpg]: http://fav.me/d4736q3 160 | -------------------------------------------------------------------------------- /baleen/__init__.py: -------------------------------------------------------------------------------- 1 | # baleen 2 | # An automated ingestion service for blogs to construct a corpus. 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Fri Sep 19 10:55:58 2014 -0400 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [5ad94d7] benjamin@bengfort.com $ 11 | 12 | """ 13 | An automated ingestion service for blogs to construct a corpus for NLP 14 | research. 
class MongoConfiguration(confire.Configuration):
    """
    Configuration for the Mongo database.

    Defaults describe a MongoDB server on the local machine; override them
    under the ``database`` key of the YAML configuration file.
    """

    host = "localhost"   # Hostname of the MongoDB server
    port = 27017         # Port of the MongoDB server
    name = "baleen"      # Name of the database to use


class ServerConfiguration(confire.Configuration):
    """
    Configuration for the web server to run an admin UI.

    Override these defaults under the ``server`` key of the YAML
    configuration file.
    """

    host = "127.0.0.1"   # Address the web application binds to
    port = 5000          # Port the web application listens on


class BaleenConfiguration(confire.Configuration):
    """
    Meaningful defaults and required configurations.

    debug:    the app will print or log debug statements
    database: connection information for mongo
    server:   host and port for the admin web application
    """

    # Files searched for configuration values.
    # NOTE(review): presumably later paths take precedence over earlier
    # ones -- confirm against the confire documentation.
    CONF_PATHS = [
        "/etc/baleen.yaml",                    # System configuration
        os.path.expanduser("~/.baleen.yaml"),  # User specific config
        os.path.abspath("conf/baleen.yaml"),   # Local configuration
    ]

    debug = True
    database = MongoConfiguration()
    server = ServerConfiguration()
    logfile = 'baleen.log'   # Location to write log
    loglevel = 'DEBUG'       # Log messages to record
    fetch_html = True        # Actually fetch HTML link
    timeout = 180            # Timeout for fetching posts/feeds


## Load settings immediately for import
settings = BaleenConfiguration.load()

if __name__ == '__main__':
    # Parenthesised so the module parses on Python 3 as well; with a single
    # argument this prints exactly what the Python 2 statement form printed.
    print(settings)
DESCRIPTION = "Management and administration commands for Baleen"
EPILOG = "If there are any bugs or concerns, submit an issue on Github"
COMMANDS = (
    IngestCommand,
    ExportCommand,
    LoadOPMLCommand,
    SummaryCommand,
    RunCommand,
    ServeCommand,
)


##########################################################################
## The Baleen CLI Utility
##########################################################################

class BaleenUtility(ConsoleProgram):
    """
    The Baleen console program: carries the coloured description, epilog
    and version strings and hosts the registered commands.
    """

    description = color.format(DESCRIPTION, color.CYAN)
    epilog = color.format(EPILOG, color.MAGENTA)
    version = color.format("baleen v{}", color.CYAN, get_version())

    @classmethod
    def load(cls, commands=COMMANDS):
        """
        Build a utility instance and register each command class from
        `commands` on it, in the order given. Returns the instance.
        """
        program = cls()
        for command_class in commands:
            program.register(command_class)
        return program
class ExportCommand(Command):
    """
    Console command that exports the corpus from the database to a
    directory on disk.
    """

    name = 'export'
    help = 'export the raw HTML corpus for doing NLP'
    args = {
        '--list-categories': {
            'action': 'store_true',
            'default': False,
            'help': 'show the available categories and exit',
        },
        ('-C', '--categories'): {
            'type': csv(str),
            'default': None,
            'metavar': 'csv',
            'help': 'specify a list of categories to export',
        },
        ('-S', '--scheme'): {
            'type': str,
            'default': 'json',
            'choices': SCHEMES,
            'help': 'specify the output format for the corpus',
        },
        'location': {
            'nargs': 1,
            'type': str,
            'metavar': 'corpus directory',
            'help': 'location to write the corpus out to'
        },
    }

    def handle(self, args):
        """
        Run the export and return a summary message, or return the sorted
        category names (one per line) when --list-categories was given.
        """
        db.connect()

        # Resolve "~" first, then any environment variables, in the path.
        root = os.path.expandvars(os.path.expanduser(args.location[0]))

        exporter = MongoExporter(
            root, categories=args.categories, scheme=args.scheme
        )

        # Listing categories short-circuits before anything is written.
        if args.list_categories:
            return "\n".join(sorted(exporter.categories))

        with Timer() as timer:
            exporter.export()

        total_posts = sum(exporter.counts.values())
        num_categories = len(exporter.categories)

        template = (
            "Baleen corpus export complete in {}\n"
            "Exported {} posts in {} categories\n"
            "More information is in README in {}"
        )
        return template.format(timer, total_posts, num_categories, root)
class IngestCommand(Command):
    """
    Console command that runs a single ingestion of the RSS feeds.
    """

    name = 'ingest'
    help = 'ingests the RSS feeds to MongoDB'
    args = {
        '--opml': {
            'type': str,
            'default': None,
            'help': 'Ingest directly from an OPML file',
        },
        'feeds': {
            'type': str,
            'nargs': "*",
            'default': None,
            'metavar': 'URL',
            'help': 'Specify a list of feeds as urls'
        }
    }

    def handle(self, args):
        """
        Ingest the feeds and return a one-line report of the counts.

        Raises ConsoleError when --opml or explicit feed URLs are given:
        both alternative ingestors are constructed and then rejected as
        untested, so only the Mongo-backed ingestion actually runs.
        """
        runner = MongoIngestor()

        if args.opml:
            runner = OPMLIngestor(args.opml)
            raise ConsoleError("opml ingestion is an untested utility!")

        if args.feeds:
            runner = Ingestor(args.feeds)
            raise ConsoleError("feed ingestion is an untested utility!")

        db.connect()
        runner.ingest()

        report = (
            "Processed {feeds} feeds ({timer}): "
            "{posts} posts with {errors} errors"
        )
        return report.format(timer=runner.timer, **runner.counts)
class LoadOPMLCommand(Command):
    """
    Console command that imports one or more OPML files into the database.
    """

    name = 'load'
    help = 'loads an OPML file from disk into the database'
    args = {
        'opml': {
            'nargs': "+",
            'type': str,
            'help': 'OPML file(s) to import to the database'
        }
    }

    def handle(self, args):
        """
        Load every OPML path given on the command line and return a message
        with the summed `load_opml` results and the number of files.
        """
        db.connect()

        # Accumulate the value load_opml returns for each file.
        total = 0
        for path in args.opml:
            total += load_opml(path)

        return "Ingested {} feeds from {} OPML files".format(total, len(args.opml))
class RunCommand(Command):
    """
    Console command that keeps running and triggers an ingestion job on an
    hourly schedule.
    """

    name = 'run'
    help = 'runs the ingest command every hour'
    args = {}

    def ingest(self, args):
        """
        The scheduled job: connect to the database and run one ingestion.
        `args` is accepted but not used.
        """
        db.connect()
        MongoIngestor().ingest()

    def handle(self, args):
        """
        Register the hourly job and poll the scheduler once per second.

        Returns an empty string on KeyboardInterrupt/SystemExit, or the
        text of any other exception, which stops the loop after being
        logged as critical.
        """
        logger = IngestLogger()
        banner = "Starting Baleen v{} ingestion service every hour.".format(
            baleen.get_version()
        )
        logger.info(banner)

        job = partial(self.ingest, args)
        schedule.every().hour.do(job)

        while True:
            try:
                schedule.run_pending()
                time.sleep(1)
            except (KeyboardInterrupt, SystemExit):
                logger.info("Graceful shutdown of Baleen ingestion service.")
                return ""
            except Exception as err:
                reason = str(err)
                logger.critical(reason)
                return reason
class ServeCommand(Command):
    """
    Console command that runs the Flask administration app locally.
    """

    name = 'serve'
    help = 'serve the Flask administration application'
    args = {
        '--host': {
            'metavar': 'ADDR',
            'default': settings.server.host,
            'help': 'set the host to run the app on'
        },
        '--port': {
            'metavar': 'PORT',
            'type': int,
            'default': settings.server.port,
            'help': 'set the port to run the app on'
        },
        '--debug': {
            'action': 'store_true',
            'help': 'force debug mode in Flask'
        }
    }

    def handle(self, args):
        """
        Runs the Baleen Flask application and, once `app.run` returns,
        reports that the web application has stopped.
        """
        # Debug is on when either the flag or the configuration asks for it.
        app.run(
            host=args.host,
            port=args.port,
            debug=(args.debug or settings.debug),
        )
        return " * Web application stopped"
class SummaryCommand(Command):
    """
    Console command that prints the state of Baleen from the database:
    totals plus the most recent job, feed and post.
    """

    name = 'info'
    help = 'print info about Baleen from the database'
    args = {
        ('-c', '--config'): {
            'action': 'store_true',
            'default': False,
            'help': 'Also print the configuration',
        }
    }

    def handle(self, args):
        """
        Build the status report and return it as UTF-8 encoded bytes.

        Each "Latest ..." section tolerates an empty collection, so the
        command also works against a database that has never been used.
        """
        # Setup output and connect to database.
        output = []
        db.connect()

        # Printout configuration details as necessary.
        if args.config:
            output.append(u"Configuration:")
            output.append(unicode(settings))
            output.append(u"")

        output.append(u"Baleen v{} Status:".format(baleen.get_version()))
        output.append(
            u"{} Feeds and {} Posts after {} Jobs".format(
                db.Feed.objects.count(),
                db.Post.objects.count(),
                db.Job.objects.count(),
            )
        )

        # first() yields None when the collection is empty; the helpers
        # below handle that case instead of raising AttributeError.
        output.extend(self._job_lines(db.Job.objects.order_by('-started').first()))
        output.extend(self._feed_lines(db.Feed.objects.order_by('-updated').first()))
        output.extend(self._post_lines(db.Post.objects.order_by('-id').first()))

        return u"\n".join(output).encode('utf-8', errors='replace')

    def _job_lines(self, job):
        # Report lines for the most recently started job (or its absence).
        lines = [u"", u"Latest Job: "]
        if job is None:
            lines.append(u" No jobs have been recorded")
            return lines

        lines.extend([
            u" Type: {} v{}".format(job.name, job.version),
            u" Job ID: {}".format(job.jobid),
            u" Started: {}".format(job.started.strftime(HUMAN_DATETIME))
        ])

        if not job.finished:
            lines.append(u" Currently Running")
        elif job.failed:
            lines.append(u" Failed: {}".format(job.reason))
        else:
            lines.append(u" Finished: {}".format(job.finished.strftime(HUMAN_DATETIME)))
            lines.append(u" Counts:")
            lines.append(u" " + u"\n ".join([u"{}: {}".format(*item) for item in job.counts.items()]))
            lines.append(u" Errors:")
            lines.append(u" " + u"\n ".join([u"{}: {}".format(*item) for item in job.errors.items()]))

        return lines

    def _feed_lines(self, feed):
        # Report lines for the most recently updated feed (or its absence).
        lines = [u"", u"Latest Feed: "]
        if feed is None:
            lines.append(u" No feeds have been loaded")
            return lines

        # NOTE: a per-feed post count is deliberately omitted here; the
        # original author flagged feed.count_posts() as very slow.
        lines.extend([
            u" Title: \"{}\"".format(feed.title),
            u" eTag: \"{}\"".format(feed.etag),
            u" Modified: {}".format(feed.modified),
            u" Updated: {}".format(feed.updated.strftime(HUMAN_DATETIME)),
        ])
        return lines

    def _post_lines(self, post):
        # Report lines for the newest post by id (or its absence).
        lines = [u"", u"Latest Post: "]
        if post is None:
            lines.append(u" No posts have been fetched")
            return lines

        lines.extend([
            u" Title: \"{}\"".format(post.title),
            u" Feed: \"{}\"".format(post.feed.title),
            u" Fetched: {}".format(post.created.strftime(HUMAN_DATETIME)),
        ])
        return lines
def csv(ptype=int):
    """
    Argparse type for comma separated values. Also parses the type, e.g. int.

    Returns a parser function that splits its argument on commas, strips
    whitespace around every part, and converts each part with ``ptype``.
    The parser always returns a list and raises an
    ``argparse.ArgumentTypeError`` if the value cannot be converted.
    """
    def parser(s):
        try:
            # Build the list eagerly so that conversion errors are raised
            # inside this try block (a lazy map would raise them later, at
            # the point of iteration, on Python 3).
            return [ptype(part.strip()) for part in s.split(",")]
        except Exception:
            # Not every callable carries a __name__ (e.g. functools.partial).
            type_name = getattr(ptype, '__name__', repr(ptype))
            raise argparse.ArgumentTypeError(
                "Could not parse CSV value to type {}: {!r}".format(type_name, s)
            )

    return parser
class BaleenError(Exception):
    """
    Base class for (hopefully) every error raised by Baleen.
    """


class FeedTypeError(BaleenError):
    """
    The type of a feed could not be detected for synchronization.
    """


class IngestionError(BaleenError):
    """
    A problem that is specific to the ingestion process.
    """


class SynchronizationError(IngestionError):
    """
    A problem that occurred while synchronizing a feed.
    """


class WranglingError(IngestionError):
    """
    A problem that occurred while wrangling a post.
    """


class FetchError(WranglingError):
    """
    The web page for a post could not be fetched.
    """


class ExportError(BaleenError):
    """
    A problem that occurred while exporting the corpus.
    """


class TimeoutError(Exception):
    """
    An operation did not complete in time.
    """
##########################################################################
## Module Constants
##########################################################################

DTFMT = "%b %d, %Y at %H:%M"
SCHEMES = ('json', 'html')
State = Enum('State', 'Init, Started, Finished')

##########################################################################
## Exporter
##########################################################################

class MongoExporter(object):
    """
    The exporter attempts to read the MongoDB as efficiently as possible,
    writing posts to disk in either HTML or JSON format.

    The export moves through three states (Init, Started, Finished); the
    ``posts`` and ``readme`` methods check the state before doing any work.
    """

    def __init__(self, root, categories=None, scheme='json'):
        """
        Creates an exporter that writes to the ``root`` directory.

        ``categories`` limits the export to the given feed categories (all
        categories in the database when None) and ``scheme`` selects the
        output format. Raises ExportError for an unknown scheme.
        """
        self.root = root                  # Location on disk to write to
        self.scheme = scheme.lower()      # Output format of the data
        self.state = State.Init           # Current state of the export
        self.counts = Counter()           # Counts of posts per category
        self.categories = categories      # Specific categories to export

        if self.scheme not in SCHEMES:
            raise ExportError(
                "Unknown export scheme: '{}' - use one of {}.".format(
                    self.scheme, ", ".join(SCHEMES)
                )
            )

    @property
    def categories(self):
        """
        The categories to export; loaded from the database on first access
        if none were specified.
        """
        if self._categories is None:
            self._categories = Feed.objects.distinct('category')
        return self._categories

    @categories.setter
    def categories(self, value):
        self._categories = value

    @staticmethod
    def _ensure_directory(path):
        """
        Creates the directory at path (with any missing parents) if nothing
        exists there, and raises ExportError if the path is not a directory.
        """
        if not os.path.exists(path):
            # makedirs rather than mkdir so that a nested export root such
            # as 'fixtures/corpus' works when the parent is missing too.
            os.makedirs(path)

        if not os.path.isdir(path):
            raise ExportError(
                "'{}' is not a directory!".format(path)
            )

        return path

    def feeds(self, categories=None):
        """
        Returns a list of feeds for the specified categories.
        During export, this list is used to construct a feed-category mapping
        that is used to perform checking of sequential reads of Posts.
        """
        if isinstance(categories, basestring):
            categories = [categories]
        elif categories is None:
            categories = self.categories
        else:
            categories = list(categories)

        return Feed.objects(category__in=categories)

    def posts(self, categories=None):
        """
        This method first creates a mapping of feeds to categories, then
        iterates through the Posts collection, finding only posts with those
        given feeds (and not dereferencing the related object). This will
        speed up the post fetch process and give us more information, quickly.

        The generator therefore yields post, category tuples to provide for
        the single pass across the posts.

        This method also counts the number of posts per category.

        This method raises an exception if not in the correct state.
        """
        if self.state != State.Started:
            raise ExportError((
                "Calling the posts method when not in the started state "
                "could cause double counting or multiple database reads."
            ))

        # Create a mapping of feed id to category
        feeds = {
            feed.id: feed.category
            for feed in self.feeds(categories)
        }

        # Iterate through all posts that have the given feed ids without
        # dereferencing the related object. Yield (post, category) tuples.
        # This method also counts the number of posts per category.
        query = Post.objects(feed__in=list(feeds))
        for post in query.no_dereference().no_cache():
            category = feeds[post.feed.id]
            self.counts[category] += 1

            yield post, category

    def readme(self, path):
        """
        Writes README information about the state of the export to disk at
        the specified path. The writing requires the export to be finished,
        otherwise, the method will raise an exception.

        This method raises an exception if not in the correct state.
        """
        if self.state != State.Finished:
            raise ExportError((
                "Calling the readme method when not in the finished state "
                "could lead to writing misleading or incorrect meta data."
            ))

        # Create the output lines with the header information.
        output = [
            "Baleen RSS Export",
            "=================", "",
            "Exported on: {}".format(datetime.now().strftime(DTFMT)),
            "{} feeds containing {} posts in {} categories.".format(
                self.feeds().count(),
                sum(self.counts.values()),
                len(self.categories),
            ), "",
            "Category Counts",
            "---------------", "",
        ]

        # Append category counts list to the README
        for item in sorted(self.counts.items(), key=itemgetter(0)):
            output.append("- {}: {}".format(*item))

        # Add a newline at the end of the README
        output.append("")

        # Write out the output to the file as utf-8.
        with codecs.open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(output))

    def feedinfo(self, path):
        """
        Writes information about the feeds to disk for performing lookups on
        the feeds themselves from the object id in each individual post.
        """
        fields = ('id', 'title', 'link', 'category', 'active')
        feeds = Feed.objects(category__in=self.categories).only(*fields)
        with open(path, 'w') as f:
            f.write(feeds.to_json(indent=2))

    def export(self):
        """
        Runs the export of the posts to disk.

        Creates the root directory and one directory per category, writes
        every post to ``<root>/<category>/<post id>.<scheme>``, and finally
        writes the README and feeds.json files into the root directory.
        Raises ExportError if a required path exists but is not a directory.
        """
        # Reset the counts object and mark export as started.
        self.counts = Counter()
        self.state = State.Started

        # Make the directory to export if it doesn't exist.
        self._ensure_directory(self.root)

        # Create the directories for each category on disk and map paths.
        catdir = {
            category: self._ensure_directory(
                os.path.join(self.root, category)
            )
            for category in self.categories
        }

        # Select the serializer once rather than for every post.
        serialize = {
            'json': lambda post: post.to_json(indent=2),
            'html': lambda post: post.htmlize(),
        }[self.scheme]

        # Iterate through all posts, writing them to disk correctly.
        # Right now we will simply write them based on their object id.
        for post, category in self.posts():
            # The category path already contains the root; joining the root
            # a second time breaks exports to a relative root directory.
            path = os.path.join(
                catdir[category], "{}.{}".format(post.id, self.scheme)
            )

            with codecs.open(path, 'w', encoding='utf-8') as f:
                f.write(serialize(post))

        # Mark the export as finished and write the README to the corpus.
        self.state = State.Finished
        self.readme(os.path.join(self.root, "README"))
        self.feedinfo(os.path.join(self.root, "feeds.json"))


if __name__ == '__main__':
    import baleen.models as db

    db.connect()
    exporter = MongoExporter('fixtures/corpus')
    exporter.export()
##########################################################################
## Module Constants
##########################################################################

FEEDPARSER_IGNORABLE_FIELDS = {
    'updated', 'updated_parsed', 'id',
    'published', 'published_parsed', 'category',
}


##########################################################################
## Feed Synchronization
##########################################################################

class FeedSync(object):
    """
    Wraps a feed together with the feedparser library. The wrapped feed
    may be any one of:

        - a string holding the url of the RSS feed
        - a dictionary with an xmlUrl key (from OPML)
        - a Feed object loaded from MongoDB.

    The synchronizer works out which of these it was given and accesses
    the url accordingly.
    """

    URL = "FEED_URL"
    DICT = "FEED_DICT"
    MODEL = "FEED_MODEL"

    @classmethod
    def factory(klass, feeds):
        """
        Generates one feed synchronizer per item in feeds.
        """
        for item in feeds:
            yield klass(item)

    def __init__(self, feed):
        """
        Stores the feed: a url string, a dictionary with an `xmlUrl` key,
        or a Feed model.
        """
        self.feed = feed

    @memoized
    def type(self):
        """
        Classifies the wrapped feed as URL, MODEL or DICT; raises a
        FeedTypeError if it is none of those or a dictionary without the
        'xmlUrl' key.
        """
        feed = self.feed

        if isinstance(feed, basestring):
            return self.URL

        if isinstance(feed, Feed):
            return self.MODEL

        if not isinstance(feed, dict):
            raise FeedTypeError(
                "Could not determine feed type from '{}'".format(type(feed))
            )

        if 'xmlUrl' not in feed:
            raise FeedTypeError(
                "Dictionary object does not contain 'xmlUrl' key!"
            )

        return self.DICT

    @memoized
    def url(self):
        """
        Looks up the url of the feed according to the feed type.
        """
        kind = self.type

        if kind == self.URL:
            return self.feed

        if kind == self.DICT:
            return self.feed.get('xmlUrl', None)

        if kind == self.MODEL:
            return self.feed.link

        # Not reachable with the values returned by type; kept so that an
        # unknown kind still surfaces as a KeyError.
        raise KeyError(kind)

    @timeout(settings.timeout)
    def parse(self):
        """
        Calls feedparser.parse on the feed url. For a model, the stored etag
        (preferred) or modified value is passed along to prevent duplicating
        the download.

        NOTE: Calling this function will NOT update the feed use sync instead!
        NOTE: Exceptions in this function will not be handled by Baleen!
        """
        # Only models contain the etag/modified saved information.
        conditional = {}
        if self.type == self.MODEL:
            if self.feed.etag:
                # The etag wins even if a modified value is also stored.
                conditional['etag'] = self.feed.etag
            elif self.feed.modified:
                conditional['modified'] = self.feed.modified

        return feedparser.parse(self.url, **conditional)

    @reraise(klass=SynchronizationError)
    def sync(self, save=True):
        """
        Parses the feed and, when the feed is a model, copies the
        synchronization state (e.g. last modified, etag, etc.) and the feed
        level fields of the result onto the model.

        Note: If the feed isn't a model, it just does the same as parse.

        If save is True (default) will save the Feed back to MongoDB.
        """
        result = self.parse()

        # Nothing to synchronize unless the feed is a model.
        if self.type != self.MODEL:
            return result

        feed = self.feed

        # Record when the feed was fetched.
        feed.fetched = localnow()

        # Copy the non-empty top level properties from the result.
        for key in ('etag', 'modified', 'version'):
            if key in result and getattr(result, key):
                setattr(feed, key, getattr(result, key))

        # The href of the result becomes the link of the feed.
        if 'href' in result and result.href:
            feed.link = result.href

        # Copy the feed level items from the result.
        for key, val in result.feed.items():
            if key in FEEDPARSER_IGNORABLE_FIELDS:
                # Generated or protected fields are not copied.
                continue

            if key == 'link':
                feed.urls['htmlUrl'] = val
            elif key == 'links':
                for idx, link in enumerate(val):
                    if 'rel' in link:
                        name = link['rel'] + str(idx)
                    else:
                        name = "link{}".format(idx)
                    feed.urls[name] = link['href']
            else:
                setattr(feed, key, val)

        if save:
            feed.save()

        return result

    def entries(self, save=True):
        """
        Convenience wrapper that calls sync and returns the entries of the
        result. This is the usual way to use the feed sync object. Note
        that the entries are raw dicts and not Posts.
        """
        return self.sync(save=save).entries
##########################################################################
## Helper Functions
##########################################################################

def stype(obj):
    """
    Returns the string of the type. Used to count exception types.

    For a BaleenError that carries an ``original`` attribute, the type name
    of the original is appended in parentheses.
    """
    name = type(obj).__name__
    if isinstance(obj, BaleenError) and hasattr(obj, "original"):
        return "{} ({})".format(name, type(obj.original).__name__)
    return name


##########################################################################
## Base Ingestion Class
##########################################################################

class Ingestor(LoggingMixin):
    """
    Base class for the ingestors.

    Ingestors manage the synchronization of feeds, wrangling of posts, and
    fetching of web pages to store to the Mongo database. Ingestors can
    either get feeds from a list of strings, an OPML file or a Mongo query.

    Ingestors also perform logging and exception handling.
    """

    def __init__(self, feeds=None, **options):
        self.timer = None        # Processing timer
        self.jobid = None        # Unique job id for every run
        self.options = options   # Any other options passed in
        self._feeds = feeds      # Allows pass in feed collection
        self.errors = Counter()  # Count the number of error types

    @property
    def name(self):
        """
        The class name of the ingestor, used in log messages.
        """
        return self.__class__.__name__

    @memoized
    def counts(self):
        """
        Keep track of counts and ensure zero keys exist.
        """
        counts = Counter()
        # 'fetch_error' is incremented by process_post, so it is seeded
        # here along with the other counters.
        for key in ('feeds', 'posts', 'errors', 'feed_error', 'fetch_error'):
            counts[key] = 0
        return counts

    def feeds(self):
        """
        This is the primary entry point for subclasses, they must specify
        how to get access to a collection of feeds to synchronize.

        Raises an IngestionError if no feeds were passed to the ingestor.
        """
        if self._feeds is not None:
            return self._feeds

        raise IngestionError(
            "No feeds specified for {} ingestion!".format(self.name)
        )

    def started(self):
        """
        Run when the ingestor is started and used for logging. Subclasses can
        use it as a hook to perform extra work right before kick off.
        """
        message = "{} job {} started".format(self.name, self.jobid)
        self.logger.info(message)

    def failed(self, exception):
        """
        Executed when a complete ingestion run has failed (very bad). Used
        to log the exception or clean up before Baleen crashes!
        """
        message = "{} job {} failed!".format(self.name, self.jobid)
        self.logger.error("Ingestion Error: {}".format(exception))
        self.logger.critical(message)

    def finished(self):
        """
        Run when the ingestor has finished and used for logging. Subclasses
        can use it as a hook to perform any completion work.
        """
        # Notify the results
        results = (
            "Processed {feeds} feeds ({timer}) "
            "{posts} posts with {errors} errors"
        ).format(
            timer=self.timer, **self.counts
        )
        self.logger.info(results)

        # Notify job finished
        message = "{} job {} finished".format(self.name, self.jobid)
        self.logger.info(message)

    def process(self):
        """
        Runs the ingestion process by iterating over the feeds, synchronizing
        and then wrangling posts into the database as well as fetching pages.

        A SynchronizationError on one feed is counted and logged, then the
        run continues with the next feed.
        """
        for idx, fsync in enumerate(FeedSync.factory(self.feeds())):
            try:
                self.process_feed(fsync)
                self.counts['feeds'] += 1
            except SynchronizationError as e:
                self.counts['feed_error'] += 1
                self.errors[stype(e)] += 1
                self.logger.error(
                    u"Error on Feed {} ({}): {}".format(
                        idx+1, fsync.feed, unicode(e)
                    )
                )

    def process_feed(self, fsync):
        """
        Synchronizes a single feed and wrangles each of its entries. A
        WranglingError on one entry is counted and logged, then processing
        continues with the next entry.
        """
        factory = PostWrangler.factory(fsync.entries(), fsync.feed)
        for idx, post in enumerate(factory):
            try:
                self.process_post(post)
                self.counts["posts"] += 1
            except WranglingError as e:
                self.counts["errors"] += 1
                self.errors[stype(e)] += 1
                self.logger.error(
                    u"Post Error for feed {} on entry {}: {}".format(
                        fsync.feed, idx, unicode(e)
                    )
                )

    def process_post(self, post):
        """
        Wrangles a post from a single feed and, if the fetch_html setting
        is on, fetches its web page. A FetchError is counted and logged
        but does not fail the post.
        """
        post.wrangle()
        if settings.fetch_html:
            try:
                post.fetch()
            except FetchError as e:
                self.counts["fetch_error"] += 1
                self.errors[stype(e)] += 1
                self.logger.error(
                    u"Fetch Error for post \"{}\" ({}): {}".format(
                        post.post.title, post.post.url, unicode(e)
                    )
                )

    def ingest(self):
        """
        Subclasses do not typically override the ingest method. Instead they
        will override the process hooks for start, failed, and finish, or the
        process method directly.
        """
        # Set a unique job id for every time run is called.
        # The job id is based on the hostname and a time sequence.
        self.jobid = uuid.uuid1()

        # Call the started hook for logging and notification.
        self.started()

        # Time how long it takes to perform the processing
        with Timer() as self.timer:
            try:
                self.process()
            except Exception as e:
                # If something goes wrong, call the failed hook, then raise.
                self.failed(e)
                raise

        # Call the finished hook for logging and notification.
        self.finished()


##########################################################################
## Mongo Ingestion Class
##########################################################################

class MongoIngestor(Ingestor):
    """
    Ingests feeds that are stored in the database.
    This type of ingestor also tracks information into the database.
    """

    # Mirrors the max_length of the Job.reason field; a longer failure
    # message would be rejected when the job record is saved.
    REASON_MAX_LENGTH = 512

    def feeds(self):
        """
        Returns an iterator of all active feeds from the database
        """
        for feed in db.Feed.objects(active=True):
            yield feed

    def started(self):
        """
        Save a record about the job start to the database.
        """
        super(MongoIngestor, self).started()
        self.job = db.Job(jobid=self.jobid, name=self.name)
        self.job.save()

    def failed(self, exception):
        """
        Save information about the failure to the database.
        """
        super(MongoIngestor, self).failed(exception)
        self.job.failed = True
        # Truncate so that saving the failure record cannot itself fail
        # validation and hide the exception that caused the failure.
        self.job.reason = unicode(exception)[:self.REASON_MAX_LENGTH]
        self.job.finished = datetime.now()
        self.job.save()

    def finished(self):
        """
        Update the job record in the database.
        """
        super(MongoIngestor, self).finished()
        self.job.reason = u"OK"
        self.job.finished = datetime.now()
        self.job.counts = self.counts
        self.job.errors = self.errors
        self.job.totals = {
            "feeds": db.Feed.objects.count(),
            "posts": db.Post.objects.count(),
            "jobs": db.Job.objects.count(),
        }
        self.job.save()

##########################################################################
## OPML Ingestion Class
##########################################################################

class OPMLIngestor(Ingestor):
    """
    Ingests feeds from an OPML file.
    """

    def __init__(self, path, **options):
        self.opml = OPML(path)
        super(OPMLIngestor, self).__init__(**options)

    def feeds(self):
        """
        Returns an iterator over the feeds listed in the OPML file
        """
        for feed in self.opml:
            yield feed
##########################################################################
## Module Constants
##########################################################################

FEEDTYPES = (
    'atom',
    'atom01',
    'atom02',
    'atom03',
    'atom10',
    'cdf',
    'rss',
    'rss090',
    'rss091n',
    'rss092',
    'rss093',
    'rss094',
    'rss10',
    'rss20',
)

##########################################################################
## Helper Functions
##########################################################################

def connect(**kwargs):
    """
    Wrapper for mongoengine connect - connects with configuration details.

    The name, host and port default to the database settings; any other
    keyword arguments are passed through to mongoengine.
    """
    name = kwargs.pop('name', settings.database.name)
    host = kwargs.pop('host', settings.database.host)
    port = kwargs.pop('port', settings.database.port)

    return me.connect(name, host=host, port=port, **kwargs)

##########################################################################
## Models
##########################################################################

class Feed(me.DynamicDocument):
    """
    An RSS/Atom feed together with its synchronization state.
    """

    version   = me.StringField(choices=FEEDTYPES)
    etag      = me.StringField()
    modified  = me.StringField()
    title     = me.StringField(max_length=256)
    link      = me.URLField(required=True, unique=True)
    urls      = me.DictField()
    category  = me.StringField(required=True)
    active    = me.BooleanField(default=True)
    fetched   = me.DateTimeField(default=None)
    created   = me.DateTimeField(default=datetime.now, required=True)
    updated   = me.DateTimeField(default=datetime.now, required=True)

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        """
        Signal handler that refreshes the updated timestamp.
        """
        document.updated = datetime.now()

    meta = {
        'collection': 'feeds',
    }

    @property
    def xmlurl(self):
        return self.link

    @property
    def htmlurl(self):
        return self.urls.get('htmlUrl')

    def count_posts(self):
        """
        Count the number of associated posts

        TODO: This is very, very slow on Mongo (fix and make better).
        """
        return Post.objects(feed=self).count()

    def __unicode__(self):
        return self.title if self.title else self.link

class Post(me.DynamicDocument):
    """
    A single post ingested from a feed, identified by its url and by the
    signature (SHA256 hash) of its content.
    """

    feed      = me.ReferenceField(Feed)
    title     = me.StringField(max_length=512)
    url       = me.URLField(required=True, unique=True)
    pubdate   = me.DateTimeField()
    content   = me.StringField(required=True)
    tags      = me.ListField(me.StringField(max_length=256))
    signature = me.StringField(
        required=True, max_length=64, min_length=64, unique=True
    )
    created   = me.DateTimeField(default=datetime.now, required=True)
    updated   = me.DateTimeField(default=datetime.now, required=True)

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        """
        Signal handler that refreshes the updated timestamp and recomputes
        the content signature.
        """
        document.updated = datetime.now()
        document.signature = document.hash()

    meta = {
        'collection': 'posts',
    }

    def hash(self):
        """
        Returns the SHA256 hash of the content.
        """
        sha = hashlib.sha256()
        sha.update(self.content.encode('UTF-8'))
        return sha.hexdigest()

    def htmlize(self):
        """
        Returns an HTML string of the content of the Post.
        In the future we may use bleach to do sanitization or other simple
        sanity checks to ensure that things are going ok, which is why this
        method stub exists.
        """
        return self.content

    def __unicode__(self):
        return self.title if self.title else self.url


class Job(me.DynamicDocument):
    """
    A record of a single ingestion run: when it started and finished,
    whether it failed, and the counts it produced.
    """

    jobid     = me.UUIDField(binary=False, required=True)
    name      = me.StringField(max_length=128, default="Unknown")
    failed    = me.BooleanField(default=False)
    reason    = me.StringField(max_length=512)
    version   = me.StringField(max_length=10, default=baleen.get_version)
    started   = me.DateTimeField(default=datetime.now, required=True)
    finished  = me.DateTimeField(default=None)
    updated   = me.DateTimeField(default=datetime.now, required=True)
    errors    = me.MapField(field=me.IntField())
    counts    = me.MapField(field=me.IntField())
    totals    = me.MapField(field=me.IntField())

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        """
        Signal handler that refreshes the updated timestamp.
        """
        document.updated = datetime.now()

    meta = {
        'collection': 'jobs',
    }

    def duration(self, humanize=False):
        """
        Returns the timedelta of the duration, measured up to now if the
        job has not finished. If humanize is True, the delta is passed
        through humanizedelta instead.
        """
        finished = self.finished or datetime.now()
        delta = finished - self.started

        if humanize:
            return humanizedelta(
                days=delta.days,
                seconds=delta.seconds,
                microseconds=delta.microseconds
            )
        return delta

    @property
    def bootstrap_class(self):
        """
        Uses the duration to determine the colorization of the job.

        Finished jobs are "danger" when failed, "warning" when they took
        longer than 30 minutes and "success" otherwise. Running jobs are
        "success" under 30 minutes, "warning" from 30 minutes up to (but
        not including) 2 hours, and "danger" after that.
        """
        # Compute the duration once: for a running job every call reads the
        # clock again, so repeated calls could disagree with each other.
        elapsed = self.duration()

        if self.finished:
            if self.failed:
                return "danger"
            if elapsed > timedelta(minutes=30):
                return "warning"
            return "success"

        # The job is still running.
        if elapsed < timedelta(minutes=30):
            return "success"

        # A job at exactly 30 minutes is a warning, not a danger.
        if elapsed < timedelta(hours=2):
            return "warning"

        return "danger"

    def __unicode__(self):
        return u"{} Job {}".format(self.name, self.jobid)
237 | """ 238 | levels = { 239 | "DEBUG": "success", 240 | "INFO": "info", 241 | "WARNING": "warning", 242 | "WARN": "warning", 243 | "ERROR": "danger", 244 | "CRITICAL": "danger", 245 | } 246 | 247 | key = self.level.get('name') 248 | if key and key in levels: 249 | return levels[key] 250 | return "" 251 | 252 | def __unicode__(self): 253 | return self.message 254 | 255 | ########################################################################## 256 | ## Signals 257 | ########################################################################## 258 | 259 | me.signals.pre_save.connect(Feed.pre_save, sender=Feed) 260 | me.signals.pre_save.connect(Post.pre_save, sender=Post) 261 | me.signals.pre_save.connect(Post.pre_save, sender=Post) 262 | -------------------------------------------------------------------------------- /baleen/opml.py: -------------------------------------------------------------------------------- 1 | # baleen.opml 2 | # Reads opml files and gives back outline data 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Sat Sep 20 23:12:07 2014 -0400 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: opml.py [b2f890b] benjamin@bengfort.com $ 11 | 12 | """ 13 | Reads opml files and gives back outline data 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import baleen.models as db 21 | from bs4 import BeautifulSoup 22 | from collections import Counter 23 | from mongoengine.errors import * 24 | 25 | ########################################################################## 26 | ## Load Database function 27 | ########################################################################## 28 | 29 | def load_opml(path): 30 | """ 31 | Loads an OPML file into the Mongo database; returns the count of the 32 | number of documents added to the database. 
33 | """ 34 | 35 | opml = OPML(path) 36 | rows = 0 37 | for feed in opml: 38 | feed.pop('type') # Unneeded for database 39 | feed.pop('text') # Unneeded for database 40 | feed['link'] = feed.pop('xmlUrl') # Rename the XML URL 41 | feed['urls'] = { 42 | 'xmlUrl': feed['link'], # Add xmlUrl to urls 43 | 'htmlUrl': feed.pop('htmlUrl'), # Add htmlUrl to urls 44 | } 45 | feed = db.Feed(**feed) # Construct without an ObjectId 46 | 47 | try: 48 | feed.save() 49 | rows += 1 50 | except NotUniqueError: 51 | continue 52 | return rows 53 | 54 | ########################################################################## 55 | ## OPMLReader 56 | ########################################################################## 57 | 58 | class OPML(object): 59 | 60 | def __init__(self, path): 61 | """ 62 | Reader for OPML XML files. 63 | """ 64 | self.path = path 65 | 66 | def categories(self): 67 | """ 68 | Reads the file to capture all the categories 69 | """ 70 | with open(self.path, 'r') as data: 71 | soup = BeautifulSoup(data, 'xml') 72 | for topic in soup.select('body > outline'): 73 | yield topic['title'] 74 | 75 | def counts(self): 76 | """ 77 | Returns the counts of feeds in each category 78 | """ 79 | counts = Counter() 80 | for item in self: 81 | counts[item['category']] += 1 82 | return counts 83 | 84 | def __iter__(self): 85 | """ 86 | Yields a dictionary representing the attributes of the RSS feed 87 | from the OPML file; also captures category data. 
88 | """ 89 | with open(self.path, 'r') as data: 90 | soup = BeautifulSoup(data, 'xml') 91 | for topic in soup.select('body > outline'): 92 | for feed in topic.find_all('outline'): 93 | data = feed.attrs.copy() 94 | data['category'] = topic['title'] 95 | yield data 96 | 97 | def __len__(self): 98 | return sum(1 for item in self) 99 | 100 | def __str__(self): 101 | counts = self.counts() 102 | return "OPML with {} categories and {} feeds".format( 103 | len(counts), sum(counts.values()) 104 | ) 105 | 106 | def __repr__(self): 107 | return "<{} at {}>".format(self.__class__.__name__, self.path) 108 | -------------------------------------------------------------------------------- /baleen/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # baleen.utils 2 | # Utilities and helpers functions for the Baleen project. 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Sun Feb 21 15:00:06 2016 -0500 6 | # 7 | # Copyright (C) 2016 University of Maryland 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [caaaaca] benjamin@bengfort.com $ 11 | 12 | """ 13 | Utilities and helpers functions for the Baleen project. 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | # For the log configuration to work 21 | from . import mongolog 22 | -------------------------------------------------------------------------------- /baleen/utils/decorators.py: -------------------------------------------------------------------------------- 1 | # baleen.utils.decorators 2 | # Decorators and function utilities for Baleen. 
#
# Author:   Benjamin Bengfort
# Created:  Wed Mar 02 19:03:43 2016 -0500
#
# Copyright (C) 2016 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: decorators.py [538b33d] benjamin@bengfort.com $

"""
Decorators and function utilities for Baleen.
"""

##########################################################################
## Imports
##########################################################################

import signal
from functools import wraps
from baleen.utils.timez import Timer
from baleen.exceptions import BaleenError, TimeoutError

##########################################################################
## Memoization
##########################################################################

def memoized(fget):
    """
    Return a property attribute for new-style classes that only calls its
    getter on the first access. The result is stored and on subsequent
    accesses is returned, preventing the need to call the getter any more.
    https://github.com/estebistec/python-memoized-property

    The value is cached on the instance under the getter's name prefixed
    with an underscore.
    """
    attr_name = '_{0}'.format(fget.__name__)

    @wraps(fget)
    def fget_memoized(self):
        if not hasattr(self, attr_name):
            setattr(self, attr_name, fget(self))
        return getattr(self, attr_name)

    return property(fget_memoized)


##########################################################################
## Timer functions
##########################################################################

def timeit(func):
    """
    Wraps a function so that it returns a ``(result, timer)`` tuple, where
    timer is the Timer context object that measured the call.
    """

    @wraps(func)
    def timer_wrapper(*args, **kwargs):
        """
        Inner function that uses the Timer context object
        """
        with Timer() as timer:
            result = func(*args, **kwargs)

        return result, timer

    return timer_wrapper


def timeout(seconds):
    """
    Raises a TimeoutError if a function does not terminate within
    specified seconds.

    Implemented with ``signal.alarm`` and a SIGALRM handler that is
    installed for the duration of each call.
    """
    def _timeout_error(signum, frame):
        # The message is built without a backslash continuation inside the
        # literal, which embeds the next line's indentation in the text.
        raise TimeoutError(
            "Operation did not finish within {} seconds".format(seconds)
        )

    def timeout_decorator(func):

        @wraps(func)
        def timeout_wrapper(*args, **kwargs):
            previous = signal.signal(signal.SIGALRM, _timeout_error)
            signal.alarm(seconds)
            try:
                return func(*args, **kwargs)
            finally:
                signal.alarm(0)
                # Put back whatever handler was installed before the call;
                # signal.signal returns None when the previous handler was
                # not installed from Python, which cannot be re-installed.
                if previous is not None:
                    signal.signal(signal.SIGALRM, previous)

        return timeout_wrapper

    return timeout_decorator

##########################################################################
## Exception Handling
##########################################################################

def reraise(klass=BaleenError, message=None, trap=Exception):
    """
    Catches exceptions (those specified by trap) and then reraises the
    exception type specified by class. Also embeds the original exception as
    a property of the new exception: `error.original`. Finally you can
    specify another message to raise, otherwise the error string is used.
    """

    def reraise_decorator(func):

        @wraps(func)
        def reraise_wrapper(*args, **kwargs):
            """
            Capture Wrapper
            """
            try:
                return func(*args, **kwargs)
            except trap as e:
                # The message attribute only exists on Python 2 exceptions
                # (and may be empty), so fall back to the string form.
                text  = message or getattr(e, 'message', None) or str(e)
                error = klass(text)
                error.original = e
                raise error

        return reraise_wrapper

    return reraise_decorator

# --------------------------------------------------------------------------------
# /baleen/utils/logger.py:
# --------------------------------------------------------------------------------
# baleen.utils.logger
# Logging utility for Baleen
#
# Author:   Benjamin Bengfort
# Created:  Mon Sep 22 15:47:34 2014 -0400
#
# Copyright (C) 2014 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: logger.py [caaaaca] benjamin@bengfort.com $

"""
Logging utility for Baleen
"""

##########################################################################
## Imports
##########################################################################

import logging
import getpass
import warnings
import logging.config

from baleen.config import settings
from baleen.utils.timez import COMMON_DATETIME

##########################################################################
## Logging configuration
##########################################################################

configuration = {
    'version': 1,
    'disable_existing_loggers': False,

    'formatters': {
        'simple': {
            'format': '%(name)s %(levelname)s [%(asctime)s] -- %(message)s',
            'datefmt': COMMON_DATETIME,
        }
    },

    'handlers': {
        'null': {
            'level': 'DEBUG',
            'class': 'logging.NullHandler',
        },

        'console': {
            'level': 'WARNING',
            'class': 'logging.StreamHandler',
            'formatter': 'simple',
        },

        'logfile': {
            'level': 'INFO',
            'class': 'logging.handlers.RotatingFileHandler',
            'filename': settings.logfile,
            # An integer: the handler compares this value numerically.
            # NOTE(review): no backupCount is configured, and the logging
            # documentation says rollover never occurs when it is zero -
            # confirm whether rotation is actually wanted here.
            'maxBytes': 536870912, # 512 MB
            'formatter': 'simple',
        },

        'mongolog': {
            'level': 'INFO',
            'class': 'baleen.utils.mongolog.MongoHandler',
        }
    },

    'loggers': {
        'baleen': {
            'level': settings.loglevel,
            'handlers': ['logfile'],
            'propagate': True,
        },
        'baleen.ingest': {
            'level': 'INFO',
            'handlers': ['logfile', 'mongolog'],
            'propagate': False,
        }
    },
}

logging.config.dictConfigClass(configuration).configure()
if not settings.debug: logging.captureWarnings(True)

##########################################################################
## Logger utility
##########################################################################

class WrappedLogger(object):
    """
    Wraps the Python logging module's logger object to ensure that all baleen
    logging happens with the correct configuration as well as any extra
    information that might be required by the log file (for example, the user
    on the machine, hostname, IP address lookup, etc).

    Subclasses must specify their logger as a class variable so all instances
    have access to the same logging object.
    """

    logger = None

    def __init__(self, **kwargs):
        """
        Accepts ``raise_warnings`` (defaults to settings.debug) and
        ``logger`` (defaults to the class level logger). All remaining
        keyword arguments are kept and sent as extra context with every
        log call. Raises a TypeError if no usable logger is available.
        """
        self.raise_warnings = kwargs.pop('raise_warnings', settings.debug)
        self.logger = kwargs.pop('logger', self.logger)

        if not self.logger or not hasattr(self.logger, 'log'):
            raise TypeError(
                "Subclasses must specify a logger, not {}"
                .format(type(self.logger))
            )

        self.extras = kwargs

    def log(self, level, message, *args, **kwargs):
        """
        This is the primary method to override to ensure logging with extra
        options gets correctly specified.
        """
        extra = self.extras.copy()
        extra.update(kwargs.pop('extra', {}))

        kwargs['extra'] = extra
        self.logger.log(level, message, *args, **kwargs)

    def debug(self, message, *args, **kwargs):
        return self.log(logging.DEBUG, message, *args, **kwargs)

    def info(self, message, *args, **kwargs):
        return self.log(logging.INFO, message, *args, **kwargs)

    def warning(self, message, *args, **kwargs):
        """
        Specialized warnings system. If a warning subclass is passed into
        the keyword arguments and raise_warnings is True - the warning will
        be passed to the warnings module.
        """
        warncls = kwargs.pop('warning', None)
        if warncls and self.raise_warnings:
            warnings.warn(message, warncls)

        return self.log(logging.WARNING, message, *args, **kwargs)

    # Alias warn to warning
    warn = warning

    def error(self, message, *args, **kwargs):
        return self.log(logging.ERROR, message, *args, **kwargs)

    def critical(self, message, *args, **kwargs):
        return self.log(logging.CRITICAL, message, *args, **kwargs)


##########################################################################
## The Ingestion Logger Class
##########################################################################

class IngestLogger(WrappedLogger):
    """
    Performs logging for the baleen process with the log options above.
    """

    logger = logging.getLogger('baleen.ingest')

    def __init__(self, **kwargs):
        self._user = kwargs.pop('user', None)
        super(IngestLogger, self).__init__(**kwargs)

    @property
    def user(self):
        """
        The user passed to the constructor, otherwise the current user as
        reported by getpass (looked up on first access).
        """
        if not self._user:
            self._user = getpass.getuser()
        return self._user

    def log(self, level, message, *args, **kwargs):
        """
        Provide current user as extra context to the logger
        """
        # Work on a copy so the dictionary passed in by the caller is not
        # modified as a side effect of logging.
        extra = dict(kwargs.pop('extra', {}))
        extra.update({
            'user': self.user
        })

        kwargs['extra'] = extra
        super(IngestLogger, self).log(level, message, *args, **kwargs)


##########################################################################
## Logging Mixin
##########################################################################

class LoggingMixin(object):
    """
    Mix in to classes that need their own logging object!
    """

    @property
    def logger(self):
        """
        Instantiates and returns a IngestLogger instance
        """
        if not getattr(self, '_logger', None):
            self._logger = IngestLogger()
        return self._logger

# --------------------------------------------------------------------------------
# /baleen/utils/mongolog.py:
# --------------------------------------------------------------------------------
# baleen.utils.mongolog
# Handlers and formatters for logging to Mongo
#
# Author:   Benjamin Bengfort
# Created:  Tue Sep 23 09:11:52 2014 -0400
#
# Copyright (C) 2014 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: mongolog.py [caaaaca] benjamin@bengfort.com $

"""
Handlers and formatters for logging to Mongo
"""

##########################################################################
## Imports
##########################################################################

import getpass
import logging
import logging.config
from baleen.utils.timez import *
from baleen.config import settings

from datetime import datetime
from socket import gethostname
from pymongo import MongoClient
from pymongo.errors import PyMongoError

##########################################################################
## Mongo Formatter/Handler
##########################################################################

class MongoFormatter(logging.Formatter):

    def __init__(self, fmt='%(name)s %(levelname)s [%(asctime)s] -- %(message)s', datefmt=COMMON_DATETIME):
        super(MongoFormatter, self).__init__(fmt, datefmt)

    def format(self, record):
        """
        Formats LogRecord into a Python dictionary

        The dictionary starts as a copy of the record's attributes; the
        location, process and formatting-argument attributes are removed
        and the logger, message, timestamp, level, error, user and host
        entries are added.
        """

        ## Get the dictionary ready for Mongo
        ## (copied before formatting, which sets attributes on the record)
        data = record.__dict__.copy()

        ## Get the log message as intended via super
        message   = super(MongoFormatter, self).format(record)
        timestamp = datetime.fromtimestamp(data.pop('created'))
        error     = {
            'info': data.pop('exc_info'),
            'text': data.pop('exc_text'),
        }
        logger    = data.pop('name')
        level     = {
            'number': data.pop('levelno'),
            'name': data.pop('levelname'),
        }

        ## Location, process and argument details are not stored. They
        ## used to be collected into structures that were never used, one
        ## of which needed the Python 2 only ``unicode`` builtin.
        dropped = (
            'module', 'pathname', 'filename', 'lineno', 'funcName',
            'process', 'processName', 'thread', 'threadName',
            'args', 'relativeCreated', 'msecs', 'msg',
        )
        for key in dropped:
            del data[key]

        ## NOTE(review): error['info'] is the raw exc_info value from the
        ## record - confirm it can be stored when an exception is logged.
        data.update({
            'logger': logger,
            'message': message,
            'timestamp': timestamp,
            'level': level,
            'error': error,
            'user': getpass.getuser(),
            'host': gethostname(),
        })

        return data

class MongoHandler(logging.Handler):

    def __init__(self, level=logging.NOTSET, **kwargs):
        """
        Accepts the ``host``, ``port`` and ``database`` keyword arguments
        (defaulting to settings.database), as well as ``collection``
        ('logs'), ``fail_silently`` (False) and ``formatter`` (a new
        MongoFormatter). Connects immediately.
        """
        super(MongoHandler, self).__init__(level)
        self.host = kwargs.get('host', settings.database.host)
        self.port = kwargs.get('port', settings.database.port)
        self.database_name = kwargs.get('database', settings.database.name)
        self.collection_name = kwargs.get('collection', 'logs')
        self.fail_silently = kwargs.get('fail_silently', False)
        self.formatter = kwargs.get('formatter', MongoFormatter())

        self.connection = None
        self.database = None
        self.collection = None
        self.connect()

    def connect(self):
        """
        Connect to the Mongo database.

        A PyMongoError is re-raised unless fail_silently is set, in which
        case the handler is left without a collection and emits nothing.
        """
        try:
            self.connection = MongoClient(host=self.host, port=self.port)
        except PyMongoError:
            if self.fail_silently:
                return
            else:
                raise

        self.database = self.connection[self.database_name]
        self.collection = self.database[self.collection_name]

    def close(self):
        """
        Close the connection to the Mongo database.
        """
        if self.connection is not None:
            # Actually release the client; dropping the reference alone
            # leaves the connection to be cleaned up whenever (if ever)
            # the client object is collected.
            self.connection.close()
            self.connection = None
            self.database = None
            self.collection = None

        # Let the base class do its own handler bookkeeping as well.
        super(MongoHandler, self).close()

    def emit(self, record):
        """
        Insert log record into Mongo database
        """
        if self.collection is not None:
            try:
                # NOTE(review): Collection.insert is deprecated in newer
                # PyMongo releases - check against the pinned version.
                self.collection.insert(self.format(record))
            except Exception:
                if not self.fail_silently:
                    self.handleError(record)

if __name__ == '__main__':
    logger = logging.getLogger('demo')
    logger.setLevel(logging.INFO)
    logger.addHandler(MongoHandler())
    logger.info("This is a test of the logging system")

# --------------------------------------------------------------------------------
# /baleen/utils/timez.py:
# --------------------------------------------------------------------------------
# baleen.utils.timez
# Utility functions for Baleen
#
# Author:   Benjamin Bengfort
# Created:  Mon Sep 22 10:14:57 2014 -0400
#
# Copyright (C) 2014 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: timez.py [caaaaca] benjamin@bengfort.com $

"""
Utility functions for Baleen
"""

##########################################################################
## Imports
##########################################################################

import re
import time

from dateutil.tz import tzlocal, tzutc
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

##########################################################################
## Format constants
##########################################################################

HUMAN_DATETIME   = "%a %b %d %H:%M:%S %Y %z"
HUMAN_DATE       = "%b %d, %Y"
HUMAN_TIME       = "%I:%M:%S %p"
JSON_DATETIME    = "%Y-%m-%dT%H:%M:%S.%fZ" # Must be UTC
ISO8601_DATETIME = "%Y-%m-%dT%H:%M:%S%z"
ISO8601_DATE     = "%Y-%m-%d"
ISO8601_TIME     = "%H:%M:%S"
COMMON_DATETIME  = "%d/%b/%Y:%H:%M:%S %z"
WEB_UTC_DATETIME = "%a, %b %d, %Y at %H:%M UTC"

##########################################################################
## Module helper function
##########################################################################

def localnow():
    """
    Returns the current time as a datetime with the local timezone set.
    """
    return datetime.now(tzlocal())


def utcnow():
    """
    Returns the current UTC time as a datetime with the UTC timezone set.
    """
    now = datetime.utcnow()
    now = now.replace(tzinfo=tzutc())
    return now


# Matches a numeric UTC offset: a sign followed by four digits (+HHMM).
zre = re.compile(r'([\-\+]\d{4})')
def strptimez(dtstr, dtfmt):
    """
    Helper function that performs the timezone calculation to correctly
    compute the '%z' format that is not added by default in Python 2.7.

    If dtfmt has no '%z' directive this is plain datetime.strptime.
    Otherwise the +HHMM offset is taken out of dtstr, the remainder is
    parsed and the offset is subtracted, returning a datetime in UTC.
    Raises a ValueError if dtfmt expects an offset and dtstr has none.
    """
    if '%z' not in dtfmt:
        return datetime.strptime(dtstr, dtfmt)

    match = zre.search(dtstr)
    if match is None:
        raise ValueError(
            "no UTC offset found in {!r} for format {!r}".format(dtstr, dtfmt)
        )

    # Split the offset into sign, hours and minutes. Dividing the whole
    # number by 100 loses the minutes (+0530) and rounds negative offsets
    # with minutes the wrong way (-0530).
    offset = match.group(1)
    sign   = -1 if offset[0] == '-' else 1
    delta  = sign * timedelta(hours=int(offset[1:3]), minutes=int(offset[3:5]))

    dtfmt  = dtfmt.replace('%z', '')
    dtstr  = zre.sub('', dtstr)
    utctsp = datetime.strptime(dtstr, dtfmt) - delta
    return utctsp.replace(tzinfo=tzutc())


def humanizedelta(*args, **kwargs):
    """
    Wrapper around dateutil.relativedelta (same constructor args) and returns
    a humanized string representing the delta in a meaningful way.

    Additionally accepts a ``milliseconds`` keyword argument, which is
    converted and added to ``seconds``. Units whose value is zero are left
    out of the string.
    """
    if 'milliseconds' in kwargs:
        sec  = kwargs.get('seconds', 0)
        msec = kwargs.pop('milliseconds')
        kwargs['seconds'] = sec + (float(msec) / 1000.0)

    delta = relativedelta(*args, **kwargs)
    attrs = ('years', 'months', 'days', 'hours', 'minutes', 'seconds')
    parts = []
    for attr in attrs:
        value = getattr(delta, attr)
        if value:
            # Drop the trailing "s" of the unit unless the value is plural
            unit = attr if value > 1 else attr[:-1]
            parts.append('%d %s' % (value, unit))

    return " ".join(parts)


##########################################################################
## Timer functions
##########################################################################


class Timer(object):
    """
    A context object timer. Usage:
        >>> with Timer() as timer:
        ...     do_something()
        >>> print(timer.elapsed)
    """

    def __init__(self, wall_clock=True):
        """
        If wall_clock is True then use time.time() to get the number of
        actually elapsed seconds. If wall_clock is False, use the process
        time instead.
        """
        self.wall_clock = wall_clock
        if wall_clock:
            self.time = time.time
        else:
            # time.clock no longer exists on current Python 3 releases;
            # use time.process_time where available.
            self.time = getattr(time, 'process_time', None) or time.clock

        # Stubs for serializing an empty timer.
        self.started  = None
        self.finished = None
        self.elapsed  = 0.0

    def __enter__(self):
        self.started  = self.time()
        return self

    def __exit__(self, typ, value, tb):
        self.finished = self.time()
        self.elapsed  = self.finished - self.started

    def __str__(self):
        return humanizedelta(seconds=self.elapsed)

# --------------------------------------------------------------------------------
# /baleen/version.py:
# --------------------------------------------------------------------------------
# baleen.version
# Stores version information such that it can be read by setuptools.
3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Thu Feb 18 20:14:16 2016 -0500 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: version.py [edff1dd] benjamin@bengfort.com $ 11 | 12 | """ 13 | Stores version information such that it can be read by setuptools. 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | __version_info__ = { 21 | 'major': 0, 22 | 'minor': 3, 23 | 'micro': 3, 24 | 'releaselevel': 'final', 25 | 'serial': 0, 26 | } 27 | 28 | 29 | def get_version(short=False): 30 | """ 31 | Computes a string representation of the version from __version_info__. 32 | """ 33 | assert __version_info__['releaselevel'] in ('alpha', 'beta', 'final') 34 | vers = ["%(major)i.%(minor)i" % __version_info__, ] 35 | if __version_info__['micro']: 36 | vers.append(".%(micro)i" % __version_info__) 37 | if __version_info__['releaselevel'] != 'final' and not short: 38 | vers.append('%s%i' % (__version_info__['releaselevel'][0], 39 | __version_info__['serial'])) 40 | return ''.join(vers) 41 | -------------------------------------------------------------------------------- /baleen/wrangle.py: -------------------------------------------------------------------------------- 1 | # baleen.wrangle 2 | # Wrangles the post objects from a synchronized feed. 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Wed Mar 02 21:52:49 2016 -0500 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: wrangle.py [568d540] benjamin@bengfort.com $ 11 | 12 | """ 13 | Wrangles the post objects from a synchronized feed. 14 | 15 | Feed objects don't require a lot of wrangling, and are handled primarily by 16 | the FeedSync object. However Posts do require some hoop jumping, which this 17 | module provides. 
18 | """ 19 | 20 | ########################################################################## 21 | ## Imports 22 | ########################################################################## 23 | 24 | import requests 25 | 26 | from copy import deepcopy 27 | from dateutil import parser as dtparser 28 | 29 | from baleen.config import settings 30 | from baleen.models import Post 31 | from baleen.utils.decorators import reraise 32 | from baleen.exceptions import WranglingError, FetchError 33 | 34 | ########################################################################## 35 | ## Module Constants 36 | ########################################################################## 37 | 38 | FEEDPARSER_REMOVABLE_FIELDS = ( 39 | 'id', 'published_parsed', 'expired_parsed', 40 | 'updated', 'updated_parsed', 'created', 'created_parsed', 41 | ) 42 | 43 | ########################################################################## 44 | ## Post Wrangling Object 45 | ########################################################################## 46 | 47 | class PostWrangler(object): 48 | """ 49 | As FeedSync wraps Feed to do work, so to does PostWrangler wrap an entry 50 | to create a Post object, to ensure that data is of a high quality, and to 51 | do extra things like fetch the full webpage from the URL provided. 52 | 53 | This object directly converts its input (a dict) to a models.Post object. 54 | """ 55 | 56 | @classmethod 57 | def factory(klass, entries, feed=None): 58 | """ 59 | Yields a post wrangler for each entry in the entries. 60 | """ 61 | for entry in entries: 62 | yield klass(deepcopy(entry), feed=feed) 63 | 64 | def __init__(self, entry, feed=None): 65 | """ 66 | Entry is expected to be the dictionary object from a FeedSync 67 | After wrangling, it will become a models.Post object. 68 | """ 69 | self.feed = feed 70 | self.post = entry 71 | 72 | def is_wrangled(self): 73 | """ 74 | Checks the class of the post to see if wrangling has occurred. 
75 | """ 76 | return isinstance(self.post, Post) 77 | 78 | @reraise(klass=WranglingError) 79 | def wrangle(self, save=True): 80 | """ 81 | Converts the raw entry to standard data. If save, saves to database. 82 | 83 | Metholodolgy of wrangling is as follows: 84 | 85 | - all fields are kept in the entry except `published` and 86 | `published_parsed` since these many not contain TZ data - 87 | instead these two fields are replaced by `pubdate`. If there 88 | is no publication date, `pubdate` is set to None. 89 | 90 | - the tags field, if it exists, is converted to a list of 91 | strings. Although this may cause some data loss; it will 92 | make tagging of all posts simpler for the application. 93 | 94 | - link will be renamed url 95 | 96 | - content will be populated with summary, if content does not 97 | exist in the feed. Supposedly feedparser was already doing 98 | this, but it appears to not be regular. 99 | 100 | - title, url, content, and tags will all be encoded UTF-8. 101 | 102 | - removes the id field so a Mongo generated ObjectID is stored. 103 | 104 | See the models.Post for more information on the data structure. 105 | 106 | NOTE: This method is destructive, the raw entry will be converted. 
107 | """ 108 | ## Don't rewrangle an already wrangled post 109 | if self.is_wrangled(): 110 | return self.post 111 | 112 | ## Saves typing self.post everywhere 113 | post = self.post.copy() 114 | 115 | ## Remove unwanted fields 116 | for field in FEEDPARSER_REMOVABLE_FIELDS: 117 | if field in post: del post[field] 118 | 119 | ## Handle the pubdate and published strings 120 | post['pubdate'] = dtparser.parse(post.pop('published')) if 'published' in post else None 121 | 122 | ## Handle the tags in the entry 123 | post['tags'] = [tag['term'] for tag in self.post.tags] if 'tags' in post else [] 124 | 125 | ## Rename the link field to url 126 | post['url'] = self.post.link or post.get('href', None) or self.post.id 127 | if 'link' in post: del post['link'] 128 | 129 | ## Handle the content 130 | if 'content' not in post: 131 | post['content'] = post.get('summary') 132 | else: 133 | selected = None 134 | for idx, item in enumerate(post['content']): 135 | if idx == 0: 136 | # Take the first item 137 | selected = item 138 | elif item['type'] == 'text/html': 139 | # Unless we find another item that is html 140 | selected = item 141 | 142 | # Update the post with the content info 143 | post['language'] = selected.get('language') 144 | post['mimetype'] = selected.get('type') 145 | post['content'] = selected.get('value') 146 | 147 | ## Create the post object 148 | ## Start using self.post here! 149 | self.post = Post(feed=self.feed, **post) 150 | if save: 151 | self.post.save() 152 | 153 | return self.post 154 | 155 | @reraise(klass=FetchError) 156 | def fetch(self, save=True): 157 | """ 158 | Fetches the entire webpage for the post. If save, adds the page to 159 | the content of the post and saves it back to the database. 160 | 161 | Raises an exception if not wrangled yet. 162 | Raises exceptions if there is a problem with the fetch. 
163 | """ 164 | if not self.is_wrangled(): 165 | raise ValueError("Entry not yet wrangled, cannot fetch.") 166 | 167 | response = requests.get(self.post.url, timeout=settings.timeout) 168 | response.raise_for_status() 169 | 170 | if response.text: 171 | self.post.content = response.text 172 | 173 | if save: 174 | self.post.save() 175 | 176 | return self.post 177 | -------------------------------------------------------------------------------- /baleen/www/__init__.py: -------------------------------------------------------------------------------- 1 | # baleen.www 2 | # A small web application that will allow us to manage the Baleen app. 3 | # 4 | # Author: Laura Lorenz 5 | # Created: Sun Apr 3 12:59:42 2016 -0400 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [] benjamin@bengfort.com $ 11 | 12 | """ 13 | A small web application that will allow us to manage the Baleen app. 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | -------------------------------------------------------------------------------- /baleen/www/app.py: -------------------------------------------------------------------------------- 1 | # baleen.www.app 2 | # Flask application definition in Baleen. 3 | # 4 | # Author: Laura Lorenz 5 | # Created: Sun Apr 3 12:59:42 2016 -0400 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: app.py [] lalorenz6@gmail.com $ 11 | 12 | """ 13 | Flask application definition in Baleen. 
14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import baleen 21 | 22 | from baleen.config import settings 23 | from baleen.models import Feed, Post, Job, Log 24 | from baleen.utils.timez import WEB_UTC_DATETIME 25 | 26 | from flask import Flask, render_template, request 27 | from flask.ext.mongoengine import MongoEngine 28 | from flask_humanize import Humanize 29 | 30 | ########################################################################## 31 | ## Flask Application 32 | ########################################################################## 33 | 34 | # set up an app instance 35 | app = Flask(__name__) 36 | 37 | # set debug to true to get debug pages when there is an error 38 | app.debug = settings.debug 39 | 40 | # configure the app with the confire settings 41 | app.config['MONGODB_SETTINGS'] = { 42 | 'db': settings.database.name, 43 | 'host': settings.database.host, 44 | 'port': settings.database.port, 45 | } 46 | 47 | # connect to the database using the Flask extension 48 | db = MongoEngine(app) 49 | 50 | # add the humanize extension 51 | humanize = Humanize(app) 52 | 53 | ########################################################################## 54 | ## Routes 55 | ########################################################################## 56 | 57 | @app.route("/") 58 | def index(): 59 | """ 60 | Displays an index page with the feed listing 61 | """ 62 | # get all the stuff we want 63 | feeds = Feed.objects() 64 | feed_count = feeds.count() 65 | topics = set([feed.category for feed in Feed.objects.only('category')]) 66 | feeds_topics_counts = len(topics) 67 | 68 | # TODO: probably should put this in the database along with the feed. 
69 | feed_icons = {'gaming':'fa fa-gamepad', 70 | 'design':'fa fa-building-o', 71 | 'business':'fa fa-briefcase', 72 | 'cinema':'fa fa-video-camera', 73 | 'data-science':'fa fa-area-chart', 74 | 'cooking':'fa fa-cutlery', 75 | 'sports':'fa fa-futbol-o', 76 | 'books':'fa fa-book', 77 | 'tech':'fa fa-cogs', 78 | 'politics':'fa fa-university', 79 | 'news':'fa fa-newspaper-o', 80 | 'essays':'fa fa-pencil-square-o', 81 | 'do-it-yourself':'fa fa-wrench' 82 | } 83 | feeds_topics = { 84 | topic: Feed.objects(category=topic) 85 | for topic in topics 86 | } 87 | 88 | # load all the data into the templates/index.html template 89 | return render_template('index.html', 90 | feeds=feeds, 91 | feeds_topics=feeds_topics, 92 | feed_count=feed_count, 93 | topic_count=feeds_topics_counts, 94 | feed_icons=feed_icons) 95 | 96 | @app.route("/status/") 97 | def status(): 98 | """ 99 | Displays the current Baleen status and job listing 100 | """ 101 | version = baleen.get_version() 102 | counts = { 103 | 'feeds': Feed.objects.count(), 104 | 'posts': Post.objects.count(), 105 | 'jobs': Job.objects.count(), 106 | } 107 | latest_job = Job.objects.order_by('-started').first() 108 | latest_feed = Feed.objects.order_by('-updated').first() 109 | latest_post = Post.objects.order_by('-id').first() 110 | recent_jobs = Job.objects.order_by('-started').limit(10) 111 | 112 | # load all data into the templates/status.html template 113 | return render_template( 114 | 'status.html', 115 | latest_job=latest_job, 116 | latest_feed=latest_feed, 117 | latest_post=latest_post, 118 | version=version, 119 | counts=counts, 120 | dtfmt=WEB_UTC_DATETIME, 121 | recent_jobs=recent_jobs 122 | ) 123 | 124 | 125 | @app.route("/logs/") 126 | def logs(): 127 | """ 128 | Displays log records from the Mongo Database. 129 | This is paginated and allows flexible per-page counts (max 200 records).
130 | """ 131 | # Get pagination information for request 132 | page = int(request.args.get('page', 1)) 133 | per_page = min(int(request.args.get('per_page', 50)), 200) 134 | 135 | # Compute the pagination variables 136 | n_logs = Log.objects.count() 137 | n_pages = (n_logs + per_page // 2) // per_page 138 | nextp = page + 1 if page + 1 <= n_pages else None 139 | prevp = page - 1 if page > 1 else None 140 | 141 | # Perform query 142 | offset = (page - 1) * per_page 143 | logs = Log.objects.order_by('-id').skip(offset).limit(per_page) 144 | 145 | return render_template( 146 | 'logs.html', 147 | page = page, 148 | num_pages = n_pages, 149 | per_page = per_page, 150 | logs = logs, 151 | num_logs = n_logs, 152 | next = nextp, 153 | prev = prevp, 154 | ) 155 | 156 | 157 | ########################################################################## 158 | ## Main Method 159 | ########################################################################## 160 | 161 | if __name__ == "__main__": 162 | # if you run this file as a script, it will start the flask server 163 | app.run(host=settings.server.host, port=settings.server.port) 164 | -------------------------------------------------------------------------------- /baleen/www/static/css/baleen.css: -------------------------------------------------------------------------------- 1 | /* Baleen specific styles for various things. 
*/ 2 | 3 | html, 4 | body { 5 | height: 100%; 6 | } 7 | 8 | /*body { 9 | padding-top: 70px; 10 | }*/ 11 | 12 | /* Wrapper for page content to push down footer */ 13 | #wrap { 14 | min-height: 100%; 15 | height: auto; 16 | /* Negative indent footer by its height */ 17 | margin: 0 auto -76px; 18 | /* Pad bottom by footer height */ 19 | padding: 0 0 106px; 20 | } 21 | 22 | .navbar-brand-img { 23 | width: 22px; 24 | height: 22px; 25 | float: left; 26 | margin: -3px 4px 0 0; 27 | } 28 | 29 | /* Set the fixed height of the footer here */ 30 | #footer { 31 | background-color: #fff; 32 | border-top: 1px solid #eee; 33 | height: 76px; 34 | padding: 30px 15px; 35 | } 36 | 37 | .app-tabs .tab-pane { 38 | margin-top: 20px; 39 | } 40 | 41 | #loading img { 42 | margin: 200px auto; 43 | } 44 | 45 | th { 46 | background-color: #008CBA; 47 | color: white; 48 | } 49 | -------------------------------------------------------------------------------- /baleen/www/static/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistrictDataLabs/baleen/bb2ae323a3ab3a066a4a289401847e1251abc55d/baleen/www/static/favicon.png -------------------------------------------------------------------------------- /baleen/www/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {% block meta %} 5 | 6 | 7 | 8 | 9 | 10 | 11 | {% endblock %} 12 | 13 | {% block title %}Baleen Status{% endblock %} 14 | 15 | 16 | 17 | 18 | {% block stylesheets %} 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | {% endblock %} 27 | 28 | 29 | 30 |
31 | 32 | {% block navbar %} 33 | {% include 'components/navbar.html' %} 34 | {% endblock %} 35 | 36 | 37 |
38 | {% block body %} 39 | {% endblock %} 40 |
41 | 42 |
43 | 44 | 45 | {% block footer %} 46 | {% include 'components/footer.html' %} 47 | {% endblock %} 48 | 49 | {% block modals %}{% endblock %} 50 | 51 | {% block javascripts %} 52 | 53 | 54 | 55 | {% endblock %} 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /baleen/www/templates/components/footer.html: -------------------------------------------------------------------------------- 1 | 19 | -------------------------------------------------------------------------------- /baleen/www/templates/components/navbar.html: -------------------------------------------------------------------------------- 1 | 66 | -------------------------------------------------------------------------------- /baleen/www/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | {% block title %}RSS Feed List{% endblock %} 3 | 4 | {% block body %} 5 |
6 | 7 | 8 |
9 |
10 | 13 |
14 |
15 | 16 | 17 |
18 |
19 | 20 |
21 | {% for topic, feeds in feeds_topics|dictsort %} 22 | 23 | {# subscript with the loop variable; dot access would look up the literal key "topic" #} {{ feed_icons[topic] }} 24 | {{ feeds.count() }} 25 | {{ topic|title }} 26 | 27 | {% endfor %} 28 |
29 | 30 | 31 | {% for topic, feeds in feeds_topics|dictsort %} 32 |

33 | {{ topic|title }} ({{ feeds.count() }} feeds)

34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | {% for feed in feeds %} 42 | 43 | 45 | 46 | 47 | {% endfor %} 48 | 49 |
ActiveTitleLink
44 | {{ feed.title }}{{ feed.link }}
50 | 51 | Back to Top 52 | {% endfor %} 53 |
54 |
55 | 56 |
57 | {% endblock%} 58 | -------------------------------------------------------------------------------- /baleen/www/templates/logs.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | {% block title %}Baleen Log Records{% endblock %} 3 | 4 | {% block body %} 5 |
6 | 7 | 8 |
9 |
10 | 14 |
15 |
16 | 17 | 18 |
19 |
20 | 21 | 22 | 23 | {% for log in logs %} 24 | 25 | 26 | 27 | {% else %} 28 | 29 | 30 | 31 | {% endfor %} 32 | 33 |
{{ log.message }}
No log records yet
34 | 35 | 45 |
46 |
47 | 48 | {% endblock %} 49 | -------------------------------------------------------------------------------- /baleen/www/templates/status.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | {% block title %}Job Status Page{% endblock %} 3 | 4 | {% block body %} 5 |
6 | 7 | 8 |
9 |
10 | 14 | 15 |

Latest Job

16 |
17 |
18 | 19 |
20 |
21 | 22 | 23 |
24 | 25 |
26 | Job Info 27 |
28 | 29 | 30 | 31 | 32 | {% if not latest_job.finished %} 33 | 34 | 35 | {% elif latest_job.failed %} 36 | 37 | 38 | 39 | {% else %} 40 | 41 | 42 | {% endif %} 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
StatusJob Running StatusJob Failed{{ latest_job.reason }} StatusJob Complete
Type{{ latest_job.name }} v{{ latest_job.version }}
Job ID{{ latest_job.jobid }}
Started{{ latest_job.started.strftime(dtfmt) }} ({{ latest_job.started|humanize('naturaltime') }})
Finished{% if latest_job.finished %}{{ latest_job.finished.strftime(dtfmt) }} ({{ latest_job.finished|humanize('naturaltime') }}){% endif %}
Duration{{ latest_job.duration(humanize=True) }}
65 |
66 | 67 | 68 |
69 | 70 |
Latest Post
71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 |
Title{{ latest_post.title }}
Feed{{ latest_post.feed.title }}
Published{{ latest_post.pubdate.strftime(dtfmt) }}
Fetched{{ latest_post.updated.strftime(dtfmt) }} ({{ latest_post.updated|humanize('naturaltime') }})
91 |
92 | 93 | 94 |
95 | 96 |
Latest Feed
97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 |
Title{{ latest_feed.title }}
eTag{{ latest_feed.etag }}
Modified{{ latest_feed.modified }}
Updated{{ latest_feed.updated.strftime(dtfmt) }} ({{ latest_feed.updated|humanize('naturaltime') }})
117 |
118 | 119 |
120 | 121 |
122 | 123 | 124 |
125 | 126 |
Counts
127 | 128 | 129 |
    130 | {% for key, value in latest_job.counts.items() %} 131 |
  • 132 | {{ value|humanize('intcomma') }} 133 | {{ key.replace("_", " ").title() }} 134 |
  • 135 | {% endfor %} 136 |
137 |
138 | 139 | 140 |
141 | 142 |
Errors
143 | 144 | 145 |
    146 | {% for key, value in latest_job.errors.items() %} 147 |
  • 148 | {{ value|humanize('intcomma') }} 149 | {{ key }} 150 |
  • 151 | {% endfor %} 152 |
153 |
154 | 155 |
156 |
157 | 158 | 159 |
160 |
161 | 162 |

Job History

163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | {% for job in recent_jobs %} 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | {% endfor %} 184 | 185 |
JobJob IDPostsErrorsStartedDuration
{{ job.name }} v{{ job.version }}{{ job.jobid }}{% if 'posts' in job.counts %}{{ job.counts['posts']|humanize('intcomma') }}{% else %}N/A{% endif %}{% if 'errors' in job.counts %}{{ job.counts['errors']|humanize('intcomma') }}{% else %}N/A{% endif %}{{ job.started|humanize('naturaltime') }}{{ job.duration(humanize=True) }}
186 | 187 |
188 |
189 | 190 |
191 | {% endblock %} 192 | -------------------------------------------------------------------------------- /bin/baleen: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # baleen 3 | # Management and administration script for Baleen 4 | # 5 | # Author: Benjamin Bengfort 6 | # Created: Fri Sep 19 10:56:44 2014 -0400 7 | # 8 | # Copyright (C) 2014 Bengfort.com 9 | # For license information, see LICENSE.txt 10 | # 11 | # ID: baleen [5ad94d7] benjamin@bengfort.com $ 12 | 13 | """ 14 | Management and administration script for Baleen 15 | """ 16 | 17 | ########################################################################## 18 | ## Imports 19 | ########################################################################## 20 | 21 | from baleen.console import BaleenUtility 22 | 23 | 24 | ########################################################################## 25 | ## Load and execute the CLI utility 26 | ########################################################################## 27 | 28 | if __name__ == '__main__': 29 | app = BaleenUtility.load() 30 | app.execute() 31 | -------------------------------------------------------------------------------- /bin/doctimes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # export publish dates of documents in the corpus. 
3 | 4 | import os 5 | import csv 6 | import bson 7 | import argparse 8 | 9 | from datetime import datetime 10 | from pymongo import MongoClient 11 | 12 | 13 | def main(args): 14 | 15 | # Connect to the Database 16 | conn = MongoClient() 17 | db = conn.baleen 18 | posts = db.posts 19 | 20 | # Create a hook to the CSV file 21 | writer = csv.DictWriter(args.outpath, fieldnames=["_id", "pubdate"]) 22 | writer.writeheader() 23 | 24 | # Collect the IDs and pubdates 25 | count = 0 26 | for row in posts.find({}, {"_id": 1, "pubdate": 1}): 27 | count += 1 28 | writer.writerow(row) 29 | 30 | print("wrote {} rows to {}".format(count, args.outpath.name)) 31 | 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser( 35 | description="export pubdates for documents by id" 36 | ) 37 | 38 | parser.add_argument( 39 | "-o", "--outpath", default="pubdates.csv", type=argparse.FileType('w'), 40 | help="location to write out the results csv file to", 41 | ) 42 | 43 | args = parser.parse_args() 44 | main(args) 45 | -------------------------------------------------------------------------------- /bin/ldoc.py: -------------------------------------------------------------------------------- 1 | import bson 2 | from pymongo import MongoClient 3 | 4 | 5 | def main(): 6 | connection = MongoClient() 7 | db = connection.baleen 8 | collection = db.posts 9 | col_size = collection.count() 10 | print("Found %d documents in baleen:posts", col_size) 11 | idx = 1 12 | 13 | col_sizes = {} 14 | for post in collection.find(): 15 | print("Item {} of {}".format(idx, col_size)) 16 | #print(post) 17 | #print(post['_id']) 18 | #print("{} - {}".format(len(post['content']), post['_id'])) 19 | col_sizes[post['_id']] = len(post['content']) 20 | idx += 1 21 | 22 | print(col_sizes) 23 | 24 | for w in sorted(col_sizes, key=col_sizes.get, reverse=True): 25 | print w, col_sizes[w] 26 | 27 | 28 | if __name__ == '__main__': 29 | main() 30 | 
-------------------------------------------------------------------------------- /conf/baleen-example.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file for the Baleen application 2 | # This file belongs in: /etc/baleen.yaml 3 | 4 | # Basic Flags 5 | debug: true 6 | 7 | # Logging Information 8 | logfile: 'baleen.log' 9 | loglevel: 'DEBUG' 10 | 11 | # Use Requests to fetch complete HTML 12 | fetch_html: True 13 | 14 | # Database Information 15 | database: 16 | host: localhost 17 | port: 27017 18 | name: baleen 19 | 20 | # Web Admin Server 21 | server: 22 | host: 127.0.0.1 23 | port: 5000 24 | -------------------------------------------------------------------------------- /conf/upstart/baleen.conf: -------------------------------------------------------------------------------- 1 | # baleen.conf 2 | # 3 | # Author: Benjamin Bengfort 4 | # Created: Tue Mar 01 08:18:40 2016 -0500 5 | # 6 | # Upstart configuration for Baleen 7 | # For more details on this configuration see the Baleen deployment docs. 8 | # This file belongs in: /etc/init/baleen.conf 9 | 10 | # Documentation 11 | author "Benjamin Bengfort venv/lib/python2.7/site-packages/baleen.pth 24 | 25 | 4. Create your local configuration file. Edit it with the connection details to your local MongoDB server. This is also a good time to check and make sure that you can create a database called Baleen on Mongo. 26 | 27 | $ cp conf/baleen-example.yaml conf/baleen.yaml 28 | 29 | The YAML file should look similar to: 30 | 31 | debug: true 32 | testing: false 33 | database: 34 | host: localhost 35 | port: 27017 36 | name: baleen 37 | 38 | 5. Run the tests to make sure everything is ok. 39 | 40 | $ make test 41 | 42 | 6. Make sure that the command line utility is ready to go: 43 | 44 | $ bin/baleen --help 45 | 46 | 7. Import the feeds from the `feedly.opml` file in the fixtures. 
47 | 48 | $ bin/baleen import fixtures/feedly.opml 49 | Ingested 101 feeds from 1 OPML files 50 | 51 | 8. Perform an ingestion of the feeds that were imported from the `feedly.opml` file. 52 | 53 | $ bin/baleen ingest 54 | 55 | Your Mongo database collections should be created as you add new documents to them, and at this point you're ready to develop! 56 | -------------------------------------------------------------------------------- /fixtures/fields.json: -------------------------------------------------------------------------------- 1 | // A summary of the fields for many RSS feeds downloaded using Python feedparser 2 | { 3 | "fields": { 4 | "dc_source": 7, 5 | "media_credit": 71, 6 | "updated_parsed": 277, 7 | "links": 2130, 8 | "twitter": 20, 9 | "media_text": 31, 10 | "summary_detail": 1993, 11 | "href": 386, 12 | "wfw_commentrss": 896, 13 | "id": 2100, 14 | "slash_comments": 866, 15 | "contributors": 3, 16 | "published_parsed": 2070, 17 | "title": 2130, 18 | "comments": 1061, 19 | "content": 1134, 20 | "source": 40, 21 | "title_detail": 2130, 22 | "mash_thumbnail": 30, 23 | "dc_identifier": 65, 24 | "updated": 277, 25 | "gd_image": 50, 26 | "media_description": 18, 27 | "tags": 1275, 28 | "feedburner_origlink": 896, 29 | "media_group": 10, 30 | "media_content": 361, 31 | "feedburner_origenclosurelink": 26, 32 | "thr_total": 85, 33 | "authors": 1691, 34 | "author_detail": 1574, 35 | "desceditca": 1, 36 | "guidislink": 2100, 37 | "titleeditca": 1, 38 | "dc_type": 15, 39 | "author": 1691, 40 | "media_thumbnail": 287, 41 | "summary": 2130, 42 | "media_copyright": 25, 43 | "published": 2070, 44 | "link": 2130, 45 | "postid": 40 46 | }, 47 | "feeds": 101, 48 | "entries": 2130, 49 | "versions": { 50 | "": 4, 51 | "rss20": 82, 52 | "rss10": 1, 53 | "atom10": 14 54 | }, 55 | "time": 73.437 56 | } 57 | -------------------------------------------------------------------------------- /mkdocs.yml: 
-------------------------------------------------------------------------------- 1 | site_name: Baleen 2 | repo_name: GitHub 3 | repo_url: https://github.com/bbengfort/baleen 4 | site_description: An automated ingestion service for blogs to construct a corpus for NLP research. 5 | site_author: District Data Labs 6 | copyright: Built by District Data Labs, licensed by Creative Commons License 7 | theme: readthedocs 8 | 9 | pages: 10 | - "Introduction": index.md 11 | - "Component Architecture": components.md 12 | - "Service Architecture": service.md 13 | - "About Baleen": about.md 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ## Requests 2 | feedparser==5.2.1 3 | requests==2.9.1 4 | 5 | ## Database 6 | pymongo==3.2.1 7 | mongoengine==0.10.6 8 | blinker==1.4 9 | 10 | ## Parsing 11 | beautifulsoup4==4.4.1 12 | lxml==3.5.0 13 | 14 | ## Configuration 15 | confire==0.2.0 16 | PyYAML==3.11 17 | 18 | ## Command Line 19 | commis==0.2 20 | colorama==0.3.6 21 | 22 | ## Utilities 23 | schedule==0.3.2 24 | python-dateutil==2.4.2 25 | enum34==1.1.3 26 | six==1.10.0 27 | 28 | ## Web Admin 29 | Flask==0.10.1 30 | Flask-Admin==1.4.0 31 | Flask-WTF==0.12 32 | flask-mongoengine==0.7.5 33 | Flask-Humanize==0.3.0 34 | WTForms==2.1 35 | Jinja2==2.8 36 | humanize==0.5.1 37 | itsdangerous==0.24 38 | MarkupSafe==0.23 39 | Werkzeug==0.11.5 40 | 41 | ## Testing 42 | ## Uncomment and install for development 43 | #nose==1.3.7 44 | #coverage==4.0.3 45 | #mock==1.3.0 46 | #funcsigs==0.4 47 | #pbr==1.8.1 48 | #mongomock==3.2.1 49 | #sentinels==0.0.6 50 | 51 | ## Building 52 | ## Uncomment and install for deployment 53 | #wheel==0.29.0 54 | 55 | ## Pip Freeze Stuff 56 | #Python==2.7.10 57 | #pip==8.1.1 58 | #setuptools==0.9.7 59 | #wsgiref==0.1.2 60 | -------------------------------------------------------------------------------- /setup.cfg: 
-------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # setup 3 | # Setup script for installing baleen 4 | # 5 | # Author: Benjamin Bengfort 6 | # Created: Fri Sep 19 10:59:24 2014 -0400 7 | # 8 | # Copyright (C) 2014 Bengfort.com 9 | # For license information, see LICENSE.txt and NOTICE.md 10 | # 11 | # ID: setup.py [5ad94d7] benjamin@bengfort.com $ 12 | 13 | """ 14 | Setup script for installing baleen. 15 | See http://bbengfort.github.io/programmer/2016/01/20/packaging-with-pypi.html 16 | """ 17 | 18 | ########################################################################## 19 | ## Imports 20 | ########################################################################## 21 | 22 | import os 23 | import re 24 | import codecs 25 | 26 | from setuptools import setup 27 | from setuptools import find_packages 28 | 29 | ########################################################################## 30 | ## Package Information 31 | ########################################################################## 32 | 33 | ## Basic information 34 | NAME = "baleen" 35 | DESCRIPTION = "An automated ingestion service for blogs to construct a corpus for NLP research." 
36 | AUTHOR = "Benjamin Bengfort" 37 | EMAIL = "benjamin@bengfort.com" 38 | LICENSE = "MIT" 39 | REPOSITORY = "https://github.com/bbengfort/baleen" 40 | PACKAGE = "baleen" 41 | 42 | ## Define the keywords 43 | KEYWORDS = ('nlp', 'baleen', 'ingestion', 'blogs', 'rss') 44 | 45 | ## Define the classifiers 46 | ## See https://pypi.python.org/pypi?%3Aaction=list_classifiers 47 | CLASSIFIERS = ( 48 | 'Development Status :: 4 - Beta', 49 | 'Environment :: Console', 50 | 'Intended Audience :: Developers', 51 | 'License :: OSI Approved :: MIT License', 52 | 'Natural Language :: English', 53 | 'Operating System :: OS Independent', 54 | 'Programming Language :: Python', 55 | 'Programming Language :: Python :: 2.7', 56 | 'Topic :: Software Development', 57 | 'Topic :: Software Development :: Libraries :: Python Modules', 58 | 'Topic :: Utilities', 59 | ) 60 | 61 | ## Important Paths 62 | PROJECT = os.path.abspath(os.path.dirname(__file__)) 63 | REQUIRE_PATH = "requirements.txt" 64 | VERSION_PATH = os.path.join(PACKAGE, "version.py") 65 | PKG_DESCRIBE = "DESCRIPTION.txt" 66 | 67 | ## Directories to ignore in find_packages 68 | EXCLUDES = ( 69 | "tests", "bin", "docs", "fixtures", "register", "notebooks", 70 | ) 71 | 72 | ########################################################################## 73 | ## Helper Functions 74 | ########################################################################## 75 | 76 | def read(*parts): 77 | """ 78 | Assume UTF-8 encoding and return the contents of the file located at the 79 | absolute path from the PROJECT directory joined with *parts. 80 | """ 81 | with codecs.open(os.path.join(PROJECT, *parts), 'rb', 'utf-8') as f: 82 | return f.read() 83 | 84 | 85 | def get_version(path=VERSION_PATH): 86 | """ 87 | Reads the version.py module defined in the VERSION_PATH to find the get_version 88 | function, and executes it to ensure that it is loaded correctly.
89 | """ 90 | namespace = {} 91 | exec(read(path), namespace) 92 | return namespace['get_version']() 93 | 94 | 95 | def get_requires(path=REQUIRE_PATH): 96 | """ 97 | Yields a generator of requirements as defined by the REQUIRE_PATH which 98 | should point to a requirements.txt output by `pip freeze`. 99 | """ 100 | for line in read(path).splitlines(): 101 | line = line.strip() 102 | if line and not line.startswith('#'): 103 | yield line 104 | 105 | ########################################################################## 106 | ## Define the configuration 107 | ########################################################################## 108 | 109 | config = { 110 | "name": NAME, 111 | "version": get_version(), 112 | "description": DESCRIPTION, 113 | "long_description": read(PKG_DESCRIBE), 114 | "license": LICENSE, 115 | "author": AUTHOR, 116 | "author_email": EMAIL, 117 | "maintainer": AUTHOR, 118 | "maintainer_email": EMAIL, 119 | "url": REPOSITORY, 120 | "download_url": "{}/tarball/v{}".format(REPOSITORY, get_version()), 121 | "packages": find_packages(where=PROJECT, exclude=EXCLUDES), 122 | "install_requires": list(get_requires()), 123 | "classifiers": CLASSIFIERS, 124 | "keywords": KEYWORDS, 125 | "zip_safe": False, 126 | "scripts": ['bin/baleen'], 127 | } 128 | 129 | ########################################################################## 130 | ## Run setup script 131 | ########################################################################## 132 | 133 | if __name__ == '__main__': 134 | setup(**config) 135 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests 2 | # Testing for the baleen module 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Fri Sep 19 10:58:15 2014 -0400 6 | # 7 | # Copyright (C) 2014 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [5ad94d7] 
benjamin@bengfort.com $ 11 | 12 | """ 13 | Testing for the baleen module 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import unittest 21 | 22 | ########################################################################## 23 | ## Module Constants 24 | ########################################################################## 25 | 26 | TEST_VERSION = "0.3.3" ## Also the expected version onf the package 27 | 28 | ########################################################################## 29 | ## Test Cases 30 | ########################################################################## 31 | 32 | class InitializationTest(unittest.TestCase): 33 | 34 | def test_initialization(self): 35 | """ 36 | Tests a simple world fact by asserting that 10*10 is 100 37 | """ 38 | self.assertEqual(10*10, 100) 39 | 40 | def test_import(self): 41 | """ 42 | Can import baleen 43 | """ 44 | try: 45 | import baleen 46 | except ImportError: 47 | self.fail("Unable to import the baleen module!") 48 | 49 | def test_version(self): 50 | """ 51 | Assert that the version is sane 52 | """ 53 | import baleen 54 | self.assertEqual(TEST_VERSION, baleen.__version__) 55 | -------------------------------------------------------------------------------- /tests/fixtures/feedly.opml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | null subscriptions in feedly Cloud 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- 
class ExportTests(unittest.TestCase):
    """
    Tests for the MongoExporter construction rules.
    """

    def test_scheme_specification(self):
        """
        Only the schemes listed in SCHEMES may be used by an exporter.
        """

        # Every supported scheme must construct without complaint
        for known in SCHEMES:
            try:
                MongoExporter("/tmp/corpus", scheme=known)
            except ExportError:
                self.fail("Could not use expected scheme, {}".format(known))

        # Anything else must be rejected at construction time
        unsupported = ('text', 'txt', 'bson', 'xml', 'yaml')
        for unknown in unsupported:
            with self.assertRaises(ExportError):
                MongoExporter("/tmp/corpus", scheme=unknown)
class FeedSyncTests(unittest.TestCase):
    """
    Tests for FeedSync: type detection, url extraction, the feedparser
    wrapper, and persistence of synchronized feeds (against mongomock).
    """

    def setUp(self):
        """
        Create the mongomock connection
        """
        self.conn = connect(host='mongomock://localhost')
        assert isinstance(self.conn, MockMongoClient)

        # Clear out the database
        for feed in Feed.objects(): feed.delete()
        for post in Post.objects(): post.delete()

    def tearDown(self):
        """
        Drop the mongomock connection
        """
        assert isinstance(self.conn, MockMongoClient)
        self.conn = None

    def test_fsync_factory(self):
        """
        Test multiple types in the feed sync factory
        """
        cases = (
            STR_FEED, UNICODE_FEED, OPML_FEED, MONGO_FEED
        )

        for fsync in FeedSync.factory(cases):
            self.assertIsInstance(fsync, FeedSync)

    def test_type_check(self):
        """
        Assert that strings, Feeds, and dicts can be sync'd
        """
        cases = (
            (STR_FEED, FeedSync.URL),
            (UNICODE_FEED, FeedSync.URL),
            (OPML_FEED, FeedSync.DICT),
            (MONGO_FEED, FeedSync.MODEL),
        )

        for feed, ftype in cases:
            fsync = FeedSync(feed)
            self.assertEqual(fsync.type, ftype)

    def test_bad_type(self):
        """
        Test that bad types raise an exception in sync
        """
        cases = (
            10, {u'htmlurl': u'https://mubi.com/notebook/posts'}, ['a','b','c']
        )

        for case in cases:
            fsync = FeedSync(case)
            # The type is resolved on attribute access, not at construction
            with self.assertRaises(FeedTypeError):
                fsync.type

    def test_url_extraction(self):
        """
        Test the feed sync multiple type url extraction
        """
        cases = (
            (STR_FEED, STR_FEED),
            (UNICODE_FEED, UNICODE_FEED),
            (OPML_FEED, OPML_FEED['xmlUrl']),
            (MONGO_FEED, MONGO_FEED.link),
        )

        for feed, url in cases:
            fsync = FeedSync(feed)
            self.assertEqual(fsync.url, url)

    @mock.patch('baleen.feed.feedparser.parse')
    def test_feedparser_wrapping(self, mock_feedparser):
        """
        Test the feedparser access by mocking feedparser calls
        """

        # Ensure that the mocking worked out for us
        assert mock_feedparser is feedparser.parse

        cases = (
            (STR_FEED, STR_FEED),
            (UNICODE_FEED, UNICODE_FEED),
            (OPML_FEED, OPML_FEED['xmlUrl']),
            (MONGO_FEED, MONGO_FEED.link),
        )

        # NOTE(review): MONGO_FEED is a module-level fixture shared with the
        # sync tests; if sync() mutates it (e.g. stores an etag) this
        # expectation may depend on test ordering -- confirm.
        for feed, url in cases:
            fsync = FeedSync(feed)
            fsync.parse()
            mock_feedparser.assert_called_with(url)

    @mock.patch('baleen.feed.feedparser.parse')
    def test_feedparser_etag_modified(self, mock_feedparser):
        """
        Test etag and modified blocking on feedparser for Feed objects
        """
        # This test used to share the name test_feedparser_wrapping with the
        # test above, which silently replaced it in the class namespace so the
        # first test never ran. It has its own name now.

        # Ensure that the mocking worked out for us
        assert mock_feedparser is feedparser.parse

        feed = Feed(link = u'https://mubi.com/notebook/posts.atom')
        feed.etag = 'abcdefg'

        # Test Case 1: etag but no modified
        FeedSync(feed).parse()
        mock_feedparser.assert_called_with(feed.link, etag=feed.etag)

        # Test Case 2: modified but no etag
        feed.etag = None
        feed.modified = "Fri, 11 Jun 2012 23:00:34 GMT"
        FeedSync(feed).parse()
        mock_feedparser.assert_called_with(feed.link, modified=feed.modified)

        # Test Case 3: modified and etag (only the etag is expected to be sent)
        feed.etag = 'hijklmnop'
        FeedSync(feed).parse()
        mock_feedparser.assert_called_with(feed.link, etag=feed.etag)

    @mock.patch('baleen.feed.feedparser.parse')
    def test_feed_sync(self, mock_feedparser):
        """
        Test that sync updates the Feed object
        """
        # Ensure that the mocking worked out for us
        assert mock_feedparser is feedparser.parse

        # Give the mock feedparser a result! (trusted, repository-local fixture)
        with open(RESULT, 'rb') as f:
            mock_feedparser.return_value = pickle.load(f)

        fsync = FeedSync(MONGO_FEED)
        fsync.sync()

        # Fetch the feed from the database.
        self.assertEqual(Feed.objects.count(), 1)
        feed = Feed.objects.first()

        # Ensure that the various properties have been set.
        self.assertEqual(feed.etag, u'W/"29e84abdc28e3fa87709d1f309b7c214-gzip"')
        self.assertEqual(feed.modified, u'Wed, 02 Mar 2016 22:00:06 GMT')
        self.assertEqual(feed.version, u'rss20')
        self.assertEqual(feed.link, MONGO_FEED.link)
        self.assertIsNotNone(feed.fetched)

    @mock.patch('baleen.feed.feedparser.parse')
    def test_feed_sync_mongodb(self, mock_feedparser):
        """
        Test the sync MongoDB interaction
        """
        # Ensure that the mocking worked out for us
        assert mock_feedparser is feedparser.parse

        # Give the mock feedparser a result! (trusted, repository-local fixture)
        with open(RESULT, 'rb') as f:
            mock_feedparser.return_value = pickle.load(f)

        fsync = FeedSync(MONGO_FEED)

        # Test sync without save
        fsync.sync(save=False)
        self.assertEqual(Feed.objects.count(), 0)

        # Test sync with save
        fsync.sync()
        self.assertEqual(Feed.objects.count(), 1)

    @mock.patch('baleen.feed.feedparser.parse')
    def test_feed_sync_non_model(self, mock_feedparser):
        """
        Test the sync with a non-model feed.
        """
        # Ensure that the mocking worked out for us
        assert mock_feedparser is feedparser.parse

        # Give the mock feedparser a result! (trusted, repository-local fixture)
        with open(RESULT, 'rb') as f:
            mock_feedparser.return_value = pickle.load(f)

        fsync = FeedSync(OPML_FEED)

        # Sync with the default save: a dict feed must not be persisted
        fsync.sync()
        self.assertEqual(Feed.objects.count(), 0)
class IngestorTests(MongoTestMixin, unittest.TestCase):
    """
    Tests for the stype helper and for the Ingestor entry point hooks,
    using an ingestor whose action methods are replaced by mocks.
    """

    def test_stype_helper(self):
        """
        Test the stype helper function
        """
        self.assertEqual(stype(BaleenError("Bad things!")), BaleenError.__name__)

    def test_stype_embed_helper(self):
        """
        Test stype on reraises decorators.
        """

        @reraise(BaleenError)
        def badfunc():
            raise TypeError("This is clearly the wrong type!")

        # assertRaises makes the test fail if nothing is raised; a bare
        # try/except would pass silently in that case.
        with self.assertRaises(BaleenError) as cm:
            badfunc()

        self.assertEqual(stype(cm.exception), "BaleenError (TypeError)")

    def test_ingestor_hooks(self):
        """
        Test the started and finished ingestor hooks
        """

        # Create Ingestor and call the entry point method
        ingestor = get_ingest_mock()
        ingestor.ingest()

        # Assert that started and finished were called, and failed wasn't.
        ingestor.started.assert_called_once_with()
        ingestor.finished.assert_called_once_with()
        ingestor.failed.assert_not_called()

    def test_ingestor_failed_hook(self):
        """
        Test the started and failed ingestor hooks
        """

        ingestor = get_ingest_mock()
        ingestor.process.side_effect = Exception("Things went wrong!")

        # Call the entry point method
        with self.assertRaises(Exception) as cm:
            ingestor.ingest()

        # Assert that started and failed were called, and finished wasn't;
        # failed must receive the very exception that escaped ingest().
        ingestor.started.assert_called_once_with()
        ingestor.finished.assert_not_called()
        ingestor.failed.assert_called_once_with(cm.exception)

    def test_ingestor_state(self):
        """
        Ensure that the ingestor state is correctly modified
        """
        ingestor = get_ingest_mock()

        # Nothing is assigned until ingest() runs
        self.assertIsNone(ingestor.jobid)
        self.assertIsNone(ingestor.timer)

        ingestor.ingest()

        self.assertIsNotNone(ingestor.jobid)
        self.assertIsNotNone(ingestor.timer)
class MongoTestMixin(object):
    """
    Mixin for TestCase classes that need a clean mongomock-backed database
    and a tolerant datetime comparison.
    """

    def setUp(self):
        """
        Create the mongomock connection
        """
        self.conn = connect(host='mongomock://localhost')
        assert isinstance(self.conn, MockMongoClient)

        # Clear out the database
        for feed in Feed.objects(): feed.delete()
        for post in Post.objects(): post.delete()

    def tearDown(self):
        """
        Drop the mongomock connection
        """
        assert isinstance(self.conn, MockMongoClient)
        self.conn = None

    def assertDateTimeEqual(self, dta, dtb):
        """
        Assert that two datetimes are within 1 second of each other

        Microseconds are discarded before comparing. The check is made on the
        absolute difference between the two values, so timestamps that
        straddle a minute (or hour, day, ...) boundary compare correctly.
        Fails with AssertionError when the values are more than one second
        apart.
        """
        dta = dta.replace(microsecond=0)
        dtb = dtb.replace(microsecond=0)

        # Previously dtb was derived from dta, so this assertion could never
        # fail; compare the real difference between the two inputs.
        drift = abs((dta - dtb).total_seconds())
        self.assertLessEqual(
            drift, 1, "datetimes are not one second apart!"
        )
class PostModelTests(MongoTestMixin, unittest.TestCase):
    """
    Basic behaviour of the Post model: validation, timestamps, content
    signature and string representation.
    """

    def test_url_requred(self):
        """
        Saving a post that has no url fails validation
        """
        urlless = Post(title="My Awesome Post", content="socks")
        with self.assertRaises(me.ValidationError):
            urlless.save()

    def test_created_updated(self):
        """
        The updated timestamp moves when a saved post is saved again
        """
        article = Post(
            title="My Awesome Post",
            content="socks",
            url="http://example.com/socks.html",
        )
        article.save()

        # Both timestamps exist and agree right after the first save
        for stamp in (article.created, article.updated):
            self.assertIsNotNone(stamp)
        self.assertDateTimeEqual(article.created, article.updated)

        # A second save must leave the two timestamps different
        article.title = "My even more awesome Post!"
        article.save()
        self.assertNotEqual(article.created, article.updated)

    def test_content_hashing(self):
        """
        A signature is computed from the content when the post is saved
        """
        expected = '54f6d9fbe8ee576f82d6eb7e4d1d55691a1f0b7bd956246d3de56ee84bd1d333'

        article = Post(content="socks", url="http://example.com/socks.html")
        self.assertIsNone(article.signature)

        article.save()
        self.assertIsNotNone(article.signature)
        self.assertEqual(article.signature, expected)

    def test_stringify(self):
        """
        str(post) is the url until a title is set, then the title
        """
        article = Post(
            content="socks", signature="abc", url="http://example.com/socks.html"
        )
        article.save()
        self.assertEqual(str(article), article.url)

        article.title = "My Awesome Post"
        article.save()
        self.assertEqual(str(article), article.title)
3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Fri Feb 19 08:50:19 2016 -0500 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: test_opml.py [a0d0da3] benjamin@bengfort.com $ 11 | 12 | """ 13 | Testing for the OPML reader and ingestion function. 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | 20 | import os 21 | import unittest 22 | 23 | from .test_models import MongoTestMixin 24 | 25 | try: 26 | from unittest import mock 27 | except ImportError: 28 | import mock 29 | 30 | from baleen.opml import OPML, load_opml 31 | from baleen.models import Feed 32 | 33 | ########################################################################## 34 | ## Fixtures 35 | ########################################################################## 36 | 37 | FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") 38 | FEEDLY = os.path.join(FIXTURES, "feedly.opml") 39 | 40 | ########################################################################## 41 | ## Test Load OPML command 42 | ########################################################################## 43 | 44 | class LoadOPMLTests(MongoTestMixin, unittest.TestCase): 45 | 46 | def test_load_opml_integrated(self): 47 | """ 48 | Test the integration of the ingest helper function 49 | """ 50 | self.assertEqual(Feed.objects.count(), 0) 51 | self.assertEqual(load_opml(FEEDLY), 36) 52 | self.assertEqual(Feed.objects.count(), 36) 53 | 54 | for feed in Feed.objects(): 55 | self.assertIn('xmlUrl', feed.urls) 56 | self.assertIn('htmlUrl', feed.urls) 57 | 58 | def test_load_opml_no_duplicates(self): 59 | """ 60 | Assert multiple calls to the load_opml creates no duplicates 61 | """ 62 | self.assertEqual(Feed.objects.count(), 0) 63 | self.assertEqual(load_opml(FEEDLY), 36) 64 | self.assertEqual(Feed.objects.count(), 36) 65 | 66 | 
class OPMLTests(unittest.TestCase):
    """
    Tests for the OPML reader against the feedly.opml fixture.
    """

    def test_fixture(self):
        """
        Assert the required opml fixture is available
        """
        self.assertTrue(os.path.exists(FEEDLY))
        self.assertTrue(os.path.isfile(FEEDLY))

    def test_categories(self):
        """
        Test the OPML categories listing
        """
        opml = OPML(FEEDLY)
        expected = [
            u'news',
            u'do it yourself',
            u'business',
            u'gaming',
            u'data science',
            u'essays',
            u'politics',
            u'tech',
            u'cinema',
            u'books',
            u'sports',
            u'cooking',
            u'design'
        ]

        # The categories must come back in exactly this order
        # (a leftover debugging print of the categories was removed here).
        self.assertEqual(list(opml.categories()), expected)

    def test_length(self):
        """
        Test the OPML len built in
        """
        opml = OPML(FEEDLY)
        self.assertEqual(len(opml), 36)

    def test_counts(self):
        """
        Test the OPML category counter and item iterator
        """
        opml = OPML(FEEDLY)
        expected = {
            'cooking': 4,
            'cinema': 3,
            'gaming': 3,
            'tech': 3,
            'essays': 2,
            'business': 3,
            'design': 2,
            'sports': 3,
            'books': 3,
            'data science': 4,
            'do it yourself': 2,
            'news': 2,
            'politics': 2,
        }
        counts = opml.counts()

        for key, val in expected.items():
            self.assertIn(key, counts)
            self.assertEqual(
                counts[key], val,
                "{} mismatch: {} vs {}".format(key, counts[key], val)
            )

    def test_item_iterator_detail(self):
        """
        Test the XML result returned from OPML iteration
        """

        opml = OPML(FEEDLY)
        attrs = ['category', 'title', 'text', 'htmlUrl', 'xmlUrl', 'type']
        for item in opml:
            self.assertIsInstance(item, dict)
            # list() keeps the comparison valid when keys() returns a view
            # rather than a list.
            self.assertEqual(list(item.keys()), attrs)
class PostWranglerTests(MongoTestMixin, unittest.TestCase):
    """
    Tests for PostWrangler: turning feedparser entries into Post documents
    and fetching their full content over (mocked) HTTP.
    """

    def setUp(self):
        super(PostWranglerTests, self).setUp()

        # Persist the shared feed fixture in the freshly cleared database
        self.feed = FEED
        self.feed.save()

        # Entries come from a pickled feedparser result in the fixtures
        with open(RESULT, 'rb') as fixture:
            parsed = pickle.load(fixture)
        self.entries = parsed.entries

    def test_wrangle_factory(self):
        """
        The factory yields a PostWrangler for every entry
        """
        wranglers = PostWrangler.factory(self.entries, feed=self.feed)
        for wrangler in wranglers:
            self.assertIsInstance(wrangler, PostWrangler)

    def test_wrangle_integration(self):
        """
        Wrangle every entry in the result and check for duplicates
        """
        self.assertEqual(Post.objects.count(), 0)

        for wrangler in PostWrangler.factory(self.entries, feed=self.feed):
            wrangler.wrangle()
            # A second call on the same wrangler must be a no-op
            wrangler.wrangle()

        self.assertEqual(Post.objects.count(), 10)

        # Fresh wranglers over the same entries must refuse to duplicate
        for wrangler in PostWrangler.factory(self.entries, feed=self.feed):
            with self.assertRaises(WranglingError):
                wrangler.wrangle()

        self.assertEqual(Post.objects.count(), 10)

    def test_is_wrangled(self):
        """
        is_wrangled flips from False to True once wrangle is called
        """
        wrangler = PostWrangler(self.entries[0])
        self.assertFalse(wrangler.is_wrangled())

        wrangler.wrangle()
        self.assertTrue(wrangler.is_wrangled())

    def test_save_not_save(self):
        """
        Wrangling only writes to the database when asked to save
        """
        self.assertEqual(Post.objects.count(), 0)
        wrangler = PostWrangler(self.entries[0])

        # Wrangle without saving: nothing is stored
        wrangler.wrangle(False)
        self.assertEqual(Post.objects.count(), 0)

        # Already wrangled, so a second call changes nothing
        wrangler.wrangle()
        self.assertEqual(Post.objects.count(), 0)

        # Saving the returned post explicitly does store it
        wrangler.wrangle().save()
        self.assertEqual(Post.objects.count(), 1)

        # A different entry saves by default
        other = PostWrangler(self.entries[1])
        other.wrangle()
        self.assertEqual(Post.objects.count(), 2)

    def test_feed_or_not(self):
        """
        Posts can be wrangled with or without an associated feed
        """
        attached = PostWrangler(self.entries[0], feed=self.feed)
        detached = PostWrangler(self.entries[1])

        with_feed = attached.wrangle()
        self.assertEqual(with_feed.feed, self.feed)

        without_feed = detached.wrangle()
        self.assertIsNone(without_feed.feed)

    @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
    def test_fetch_not_wrangled(self, mock_requests):
        """
        Fetching before wrangling raises a FetchError
        """
        assert mock_requests is requests.get

        wrangler = PostWrangler(self.entries[0], feed=self.feed)
        with self.assertRaises(FetchError):
            wrangler.fetch()

    @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
    def test_fetch_overwrites_content(self, mock_requests):
        """
        A fetch replaces the post content with the downloaded text
        """
        assert mock_requests is requests.get

        wrangler = PostWrangler(self.entries[0], feed=self.feed)
        wrangler.wrangle()
        self.assertEqual(Post.objects.count(), 1)

        # Point the post at the url the mocked requests.get answers with 200
        wrangler.post.url = 'http://example.com/vader/'
        fetched = wrangler.fetch()

        self.assertEqual(Post.objects.count(), 1)
        self.assertNotEqual(fetched.created, fetched.updated)
        self.assertEqual(fetched.content, "Luke, I am your father!")

    @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
    def test_fetch_no_save(self, mock_requests):
        """
        A fetch with save=False leaves the stored post untouched
        """
        assert mock_requests is requests.get

        wrangler = PostWrangler(self.entries[0], feed=self.feed)
        wrangler.wrangle()
        self.assertEqual(Post.objects.count(), 1)

        wrangler.post.url = 'http://example.com/vader/'
        wrangler.fetch(save=False)
        self.assertEqual(Post.objects.count(), 1)

        # The database copy keeps its timestamps and original content
        stored = Post.objects.first()
        self.assertDateTimeEqual(stored.created, stored.updated)
        self.assertNotEqual(stored.content, "Luke, I am your father!")

    @mock.patch('baleen.wrangle.requests.get', side_effect=mocked_requests_get)
    def test_fetch_raises_404(self, mock_requests):
        """
        An HTTP error during fetch surfaces as a FetchError
        """
        assert mock_requests is requests.get

        wrangler = PostWrangler(self.entries[0], feed=self.feed)
        wrangler.wrangle()
        self.assertEqual(Post.objects.count(), 1)

        with self.assertRaises(FetchError):
            # Any url other than /vader/ gets a 404 from the mock
            wrangler.post.url = 'http://example.com/obiwan/'
            wrangler.fetch()
-------------------------------------------------------------------------------- /tests/utils_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests.utils_tests 2 | # Tests for the Baleen utilities package. 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Sun Feb 21 15:31:55 2016 -0500 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: __init__.py [2988c53] benjamin@bengfort.com $ 11 | 12 | """ 13 | Tests for the Baleen utilities package. 14 | """ 15 | 16 | ########################################################################## 17 | ## Imports 18 | ########################################################################## 19 | -------------------------------------------------------------------------------- /tests/utils_tests/test_decorators.py: -------------------------------------------------------------------------------- 1 | # test.test_utils.test_decorators 2 | # Testing the decorators utility package. 3 | # 4 | # Author: Benjamin Bengfort 5 | # Created: Wed Mar 02 19:06:34 2016 -0500 6 | # 7 | # Copyright (C) 2016 Bengfort.com 8 | # For license information, see LICENSE.txt 9 | # 10 | # ID: test_decorators.py [538b33d] benjamin@bengfort.com $ 11 | 12 | """ 13 | Testing the decorators utility package. 
class DecoratorsTests(unittest.TestCase):
    """
    Tests for the memoized, timeit and reraise decorators.
    """

    def _assert_wrapped(self, error, message, cause_type, cause_message):
        """
        Check a reraised error: its message, and the type and message of
        the exception kept on its ``original`` attribute.
        """
        self.assertEqual(str(error), message)
        self.assertTrue(hasattr(error, "original"))
        self.assertIsInstance(error.original, cause_type)
        self.assertEqual(str(error.original), cause_message)

    def test_memoized(self):
        """
        memoized stores the computed value on a private attribute
        """

        class Thing(object):

            @memoized
            def attr(self):
                return 42

        subject = Thing()

        # The cache attribute only appears after the first access
        self.assertFalse(hasattr(subject, '_attr'))
        self.assertEqual(subject.attr, 42)
        self.assertTrue(hasattr(subject, '_attr'))

    def test_timeit(self):
        """
        timeit returns the result together with a Timer
        """

        @timeit
        def myfunc():
            return 42

        outcome = myfunc()
        self.assertEqual(len(outcome), 2)

        value, stopwatch = outcome
        self.assertEqual(value, 42)
        self.assertTrue(isinstance(stopwatch, Timer))

    def test_reraise(self):
        """
        reraise with no arguments converts to a BaleenError
        """

        @reraise()
        def alpha():
            raise Exception("Should be a BaleenError")

        with self.assertRaises(BaleenError) as caught:
            alpha()

        # The original message is carried over unchanged
        self._assert_wrapped(
            caught.exception,
            "Should be a BaleenError",
            Exception,
            "Should be a BaleenError",
        )

    def test_reraise_message(self):
        """
        reraise with a message replaces the message of the new error
        """

        @reraise(message="I'm handling it!")
        def bravo():
            raise NotImplementedError("I'm not handling it!")

        with self.assertRaises(BaleenError) as caught:
            bravo()

        self._assert_wrapped(
            caught.exception,
            "I'm handling it!",
            NotImplementedError,
            "I'm not handling it!",
        )

    def test_reraise_arguments(self):
        """
        reraise honours klass, message and trap together
        """

        @reraise(klass=FeedTypeError, message="bad feed type", trap=TypeError)
        def charlie():
            raise TypeError("requires an integer")

        with self.assertRaises(FeedTypeError) as caught:
            charlie()

        self._assert_wrapped(
            caught.exception,
            "bad feed type",
            TypeError,
            "requires an integer",
        )

    def test_reraise_trap(self):
        """
        Exceptions outside the trap pass through reraise untouched
        """

        @reraise(klass=FeedTypeError, message="bad feed type", trap=TypeError)
        def delta():
            raise ValueError("this should be the exception raised")

        with self.assertRaises(ValueError) as caught:
            delta()

        # Not trapped, so not wrapped: no ``original`` attribute is attached
        untouched = caught.exception
        self.assertEqual(str(untouched), "this should be the exception raised")
        self.assertFalse(hasattr(untouched, "original"))
#
# Author: Benjamin Bengfort
# Created: Thu Mar 03 11:52:06 2016 -0500
#
# Copyright (C) 2016 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: test_logger.py [df0c71b] benjamin@bengfort.com $

"""
Simple tests for the logger module.
"""

##########################################################################
## Imports
##########################################################################

import getpass
import logging
import unittest

try:
    from unittest import mock
except ImportError:
    import mock

from baleen.utils.logger import *


##########################################################################
## Module Helpers
##########################################################################

PREFIX = "TEST LOG"
IGNORE = "IGNORE: This should not be in a log file or database!"

def tmsgf(message, prefix=PREFIX, ignore=IGNORE):
    """
    Format a test log message as "<prefix>: <message> (<ignore>)" so that
    anything emitted by the tests is clearly marked as test output.
    """
    return "{}: {} ({})".format(prefix, message, ignore)

##########################################################################
## Logger Test
##########################################################################

class IngestLoggerTests(unittest.TestCase):
    """
    Simply exercises the methods of the logger.

    Every test patches ``IngestLogger.logger`` with a mock so no real log
    record is emitted; the assertions inspect the call made on that mock.
    """

    @mock.patch('baleen.utils.logger.IngestLogger.logger')
    def test_log_extra(self, mock_logger):
        """
        Assert that extra (user) is passed to logger
        """

        logger = IngestLogger()

        # unittest assertion instead of a bare assert, which -O strips.
        self.assertIs(logger.logger, mock_logger)

        message = tmsgf("Do not double space after a period!")
        logger.log(logging.DEBUG, message)

        mock_logger.log.assert_called_with(logging.DEBUG, message, extra={'user': getpass.getuser()})

    @mock.patch('baleen.utils.logger.IngestLogger.logger')
    def test_log_debug(self, mock_logger):
        """
        Test the debug logger
        """

        logger = IngestLogger()

        self.assertIs(logger.logger, mock_logger)

        message = tmsgf("All CAPS is not shouting!")
        logger.debug(message)

        mock_logger.log.assert_called_with(logging.DEBUG, message, extra=mock.ANY)

    @mock.patch('baleen.utils.logger.IngestLogger.logger')
    def test_log_info(self, mock_logger):
        """
        Test the info logger
        """

        logger = IngestLogger()

        self.assertIs(logger.logger, mock_logger)

        message = tmsgf("Birds and Bees Flock with Seas!")
        logger.info(message)

        mock_logger.log.assert_called_with(logging.INFO, message, extra=mock.ANY)

    @mock.patch('baleen.utils.logger.IngestLogger.logger')
    def test_log_warn(self, mock_logger):
        """
        Test the warn logger
        """

        logger = IngestLogger()

        self.assertIs(logger.logger, mock_logger)

        message = tmsgf("You shouldn't touch that hot stove!")
        logger.warn(message)

        mock_logger.log.assert_called_with(logging.WARNING, message, extra=mock.ANY)

    @mock.patch('baleen.utils.logger.IngestLogger.logger')
    def test_log_error(self, mock_logger):
        """
        Test the error logger
        """

        logger = IngestLogger()

        self.assertIs(logger.logger, mock_logger)

        message = tmsgf("Someone let the rooster into the hen house!")
        logger.error(message)

        mock_logger.log.assert_called_with(logging.ERROR, message, extra=mock.ANY)

    @mock.patch('baleen.utils.logger.IngestLogger.logger')
    def test_log_critical(self, mock_logger):
        """
        Test the critical logger
        """

        logger = IngestLogger()

        self.assertIs(logger.logger, mock_logger)

        message = tmsgf("Someone let the fox into the hen house!")
        logger.critical(message)

        mock_logger.log.assert_called_with(logging.CRITICAL, message, extra=mock.ANY)


# --------------------------------------------------------------------------------
# /tests/utils_tests/test_mongolog.py:
# --------------------------------------------------------------------------------
# tests.utils_tests.test_mongolog
# Simple tests for logging to MongoDB
#
# Author: Benjamin Bengfort
# Created: Thu Mar 03 11:53:46 2016 -0500
#
# Copyright (C) 2016 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: test_mongolog.py [df0c71b] benjamin@bengfort.com $

"""
Simple tests for logging to MongoDB
"""

##########################################################################
## Imports
##########################################################################

import logging
import unittest

from mongomock import MongoClient as MockMongoClient

try:
    from unittest import mock
except ImportError:
    import mock

from baleen.utils import mongolog as ml
from .test_logger import tmsgf

##########################################################################
## Mongo Log Handler Tests
##########################################################################

class MongoLogHandlerTests(unittest.TestCase):
    """
    Exercises logging through the Mongo handler with the MongoClient
    replaced by mongomock, so no real database is contacted.
    """

    @mock.patch('baleen.utils.mongolog.MongoClient', MockMongoClient)
    def test_logging_to_mongo(self):
        """
        Test the mongo log handler and logging to mongo
        """
        # unittest assertion instead of a bare assert, which -O strips.
        self.assertIs(ml.MongoClient, MockMongoClient)

        handler = ml.MongoHandler(level=logging.DEBUG)
        self.assertIsInstance(handler.connection, MockMongoClient)

        # Ensure there is nothing in the database.
        # NOTE(review): Collection.count() is deprecated in newer pymongo
        # releases; confirm the mongomock version in use before switching
        # to count_documents({}).
        self.assertEqual(handler.collection.count(), 0)

        # Create the logging instance.
        logger = logging.getLogger('test.mongo.logger.demo')
        logger.setLevel(logging.INFO)
        logger.addHandler(handler)

        # Named loggers are process-global: detach the handler afterwards so
        # it does not leak into other tests, even if an assertion fails.
        self.addCleanup(logger.removeHandler, handler)

        # Log a message
        logger.info(tmsgf("This is a test of the mongo logger"))

        # Ensure there is now a log message
        self.assertEqual(handler.collection.count(), 1)


# --------------------------------------------------------------------------------
# /tests/utils_tests/test_timez.py:
# --------------------------------------------------------------------------------
# tests.utils_tests.test_timez
# Testing for the timez time helpers library.
#
# Author: Benjamin Bengfort
# Created: Sun Feb 21 15:33:18 2016 -0500
#
# Copyright (C) 2016 Bengfort.com
# For license information, see LICENSE.txt
#
# ID: test_timez.py [df0c71b] benjamin@bengfort.com $

"""
Testing for the timez time helpers library.
"""
##########################################################################
## Imports
##########################################################################

import time
import unittest

from datetime import datetime
from dateutil.tz import tzutc
from baleen.utils.timez import *

##########################################################################
## Helper Functions Test Cases
##########################################################################

class TimezHelpersTests(unittest.TestCase):
    """
    Test cases for the helper functions and the Timer in the timez module.
    """

    def setUp(self):
        # Microseconds are dropped because none of the formats used by the
        # tests below carry a sub-second field.
        current = datetime.now(tzlocal())
        self.localnow = current.replace(microsecond=0)
        self.utcnow = self.localnow.astimezone(tzutc())

    def tearDown(self):
        self.localnow = None
        self.utcnow = None

    def test_non_naive_datetimes(self):
        """
        Assert that localnow and utcnow return non-naive datetimes
        """
        for factory in (localnow, utcnow):
            self.assertIsNotNone(factory().tzinfo)

    def test_humanizedelta(self):
        """
        Test the humanize delta function to convert seconds
        """
        expectations = [
            (12512334, "144 days 19 hours 38 minutes 54 seconds"),
            (34321, "9 hours 32 minutes 1 second"),
            (3428, "57 minutes 8 seconds"),
            (1, "1 second"),
            (0.21, "0 second"),
        ]

        for amount, text in expectations:
            self.assertEqual(humanizedelta(seconds=amount), text)

    def test_humanizedelta_milliseconds(self):
        """
        Test the humanize delta function to convert milliseconds
        """

        # Milliseconds are added on top of seconds that are already given.
        combined = humanizedelta(seconds=10, milliseconds=2000)
        self.assertEqual(combined, '12 seconds')

        # Milliseconds on their own.
        alone = humanizedelta(milliseconds=456875)
        self.assertEqual(alone, '7 minutes 36 seconds')

    def test_strptimez(self):
        """
        Test the parsing of timezone aware date strings
        """
        aware_fmt = "%Y-%m-%dT%H:%M:%S%z"

        aware_cases = (
            ('2012-12-27T12:53:12-0500', datetime(2012, 12, 27, 17, 53, 12, tzinfo=tzutc())),
            ('2012-12-27T12:53:12+0800', datetime(2012, 12, 27, 4, 53, 12, tzinfo=tzutc())),
        )

        for text, expected in aware_cases:
            self.assertEqual(expected, strptimez(text, aware_fmt))

        # A format without an offset is parsed as-is.
        naive = strptimez('2012-12-27T12:53:12', "%Y-%m-%dT%H:%M:%S")
        self.assertEqual(naive, datetime(2012, 12, 27, 12, 53, 12))

    def test_strptimez_no_z(self):
        """
        Assert that strptimez works with no '%z'
        This should return a timezone naive datetime
        """
        fmt = "%a %b %d %H:%M:%S %Y"
        text = self.localnow.strftime(fmt)
        expected = self.localnow.replace(tzinfo=None)
        self.assertEqual(strptimez(text, fmt), expected)

    def test_strptimez_no_space(self):
        """
        Non-space delimited '%z' works
        """
        fmt = "%Y-%m-%dT%H:%M:%S%z"
        text = self.localnow.strftime(fmt)
        self.assertEqual(strptimez(text, fmt), self.utcnow)

    def test_begin_z(self):
        """
        Test fmt that begins with '%z'
        """
        fmt = "%z %H:%M:%S for %Y-%m-%d"
        text = self.localnow.strftime(fmt)
        self.assertEqual(strptimez(text, fmt), self.utcnow)

    def test_middle_z(self):
        """
        Test fmt that contains '%z'
        """
        fmt = "time is: %H:%M:%S %z on %Y-%m-%d "
        text = self.localnow.strftime(fmt)
        self.assertEqual(strptimez(text, fmt), self.utcnow)

    def test_timer(self):
        """
        Test the Timer context manager
        """
        with Timer() as timer:
            time.sleep(1)

        self.assertGreater(timer.finished, timer.started)
        self.assertEqual(timer.elapsed, timer.finished - timer.started)
        self.assertEqual(str(timer), '1 seconds')


# --------------------------------------------------------------------------------