├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── README.txt
├── application.wsgi.template
├── demo.py
├── doc
│   ├── Makefile
│   ├── build
│   │   ├── doctrees
│   │   │   ├── dssg.doctree
│   │   │   ├── dssg.webapp.doctree
│   │   │   ├── environment.pickle
│   │   │   ├── index.doctree
│   │   │   ├── modules.doctree
│   │   │   └── web_hooks.doctree
│   │   └── html
│   │       ├── .buildinfo
│   │       ├── _sources
│   │       │   ├── dssg.txt
│   │       │   ├── dssg.webapp.txt
│   │       │   ├── index.txt
│   │       │   ├── modules.txt
│   │       │   └── web_hooks.txt
│   │       ├── _static
│   │       │   ├── ajax-loader.gif
│   │       │   ├── basic.css
│   │       │   ├── comment-bright.png
│   │       │   ├── comment-close.png
│   │       │   ├── comment.png
│   │       │   ├── default.css
│   │       │   ├── doctools.js
│   │       │   ├── down-pressed.png
│   │       │   ├── down.png
│   │       │   ├── file.png
│   │       │   ├── jquery.js
│   │       │   ├── minus.png
│   │       │   ├── plus.png
│   │       │   ├── pygments.css
│   │       │   ├── searchtools.js
│   │       │   ├── sidebar.js
│   │       │   ├── underscore.js
│   │       │   ├── up-pressed.png
│   │       │   ├── up.png
│   │       │   └── websupport.js
│   │       ├── dssg.html
│   │       ├── dssg.webapp.html
│   │       ├── genindex.html
│   │       ├── index.html
│   │       ├── modules.html
│   │       ├── objects.inv
│   │       ├── py-modindex.html
│   │       ├── search.html
│   │       ├── searchindex.js
│   │       └── web_hooks.html
│   ├── make.bat
│   └── source
│       ├── conf.py
│       ├── dssg.rst
│       ├── dssg.webapp.rst
│       ├── index.rst
│       └── modules.rst
├── dssg
│   ├── README.md
│   ├── __init__.py
│   ├── classifier.py
│   ├── config
│   │   ├── dssg.ini
│   │   └── dssg.ini.template
│   ├── data
│   │   └── classifier
│   │       ├── election_v000.pkl
│   │       └── election_v001.pkl
│   ├── machine.py
│   ├── model
│   │   ├── __init__.py
│   │   ├── base_model.py
│   │   ├── category.py
│   │   ├── deployment.py
│   │   ├── message.py
│   │   └── report.py
│   ├── platt.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_base_model.py
│   │   ├── test_deployment.py
│   │   ├── test_extract_from_text.py
│   │   ├── test_machine.py
│   │   ├── test_pep8.py
│   │   └── test_rest_api.py
│   ├── util.py
│   ├── vectorizer.py
│   └── webapp
│       ├── README.md
│       ├── __init__.py
│       └── rest_api.py
├── nltk_data
│   ├── chunkers
│   │   └── maxent_ne_chunker
│   │       └── english_ace_multiclass.pickle
│   ├── corpora
│   │   └── words
│   │       ├── README
│   │       ├── en
│   │       └── en-basic
│   ├── taggers
│   │   └── maxent_treebank_pos_tagger
│   │       └── english.pickle
│   └── tokenizers
│       └── punkt
│           ├── README
│           └── english.pickle
├── requirements-dev.txt
├── requirements-travis.txt
├── requirements.txt
├── scripts
│   ├── autopep8.py
│   ├── autopep8.sh
│   ├── make_sphinx_docs.sh
│   ├── nosetests.sh
│   └── upload_to_pypi.sh
├── server.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | *.pyc
3 | *.DS_Store
4 | *.swp
5 | dssg/tmp_*
6 | machine.data*
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | ##-- Travis guide for Python projects:
2 | ##-- http://about.travis-ci.org/docs/user/languages/python/
3 | language: python
4 | 
5 | ##-- which python version(s)
6 | python:
7 |   - "2.7"
8 | 
9 | ##-- set environment variables
10 | # env:
11 | #   - EXAMPLE_VAR=1.2.3
12 | 
13 | ##-- install dependencies
14 | virtualenv:
15 |   system_site_packages: true
16 | 
17 | before_install:
18 |   - sudo apt-get install -qq python-numpy python-scipy
19 | 
20 | install:
21 |   - "pip install -r requirements-travis.txt --use-mirrors"
22 |   - sudo mv nltk_data /usr/share/nltk_data
23 | 
24 | ##-- run tests
25 | script: nosetests dssg/tests
26 | 
27 | ##-- choose git branches
28 | branches:
29 |   only:
30 |     - master
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (C) 2013 [Data Science for Social Good Fellowship at the University of Chicago](http://dssg.io)
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Ushine learning: Smarter crowdsourcing for crisis maps
2 | 
3 | 
4 | **Ushine Learning** is a machine learning API built to support [Ushahidi](http://www.ushahidi.com), a crowdsourced crisis reporting platform.
5 | 
6 | This project is part of the 2013 [Data Science for Social Good](http://www.dssg.io) fellowship, in partnership with [Ushahidi](http://www.ushahidi.com/).
7 | 
8 | *For a quick and gentle overview of the project, check out our [blog post](http://dssg.io/2013/07/15/ushahidi-machine-learning-for-human-rights.html).*
9 | 
10 | ## Background: crisis crowdsourcing
11 | 
12 | In **crisis situations** like contested elections, natural disasters, and humanitarian emergencies, there's an _information gap_ between the _providers_ (the voters, disaster survivors, and victims) and the _responders_ (the election monitors, aid organizations, NGOs, and journalists).
13 | 
14 | **Crowdsourced crisis reporting platforms**, like Ushahidi, aim to narrow this information gap. They provide centralized software to _collect, curate, and publish reports_ coming from the ground during crises.
15 | 
16 | 
17 | 
18 | ## The problem: Human review of reports doesn’t scale
19 | 
20 | Currently, each report is processed prior to publication by a human reviewer. Reviewers need to go through a series of tasks: translating, finding the location, applying category labels, removing personally-identifying information, and more. Not only do they have to extract information, but they also need to verify its accuracy against what's truly happening on the ground.
21 | 
22 | The human review process is slow and tedious, may require domain expertise, and may be inconsistent across reviewers. It is difficult to scale and problematic for high-volume or fast-paced reporting situations.
23 | 
24 | ## The solution: annotation suggestions using natural language processing
25 | 
26 | We use computing to make the review process scale. By using machine learning and natural language processing, we can make initial guesses or automatically extract items that previously had to be determined entirely by humans (such as categories, location, URLs, and sensitive information). With our system, reviewers no longer have to do everything from scratch.
27 | 
28 | This reduces the number of reviewers needed, and lessens the time and tedium they spend processing. Instead, reviewers can focus their energies on _verifying accuracy_ and _responding_ to the reports: the parts that really matter.
29 | 
30 | ## Project layout
31 | 
32 | Recall that we are concerned with labeling reports, and the steps a report goes through are the following:
33 | 
34 | 1. A "citizen" submits a report to Ushahidi.
35 | 2. **NEW**: Ushahidi sends the report to Ushine Learning, which generates suggested labels and returns them to Ushahidi.
36 | 3. Ushahidi shows the incoming report to an "admin", who annotates it. **NEW**: Suggested labels can be shown to the admin to help make their annotation process easier.
37 | 4. The admin applies the final labels and approves the report.
38 | 5. This report, with its labels, is added to a map of all reports. This map is used to help raise situational awareness.
39 | 
40 | In order to achieve this workflow, our project has four major pieces. The **Machine Learning Module**, **Flask Webapp**, and **Ushahidi Plugin** make up the system's architecture. The **User Experiment** is an important part of our methodology: experimental validation of our results by testing with real users.
41 | 
42 | At the base is a Python **(1) Machine Learning Module**, which learns from a corpus of labeled reports and provides automated label suggestions for novel reports. This component needs a way to communicate with Ushahidi, a web platform, so we've created a **(2) Flask Webapp** which wraps the Machine Learning Module and can communicate with an Ushahidi server. At a high level, the Flask Webapp _receives reports_ from and _sends suggestions_ to Ushahidi, using a RESTful API and JSON objects (a rough sketch of this exchange follows the component list below). In practice, we don't talk directly to a vanilla Ushahidi; instead, we talk to an **(3) Ushahidi Plugin** deployed on a Crowdmap instance. This plugin is written in PHP and connected with the Ushahidi Crowdmap. It provides the glue to send and receive on the Ushahidi side. (Note: this plugin requires some core changes to the Ushahidi platform in order to show its results. We hope these changes will be incorporated into Ushahidi 2.x and 3.0.)
43 | 
44 | The **(4) User Experiment** was designed to test our impact on real users. Without real users, we could only evaluate the accuracy of our algorithms on test data; what concerned us most, however, was proving that we improved from "before" (no suggestions) to "after" (with machine suggestions) on measures like speed, accuracy, and frustration. You can read in detail about this work and our experimental results in the Wiki.
45 | 
46 | Technical details of each of these components are linked below.
47 | 
48 | 1. [Machine Learning Module](https://github.com/dssg/ushine-learning/tree/master/dssg)
49 | 2. [Flask Webapp](https://github.com/dssg/ushine-learning/tree/master/dssg/webapp)
50 | 3. [Ushahidi Plugin](https://github.com/ekala/Ushahidi_Web/tree/dssg-integration)
51 | 4. [User Experiment](https://github.com/nathanleiby/ushine-learning-experiment)
52 | 
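To make that exchange concrete, here is a rough sketch of what such an endpoint could look like. It is illustrative only: the real routes live in `dssg/webapp/rest_api.py`, and the route name and JSON fields below are invented for this example rather than taken from the actual API.

```
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/suggest', methods=['POST'])
def suggest():
    # Ushahidi POSTs a report as JSON, e.g.
    # {"description": "Polling station closed early in ..."}
    report = request.get_json()

    # In the real webapp, the pickled classifier turns report['description']
    # into suggestions; this stub only shows the shape of a plausible response.
    return jsonify({
        'categories': [],    # suggested category labels
        'language': 'en',    # detected language of the text
        'entities': [],      # people/places/organizations found in the text
        'private_info': [],  # personally-identifying strings to consider removing
    })
```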
53 | ## Installation Guide
54 | 
55 | ### Basics
56 | 
57 | Clone the repo.
58 | 
59 | ```
60 | git clone https://github.com/dssg/ushine-learning
61 | cd ushine-learning/
62 | ```
63 | 
64 | Install Python requirements.
65 | 
66 | ```
67 | pip install -r requirements.txt
68 | ```
69 | 
70 | Install NLTK dependencies.
71 | 
72 | ```
73 | mv nltk_data /usr/share/nltk_data # on unix
74 | ```
75 | 
76 | 
79 | 
80 | 
85 | 
86 | ### Webapp Deployment
87 | 
88 | *Setup configuration*
89 | 
90 | Create a config file.
91 | 
92 | ```
93 | cp dssg/config/dssg.ini.template dssg/config/dssg.ini
94 | ```
95 | 
96 | Edit the `dssg/config/dssg.ini` config file with
97 | - database settings
98 | - path to the classifier, which is stored as a pickled Python object (`/path/to/classifier.pkl`), e.g. in the `dssg/data/classifier` directory (see the sketch after this list).
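By way of illustration, a filled-in `dssg.ini` might look roughly like this; the section and key names below are guesses made up for the example, so check `dssg/config/dssg.ini.template` for the authoritative ones.

```
; hypothetical example only; see dssg.ini.template for the real keys
[database]
url = postgresql://dssg:secret@localhost/ushine

[classifier]
path = dssg/data/classifier/election_v001.pkl
```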
99 | 
100 | *How To Run The Flask Web App*
101 | 
102 | Then run the webapp. You can run it directly via
103 | 
104 | ```
105 | python server.py
106 | ```
107 | 
108 | To deploy the webapp in production, we suggest using [Gunicorn](http://gunicorn.org/) & [nginx](http://nginx.org/).
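For example, assuming `server.py` exposes the Flask application object as `app` (an assumption; adjust the `module:variable` pair to match), a minimal Gunicorn invocation behind an nginx proxy could look like:

```
gunicorn --workers 4 --bind 127.0.0.1:8000 server:app
```

The repo root also contains an `application.wsgi.template` for WSGI-style deployments (e.g. Apache with mod_wsgi).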
109 | 
110 | ## Documentation
111 | 
112 | The latest documentation is available on [ReadTheDocs](https://ushine-learning.readthedocs.org/en/latest/).
113 | 
114 | To update the documentation, you may do the following:
115 | 
116 | 1. Auto-generate the latest API docs. Run `sphinx-apidoc -o doc/source dssg`, passing the `-f` flag to overwrite existing apidocs.
117 | 2. Optional: Update the doc/source files directly.
118 | 3. Make the updated HTML files. Run `make html` from the `doc/` directory, where the Makefile resides.
119 | 
120 | ## FAQ
121 | 
122 | **Why Ushine Learning?** Ushahidi. Machine Learning. Pronounced "oo-sheen".
123 | 
124 | 
125 | ## Team
126 | ![Team](https://raw.github.com/dssg/dssg.github.io/761993c24ea2991170ef64048115cb805f5f13fb/img/people/teams/ushahidi.png)
127 | 
128 | 
129 | ## Contributing to the project
130 | 
131 | To get involved, please check the [issue tracker](https://github.com/dssg/ushine-learning/issues). Issues include everything from bugs to new project ideas that we'd like to see happen!
132 | 
133 | To get in touch, email the team at dssg-ushahidi@googlegroups.com or file a GitHub issue.
134 | 
135 | ## License
136 | 
137 | The MIT License (MIT)
138 | 
139 | Copyright (C) 2013 [Data Science for Social Good Fellowship at the University of Chicago](http://dssg.io)
140 | 
141 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
142 | 
143 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
144 | 
145 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
146 | 
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | Hello world readme
2 | 
3 | This file is listed in MANIFEST.in, so it is bundled into the package that gets pushed to PyPI (the Python Package Index)
--------------------------------------------------------------------------------
/application.wsgi.template:
--------------------------------------------------------------------------------
1 | import sys
2 | from os.path import dirname, realpath
3 | 
4 | # Append the application path to the system path
5 | app_path = dirname(realpath(__file__))
6 | sys.path.append(app_path)
7 | 
8 | from dssg import load_config
9 | from dssg.webapp import app as application
10 | 
11 | config_file = app_path + '/dssg/config/dssg.ini'
12 | load_config(application, config_file)
13 | 
14 | # Import the API endpoints
15 | from dssg.webapp.rest_api import *
16 | 
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Note: the machine is pickled with cPickle using a binary protocol (see save_machine).
5 | 
6 | import dssg.Machine
7 | import cPickle as pickle
8 | import json
9 | from dssg.junutils import *
10 | import random
11 | import time
12 | import operator
13 | import argparse
14 | from pprint import pprint
15 | 
16 | # Show logging for training of classifier
17 | import sys
18 | import logging
19 | from functools import reduce
20 | FORMAT = '%(levelname)s: %(message)s'
21 | logging.basicConfig(format=FORMAT, stream=sys.stderr, level=logging.INFO)
22 | 
23 | MACHINE_FILENAME = 'machine.data'
24 | mac = None
25 | # labeledMessageList = getLabeledMessagesFromJson(uchaguziJsonPath)[:-1]
26 | labeledMessageList = getFullMessagesFromUchaguziMergedCategory(
27 |     uchaguziJsonPath, uchaguziCategoryJsonPath)
28 | 
29 | # Train on a subset (say 75%) of messages
30 | random.seed(0)
31 | # Just for testing purposes
32 | random.shuffle(labeledMessageList)
33 | nTrain = int(round(float(len(labeledMessageList)) * .75))
34 | # nTrain = 300
35 | trainingSet = labeledMessageList[0:nTrain]
36 | validationSet = labeledMessageList[nTrain:]
37 | 
38 | 
39 | def main():
40 |     parser = argparse.ArgumentParser(
41 |         description='Demo of DSSG-Ushahidi machine learning')
42 | 
43 |     parser.add_argument('-t', '--train', action='store_true', default=False,
44 |                         dest='boolean_train',
45 |                         help='Train the machine with a dataset')
46 |     # TODO: Allow passing "which" dataset as a parameter...
47 |     # name the outputted machine file appropriately
48 |     parser.add_argument('-g', '--guess', action='store_true', default=False,
49 |                         dest='boolean_guess',
50 |                         help='Select a random item from the validation set, then show result of guessing with the trained model')
51 |     parser.add_argument(
52 |         '-m', '--message', help='Custom message to guess on', required=False, default=None)
53 | 
54 |     args = vars(parser.parse_args())
55 | 
56 |     if args['boolean_train']:
57 |         print train()
58 | 
59 |     if args['boolean_guess']:
60 |         pprint(guess())
61 | 
62 |     if args['message']:
63 |         print args['message']
64 |         pprint(guess(args['message']))
65 | 
66 |     # check if any arguments have a value which is not None or False
67 |     arguments_exist = reduce(lambda x, y: x or y, args.values())
68 |     if not arguments_exist:
69 |         print "use -h to view help"
70 | 
71 | 
72 | def train():
73 |     new_machine()
74 |     mac = load_machine()
75 |     if not mac:
76 |         return "Error loading machine"
77 | 
78 |     start = time.time()
79 | 
80 |     print len(trainingSet)
81 | 
82 |     mac.train(trainingSet)
83 |     duration = time.time() - start
84 |     print "training time (seconds) = %s" % (duration, )
85 | 
86 |     save_machine(mac)
87 | 
88 | 
89 | # TODO: allow passing any message_id, to check its output
90 | def guess(text=None):
91 |     mac = load_machine()
92 | 
93 |     if not mac:
94 |         return "Error loading machine"
95 | 
96 |     if not text:
97 |         random.seed()
98 |         # - use system time to set seed
99 |         m = random.choice(validationSet)
100 |         # pprint(m)
101 |         actual_message = m['description']
102 |         actual_labels = m['categories']
103 |     else:
104 |         m = {'description': text}  # wrap the custom text so mac.guess() below can consume it (assumes only 'description' is needed)
105 |         actual_message, actual_labels = text, None
106 | 
107 |     print
108 |     print "Message text..."
109 |     print actual_message
110 | 
111 |     print
112 |     print "Guessing categories and similar messages..."
113 |     g = mac.guess([m])
114 |     pprint(g)
115 | 
116 |     print
117 |     print "Guessing Language..."
118 |     language_guess = mac.guess_language(actual_message)
119 |     pprint(language_guess)
120 | 
121 |     print
122 |     print "Guessing entities..."
123 |     ent_text = mac.guess_entities(actual_message)
124 |     pprint(ent_text)
125 | 
126 | 
127 | #
128 | # Helpers
129 | #
130 | 
131 | def new_machine():
132 |     print "creating new machine"
133 |     mac = dssg.Machine.Machine()
134 |     mac = save_machine(mac)
135 | 
136 | 
137 | def load_machine():
138 |     global mac
139 | 
140 |     print "loading machine from", MACHINE_FILENAME
141 |     start = time.time()
142 | 
143 |     if not mac:
144 |         try:
145 |             f = open(MACHINE_FILENAME, 'rb')  # binary mode, matching the binary pickle protocol
146 |             mac = pickle.load(f)
147 | 
148 |         except Exception:
149 |             print "failed to load data from %s" % (MACHINE_FILENAME,)
150 | 
151 |     duration = time.time() - start
152 |     print "load time (seconds) = %s" % (duration, )
153 | 
154 |     return mac
155 | 
156 | 
157 | def save_machine(mac=None):
158 |     f = open(MACHINE_FILENAME, 'wb')  # binary mode
159 |     pickle.dump(mac, f, pickle.HIGHEST_PROTOCOL)  # binary pickle protocol
160 |     f.close()
161 | 
162 |     return mac
163 | 
164 | if __name__ == "__main__":
165 |     main()
166 | 
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 | 
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found.
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/WebHooksDemo.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/WebHooksDemo.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/WebHooksDemo" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/WebHooksDemo" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. 
The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/build/doctrees/dssg.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/doctrees/dssg.doctree -------------------------------------------------------------------------------- /doc/build/doctrees/dssg.webapp.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/doctrees/dssg.webapp.doctree -------------------------------------------------------------------------------- /doc/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/doctrees/environment.pickle -------------------------------------------------------------------------------- /doc/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/doctrees/index.doctree -------------------------------------------------------------------------------- /doc/build/doctrees/modules.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/doctrees/modules.doctree -------------------------------------------------------------------------------- /doc/build/doctrees/web_hooks.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/doctrees/web_hooks.doctree -------------------------------------------------------------------------------- /doc/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: ea6bbb6ebb41a3ac8876be51867fd8e1 4 | tags: a205e9ed8462ae86fdd2f73488852ba9 5 | -------------------------------------------------------------------------------- /doc/build/html/_sources/dssg.txt: -------------------------------------------------------------------------------- 1 | dssg Package 2 | ============ 3 | 4 | :mod:`Machine` Module 5 | --------------------- 6 | 7 | .. automodule:: dssg.Machine 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`junutils` Module 13 | ---------------------- 14 | 15 | .. automodule:: dssg.junutils 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | Subpackages 21 | ----------- 22 | 23 | .. 
toctree:: 24 | 25 | dssg.webapp 26 | 27 | -------------------------------------------------------------------------------- /doc/build/html/_sources/dssg.webapp.txt: -------------------------------------------------------------------------------- 1 | webapp Package 2 | ============== 3 | 4 | :mod:`webapp` Package 5 | --------------------- 6 | 7 | .. automodule:: dssg.webapp 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`rest_api` Module 13 | ---------------------- 14 | 15 | .. automodule:: dssg.webapp.rest_api 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /doc/build/html/_sources/index.txt: -------------------------------------------------------------------------------- 1 | .. pybedtools documentation master file, created by 2 | sphinx-quickstart on Tue Apr 13 18:15:46 2010. 3 | You can adapt this file completely to your liking, but it 4 | should at least contain the root `toctree` directive. 5 | 6 | Welcome to examplemodule's documentation! 7 | ========================================= 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | modules.rst 15 | 16 | .. automodule:: dssg.Machine 17 | 18 | .. autoclass:: dssg.Machine 19 | :members: 20 | -------------------------------------------------------------------------------- /doc/build/html/_sources/modules.txt: -------------------------------------------------------------------------------- 1 | dssg 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | dssg 8 | -------------------------------------------------------------------------------- /doc/build/html/_sources/web_hooks.txt: -------------------------------------------------------------------------------- 1 | web_hooks Package 2 | ================= 3 | 4 | :mod:`Machine` Module 5 | --------------------- 6 | 7 | .. automodule:: dssg.Machine 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: -------------------------------------------------------------------------------- /doc/build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /doc/build/html/_static/basic.css: -------------------------------------------------------------------------------- 1 | /* 2 | * basic.css 3 | * ~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- basic theme. 6 | * 7 | * :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | /* -- main layout ----------------------------------------------------------- */ 13 | 14 | div.clearer { 15 | clear: both; 16 | } 17 | 18 | /* -- relbar ---------------------------------------------------------------- */ 19 | 20 | div.related { 21 | width: 100%; 22 | font-size: 90%; 23 | } 24 | 25 | div.related h3 { 26 | display: none; 27 | } 28 | 29 | div.related ul { 30 | margin: 0; 31 | padding: 0 0 0 10px; 32 | list-style: none; 33 | } 34 | 35 | div.related li { 36 | display: inline; 37 | } 38 | 39 | div.related li.right { 40 | float: right; 41 | margin-right: 5px; 42 | } 43 | 44 | /* -- sidebar --------------------------------------------------------------- */ 45 | 46 | div.sphinxsidebarwrapper { 47 | padding: 10px 5px 0 10px; 48 | } 49 | 50 | div.sphinxsidebar { 51 | float: left; 52 | width: 230px; 53 | margin-left: -100%; 54 | font-size: 90%; 55 | } 56 | 57 | div.sphinxsidebar ul { 58 | list-style: none; 59 | } 60 | 61 | div.sphinxsidebar ul ul, 62 | div.sphinxsidebar ul.want-points { 63 | margin-left: 20px; 64 | list-style: square; 65 | } 66 | 67 | div.sphinxsidebar ul ul { 68 | margin-top: 0; 69 | margin-bottom: 0; 70 | } 71 | 72 | div.sphinxsidebar form { 73 | margin-top: 10px; 74 | } 75 | 76 | div.sphinxsidebar input { 77 | border: 1px solid #98dbcc; 78 | font-family: sans-serif; 79 | font-size: 1em; 80 | } 81 | 82 | div.sphinxsidebar #searchbox input[type="text"] { 83 | width: 170px; 84 | } 85 | 86 | div.sphinxsidebar #searchbox input[type="submit"] { 87 | width: 30px; 88 | } 89 | 90 | img { 91 | border: 0; 92 | } 93 | 94 | /* -- search page ----------------------------------------------------------- */ 95 | 96 | ul.search { 97 | margin: 10px 0 0 20px; 98 | padding: 0; 99 | } 100 | 101 | ul.search li { 102 | padding: 5px 0 5px 20px; 103 | background-image: url(file.png); 104 | background-repeat: no-repeat; 105 | background-position: 0 7px; 106 | } 107 | 108 | ul.search li a { 109 | font-weight: bold; 110 | } 111 | 112 | ul.search li div.context { 113 | color: #888; 114 | margin: 2px 0 0 30px; 115 | text-align: left; 116 | } 117 | 118 | ul.keywordmatches li.goodmatch a { 119 | font-weight: bold; 120 | } 121 | 122 | /* -- index page ------------------------------------------------------------ */ 123 | 124 | table.contentstable { 125 | width: 90%; 126 | } 127 | 128 | table.contentstable p.biglink { 129 | line-height: 150%; 130 | } 131 | 132 | a.biglink { 133 | font-size: 1.3em; 134 | } 135 | 136 | span.linkdescr { 137 | font-style: italic; 138 | padding-top: 5px; 139 | font-size: 90%; 140 | } 141 | 142 | /* -- general index --------------------------------------------------------- */ 143 | 144 | table.indextable { 145 | width: 100%; 146 | } 147 | 148 | table.indextable td { 149 | text-align: left; 150 | vertical-align: top; 151 | } 152 | 153 | table.indextable dl, table.indextable dd { 154 | margin-top: 0; 155 | margin-bottom: 0; 156 | } 157 | 158 | table.indextable tr.pcap { 159 | height: 10px; 160 | } 161 | 162 | table.indextable tr.cap { 163 | margin-top: 10px; 164 | background-color: #f2f2f2; 165 | } 166 | 167 | img.toggler { 168 | margin-right: 3px; 169 | margin-top: 3px; 170 | cursor: pointer; 171 | } 172 | 173 | div.modindex-jumpbox { 174 | border-top: 1px solid #ddd; 175 | border-bottom: 1px solid #ddd; 176 | margin: 1em 0 1em 0; 177 | padding: 0.4em; 178 | } 179 | 180 | div.genindex-jumpbox { 181 | border-top: 1px solid #ddd; 182 | border-bottom: 1px solid #ddd; 183 | margin: 1em 0 1em 0; 184 | padding: 0.4em; 185 | } 186 | 187 | /* -- general body 
styles --------------------------------------------------- */ 188 | 189 | a.headerlink { 190 | visibility: hidden; 191 | } 192 | 193 | h1:hover > a.headerlink, 194 | h2:hover > a.headerlink, 195 | h3:hover > a.headerlink, 196 | h4:hover > a.headerlink, 197 | h5:hover > a.headerlink, 198 | h6:hover > a.headerlink, 199 | dt:hover > a.headerlink { 200 | visibility: visible; 201 | } 202 | 203 | div.body p.caption { 204 | text-align: inherit; 205 | } 206 | 207 | div.body td { 208 | text-align: left; 209 | } 210 | 211 | .field-list ul { 212 | padding-left: 1em; 213 | } 214 | 215 | .first { 216 | margin-top: 0 !important; 217 | } 218 | 219 | p.rubric { 220 | margin-top: 30px; 221 | font-weight: bold; 222 | } 223 | 224 | img.align-left, .figure.align-left, object.align-left { 225 | clear: left; 226 | float: left; 227 | margin-right: 1em; 228 | } 229 | 230 | img.align-right, .figure.align-right, object.align-right { 231 | clear: right; 232 | float: right; 233 | margin-left: 1em; 234 | } 235 | 236 | img.align-center, .figure.align-center, object.align-center { 237 | display: block; 238 | margin-left: auto; 239 | margin-right: auto; 240 | } 241 | 242 | .align-left { 243 | text-align: left; 244 | } 245 | 246 | .align-center { 247 | text-align: center; 248 | } 249 | 250 | .align-right { 251 | text-align: right; 252 | } 253 | 254 | /* -- sidebars -------------------------------------------------------------- */ 255 | 256 | div.sidebar { 257 | margin: 0 0 0.5em 1em; 258 | border: 1px solid #ddb; 259 | padding: 7px 7px 0 7px; 260 | background-color: #ffe; 261 | width: 40%; 262 | float: right; 263 | } 264 | 265 | p.sidebar-title { 266 | font-weight: bold; 267 | } 268 | 269 | /* -- topics ---------------------------------------------------------------- */ 270 | 271 | div.topic { 272 | border: 1px solid #ccc; 273 | padding: 7px 7px 0 7px; 274 | margin: 10px 0 10px 0; 275 | } 276 | 277 | p.topic-title { 278 | font-size: 1.1em; 279 | font-weight: bold; 280 | margin-top: 10px; 281 | } 282 | 283 | /* -- admonitions ----------------------------------------------------------- */ 284 | 285 | div.admonition { 286 | margin-top: 10px; 287 | margin-bottom: 10px; 288 | padding: 7px; 289 | } 290 | 291 | div.admonition dt { 292 | font-weight: bold; 293 | } 294 | 295 | div.admonition dl { 296 | margin-bottom: 0; 297 | } 298 | 299 | p.admonition-title { 300 | margin: 0px 10px 5px 0px; 301 | font-weight: bold; 302 | } 303 | 304 | div.body p.centered { 305 | text-align: center; 306 | margin-top: 25px; 307 | } 308 | 309 | /* -- tables ---------------------------------------------------------------- */ 310 | 311 | table.docutils { 312 | border: 0; 313 | border-collapse: collapse; 314 | } 315 | 316 | table.docutils td, table.docutils th { 317 | padding: 1px 8px 1px 5px; 318 | border-top: 0; 319 | border-left: 0; 320 | border-right: 0; 321 | border-bottom: 1px solid #aaa; 322 | } 323 | 324 | table.field-list td, table.field-list th { 325 | border: 0 !important; 326 | } 327 | 328 | table.footnote td, table.footnote th { 329 | border: 0 !important; 330 | } 331 | 332 | th { 333 | text-align: left; 334 | padding-right: 5px; 335 | } 336 | 337 | table.citation { 338 | border-left: solid 1px gray; 339 | margin-left: 1px; 340 | } 341 | 342 | table.citation td { 343 | border-bottom: none; 344 | } 345 | 346 | /* -- other body styles ----------------------------------------------------- */ 347 | 348 | ol.arabic { 349 | list-style: decimal; 350 | } 351 | 352 | ol.loweralpha { 353 | list-style: lower-alpha; 354 | } 355 | 356 | ol.upperalpha { 
357 | list-style: upper-alpha; 358 | } 359 | 360 | ol.lowerroman { 361 | list-style: lower-roman; 362 | } 363 | 364 | ol.upperroman { 365 | list-style: upper-roman; 366 | } 367 | 368 | dl { 369 | margin-bottom: 15px; 370 | } 371 | 372 | dd p { 373 | margin-top: 0px; 374 | } 375 | 376 | dd ul, dd table { 377 | margin-bottom: 10px; 378 | } 379 | 380 | dd { 381 | margin-top: 3px; 382 | margin-bottom: 10px; 383 | margin-left: 30px; 384 | } 385 | 386 | dt:target, .highlighted { 387 | background-color: #fbe54e; 388 | } 389 | 390 | dl.glossary dt { 391 | font-weight: bold; 392 | font-size: 1.1em; 393 | } 394 | 395 | .field-list ul { 396 | margin: 0; 397 | padding-left: 1em; 398 | } 399 | 400 | .field-list p { 401 | margin: 0; 402 | } 403 | 404 | .refcount { 405 | color: #060; 406 | } 407 | 408 | .optional { 409 | font-size: 1.3em; 410 | } 411 | 412 | .versionmodified { 413 | font-style: italic; 414 | } 415 | 416 | .system-message { 417 | background-color: #fda; 418 | padding: 5px; 419 | border: 3px solid red; 420 | } 421 | 422 | .footnote:target { 423 | background-color: #ffa; 424 | } 425 | 426 | .line-block { 427 | display: block; 428 | margin-top: 1em; 429 | margin-bottom: 1em; 430 | } 431 | 432 | .line-block .line-block { 433 | margin-top: 0; 434 | margin-bottom: 0; 435 | margin-left: 1.5em; 436 | } 437 | 438 | .guilabel, .menuselection { 439 | font-family: sans-serif; 440 | } 441 | 442 | .accelerator { 443 | text-decoration: underline; 444 | } 445 | 446 | .classifier { 447 | font-style: oblique; 448 | } 449 | 450 | abbr, acronym { 451 | border-bottom: dotted 1px; 452 | cursor: help; 453 | } 454 | 455 | /* -- code displays --------------------------------------------------------- */ 456 | 457 | pre { 458 | overflow: auto; 459 | overflow-y: hidden; /* fixes display issues on Chrome browsers */ 460 | } 461 | 462 | td.linenos pre { 463 | padding: 5px 0px; 464 | border: 0; 465 | background-color: transparent; 466 | color: #aaa; 467 | } 468 | 469 | table.highlighttable { 470 | margin-left: 0.5em; 471 | } 472 | 473 | table.highlighttable td { 474 | padding: 0 0.5em 0 0.5em; 475 | } 476 | 477 | tt.descname { 478 | background-color: transparent; 479 | font-weight: bold; 480 | font-size: 1.2em; 481 | } 482 | 483 | tt.descclassname { 484 | background-color: transparent; 485 | } 486 | 487 | tt.xref, a tt { 488 | background-color: transparent; 489 | font-weight: bold; 490 | } 491 | 492 | h1 tt, h2 tt, h3 tt, h4 tt, h5 tt, h6 tt { 493 | background-color: transparent; 494 | } 495 | 496 | .viewcode-link { 497 | float: right; 498 | } 499 | 500 | .viewcode-back { 501 | float: right; 502 | font-family: sans-serif; 503 | } 504 | 505 | div.viewcode-block:target { 506 | margin: -1px -10px; 507 | padding: 0 10px; 508 | } 509 | 510 | /* -- math display ---------------------------------------------------------- */ 511 | 512 | img.math { 513 | vertical-align: middle; 514 | } 515 | 516 | div.body div.math p { 517 | text-align: center; 518 | } 519 | 520 | span.eqno { 521 | float: right; 522 | } 523 | 524 | /* -- printout stylesheet --------------------------------------------------- */ 525 | 526 | @media print { 527 | div.document, 528 | div.documentwrapper, 529 | div.bodywrapper { 530 | margin: 0 !important; 531 | width: 100%; 532 | } 533 | 534 | div.sphinxsidebar, 535 | div.related, 536 | div.footer, 537 | #top-link { 538 | display: none; 539 | } 540 | } -------------------------------------------------------------------------------- /doc/build/html/_static/comment-bright.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /doc/build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/comment-close.png -------------------------------------------------------------------------------- /doc/build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/comment.png -------------------------------------------------------------------------------- /doc/build/html/_static/default.css: -------------------------------------------------------------------------------- 1 | /* 2 | * default.css_t 3 | * ~~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- default theme. 6 | * 7 | * :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: sans-serif; 18 | font-size: 100%; 19 | background-color: #11303d; 20 | color: #000; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.document { 26 | background-color: #1c4e63; 27 | } 28 | 29 | div.documentwrapper { 30 | float: left; 31 | width: 100%; 32 | } 33 | 34 | div.bodywrapper { 35 | margin: 0 0 0 230px; 36 | } 37 | 38 | div.body { 39 | background-color: #ffffff; 40 | color: #000000; 41 | padding: 0 20px 30px 20px; 42 | } 43 | 44 | div.footer { 45 | color: #ffffff; 46 | width: 100%; 47 | padding: 9px 0 9px 0; 48 | text-align: center; 49 | font-size: 75%; 50 | } 51 | 52 | div.footer a { 53 | color: #ffffff; 54 | text-decoration: underline; 55 | } 56 | 57 | div.related { 58 | background-color: #133f52; 59 | line-height: 30px; 60 | color: #ffffff; 61 | } 62 | 63 | div.related a { 64 | color: #ffffff; 65 | } 66 | 67 | div.sphinxsidebar { 68 | } 69 | 70 | div.sphinxsidebar h3 { 71 | font-family: 'Trebuchet MS', sans-serif; 72 | color: #ffffff; 73 | font-size: 1.4em; 74 | font-weight: normal; 75 | margin: 0; 76 | padding: 0; 77 | } 78 | 79 | div.sphinxsidebar h3 a { 80 | color: #ffffff; 81 | } 82 | 83 | div.sphinxsidebar h4 { 84 | font-family: 'Trebuchet MS', sans-serif; 85 | color: #ffffff; 86 | font-size: 1.3em; 87 | font-weight: normal; 88 | margin: 5px 0 0 0; 89 | padding: 0; 90 | } 91 | 92 | div.sphinxsidebar p { 93 | color: #ffffff; 94 | } 95 | 96 | div.sphinxsidebar p.topless { 97 | margin: 5px 10px 10px 10px; 98 | } 99 | 100 | div.sphinxsidebar ul { 101 | margin: 10px; 102 | padding: 0; 103 | color: #ffffff; 104 | } 105 | 106 | div.sphinxsidebar a { 107 | color: #98dbcc; 108 | } 109 | 110 | div.sphinxsidebar input { 111 | border: 1px solid #98dbcc; 112 | font-family: sans-serif; 113 | font-size: 1em; 114 | } 115 | 116 | 117 | 118 | /* -- hyperlink styles ------------------------------------------------------ */ 119 | 120 | a { 121 | color: #355f7c; 122 | text-decoration: none; 123 | } 124 | 125 | a:visited { 126 | color: #355f7c; 127 | text-decoration: none; 128 | } 129 | 130 | a:hover { 131 | text-decoration: underline; 
132 | } 133 | 134 | 135 | 136 | /* -- body styles ----------------------------------------------------------- */ 137 | 138 | div.body h1, 139 | div.body h2, 140 | div.body h3, 141 | div.body h4, 142 | div.body h5, 143 | div.body h6 { 144 | font-family: 'Trebuchet MS', sans-serif; 145 | background-color: #f2f2f2; 146 | font-weight: normal; 147 | color: #20435c; 148 | border-bottom: 1px solid #ccc; 149 | margin: 20px -20px 10px -20px; 150 | padding: 3px 0 3px 10px; 151 | } 152 | 153 | div.body h1 { margin-top: 0; font-size: 200%; } 154 | div.body h2 { font-size: 160%; } 155 | div.body h3 { font-size: 140%; } 156 | div.body h4 { font-size: 120%; } 157 | div.body h5 { font-size: 110%; } 158 | div.body h6 { font-size: 100%; } 159 | 160 | a.headerlink { 161 | color: #c60f0f; 162 | font-size: 0.8em; 163 | padding: 0 4px 0 4px; 164 | text-decoration: none; 165 | } 166 | 167 | a.headerlink:hover { 168 | background-color: #c60f0f; 169 | color: white; 170 | } 171 | 172 | div.body p, div.body dd, div.body li { 173 | text-align: justify; 174 | line-height: 130%; 175 | } 176 | 177 | div.admonition p.admonition-title + p { 178 | display: inline; 179 | } 180 | 181 | div.admonition p { 182 | margin-bottom: 5px; 183 | } 184 | 185 | div.admonition pre { 186 | margin-bottom: 5px; 187 | } 188 | 189 | div.admonition ul, div.admonition ol { 190 | margin-bottom: 5px; 191 | } 192 | 193 | div.note { 194 | background-color: #eee; 195 | border: 1px solid #ccc; 196 | } 197 | 198 | div.seealso { 199 | background-color: #ffc; 200 | border: 1px solid #ff6; 201 | } 202 | 203 | div.topic { 204 | background-color: #eee; 205 | } 206 | 207 | div.warning { 208 | background-color: #ffe4e4; 209 | border: 1px solid #f66; 210 | } 211 | 212 | p.admonition-title { 213 | display: inline; 214 | } 215 | 216 | p.admonition-title:after { 217 | content: ":"; 218 | } 219 | 220 | pre { 221 | padding: 5px; 222 | background-color: #eeffcc; 223 | color: #333333; 224 | line-height: 120%; 225 | border: 1px solid #ac9; 226 | border-left: none; 227 | border-right: none; 228 | } 229 | 230 | tt { 231 | background-color: #ecf0f3; 232 | padding: 0 1px 0 1px; 233 | font-size: 0.95em; 234 | } 235 | 236 | th { 237 | background-color: #ede; 238 | } 239 | 240 | .warning tt { 241 | background: #efc2c2; 242 | } 243 | 244 | .note tt { 245 | background: #d6d6d6; 246 | } 247 | 248 | .viewcode-back { 249 | font-family: sans-serif; 250 | } 251 | 252 | div.viewcode-block:target { 253 | background-color: #f4debf; 254 | border-top: 1px solid #ac9; 255 | border-bottom: 1px solid #ac9; 256 | } -------------------------------------------------------------------------------- /doc/build/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Sphinx JavaScript utilities for all documentation. 6 | * 7 | * :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | /** 13 | * select a different prefix for underscore 14 | */ 15 | $u = _.noConflict(); 16 | 17 | /** 18 | * make the code below compatible with browsers without 19 | * an installed firebug like debugger 20 | if (!window.console || !console.firebug) { 21 | var names = ["log", "debug", "info", "warn", "error", "assert", "dir", 22 | "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", 23 | "profile", "profileEnd"]; 24 | window.console = {}; 25 | for (var i = 0; i < names.length; ++i) 26 | window.console[names[i]] = function() {}; 27 | } 28 | */ 29 | 30 | /** 31 | * small helper function to urldecode strings 32 | */ 33 | jQuery.urldecode = function(x) { 34 | return decodeURIComponent(x).replace(/\+/g, ' '); 35 | }; 36 | 37 | /** 38 | * small helper function to urlencode strings 39 | */ 40 | jQuery.urlencode = encodeURIComponent; 41 | 42 | /** 43 | * This function returns the parsed url parameters of the 44 | * current request. Multiple values per key are supported, 45 | * it will always return arrays of strings for the value parts. 46 | */ 47 | jQuery.getQueryParameters = function(s) { 48 | if (typeof s == 'undefined') 49 | s = document.location.search; 50 | var parts = s.substr(s.indexOf('?') + 1).split('&'); 51 | var result = {}; 52 | for (var i = 0; i < parts.length; i++) { 53 | var tmp = parts[i].split('=', 2); 54 | var key = jQuery.urldecode(tmp[0]); 55 | var value = jQuery.urldecode(tmp[1]); 56 | if (key in result) 57 | result[key].push(value); 58 | else 59 | result[key] = [value]; 60 | } 61 | return result; 62 | }; 63 | 64 | /** 65 | * highlight a given string on a jquery object by wrapping it in 66 | * span elements with the given class name. 67 | */ 68 | jQuery.fn.highlightText = function(text, className) { 69 | function highlight(node) { 70 | if (node.nodeType == 3) { 71 | var val = node.nodeValue; 72 | var pos = val.toLowerCase().indexOf(text); 73 | if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) { 74 | var span = document.createElement("span"); 75 | span.className = className; 76 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 77 | node.parentNode.insertBefore(span, node.parentNode.insertBefore( 78 | document.createTextNode(val.substr(pos + text.length)), 79 | node.nextSibling)); 80 | node.nodeValue = val.substr(0, pos); 81 | } 82 | } 83 | else if (!jQuery(node).is("button, select, textarea")) { 84 | jQuery.each(node.childNodes, function() { 85 | highlight(this); 86 | }); 87 | } 88 | } 89 | return this.each(function() { 90 | highlight(this); 91 | }); 92 | }; 93 | 94 | /** 95 | * Small JavaScript module for the documentation. 96 | */ 97 | var Documentation = { 98 | 99 | init : function() { 100 | this.fixFirefoxAnchorBug(); 101 | this.highlightSearchWords(); 102 | this.initIndexTable(); 103 | }, 104 | 105 | /** 106 | * i18n support 107 | */ 108 | TRANSLATIONS : {}, 109 | PLURAL_EXPR : function(n) { return n == 1 ? 0 : 1; }, 110 | LOCALE : 'unknown', 111 | 112 | // gettext and ngettext don't access this so that the functions 113 | // can safely bound to a different name (_ = Documentation.gettext) 114 | gettext : function(string) { 115 | var translated = Documentation.TRANSLATIONS[string]; 116 | if (typeof translated == 'undefined') 117 | return string; 118 | return (typeof translated == 'string') ? 
translated : translated[0]; 119 | }, 120 | 121 | ngettext : function(singular, plural, n) { 122 | var translated = Documentation.TRANSLATIONS[singular]; 123 | if (typeof translated == 'undefined') 124 | return (n == 1) ? singular : plural; 125 | return translated[Documentation.PLURALEXPR(n)]; 126 | }, 127 | 128 | addTranslations : function(catalog) { 129 | for (var key in catalog.messages) 130 | this.TRANSLATIONS[key] = catalog.messages[key]; 131 | this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); 132 | this.LOCALE = catalog.locale; 133 | }, 134 | 135 | /** 136 | * add context elements like header anchor links 137 | */ 138 | addContextElements : function() { 139 | $('div[id] > :header:first').each(function() { 140 | $('\u00B6'). 141 | attr('href', '#' + this.id). 142 | attr('title', _('Permalink to this headline')). 143 | appendTo(this); 144 | }); 145 | $('dt[id]').each(function() { 146 | $('\u00B6'). 147 | attr('href', '#' + this.id). 148 | attr('title', _('Permalink to this definition')). 149 | appendTo(this); 150 | }); 151 | }, 152 | 153 | /** 154 | * workaround a firefox stupidity 155 | */ 156 | fixFirefoxAnchorBug : function() { 157 | if (document.location.hash && $.browser.mozilla) 158 | window.setTimeout(function() { 159 | document.location.href += ''; 160 | }, 10); 161 | }, 162 | 163 | /** 164 | * highlight the search words provided in the url in the text 165 | */ 166 | highlightSearchWords : function() { 167 | var params = $.getQueryParameters(); 168 | var terms = (params.highlight) ? params.highlight[0].split(/\s+/) : []; 169 | if (terms.length) { 170 | var body = $('div.body'); 171 | window.setTimeout(function() { 172 | $.each(terms, function() { 173 | body.highlightText(this.toLowerCase(), 'highlighted'); 174 | }); 175 | }, 10); 176 | $('') 178 | .appendTo($('#searchbox')); 179 | } 180 | }, 181 | 182 | /** 183 | * init the domain index toggle buttons 184 | */ 185 | initIndexTable : function() { 186 | var togglers = $('img.toggler').click(function() { 187 | var src = $(this).attr('src'); 188 | var idnum = $(this).attr('id').substr(7); 189 | $('tr.cg-' + idnum).toggle(); 190 | if (src.substr(-9) == 'minus.png') 191 | $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); 192 | else 193 | $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); 194 | }).css('display', ''); 195 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { 196 | togglers.click(); 197 | } 198 | }, 199 | 200 | /** 201 | * helper function to hide the search marks again 202 | */ 203 | hideSearchWords : function() { 204 | $('#searchbox .highlight-link').fadeOut(300); 205 | $('span.highlighted').removeClass('highlighted'); 206 | }, 207 | 208 | /** 209 | * make the url absolute 210 | */ 211 | makeURL : function(relativeURL) { 212 | return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; 213 | }, 214 | 215 | /** 216 | * get the current relative url 217 | */ 218 | getCurrentURL : function() { 219 | var path = document.location.pathname; 220 | var parts = path.split(/\//); 221 | $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { 222 | if (this == '..') 223 | parts.pop(); 224 | }); 225 | var url = parts.join('/'); 226 | return path.substring(url.lastIndexOf('/') + 1, path.length - 1); 227 | } 228 | }; 229 | 230 | // quick alias for translations 231 | _ = Documentation.gettext; 232 | 233 | $(document).ready(function() { 234 | Documentation.init(); 235 | }); 236 | -------------------------------------------------------------------------------- 
/doc/build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /doc/build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/down.png -------------------------------------------------------------------------------- /doc/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/file.png -------------------------------------------------------------------------------- /doc/build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/minus.png -------------------------------------------------------------------------------- /doc/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/plus.png -------------------------------------------------------------------------------- /doc/build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 8 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 9 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 10 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 11 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 12 | .highlight .ge { font-style: italic } /* Generic.Emph */ 13 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 14 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 15 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 16 | .highlight .go { color: #333333 } /* Generic.Output */ 17 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 18 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 19 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 20 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 21 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 22 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 23 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 24 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 25 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 26 | .highlight .kt { color: #902000 } /* Keyword.Type */ 27 | .highlight .m { 
color: #208050 } /* Literal.Number */ 28 | .highlight .s { color: #4070a0 } /* Literal.String */ 29 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 30 | .highlight .nb { color: #007020 } /* Name.Builtin */ 31 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 32 | .highlight .no { color: #60add5 } /* Name.Constant */ 33 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 34 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 35 | .highlight .ne { color: #007020 } /* Name.Exception */ 36 | .highlight .nf { color: #06287e } /* Name.Function */ 37 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 38 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 39 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 40 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 41 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 42 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 43 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 44 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 45 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 46 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 47 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 48 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 49 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 50 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 51 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 52 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 53 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 54 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 55 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 56 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 57 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 58 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 59 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 60 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 61 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 62 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /doc/build/html/_static/sidebar.js: -------------------------------------------------------------------------------- 1 | /* 2 | * sidebar.js 3 | * ~~~~~~~~~~ 4 | * 5 | * This script makes the Sphinx sidebar collapsible. 6 | * 7 | * .sphinxsidebar contains .sphinxsidebarwrapper. This script adds 8 | * in .sphixsidebar, after .sphinxsidebarwrapper, the #sidebarbutton 9 | * used to collapse and expand the sidebar. 10 | * 11 | * When the sidebar is collapsed the .sphinxsidebarwrapper is hidden 12 | * and the width of the sidebar and the margin-left of the document 13 | * are decreased. When the sidebar is expanded the opposite happens. 14 | * This script saves a per-browser/per-session cookie used to 15 | * remember the position of the sidebar among the pages. 16 | * Once the browser is closed the cookie is deleted and the position 17 | * reset to the default (expanded). 18 | * 19 | * :copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS. 20 | * :license: BSD, see LICENSE for details. 
21 | * 22 | */ 23 | 24 | $(function() { 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | // global elements used by the functions. 34 | // the 'sidebarbutton' element is defined as global after its 35 | // creation, in the add_sidebar_button function 36 | var bodywrapper = $('.bodywrapper'); 37 | var sidebar = $('.sphinxsidebar'); 38 | var sidebarwrapper = $('.sphinxsidebarwrapper'); 39 | 40 | // for some reason, the document has no sidebar; do not run into errors 41 | if (!sidebar.length) return; 42 | 43 | // original margin-left of the bodywrapper and width of the sidebar 44 | // with the sidebar expanded 45 | var bw_margin_expanded = bodywrapper.css('margin-left'); 46 | var ssb_width_expanded = sidebar.width(); 47 | 48 | // margin-left of the bodywrapper and width of the sidebar 49 | // with the sidebar collapsed 50 | var bw_margin_collapsed = '.8em'; 51 | var ssb_width_collapsed = '.8em'; 52 | 53 | // colors used by the current theme 54 | var dark_color = $('.related').css('background-color'); 55 | var light_color = $('.document').css('background-color'); 56 | 57 | function sidebar_is_collapsed() { 58 | return sidebarwrapper.is(':not(:visible)'); 59 | } 60 | 61 | function toggle_sidebar() { 62 | if (sidebar_is_collapsed()) 63 | expand_sidebar(); 64 | else 65 | collapse_sidebar(); 66 | } 67 | 68 | function collapse_sidebar() { 69 | sidebarwrapper.hide(); 70 | sidebar.css('width', ssb_width_collapsed); 71 | bodywrapper.css('margin-left', bw_margin_collapsed); 72 | sidebarbutton.css({ 73 | 'margin-left': '0', 74 | 'height': bodywrapper.height() 75 | }); 76 | sidebarbutton.find('span').text('»'); 77 | sidebarbutton.attr('title', _('Expand sidebar')); 78 | document.cookie = 'sidebar=collapsed'; 79 | } 80 | 81 | function expand_sidebar() { 82 | bodywrapper.css('margin-left', bw_margin_expanded); 83 | sidebar.css('width', ssb_width_expanded); 84 | sidebarwrapper.show(); 85 | sidebarbutton.css({ 86 | 'margin-left': ssb_width_expanded-12, 87 | 'height': bodywrapper.height() 88 | }); 89 | sidebarbutton.find('span').text('«'); 90 | sidebarbutton.attr('title', _('Collapse sidebar')); 91 | document.cookie = 'sidebar=expanded'; 92 | } 93 | 94 | function add_sidebar_button() { 95 | sidebarwrapper.css({ 96 | 'float': 'left', 97 | 'margin-right': '0', 98 | 'width': ssb_width_expanded - 28 99 | }); 100 | // create the button 101 | sidebar.append( 102 | '
«
' 103 | ); 104 | var sidebarbutton = $('#sidebarbutton'); 105 | light_color = sidebarbutton.css('background-color'); 106 | // find the height of the viewport to center the '<<' in the page 107 | var viewport_height; 108 | if (window.innerHeight) 109 | viewport_height = window.innerHeight; 110 | else 111 | viewport_height = $(window).height(); 112 | sidebarbutton.find('span').css({ 113 | 'display': 'block', 114 | 'margin-top': (viewport_height - sidebar.position().top - 20) / 2 115 | }); 116 | 117 | sidebarbutton.click(toggle_sidebar); 118 | sidebarbutton.attr('title', _('Collapse sidebar')); 119 | sidebarbutton.css({ 120 | 'color': '#FFFFFF', 121 | 'border-left': '1px solid ' + dark_color, 122 | 'font-size': '1.2em', 123 | 'cursor': 'pointer', 124 | 'height': bodywrapper.height(), 125 | 'padding-top': '1px', 126 | 'margin-left': ssb_width_expanded - 12 127 | }); 128 | 129 | sidebarbutton.hover( 130 | function () { 131 | $(this).css('background-color', dark_color); 132 | }, 133 | function () { 134 | $(this).css('background-color', light_color); 135 | } 136 | ); 137 | } 138 | 139 | function set_position_from_cookie() { 140 | if (!document.cookie) 141 | return; 142 | var items = document.cookie.split(';'); 143 | for(var k=0; k2;a== 12 | null&&(a=[]);if(y&&a.reduce===y)return e&&(c=b.bind(c,e)),f?a.reduce(c,d):a.reduce(c);j(a,function(a,b,i){f?d=c.call(e,d,a,b,i):(d=a,f=true)});if(!f)throw new TypeError("Reduce of empty array with no initial value");return d};b.reduceRight=b.foldr=function(a,c,d,e){var f=arguments.length>2;a==null&&(a=[]);if(z&&a.reduceRight===z)return e&&(c=b.bind(c,e)),f?a.reduceRight(c,d):a.reduceRight(c);var g=b.toArray(a).reverse();e&&!f&&(c=b.bind(c,e));return f?b.reduce(g,c,d,e):b.reduce(g,c)};b.find=b.detect= 13 | function(a,c,b){var e;E(a,function(a,g,h){if(c.call(b,a,g,h))return e=a,true});return e};b.filter=b.select=function(a,c,b){var e=[];if(a==null)return e;if(A&&a.filter===A)return a.filter(c,b);j(a,function(a,g,h){c.call(b,a,g,h)&&(e[e.length]=a)});return e};b.reject=function(a,c,b){var e=[];if(a==null)return e;j(a,function(a,g,h){c.call(b,a,g,h)||(e[e.length]=a)});return e};b.every=b.all=function(a,c,b){var e=true;if(a==null)return e;if(B&&a.every===B)return a.every(c,b);j(a,function(a,g,h){if(!(e= 14 | e&&c.call(b,a,g,h)))return n});return e};var E=b.some=b.any=function(a,c,d){c||(c=b.identity);var e=false;if(a==null)return e;if(C&&a.some===C)return a.some(c,d);j(a,function(a,b,h){if(e||(e=c.call(d,a,b,h)))return n});return!!e};b.include=b.contains=function(a,c){var b=false;if(a==null)return b;return p&&a.indexOf===p?a.indexOf(c)!=-1:b=E(a,function(a){return a===c})};b.invoke=function(a,c){var d=i.call(arguments,2);return b.map(a,function(a){return(b.isFunction(c)?c||a:a[c]).apply(a,d)})};b.pluck= 15 | function(a,c){return b.map(a,function(a){return a[c]})};b.max=function(a,c,d){if(!c&&b.isArray(a))return Math.max.apply(Math,a);if(!c&&b.isEmpty(a))return-Infinity;var e={computed:-Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;b>=e.computed&&(e={value:a,computed:b})});return e.value};b.min=function(a,c,d){if(!c&&b.isArray(a))return Math.min.apply(Math,a);if(!c&&b.isEmpty(a))return Infinity;var e={computed:Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;bd?1:0}),"value")};b.groupBy=function(a,c){var d={},e=b.isFunction(c)?c:function(a){return a[c]};j(a,function(a,b){var c=e(a,b);(d[c]||(d[c]=[])).push(a)});return d};b.sortedIndex=function(a, 17 | c,d){d||(d=b.identity);for(var e=0,f=a.length;e>1;d(a[g])=0})})};b.difference=function(a){var 
c=b.flatten(i.call(arguments,1));return b.filter(a,function(a){return!b.include(c,a)})};b.zip=function(){for(var a=i.call(arguments),c=b.max(b.pluck(a,"length")),d=Array(c),e=0;e=0;d--)b=[a[d].apply(this,b)];return b[0]}}; 24 | b.after=function(a,b){return a<=0?b():function(){if(--a<1)return b.apply(this,arguments)}};b.keys=J||function(a){if(a!==Object(a))throw new TypeError("Invalid object");var c=[],d;for(d in a)b.has(a,d)&&(c[c.length]=d);return c};b.values=function(a){return b.map(a,b.identity)};b.functions=b.methods=function(a){var c=[],d;for(d in a)b.isFunction(a[d])&&c.push(d);return c.sort()};b.extend=function(a){j(i.call(arguments,1),function(b){for(var d in b)a[d]=b[d]});return a};b.defaults=function(a){j(i.call(arguments, 25 | 1),function(b){for(var d in b)a[d]==null&&(a[d]=b[d])});return a};b.clone=function(a){return!b.isObject(a)?a:b.isArray(a)?a.slice():b.extend({},a)};b.tap=function(a,b){b(a);return a};b.isEqual=function(a,b){return q(a,b,[])};b.isEmpty=function(a){if(b.isArray(a)||b.isString(a))return a.length===0;for(var c in a)if(b.has(a,c))return false;return true};b.isElement=function(a){return!!(a&&a.nodeType==1)};b.isArray=o||function(a){return l.call(a)=="[object Array]"};b.isObject=function(a){return a===Object(a)}; 26 | b.isArguments=function(a){return l.call(a)=="[object Arguments]"};if(!b.isArguments(arguments))b.isArguments=function(a){return!(!a||!b.has(a,"callee"))};b.isFunction=function(a){return l.call(a)=="[object Function]"};b.isString=function(a){return l.call(a)=="[object String]"};b.isNumber=function(a){return l.call(a)=="[object Number]"};b.isNaN=function(a){return a!==a};b.isBoolean=function(a){return a===true||a===false||l.call(a)=="[object Boolean]"};b.isDate=function(a){return l.call(a)=="[object Date]"}; 27 | b.isRegExp=function(a){return l.call(a)=="[object RegExp]"};b.isNull=function(a){return a===null};b.isUndefined=function(a){return a===void 0};b.has=function(a,b){return I.call(a,b)};b.noConflict=function(){r._=G;return this};b.identity=function(a){return a};b.times=function(a,b,d){for(var e=0;e/g,">").replace(/"/g,""").replace(/'/g,"'").replace(/\//g,"/")};b.mixin=function(a){j(b.functions(a), 28 | function(c){K(c,b[c]=a[c])})};var L=0;b.uniqueId=function(a){var b=L++;return a?a+b:b};b.templateSettings={evaluate:/<%([\s\S]+?)%>/g,interpolate:/<%=([\s\S]+?)%>/g,escape:/<%-([\s\S]+?)%>/g};var t=/.^/,u=function(a){return a.replace(/\\\\/g,"\\").replace(/\\'/g,"'")};b.template=function(a,c){var d=b.templateSettings,d="var __p=[],print=function(){__p.push.apply(__p,arguments);};with(obj||{}){__p.push('"+a.replace(/\\/g,"\\\\").replace(/'/g,"\\'").replace(d.escape||t,function(a,b){return"',_.escape("+ 29 | u(b)+"),'"}).replace(d.interpolate||t,function(a,b){return"',"+u(b)+",'"}).replace(d.evaluate||t,function(a,b){return"');"+u(b).replace(/[\r\n\t]/g," ")+";__p.push('"}).replace(/\r/g,"\\r").replace(/\n/g,"\\n").replace(/\t/g,"\\t")+"');}return __p.join('');",e=new Function("obj","_",d);return c?e(c,b):function(a){return e.call(this,a,b)}};b.chain=function(a){return b(a).chain()};var m=function(a){this._wrapped=a};b.prototype=m.prototype;var v=function(a,c){return c?b(a).chain():a},K=function(a,c){m.prototype[a]= 30 | function(){var a=i.call(arguments);H.call(a,this._wrapped);return v(c.apply(b,a),this._chain)}};b.mixin(b);j("pop,push,reverse,shift,sort,splice,unshift".split(","),function(a){var b=k[a];m.prototype[a]=function(){var d=this._wrapped;b.apply(d,arguments);var e=d.length;(a=="shift"||a=="splice")&&e===0&&delete d[0];return 
v(d,this._chain)}});j(["concat","join","slice"],function(a){var b=k[a];m.prototype[a]=function(){return v(b.apply(this._wrapped,arguments),this._chain)}});m.prototype.chain=function(){this._chain= 31 | true;return this};m.prototype.value=function(){return this._wrapped}}).call(this); 32 | -------------------------------------------------------------------------------- /doc/build/html/_static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/up-pressed.png -------------------------------------------------------------------------------- /doc/build/html/_static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/_static/up.png -------------------------------------------------------------------------------- /doc/build/html/dssg.webapp.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | webapp Package — WebHooksDemo 0.1 documentation 10 | 11 | 12 | 13 | 14 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 48 | 49 |
50 |
51 |
52 |
53 | 54 |
55 |

webapp Package

56 |
57 |

webapp Package

58 |
59 |
60 |

rest_api Module

61 |
62 |
63 | dssg.webapp.rest_api.detect_language()
64 |

 Given some text, returns a ranked list of likely natural languages 65 | the given content is in

66 |
67 | 68 |
69 |
70 | dssg.webapp.rest_api.extract_entities()
71 |

 Given some text input, identifies - besides location - people, 72 | organisations and other types of entities within the text

73 |
74 | 75 |
76 |
77 | dssg.webapp.rest_api.similar_messages()
78 |

 Given text, finds near-duplicate messages.

79 |

 input: text 80 | output: list of (id, message text) tuples. 81 | [todo: does this only return reports? or unannotated messages, too? 82 | should be any message for completeness, and then the front-end can decide 83 | what should be hidden from the user.]

84 |
85 | 86 |
87 |
88 | dssg.webapp.rest_api.suggest_categories()
89 |

Given a message/report, suggests the possible categories 90 | that the message could fall into

91 |
92 | 93 |
94 |
95 | dssg.webapp.rest_api.suggest_locations()
96 |

Suggest locations in a text string. These might be useful keywords for 97 | annotators to geolocate.

98 |

 input: full message’s text [string] 99 | output: list. each item is a Python dictionary:

100 |
101 |
    102 |
  • text : the text for the specific entity [string]
  • indices : tuple of (start [int], end [int]) offsets where the entity is located in the given full message
  • confidence : probability from 0 to 1 [float]
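A minimal sketch of one plausible suggest_locations() return value, following the fields above; the place name, offsets, and confidence are invented for illustration:

```python
# Hypothetical suggest_locations() output for the message
# "Voting delayed at the polling station in Eldoret town."
suggestions = [
    {
        "text": "Eldoret",        # the text for the specific entity
        "indices": (41, 48),      # (start, end) offsets into the full message
        "confidence": 0.85,       # probability from 0 to 1
    },
]
```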
107 |
108 |
109 | 110 |
111 |
112 | dssg.webapp.rest_api.suggest_sensitive_info()
113 |

 Suggest personally identifying information (PII) – such as 114 | credit card numbers, phone numbers, and email addresses – 115 | in a text string. These are useful for annotators to investigate 116 | and strip before publicly posting information.

117 |

input: text, 118 | input: options

119 |
120 |
    121 |
  • custom regex for local phone numbers
  • flags or booleans to specify the type of PII (e.g. phone_only)
124 |
125 |
126 |
output: list of dictionaries:
127 |
    128 |
  • word
  • type (e-mail, phone, ID, person name, etc.)
  • indices (start/end offset in text)
  • confidence [todo: is possible?]
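For concreteness, a sketch of one plausible suggest_sensitive_info() return value with the fields listed above; all values are invented:

```python
# Hypothetical suggest_sensitive_info() output; values are invented.
pii = [
    {"word": "jane.doe@example.org", "type": "e-mail",
     "indices": (27, 47), "confidence": 0.95},
    {"word": "0722 000000", "type": "phone",
     "indices": (62, 73), "confidence": 0.80},
]
```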
133 |
134 |
135 |
136 | 137 |
138 |
139 | 140 | 141 |
142 |
143 |
144 |
145 |
146 |

Table Of Contents

147 | 154 | 155 |

Previous topic

156 |

dssg Package

158 |

This Page

159 | 163 | 175 | 176 |
177 |
178 |
179 |
180 | 197 | 201 | 202 | -------------------------------------------------------------------------------- /doc/build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Index — WebHooksDemo 0.1 documentation 11 | 12 | 13 | 14 | 15 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 42 | 43 |
44 |
45 |
46 |
47 | 48 | 49 |

Index

50 | 51 |
52 | C 53 | | D 54 | | E 55 | | G 56 | | I 57 | | L 58 | | M 59 | | S 60 | | T 61 | | U 62 | 63 |
64 |

C

65 | 66 | 76 | 86 |
67 | 68 |
categories (dssg.Machine.Machine attribute) 69 |
70 | 71 | 72 |
clean_data() (dssg.Machine.Machine method) 73 |
74 | 75 |
77 | 78 |
computeSimilarities() (dssg.Machine.Machine method) 79 |
80 | 81 | 82 |
countTruth() (in module dssg.junutils) 83 |
84 | 85 |
87 | 88 |

D

89 | 90 | 104 | 114 |
91 | 92 |
detect_language() (in module dssg.webapp.rest_api) 93 |
94 | 95 | 96 |
dssg.junutils (module) 97 |
98 | 99 | 100 |
dssg.Machine (module), [1] 101 |
102 | 103 |
105 | 106 |
dssg.webapp (module) 107 |
108 | 109 | 110 |
dssg.webapp.rest_api (module) 111 |
112 | 113 |
115 | 116 |

E

117 | 118 | 124 | 130 |
119 | 120 |
entities (dssg.Machine.Machine attribute) 121 |
122 | 123 |
125 | 126 |
extract_entities() (in module dssg.webapp.rest_api) 127 |
128 | 129 |
131 | 132 |

G

133 | 134 | 152 | 166 |
135 | 136 |
getFullMessagesFromJson() (in module dssg.junutils) 137 |
138 | 139 | 140 |
getFullMessagesFromUchaguziMergedCategory() (in module dssg.junutils) 141 |
142 | 143 | 144 |
getTrainStats() (dssg.Machine.Machine method) 145 |
146 | 147 | 148 |
guess() (dssg.Machine.Machine method) 149 |
150 | 151 |
153 | 154 |
guess_entities() (dssg.Machine.Machine static method) 155 |
156 | 157 | 158 |
guess_language() (dssg.Machine.Machine static method) 159 |
160 | 161 | 162 |
guess_private_info() (dssg.Machine.Machine static method) 163 |
164 | 165 |
167 | 168 |

I

169 | 170 | 176 |
171 | 172 |
isAllNumbers() (in module dssg.junutils) 173 |
174 | 175 |
177 | 178 |

L

179 | 180 | 190 | 196 |
181 | 182 |
loadDatasetWithMappedCategories() (in module dssg.junutils) 183 |
184 | 185 | 186 |
loadIncidentListFromCsv() (in module dssg.junutils) 187 |
188 | 189 |
191 | 192 |
loadJsonFromPath() (in module dssg.junutils) 193 |
194 | 195 |
197 | 198 |

M

199 | 200 | 212 | 218 |
201 | 202 |
Machine (class in dssg.Machine) 203 |
204 | 205 |
206 | 207 |
(in module dssg) 208 |
209 | 210 |
211 |
213 | 214 |
messages (dssg.Machine.Machine attribute) 215 |
216 | 217 |
219 | 220 |

S

221 | 222 | 232 | 242 |
223 | 224 |
similar_messages() (in module dssg.webapp.rest_api) 225 |
226 | 227 | 228 |
suggest_categories() (in module dssg.webapp.rest_api) 229 |
230 | 231 |
233 | 234 |
suggest_locations() (in module dssg.webapp.rest_api) 235 |
236 | 237 | 238 |
suggest_sensitive_info() (in module dssg.webapp.rest_api) 239 |
240 | 241 |
243 | 244 |

T

245 | 246 | 252 |
247 | 248 |
train() (dssg.Machine.Machine method) 249 |
250 | 251 |
253 | 254 |

U

255 | 256 | 262 | 268 |
257 | 258 |
UnicodeDictReader() (in module dssg.junutils) 259 |
260 | 261 |
263 | 264 |
unicodeToAscii() (in module dssg.junutils) 265 |
266 | 267 |
269 | 270 | 271 | 272 |
273 |
274 |
275 |
276 |
277 | 278 | 279 | 280 | 292 | 293 |
294 |
295 |
296 |
297 | 309 | 313 | 314 | -------------------------------------------------------------------------------- /doc/build/html/index.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Welcome to examplemodule’s documentation! — WebHooksDemo 0.1 documentation 10 | 11 | 12 | 13 | 14 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 45 | 46 |
47 |
48 |
49 |
50 | 51 |
52 |

Welcome to examplemodule’s documentation!

53 |

Contents:

54 |
55 | 61 |
62 |
63 |
64 | dssg.Machine
65 |

alias of dssg.Machine

66 |
67 | 68 |
69 | 70 | 71 |
72 |
73 |
74 |
75 |
76 |

Next topic

77 |

dssg

79 |

This Page

80 | 84 | 96 | 97 |
98 |
99 |
100 |
101 | 116 | 120 | 121 | -------------------------------------------------------------------------------- /doc/build/html/modules.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | dssg — WebHooksDemo 0.1 documentation 10 | 11 | 12 | 13 | 14 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 49 | 50 |
51 |
52 |
53 |
54 | 55 |
56 |

dssg

57 |
58 | 73 |
74 |
75 | 76 | 77 |
78 |
79 |
80 |
81 |
82 |

Previous topic

83 |

Welcome to examplemodule’s documentation!

85 |

Next topic

86 |

dssg Package

88 |

This Page

89 | 93 | 105 | 106 |
107 |
108 |
109 |
110 | 128 | 132 | 133 | -------------------------------------------------------------------------------- /doc/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/doc/build/html/objects.inv -------------------------------------------------------------------------------- /doc/build/html/py-modindex.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Python Module Index — WebHooksDemo 0.1 documentation 10 | 11 | 12 | 13 | 14 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 44 | 45 |
46 |
47 |
48 |
49 | 50 | 51 |

Python Module Index

52 | 53 |
54 | d 55 |
56 | 57 | 58 | 59 | 61 | 62 | 64 | 67 | 68 | 69 | 72 | 73 | 74 | 77 | 78 | 79 | 82 | 83 | 84 | 87 |
 
60 | d
65 | dssg 66 |
    70 | dssg.junutils 71 |
    75 | dssg.Machine 76 |
    80 | dssg.webapp 81 |
    85 | dssg.webapp.rest_api 86 |
88 | 89 | 90 |
91 |
92 |
93 |
94 |
95 | 107 | 108 |
109 |
110 |
111 |
112 | 124 | 128 | 129 | -------------------------------------------------------------------------------- /doc/build/html/search.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Search — WebHooksDemo 0.1 documentation 10 | 11 | 12 | 13 | 14 | 23 | 24 | 25 | 26 | 27 | 28 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 49 | 50 |
51 |
52 |
53 |
54 | 55 |

Search

56 |
57 | 58 |

59 | Please activate JavaScript to enable the search 60 | functionality. 61 |

62 |
63 |

64 | From here you can search these documents. Enter your search 65 | words into the box below and click "search". Note that the search 66 | function will automatically search for all of the words. Pages 67 | containing fewer words won't appear in the result list. 68 |

69 |
70 | 71 | 72 | 73 |
74 | 75 |
76 | 77 |
78 | 79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 | 100 | 104 | 105 | -------------------------------------------------------------------------------- /doc/build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({envversion:42,terms:{guess_ent:1,identifi:3,valid:[],posnegprbd:[],show:[],text:[3,1],getfullmessagesfromjson:1,test_docstr:[],rang:1,phone_regex:1,balanc:[],front:3,prefix:[],categoryidlist:1,"0x10e41fb50":[],incid:1,simpli:1,save_machin:[],find:3,languag:[3,1],paramet:[],content:[0,3],privat:1,locat:[3,1],dssgruncv:[],also:1,sensit:1,except:[],param:1,should:[3,1],rest_api:[],dssgvectorizerunigrambysklearn:[],loadjsonfrompath:1,pair:1,mail:[3,1],hidden:3,main:[],might:3,defin:1,real:1,non:1,"float":3,"return":[3,1],string:[3,1],format:[],handl:1,label1:[],label2:[],credit:3,webhook:[],msgdict:1,load_machin:[],format_except:[],cannot:1,fall:3,report:3,potenti:1,"0x111d76090":[],getfeaturenamelist:[],requir:[],like:[],measur:1,name:[3,1],specif:3,submitfield:[],did:1,list:[3,1],url:1,integ:[],item:[3,1],exampl:1,getfullmessagesfromuchaguzimergedcategori:1,predictscor:[],each:[3,1],output:[3,1],minfreq:[],complet:3,where:3,page:[],two:1,counttruth:1,set:1,could:3,natur:3,frame:[],some:3,maximum:[],gettrainstat:1,second:1,full:3,"static":1,unigramextractor:[],arg:[],csrf_enabl:[],stack:[],beyond:1,todo:3,index:[],what:3,getword:[],appear:1,uchaguzijsonpath:1,neg:[],wtform:[],"0x10d348050":[],label:1,categori:[3,1],score:1,flask_wtf:[],"0x110b68790":[],"0x10c0b7790":[],dssgbinaryclassifiernaivebay:[],entiti:[3,1],email:[3,1],binaryclassifiertrain:[],machin:[],trainstat:[],investig:3,probabilit:1,kei:[],etyp:[],contain:1,loaddatasetwithmappedcategori:1,decid:3,messagelist:1,base:1,dsetjsonpath:1,dictionari:[3,1],offset:[3,1],path:1,main_menu:[],secret_kei:[],valu:1,both:1,search:[],obj:[],nfold:[],geoloc:[3,1],similar:1,suggest_loc:3,traceback:[],meaning:1,local:3,turn:[],"int":3,person:[3,1],organis:3,constructor:[],fals:[],loadincidentlistfromcsv:1,utf8_data:1,"0x11133f050":[],whole:1,textareafield:[],extract_ent:3,guess_private_info:1,major:[],unannot:3,feel:[],modul:[],within:3,number:[3,1],rank:[3,1],likei:3,order:1,computesimilar:1,dssgunigramextractor:[],"boolean":3,done:[],messag:[3,1],msg:1,strip:[3,1],vote:[],dssgbinaryclassifiermajorityvot:[],categorylist:[],post:3,dssgvectorizerunigramcount:[],duplic:[3,1],regex:[3,1],unicodetoascii:1,mappedcategorypath:1,from:[3,1],start:[3,1],gpe:1,licens:1,transform:[],submit:[],custom:3,dssgvectorizergener:[],besid:3,interpret:1,"0x11238aef0":[],recommend:1,dup:1,astr:1,type:[3,1],twitter:1,includ:1,"function":1,passport:1,tooskewedlabelsexcept:[],guess:1,python:3,tupl:[3,1],classifierd:[],criteria:1,specifi:3,sort:1,highlight_ent:[],phone:[3,1],flag:3,part:1,too:3,input:3,webapp:[],given:3,cach:[],card:3,dsetbinari:[],none:[],made:3,word:[3,1],dssgvector:[],keyword:3,possibl:[3,1],provid:1,alia:0,remov:1,bagofwordsexceptstopword:[],extractunigram:[],annot:3,links_menu:[],uchaguzicategoryjsonpath:1,limit:[],can:3,publicli:3,learn:1,meet:1,dssgcategoryclassifi:[],bagofwordsnotinset:[],howev:1,address:1,categorytitlelist:1,pii:3,onli:3,look:[],predict:[],"abstract":[],meant:[],them:1,dssg:[],new_machin:[],packag:[],unit:1,kwarg:1,decreas:1,have:1,home:[],similar_messag:3,inf:1,doe:3,option:[3,1],bagofword:[],incidentid:1,multipl:1,form:[],etc:[3,1],suggest:3,make:1,detect_languag:3,when:[],unboundfield:[],classifi:[],note:1,suggest_categori:3,ideal:1,other:3,stopfil:[],take:1,guess_languag:1,peopl:3,web_hook:[],pr
operti:1,csrf_context:[],confid:[3,1],same:1,"0x10c88e050":[],usernam:1,alist:1,formdata:[],isallnumb:1,driver:1,unicodedictread:1,lucki:[],"_auto":[],befor:3,mac:[],rais:[],nbclassifi:[],user:3,consid:1,mai:1,end:3,phone_onli:3,clean_data:1,"class":1,"0x10e9f6d50":[],dssgbinaryclassifiersvc:[],binarylabeledmessagelist:[],ani:3,dssgbinaryclassifi:[],fittransform:[],classmethod:[],machine_statu:[],boolfunc:1,descript:[],correspond:1,object:1,element:1,inform:[3,1],which:1,train:1,probabl:3,thi:[3,1],english:[],unicodestr:1,suggest_sensitive_info:3,messageform:[],"0x10cb71790":[],badword:[]},objtypes:{"0":"py:module","1":"py:method","2":"py:staticmethod","3":"py:function","4":"py:attribute","5":"py:class"},objnames:{"0":["py","module","Python module"],"1":["py","method","Python method"],"2":["py","staticmethod","Python static method"],"3":["py","function","Python function"],"4":["py","attribute","Python attribute"],"5":["py","class","Python class"]},filenames:["index","dssg","modules","dssg.webapp"],titles:["Welcome to examplemodule’s documentation!","dssg Package","dssg","webapp Package"],objects:{"dssg.Machine":{Machine:[1,5,1,""]},"dssg.junutils":{unicodeToAscii:[1,3,1,""],countTruth:[1,3,1,""],getFullMessagesFromUchaguziMergedCategory:[1,3,1,""],isAllNumbers:[1,3,1,""],getFullMessagesFromJson:[1,3,1,""],loadDatasetWithMappedCategories:[1,3,1,""],loadJsonFromPath:[1,3,1,""],loadIncidentListFromCsv:[1,3,1,""],UnicodeDictReader:[1,3,1,""]},dssg:{Machine:[1,0,1,""],webapp:[3,0,1,""],junutils:[1,0,1,""]},"dssg.Machine.Machine":{guess_private_info:[1,2,1,""],guess:[1,1,1,""],guess_entities:[1,2,1,""],messages:[1,4,1,""],getTrainStats:[1,1,1,""],entities:[1,4,1,""],train:[1,1,1,""],computeSimilarities:[1,1,1,""],guess_language:[1,2,1,""],clean_data:[1,1,1,""],categories:[1,4,1,""]},"dssg.webapp.rest_api":{suggest_sensitive_info:[3,3,1,""],extract_entities:[3,3,1,""],suggest_locations:[3,3,1,""],detect_language:[3,3,1,""],similar_messages:[3,3,1,""],suggest_categories:[3,3,1,""]},"dssg.webapp":{rest_api:[3,0,1,""]}},titleterms:{machin:1,subpackag:1,rest_api:3,welcom:0,form:[],titl:[],modul:[3,1],dssgclassifi:[],webhook:[],indic:[],webhooksdemo:[],packag:[3,1],dssg:[1,2],web_hook:[],try_featureengin:[],tabl:[],webapp:3,main:[],junutil:1,document:0,examplemodul:0}}) -------------------------------------------------------------------------------- /doc/build/html/web_hooks.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | web_hooks Package — WebHooksDemo 0.1 documentation 10 | 11 | 12 | 13 | 14 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 41 | 42 |
43 |
44 |
45 |
46 | 47 |
48 |

web_hooks Package

49 |
50 |

Machine Module

51 |
52 |
53 | class dssg.Machine.Machine
54 |

Bases: object

55 |

Base class for Machine Learning

56 |
57 |
58 | categories
59 |

The categories property.

60 |
61 | 62 |
63 |
64 | clean_data(lm, params={})
65 |
66 | 67 |
68 |
69 | computeSimilarities(msg)
70 |

 Returns a set of message IDs with similarity scores, sorted by similarity score. 71 | I recommend using >=0.875 to define ‘near-dup’. 72 | :return [(‘1’, 0.9), (‘2’, 0.8), ...], sorted by the real value

73 |
74 |
(second element of each item) in decreasing order.
75 |
76 | 77 |
78 |
79 | entities
80 |

The entities property.

81 |
82 | 83 |
84 |
85 | getTrainStats()
86 |
87 | 88 |
89 |
90 | guess(messages)
91 |

 Takes a list of messages (each in dictionary form), and 92 | guesses their category labels, languages, and so on. 93 | :param messages: list of messages 94 | :return [(msgDict, {‘c1’: 4.6, ‘c2’: 4.2}), ...], list of pairs: message, dictionary of categories.

95 |
96 |
 The numbers returned range over (-inf, inf); they are ranks and cannot be interpreted as anything more (e.g., as probabilities).
97 |
98 | 99 |
100 |
101 | static guess_entities(text)
102 |

 Returns a list of entity guesses for the message. 103 | Each entity (ideally) also includes:

104 |
105 |
    106 |
  • text
  • start (offset in included string)
  • type (person, location, etc.)
  • confidence
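The docstring names these fields but not the container, so the following shape is an assumption; values are invented:

```python
# One plausible Machine.guess_entities(text) result for
# "Jane Mwangi was seen near Kisumu".
entities = [
    {"text": "Jane Mwangi", "start": 0, "type": "person", "confidence": 0.9},
    {"text": "Kisumu", "start": 26, "type": "location", "confidence": 0.8},
]
```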
111 |
112 |
113 | 114 |
115 |
116 | static guess_language(text)
117 |

Returns list of language guesses for the message, with confidence measure (0 to 1).

118 |
119 | 120 |
121 |
122 | static guess_private_info(text, **kwargs)
123 |

 Returns a list of potentially private/sensitive information to consider stripping. 124 | Output is a list of tuples, each containing two parts:

125 |
126 |
    127 |
  • the private information type (PERSON, ID, PHONE, etc.)
  • the word(s) in the message that correspond to this unit
130 |
131 |

 Note that the same words in text may appear (in whole or part) in multiple tuples. 132 | For example, a number may meet the criteria for both an ID and a phone number.

133 |

The types of information:

134 |
    135 |
  1. Named entities [types: PERSON, GPE, etc.]
     Note this includes possible locations (GPE), which may be non-private and useful for geolocation
  2. ID numbers (passport, driver’s license, etc.) [type: ID]
  3. Usernames (e.g. Twitter handles) [type: USERNAME]
  4. URLs [type: URL]
  5. E-mail addresses [type: EMAIL]
  6. Phone numbers, using the optional provided regex “phone_regex” [type: PHONE]
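Putting the two-part tuples and the types above together, an illustrative (invented) guess_private_info() result:

```python
# Hypothetical Machine.guess_private_info(text, phone_regex=...) output:
# (type, word(s)) tuples as documented above.
private_info = [
    ("PERSON", "Jane Mwangi"),
    ("EMAIL", "jane.doe@example.org"),
    ("PHONE", "0722 000000"),   # matched via the optional phone_regex
]
```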
152 |
153 | 154 |
155 |
156 | messages
157 |

The messages property.

158 |
159 | 160 |
161 |
162 | train(messageList)
163 |

 Takes a list of messages; each message is a dictionary.
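Pulling the documented methods together, a minimal usage sketch; the constructor argument and message keys are assumptions drawn from dssg/__init__.py, not from this page:

```python
from dssg.Machine import Machine   # shipped as dssg/machine.py in this repo

# Assumed message shape; the docs only say each message is a dictionary.
messages = [{"title": "Polling station closed", "description": "..."}]

# dssg/__init__.py constructs Machine with a trained category classifier.
machine = Machine(category_classifier)
machine.train(messages)

# guess() returns [(msgDict, {'c1': 4.6, 'c2': 4.2}), ...]; the scores are
# unbounded ranks, not probabilities.
guesses = machine.guess(messages)

# computeSimilarities() returns [('1', 0.9), ('2', 0.8), ...]; the docstring
# recommends >= 0.875 as a near-duplicate threshold.
similar = machine.computeSimilarities(messages[0])

# The static helpers operate on raw text.
languages = Machine.guess_language(u"polling station closed in Eldoret")
entities = Machine.guess_entities(u"polling station closed in Eldoret")
pii = Machine.guess_private_info(u"call me on 0722 000000",
                                 phone_regex=r"07\d{2} \d{6}")
```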

164 |
165 | 166 |
167 | 168 |
169 |
170 | 171 | 172 |
173 |
174 |
175 |
176 |
177 |

Table Of Contents

178 | 184 | 185 |

This Page

186 | 190 | 202 | 203 |
204 |
205 |
206 |
207 | 219 | 223 | 224 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 
100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\WebHooksDemo.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\WebHooksDemo.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 
214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # WebHooksDemo documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jul 23 14:09:37 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath('../..')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'WebHooksDemo' 44 | copyright = u'2013, Nathan Leiby' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 
67 | exclude_patterns = [] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | # If true, keep warnings as "system message" paragraphs in the built documents. 90 | #keep_warnings = False 91 | 92 | 93 | # -- Options for HTML output --------------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 97 | html_theme = 'default' 98 | 99 | # Theme options are theme-specific and customize the look and feel of a theme 100 | # further. For a list of options available for each theme, see the 101 | # documentation. 102 | #html_theme_options = {} 103 | 104 | # Add any paths that contain custom themes here, relative to this directory. 105 | #html_theme_path = [] 106 | 107 | # The name for this set of Sphinx documents. If None, it defaults to 108 | # " v documentation". 109 | #html_title = None 110 | 111 | # A shorter title for the navigation bar. Default is the same as html_title. 112 | #html_short_title = None 113 | 114 | # The name of an image file (relative to this directory) to place at the top 115 | # of the sidebar. 116 | #html_logo = None 117 | 118 | # The name of an image file (within the static path) to use as favicon of the 119 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 120 | # pixels large. 121 | #html_favicon = None 122 | 123 | # Add any paths that contain custom static files (such as style sheets) here, 124 | # relative to this directory. They are copied after the builtin static files, 125 | # so a file named "default.css" will overwrite the builtin "default.css". 126 | html_static_path = ['_static'] 127 | 128 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 129 | # using the given strftime format. 130 | #html_last_updated_fmt = '%b %d, %Y' 131 | 132 | # If true, SmartyPants will be used to convert quotes and dashes to 133 | # typographically correct entities. 134 | #html_use_smartypants = True 135 | 136 | # Custom sidebar templates, maps document names to template names. 137 | #html_sidebars = {} 138 | 139 | # Additional templates that should be rendered to pages, maps page names to 140 | # template names. 141 | #html_additional_pages = {} 142 | 143 | # If false, no module index is generated. 144 | #html_domain_indices = True 145 | 146 | # If false, no index is generated. 147 | #html_use_index = True 148 | 149 | # If true, the index is split into individual pages for each letter. 150 | #html_split_index = False 151 | 152 | # If true, links to the reST sources are added to the pages. 153 | #html_show_sourcelink = True 154 | 155 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 156 | #html_show_sphinx = True 157 | 158 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
159 | #html_show_copyright = True 160 | 161 | # If true, an OpenSearch description file will be output, and all pages will 162 | # contain a tag referring to it. The value of this option must be the 163 | # base URL from which the finished HTML is served. 164 | #html_use_opensearch = '' 165 | 166 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 167 | #html_file_suffix = None 168 | 169 | # Output file base name for HTML help builder. 170 | htmlhelp_basename = 'WebHooksDemodoc' 171 | 172 | 173 | # -- Options for LaTeX output -------------------------------------------------- 174 | 175 | latex_elements = { 176 | # The paper size ('letterpaper' or 'a4paper'). 177 | #'papersize': 'letterpaper', 178 | 179 | # The font size ('10pt', '11pt' or '12pt'). 180 | #'pointsize': '10pt', 181 | 182 | # Additional stuff for the LaTeX preamble. 183 | #'preamble': '', 184 | } 185 | 186 | # Grouping the document tree into LaTeX files. List of tuples 187 | # (source start file, target name, title, author, documentclass [howto/manual]). 188 | latex_documents = [ 189 | ('index', 'WebHooksDemo.tex', u'WebHooksDemo Documentation', 190 | u'Nathan Leiby', 'manual'), 191 | ] 192 | 193 | # The name of an image file (relative to this directory) to place at the top of 194 | # the title page. 195 | #latex_logo = None 196 | 197 | # For "manual" documents, if this is true, then toplevel headings are parts, 198 | # not chapters. 199 | #latex_use_parts = False 200 | 201 | # If true, show page references after internal links. 202 | #latex_show_pagerefs = False 203 | 204 | # If true, show URL addresses after external links. 205 | #latex_show_urls = False 206 | 207 | # Documents to append as an appendix to all manuals. 208 | #latex_appendices = [] 209 | 210 | # If false, no module index is generated. 211 | #latex_domain_indices = True 212 | 213 | 214 | # -- Options for manual page output -------------------------------------------- 215 | 216 | # One entry per manual page. List of tuples 217 | # (source start file, name, description, authors, manual section). 218 | man_pages = [ 219 | ('index', 'webhooksdemo', u'WebHooksDemo Documentation', 220 | [u'Nathan Leiby'], 1) 221 | ] 222 | 223 | # If true, show URL addresses after external links. 224 | #man_show_urls = False 225 | 226 | 227 | # -- Options for Texinfo output ------------------------------------------------ 228 | 229 | # Grouping the document tree into Texinfo files. List of tuples 230 | # (source start file, target name, title, author, 231 | # dir menu entry, description, category) 232 | texinfo_documents = [ 233 | ('index', 'WebHooksDemo', u'WebHooksDemo Documentation', 234 | u'Nathan Leiby', 'WebHooksDemo', 'One line description of project.', 235 | 'Miscellaneous'), 236 | ] 237 | 238 | # Documents to append as an appendix to all manuals. 239 | #texinfo_appendices = [] 240 | 241 | # If false, no module index is generated. 242 | #texinfo_domain_indices = True 243 | 244 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 245 | #texinfo_show_urls = 'footnote' 246 | 247 | # If true, do not generate a @detailmenu in the "Top" node's menu. 248 | #texinfo_no_detailmenu = False 249 | -------------------------------------------------------------------------------- /doc/source/dssg.rst: -------------------------------------------------------------------------------- 1 | dssg Package 2 | ============ 3 | 4 | :mod:`Machine` Module 5 | --------------------- 6 | 7 | .. 
automodule:: dssg.Machine 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`junutils` Module 13 | ---------------------- 14 | 15 | .. automodule:: dssg.junutils 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | Subpackages 21 | ----------- 22 | 23 | .. toctree:: 24 | 25 | dssg.webapp 26 | 27 | -------------------------------------------------------------------------------- /doc/source/dssg.webapp.rst: -------------------------------------------------------------------------------- 1 | webapp Package 2 | ============== 3 | 4 | :mod:`webapp` Package 5 | --------------------- 6 | 7 | .. automodule:: dssg.webapp 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`rest_api` Module 13 | ---------------------- 14 | 15 | .. automodule:: dssg.webapp.rest_api 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. pybedtools documentation master file, created by 2 | sphinx-quickstart on Tue Apr 13 18:15:46 2010. 3 | You can adapt this file completely to your liking, but it 4 | should at least contain the root `toctree` directive. 5 | 6 | Welcome to examplemodule's documentation! 7 | ========================================= 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | modules.rst 15 | 16 | .. automodule:: dssg.Machine 17 | 18 | .. autoclass:: dssg.Machine 19 | :members: 20 | -------------------------------------------------------------------------------- /doc/source/modules.rst: -------------------------------------------------------------------------------- 1 | dssg 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | dssg 8 | -------------------------------------------------------------------------------- /dssg/README.md: -------------------------------------------------------------------------------- 1 | Machine Learning Module 2 | ==== 3 | 4 | `machine.py` is the core class. A `machine` has methods which enable 5 | 6 | - suggesting categories 7 | - suggesting locations 8 | - suggesting entities (person names, political groups, and more) 9 | - detecting language 10 | - detecting near-duplicate messages 11 | 12 | The other files in this directory (`classifier.py`, `vectorizer.py`, and `platt.py`) are the classes which compose the classifier and allow category prediction. The underlying algorithm is a support vector machine, which is extensively documented in the wiki. 
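To make the README's description concrete, here is a minimal sketch of wiring a shipped classifier pickle into a machine, mirroring the loading logic in `dssg/__init__.py` below; the pickle path is one of the files under `dssg/data/classifier/`:

```python
from dssg import util
from dssg.machine import Machine

# load_pickle() returns a dict whose 'categoryClassifier' entry holds the
# trained DssgCategoryClassifier (see load_config() in dssg/__init__.py).
_dict = util.load_pickle("dssg/data/classifier/election_v001.pkl")
machine = Machine(_dict["categoryClassifier"])
```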
-------------------------------------------------------------------------------- /dssg/__init__.py: -------------------------------------------------------------------------------- 1 | import ConfigParser 2 | import os 3 | 4 | from flask.ext.sqlalchemy import SQLAlchemy 5 | 6 | import util as util 7 | from classifier import DssgCategoryClassifier 8 | from machine import Machine 9 | 10 | db = None 11 | machine = None 12 | category_classifier = None 13 | 14 | 15 | def load_config(app, config_file): 16 | """Loads the configuration from the specified file and sets the 17 | properties of ```app```, ```db``` and ```machine``` application objects 18 | 19 | :param app: the flask application object 20 | :param config_file: the absolute path to the configuration file 21 | """ 22 | global db, machine, category_classifier 23 | 24 | config = ConfigParser.SafeConfigParser() 25 | 26 | try: 27 | config.readfp(open(config_file)) 28 | except IOError as e: 29 | app.logger.error("An error while reading '%s': %s" % 30 | (config_file, e.strerror)) 31 | 32 | # Initialize the database 33 | try: 34 | database_uri = config.get('database', 'sqlalchemy.url') 35 | pool_size = config.get('database', 'sqlalchemy.pool_size') 36 | 37 | # SQLAlchemy configuration 38 | app.config['SQLALCHEMY_DATABASE_URI'] = database_uri 39 | app.config['SQLALCHEMY_POOL_SIZE'] = int(pool_size) 40 | except ConfigParser.NoSectionError as e: 41 | logger.error("The specified section does not exist", e) 42 | 43 | db = SQLAlchemy(app) 44 | 45 | # Intialize the machine 46 | classifier_file = config.get("classifier", "classifier.file") 47 | if not classifier_file is None: 48 | if os.path.exists(classifier_file): 49 | _dict = util.load_pickle(classifier_file) 50 | category_classifier = _dict['categoryClassifier'] 51 | if not isinstance(category_classifier, DssgCategoryClassifier): 52 | app.logger.error("Invalid classifier object type: %s" % 53 | type(category_classifier)) 54 | category_classifier = None 55 | return 56 | # Proceed 57 | machine = Machine(category_classifier) 58 | else: 59 | app.logger.info("The classifier file '%s' does not exist" % 60 | classifier_file) 61 | -------------------------------------------------------------------------------- /dssg/config/dssg.ini: -------------------------------------------------------------------------------- 1 | # 2 | # DSSG Configuration 3 | # 4 | # Configuration options available for your DSSG instance 5 | # 6 | 7 | [database] 8 | # Database settings 9 | sqlalchemy.url = mysql://dssgweb:dssgweb@localhost/dssg_web 10 | sqlalchemy.pool_size = 5 11 | 12 | [classifier] 13 | # Path to the trained classifier model 14 | classifier.file = /path/to/classifer.pkl -------------------------------------------------------------------------------- /dssg/config/dssg.ini.template: -------------------------------------------------------------------------------- 1 | # 2 | # DSSG Configuration 3 | # 4 | # Configuration options available for your DSSG instance 5 | # 6 | 7 | [database] 8 | # Database settings 9 | sqlalchemy.url = mysql://dssgweb:dssgweb@localhost/dssg_web 10 | sqlalchemy.pool_size = 5 11 | 12 | [classifier] 13 | # Path to the trained classifier model 14 | classifier.file = /path/to/classifer.pkl -------------------------------------------------------------------------------- /dssg/data/classifier/election_v000.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/dssg/data/classifier/election_v000.pkl -------------------------------------------------------------------------------- /dssg/data/classifier/election_v001.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/dssg/data/classifier/election_v001.pkl -------------------------------------------------------------------------------- /dssg/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .category import Category 2 | from .deployment import Deployment 3 | from .message import Message 4 | from .report import Report, ReportCategory 5 | -------------------------------------------------------------------------------- /dssg/model/base_model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from sqlalchemy import orm 4 | 5 | from dssg import db 6 | 7 | 8 | class BaseModel: 9 | 10 | """Base class for all mapped classes""" 11 | 12 | def __init__(self, **kwargs): 13 | for k, v in kwargs.iteritems(): 14 | setattr(self, k, v) 15 | 16 | @classmethod 17 | def by_id(cls, id): 18 | """Load and return by the primary key""" 19 | return db.session.query(cls).get(id) 20 | 21 | def create(self): 22 | """Saves the current object in the database""" 23 | db.session.add(self) 24 | db.session.commit() 25 | 26 | def delete(self): 27 | """Deletes current object from the database""" 28 | db.session.delete(self) 29 | db.session.commit() 30 | 31 | def as_dict(self): 32 | """Returns a dictionary representation of a database 33 | table row 34 | 35 | :rtype: dict 36 | """ 37 | _dict = {} 38 | table = orm.class_mapper(self.__class__).mapped_table 39 | for col in table.c: 40 | val = getattr(self, col.name) 41 | if isinstance(val, datetime.datetime):  # check datetime first; it is a subclass of date 42 | val = val.isoformat() 43 | elif isinstance(val, datetime.date): 44 | val = str(val) 45 | _dict[col.name] = val 46 | return _dict 47 | 48 | @classmethod 49 | def create_all(cls, entries=None): 50 | """Saves a list of objects in bulk 51 | 52 | :param entries: the list of objects to be saved 53 | """ 54 | entries = entries or []  # avoid a shared mutable default argument 55 | for row in entries: 56 | db.session.add(row) 57 | if len(entries) > 0: 58 | db.session.commit() 59 | -------------------------------------------------------------------------------- /dssg/model/category.py: -------------------------------------------------------------------------------- 1 | from dssg import db 2 | import base_model 3 | 4 | 5 | class Category(base_model.BaseModel, db.Model): 6 | 7 | """Mapping for the category table""" 8 | 9 | __tablename__ = 'category' 10 | 11 | id = db.Column( 12 | db.Integer, 13 | db.Sequence('seq_category_id'), 14 | primary_key=True) 15 | deployment_id = db.Column(db.Integer, db.ForeignKey('deployment.id')) 16 | origin_category_id = db.Column(db.Integer, nullable=False) 17 | origin_parent_id = db.Column(db.Integer, nullable=False) 18 | title = db.Column(db.String(50), nullable=False) 19 | -------------------------------------------------------------------------------- /dssg/model/deployment.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from dssg import db 4 | import base_model 5 | 6 | 7 | class Deployment(base_model.BaseModel, db.Model): 8 | 9 | __tablename__ = 'deployment' 10 | 11 | id = db.Column(db.Integer, db.Sequence('seq_deployment'),
primary_key=True) 12 | name = db.Column(db.String(50)) 13 | url = db.Column(db.String(100), nullable=False) 14 | url_hash = db.Column(db.String(32), nullable=False, unique=True) 15 | message_count = db.Column(db.Integer) 16 | report_count = db.Column(db.Integer) 17 | 18 | # One-to-many relationship definitions 19 | categories = db.relationship('Category', backref='deployment', 20 | cascade="all, delete, delete-orphan") 21 | reports = db.relationship('Report', backref='deployment', 22 | cascade="all, delete, delete-orphan") 23 | messages = db.relationship('Message', backref='deployment', 24 | cascade="all, delete, delete-orphan") 25 | 26 | def save(self): 27 | self.url_hash = hashlib.md5(self.url).hexdigest() 28 | db.session.add(self) 29 | db.session.commit() 30 | 31 | @classmethod 32 | def by_url(cls, deployment_url): 33 | """Return the deployment with the given url 34 | 35 | :param deployment_url: the url of the deployment 36 | :type deployment_url: string 37 | 38 | :returns: the deployment with the given url or None if there is 39 | no deployment with that url 40 | :rtype: dssg.model.Deployment 41 | 42 | """ 43 | # Get the MD5 hash of the deployment url 44 | url_hash = hashlib.md5(deployment_url).hexdigest() 45 | return Deployment.query.filter_by(url_hash=url_hash).first() 46 | -------------------------------------------------------------------------------- /dssg/model/message.py: -------------------------------------------------------------------------------- 1 | from dssg import db 2 | import base_model 3 | 4 | 5 | class Message(base_model.BaseModel, db.Model): 6 | 7 | __tablename__ = 'message' 8 | 9 | id = db.Column(db.Integer, db.Sequence('seq_message'), primary_key=True) 10 | deployment_id = db.Column(db.Integer, db.ForeignKey('deployment.id')) 11 | origin_message_id = db.Column(db.Integer, nullable=False) 12 | content = db.Column(db.Text, nullable=False) 13 | simhash = db.Column(db.String(64), nullable=False) 14 | -------------------------------------------------------------------------------- /dssg/model/report.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.ext.associationproxy import association_proxy 2 | 3 | import base_model 4 | from dssg import db 5 | 6 | 7 | class Report(base_model.BaseModel, db.Model): 8 | __tablename__ = 'report' 9 | id = db.Column(db.Integer, db.Sequence('seq_report'), primary_key=True) 10 | deployment_id = db.Column(db.Integer, db.ForeignKey('deployment.id')) 11 | origin_report_id = db.Column(db.Integer, nullable=False) 12 | description = db.Column(db.Text, nullable=False) 13 | title = db.Column(db.String(255), nullable=False) 14 | simhash = db.Column(db.String(64), nullable=False) 15 | 16 | # Association proxy of "report_categories" collection 17 | # to "categories" attribute 18 | categories = association_proxy('report_categories', 'category') 19 | 20 | 21 | class ReportCategory(base_model.BaseModel, db.Model): 22 | __tablename__ = 'report_category' 23 | report_id = db.Column(db.Integer, db.ForeignKey('report.id'), 24 | primary_key=True) 25 | category_id = db.Column(db.Integer, db.ForeignKey('category.id'), 26 | primary_key=True) 27 | 28 | # bi-directional attribute/collection of report/report_categories 29 | report = db.relationship(Report, 30 | backref=db.backref('report_categories', 31 | cascade='all, delete-orphan')) 32 | 33 | # Reference to the category object 34 | category = db.relationship("Category") 35 | --------------------------------------------------------------------------------
/dssg/platt.py: -------------------------------------------------------------------------------- 1 | # source: http://home.caltech.edu/~htlin/program/libsvm/doc/platt.py 2 | from sys import argv 3 | from math import log, exp 4 | from string import atof 5 | from random import randrange 6 | #--[Basic Function]------------------------------------------------------- 7 | 8 | 9 | def SigmoidTrain(deci, label, prior1=None, prior0=None): 10 | """ 11 | input decision_values, real_labels{1,-1}, #positive_instances, #negative_instances 12 | output [A,B] that minimize sigmoid likelihood 13 | refer to Platt's Probabilistic Outputs for Support Vector Machines 14 | """ 15 | 16 | # Count prior0 and prior1 if needed 17 | if prior1 is None or prior0 is None: 18 | prior1, prior0 = 0, 0 19 | for i in range(len(label)): 20 | if label[i] > 0: 21 | prior1 += 1 22 | else: 23 | prior0 += 1 24 | 25 | # Parameter Setting 26 | maxiter = 100 # Maximum number of iterations 27 | minstep = 1e-10 # Minimum step taken in line search 28 | sigma = 1e-12 # For numerically strict PD of Hessian 29 | eps = 1e-5 30 | 31 | # Construct Target Support 32 | hiTarget = (prior1 + 1.0) / (prior1 + 2.0) 33 | loTarget = 1 / (prior0 + 2.0) 34 | length = prior1 + prior0 35 | t = [] 36 | 37 | for i in range(length): 38 | if label[i] > 0: 39 | t.append(hiTarget) 40 | else: 41 | t.append(loTarget) 42 | 43 | # Initial Point and Initial Fun Value 44 | A, B = 0.0, log((prior0 + 1.0) / (prior1 + 1.0)) 45 | fval = 0.0 46 | 47 | for i in range(length): 48 | fApB = deci[i] * A + B 49 | if fApB >= 0: 50 | fval += t[i] * fApB + log(1 + exp(-fApB)) 51 | else: 52 | fval += (t[i] - 1) * fApB + log(1 + exp(fApB)) 53 | 54 | for it in range(maxiter): 55 | # Update Gradient and Hessian (use H' = H + sigma I) 56 | h11 = h22 = sigma # Numerically ensures strict PD 57 | h21 = g1 = g2 = 0.0 58 | for i in range(length): 59 | fApB = deci[i] * A + B 60 | if (fApB >= 0): 61 | p = exp(-fApB) / (1.0 + exp(-fApB)) 62 | q = 1.0 / (1.0 + exp(-fApB)) 63 | else: 64 | p = 1.0 / (1.0 + exp(fApB)) 65 | q = exp(fApB) / (1.0 + exp(fApB)) 66 | d2 = p * q 67 | h11 += deci[i] * deci[i] * d2 68 | h22 += d2 69 | h21 += deci[i] * d2 70 | d1 = t[i] - p 71 | g1 += deci[i] * d1 72 | g2 += d1 73 | 74 | # Stopping Criteria 75 | if abs(g1) < eps and abs(g2) < eps: 76 | break 77 | 78 | # Finding Newton direction: -inv(H') * g 79 | det = h11 * h22 - h21 * h21 80 | dA = -(h22 * g1 - h21 * g2) / det 81 | dB = -(-h21 * g1 + h11 * g2) / det 82 | gd = g1 * dA + g2 * dB 83 | 84 | # Line Search 85 | stepsize = 1 86 | while stepsize >= minstep: 87 | newA = A + stepsize * dA 88 | newB = B + stepsize * dB 89 | 90 | # New function value 91 | newf = 0.0 92 | for i in range(length): 93 | fApB = deci[i] * newA + newB 94 | if fApB >= 0: 95 | newf += t[i] * fApB + log(1 + exp(-fApB)) 96 | else: 97 | newf += (t[i] - 1) * fApB + log(1 + exp(fApB)) 98 | 99 | # Check sufficient decrease 100 | if newf < fval + 0.0001 * stepsize * gd: 101 | A, B, fval = newA, newB, newf 102 | break 103 | else: 104 | stepsize = stepsize / 2.0 105 | 106 | if stepsize < minstep: 107 | print "line search fails", A, B, g1, g2, dA, dB, gd 108 | return [A, B] 109 | 110 | if it >= maxiter - 1: 111 | print "reaching maximal iterations", g1, g2 112 | return [A, B] 113 | 114 | 115 | def SigmoidPredict(deci, AB): 116 | """ 117 | reads decision_value and Platt parameter [A,B] 118 | outputs predicted probability 119 | """ 120 | A, B = AB 121 | fApB = deci * A + B 122 | if (fApB >= 0): 123 | return exp(-fApB) / (1.0 + exp(-fApB)) 124 | else: 125 | return 1.0 / (1 + exp(fApB))
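126 |  127 | # --[Illustrative usage]---------------------------------------------------- 128 | # A hypothetical example (not part of the original script): SigmoidTrain 129 | # fits [A, B] so that the calibrated probability of the positive class is 130 | # P(y=1|f) = 1 / (1 + exp(A*f + B)) for an SVM decision value f; 131 | # SigmoidPredict evaluates that sigmoid for a new decision value. 132 | # 133 | # deci = [2.0, 1.5, -1.0, -2.5]  # SVM decision values 134 | # label = [1, 1, -1, -1]  # true labels in {1, -1} 135 | # A, B = SigmoidTrain(deci, label) 136 | # print SigmoidPredict(0.8, [A, B])  # calibrated P(y=1), in (0, 1)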
137 | -------------------------------------------------------------------------------- /dssg/tests/__init__.py: -------------------------------------------------------------------------------- 1 | from flask.ext.sqlalchemy import SQLAlchemy 2 | 3 | import dssg 4 | from dssg.webapp import app 5 | 6 | app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:////tmp/test.db' 7 | dssg.db = SQLAlchemy(app) 8 | 9 | # Import the models after initializing SQLAlchemy 10 | import dssg.model as _model 11 | 12 | 13 | def setup_module(): 14 | """Creates the model mapping and the schema tables before the tests run""" 15 | dssg.db.create_all() 16 | 17 | 18 | def teardown_module(): 19 | """Drop all the schema tables""" 20 | dssg.db.drop_all() 21 | 22 | 23 | def create_deployment(name, url): 24 | deployment = _model.Deployment(name=name, url=url) 25 | deployment.save() 26 | 27 | return deployment 28 | -------------------------------------------------------------------------------- /dssg/tests/test_base_model.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equals 2 | from sqlalchemy import func 3 | 4 | from dssg import db 5 | from dssg.model import Category 6 | 7 | 8 | def test_bulk_create(): 9 | """Tests creation of several items in a single batch""" 10 | categories = [] 11 | categories.append(Category(title='category 1', 12 | origin_category_id=1, 13 | origin_parent_id=0)) 14 | categories.append(Category(title='category 2', 15 | origin_category_id=2, 16 | origin_parent_id=0)) 17 | categories.append(Category(title='category 2 child', 18 | origin_category_id=3, 19 | origin_parent_id=2)) 20 | Category.create_all(categories) 21 | 22 | count = db.session.query(func.count('*')).select_from(Category).scalar() 23 | assert_equals(count, 3) 24 | -------------------------------------------------------------------------------- /dssg/tests/test_deployment.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equals 2 | 3 | import dssg.tests as _test 4 | from dssg.model import Deployment 5 | 6 | 7 | class TestDeployment: 8 | 9 | @classmethod 10 | def setup_class(cls): 11 | """Set up preliminary test data""" 12 | _test.create_deployment( 13 | 'ushine integration', 14 | 'http://dssg.ushahididev.com') 15 | _test.create_deployment('Nigeria budget monitoring', 16 | 'https://monitoringbudget.crowdmap.com') 17 | 18 | def test_save(self): 19 | """Tests creation of a new deployment entry""" 20 | out = _test.create_deployment('uchaguzi 2013', 21 | 'http://uchaguzi.co.ke') 22 | assert_equals(out.url_hash, '87888b7f4d65d4947cde38b99e201544') 23 | 24 | def test_by_url(self): 25 | """Tests finding a deployment by its url""" 26 | result = Deployment.by_url('http://dssg.ushahididev.com') 27 | assert_equals(result.url_hash, 'bc2f8da9e34c3fe1ec5fdc2d1fea23c1') 28 | 29 | def test_as_dict(self): 30 | deployment = Deployment.by_url('https://monitoringbudget.crowdmap.com') 31 | _dict = deployment.as_dict() 32 | # assert_equals('categories' in _dict, True) 33 | # assert_equals('reports' in _dict, True) 34 | # assert_equals('messages' in _dict, True) 35 | assert_equals(_dict['name'], 'Nigeria budget monitoring') 36 | -------------------------------------------------------------------------------- /dssg/tests/test_extract_from_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Verify
private data stripping functionality 6 | """ 7 | 8 | from nose.tools import * 9 | from dssg.machine import Machine 10 | 11 | 12 | # 13 | # Setup / Teardown 14 | # 15 | 16 | def setup(): 17 | print "setup!" 18 | 19 | 20 | def teardown(): 21 | print "tear down!" 22 | 23 | # 24 | # Tests 25 | # 26 | 27 | 28 | def test_canExtractUrls(): 29 | text = "Awesome post talking about http://www.mysite.com which is full of private info!" 30 | extract_fn = Machine._extract_urls 31 | expected = [('URL', 'http://www.mysite.com')] 32 | _helper_canExtractX(text, extract_fn, expected) 33 | 34 | 35 | def test_canExtractEntityGroups(): 36 | text = "My name is Indigo Montoya. I am from the Congo." 37 | extract_fn = Machine._extract_entity_groups 38 | expected = ['PERSON', 'GSP'] 39 | _helper_canExtractX(text, extract_fn, expected) 40 | 41 | 42 | def test_canExtractEntities(): 43 | text = "My name is Indigo Montoya. I am from the Congo." 44 | extract_fn = Machine._extract_entities 45 | expected = [('PERSON', 'Indigo Montoya'), ('GSP', 'Congo')] 46 | _helper_canExtractX(text, extract_fn, expected) 47 | 48 | 49 | def test_canExtractIds(): 50 | text = "Oh my gosh I accidentally included my credit card number 14320099 and passport P123411." 51 | extract_fn = Machine._extract_ids 52 | expected = [('ID', '14320099'), ('ID', 'P123411')] 53 | _helper_canExtractX(text, extract_fn, expected) 54 | 55 | 56 | def test_canExtractUsernames(): 57 | text = "RT best tweet evarrrr @123fake @justinbieber @BarackObama." 58 | extract_fn = Machine._extract_usernames 59 | expected = [('TWITTER', '@justinbieber'), ( 60 | 'TWITTER', '@BarackObama'), ('TWITTER', '@123fake')] 61 | _helper_canExtractX(text, extract_fn, expected) 62 | 63 | 64 | def test_canExtractEmails(): 65 | text = "Hello my email is fakeperson@example.com and I am here." 66 | extract_fn = Machine._extract_emails 67 | expected = [('EMAIL', 'fakeperson@example.com')] 68 | _helper_canExtractX(text, extract_fn, expected) 69 | 70 | 71 | def test_canExtractPhones(): 72 | text = "This is my phone number 555-555-3333!" 73 | extract_fn = Machine._extract_phones 74 | expected = [('PHONE', '555-555-3333')] 75 | _helper_canExtractX(text, extract_fn, expected) 76 | 77 | # 78 | # Helpers 79 | # 80 | 81 | 82 | def _helper_canExtractX(text='', extract_fn=None, expected=None): 83 | actual = extract_fn(text) 84 | 85 | assert(set(expected) == set( 86 | actual)), "set(expected) != set(actual).\n set(expected): %s, set(actual): %s" % (set(expected), set(actual)) 87 | assert(len(expected) == len( 88 | actual)), "len(expected) != len(actual).\n len(expected): %s, actual: %s" % (len(expected), len(actual)) 89 | -------------------------------------------------------------------------------- /dssg/tests/test_machine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from nose.tools import * 5 | from dssg.machine import Machine 6 | 7 | # 8 | # Setup / Teardown 9 | # 10 | 11 | 12 | def setup(): 13 | print "setup!" 14 | 15 | 16 | def teardown(): 17 | print "tear down!" 
18 | 19 | # 20 | # Tests 21 | # 22 | 23 | 24 | def test_canCreateMachine(): 25 | mac = Machine() 26 | assert (mac is not None) 27 | 28 | 29 | def test_canGuessLanguage(): 30 | mac = Machine() 31 | text = "Hello world this is definitely some English text" 32 | g = mac.guess_language(text) 33 | assert (g is not None) 34 | 35 | 36 | def test_canGuessEntities(): 37 | # Note: test takes a few seconds to run - loading NLTK is slow 38 | mac = Machine() 39 | text = "The United States is a country. Thomas Jefferson was a president. This is definitely Lower Wacker Drive." 40 | g = mac.guess_entities(text) 41 | assert (g is not None) 42 | 43 | 44 | def test_canStripPrivateInfo(): 45 | mac = Machine() 46 | # TODO: Change 'text' to have all the types of private info, not just URL. 47 | text = "This post talks about http://www.mysite.com which is full of private info!" 48 | 49 | actual = mac.guess_private_info(text) 50 | expected = [('URL', 'http://www.mysite.com')] 51 | 52 | assert(set(expected) == set( 53 | actual)), "set(expected) != set(actual).\n set(expected): %s, set(actual): %s" % (set(expected), set(actual)) 54 | assert(len(expected) == len( 55 | actual)), "len(expected) != len(actual).\n len(expected): %s, actual: %s" % (len(expected), len(actual)) 56 | 57 | # 58 | # Helpers 59 | # 60 | -------------------------------------------------------------------------------- /dssg/tests/test_pep8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | import pep8 4 | 5 | # from: https://github.com/dssg/tweedr/blob/master/tests/syntax.py 6 | 7 | 8 | class TestFormatting(unittest.TestCase): 9 | 10 | def test_pep8(self): 11 | basepath = os.getcwd() 12 | print 'Running PEP-8 checks on path', basepath 13 | path_list = [basepath + '/dssg', basepath + '/dssg/tests'] 14 | pep8style = pep8.StyleGuide(paths=path_list, ignore=['E128', 'E501']) 15 | report = pep8style.check_files() 16 | if report.total_errors: 17 | print report.total_errors 18 | 19 | self.assertEqual( 20 | report.total_errors, 0, 'Codebase does not pass PEP-8') 21 | -------------------------------------------------------------------------------- /dssg/tests/test_rest_api.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | These test cases focus on the REST API. They aim to enforce: 6 | 7 | - consistency in the returned JSON objects, corresponding to documentation / user expectations. 8 | - proper and consistent handling of poorly formed REST calls. 9 | 10 | """ 11 | 12 | 13 | from dssg.webapp.rest_api import app 14 | 15 | import unittest 16 | import json 17 | from pprint import pprint 18 | 19 | API_VERSION = '/v1' 20 | 21 | 22 | class TestRestApi(unittest.TestCase): 23 | 24 | # 25 | # Setup / Teardown 26 | # 27 | 28 | def setUp(self): 29 | self.app = app.test_client() 30 | 31 | def tearDown(self): 32 | pass 33 | 34 | # 35 | # Tests 36 | # 37 | 38 | def test_detect_language(self): 39 | text = "This is a posting in the English language, so the most likely language should be English."
40 | send_json = json.dumps({ 41 | 'text': text 42 | }) 43 | rv = self.app.post( 44 | API_VERSION + '/language', data=send_json, content_type='application/json') 45 | 46 | return_json = json.loads(rv.data) 47 | 48 | key_lang = 'language' 49 | key_conf = 'confidence' 50 | 51 | # self.assertIsInstance( 52 | # json_val, list, 'Should return a list of languages, under "%s" key in 53 | # json' % (json_key,)) 54 | 55 | # self.assertIsInstance( 56 | # json_val[0], list, 'Each item is a list') 57 | # 58 | self.assertIsInstance( 59 | return_json, dict, 'Should return a dict') 60 | 61 | self.assertTrue( 62 | key_lang in return_json, 'Json should have key "%s"' % (key_lang,)) 63 | 64 | self.assertTrue( 65 | key_conf in return_json, 'Json should have key "%s"' % (key_conf,)) 66 | 67 | # TODO: Should be unicode, or str? 68 | actual = type(return_json[key_lang]) 69 | expected = unicode 70 | self.assertIs( 71 | actual, expected, '"%s" element should be a %s, instead was %s' % (key_lang, expected, actual)) 72 | 73 | self.assertEqual( 74 | len(return_json[key_lang]), 2, '"%s" element should be a 2-letter language code' % (key_lang,)) 75 | 76 | self.assertIs( 77 | type(return_json[key_conf]), float, '"confidence" value should be a float, 0-to-1 probability') 78 | 79 | self.assertTrue(return_json[key_conf] >= 0 and return_json[ 80 | key_conf] <= 1, '"confidence" value should be in range 0-to-1, because it is a probability') 81 | 82 | # TODO: move to non-API tests. belongs in machine tests 83 | # validity of method belongs in machine tests, while validity of API 84 | # interface goes here 85 | self.assertEqual( 86 | return_json[key_lang], 'en', '"language" value should be English') 87 | 88 | # TODO: move to non-API tests, belongs in machine tests 89 | # expected_language_count = 97 # using langid 90 | # self.assertEqual( 91 | # len(json_val), expected_language_count, '%s languages should be 92 | # returned, but instead got %s' % (expected_language_count, 93 | # len(json_val))) 94 | 95 | def test_suggest_locations(self): 96 | text = "My name is Indigo Montoya. I am from the Congo." 97 | send_json = json.dumps({ 98 | 'text': text 99 | }) 100 | rv = self.app.post( 101 | API_VERSION + '/locations', data=send_json, content_type='application/json') 102 | 103 | print rv 104 | print rv.data 105 | 106 | return_json = json.loads(rv.data) 107 | json_key = 'locations' 108 | json_val = return_json[json_key] 109 | 110 | key_gsp = "GSP" 111 | 112 | pprint(json_val) 113 | 114 | self.assertIsInstance( 115 | json_val, dict, 'Should return a dict of location entity types') 116 | 117 | self.assertIsInstance( 118 | json_val[key_gsp], list, 'Each entity type is composed of a list of items') 119 | 120 | # self.assertIsInstance( 121 | # json_val, list, 'Should return a list of locations, under "%s" key in 122 | # json' % (json_key,)) 123 | 124 | # self.assertIsInstance( 125 | # json_val[0], list, 'Each item is a list') 126 | 127 | # TODO: Should be unicode, or str? 128 | actual = type(json_val[key_gsp][0]) 129 | expected = unicode 130 | self.assertIs( 131 | actual, expected, 'First element should be a %s, instead was %s' % (expected, actual)) 132 | 133 | actual = json_val.keys()[0] 134 | # TODO: fetch from machine.py, so stays in sync?
135 | expected = ['LOCATION', 'GPE', 'GSP'] # location entity types 136 | self.assertIn( 137 | actual, expected, 'Dict key should be a code for location entity type') 138 | 139 | self.assertIs( 140 | type(json_val[key_gsp][0]), unicode, 'List item should be unicode text for entity name') 141 | 142 | # TODO: move to non-API tests. belongs in machine tests 143 | self.assertEqual( 144 | json_val[key_gsp][0], 'Congo', 'First value should be Congo') 145 | 146 | expected_count = 1 147 | self.assertEqual( 148 | len(json_val[key_gsp]), expected_count, '%s locations should be returned, but instead got %s' % (expected_count, len(json_val))) 149 | 150 | # TODO: etc... to enforce all REST endpoints 151 | def test_suggest_sensitive_info(self): 152 | pass 153 | 154 | def test_extract_entities(self): 155 | pass 156 | 157 | def test_suggest_categories(self): 158 | pass 159 | 160 | def test_similar_messages(self): 161 | pass 162 | 163 | if __name__ == '__main__': 164 | unittest.main() 165 | -------------------------------------------------------------------------------- /dssg/util.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import codecs 3 | import cStringIO 4 | import os 5 | import sys 6 | import json 7 | import copy 8 | import logging 9 | import cPickle as pickle 10 | from datetime import datetime 11 | 12 | FORMAT = '%(levelname)s: %(message)s' 13 | logging.basicConfig(format=FORMAT, stream=sys.stdout, level=logging.INFO) 14 | 15 | uchaguziPath = 'data/processed/uchaguzi/uchaguzi-message_with_category_list.csv' 16 | uchaguziJsonPath = 'data/processed/uchaguzi_new.json' 17 | uchaguziCategoryJsonPath = 'data/processed/uchaguzi_new_categories.json' 18 | 19 | # 20 | # Pickle 21 | # 22 | 23 | 24 | def loadPickle(fileName): 25 | """ Load a pickle file. """ 26 | return load_pickle(fileName) 27 | 28 | 29 | def load_pickle(fileName): 30 | """ Load a pickle file. """ 31 | with open(fileName, 'rb') as f: 32 | varDic = pickle.load(f) 33 | return varDic 34 | 35 | 36 | def savePickle(var, fileName, protocol=0): 37 | """ Saves a pickle file """ 38 | f = open(fileName, 'wb') 39 | pickle.dump(var, f, protocol=protocol) 40 | f.close() 41 | 42 | # 43 | # for CSV 44 | # 45 | 46 | 47 | class UTF8Recoder: 48 | 49 | """ 50 | Iterator that reads an encoded stream and reencodes the input to UTF-8 51 | """ 52 | 53 | def __init__(self, f, encoding): 54 | self.reader = codecs.getreader(encoding)(f) 55 | 56 | def __iter__(self): 57 | return self 58 | 59 | def next(self): 60 | return self.reader.next().encode("utf-8") 61 | 62 | 63 | class UnicodeReader: 64 | 65 | """ 66 | A CSV reader which will iterate over lines in the CSV file "f", 67 | which is encoded in the given encoding. 68 | """ 69 | 70 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): 71 | f = UTF8Recoder(f, encoding) 72 | self.reader = csv.reader(f, dialect=dialect, **kwds) 73 | 74 | def next(self): 75 | row = self.reader.next() 76 | return [unicode(s, "utf-8") for s in row] 77 | 78 | def __iter__(self): 79 | return self 80 | 81 | 82 | class UnicodeWriter: 83 | 84 | """ 85 | A CSV writer which will write rows to CSV file "f", 86 | which is encoded in the given encoding. 
87 | """ 88 | 89 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): 90 | # Redirect output to a queue 91 | self.queue = cStringIO.StringIO() 92 | self.writer = csv.writer(self.queue, dialect=dialect, **kwds) 93 | self.stream = f 94 | self.encoder = codecs.getincrementalencoder(encoding)() 95 | 96 | def writerow(self, row): 97 | self.writer.writerow([s.encode("utf-8") for s in row]) 98 | # Fetch UTF-8 output from the queue ... 99 | data = self.queue.getvalue() 100 | data = data.decode("utf-8") 101 | # ... and reencode it into the target encoding 102 | data = self.encoder.encode(data) 103 | # write to the target stream 104 | self.stream.write(data) 105 | # empty queue 106 | self.queue.truncate(0) 107 | 108 | def writerows(self, rows): 109 | for row in rows: 110 | self.writerow(row) 111 | 112 | 113 | def UnicodeDictReader(utf8_data, **kwargs): 114 | """ 115 | DictReader that works with unicode. 116 | """ 117 | csv_reader = csv.DictReader(utf8_data, **kwargs) 118 | for row in csv_reader: 119 | yield dict([(key, unicode(value, 'utf-8')) for key, value in row.iteritems()]) 120 | 121 | # 122 | # utility functions 123 | # 124 | 125 | 126 | def tic(): 127 | """ 128 | Equivalent to Matlab's tic. It starts measuring time and 129 | returns a handle to the starting time point. 130 | """ 131 | global gStartTime 132 | gStartTime = datetime.utcnow() 133 | return gStartTime 134 | 135 | 136 | def toc(prev=None): 137 | """ 138 | Returns the elapsed time in seconds, from the previous call of tic() to the current call of toc(). 139 | You can optionally specify the handle of a different starting point. 140 | """ 141 | if prev is None: 142 | prev = gStartTime 143 | return (datetime.utcnow() - prev).total_seconds() 144 | 145 | 146 | def unicodeToAscii(unicodeStr): 147 | """ 148 | Converts a unicode string to ASCII. When it fails to do so for certain 149 | characters, it replaces them with a white space. This is particularly 150 | useful when using simhash, since it only works well with ASCII text. 151 | """ 152 | s = copy.copy(unicodeStr) 153 | while True: 154 | try: 155 | ret = codecs.encode(s, 'ascii') 156 | return ret 157 | except UnicodeEncodeError as e: 158 | s = s.replace(e.object[e.start:e.end], ' ') 159 | return None 160 | 161 | 162 | def loadIncidentListFromCsv(path): 163 | """ 164 | DEPRECATED 165 | Returns a list of incidents. An incident is [incidentId, message, 166 | categoryIdList, categoryTitleList]. However, the categories may contain 167 | duplicates; they are not meaningful and should be removed, but this 168 | function does not remove them. 169 | """ 170 | incidentList = [] 171 | with open(path, 'r') as fp: 172 | reader = csv.reader(fp, delimiter=';', quotechar='"') 173 | bFirstLine = True 174 | for row in reader: 175 | if (bFirstLine): # skip the first line 176 | bFirstLine = False 177 | continue 178 | assert(len(row) == 4) 179 | incidentId = int(row[0]) 180 | message = row[1] 181 | # Note that there are duplicates. 182 | categoryIdList = map(lambda x: int(x), row[2].split(',')) 183 | categoryTitleList = map(lambda x: x.strip(), row[3].split(',')) 184 | assert(len(categoryIdList) == len(categoryTitleList)) 185 | 186 | incidentList.append([incidentId, message, categoryIdList, 187 | categoryTitleList]) 188 | return incidentList 189 | 190 | 191 | def getFullMessagesFromJson(path): 192 | """ 193 | Read JSON report data from path. 194 | """ 195 | with open(path, 'r') as fp: 196 | data = json.load(fp) 197 | messageList = [] 198 | keyList = sorted(data.keys()) # - sort by keys.
199 | for k in keyList: 200 | v = data[k] 201 | v[u'id'] = k 202 | messageList.append(v) 203 | return messageList 204 | 205 | 206 | def countTruth(boolFunc, aList): 207 | """ 208 | Counts the number of elements in `aList` for which `boolFunc` 209 | returns `True` 210 | """ 211 | return len(filter(boolFunc, aList)) 212 | 213 | 214 | def isAllNumbers(aStr): 215 | """ 216 | Returns True if `aStr` consists entirely of digits 217 | """ 218 | aStr = aStr.strip() 219 | if (len(aStr) == countTruth(lambda x: x >= '0' and x <= '9', aStr)): 220 | return True 221 | else: 222 | return False 223 | 224 | 225 | def loadJsonFromPath(path): 226 | """ 227 | Load a JSON object from given `path` 228 | """ 229 | with open(path, 'r') as fp: 230 | data = json.load(fp) 231 | return data 232 | 233 | 234 | def getFullMessagesFromUchaguziMergedCategory( 235 | uchaguziJsonPath, uchaguziCategoryJsonPath): 236 | """ 237 | DEPRECATED 238 | """ 239 | messageList = getFullMessagesFromJson(uchaguziJsonPath) 240 | #- map 'Polling station logisitcal issues' to 'Polling Station Logistical Issues' 241 | #- remove duplicate category labels 242 | for msg in messageList: 243 | categories = msg['categories'] 244 | for j in range(len(categories)): 245 | if categories[j] == 'Polling station logisitcal issues': 246 | categories[j] = 'Polling Station Logistical Issues' 247 | msg['categories'] = list(set(categories)) 248 | 249 | #--- these are selected categories. let's transform 250 | selectedCategories = [('parent', 'Counting + Results'), 251 | ('parent', 'Fear and Tension'), 252 | ('parent', 'POSITIVE EVENTS'), 253 | ('parent', 'Polling Station Administration'), 254 | ('parent', 'Security Issues'), 255 | ('parent', 'Staffing Issues'), 256 | ('parent', 'Voting Issues'), 257 | ('leaf', 'Resolved'), 258 | ('leaf', 'Unresolved')] 259 | 260 | categories = loadJsonFromPath(uchaguziCategoryJsonPath) 261 | categoryByName = dict([(cat['category_title'], cat) for cat in categories]) 262 | categoryById = dict([(cat['id'], cat) for cat in categories]) 263 | 264 | #--- create mappings 265 | catMap = {} 266 | for selectedCat in selectedCategories: 267 | catType = selectedCat[0] 268 | catName = selectedCat[1] 269 | id = categoryByName[catName]['id'] 270 | if (catType == 'parent'): 271 | #- find all categories that fall below it, or itself. 272 | for item in categories: 273 | if (item['parent_id'] == id or item['id'] == id): 274 | catMap[item['category_title']] = catName 275 | elif (catType == 'leaf'): 276 | catMap[catName] = catName 277 | else: 278 | assert False 279 | 280 | logging.info('Constructed mapping') 281 | 282 | #--- apply mappings 283 | ignoredLabelSet = set() 284 | for msg in messageList: 285 | labelList = msg['categories'] 286 | newLabelSet = set() 287 | for label in labelList: 288 | if (label in catMap): 289 | newLabelSet.add(catMap[label]) 290 | else: 291 | ignoredLabelSet.add(label) 292 | msg['categories'] = list(newLabelSet) 293 | 294 | logging.info('Ignored labels: %s', str(ignoredLabelSet)) 295 | 296 | return messageList 297 | 298 | 299 | def loadDatasetWithMappedCategories(dsetJsonPath, mappedCategoryPath): 300 | """ 301 | Load a dataset while mapping specific categories to more general, 302 | common categories.
303 | """ 304 | #---- read dataset 305 | messageList = getFullMessagesFromJson(dsetJsonPath) 306 | for msg in messageList: 307 | msg['categories'] = list(set(msg['categories'])) 308 | 309 | #---- read mappedCategory 310 | catMap = {} 311 | with open(mappedCategoryPath, 'rb') as inf: 312 | csvReader = UnicodeDictReader(inf) 313 | #headers = csvReader.fieldnames; 314 | for row in csvReader: 315 | # json.json.dumps(row) 316 | engCat = row['Category (English)'] 317 | superCat = row['Super Category'] 318 | assert (superCat is not None and superCat != '') 319 | catMap[engCat] = superCat 320 | 321 | #---- apply mapping 322 | for msg in messageList: 323 | catList = msg['categories'] 324 | newCatSet = set() 325 | for cat in catList: 326 | mappedCat = catMap[cat] 327 | if (mappedCat not in ['Other', '?']): 328 | newCatSet.add(mappedCat) 329 | msg['categories'] = list(newCatSet) 330 | 331 | return messageList 332 | -------------------------------------------------------------------------------- /dssg/webapp/README.md: -------------------------------------------------------------------------------- 1 | Flask Webapp 2 | ==== 3 | 4 | `server.py` runs the webapp and `dssg/webapp/rest_api.py` defines the API. 5 | 6 | The webapp serves recommendations in response to POST requests, via REST. It also saves a local copy of much of the data, via SQLAlchemy. 7 | 8 | Both access the toolkit and machine learning functionality we provide. There is also a SQLAlchemy database that mirrors the important information from the Ushahidi app, which we need for (1) updating the classifier and (2) detecting duplicate messages. 9 | -------------------------------------------------------------------------------- /dssg/webapp/__init__.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, make_response 2 | 3 | # Flask application instance 4 | app = Flask(__name__) 5 | 6 | 7 | @app.errorhandler(404) 8 | def not_found(error): 9 | return make_response(jsonify({'error': 'Not found'}), 404) 10 | 11 | 12 | @app.errorhandler(400) 13 | def bad_request(error): 14 | return make_response(jsonify({'error': 'Bad request'}), 400) 15 | -------------------------------------------------------------------------------- /nltk_data/chunkers/maxent_ne_chunker/english_ace_multiclass.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/nltk_data/chunkers/maxent_ne_chunker/english_ace_multiclass.pickle -------------------------------------------------------------------------------- /nltk_data/corpora/words/README: -------------------------------------------------------------------------------- 1 | Wordlists 2 | 3 | en: English 4 | en-basic: 850 English words: C.K.
Ogden in The ABC of Basic English (1932) 5 | -------------------------------------------------------------------------------- /nltk_data/corpora/words/en-basic: -------------------------------------------------------------------------------- 1 | I 2 | a 3 | able 4 | about 5 | account 6 | acid 7 | across 8 | act 9 | addition 10 | adjustment 11 | advertisement 12 | after 13 | again 14 | against 15 | agreement 16 | air 17 | all 18 | almost 19 | among 20 | amount 21 | amusement 22 | and 23 | angle 24 | angry 25 | animal 26 | answer 27 | ant 28 | any 29 | apparatus 30 | apple 31 | approval 32 | arch 33 | argument 34 | arm 35 | army 36 | art 37 | as 38 | at 39 | attack 40 | attempt 41 | attention 42 | attraction 43 | authority 44 | automatic 45 | awake 46 | baby 47 | back 48 | bad 49 | bag 50 | balance 51 | ball 52 | band 53 | base 54 | basin 55 | basket 56 | bath 57 | be 58 | beautiful 59 | because 60 | bed 61 | bee 62 | before 63 | behaviour 64 | belief 65 | bell 66 | bent 67 | berry 68 | between 69 | bird 70 | birth 71 | bit 72 | bite 73 | bitter 74 | black 75 | blade 76 | blood 77 | blow 78 | blue 79 | board 80 | boat 81 | body 82 | boiling 83 | bone 84 | book 85 | boot 86 | bottle 87 | box 88 | boy 89 | brain 90 | brake 91 | branch 92 | brass 93 | bread 94 | breath 95 | brick 96 | bridge 97 | bright 98 | broken 99 | brother 100 | brown 101 | brush 102 | bucket 103 | building 104 | bulb 105 | burn 106 | burst 107 | business 108 | but 109 | butter 110 | button 111 | by 112 | cake 113 | camera 114 | canvas 115 | card 116 | care 117 | carriage 118 | cart 119 | cat 120 | cause 121 | certain 122 | chain 123 | chalk 124 | chance 125 | change 126 | cheap 127 | cheese 128 | chemical 129 | chest 130 | chief 131 | chin 132 | church 133 | circle 134 | clean 135 | clear 136 | clock 137 | cloth 138 | cloud 139 | coal 140 | coat 141 | cold 142 | collar 143 | colour 144 | comb 145 | come 146 | comfort 147 | committee 148 | common 149 | company 150 | comparison 151 | competition 152 | complete 153 | complex 154 | condition 155 | connection 156 | conscious 157 | control 158 | cook 159 | copper 160 | copy 161 | cord 162 | cork 163 | cotton 164 | cough 165 | country 166 | cover 167 | cow 168 | crack 169 | credit 170 | crime 171 | cruel 172 | crush 173 | cry 174 | cup 175 | current 176 | curtain 177 | curve 178 | cushion 179 | cut 180 | damage 181 | danger 182 | dark 183 | daughter 184 | day 185 | dead 186 | dear 187 | death 188 | debt 189 | decision 190 | deep 191 | degree 192 | delicate 193 | dependent 194 | design 195 | desire 196 | destruction 197 | detail 198 | development 199 | different 200 | digestion 201 | direction 202 | dirty 203 | discovery 204 | discussion 205 | disease 206 | disgust 207 | distance 208 | distribution 209 | division 210 | do 211 | dog 212 | door 213 | doubt 214 | down 215 | drain 216 | drawer 217 | dress 218 | drink 219 | driving 220 | drop 221 | dry 222 | dust 223 | ear 224 | early 225 | earth 226 | east 227 | edge 228 | education 229 | effect 230 | egg 231 | elastic 232 | electric 233 | end 234 | engine 235 | enough 236 | equal 237 | error 238 | even 239 | event 240 | ever 241 | every 242 | example 243 | exchange 244 | existence 245 | expansion 246 | experience 247 | expert 248 | eye 249 | face 250 | fact 251 | fall 252 | false 253 | family 254 | far 255 | farm 256 | fat 257 | father 258 | fear 259 | feather 260 | feeble 261 | feeling 262 | female 263 | fertile 264 | fiction 265 | field 266 | fight 267 | finger 268 | fire 269 | first 270 | fish 271 | fixed 272 | flag 273 | flame 
274 | flat 275 | flight 276 | floor 277 | flower 278 | fly 279 | fold 280 | food 281 | foolish 282 | foot 283 | for 284 | force 285 | fork 286 | form 287 | forward 288 | fowl 289 | frame 290 | free 291 | frequent 292 | friend 293 | from 294 | front 295 | fruit 296 | full 297 | future 298 | garden 299 | general 300 | get 301 | girl 302 | give 303 | glass 304 | glove 305 | go 306 | goat 307 | gold 308 | good 309 | government 310 | grain 311 | grass 312 | great 313 | green 314 | grey 315 | grip 316 | group 317 | growth 318 | guide 319 | gun 320 | hair 321 | hammer 322 | hand 323 | hanging 324 | happy 325 | harbour 326 | hard 327 | harmony 328 | hat 329 | hate 330 | have 331 | he 332 | head 333 | healthy 334 | hearing 335 | heart 336 | heat 337 | help 338 | here 339 | high 340 | history 341 | hole 342 | hollow 343 | hook 344 | hope 345 | horn 346 | horse 347 | hospital 348 | hour 349 | house 350 | how 351 | humour 352 | ice 353 | idea 354 | if 355 | ill 356 | important 357 | impulse 358 | in 359 | increase 360 | industry 361 | ink 362 | insect 363 | instrument 364 | insurance 365 | interest 366 | invention 367 | iron 368 | island 369 | jelly 370 | jewel 371 | join 372 | journey 373 | judge 374 | jump 375 | keep 376 | kettle 377 | key 378 | kick 379 | kind 380 | kiss 381 | knee 382 | knife 383 | knot 384 | knowledge 385 | land 386 | language 387 | last 388 | late 389 | laugh 390 | law 391 | lead 392 | leaf 393 | learning 394 | leather 395 | left 396 | leg 397 | let 398 | letter 399 | level 400 | library 401 | lift 402 | light 403 | like 404 | limit 405 | line 406 | linen 407 | lip 408 | liquid 409 | list 410 | little 411 | living 412 | lock 413 | long 414 | look 415 | loose 416 | loss 417 | loud 418 | love 419 | low 420 | machine 421 | make 422 | male 423 | man 424 | manager 425 | map 426 | mark 427 | market 428 | married 429 | mass 430 | match 431 | material 432 | may 433 | meal 434 | measure 435 | meat 436 | medical 437 | meeting 438 | memory 439 | metal 440 | middle 441 | military 442 | milk 443 | mind 444 | mine 445 | minute 446 | mist 447 | mixed 448 | money 449 | monkey 450 | month 451 | moon 452 | morning 453 | mother 454 | motion 455 | mountain 456 | mouth 457 | move 458 | much 459 | muscle 460 | music 461 | nail 462 | name 463 | narrow 464 | nation 465 | natural 466 | near 467 | necessary 468 | neck 469 | need 470 | needle 471 | nerve 472 | net 473 | new 474 | news 475 | night 476 | no 477 | noise 478 | normal 479 | north 480 | nose 481 | not 482 | note 483 | now 484 | number 485 | nut 486 | observation 487 | of 488 | off 489 | offer 490 | office 491 | oil 492 | old 493 | on 494 | only 495 | open 496 | operation 497 | opinion 498 | opposite 499 | or 500 | orange 501 | order 502 | organization 503 | ornament 504 | other 505 | out 506 | oven 507 | over 508 | owner 509 | page 510 | pain 511 | paint 512 | paper 513 | parallel 514 | parcel 515 | part 516 | past 517 | paste 518 | payment 519 | peace 520 | pen 521 | pencil 522 | person 523 | physical 524 | picture 525 | pig 526 | pin 527 | pipe 528 | place 529 | plane 530 | plant 531 | plate 532 | play 533 | please 534 | pleasure 535 | plough 536 | pocket 537 | point 538 | poison 539 | polish 540 | political 541 | poor 542 | porter 543 | position 544 | possible 545 | pot 546 | potato 547 | powder 548 | power 549 | present 550 | price 551 | print 552 | prison 553 | private 554 | probable 555 | process 556 | produce 557 | profit 558 | property 559 | prose 560 | protest 561 | public 562 | pull 563 | pump 564 | punishment 565 | purpose 566 | push 
567 | put 568 | quality 569 | question 570 | quick 571 | quiet 572 | quite 573 | rail 574 | rain 575 | range 576 | rat 577 | rate 578 | ray 579 | reaction 580 | reading 581 | ready 582 | reason 583 | receipt 584 | record 585 | red 586 | regret 587 | regular 588 | relation 589 | religion 590 | representative 591 | request 592 | respect 593 | responsible 594 | rest 595 | reward 596 | rhythm 597 | rice 598 | right 599 | ring 600 | river 601 | road 602 | rod 603 | roll 604 | roof 605 | room 606 | root 607 | rough 608 | round 609 | rub 610 | rule 611 | run 612 | sad 613 | safe 614 | sail 615 | salt 616 | same 617 | sand 618 | say 619 | scale 620 | school 621 | science 622 | scissors 623 | screw 624 | sea 625 | seat 626 | second 627 | secret 628 | secretary 629 | see 630 | seed 631 | seem 632 | selection 633 | self 634 | send 635 | sense 636 | separate 637 | serious 638 | servant 639 | sex 640 | shade 641 | shake 642 | shame 643 | sharp 644 | sheep 645 | shelf 646 | ship 647 | shirt 648 | shock 649 | shoe 650 | short 651 | shut 652 | side 653 | sign 654 | silk 655 | silver 656 | simple 657 | sister 658 | size 659 | skin 660 | skirt 661 | sky 662 | sleep 663 | slip 664 | slope 665 | slow 666 | small 667 | smash 668 | smell 669 | smile 670 | smoke 671 | smooth 672 | snake 673 | sneeze 674 | snow 675 | so 676 | soap 677 | society 678 | sock 679 | soft 680 | solid 681 | some 682 | son 683 | song 684 | sort 685 | sound 686 | soup 687 | south 688 | space 689 | spade 690 | special 691 | sponge 692 | spoon 693 | spring 694 | square 695 | stage 696 | stamp 697 | star 698 | start 699 | statement 700 | station 701 | steam 702 | steel 703 | stem 704 | step 705 | stick 706 | sticky 707 | stiff 708 | still 709 | stitch 710 | stocking 711 | stomach 712 | stone 713 | stop 714 | store 715 | story 716 | straight 717 | strange 718 | street 719 | stretch 720 | strong 721 | structure 722 | substance 723 | such 724 | sudden 725 | sugar 726 | suggestion 727 | summer 728 | sun 729 | support 730 | surprise 731 | sweet 732 | swim 733 | system 734 | table 735 | tail 736 | take 737 | talk 738 | tall 739 | taste 740 | tax 741 | teaching 742 | tendency 743 | test 744 | than 745 | that 746 | the 747 | then 748 | theory 749 | there 750 | thick 751 | thin 752 | thing 753 | this 754 | though 755 | thought 756 | thread 757 | throat 758 | through 759 | thumb 760 | thunder 761 | ticket 762 | tight 763 | till 764 | time 765 | tin 766 | tired 767 | to 768 | toe 769 | together 770 | tomorrow 771 | tongue 772 | tooth 773 | top 774 | touch 775 | town 776 | trade 777 | train 778 | transport 779 | tray 780 | tree 781 | trick 782 | trouble 783 | trousers 784 | true 785 | turn 786 | twist 787 | umbrella 788 | under 789 | unit 790 | up 791 | use 792 | value 793 | verse 794 | very 795 | vessel 796 | view 797 | violent 798 | voice 799 | waiting 800 | walk 801 | wall 802 | war 803 | warm 804 | wash 805 | waste 806 | watch 807 | water 808 | wave 809 | wax 810 | way 811 | weather 812 | week 813 | weight 814 | well 815 | west 816 | wet 817 | wheel 818 | when 819 | where 820 | while 821 | whip 822 | whistle 823 | white 824 | who 825 | why 826 | wide 827 | will 828 | wind 829 | window 830 | wine 831 | wing 832 | winter 833 | wire 834 | wise 835 | with 836 | woman 837 | wood 838 | wool 839 | word 840 | work 841 | worm 842 | wound 843 | writing 844 | wrong 845 | year 846 | yellow 847 | yes 848 | yesterday 849 | you 850 | young 851 | -------------------------------------------------------------------------------- 
/nltk_data/taggers/maxent_treebank_pos_tagger/english.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/ushine-learning/71e2eb6746b464a509c1bb39c495591f5b0a5d5f/nltk_data/taggers/maxent_treebank_pos_tagger/english.pickle -------------------------------------------------------------------------------- /nltk_data/tokenizers/punkt/README: -------------------------------------------------------------------------------- 1 | Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected) 2 | 3 | Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have 4 | been contributed by various people using NLTK for sentence boundary detection. 5 | 6 | For information about how to use these models, please confer the tokenization HOWTO: 7 | http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html 8 | and chapter 3.8 of the NLTK book: 9 | http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation 10 | 11 | There are pretrained tokenizers for the following languages: 12 | 13 | File Language Source Contents Size of training corpus(in tokens) Model contributed by 14 | ======================================================================================================================================================================= 15 | czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss 16 | Literarni Noviny 17 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 18 | danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss 19 | (Berlingske Avisdata, Copenhagen) Weekend Avisen 20 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 21 | dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss 22 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 23 | english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss 24 | (American) 25 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 26 | estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss 27 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 28 | finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss 29 | Text Bank (Suomen Kielen newspapers 30 | Tekstipankki) 31 | Finnish Center for IT Science 32 | (CSC) 33 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 34 | french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss 35 | (European) 36 | 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 37 | german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss 38 | (Switzerland) CD-ROM 39 | (Uses "ss" 40 | instead of "ß") 41 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 42 | greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss 43 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 44 | italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss 45 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 46 | norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss 47 | (Bokmål and Information Technologies, 48 | Nynorsk) Bergen 49 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 50 | polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner 51 | (http://www.nkjp.pl/) 52 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 53 | portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss 54 | (Brazilian) (Linguateca) 55 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 56 | slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss 57 | Slovene Academy for Arts 58 | and Sciences 59 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 60 | spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss 61 | (European) 62 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 63 | swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss 64 | (and some other texts) 65 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 66 | turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss 67 | (Türkçe Derlem Projesi) 68 | University of Ankara 69 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 70 | 71 | The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to 72 | Unicode using the codecs module. 73 | 74 | Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. 75 | Computational Linguistics 32: 485-525. 
76 | 77 | ---- Training Code ---- 78 | 79 | # import punkt 80 | import nltk.tokenize.punkt 81 | 82 | # Make a new Tokenizer 83 | tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() 84 | 85 | # Read in training corpus (one example: Slovene) 86 | import codecs 87 | text = codecs.open("slovene.plain","Ur","iso-8859-2").read() 88 | 89 | # Train tokenizer 90 | tokenizer.train(text) 91 | 92 | # Dump pickled tokenizer 93 | import pickle 94 | out = open("slovene.pickle","wb") 95 | pickle.dump(tokenizer, out) 96 | out.close() 97 | 98 | --------- 99 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | Sphinx==1.2b1 2 | ipdb==0.7 -------------------------------------------------------------------------------- /requirements-travis.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10.1 2 | langid 3 | nltk==2.0.4 4 | nose==1.3.0 5 | pep8==1.4.6 6 | pyparsing==1.5.7 7 | python-dateutil==2.1 8 | python-hashes 9 | pytz==2013b 10 | pyzmq==13.1.0 11 | readline==6.2.4.1 12 | scikit-learn 13 | six==1.3.0 14 | wsgiref==0.1.2 15 | sqlalchemy==0.8.0 16 | Flask-SQLAlchemy==1.0 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements-travis.txt 2 | -r requirements-dev.txt -------------------------------------------------------------------------------- /scripts/autopep8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import subprocess 5 | 6 | """ 7 | Attempts to automatically make files conform to PEP8 (aggressively). 8 | 9 | Runs on files in place. 10 | 11 | Best practice: make these changes as a separate commit from any other changes. 12 | That way you don't confuse "meaningful" code changes with PEP8 changes 13 | when someone is looking at the history. 14 | 15 | Requires autopep8 is installed: 16 | 17 | $ pip install -r requirements-dev.txt 18 | """ 19 | 20 | 21 | def main(): 22 | 23 | files = [ 24 | "dssg/__init__.py", 25 | "dssg/classifier.py", 26 | "dssg/machine.py", 27 | "dssg/platt.py", 28 | "dssg/util.py", 29 | "dssg/vectorizer.py", 30 | ] 31 | 32 | directories = [ 33 | "dssg/model", 34 | "dssg/webapp", 35 | "dssg/tests", 36 | ] 37 | 38 | # autopep8 specific files 39 | for f in files: 40 | autopep8_single_file(f) 41 | 42 | for d in directories: 43 | autopep8_directory_recursively(d) 44 | 45 | def autopep8_single_file(path): 46 | """run autopep8 on a single file""" 47 | subprocess.call(["autopep8", "--in-place", "--aggressive", path]) 48 | 49 | 50 | def autopep8_directory_recursively(directory): 51 | """run autopep8 on a directory, recursively""" 52 | subprocess.call( 53 | ["autopep8", "--in-place", "--aggressive", directory, "-r"]) 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /scripts/autopep8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # NOTE: requires autopep8 is installed from pip, e.g. via `pip install -r requirements-dev.txt` 4 | 5 | # navigate from `/ushine-learning/scripts` to `/ushine-learning/` 6 | cd ..
7 | python scripts/autopep8.py -------------------------------------------------------------------------------- /scripts/make_sphinx_docs.sh: -------------------------------------------------------------------------------- 1 | cd .. 2 | sphinx-apidoc -o doc/source dssg -f 3 | cd doc 4 | make html 5 | cd .. 6 | -------------------------------------------------------------------------------- /scripts/nosetests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # NOTE: requires nose is installed from pip, e.g. via `pip install -r requirements-travis.txt` 4 | 5 | # navigate from `/ushine-learning/scripts` to `/ushine-learning/` 6 | cd .. 7 | nosetests dssg/tests -------------------------------------------------------------------------------- /scripts/upload_to_pypi.sh: -------------------------------------------------------------------------------- 1 | cd .. 2 | python setup.py sdist register upload 3 | cd scripts 4 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import logging as logger 4 | from os.path import dirname, realpath 5 | 6 | import dssg 7 | from dssg.webapp import app 8 | 9 | # Load the application configuration 10 | config_file = dirname(realpath(__file__)) + '/dssg/config/dssg.ini' 11 | dssg.load_config(app, config_file) 12 | 13 | # Import the API endpoints 14 | from dssg.webapp.rest_api import * 15 | 16 | if __name__ == "__main__": 17 | app.debug = True 18 | app.run() 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # https://pypi.python.org/pypi/ushine 2 | 3 | '''setuptools works by triggering subcommands from higher level commands. 4 | The default commands 'install' and 'develop' trigger the following sequences: 5 | 6 | install: 7 | 1. build 8 | 2. build_py 9 | 3. install_lib 10 | 4. install_egg_info 11 | 5. egg_info 12 | 6. install_scripts 13 | 14 | develop: 15 | 1. egg_info 16 | 2. build_ext 17 | ''' 18 | 19 | from setuptools import setup, find_packages 20 | 21 | readme = open('README.txt').read() 22 | setup( 23 | name='ushine', 24 | version='0.1.0', 25 | author='Kayla Jacobs, Kwang-Sung Jun, Nathan Leiby, Elena Eneva', 26 | author_email='nathanleiby@gmail.com', 27 | license='MIT', 28 | description='Machine learning toolkit - originally built for Ushahidi\'s crowdmapping platform', 29 | long_description=readme, 30 | packages=find_packages(), 31 | install_requires=[ 32 | # 'flask', 33 | # 'sqlalchemy', 34 | # 'scikit-learn', 35 | ], 36 | dependency_links=[ 37 | ], 38 | entry_points={ 39 | # 'console_scripts': [], 40 | }, 41 | tests_require=[ 42 | 'nose', 43 | 'pep8', 44 | 'pyflakes', 45 | ], 46 | test_suite='dssg.tests', 47 | ) --------------------------------------------------------------------------------