├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── doc ├── README.md ├── dsc-pitch.key ├── dsc-pitch.pdf ├── kdd-pitch.pdf ├── kdd-pitch.pptx ├── kdd-poster.pdf ├── kdd-poster.pptx └── report │ ├── .gitignore │ ├── confusion_matrix.png │ ├── tweedr.bib │ ├── tweedr.pdf │ └── tweedr.tex ├── package.json ├── setup.py ├── static ├── img │ ├── logos │ │ ├── qcri.png │ │ └── uchicago.png │ └── screenshots │ │ └── crisistracker-syria-2013-08-23.png ├── lib │ ├── backbone.js │ ├── backbone.min.js │ ├── cookies.js │ ├── handlebars.js │ ├── handlebars.runtime.js │ ├── jquery.js │ ├── jquery.min.js │ ├── templating.js │ ├── underscore.js │ └── underscore.min.js ├── master.css └── master.less ├── templates ├── code.mako ├── crf.mako ├── gloss.bars └── layout.mako ├── tests ├── README.md ├── test_codebase.py └── test_libraries.py ├── tools └── git-hooks │ ├── README.md │ └── pre-commit ├── tweedr ├── README.md ├── __init__.py ├── api │ ├── README.md │ ├── __init__.py │ ├── mappers │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── ml.py │ │ ├── nlp.py │ │ └── similar.py │ ├── pipeline.py │ └── protocols.py ├── ark │ ├── __init__.py │ ├── java │ │ ├── __init__.py │ │ └── singleton.py │ └── tweetmotif │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── emoticons.py │ │ └── twokenize.py ├── cli │ ├── __init__.py │ ├── database.py │ ├── pipeline.py │ └── ui.py ├── corpora │ ├── __init__.py │ ├── qcri.py │ └── qcri_database.py ├── emr │ ├── README.md │ ├── __init__.py │ ├── gnip_geo.py │ └── gnip_wc.py ├── lib │ ├── __init__.py │ ├── readers.py │ ├── text.py │ └── timeout.py ├── ml │ ├── __init__.py │ ├── build_confusion_matrix.py │ ├── classifier.py │ ├── crf │ │ ├── __init__.py │ │ ├── classifier.py │ │ └── wrapper.py │ ├── evaluate.py │ ├── evaluate_combinations.py │ ├── features │ │ ├── __init__.py │ │ ├── characters.py │ │ ├── dbpedia.py │ │ ├── lexicons.py │ │ ├── ngrams.py │ │ ├── nlp.py │ │ └── sets.py │ ├── lexicon_list.py │ ├── pyslda │ │ ├── PreProcess.py │ │ ├── PySLDA.py │ │ ├── README.md │ │ ├── __init__.py │ │ ├── evaluate-classifier.py │ │ ├── loadModel.R │ │ ├── saveModel.R │ │ ├── testLDA.R │ │ └── trainLDA.R │ ├── spotlight │ │ └── __init__.py │ └── wordnet.py ├── models │ ├── README.md │ ├── __init__.py │ ├── example.py │ ├── metadata.py │ ├── schema.py │ └── schema.template └── ui │ ├── README.md │ ├── __init__.py │ ├── crf.py │ └── middleware.py └── web ├── README.md └── extraction-tool ├── README.md ├── db.php └── index.php /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | .DS_Store 4 | .DS_Store.orig 5 | node_modules/ 6 | /*.egg 7 | /ext/ 8 | /dist/ 9 | /build/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | virtualenv: 5 | system_site_packages: true 6 | before_install: 7 | - "sudo apt-get update -qq" 8 | - "sudo apt-get install -qq python-scipy python-nose" 9 | # python-numpy python-mako python-mysqldb python-scikits-learn python-sqlalchemy pep8 10 | env: MYSQL_HOST=dummyhost MYSQL_USER=dummyuser MYSQL_PASS=dummypass MYSQL_DATABASE=dummydatabase 11 | install: "pip install . 
--use-mirrors" 12 | script: "python setup.py nosetests -e no_ci --with-doctest" 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 The University of Chicago 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tweedr: measuring disaster damage with tweets 2 | 3 | 4 | 5 | Tweedr makes information from social media more accessible to providers of disaster relief. There are two aspects to the application: 6 | 7 | 1. An **API** / **pipeline** for applying _machine learning techniques_ and _natural language processing tools_ to analyze social media produced in response to a disaster. 8 | 2. A **user interface** for manipulating, filtering, and aggregating this enhanced social media data. 9 | 10 | Tweedr is a [Data Science for Social Good](http://dssg.io/) project, through a partnership with the [Qatar Computational Research Institute](http://qcri.qa/). 11 | 12 | ## Problem, solution, data 13 | 14 | ![web app screenshot](https://raw.github.com/dssg/dssg.github.io/master/img/posts/tweedr-screenshot.png) 15 | 16 | * For an extensive discussion of the problem and proposed solution, [visit our wiki](https://github.com/dssg/tweedr/wiki). 17 | * Get start using the tweedr api, [check out our tutorial website](http://tokens.qcri.dssg.io/tweedrtutorial/). 18 | 19 | 20 | ## Project layout 21 | 22 | * [`doc/`](doc) contains various presentations, along with accompanying slides and poster. 23 | + [`doc/report/`](doc/report) contains a more technical and extensive write-up of this project. _In progress._ 24 | * `ext/` is created by a complete install; external data sources and libraries are downloaded to this folder. 25 | * [`static/`](static) contains static (non-Javascript) files used by the web app. 26 | * [`templates/`](templates) contain templates (both server-side and client-side) used by the web app. 27 | * [`tests/`](tests) contain unittest-like tests. Use `python setup.py test` to run these. 28 | * [`tools/`](tools) holds tools to aid development (currently, only a test-running git-hook). 29 | * [`tweedr/`](tweedr) contains the main Python app and functions as a Python package (e.g., `import tweedr`). 
 30 | 31 | 32 | ## Installation guide 33 | 34 | git clone https://github.com/dssg/tweedr.git 35 | cd tweedr 36 | python setup.py develop download_ext 37 | 38 | If you want to jump straight to development, see the [Contributing](https://github.com/dssg/tweedr/wiki/Contributing) wiki page. 39 | 40 | ### Dependencies 41 | 42 | Tweedr uses a number of external libraries and resources. This is the dependency tree: 43 | 44 | * [Tweedr](https://github.com/dssg/tweedr): Primarily Python, on GitHub 45 | - [crfsuite](http://www.chokkan.org/software/crfsuite/): C/C++, from source 46 | + [libLBFGS](http://www.chokkan.org/software/liblbfgs/): C/C++, from source 47 | - [scikit-learn](http://scikit-learn.org/stable/): Python, from PyPI 48 | + [numpy](http://www.numpy.org/): Python, with C/C++ (blas/lapack), Fortran links, from PyPI or package manager 49 | + [scipy](http://www.scipy.org/): Python, with C/C++, from PyPI or package manager 50 | - [TweetNLP](http://www.ark.cs.cmu.edu/TweetNLP/): Java, from jar 51 | - [PyPer](https://pypi.python.org/pypi/PypeR/1.1.0): Python, with R, from PyPI 52 | 53 | `crfsuite` and `liblbfgs` are the only components that can't be installed directly with Python via `setuptools`. If you have trouble installing some of the packages above, you might have better luck looking for those packages in your operating system's package manager or as binaries on the projects' websites. 54 | 55 | ### Installation steps 56 | 57 | *1. Installing libLBFGS* 58 | 59 | The source code can be downloaded from the [maintainer's webpage](http://www.chokkan.org/software/liblbfgs/), though this [GitHub fork](https://github.com/chbrown/liblbfgs) (and below) attempts to simplify the install process. 60 | 61 | git clone https://github.com/chbrown/liblbfgs.git 62 | cd liblbfgs 63 | ./configure 64 | make 65 | sudo make install 66 | 67 | *2. Installing CRFsuite* 68 | 69 | Like libLBFGS, a tarball can be downloaded from the [original website](http://www.chokkan.org/software/crfsuite/), though the accompanying [fork on GitHub](https://github.com/chbrown/crfsuite) attempts to document the installation process and make compilation more automatic on both Linux and Mac OS X. 70 | 71 | git clone https://github.com/chbrown/crfsuite.git 72 | cd crfsuite 73 | ./configure 74 | make 75 | sudo make install 76 | 77 | That installs the library, but not the Python wrapper, which takes a few more steps: 78 | 79 | cd swig/python 80 | python setup.py build_ext 81 | sudo python setup.py install_lib 82 | 83 | To test whether it installed correctly, you can run the following at your terminal, which should print out the current CRFsuite version: 84 | 85 | python -c 'import crfsuite; print crfsuite.version()' 86 | > 0.12.2 87 | 88 | The [GitHub repository](https://github.com/chbrown/crfsuite) documents a few more options that might come in handy if the process above does not work for your operating system. 89 | 90 | 91 | *3. Configuring environment variables* 92 | 93 | Tweedr also connects to a number of remote resources when running live; see [[Environment]] for instructions on setting those up. 94 | 95 | 96 | *4. 
Installing Tweedr* 97 | 98 | After installing `crfsuite` and `liblbfgs`, everything else should be installable via setuptools / distutils: 99 | 100 | git clone https://github.com/dssg/tweedr.git 101 | cd tweedr 102 | python setup.py install 103 | 104 | And then to download external data requirements: 105 | 106 | python setup.py download_ext 107 | 108 | The `download_ext` command will download external data, which currently includes the following packages / sources: 109 | 110 | * [TweetNLP 0.3.2 tarball](http://ark-tweet-nlp.googlecode.com/files/ark-tweet-nlp-0.3.2.tgz) (Github repository: [ark-tweet-nlp](https://github.com/brendano/ark-tweet-nlp)) 111 | 112 | You may get an error, "IOError: cmu.arktweetnlp.RunTagger error", if you try to use some parts of Tweedr before installing this component. 113 | 114 | 115 | *5. Instantiating the database* 116 | 117 | While we are not currently able to release our data, you can easily recreate the structure of our database by running the following command: 118 | 119 | tweedr-database create 120 | 121 | This simply uses SQLAlchemy to un-reflect the database, by running `metadata.create_all()`. 122 | 123 | 124 | ### Running Tweedr 125 | 126 | At this point, you should have tools like `tweedr-ui` and `tweedr-pipeline` on your `PATH`, and you can run each of those with the `--help` flag to view the usage messages. 127 | 128 | See [the API section](https://github.com/dssg/tweedr/wiki#tweedr-api-how-it-works) of the wiki for a description of some of the fields that `tweedr-pipeline` adds. 129 | 130 | 131 | ### Troubleshooting 132 | 133 | If your installation is still missing packages, see the [manually installing](https://github.com/dssg/tweedr/wiki/Manually-installing) page of the wiki. 134 | 135 | 136 | ## Team 137 | ![Team](https://raw.github.com/dssg/dssg.github.io/761993c24ea2991170ef64048115cb805f5f13fb/img/people/teams/tweedr.png) 138 | 139 | 140 | ## Contributing to the project 141 | 142 | Want to get in touch? Found a bug? Open up a [new issue](https://github.com/dssg/tweedr/issues/new) or email us at [dssg-qcri@googlegroups.com](mailto:dssg-qcri@googlegroups.com). 143 | 144 | 145 | ## License 146 | 147 | Copyright © 2013 The University of Chicago. [MIT Licensed](LICENSE). 148 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Presentations 2 | 3 | - **Thursday, July 11, 2013**. 4 | Presented [twitter-informed-disaster-relief](http://prezi.com/83-blihpamgf/twitter-informed-disaster-relief/) (Prezi link) for the full DSSG group. 5 | - **Wednesday, July 24, 2013**. 6 | Presented [dsc-pitch.key](dsc-pitch.key) (Keynote, [PDF](dsc-pitch.pdf)) for the Data Science Chicago meetup. 
7 | -------------------------------------------------------------------------------- /doc/dsc-pitch.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/dsc-pitch.key -------------------------------------------------------------------------------- /doc/dsc-pitch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/dsc-pitch.pdf -------------------------------------------------------------------------------- /doc/kdd-pitch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-pitch.pdf -------------------------------------------------------------------------------- /doc/kdd-pitch.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-pitch.pptx -------------------------------------------------------------------------------- /doc/kdd-poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-poster.pdf -------------------------------------------------------------------------------- /doc/kdd-poster.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-poster.pptx -------------------------------------------------------------------------------- /doc/report/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.aux 3 | *.bbl 4 | *.blg 5 | -------------------------------------------------------------------------------- /doc/report/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/report/confusion_matrix.png -------------------------------------------------------------------------------- /doc/report/tweedr.bib: -------------------------------------------------------------------------------- 1 | 2 | @article{lin_class-imbalanced_2012, 3 | title = {Class-imbalanced classifiers for high-dimensional data}, 4 | issn = {1467-5463, 1477-4054}, 5 | url = {http://bib.oxfordjournals.org/content/early/2012/03/08/bib.bbs006}, 6 | doi = {10.1093/bib/bbs006}, 7 | abstract = {A class-imbalanced classifier is a decision rule to predict the class membership of new samples from an available data set where the class sizes differ considerably. When the class sizes are very different, most standard classification algorithms may favor the larger (majority) class resulting in poor accuracy in the minority class prediction. A class-imbalanced classifier typically modifies a standard classifier by a correction strategy or by incorporating a new strategy in the training phase to account for differential class sizes. This article reviews and evaluates some most important methods for class prediction of high-dimensional imbalanced data. 
The evaluation addresses the fundamental issues of the class-imbalanced classification problem: imbalance ratio, small disjuncts and overlap complexity, lack of data and feature selection. Four class-imbalanced classifiers are considered. The four classifiers include three standard classification algorithms each coupled with an ensemble correction strategy and one support vector machines ({SVM)-based} correction classifier. The three algorithms are (i) diagonal linear discriminant analysis ({DLDA)}, (ii) random forests ({RFs)} and (ii) {SVMs.} The {SVM-based} correction classifier is {SVM} threshold adjustment ({SVM-THR).} A {Monte–Carlo} simulation and five genomic data sets were used to illustrate the analysis and address the issues. The {SVM-ensemble} classifier appears to perform the best when the class imbalance is not too severe. The {SVM-THR} performs well if the imbalance is severe and predictors are highly correlated. The {DLDA} with a feature selection can perform well without using the ensemble correction.}, 8 | language = {en}, 9 | urldate = {2013-09-02}, 10 | journal = {Briefings in Bioinformatics}, 11 | author = {Lin, Wei-Jiun and Chen, James J.}, 12 | month = mar, 13 | year = {2012}, 14 | note = {{PMID:} 22408190}, 15 | keywords = {class-imbalanced prediction, feature selection, lack of data, performance metrics, threshold adjustment, under-sampling ensemble}, 16 | pages = {bbs006}, 17 | file = {Snapshot:/Users/awculott/Library/Application Support/Firefox/Profiles/0zv8dy28.default/zotero/storage/BDAQWHVI/bib.html:text/html} 18 | } 19 | 20 | @inproceedings{mandel12demo, 21 | author = {Benjamin Mandel and Aron Culotta and John Boulahanis and Danielle Stark and Bonnie Lewis and Jeremy Rodrigue}, 22 | title = {A demographic analysis of online sentiment during {H}urricane {I}rene}, 23 | booktitle = {NAACL-HLT Workshop on Language in Social Media}, 24 | shortbooktitle = {HLT/NAACL}, 25 | year = {2012}, 26 | mytype = {Refereed Workshop Publications}, 27 | url = {http://www2.selu.edu/Academics/Faculty/aculotta/pubs/mandel12demo.pdf}, 28 | abstract = {We examine the response to the recent natural disaster Hurricane Irene on Twitter.com. We collect over 65,000 Twitter messages relating to Hurricane Irene from August 18th to August 31st, 2011, and group them by location and gender. We train a sentiment classifier to categorize messages based on level of concern, and then use this classifier to investigate demographic differences. We report three principal findings: (1) the number of Twitter messages related to Hurricane Irene in directly affected regions peaks around the time the hurricane hits that region; (2) the level of concern in the days leading up to the hurricane's arrival is dependent on region; and (3) the level of concern is dependent on gender, with females being more likely to express concern than males. Qualitative linguistic variations further support these differences. 
We conclude that social media analysis provides a viable, real-time complement to traditional survey methods for understanding public perception towards an impending disaster.}, 29 | } 30 | 31 | 32 | @inproceedings{imran_practical_2013, 33 | address = {Republic and Canton of Geneva, Switzerland}, 34 | series = {{WWW} '13 Companion}, 35 | title = {Practical extraction of disaster-relevant information from social media}, 36 | isbn = {978-1-4503-2038-2}, 37 | url = {http://dl.acm.org/citation.cfm?id=2487788.2488109}, 38 | abstract = {During times of disasters online users generate a significant amount of data, some of which are extremely valuable for relief efforts. In this paper, we study the nature of social-media content generated during two different natural disasters. We also train a model based on conditional random fields to extract valuable information from such content. We evaluate our techniques over our two datasets through a set of carefully designed experiments. We also test our methods over a non-disaster dataset to show that our extraction model is useful for extracting information from socially-generated content in general.}, 39 | urldate = {2013-09-02}, 40 | booktitle = {Proceedings of the 22nd international conference on World Wide Web companion}, 41 | publisher = {International World Wide Web Conferences Steering Committee}, 42 | author = {Imran, Muhammad and Elbassuoni, Shady and Castillo, Carlos and Diaz, Fernando and Meier, Patrick}, 43 | year = {2013}, 44 | keywords = {information extraction, information filtering, social media}, 45 | pages = {1021–1024} 46 | }, 47 | 48 | @inproceedings{meier_extracting_2013, 49 | title = {Extracting Information Nuggets from Disaster-Related Messages in Social Media}, 50 | booktitle = {10th International Conference on Information Systems for Crisis Response and Management}, 51 | author = {Meier, Patrick and Castillo, Carlos and Imran, Muhammad and Elbassuoni, Shady Mamoon and Diaz, Fernando}, 52 | year = {2013} 53 | }, 54 | 55 | @inproceedings{kumar_tweettracker_2011, 56 | title = {{TweetTracker:} An Analysis Tool for Humanitarian and Disaster Relief}, 57 | booktitle = {{ICWSM'11}}, 58 | author = {Kumar, Shamanth and Barbier, Geoffrey and Abbasi, Mohammad Ali and Liu, Huan}, 59 | year = {2011} 60 | }, 61 | 62 | @inproceedings{cheong_social_2011, 63 | title = {Social Media Data Mining: A Social Network Analysis Of Tweets During The 2010-2011 Australian Floods}, 64 | booktitle = {{PACIS'11}}, 65 | author = {Cheong, France and Cheong, Christopher}, 66 | year = {2011}, 67 | pages = {46--46} 68 | } 69 | 70 | 71 | @techreport{blei10supervised, 72 | type = {{arXiv} e-print}, 73 | title = {Supervised Topic Models}, 74 | number = {1003.0783}, 75 | urldate = {2013-08-23}, 76 | author = {Blei, David M. 
and {McAuliffe}, Jon D.}, 77 | month = mar, 78 | year = {2010}, 79 | keywords = {Statistics - Machine Learning}, 80 | } 81 | 82 | @book{sutton12intro, 83 | address = {Hanover, {MA}}, 84 | title = {An introduction to conditional random fields}, 85 | isbn = {9781601985736 1601985738}, 86 | url = {http://search.ebscohost.com/login.aspx?direct=true&scope=site&db=nlebk&db=nlabk&AN=593830}, 87 | language = {English}, 88 | urldate = {2013-08-23}, 89 | publisher = {Now Publishers}, 90 | author = {Sutton, Charles and {McCallum}, Andrew K}, 91 | year = 2012 92 | } 93 | 94 | 95 | 96 | @article{bloom70space, 97 | title = {Space/time trade-offs in hash coding with allowable errors}, 98 | volume = 13, 99 | issn = {0001-0782}, 100 | url = {http://doi.acm.org/10.1145/362686.362692}, 101 | doi = {10.1145/362686.362692}, 102 | number = 7, 103 | urldate = {2013-08-23}, 104 | journal = {Commun. {ACM}}, 105 | author = {Bloom, Burton H.}, 106 | month = jul, 107 | year = 1970, 108 | keywords = {hash addressing, hash coding, retrieval efficiency, retrieval trade-offs, scatter storage, searching, storage efficiency, storage layout}, 109 | pages = {422–426} 110 | } 111 | 112 | 113 | @inproceedings{charikar02similarity, 114 | address = {New York, {NY}, {USA}}, 115 | series = {{STOC} '02}, 116 | title = {Similarity estimation techniques from rounding algorithms}, 117 | isbn = {1-58113-495-9}, 118 | url = {http://doi.acm.org/10.1145/509907.509965}, 119 | doi = {10.1145/509907.509965}, 120 | urldate = {2013-08-23}, 121 | booktitle = {Proceedings of the thiry-fourth annual {ACM} symposium on Theory of computing}, 122 | publisher = {{ACM}}, 123 | author = {Charikar, Moses S.}, 124 | year = 2002, 125 | pages = {380–388} 126 | } -------------------------------------------------------------------------------- /doc/report/tweedr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/report/tweedr.pdf -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.0.4", 3 | "homepage": "https://github.com/dssg/tweedr", 4 | "description": "Twitter-Informed Disaster Response", 5 | "license": "MIT", 6 | "staticDependencies": { 7 | "jquery": "*", 8 | "underscore": "*", 9 | "backbone": "*", 10 | "handlebars": "*", 11 | "misc-js": "git://github.com/chbrown/misc-js.git" 12 | }, 13 | "staticPattern": "static/lib/{file}" 14 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | '''setuptools works by triggering subcommands from higher level commands. 2 | The default commands 'install' and 'develop' trigger the following sequences: 3 | 4 | install: 5 | 1. build 6 | 2. build_py 7 | 3. install_lib 8 | 4. install_egg_info 9 | 5. egg_info 10 | 6. install_scripts 11 | 12 | develop: 13 | 1. egg_info 14 | 2. 
build_ext 15 | ''' 16 | from setuptools import setup, find_packages 17 | from distutils import log, core 18 | from distutils.dir_util import remove_tree 19 | import os 20 | import json 21 | 22 | here = os.path.dirname(__file__) or os.curdir 23 | package = json.load(open(os.path.join(here, 'package.json'))) 24 | 25 | 26 | class download_ext(core.Command): 27 | description = 'download external dependencies' 28 | user_options = [] 29 | 30 | def initialize_options(self): 31 | self.ext_path = None 32 | 33 | def finalize_options(self): 34 | self.ext_path = os.path.join(here, 'ext') 35 | 36 | def download_ark_tweet_nlp(self): 37 | import urllib 38 | import tarfile 39 | url = 'http://ark-tweet-nlp.googlecode.com/files/ark-tweet-nlp-0.3.2.tgz' 40 | log.info('Downloading %s', url) 41 | tgz_filepath, headers = urllib.urlretrieve(url) 42 | log.info('Opening %s', tgz_filepath) 43 | with tarfile.open(tgz_filepath, 'r:gz') as tgz: 44 | # pull all the jars out, flattening them 45 | for tarinfo in tgz.getmembers(): 46 | if tarinfo.name.endswith('.jar'): 47 | tarinfo_name = tarinfo.name 48 | local_filepath = os.path.join(self.ext_path, os.path.basename(tarinfo.name)) 49 | tarinfo.name = local_filepath 50 | tgz.extract(tarinfo) 51 | log.info('Extracting %s to %s', tarinfo_name, local_filepath) 52 | 53 | def run(self): 54 | self.mkpath(self.ext_path) 55 | self.download_ark_tweet_nlp() 56 | 57 | 58 | class dist_clean(core.Command): 59 | description = 'remove all files not under version control' 60 | user_options = [] 61 | 62 | def initialize_options(self): 63 | pass 64 | 65 | def finalize_options(self): 66 | # set all = True for the benefit of the "clean" subcommand 67 | self.all = True 68 | 69 | def run(self): 70 | self.run_command('clean') 71 | log.debug('removing inessential directories from root') 72 | for directory in os.listdir(here): 73 | if directory.endswith(('dist', 'ext', '.egg', '.egg-info')): 74 | remove_tree(directory, dry_run=self.dry_run) 75 | 76 | log.debug('removing inessential files from project') 77 | for dirpath, _, filenames in os.walk('.'): 78 | filepaths = [os.path.join(dirpath, filename) for filename in filenames] 79 | for filepath in filepaths: 80 | if filepath.endswith(('.pyc', '.DS_Store')): 81 | log.info('rm %s', filepath) 82 | if self.dry_run: 83 | continue 84 | os.remove(filepath) 85 | 86 | setup( 87 | name='tweedr', 88 | version=str(package['version']), 89 | url=str(package['homepage']), 90 | license=open(os.path.join(here, 'LICENSE')).read(), 91 | packages=find_packages(), 92 | install_requires=[ 93 | 'bottle', 94 | 'colorama', 95 | 'mako', 96 | 'matplotlib', 97 | 'mrjob', 98 | 'mysql-python', 99 | 'pattern', 100 | 'pybloomfiltermmap>=0.3.11', 101 | 'pyper', 102 | 'python-hashes', 103 | 'requests', 104 | 'scikit-learn', 105 | 'scipy', 106 | 'sqlalchemy', 107 | 'ujson', 108 | ], 109 | entry_points={ 110 | 'console_scripts': [ 111 | 'tweedr-ui = tweedr.cli.ui:main', 112 | 'tweedr-database = tweedr.cli.database:main', 113 | 'tweedr-pipeline = tweedr.cli.pipeline:main', 114 | ], 115 | }, 116 | cmdclass={ 117 | 'download_ext': download_ext, 118 | 'dist_clean': dist_clean, 119 | }, 120 | tests_require=[ 121 | 'nose', 122 | 'pep8', 123 | 'pyflakes', 124 | ], 125 | test_suite='nose.collector', 126 | ) 127 | -------------------------------------------------------------------------------- /static/img/logos/qcri.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/static/img/logos/qcri.png -------------------------------------------------------------------------------- /static/img/logos/uchicago.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/static/img/logos/uchicago.png -------------------------------------------------------------------------------- /static/img/screenshots/crisistracker-syria-2013-08-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/static/img/screenshots/crisistracker-syria-2013-08-23.png -------------------------------------------------------------------------------- /static/lib/cookies.js: -------------------------------------------------------------------------------- 1 | // Copyright 2012-2013, Christopher Brown , MIT Licensed 2 | // https://github.com/chbrown/misc-js :: cookies.js 3 | // "use strict"; /*jslint indent: 2 */ 4 | var cookies = (function() { 5 | function extend(target, source) { 6 | for (var key in source) { 7 | if (source.hasOwnProperty(key)) { 8 | target[key] = source[key]; 9 | } 10 | } 11 | return target; 12 | } 13 | 14 | var default_cookie = {}; 15 | 16 | function getOptions(opts) { 17 | // if it's a function, call it. 18 | var defaults = default_cookie.call ? default_cookie() : default_cookie; 19 | // opts override defaults, but defaults cannot be changed 20 | return extend(extend({}, defaults), opts); 21 | } 22 | 23 | return { 24 | setDefault: function(new_default_cookie) { 25 | default_cookie = new_default_cookie; 26 | }, 27 | get: function(name, opts) { 28 | opts = getOptions(opts); 29 | 30 | var document_cookie = document.cookie; 31 | var cookies = (document_cookie && document_cookie !== '') ? document_cookie.split(/\s*;\s*/) : []; 32 | for (var i = 0, cookie; (cookie = cookies[i]); i++) { 33 | // Does this cookie string begin with the name we want? 34 | if (cookie.slice(0, name.length + 1) == (name + '=')) { 35 | var raw = cookie.slice(name.length + 1); 36 | return opts.raw ? raw : decodeURIComponent(raw); 37 | } 38 | } 39 | }, 40 | set: function(name, value, opts) { 41 | opts = getOptions(opts); 42 | 43 | var encode = opts.raw ? 
function(s) { return s; } : encodeURIComponent; 44 | 45 | var pairs = [[encode(name), encode(value.toString())]]; 46 | if (opts.expires) pairs.push(['expires', opts.expires.toUTCString()]); 47 | if (opts.path) pairs.push(['path', opts.path]); 48 | if (opts.domain) pairs.push(['domain', opts.domain]); 49 | if (opts.secure) pairs.push(['secure']); 50 | var cookie = pairs.map(function(pair) { return pair.join('='); }).join('; '); 51 | document.cookie = cookie; 52 | return cookie; 53 | }, 54 | del: function(name, opts) { 55 | opts = getOptions(opts); 56 | 57 | this.set(name, '', {expires: -1}); 58 | } 59 | }; 60 | })(); 61 | -------------------------------------------------------------------------------- /static/lib/handlebars.runtime.js: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2011 by Yehuda Katz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | */ 24 | 25 | // lib/handlebars/browser-prefix.js 26 | var Handlebars = {}; 27 | 28 | (function(Handlebars, undefined) { 29 | ; 30 | // lib/handlebars/base.js 31 | 32 | Handlebars.VERSION = "1.0.0"; 33 | Handlebars.COMPILER_REVISION = 4; 34 | 35 | Handlebars.REVISION_CHANGES = { 36 | 1: '<= 1.0.rc.2', // 1.0.rc.2 is actually rev2 but doesn't report it 37 | 2: '== 1.0.0-rc.3', 38 | 3: '== 1.0.0-rc.4', 39 | 4: '>= 1.0.0' 40 | }; 41 | 42 | Handlebars.helpers = {}; 43 | Handlebars.partials = {}; 44 | 45 | var toString = Object.prototype.toString, 46 | functionType = '[object Function]', 47 | objectType = '[object Object]'; 48 | 49 | Handlebars.registerHelper = function(name, fn, inverse) { 50 | if (toString.call(name) === objectType) { 51 | if (inverse || fn) { throw new Handlebars.Exception('Arg not supported with multiple helpers'); } 52 | Handlebars.Utils.extend(this.helpers, name); 53 | } else { 54 | if (inverse) { fn.not = inverse; } 55 | this.helpers[name] = fn; 56 | } 57 | }; 58 | 59 | Handlebars.registerPartial = function(name, str) { 60 | if (toString.call(name) === objectType) { 61 | Handlebars.Utils.extend(this.partials, name); 62 | } else { 63 | this.partials[name] = str; 64 | } 65 | }; 66 | 67 | Handlebars.registerHelper('helperMissing', function(arg) { 68 | if(arguments.length === 2) { 69 | return undefined; 70 | } else { 71 | throw new Error("Missing helper: '" + arg + "'"); 72 | } 73 | }); 74 | 75 | Handlebars.registerHelper('blockHelperMissing', function(context, options) { 76 | var inverse = options.inverse || function() {}, fn = options.fn; 77 | 78 | var type = toString.call(context); 79 | 80 | if(type === functionType) { context = context.call(this); } 81 | 82 | if(context === true) { 83 | return fn(this); 84 | } else if(context === false || context == null) { 85 | return inverse(this); 86 | } else if(type === "[object Array]") { 87 | if(context.length > 0) { 88 | return Handlebars.helpers.each(context, options); 89 | } else { 90 | return inverse(this); 91 | } 92 | } else { 93 | return fn(context); 94 | } 95 | }); 96 | 97 | Handlebars.K = function() {}; 98 | 99 | Handlebars.createFrame = Object.create || function(object) { 100 | Handlebars.K.prototype = object; 101 | var obj = new Handlebars.K(); 102 | Handlebars.K.prototype = null; 103 | return obj; 104 | }; 105 | 106 | Handlebars.logger = { 107 | DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3, level: 3, 108 | 109 | methodMap: {0: 'debug', 1: 'info', 2: 'warn', 3: 'error'}, 110 | 111 | // can be overridden in the host environment 112 | log: function(level, obj) { 113 | if (Handlebars.logger.level <= level) { 114 | var method = Handlebars.logger.methodMap[level]; 115 | if (typeof console !== 'undefined' && console[method]) { 116 | console[method].call(console, obj); 117 | } 118 | } 119 | } 120 | }; 121 | 122 | Handlebars.log = function(level, obj) { Handlebars.logger.log(level, obj); }; 123 | 124 | Handlebars.registerHelper('each', function(context, options) { 125 | var fn = options.fn, inverse = options.inverse; 126 | var i = 0, ret = "", data; 127 | 128 | var type = toString.call(context); 129 | if(type === functionType) { context = context.call(this); } 130 | 131 | if (options.data) { 132 | data = Handlebars.createFrame(options.data); 133 | } 134 | 135 | if(context && typeof context === 'object') { 136 | if(context instanceof Array){ 137 | for(var j = context.length; i": ">", 212 | '"': """, 213 | "'": "'", 214 | "`": "`" 215 | }; 216 | 217 | var badChars = /[&<>"'`]/g; 218 | var possible = /[&<>"'`]/; 219 | 220 | var 
escapeChar = function(chr) { 221 | return escape[chr] || "&"; 222 | }; 223 | 224 | Handlebars.Utils = { 225 | extend: function(obj, value) { 226 | for(var key in value) { 227 | if(value.hasOwnProperty(key)) { 228 | obj[key] = value[key]; 229 | } 230 | } 231 | }, 232 | 233 | escapeExpression: function(string) { 234 | // don't escape SafeStrings, since they're already safe 235 | if (string instanceof Handlebars.SafeString) { 236 | return string.toString(); 237 | } else if (string == null || string === false) { 238 | return ""; 239 | } 240 | 241 | // Force a string conversion as this will be done by the append regardless and 242 | // the regex test will do this transparently behind the scenes, causing issues if 243 | // an object's to string has escaped characters in it. 244 | string = string.toString(); 245 | 246 | if(!possible.test(string)) { return string; } 247 | return string.replace(badChars, escapeChar); 248 | }, 249 | 250 | isEmpty: function(value) { 251 | if (!value && value !== 0) { 252 | return true; 253 | } else if(toString.call(value) === "[object Array]" && value.length === 0) { 254 | return true; 255 | } else { 256 | return false; 257 | } 258 | } 259 | }; 260 | ; 261 | // lib/handlebars/runtime.js 262 | 263 | Handlebars.VM = { 264 | template: function(templateSpec) { 265 | // Just add water 266 | var container = { 267 | escapeExpression: Handlebars.Utils.escapeExpression, 268 | invokePartial: Handlebars.VM.invokePartial, 269 | programs: [], 270 | program: function(i, fn, data) { 271 | var programWrapper = this.programs[i]; 272 | if(data) { 273 | programWrapper = Handlebars.VM.program(i, fn, data); 274 | } else if (!programWrapper) { 275 | programWrapper = this.programs[i] = Handlebars.VM.program(i, fn); 276 | } 277 | return programWrapper; 278 | }, 279 | merge: function(param, common) { 280 | var ret = param || common; 281 | 282 | if (param && common) { 283 | ret = {}; 284 | Handlebars.Utils.extend(ret, common); 285 | Handlebars.Utils.extend(ret, param); 286 | } 287 | return ret; 288 | }, 289 | programWithDepth: Handlebars.VM.programWithDepth, 290 | noop: Handlebars.VM.noop, 291 | compilerInfo: null 292 | }; 293 | 294 | return function(context, options) { 295 | options = options || {}; 296 | var result = templateSpec.call(container, Handlebars, context, options.helpers, options.partials, options.data); 297 | 298 | var compilerInfo = container.compilerInfo || [], 299 | compilerRevision = compilerInfo[0] || 1, 300 | currentRevision = Handlebars.COMPILER_REVISION; 301 | 302 | if (compilerRevision !== currentRevision) { 303 | if (compilerRevision < currentRevision) { 304 | var runtimeVersions = Handlebars.REVISION_CHANGES[currentRevision], 305 | compilerVersions = Handlebars.REVISION_CHANGES[compilerRevision]; 306 | throw "Template was precompiled with an older version of Handlebars than the current runtime. "+ 307 | "Please update your precompiler to a newer version ("+runtimeVersions+") or downgrade your runtime to an older version ("+compilerVersions+")."; 308 | } else { 309 | // Use the embedded version info since the runtime doesn't know about this revision yet 310 | throw "Template was precompiled with a newer version of Handlebars than the current runtime. 
"+ 311 | "Please update your runtime to a newer version ("+compilerInfo[1]+")."; 312 | } 313 | } 314 | 315 | return result; 316 | }; 317 | }, 318 | 319 | programWithDepth: function(i, fn, data /*, $depth */) { 320 | var args = Array.prototype.slice.call(arguments, 3); 321 | 322 | var program = function(context, options) { 323 | options = options || {}; 324 | 325 | return fn.apply(this, [context, options.data || data].concat(args)); 326 | }; 327 | program.program = i; 328 | program.depth = args.length; 329 | return program; 330 | }, 331 | program: function(i, fn, data) { 332 | var program = function(context, options) { 333 | options = options || {}; 334 | 335 | return fn(context, options.data || data); 336 | }; 337 | program.program = i; 338 | program.depth = 0; 339 | return program; 340 | }, 341 | noop: function() { return ""; }, 342 | invokePartial: function(partial, name, context, helpers, partials, data) { 343 | var options = { helpers: helpers, partials: partials, data: data }; 344 | 345 | if(partial === undefined) { 346 | throw new Handlebars.Exception("The partial " + name + " could not be found"); 347 | } else if(partial instanceof Function) { 348 | return partial(context, options); 349 | } else if (!Handlebars.compile) { 350 | throw new Handlebars.Exception("The partial " + name + " could not be compiled when running in runtime-only mode"); 351 | } else { 352 | partials[name] = Handlebars.compile(partial, {data: data !== undefined}); 353 | return partials[name](context, options); 354 | } 355 | } 356 | }; 357 | 358 | Handlebars.template = Handlebars.VM.template; 359 | ; 360 | // lib/handlebars/browser-suffix.js 361 | })(Handlebars); 362 | ; 363 | -------------------------------------------------------------------------------- /static/lib/templating.js: -------------------------------------------------------------------------------- 1 | // Copyright 2013, Christopher Brown , MIT Licensed 2 | // https://github.com/chbrown/misc-js :: templating.js 3 | "use strict"; /*jslint indent: 2 */ /*globals _, $, Backbone, Handlebars */ 4 | 5 | // Templates debug / caching. E.g.: 6 | // new TemplateManager({ 7 | // cache: window.DEBUG ? Handlebars.templates : {}, 8 | // url: '/templates/', 9 | // extension: '.bars', 10 | // compile: Handlebars.compile 11 | // }); 12 | function TemplateManager(opts) { 13 | this.cache = opts.cache || {}; // optional, defaults to {} 14 | this.url = opts.url || '/'; // where to look for templates 15 | this.extension = opts.extension || ''; // append to given template names 16 | this.querystring = opts.querystring || '?t=' + (new Date()).getTime(); // aka., url.search 17 | this.compile = opts.compile || null; // what to use for cache misses 18 | } 19 | TemplateManager.prototype.render = function(template_name, context) { 20 | // synchronous; returns html. 21 | var self = this; 22 | var cached_template = this.cache[template_name]; 23 | // only cache once per page load 24 | if (!cached_template) { 25 | $.ajax({ 26 | url: this.url + template_name + this.extension + this.querystring, 27 | async: false, 28 | success: function(template_src) { 29 | cached_template = self.compile(template_src); 30 | } 31 | }); 32 | // yes, the above *will* execute synchronously! 33 | this.cache[template_name] = cached_template; 34 | } 35 | return cached_template(context); 36 | }; 37 | 38 | // handlebars_manager is a global this file offers. requires `Handlebars` to be loaded. 39 | var HandlebarsTemplates = new TemplateManager({ 40 | cache: window.DEBUG ? 
Handlebars.templates : {}, 41 | url: '/templates/', 42 | extension: '.bars', 43 | compile: Handlebars.compile 44 | }); 45 | var Templates = HandlebarsTemplates; 46 | 47 | // requires Backbone and handlebars_manager 48 | var TemplatedView = Backbone.View.extend({ 49 | // TemplatedView has hooks called (pre|post)(Initialize|Render), each of which take a context 50 | // that context is just the model and whatever options the view is initialized with. 51 | // preInitialize, postInitialize, preRender, postRender 52 | initialize: function(opts) { 53 | // prePreRender 54 | var ctx = _.extend(this.model ? this.model.toJSON() : {}, opts); 55 | if (this.preInitialize) this.preInitialize(ctx); 56 | this.render(ctx); 57 | if (this.postInitialize) this.postInitialize(ctx); 58 | }, 59 | render: function(ctx) { 60 | if (this.preRender) this.preRender(ctx); 61 | this.el.innerHTML = Templates.render(this.template, ctx); 62 | if (this.postRender) this.postRender(ctx); 63 | if (ctx.replace) { 64 | // if .replace is given, it's the parent node that this new view should 65 | // attach to, replacing the old contents. 66 | ctx.replace.replaceWith(this.$el); 67 | } 68 | return this; 69 | } 70 | }); 71 | 72 | var TemplatedCollection = Backbone.Collection.extend({ 73 | renderTo: function($el, View) { 74 | var fragment = document.createDocumentFragment(); 75 | this.each(function(model) { 76 | var ctx = model.toJSON(); 77 | ctx.model = model; 78 | var view = new View(ctx); 79 | fragment.appendChild(view.el); 80 | }); 81 | $el.append(fragment); 82 | } 83 | }); 84 | -------------------------------------------------------------------------------- /static/master.css: -------------------------------------------------------------------------------- 1 | html{height:100%} 2 | body{font-family:'Helvetica Neue',Helvetica,Arial;font-weight:200;height:100%;margin:0;padding:0;box-sizing:border-box} 3 | h1,h2,h3,h4,h5,h6{margin:.25em 0 .5em} 4 | p{margin:.5em 0} 5 | label span{font-weight:bold} 6 | table{border-collapse:collapse}table td{padding:0 8px 2px 0;vertical-align:top;font-size:90%;white-space:nowrap} 7 | table.valign td{vertical-align:middle} 8 | table.gloss tr:first-child td{font-weight:bold} 9 | table.gloss td,table.gloss th{text-align:left;border:1px dotted #ccc;padding:1px 3px;word-break:normal;white-space:normal;max-width:140px} 10 | button{margin:10px 0} 11 | textarea{display:block} 12 | .control{margin:10px;float:left} 13 | .content{padding:1em} 14 | -------------------------------------------------------------------------------- /static/master.less: -------------------------------------------------------------------------------- 1 | html { 2 | height: 100%; 3 | } 4 | body { 5 | font-family: 'Helvetica Neue', Helvetica, Arial; 6 | font-weight: 200; 7 | 8 | height: 100%; 9 | margin: 0; 10 | padding: 0; 11 | box-sizing: border-box; 12 | } 13 | h1, h2, h3, h4, h5, h6 { 14 | margin: 0.25em 0 0.5em; 15 | } 16 | p { 17 | margin: 0.5em 0; 18 | } 19 | label span { 20 | font-weight: bold; 21 | } 22 | table { 23 | border-collapse: collapse; 24 | td { 25 | padding: 0 8px 2px 0; 26 | vertical-align: top; 27 | font-size: 90%; 28 | white-space: nowrap; 29 | } 30 | &.valign td { 31 | vertical-align: middle; 32 | } 33 | &.gloss { 34 | tr:first-child td { 35 | font-weight: bold; 36 | } 37 | td, th { 38 | text-align: left; 39 | border: 1px dotted #CCC; 40 | padding: 1px 3px; 41 | word-break: normal; 42 | white-space: normal; 43 | max-width: 140px; 44 | } 45 | } 46 | } 47 | button { 48 | margin: 10px 0; 49 | } 50 | textarea { 51 | 
display: block; 52 | } 53 | .control { 54 | margin: 10px; 55 | float: left; 56 | } 57 | .content { 58 | padding: 1em; 59 | } 60 | -------------------------------------------------------------------------------- /templates/code.mako: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | ${body | n}
6 | 
7 | -------------------------------------------------------------------------------- /templates/crf.mako: -------------------------------------------------------------------------------- 1 | <%inherit file="layout.mako" /> 2 | 3 |
4 |
5 | 6 |

Tagger demonstration

7 |
8 | 9 |
10 | 11 | 12 |
13 | 14 |
15 |
16 | 17 | 73 | -------------------------------------------------------------------------------- /templates/gloss.bars: -------------------------------------------------------------------------------- 1 |

Text

2 |

{{text}}

3 | 4 |

Alignments

5 | 6 | 7 | {{#each sequences}} 8 | 9 | 10 | {{#each values}} 11 | 12 | {{/each}} 13 | 14 | {{/each}} 15 | 16 |
{{name}}{{.}}
17 | -------------------------------------------------------------------------------- /templates/layout.mako: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | ${next.body()} 13 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Tests 2 | 3 | Tweedr uses [nose](http://nose.readthedocs.org/) as the test runner. 4 | 5 | Tests can be disabled on Travis CI by putting "no_ci" into the test name. 6 | 7 | There are three ways to run tests (assuming you have `nose` installed), all of which must be called from the package root directory. 8 | 9 | 1. `nosetests` 10 | 2. `python setup.py test` 11 | 3. `python setup.py nosetests` 12 | 13 | Travis CI uses the last of these because it's the only one that automatically installs packages from `tests_require` in setup.py as well as allows setting command line options (it uses `-e no_ci` to exclude tests with "no_ci" in their name). 14 | -------------------------------------------------------------------------------- /tests/test_codebase.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tweedr 4 | from tweedr.lib import walk 5 | 6 | source_endings = ('.py', '.bars', '.js', '.md', '.txt', '.mako', '.yml', '.less', '.json', '.css') 7 | 8 | 9 | def not_egg(filepath): 10 | return '.egg' not in filepath 11 | 12 | 13 | def not_git(filepath): 14 | return '/.git/' not in filepath 15 | 16 | 17 | def not_static(filepath): 18 | return '/static/lib/' not in filepath 19 | 20 | 21 | def is_source(filepath): 22 | return filepath.endswith(source_endings) 23 | 24 | 25 | def is_python(filepath): 26 | return filepath.endswith('.py') 27 | 28 | 29 | def test_pep8(): 30 | '''Running PEP-8 checks recursively in %s''' % tweedr.root 31 | import pep8 32 | ignore = [ 33 | 'E128', # E128 continuation line under-indented for visual indent 34 | 'E501', # E501 line too long (?? 
> 79 characters) 35 | ] 36 | total_errors = 0 37 | pep8style = pep8.StyleGuide(ignore=ignore) 38 | for filepath in walk(tweedr.root, not_egg, not_git, is_python): 39 | total_errors += pep8style.check_files([filepath]).total_errors 40 | 41 | assert total_errors == 0, 'Codebase does not pass PEP-8 (%d errors)' % total_errors 42 | 43 | 44 | def test_pyflakes(): 45 | '''Running pyflakes checks recursively in %s''' % tweedr.root 46 | from pyflakes import api as pyflakes 47 | total_errors = 0 48 | for filepath in walk(tweedr.root, not_egg, not_git, is_python): 49 | total_errors += pyflakes.checkPath(filepath) 50 | 51 | assert total_errors == 0, 'Codebase does not pass pyflakes (%d errors)' % total_errors 52 | 53 | 54 | def test_trailing_whitespace(): 55 | '''Running trailing whitespace checks recursively in %s''' % tweedr.root 56 | total_errors = 0 57 | for filepath in walk(tweedr.root, not_egg, not_git, not_static, is_source): 58 | with open(filepath) as fp: 59 | for line_i, raw in enumerate(fp): 60 | line = raw.rstrip('\n') 61 | if line.endswith((' ', '\t')): 62 | print >> sys.stdout, '%s:%d: trailing whitespace' % (filepath, line_i + 1) 63 | total_errors += 1 64 | 65 | assert total_errors == 0, 'Codebase has trailing whitespace (%d errors)' % total_errors 66 | 67 | 68 | def test_mysql_credentials_no_ci(): 69 | names = ['MYSQL_PASS', 'MYSQL_HOST'] 70 | values = [os.environ[name] for name in names] 71 | 72 | for filepath in walk(tweedr.root): 73 | with open(filepath) as fp: 74 | contents = fp.read() 75 | for value in values: 76 | assert value not in contents, 'Found a blacklisted credential (%s) in %s' % (value, filepath) 77 | -------------------------------------------------------------------------------- /tests/test_libraries.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def test_mysql_python_import_no_ci(): 5 | import MySQLdb 6 | assert MySQLdb is not None, 'MySQLdb should not be None.' 
7 | 8 | 9 | def test_mysql_python_no_ci(): 10 | import MySQLdb 11 | connection = MySQLdb.connect( 12 | os.environ['MYSQL_HOST'], 13 | os.environ['MYSQL_USER'], 14 | os.environ['MYSQL_PASS'], 15 | os.environ['MYSQL_DATABASE']) 16 | cursor = connection.cursor() 17 | 18 | # test version 19 | version_query = 'SELECT VERSION()' 20 | cursor.execute(version_query) 21 | version_result = cursor.fetchone()[0] 22 | assert version_result.split('.')[0] == '5', 'MySQL major version must equal 5' 23 | 24 | # test schema 25 | tables_query = 'SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA = "QCRI"' 26 | cursor.execute(tables_query) 27 | tables_result = [result[0] for result in cursor.fetchall()] 28 | table_names = ['tokenized_labels', 'tweets'] 29 | for table_name in table_names: 30 | assert table_name in tables_result, 'The table "%s" was not found in the database' % table_name 31 | 32 | connection.close() 33 | 34 | 35 | def test_sqlalchemy_no_ci(): 36 | from sqlalchemy import create_engine, MetaData 37 | 38 | connection_string = 'mysql+mysqldb://%(MYSQL_USER)s:%(MYSQL_PASS)s@%(MYSQL_HOST)s/%(MYSQL_DATABASE)s' % os.environ 39 | engine = create_engine(connection_string, convert_unicode=True) 40 | 41 | metadata = MetaData(bind=engine) 42 | metadata.reflect() 43 | 44 | table_names = ['DamageClassification', 'labels'] 45 | for table_name in table_names: 46 | assert table_name in metadata.tables, 'The table "%s" was not found in SqlAlchemy reflection results' % table_name 47 | 48 | 49 | def test_tweedr_models_no_ci(): 50 | from tweedr.models import DBSession, TokenizedLabel, Label 51 | 52 | Tables = [TokenizedLabel, Label] 53 | for Table in Tables: 54 | row_count = DBSession.query(Table).count() 55 | assert row_count > 0, 'There should be more than 0 rows in the table "%s"' % Table.name 56 | -------------------------------------------------------------------------------- /tools/git-hooks/README.md: -------------------------------------------------------------------------------- 1 | ## git hooks 2 | 3 | `pre-commit` requires all tests to pass before you commit. 4 | 5 | Here's the entire file: 6 | 7 | ```bash 8 | #!/bin/sh 9 | cd $(dirname $GIT_DIR) 10 | python setup.py test 11 | ``` 12 | 13 | Install: 14 | 15 | ```bash 16 | cd tweedr 17 | cp tools/git-hooks/pre-commit .git/hooks/pre-commit 18 | ``` 19 | -------------------------------------------------------------------------------- /tools/git-hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd $(dirname $GIT_DIR) 3 | python setup.py test 4 | -------------------------------------------------------------------------------- /tweedr/README.md: -------------------------------------------------------------------------------- 1 | ## Tweedr Python package 2 | 3 | * [`api/`](api) contains the main "pipeline" command line tool 4 | * [`corpora/`](corpora) contains scripts for reading corpora into predictable data structures from various source formats. 5 | * [`emr/`](emr) contains scripts for running jobs on Elastic Map Reduce 6 | * [`lib/`](lib) holds miscellaneous helpers or basic text manipulation tools. 7 | * [`ml/`](ml) contains all the machine learning and natural language processing tools. 8 | * [`models/`](models) holds the database schema and relationship definitions. 9 | * [`ui/`](ui) contains the web application. 10 | * [`__init__.py`](__init__.py) contains extensive log configuration. 
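To make that last point concrete, here is a minimal sketch of what the log configuration gives you once `tweedr` is imported. The logger name is invented for the example; `tweedr.SILLY`, the colored stderr handler, and the `silly()` method all come from `tweedr/__init__.py`.

```python
# Sketch: using the logging setup from tweedr/__init__.py.
# The logger name 'tweedr.example' is made up for illustration.
import logging

import tweedr  # importing installs the color formatter and the TweedrLogger class

logger = logging.getLogger('tweedr.example')
logger.info('rendered in color on stderr')

# SILLY (level 5) sits below DEBUG, so lower the level to see those messages.
logger.setLevel(tweedr.SILLY)
logger.silly('very low-priority trace output')
```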
11 | 12 | ## Use 13 | 14 | After installing, `tweedr` can be used as a Python package: 15 | 16 | import tweedr 17 | print tweedr.__version__ 18 | -------------------------------------------------------------------------------- /tweedr/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import logging 5 | from colorama import Fore, Back, Style 6 | 7 | # just resolve this file in the context of the current working directory 8 | # and find the parent of its directory 9 | root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 10 | with open(os.path.join(root, 'package.json')) as fd: 11 | package = json.load(fd) 12 | __version__ = str(package['version']) 13 | 14 | # add SILLY loglevel (above notset=0, below debug=10) 15 | SILLY = 5 16 | logging.addLevelName(SILLY, 'SILLY') 17 | 18 | 19 | class ColorFormatter(logging.Formatter): 20 | # colors: https://pypi.python.org/pypi/colorama 21 | thresholds = [ 22 | (logging.CRITICAL, (Back.RED, Back.RESET)), 23 | (logging.ERROR, (Fore.RED, Fore.RESET)), 24 | (logging.WARNING, (Back.YELLOW + Fore.BLACK, Back.RESET + Fore.RESET)), 25 | (logging.INFO, (Fore.CYAN, Fore.RESET)), 26 | (logging.DEBUG, (Fore.GREEN, Fore.RESET)), 27 | (SILLY, (Style.DIM, Style.NORMAL)), 28 | (logging.NOTSET, ('', '')), 29 | ] 30 | 31 | def format(self, record): 32 | result = super(ColorFormatter, self).format(record) 33 | 34 | for threshold, (prefix, postfix) in self.thresholds: 35 | if record.levelno >= threshold: 36 | break 37 | return prefix + result + postfix 38 | 39 | 40 | class TweedrLogger(logging.Logger): 41 | # def __init__(self, name, **kw): 42 | # super(TweedrLogger, self).__init__(name, **kw) 43 | 44 | def silly(self, msg, *args, **kwargs): 45 | if self.isEnabledFor(SILLY): 46 | self._log(SILLY, msg, args, **kwargs) 47 | 48 | def notset(self, msg, *args, **kwargs): 49 | if self.isEnabledFor(logging.NOTSET): 50 | self._log(logging.NOTSET, msg, args, **kwargs) 51 | 52 | def __repr__(self): 53 | return '<%s name=%s level=%d (effective=%d) parent=%s disabled=%d>' % (self.__class__.__name__, 54 | self.name, self.level, self.getEffectiveLevel(), self.parent, self.disabled) 55 | 56 | 57 | # the following 5 lines replace logging.basicConfig(level=default_level) 58 | # very similar effect, but with a color formatter. 59 | handler = logging.StreamHandler(sys.stderr) 60 | color_formatter = ColorFormatter(fmt='%(levelname)s:%(name)s:%(message)s') 61 | handler.setFormatter(color_formatter) 62 | logging.root.addHandler(handler) 63 | logging.root.setLevel(logging.DEBUG) 64 | 65 | logging.setLoggerClass(TweedrLogger) 66 | 67 | logger = logging.getLogger(__name__) 68 | -------------------------------------------------------------------------------- /tweedr/api/README.md: -------------------------------------------------------------------------------- 1 | ## API 2 | 3 | 4 | ### Instructions for `pipeline.py` 5 | 6 | Let's say your tweets are gzipped json files in `~/corpora/qcri/gnip_tweets/samoa/`: 7 | 8 | cat ~/corpora/qcri/gnip_tweets/samoa/*.json.gz | gunzip | tweedr-pipeline 9 | 10 | * `tweedr-pipeline` is simply an alias for `tweedr.cli.pipeline.main()` 11 | 12 | 13 | ### More examples 14 | 15 | Ignore the hapaxlegomena: 16 | 17 | ... | tweedr-pipeline | json -C count text | grep -v $'1\t' 18 | 19 | Compare bloomfilter's exact matches with simhash: 20 | 21 | ... 
| tweedr-pipeline | json -C count fuzzy_count fuzzy_votes 22 | -------------------------------------------------------------------------------- /tweedr/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/api/__init__.py -------------------------------------------------------------------------------- /tweedr/api/mappers/__init__.py: -------------------------------------------------------------------------------- 1 | from tweedr.api.protocols import DictProtocol 2 | 3 | 4 | class Mapper(object): 5 | '''Passthrough / interface''' 6 | INPUT = DictProtocol 7 | OUTPUT = DictProtocol 8 | 9 | def __call__(self, dict_): 10 | return dict_ 11 | -------------------------------------------------------------------------------- /tweedr/api/mappers/basic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | from tweedr.lib.text import whitespace_unicode_translations 5 | from tweedr.api.mappers import Mapper 6 | from tweedr.api.protocols import StringProtocol, DictProtocol, TweetDictProtocol 7 | 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class EmptyLineFilter(Mapper): 13 | INPUT = StringProtocol 14 | OUTPUT = StringProtocol 15 | 16 | def __call__(self, line): 17 | # ignore empty lines 18 | stripped_line = line.strip() 19 | if stripped_line: 20 | return stripped_line 21 | 22 | 23 | class JSONParser(Mapper): 24 | INPUT = StringProtocol 25 | OUTPUT = DictProtocol 26 | 27 | def __call__(self, line): 28 | try: 29 | return json.loads(line) 30 | except ValueError: 31 | logger.critical('Could not parse JSON: %s', line) 32 | raise 33 | 34 | 35 | class IgnoreMetadata(Mapper): 36 | INPUT = DictProtocol 37 | OUTPUT = DictProtocol 38 | 39 | def __call__(self, dict_): 40 | if 'info' not in dict_: 41 | return dict_ 42 | 43 | 44 | class TweetStandardizer(Mapper): 45 | '''Ensures that a given dict being mapped through the pipeline has basic 46 | fields that come with every tweet, coalescing them into predictable names. 47 | 48 | This is necessary because different sources of tweets (e.g., raw Twitter, 49 | GNIP) name this information a variety of different things. 50 | 51 | The fields are: 52 | 53 | * `text` Unicode The tweet's textual content 54 | * `author` Unicode The Twitter screen name of the tweet's author 55 | * `id_str` Unicode The tweet's identifying snowflake, as assigned by 56 | Twitter when originally posted. 57 | 58 | See `TweetDictProtocol`'s documentation for more details. 59 | ''' 60 | INPUT = DictProtocol 61 | OUTPUT = TweetDictProtocol 62 | 63 | def __call__(self, dict_): 64 | # ensure text. different sources call it different things. 
65 | if 'text' in dict_: 66 | dict_['text'] = dict_['text'].translate(whitespace_unicode_translations) 67 | elif 'body' in dict_: 68 | dict_['text'] = dict_.pop('body').translate(whitespace_unicode_translations) 69 | else: 70 | logger.critical('Could not find text field in %s', dict_) 71 | raise KeyError("'text' | 'body'") 72 | 73 | # ensure author 74 | if 'actor' in dict_: 75 | dict_['author'] = dict_['actor']['preferredUsername'] 76 | elif 'user' in dict_: 77 | dict_['author'] = dict_['user']['screen_name'] 78 | else: 79 | logger.critical('Could not find author field in %s', dict_) 80 | raise KeyError("'actor.preferredUsername' | 'user.screen_name'") 81 | 82 | # ensure id 83 | if 'id_str' in dict_: 84 | dict_['id'] = dict_['id_str'] 85 | else: 86 | dict_['id'] = dict_['id'].split(':')[-1] 87 | 88 | return dict_ 89 | 90 | 91 | class LineStream(Mapper): 92 | INPUT = DictProtocol 93 | OUTPUT = None 94 | 95 | def __init__(self, stream): 96 | self.stream = sys.stdout 97 | 98 | def __call__(self, dict_): 99 | json.dump(dict_, self.stream) 100 | self.stream.write(os.linesep) 101 | # flush might be unnecessary in production 102 | self.stream.flush() 103 | -------------------------------------------------------------------------------- /tweedr/api/mappers/ml.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction, pipeline 2 | from tweedr.lib.text import token_re 3 | from tweedr.ml.features import featurize, characters, lexicons, ngrams # , nlp 4 | from tweedr.api.mappers import Mapper 5 | from tweedr.api.protocols import TweetDictProtocol 6 | 7 | import logging 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CorpusClassifier(Mapper): 12 | INPUT = TweetDictProtocol 13 | OUTPUT = TweetDictProtocol 14 | 15 | feature_functions = [ 16 | ngrams.unigrams, 17 | characters.plural, 18 | lexicons.is_transportation, 19 | lexicons.is_building, 20 | characters.capitalized, 21 | characters.numeric, 22 | ngrams.unique, 23 | lexicons.hypernyms, 24 | # nlp.pos_tags, 25 | ] 26 | 27 | def tokenizer(self, text): 28 | tokens = token_re.findall(text) 29 | tokens_features = featurize(tokens, self.feature_functions) 30 | for token_features in tokens_features: 31 | for feature in token_features: 32 | yield feature 33 | 34 | def __init__(self, datasource, classifier): 35 | logger.info('Training %s on %s', classifier.__class__.__name__, datasource.__class__.__name__) 36 | 37 | # datasource yields (label, text) pairs 38 | y, X = zip(*datasource) 39 | 40 | self.name = datasource.__class__.__name__ + ':' + classifier.__class__.__name__ 41 | self.pipeline = pipeline.Pipeline([ 42 | ('dictionary', feature_extraction.text.CountVectorizer(tokenizer=self.tokenizer)), 43 | ('tfidf', feature_extraction.text.TfidfTransformer()), 44 | ('classifier', classifier), 45 | ]) 46 | 47 | self.pipeline.fit(X, y) 48 | 49 | def __call__(self, tweet): 50 | text = tweet['text'] 51 | y = self.pipeline.predict([text])[0] 52 | 53 | if 'classification' not in tweet: 54 | tweet['classification'] = [] 55 | 56 | tweet['classification'].append({ 57 | 'name': self.name, 58 | 'label': y, 59 | }) 60 | 61 | return tweet 62 | -------------------------------------------------------------------------------- /tweedr/api/mappers/nlp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import itertools 3 | import requests 4 | from tweedr.api.mappers import Mapper 5 | from tweedr.api.protocols import TweetDictProtocol 6 | from 
tweedr.lib.text import token_re, zip_boundaries 7 | from tweedr.ml.features import featurize, characters, dbpedia, lexicons, ngrams 8 | from tweedr.ark.java import TwitterNLP 9 | from tweedr.ml.crf.classifier import CRF 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class POSTagger(Mapper): 16 | INPUT = TweetDictProtocol 17 | OUTPUT = TweetDictProtocol 18 | 19 | def __init__(self): 20 | self.tagger = TwitterNLP() 21 | 22 | def __call__(self, tweet): 23 | '''Enhances the input tweet with POS tags, using only the tweet["text"] value: 24 | 25 | { 26 | ... 27 | "tokens": "@Donnie I hear ya and I hate earthquakes in Cali too ! But I still love living in LA ! :)", 28 | "pos": "@ O V O & O V N P ^ R , & O R V V P ^ ,", 29 | ... 30 | } 31 | 32 | The `tokens` and `pos` values can be split on whitespace to get equal-length lists of strings. 33 | ''' 34 | tokens, pos_tags = self.tagger.tokenize_and_tag(tweet['text']) 35 | tweet['tokens'] = tokens 36 | tweet['pos'] = pos_tags 37 | return tweet 38 | 39 | 40 | class SequenceTagger(Mapper): 41 | INPUT = TweetDictProtocol 42 | OUTPUT = TweetDictProtocol 43 | 44 | feature_functions = [ 45 | ngrams.unigrams, 46 | characters.plural, 47 | lexicons.is_transportation, 48 | lexicons.is_building, 49 | characters.capitalized, 50 | characters.numeric, 51 | ngrams.unique, 52 | lexicons.hypernyms, 53 | dbpedia.spotlight, 54 | ] 55 | 56 | def __init__(self): 57 | self.crf = CRF.default(self.feature_functions) 58 | logger.info('SequenceTagger initialized') 59 | 60 | def __call__(self, tweet): 61 | text = tweet['text'] 62 | tokens = token_re.findall(text) 63 | 64 | # tokens_features = map(list, featurize(tokens, crf_feature_functions)) 65 | tokens_features = featurize(tokens, self.feature_functions) 66 | 67 | null_label = 'None' 68 | labels = self.crf.predict([tokens_features])[0] 69 | # tweet['labels'] = labels 70 | 71 | if 'sequences' not in tweet: 72 | tweet['sequences'] = [] 73 | 74 | for sequence_label, entries in itertools.groupby(zip_boundaries(labels), lambda tup: tup[0]): 75 | if sequence_label != null_label: 76 | labels, starts, ends = zip(*entries) 77 | 78 | tweet['sequences'].append({ 79 | 'text': sequence_label, 80 | 'start': starts[0], 81 | 'end': ends[-1], 82 | }) 83 | 84 | return tweet 85 | 86 | 87 | class DBpediaSpotter(Mapper): 88 | INPUT = TweetDictProtocol 89 | OUTPUT = TweetDictProtocol 90 | 91 | def __init__(self, confidence=0.1, support=10): 92 | self.annotate_url = '%s/rest/annotate' % os.environ.get('SPOTLIGHT', 'http://spotlight.sztaki.hu:2222') 93 | self.confidence = confidence 94 | self.support = support 95 | logger.info('DBpediaSpotter initialized') 96 | 97 | def __call__(self, tweet): 98 | text = tweet['text'] 99 | 100 | if 'dbpedia' not in tweet: 101 | tweet['dbpedia'] = [] 102 | 103 | r = requests.post(self.annotate_url, 104 | headers=dict(Accept='application/json'), 105 | data=dict(text=text, confidence=self.confidence, support=self.support)) 106 | Resources = r.json().get('Resources', []) 107 | 108 | for Resource in Resources: 109 | start = int(Resource['@offset']) 110 | surface_form = Resource['@surfaceForm'] 111 | types = Resource['@types'] 112 | 113 | dbpedia_resource = { 114 | 'text': surface_form, 115 | 'start': start, 116 | 'end': start + len(surface_form), 117 | 'uri': Resource['@URI'], 118 | 'types': types.split(',') if types else [], 119 | } 120 | 121 | tweet['dbpedia'].append(dbpedia_resource) 122 | 123 | return tweet 124 | 
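The mappers above are meant to be chained; a rough usage sketch (the sample tweet dict is invented, and running it assumes the ARK TwitterNLP jar has been downloaded to `ext/` and that a DBpedia Spotlight endpoint is reachable):

```python
from tweedr.api.pipeline import Pipeline
from tweedr.api.mappers import basic, nlp

# standardize the raw dict, then add POS tags and DBpedia annotations
tag_and_spot = Pipeline(basic.TweetStandardizer(), nlp.POSTagger(), nlp.DBpediaSpotter())

tweet = tag_and_spot({
    'text': u'Bridge damaged near Christchurch after the earthquake',
    'user': {'screen_name': 'example_user'},  # invented; only screen_name is read
    'id_str': '0',
})
print tweet['pos']      # whitespace-separated POS tags from TwitterNLP
print tweet['dbpedia']  # DBpedia resources spotted in the text (possibly empty)
```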
-------------------------------------------------------------------------------- /tweedr/api/mappers/similar.py: -------------------------------------------------------------------------------- 1 | from tweedr.api.mappers import Mapper 2 | from tweedr.api.protocols import TweetDictProtocol 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | # for the bloomfilter 8 | import tempfile 9 | import pybloomfilter 10 | 11 | # for simhashing 12 | from hashes.simhash import simhash 13 | 14 | 15 | class TextCounter(Mapper): 16 | INPUT = TweetDictProtocol 17 | OUTPUT = TweetDictProtocol 18 | 19 | def __init__(self): 20 | # Use an in-memory bloomfilter for now, maybe move to pyreBloom if we need something threadsafe? 21 | bloomfilter_filepath = tempfile.NamedTemporaryFile(delete=False).name 22 | logger.debug('Saving bloomfilter to %s', bloomfilter_filepath) 23 | # pybloomfilter.BloomFilter(capacity, error_rate, filename) 24 | self.bloomfilter = pybloomfilter.BloomFilter(10000000, 0.001, bloomfilter_filepath) 25 | self.seen = dict() 26 | 27 | def __call__(self, dict_): 28 | text = dict_['text'] 29 | 30 | # bloomfilter.add(...) returns True if item is already in the filter 31 | if self.bloomfilter.add(text): 32 | # we only start to store counts when we see an item more than once 33 | self.seen[text] = dict_['count'] = self.seen.get(text, 1) + 1 34 | else: 35 | dict_['count'] = 1 36 | 37 | return dict_ 38 | 39 | 40 | class FuzzyTextCounter(Mapper): 41 | INPUT = TweetDictProtocol 42 | OUTPUT = TweetDictProtocol 43 | 44 | def __init__(self, threshold=0.97): 45 | self.threshold = threshold 46 | logger.debug('Simhash counter initialized with threshold of %0.3f', threshold) 47 | 48 | # list of all processed simhash objects 49 | self.simhashes = [] 50 | # votes is a lookup from a simhash hex to the original's id 51 | self.votes = dict() 52 | 53 | def __call__(self, dict_): 54 | text = dict_['text'] 55 | self_simhash = simhash(text) 56 | 57 | fuzzy_count = 0 58 | sum_other_votes = 0 59 | for other_simhash in self.simhashes: 60 | if self_simhash.similarity(other_simhash) > self.threshold: 61 | # increment the votes of the others 62 | other_votes = self.votes[other_simhash.hash] = self.votes.get(other_simhash.hash, 1) + 1 63 | fuzzy_count += 1 64 | sum_other_votes += other_votes 65 | 66 | # should self.votes be elevated based on fuzzy_count? 67 | self.votes[self_simhash.hash] = self.votes.get(self_simhash.hash, 0) + 1 68 | 69 | # maybe normalize based on the number of total votes? 
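        # fuzzy_count is the number of previously seen tweets whose simhash is
        # within the similarity threshold of this one; fuzzy_votes sums their
        # (incremented) vote counts, so heavily repeated texts score higher.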
70 | dict_['fuzzy_count'] = fuzzy_count 71 | dict_['fuzzy_votes'] = sum_other_votes 72 | 73 | # store simhash in global state now that we've finished processing 74 | self.simhashes.append(self_simhash) 75 | return dict_ 76 | -------------------------------------------------------------------------------- /tweedr/api/pipeline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | 4 | 5 | class Pipeline(object): 6 | def __init__(self, *mappers): 7 | logger.info('%s -> [pipeline] -> %s', mappers[0].INPUT, mappers[-1].OUTPUT) 8 | # type-check the connections between the provided mappers 9 | total_errors = 0 10 | for from_pipe, to_pipe in zip(mappers, mappers[1:]): 11 | # Python lets you use `a <= b` to say `a is a subclass of b` 12 | # SuperClass >= Class is true 13 | # Class >= Class is true 14 | # Class >= SuperClass is false 15 | if from_pipe.OUTPUT < to_pipe.INPUT: 16 | logger.error('Pipeline cannot connect mappers: %s[%s] -> %s[%s]', 17 | from_pipe.__class__.__name__, from_pipe.OUTPUT.__name__, 18 | to_pipe.__class__.__name__, to_pipe.INPUT.__name__) 19 | total_errors += 1 20 | if total_errors > 0: 21 | raise TypeError('Pipeline types do not match.') 22 | self.mappers = mappers 23 | 24 | def __call__(self, payload): 25 | logger.notset('Pipeline processing payload: %s', payload) 26 | # TODO: maybe wrap with a try-except here? 27 | for mapper in self.mappers: 28 | payload = mapper(payload) 29 | if payload is None: 30 | break 31 | return payload 32 | -------------------------------------------------------------------------------- /tweedr/api/protocols.py: -------------------------------------------------------------------------------- 1 | class StringProtocol(object): 2 | pass 3 | 4 | 5 | class DictProtocol(object): 6 | pass 7 | 8 | 9 | class TweetDictProtocol(DictProtocol): 10 | '''This merely asserts that the following fields will exist and have reasonable values: 11 | 12 | text: String 13 | id: String 14 | author: String 15 | ''' 16 | -------------------------------------------------------------------------------- /tweedr/ark/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | def main(): 8 | '''Example usage: 9 | 10 | echo "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place." 
| python -m tweedr.ark.__init__ 11 | ''' 12 | if sys.stdin.isatty(): 13 | logger.error('You must pipe in a string') 14 | exit(1) 15 | 16 | from tweedr.ark.java import TwitterNLP 17 | tagger = TwitterNLP() 18 | 19 | for line in sys.stdin: 20 | print '[input]', line.strip() 21 | tag_line = tagger.predict(line) 22 | print '[output]', tag_line 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /tweedr/ark/java/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import Popen, PIPE 3 | 4 | import tweedr 5 | from tweedr.ml.classifier import ClassifierI 6 | from tweedr.lib.text import whitespace_unicode_translations 7 | 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | 11 | jar_path = os.path.join(tweedr.root, 'ext', 'ark-tweet-nlp-0.3.2.jar') 12 | 13 | 14 | class TwitterNLP(ClassifierI): 15 | def __init__(self, *args, **kw): 16 | self.proc = Popen(['java', '-cp', jar_path, 'cmu.arktweetnlp.RunTagger', 17 | '--input-format', 'text', '--output-format', 'pretsv'], 18 | stdin=PIPE, stdout=PIPE, stderr=PIPE) 19 | 20 | logger.info('cmu.arktweetnlp.RunTagger Java VM initialized with PID: %d', self.proc.pid) 21 | 22 | def fit(self, X, y): 23 | raise NotImplementedError('TwitterNLP is pre-trained; re-training is not supported.') 24 | 25 | def predict(self, X): 26 | # return only the labels (the POS tags) 27 | return self.parse_string(X)[1] 28 | 29 | # additional fields below are not required by ClassifierI, except that 30 | # they are called in predict 31 | def tokenize_and_tag(self, document): 32 | # only return the first two lines (tokens and labels) 33 | return self.parse_string(document)[:2] 34 | 35 | def parse_string(self, document): 36 | ''' 37 | Take a single string, remove any CR / LF / tab whitespace, and run it 38 | through TwitterNLP as an individual sequence of text. 39 | 40 | `document` String line of input 41 | 42 | Returns a tuple of strings, each of which is an equal-length (after 43 | `split`'ing) whitespace-separated sequence of tokens / POS tags / 44 | confidences. 
45 | ''' 46 | # sanitize the input and convert to bytestring 47 | if not isinstance(document, unicode): 48 | document = document.decode('utf8') 49 | string = document.translate(whitespace_unicode_translations).encode('utf8').strip() 50 | 51 | # write input with EOL marker (RunTagger won't return tags until it hits a newline) 52 | self.proc.stdin.write(string) 53 | self.proc.stdin.write('\n') 54 | 55 | # wait for output 56 | result = self.proc.stdout.readline() 57 | # no available stdout (the empty string) means there was an error 58 | if result == '': 59 | for stderr_line in self.proc.stderr: 60 | logger.error(stderr_line.rstrip()) 61 | raise IOError('cmu.arktweetnlp.RunTagger error') 62 | 63 | # output of cmu.arktweetnlp.RunTagger is TOKENSTAGSCONFIDENCESORIGINAL 64 | parts = result.split('\t') 65 | # cut off the original input, which is parts[3] 66 | return parts[0:3] 67 | -------------------------------------------------------------------------------- /tweedr/ark/java/singleton.py: -------------------------------------------------------------------------------- 1 | from tweedr.ark.java import TwitterNLP 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | logger.debug('The TwitterNLP POS tagger is being loaded as a module singleton') 7 | 8 | # simply by importing this module, the TwitterNLP tagger will be started up and 9 | # made available to other scripts. 10 | tagger = TwitterNLP() 11 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2009-2010, Brendan O'Connor, Michel Krieger, and David Ahn 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/README.md: -------------------------------------------------------------------------------- 1 | # TweetMotif 2 | 3 | The files `emoticons.py` and `twokenize.py` are originally from [TweetMotif](https://github.com/brendano/tweetmotif). 4 | 5 | Brendan O'Connor, Michel Krieger, and David Ahn. [_TweetMotif: Exploratory Search and Topic Summarization for Twitter_](http://anyall.org/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf). ICWSM-2010. 
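The tokenizer can be exercised directly from Python; a minimal check under Python 2, run from the repository root (the sample tweet is made up):

```python
from tweedr.ark.tweetmotif import twokenize

tokens = twokenize.tokenize(u"@Donnie I hear ya and I hate earthquakes in Cali too! :)")
print u' '.join(tokens)
# punctuation and the emoticon come back as separate tokens,
# roughly: @Donnie I hear ya and I hate earthquakes in Cali too ! :)
```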
6 | 7 | 8 | ## License 9 | 10 | Copyright © 2009-2010, Brendan O'Connor, Michel Krieger, and David Ahn. 11 | 12 | TweetMotif is licensed under the [Apache License 2.0](LICENSE). 13 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/ark/tweetmotif/__init__.py -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/emoticons.py: -------------------------------------------------------------------------------- 1 | """ emoticon recognition via patterns. tested on english-language twitter, but 2 | probably works for other social media dialects. """ 3 | 4 | __author__ = "Brendan O'Connor (anyall.org, brenocon@gmail.com)" 5 | __version__ = "april 2009" 6 | 7 | import re 8 | import sys 9 | 10 | mycompile = lambda pat: re.compile(pat, re.UNICODE) 11 | #SMILEY = mycompile(r'[:=].{0,1}[\)dpD]') 12 | #MULTITOK_SMILEY = mycompile(r' : [\)dp]') 13 | 14 | NormalEyes = r'[:=]' 15 | Wink = r'[;]' 16 | 17 | NoseArea = r'(|o|O|-)' # rather tight precision, \S might be reasonable... 18 | 19 | HappyMouths = r'[D\)\]]' 20 | SadMouths = r'[\(\[]' 21 | Tongue = r'[pP]' 22 | OtherMouths = r'[doO/\\]' # remove forward slash if http://'s aren't cleaned 23 | 24 | Happy_RE = mycompile('(\^_\^|' + NormalEyes + NoseArea + HappyMouths + ')') 25 | Sad_RE = mycompile(NormalEyes + NoseArea + SadMouths) 26 | 27 | Wink_RE = mycompile(Wink + NoseArea + HappyMouths) 28 | Tongue_RE = mycompile(NormalEyes + NoseArea + Tongue) 29 | Other_RE = mycompile('(' + NormalEyes + '|' + Wink + ')' + NoseArea + OtherMouths) 30 | 31 | Emoticon = ( 32 | "(" + NormalEyes + "|" + Wink + ")" + NoseArea + 33 | "(" + Tongue + "|" + OtherMouths + 34 | "|" + SadMouths + "|" + HappyMouths + ")" 35 | ) 36 | Emoticon_RE = mycompile(Emoticon) 37 | 38 | #Emoticon_RE = "|".join([Happy_RE,Sad_RE,Wink_RE,Tongue_RE,Other_RE]) 39 | #Emoticon_RE = mycompile(Emoticon_RE) 40 | 41 | 42 | def analyze_tweet(text): 43 | h = Happy_RE.search(text) 44 | s = Sad_RE.search(text) 45 | if h and s: 46 | return "BOTH_HS" 47 | if h: 48 | return "HAPPY" 49 | if s: 50 | return "SAD" 51 | return "NA" 52 | 53 | # more complex & harder, so disabled for now 54 | #w= Wink_RE.search(text) 55 | #t= Tongue_RE.search(text) 56 | #a= Other_RE.search(text) 57 | #h,w,s,t,a = [bool(x) for x in [h,w,s,t,a]] 58 | # if sum([h,w,s,t,a])>1: return "MULTIPLE" 59 | # if sum([h,w,s,t,a])==1: 60 | # if h: return "HAPPY" 61 | # if s: return "SAD" 62 | # if w: return "WINK" 63 | # if a: return "OTHER" 64 | # if t: return "TONGUE" 65 | # return "NA" 66 | 67 | if __name__ == '__main__': 68 | for line in sys.stdin: 69 | import sane_re 70 | sane_re._S(line[:-1]).show_match(Emoticon_RE, numbers=False) 71 | #print(analyze_tweet(line.strip()), line.strip(), sep="\t") 72 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/twokenize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ tokenizer for tweets! might be appropriate for other social media dialects too. 3 | general philosophy is to throw as little out as possible. 4 | development philosophy: every time you change a rule, do a diff of this 5 | program's output on ~100k tweets. 
if you iterate through many possible rules 6 | and only accept the ones that seeem to result in good diffs, it's a sort of 7 | statistical learning with in-the-loop human evaluation :) 8 | """ 9 | 10 | __author__ = "brendan o'connor (anyall.org)" 11 | 12 | import re 13 | import sys 14 | import emoticons 15 | mycompile = lambda pat: re.compile(pat, re.UNICODE) 16 | 17 | 18 | def regex_or(*items): 19 | r = '|'.join(items) 20 | r = '(' + r + ')' 21 | return r 22 | 23 | 24 | def pos_lookahead(r): 25 | return '(?=' + r + ')' 26 | 27 | 28 | def neg_lookahead(r): 29 | return '(?!' + r + ')' 30 | 31 | 32 | def optional(r): 33 | return '(%s)?' % r 34 | 35 | 36 | PunctChars = r'''['“".?!,:;]''' 37 | Punct = '%s+' % PunctChars 38 | Entity = '&(amp|lt|gt|quot);' 39 | 40 | # one-liner URL recognition: 41 | #Url = r'''https?://\S+''' 42 | 43 | # more complex version: 44 | UrlStart1 = regex_or('https?://', r'www\.') 45 | CommonTLDs = regex_or('com', 'co\\.uk', 'org', 'net', 'info', 'ca') 46 | UrlStart2 = r'[a-z0-9\.-]+?' + r'\.' + CommonTLDs + pos_lookahead(r'[/ \W\b]') 47 | # * not + for case of: "go to bla.com." -- don't want period 48 | UrlBody = r'[^ \t\r\n<>]*?' 49 | UrlExtraCrapBeforeEnd = '%s+?' % regex_or(PunctChars, Entity) 50 | UrlEnd = regex_or(r'\.\.+', r'[<>]', r'\s', '$') 51 | Url = (r'\b' + regex_or(UrlStart1, UrlStart2) + UrlBody + pos_lookahead(optional(UrlExtraCrapBeforeEnd) + UrlEnd)) 52 | 53 | Url_RE = re.compile("(%s)" % Url, re.U | re.I) 54 | 55 | Timelike = r'\d+:\d+' 56 | NumNum = r'\d+\.\d+' 57 | NumberWithCommas = r'(\d+,)+?\d{3}' + pos_lookahead(regex_or('[^,]', '$')) 58 | 59 | Abbrevs1 = ['am', 'pm', 'us', 'usa', 'ie', 'eg'] 60 | 61 | 62 | def regexify_abbrev(a): 63 | chars = list(a) 64 | icase = ["[%s%s]" % (c, c.upper()) for c in chars] 65 | dotted = [r'%s\.' 
% x for x in icase] 66 | return "".join(dotted) 67 | Abbrevs = [regexify_abbrev(a) for a in Abbrevs1] 68 | 69 | BoundaryNotDot = regex_or(r'\s', '[“"?!,:;]', Entity) 70 | aa1 = r'''([A-Za-z]\.){2,}''' + pos_lookahead(BoundaryNotDot) 71 | aa2 = r'''([A-Za-z]\.){1,}[A-Za-z]''' + pos_lookahead(BoundaryNotDot) 72 | ArbitraryAbbrev = regex_or(aa1, aa2) 73 | 74 | assert '-' != '―' 75 | Separators = regex_or('--+', '―') 76 | Decorations = r' [ ♫ ]+ '.replace(' ', '') 77 | 78 | EmbeddedApostrophe = r"\S+'\S+" 79 | 80 | ProtectThese = [ 81 | emoticons.Emoticon, 82 | Url, 83 | Entity, 84 | Timelike, 85 | NumNum, 86 | NumberWithCommas, 87 | Punct, 88 | ArbitraryAbbrev, 89 | Separators, 90 | Decorations, 91 | EmbeddedApostrophe, 92 | ] 93 | Protect_RE = mycompile(regex_or(*ProtectThese)) 94 | 95 | 96 | class Tokenization(list): 97 | " list of tokens, plus extra info " 98 | 99 | def __init__(self): 100 | self.alignments = [] 101 | self.text = "" 102 | 103 | def subset(self, tok_inds): 104 | new = Tokenization() 105 | new += [self[i] for i in tok_inds] 106 | new.alignments = [self.alignments[i] for i in tok_inds] 107 | new.text = self.text 108 | return new 109 | 110 | def assert_consistent(t): 111 | assert len(t) == len(t.alignments) 112 | assert [t.text[t.alignments[i]: (t.alignments[i] + len(t[i]))] 113 | for i in range(len(t))] == list(t) 114 | 115 | 116 | def align(toks, orig): 117 | s_i = 0 118 | alignments = [None] * len(toks) 119 | for tok_i in range(len(toks)): 120 | while True: 121 | L = len(toks[tok_i]) 122 | if orig[s_i:(s_i + L)] == toks[tok_i]: 123 | alignments[tok_i] = s_i 124 | s_i += L 125 | break 126 | s_i += 1 127 | if s_i >= len(orig): 128 | raise AlignmentFailed((orig, toks, alignments)) 129 | #if orig[s_i] != ' ': raise AlignmentFailed("nonspace advance: %s" % ((s_i,orig),)) 130 | if any(a is None for a in alignments): 131 | raise AlignmentFailed((orig, toks, alignments)) 132 | 133 | return alignments 134 | 135 | 136 | class AlignmentFailed(Exception): 137 | pass 138 | 139 | 140 | def unicodify(s, encoding='utf8', *args): 141 | if isinstance(s, unicode): 142 | return s 143 | if isinstance(s, str): 144 | return s.decode(encoding, *args) 145 | return unicode(s) 146 | 147 | 148 | def tokenize(tweet): 149 | text = unicodify(tweet) 150 | text = squeeze_whitespace(text) 151 | t = Tokenization() 152 | t += simple_tokenize(text) 153 | t.text = text 154 | t.alignments = align(t, text) 155 | return t 156 | 157 | 158 | def simple_tokenize(text): 159 | s = text 160 | s = edge_punct_munge(s) 161 | 162 | # strict alternating ordering through the string. first and last are goods. 
163 | # good bad good bad good bad good 164 | goods = [] 165 | bads = [] 166 | i = 0 167 | if Protect_RE.search(s): 168 | for m in Protect_RE.finditer(s): 169 | goods.append((i, m.start())) 170 | bads.append(m.span()) 171 | i = m.end() 172 | goods.append((m.end(), len(s))) 173 | else: 174 | goods = [(0, len(s))] 175 | assert len(bads) + 1 == len(goods) 176 | 177 | goods = [s[i:j] for i, j in goods] 178 | bads = [s[i:j] for i, j in bads] 179 | # print goods 180 | # print bads 181 | goods = [unprotected_tokenize(x) for x in goods] 182 | res = [] 183 | for i in range(len(bads)): 184 | res += goods[i] 185 | res.append(bads[i]) 186 | res += goods[-1] 187 | 188 | res = post_process(res) 189 | return res 190 | 191 | AposS = mycompile(r"(\S+)('s)$") 192 | 193 | 194 | def post_process(pre_toks): 195 | # hacky: further splitting of certain tokens 196 | post_toks = [] 197 | for tok in pre_toks: 198 | m = AposS.search(tok) 199 | if m: 200 | post_toks += m.groups() 201 | else: 202 | post_toks.append(tok) 203 | return post_toks 204 | 205 | WS_RE = mycompile(r'\s+') 206 | 207 | 208 | def squeeze_whitespace(s): 209 | new_string = WS_RE.sub(" ", s) 210 | return new_string.strip() 211 | 212 | # fun: copy and paste outta http://en.wikipedia.org/wiki/Smart_quotes 213 | EdgePunct = r"""[ ' " “ ” ‘ ’ < > « » { } ( \) [ \] ]""".replace(' ', '') 214 | # NotEdgePunct = r"""[^'"([\)\]]""" # alignment failures? 215 | NotEdgePunct = r"""[a-zA-Z0-9]""" 216 | EdgePunctLeft = r"""(\s|^)(%s+)(%s)""" % (EdgePunct, NotEdgePunct) 217 | EdgePunctRight = r"""(%s)(%s+)(\s|$)""" % (NotEdgePunct, EdgePunct) 218 | EdgePunctLeft_RE = mycompile(EdgePunctLeft) 219 | EdgePunctRight_RE = mycompile(EdgePunctRight) 220 | 221 | 222 | def edge_punct_munge(s): 223 | s = EdgePunctLeft_RE.sub(r"\1\2 \3", s) 224 | s = EdgePunctRight_RE.sub(r"\1 \2\3", s) 225 | return s 226 | 227 | 228 | def unprotected_tokenize(s): 229 | return s.split() 230 | 231 | if __name__ == '__main__': 232 | for line in sys.stdin: 233 | print u" ".join(tokenize(line[:-1])).encode('utf-8') 234 | # print "CUR\t" + " ".join(tokenize(line[:-1])) 235 | # print "WS\t" + " ".join(line[:-1].split()) 236 | # print ansi.color(line.strip(),'red') 237 | # print ansi.color(" ".join(tokenize(line.strip())),'blue','bold') 238 | -------------------------------------------------------------------------------- /tweedr/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/cli/__init__.py -------------------------------------------------------------------------------- /tweedr/cli/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import mako.template 5 | 6 | 7 | def reflect(**kw): 8 | from tweedr.models import metadata 9 | schema_filepath = os.path.join(os.path.dirname(metadata.__file__), 'schema.py') 10 | schema_template_filepath = os.path.join(os.path.dirname(metadata.__file__), 'schema.template') 11 | 12 | template = mako.template.Template(filename=schema_template_filepath) 13 | metadata.metadata.reflect() 14 | schema = template.render(metadata=metadata.metadata) 15 | 16 | if kw.get('in_place'): 17 | with open(schema_filepath, 'w') as out: 18 | out.write(schema) 19 | else: 20 | sys.stdout.write(schema) 21 | 22 | print >> sys.stderr, '\nDone printing schema' 23 | 24 | 25 | def create(**kw): 26 | from tweedr.models.schema import metadata 27 | 
metadata.create_all() 28 | 29 | 30 | commands = dict(reflect=reflect, create=create) 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser(description='Tweedr database tools') 35 | parser.add_argument('command', choices=commands, help='Command to run') 36 | parser.add_argument('--in-place', action='store_true', help='Whether or not to update the schema.py file in place') 37 | 38 | opts = parser.parse_args() 39 | commands[opts.command](**vars(opts)) 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /tweedr/cli/pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from tweedr.api import pipeline 4 | from tweedr.api.mappers import basic, similar, nlp, ml 5 | from tweedr.corpora.qcri import a126730_datasource, a121571_datasource, a126728_datasource, a122047_datasource 6 | 7 | from sklearn import linear_model, naive_bayes, neighbors, svm 8 | 9 | import logging 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser(description='Run tweets from STDIN through the tweedr pipeline, output to STDOUT.') 15 | parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output') 16 | opts = parser.parse_args() 17 | 18 | # bump threshold down to show info=20, debug=10, and silly=5 if --verbose is set 19 | if opts.verbose: 20 | logger.setLevel('SILLY') 21 | 22 | if sys.stdin.isatty(): 23 | raise IOError('You must provide input via STDIN') 24 | 25 | cli_pipeline = pipeline.Pipeline( 26 | basic.EmptyLineFilter(), 27 | basic.JSONParser(), 28 | basic.IgnoreMetadata(), 29 | basic.TweetStandardizer(), 30 | similar.TextCounter(), 31 | similar.FuzzyTextCounter(), 32 | nlp.POSTagger(), 33 | nlp.SequenceTagger(), 34 | nlp.DBpediaSpotter(), 35 | 36 | ml.CorpusClassifier(a126730_datasource(), naive_bayes.MultinomialNB()), 37 | ml.CorpusClassifier(a121571_datasource(), svm.SVC(gamma=2, C=1)), 38 | ml.CorpusClassifier(a126728_datasource(), neighbors.KNeighborsClassifier(3)), 39 | ml.CorpusClassifier(a122047_datasource(), linear_model.LogisticRegression()), 40 | 41 | basic.LineStream(sys.stdout), 42 | ) 43 | 44 | logger.debug('Pipeline created') 45 | 46 | try: 47 | for i, line in enumerate(sys.stdin): 48 | cli_pipeline(line) 49 | except KeyboardInterrupt: 50 | logger.critical('SIGINT received; Exiting.') 51 | 52 | logger.info('Processed %d lines', i) 53 | logger.debug('Pipeline exited') 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /tweedr/cli/ui.py: -------------------------------------------------------------------------------- 1 | from bottle import run 2 | from tweedr.ui import middleware, crf 3 | 4 | 5 | def main(): 6 | '''This is called by the package's console_scripts entry point "tweedr-ui" 7 | 8 | The reloader is slow and only handles python module changes. 
9 | I recommend using 3rd party restarter, say, node_restarter: 10 | node_restarter **/*.py **/*.css **/*.mako 'python tweedr/cli/ui.py' 11 | ''' 12 | app = middleware.add_duration_header(crf.app) 13 | run(app) 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /tweedr/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | class DatasourceI(object): 2 | '''As usual, a reference, as opposed to an interface you actually have to implement''' 3 | 4 | def __init__(self): 5 | pass 6 | 7 | def __iter__(self): 8 | '''This should yield tuples of label-document (basestring, basestring) pairs.''' 9 | raise NotImplementedError(__doc__) 10 | -------------------------------------------------------------------------------- /tweedr/corpora/qcri.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from tweedr.corpora import DatasourceI 4 | from tweedr.lib import globfirst 5 | 6 | corpora_root = os.path.expanduser(os.environ.get('CORPORA', '~/corpora')) 7 | 8 | 9 | class CSVDatasouce(DatasourceI): 10 | filepath = None 11 | label_column = 'category' 12 | text_column = 'text' 13 | 14 | def __iter__(self): 15 | with open(self.filepath) as fp: 16 | for row in csv.DictReader(fp): 17 | yield row[self.label_column], row[self.text_column] 18 | 19 | 20 | class a131709_datasource(CSVDatasouce): 21 | # from joplin/ 22 | ''' 23 | Counts: 24 | 25 | 94 Other 26 | 265 Informative (Direct) 27 | 469 Informative (Indirect) 28 | 762 Informative (Direct or Indirect) 29 | 794 Personal only 30 | ''' 31 | filepath = globfirst('**/a131709.csv', root=corpora_root) 32 | label_column = 'choose_one' 33 | text_column = 'tweet' 34 | 35 | 36 | class a121571_datasource(CSVDatasouce): 37 | # from joplin/ 38 | ''' 39 | Counts: 40 | 41 | 46 People missing, found or seen 42 | 130 Unknown 43 | 137 Casualties and damage 44 | 204 Donations of money, goods or services 45 | 280 Information source 46 | 436 Caution and advice 47 | ''' 48 | # TODO: come up with a better name 49 | filepath = globfirst('**/a121571.csv', root=corpora_root) 50 | label_column = 'choose_one' 51 | text_column = 'text' 52 | 53 | 54 | class a122047_datasource(CSVDatasouce): 55 | # from joplin/ 56 | ''' 57 | Counts: 58 | 59 | 3 A shelter is open or available 60 | 27 A siren has been heard 61 | 99 A tornado sighting/touchdown has been reported 62 | 102 Other 63 | 207 A tornado/thunderstorm warning has been issued or has been lifted 64 | ''' 65 | filepath = globfirst('**/a122047.csv', root=corpora_root) 66 | label_column = 'type_of_advice_or_caution' 67 | text_column = 'text' 68 | 69 | 70 | class a126730_datasource(CSVDatasouce): 71 | # from joplin/ 72 | ''' 73 | Counts: 74 | 75 | 1 Both people and infrastructure 76 | 1 People: injured 77 | 2 People: injured and dead 78 | 12 Not damage-related 79 | 17 Infrastructure (building, bridge, road, etc.) damaged 80 | 47 Not specified (maybe people or infrastructure) 81 | 58 People: dead 82 | ''' 83 | filepath = globfirst('**/a126730.csv', root=corpora_root) 84 | label_column = 'people_or_infrastructure' 85 | text_column = 'text' 86 | 87 | 88 | class a126728_datasource(CSVDatasouce): 89 | # from joplin/ 90 | ''' 91 | Counts: 92 | 93 | 2 Discount (rebate/special offer) 94 | 3 Blood 95 | 3 Equipment (machine/generator/pump/etc.) 
96 | 6 Food 97 | 7 Shelter 98 | 11 Volunteers/work 99 | 53 Money 100 | 119 Other, or not specified 101 | ''' 102 | filepath = globfirst('**/a126728.csv', root=corpora_root) 103 | label_column = 'type_of_donation' 104 | text_column = 'text' 105 | 106 | 107 | class a122582_datasource(CSVDatasouce): 108 | # from joplin/ 109 | ''' 110 | Counts: 111 | 112 | 4 Tune to this radio station (or: I am listening to this station) 113 | 10 Watch this TV channel (or: I am watching this channel) 114 | 33 None of the above 115 | 35 Look at this photo or these photos 116 | 58 Look at this video or these videos 117 | 139 Look at this web site/page 118 | ''' 119 | filepath = globfirst('**/a122582.csv', root=corpora_root) 120 | label_column = 'type_of_message' 121 | text_column = 'text' 122 | 123 | 124 | class a143145_datasource(CSVDatasouce): 125 | # from sandy/ 126 | ''' 127 | Counts: 128 | 129 | 78 Informative (Direct) 130 | 79 Informative (Direct or Indirect) 131 | 161 Other 132 | 296 Personal Only 133 | 386 Informative (Indirect) 134 | ''' 135 | filepath = globfirst('**/a143145.csv', root=corpora_root) 136 | label_column = 'choose_one' 137 | text_column = 'tweet' 138 | 139 | 140 | class a144267_datasource(CSVDatasouce): 141 | # from sandy/ 142 | ''' 143 | Counts: 144 | 145 | 32 Donations of money, goods or services 146 | 72 Information Source 147 | 125 Unknown 148 | 144 Caution and advice 149 | 170 Casualties and damage 150 | ''' 151 | filepath = globfirst('**/a144267.csv', root=corpora_root) 152 | label_column = 'choose_one' 153 | text_column = 'tweet' 154 | 155 | 156 | class a146283_datasource(CSVDatasouce): 157 | # from sandy/ 158 | ''' 159 | Counts: 160 | 161 | 6 A shelter is open or available 162 | 20 A hurricane warning has been issued or has been lifted 163 | 23 A hurricane sighting has been reported 164 | 77 Other 165 | ''' 166 | filepath = globfirst('**/a146283.csv', root=corpora_root) 167 | label_column = 'type_of_advice_or_caution' 168 | text_column = 'tweet' 169 | 170 | 171 | class a146281_datasource(CSVDatasouce): 172 | # from sandy/ 173 | ''' 174 | Counts: 175 | 176 | 1 People: injured 177 | 3 People: injured and dead 178 | 12 Not specified (maybe people or infrastructure) 179 | 13 Both people and infrastructure 180 | 16 Not damage-related 181 | 34 People: dead 182 | 91 Infrastructure (building, bridge, road, etc.) damage 183 | ''' 184 | filepath = globfirst('**/a146281.csv', root=corpora_root) 185 | label_column = 'people_or_infrastructure' 186 | text_column = 'tweet' 187 | 188 | 189 | if __name__ == '__main__': 190 | '''You may need to get these files from S3, something like: 191 | 192 | mkdir -p ~/corpora/tweedr 193 | cd ~/corpora/tweedr 194 | s3cmd sync s3://qcri/joplin/labeled/ . 195 | s3cmd sync s3://qcri/sandy/labeled/ . 
196 | ''' 197 | for label, text in a121571_datasource(): 198 | print label, '\t', text 199 | -------------------------------------------------------------------------------- /tweedr/corpora/qcri_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | from sets import Set 5 | 6 | from tweedr.models import DBSession, DamageClassification 7 | 8 | 9 | class DamageClassifiedCorpus(object): 10 | 11 | def __init__(self): 12 | labeled_tweets = \ 13 | np.array(DBSession.query(DamageClassification).filter(DamageClassification.mturk_code 14 | == 'QCRI').limit(1000).all()) 15 | labeled_tweets = map(lambda x: (x.text, int(x.label)), labeled_tweets) 16 | self.dataset = labeled_tweets 17 | 18 | def __iter__(self): 19 | return iter(self.dataset) 20 | 21 | 22 | 23 | test_set = DamageClassifiedCorpus() 24 | -------------------------------------------------------------------------------- /tweedr/emr/README.md: -------------------------------------------------------------------------------- 1 | ## Examples for the wordcounter 2 | 3 | Run the counter locally (using some data that's on the qcri machine): 4 | 5 | python gnip_wc.py /home/chbrown/data/gnip/christchurch/2011-02-28*.json.gz 6 | 7 | That particular example glob is about 15MB compressed, 107MB uncompressed, 59704 lines (= tweets). 8 | 9 | It runs in about 2m 30s locally, on the qcri AWS machine. 10 | 11 | Or run on EMR, using the same glob, but from S3. 12 | 13 | python gnip_wc.py -r emr s3://qcri/gnip/christchurch/2011-02-28*.json.gz --output-dir "s3://qcri/tmp-`date +%s`" 14 | 15 | This took about 21m, without specifying any numbers. From the docs: 16 | 17 | > By default, **mrjob** runs a single `m1.small`, which is a cheap but not very powerful instance type. 18 | 19 | Trying a few more instances at once: 20 | 21 | !! --num-ec2-instances 2 22 | 23 | Hmm. Still took 21 minutes. 24 | 25 | !! --num-ec2-instances 4 26 | 27 | Better! Took 13m. I think most of this is overhead in starting the cluster. 28 | 29 | !! --num-ec2-instances 8 30 | 31 | Overkill, apparently. Took 13m again. 32 | 33 | Could also try some different types: 34 | 35 | --ec2_instance_type c1.medium 36 | 37 | (Umm... later.) 38 | 39 | How about all of christchurch? 40 | 41 | python gnip_wc.py -r emr --num-ec2-instances 8 s3://qcri/gnip/christchurch/*.json.gz 42 | 43 | That's 757,382 tweets, 270MB compressed = 1.7GB uncompressed, simple word count took 1h 12m (apparently it only took 53m, as billed by AWS, so $(.12+.03) * 8 = $1.20), produced 12 output files for a total of ~12MB (uncompressed). 44 | 45 | ## Running a full geolocation task: 46 | 47 | cd ~/src/qcri/emr 48 | output_dir="s3://qcri/tmp-`date +%s`" 49 | echo Using $output_dir as our output directory 50 | time python gnip_geo.py --containers gnip_containers.geojson \ 51 | s3://qcri/gnip/*/*.json.gz --output-dir $output_dir -r emr --num-ec2-instances 5 52 | 53 | ## Examples 54 | 55 | Some of the fields from a geolocated tweet might look like this. Note that the coordinates are `[lat,lng]`. 56 | 57 | { 58 | ... 59 | "gnip": { 60 | "matching_rules": [ 61 | { 62 | "value": "has:geo", 63 | "tag": "westtx:geo" 64 | } 65 | ], 66 | ... 67 | }, 68 | ... 69 | "geo": { 70 | "type": "Point", 71 | "coordinates": [ 72 | 31.4119232, 73 | -86.1200234 74 | ] 75 | }, 76 | ... 
77 | } 78 | 79 | And here's some GeoJSON, for reference (because Polygons are weird, allowing inner rings): 80 | 81 | { "type": "Polygon", 82 | "coordinates": [ 83 | [ 84 | [100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0] 85 | ] 86 | ] 87 | } 88 | 89 | But Points are easier: 90 | 91 | { "type": "Point", "coordinates": [100.0, 0.0] } 92 | -------------------------------------------------------------------------------- /tweedr/emr/__init__.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | from mrjob.protocol import _ClassBasedKeyCachingProtocol 3 | 4 | 5 | class UltraJSONProtocol(_ClassBasedKeyCachingProtocol): 6 | @classmethod 7 | def load_from_string(cls, value): 8 | return ujson.loads(value) 9 | 10 | @classmethod 11 | def dump_to_string(cls, value): 12 | return ujson.dumps(value) 13 | 14 | 15 | class UltraJSONValueProtocol(object): 16 | @classmethod 17 | def read(cls, line): 18 | return (None, ujson.loads(line)) 19 | 20 | @classmethod 21 | def write(cls, key, value): 22 | return ujson.dumps(value) 23 | -------------------------------------------------------------------------------- /tweedr/emr/gnip_geo.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | from mrjob.protocol import JSONValueProtocol 3 | import json 4 | 5 | 6 | def bbox_contains(bbox, longitude, latitude): 7 | sw_lon, sw_lat, ne_lon, ne_lat = bbox 8 | return (sw_lon <= longitude <= ne_lon) and (sw_lat <= latitude <= ne_lat) 9 | 10 | 11 | class GeoExtract(MRJob): 12 | INPUT_PROTOCOL = JSONValueProtocol 13 | 14 | def configure_options(self): 15 | super(GeoExtract, self).configure_options() 16 | # add_file_option: http://mrjob.readthedocs.org/en/latest/guides/writing-mrjobs.html 17 | self.add_file_option('--containers', help='.geojson feature collection to filter for') 18 | 19 | def mapper_init(self): 20 | with open(self.options.containers) as fp: 21 | self.feature_collection = json.load(fp) 22 | 23 | def mapper(self, _, line): 24 | # Ignore metadata / reports 25 | if 'info' in line and line['info']['message'] == 'Replay Request Completed': 26 | return 27 | 28 | # if any(rule['value'] == 'has:geo' line['gnip']['matching_rules']): 29 | if 'geo' in line and line['geo'].get('type') == 'Point': 30 | latitude, longitude = line['geo']['coordinates'] 31 | for feature in self.feature_collection['features']: 32 | if bbox_contains(feature['bbox'], longitude, latitude): 33 | yield feature['properties']['name'], line 34 | 35 | 36 | if __name__ == '__main__': 37 | # Maybe run the whole thing in a try-catch-finally with counters for error logging? 38 | # might make it easier to debug than pulling down the whole bucket of attempts and 39 | # browsing through the stderr files to find the tracebacks 40 | # http://pythonhosted.org/mrjob/guides/writing-mrjobs.html#counters 41 | GeoExtract.run() 42 | -------------------------------------------------------------------------------- /tweedr/emr/gnip_wc.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | from mrjob.protocol import JSONValueProtocol 3 | import json 4 | 5 | 6 | class WordCount(MRJob): 7 | ''' 8 | The default MRJob.INPUT_PROTOCOL is `RawValueProtocol`, but we are reading tweets, 9 | so we'll add a parser before we even get to the mapper. 
10 | ''' 11 | # incoming line needs to be parsed (I think), so we set a protocol to do so 12 | INPUT_PROTOCOL = JSONValueProtocol 13 | 14 | def mapper(self, key, line): 15 | '''The key to the first mapper in the step-pipeline is always None.''' 16 | 17 | # GNIP-style streams sometimes have metadata lines, but we can just ignore them 18 | if 'info' in line and line['info']['message'] == 'Replay Request Completed': 19 | return 20 | 21 | # GNIP-style tweets have the tweet text in {'body': '...'} instead of the standard {'text': '...'} 22 | if 'body' not in line: 23 | raise Exception('Missing body field in tweet:\n ' + json.dumps(line)) 24 | 25 | text = line['body'] 26 | yield '~~~TOTAL~~~', 1 27 | for token in text.split(): 28 | yield token.lower(), 1 29 | 30 | def combiner(self, key, value_iter): 31 | yield key, sum(value_iter) 32 | 33 | def reducer(self, key, value_iter): 34 | yield key, sum(value_iter) 35 | 36 | 37 | if __name__ == '__main__': 38 | WordCount.run() 39 | -------------------------------------------------------------------------------- /tweedr/lib/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import fnmatch 4 | import random 5 | import subprocess 6 | from copy import copy 7 | 8 | 9 | def mapList(iterable): 10 | return map(list, iterable) 11 | 12 | 13 | def stderr(s): 14 | sys.stderr.write(s) 15 | sys.stderr.flush() 16 | 17 | 18 | def stderrn(s=''): 19 | stderr(str(s) + os.linesep) 20 | 21 | 22 | def stdout(s): 23 | sys.stdout.write(s) 24 | sys.stdout.flush() 25 | 26 | 27 | def stdoutn(s=''): 28 | stdout(str(s) + os.linesep) 29 | 30 | 31 | def tty_size(): 32 | height, width = subprocess.check_output(['stty', 'size']).split() 33 | return (int(height), int(width)) 34 | 35 | 36 | def uniq(xs): 37 | # order preserving. 
From http://www.peterbe.com/plog/uniqifiers-benchmark 38 | seen = {} 39 | checked = [] 40 | for x in xs: 41 | if x in seen: 42 | continue 43 | seen[x] = 1 44 | checked.append(x) 45 | return checked 46 | 47 | 48 | def iglob(pattern, root='.'): 49 | for dirpath, dirnames, filenames in os.walk(root): 50 | filepaths = [os.path.join(dirpath, filename) for filename in filenames] 51 | for filepath in fnmatch.filter(filepaths, pattern): 52 | yield filepath 53 | 54 | 55 | def globfirst(pattern, root='.'): 56 | try: 57 | return iglob(pattern, root).next() 58 | except StopIteration: 59 | return None 60 | 61 | 62 | def walk(top, *predicates): 63 | # predicate(filepath) must be True for each predicate, for each filepath to be returned 64 | for dirpath, dirnames, filenames in os.walk(top): 65 | filepaths = [os.path.join(dirpath, filename) for filename in filenames] 66 | for filepath in filepaths: 67 | if all(predicate(filepath) for predicate in predicates): 68 | yield filepath 69 | 70 | 71 | def bifurcate(xs, ratio, shuffle=False): 72 | ''' 73 | Takes a list like [b, c, a, m, n] and ratio like 0.6 and returns two lists: [b, c, a], [m, n] 74 | 75 | E.g., 76 | 77 | test, train = bifurcate(tokenized_labels, test_proportion, shuffle=True) 78 | ''' 79 | length = len(xs) 80 | pivot = int(ratio * length) 81 | if shuffle: 82 | xs = copy(xs) 83 | random.shuffle(xs) 84 | 85 | return (xs[:pivot], xs[pivot:]) 86 | 87 | 88 | class Counts(object): 89 | def __init__(self): 90 | object.__setattr__(self, '_store', {}) 91 | 92 | def __getattr__(self, name): 93 | return self._store.get(name, 0) 94 | 95 | def __setattr__(self, name, value): 96 | self._store[name] = value 97 | 98 | def empty_copy(self): 99 | other = Counts() 100 | other._store = dict((name, 0) for name in self._store) 101 | return other 102 | 103 | def add(self, other): 104 | for name, value in other._store.items(): 105 | self._store[name] = self._store.get(name, 0) + value 106 | 107 | def __repr__(self): 108 | return '' % ' '.join('%s=%d' % (name, value) for name, value in self._store.items()) 109 | -------------------------------------------------------------------------------- /tweedr/lib/readers.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from StringIO import StringIO 3 | 4 | 5 | def infer(s): 6 | if s.isdigit(): 7 | return int(s) 8 | elif s.isalpha(): 9 | return s 10 | return float(s) 11 | 12 | 13 | def read_simple_csv(path): 14 | rows = [] 15 | with open(path) as csv_fp: 16 | for line in csv_fp: 17 | rows.append([infer(cell) for cell in line.strip().split(',')]) 18 | return rows 19 | 20 | 21 | def read_until(readable, marks): 22 | '''Could stall if mark never happens before the EOF''' 23 | stdout_buffer = StringIO() 24 | while True: 25 | # read in bytes one-by-one because we have to break as soon as we hit 26 | # any `mark` character 27 | byte = readable.read(1) 28 | if byte in marks: 29 | output = stdout_buffer.getvalue() 30 | stdout_buffer.close() 31 | return output 32 | stdout_buffer.write(byte) 33 | 34 | 35 | class SniffingDictReader(csv.DictReader, object): 36 | # csv.DictReader(csvfile, fieldnames=None, restkey=None, restval=None, dialect='excel', *args, **kwds) 37 | def __init__(self, csvfile, restkey=None, restval=None): 38 | sniffer = csv.Sniffer() 39 | # sniff the first line 40 | sample = csvfile.readline() 41 | dialect = sniffer.sniff(sample) 42 | # rewind 43 | csvfile.seek(0) 44 | 45 | super(SniffingDictReader, self).__init__(csvfile, restkey=restkey, restval=restval, 
dialect=dialect) 46 | -------------------------------------------------------------------------------- /tweedr/lib/text.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import re 3 | 4 | token_re = re.compile('\S+') 5 | 6 | punctuation_deletions = [u"'"] 7 | punctuation_elisions = [u'-', u',', u'.', u',', u';', u':', u'|', u'&'] 8 | 9 | punctuation_translations = dict( 10 | [(ord(char), None) for char in punctuation_deletions] + 11 | [(ord(char), u' ') for char in punctuation_elisions]) 12 | 13 | whitespace_unicode_translations = {ord('\t'): u' ', ord('\n'): u' ', ord('\r'): u''} 14 | 15 | 16 | def UpperCamelCase(name): 17 | return re.sub('(^|-|_)(.)', lambda g: g.group(2).upper(), name) 18 | 19 | 20 | def underscore(name): 21 | return re.sub('([A-Z]+)', r'_\1', name).strip('_').lower() 22 | 23 | 24 | def singular(name): 25 | return re.sub('s$', '', name) 26 | 27 | 28 | def utf8str(s): 29 | if isinstance(s, unicode): 30 | return s.encode('utf8') 31 | return s 32 | 33 | 34 | def zip_boundaries(xs, space_len=1): 35 | '''Take a list of strings and iterate through them along with boundary indices. 36 | 37 | >>> tokens = 'Into the void .'.split() 38 | >>> list(zip_boundaries(tokens)) 39 | [('Into', 0, 4), ('the', 5, 8), ('void', 9, 13), ('.', 14, 15)] 40 | ''' 41 | start = 0 42 | for x in xs: 43 | x_len = len(x) 44 | yield x, start, start + x_len 45 | start += x_len + space_len 46 | 47 | 48 | def gloss(alignments, prefixes=None, postfixes=None, width=None, toksep=' ', linesep='\n', groupsep='\n'): 49 | ''' 50 | Creates an interlinear gloss. 51 | 52 | Take a list of [('a', 'DET'), ('beluga', 'N')] and return a string covering multiples lines, like: 53 | a beluga 54 | DET N 55 | each item in `alignments` should have the same length, N 56 | `prefixes`, if provided, should be N-long 57 | `postfixes`, if provided, should be N-long 58 | ''' 59 | if width is None: 60 | width = int(subprocess.check_output(['tput', 'cols'])) 61 | toksep_len = len(toksep) 62 | 63 | # a "group" is a N-line string, each line of which is at most `width` characters 64 | # `groups` is a list of such groups 65 | groups = [] 66 | 67 | def flush_buffer(line_buffer): 68 | if len(line_buffer) > 0: 69 | lines = [toksep.join(tokens) for tokens in line_buffer] 70 | if prefixes: 71 | lines = [prefix + line for prefix, line in zip(prefixes, lines)] 72 | if postfixes: 73 | lines = [line + postfix for postfix, line in zip(postfixes, lines)] 74 | groups.append(linesep.join(lines)) 75 | return [[] for _ in alignments[0]] 76 | 77 | # the line_buffer is an N-long list of lists of tokens (strings) 78 | # [[e1, e2, e3], [f1, f2, f3], [g1, g2, g3]] 79 | line_buffer = flush_buffer([]) 80 | # the line_buffer_width is just the cumulative width of the current line_buffer 81 | line_buffer_width = 0 82 | 83 | for aligned in alignments: 84 | aligned = map(str, aligned) 85 | length = max(map(len, aligned)) 86 | line_buffer_width += toksep_len + length 87 | if line_buffer_width >= width: 88 | line_buffer = flush_buffer(line_buffer) 89 | line_buffer_width = length 90 | for i, token in enumerate(aligned): 91 | line_buffer[i].append(token.ljust(length)) 92 | 93 | flush_buffer(line_buffer) 94 | 95 | return groupsep.join(groups) 96 | -------------------------------------------------------------------------------- /tweedr/lib/timeout.py: -------------------------------------------------------------------------------- 1 | '''This module is mostly from github.com/chbrown/remoting 2 | See that 
repository's readme and /remoting/timeout.py 3 | 4 | This is basically how timeouts in Python work: 5 | 6 | Use `signal.signal` to queue up a function to run after a specified amount of 7 | time. This function's sole purpose is to raise an exception. 8 | 9 | You run your target method, the `func` arg to this decorate() method. 10 | Two things can happen from here: 11 | a. Your function finishes before the timeout period. In that case, immediately tell `signal.signal` "just kidding, dont run that function after all." We cancel the scheduled signal from step 1, and put the old handler back in place. 12 | b. Your function does not finish in time, TimeoutError is raised, and you have to catch it somewhere upstream. 13 | 14 | ''' 15 | import signal 16 | 17 | 18 | class TimeoutError(Exception): 19 | def __call__(self, signum, frame): 20 | self.args 21 | raise self 22 | 23 | def __repr__(self): 24 | return '%s(%s)' % (self.__class__.__name__, self.message) 25 | 26 | 27 | def timeout_after(seconds): 28 | '''Closures in python are so beautiful.''' 29 | def decorate(func): 30 | def wrapper(*args, **kw): 31 | new_ALRM = TimeoutError('Timed out after %d seconds.' % seconds) 32 | old_ALRM = signal.signal(signal.SIGALRM, new_ALRM) 33 | signal.alarm(seconds) 34 | try: 35 | result = func(*args, **kw) 36 | # we don't handle the error here 37 | finally: 38 | # but we do put the old handler back in place 39 | signal.signal(signal.SIGALRM, old_ALRM) 40 | signal.alarm(0) 41 | return result 42 | wrapper.func_name = func.__name__ 43 | return wrapper 44 | return decorate 45 | 46 | 47 | def example(): 48 | '''Usage example. Should be doctests?''' 49 | import os 50 | 51 | @timeout_after(5) 52 | def waiter_task(seconds): 53 | os.system('sleep %d' % seconds) 54 | return 'Waited %ds successfully' % seconds 55 | 56 | print waiter_task(2) # --> prints 'Waited 2s successfully' 57 | print waiter_task(7) # --> throws 58 | 59 | 60 | if __name__ == '__main__': 61 | print example() 62 | -------------------------------------------------------------------------------- /tweedr/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | from tweedr.lib import Counts 3 | 4 | 5 | def print_metrics_summary(gold_labels, predicted_labels, sample=0): 6 | print ''' Accuracy: {accuracy} 7 | P/R: {precision:.4f}/{recall:.4f} 8 | F1: {fscore:.4f}'''.format( 9 | accuracy=metrics.accuracy_score(gold_labels, predicted_labels), 10 | precision=metrics.precision_score(gold_labels, predicted_labels), 11 | recall=metrics.recall_score(gold_labels, predicted_labels), 12 | fscore=metrics.f1_score(gold_labels, predicted_labels) 13 | ) 14 | 15 | if sample > 0: 16 | print 'Sample of classifications ' 17 | for _, gold, predicted in zip(xrange(sample), gold_labels, predicted_labels): 18 | print ' gold: {gold}, predicted: {predicted}'.format(gold=gold, predicted=predicted) 19 | 20 | 21 | def compare_labels(gold_labels, predicted_labels, null_label): 22 | # produces a Counts object with values: 23 | # .true_positives 24 | # .false_negatives 25 | # .true_negatives 26 | # .false_positives 27 | # .comparisons = SUM of the others 28 | counts = Counts() 29 | for gold_label, predicted_label in zip(gold_labels, predicted_labels): 30 | counts.comparisons += 1 31 | if gold_label != null_label: 32 | if predicted_label == gold_label: 33 | counts.true_positives += 1 34 | else: 35 | counts.false_negatives += 1 36 | 37 | if gold_label == null_label: 38 | if predicted_label == gold_label: 39 | 
counts.true_negatives += 1 40 | else: 41 | counts.false_positives += 1 42 | 43 | return counts 44 | -------------------------------------------------------------------------------- /tweedr/ml/build_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | # import os 2 | import argparse 3 | 4 | from sklearn import cross_validation # , metrics 5 | # from sklearn.pipeline import Pipeline 6 | # from sklearn.feature_extraction import text 7 | 8 | import pylab as pl 9 | from tweedr.models import DBSession, TokenizedLabel, Label 10 | from tweedr.ml.crf.classifier import CRF 11 | from tweedr.ml.features import crf_feature_functions, featurize 12 | from sklearn.metrics import confusion_matrix 13 | 14 | import logging 15 | logger = logging.getLogger(__name__) 16 | 17 | flatMap = lambda iterable: map(list, iterable) 18 | 19 | 20 | def evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts): 21 | '''If you use print_gloss, your test_y better be lists, not iterables.''' 22 | logger.info('Training on %d, testing on %d', len(train_y), len(test_y)) 23 | classifier.fit(train_X, train_y) 24 | predicted_y = classifier.predict(test_X) 25 | # flatten 26 | test_y = sum(test_y, []) 27 | predicted_y = sum(predicted_y, []) 28 | #counts = compare_labels(test_y, predicted_y, 'None') 29 | 30 | gold_labels = [] 31 | predicted_labels = [] 32 | i = 0 33 | diction = {} 34 | 35 | j = 0 36 | 37 | while j < len(test_y): 38 | try: 39 | diction[test_y[j]] += 1 40 | except KeyError: 41 | diction[test_y[j]] = 1 42 | j = j + 1 43 | 44 | if (opts.include_none == 0): 45 | while i < len(test_y): 46 | if (test_y[i] == "None" and predicted_y[i] == "None"): 47 | pass 48 | else: 49 | try: 50 | if diction[test_y[i]] > opts.threshold: 51 | gold_labels.append(test_y[i]) 52 | predicted_labels.append(predicted_y[i]) 53 | except KeyError: 54 | pass 55 | i = i + 1 56 | 57 | cm = confusion_matrix(gold_labels, predicted_labels) 58 | print "Confusion Matrix" 59 | print cm 60 | pl.matshow(cm) 61 | pl.title('Confusion Matrix') 62 | pl.colorbar() 63 | pl.savefig("confusion_matrix" + str(index) + '.png', format='png') 64 | pl.clf() 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser( 69 | description='Train CRFSuite on data from the QCRI MySQL database', 70 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 71 | parser.add_argument('-k', '--k-folds', 72 | type=int, default=10, help='How many folds of the data to test on') 73 | parser.add_argument('--max-data', 74 | type=int, default=10000, help='Maximum data points to train and test on') 75 | parser.add_argument('--include-none', type=int, default=0, help='Include None in Confusion Matrix.') 76 | parser.add_argument('-threshold', type=int, default=10, help='Threshold for number of gold labels classified.') 77 | opts = parser.parse_args() 78 | 79 | # e.g., tokenized_label = 80 | # 82 | # Train and test must be iterables of objects that support CRF-ready 83 | # .tokens and .labels attributes. 
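    # Illustrative shape only (not real data): featurize() yields one feature list
    # per token, so a two-token tweet might come out as
    #   [['bridge', 'BUILDING', 'UNIQUE', ...], ['collapsed', 'UNIQUE', ...]]
    # and the matching item.labels is a same-length list of label strings.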
84 | query = DBSession.query(TokenizedLabel).limit(opts.max_data) 85 | X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query) 86 | # unzip and flatten into static list 87 | X, y = zip(*X_y) 88 | # we need to read X multiple times, so make sure it's all static 89 | X = map(flatMap, X) 90 | 91 | categories = dict((label.id, label.text) for label in DBSession.query(Label)) 92 | print 'categories', categories 93 | 94 | N = len(y) 95 | index = 0 96 | for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True): 97 | # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices] 98 | train_X = [X[i] for i in train_indices] 99 | train_y = [y[i] for i in train_indices] 100 | test_X = [X[i] for i in test_indices] 101 | test_y = [y[i] for i in test_indices] 102 | classifier = CRF() 103 | # print_gloss=True 104 | index = index + 1 105 | evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts) 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /tweedr/ml/classifier.py: -------------------------------------------------------------------------------- 1 | '''I recommend emulating the scikit-learn interface, with or without 2 | ClassifierI because fit and predict are more descriptive names than append_raw 3 | and save, etc. In the CRF case, it's less transparent how to access the 4 | underlying tagger/trainer but I think as long as it's following the sklearn 5 | paradigm, an opaque wrapper is okay. 6 | ''' 7 | from sklearn import base 8 | 9 | import logging 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class ClassifierI(base.ClassifierMixin): 14 | ''' 15 | Interface to emulate sklearn classifiers. 16 | 17 | * `X`: an iterable of data points, each of which might be a point in many-dimensional space, a list of strings, etc. 18 | * `y`: an iterable of discrete labels, each of which may be a string, or a True/False value, or just an integer (not a float). 
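    A minimal subclass sketch (illustrative only), showing the expected contract:

        class MajorityClassifier(ClassifierI):
            def fit(self, X, y):
                labels = list(y)
                self.majority = max(set(labels), key=labels.count)

            def predict(self, X):
                return [self.majority for _ in X]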
19 | ''' 20 | def __init__(self, *args, **kw): 21 | pass 22 | 23 | def fit(self, X, y): 24 | '''Fit the model according to the given training data.''' 25 | raise NotImplementedError(__doc__) 26 | 27 | def fit_transform(self, X, y=None): 28 | '''Fit to some data, then transform it''' 29 | self.fit(X, y) 30 | return self.transform(X) 31 | 32 | def get_params(self, deep=False): 33 | '''Get parameters for the estimator''' 34 | raise NotImplementedError(__doc__) 35 | 36 | def predict(self, X): 37 | '''Predict class labels for samples in X.''' 38 | raise NotImplementedError(__doc__) 39 | 40 | # def score(self, X, y): 41 | # '''Returns the mean accuracy on the given test data and labels.''' 42 | # raise NotImplementedError(__doc__) 43 | 44 | def set_params(self, **params): 45 | '''Set the parameters of the estimator.''' 46 | raise NotImplementedError(__doc__) 47 | 48 | def transform(self, X, threshold=None): 49 | '''Reduce X to its most important features.''' 50 | raise NotImplementedError(__doc__) 51 | -------------------------------------------------------------------------------- /tweedr/ml/crf/__init__.py: -------------------------------------------------------------------------------- 1 | from tweedr.lib.text import utf8str 2 | import crfsuite 3 | 4 | 5 | class ItemSequence(crfsuite.ItemSequence): 6 | def __init__(self, features_iter, check=False): 7 | '''Create new ItemSequence, typedef std::vector based on the 8 | given iterable of iterable of 2-tuples or strings. 9 | If check=True, any unicode present in the given features_iter 10 | will be encoded into a bytestring as utf8.''' 11 | super(ItemSequence, self).__init__() 12 | self.append_raw(features_iter, check=check) 13 | 14 | def append_raw(self, features_iter, check=False): 15 | ''' 16 | @features_iter is an iterable of iterables, of tuples or strings. 17 | type: [[(str, float) | str]], where [] is an iterable 18 | ''' 19 | for features in features_iter: 20 | if check: 21 | features = map(utf8str, features) 22 | item = crfsuite.Item() 23 | for feature in features: 24 | if isinstance(feature, tuple): 25 | attribute = crfsuite.Attribute(*feature) 26 | else: 27 | attribute = crfsuite.Attribute(feature) 28 | item.append(attribute) 29 | self.append(item) 30 | -------------------------------------------------------------------------------- /tweedr/ml/crf/classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import crfsuite 4 | from tweedr.ml.crf import ItemSequence 5 | from tweedr.ml.classifier import ClassifierI 6 | from tweedr.ml.features import featurize 7 | 8 | from itertools import izip 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class CRF(ClassifierI): 15 | ''' 16 | Doesn't fit entirely within the classifier paradigm, due to the hierarchy of data: 17 | Sentences have each token labeled, but each sentence is an individual entity. 
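    Rough usage sketch (variable names are illustrative):

        crf = CRF()
        crf.fit(train_X, train_y)          # X: per-sentence lists of token feature lists
        predicted_y = crf.predict(test_X)  # returns per-sentence lists of label strings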
18 | ''' 19 | def __init__(self, algorithm='l2sgd', type_='crf1d'): 20 | self.trainer = crfsuite.Trainer() 21 | self.trainer.select(algorithm, type_) 22 | # default parameters: 23 | self.trainer.set('c2', '0.1') 24 | 25 | def fit(self, X, y): 26 | # For a CRF, X is an iterable of lists of lists of features (=strings) 27 | # and y is a list of list of token labels (=strings) 28 | for features_iter, labels in zip(X, y): 29 | items = ItemSequence(features_iter, check=True) 30 | self.trainer.append(items, tuple(labels), 0) 31 | 32 | self.model_filepath = tempfile.NamedTemporaryFile(delete=False).name 33 | self.trainer.train(self.model_filepath, -1) 34 | # persist to file and pull it back out. 35 | self.tagger = crfsuite.Tagger() 36 | self.tagger.open(self.model_filepath) 37 | 38 | def get_params(self, help=False): 39 | params = self.trainer.params() 40 | return dict((name, self.trainer.help(name) if help else self.trainer.get(name)) for name in params) 41 | 42 | def predict(self, X): 43 | y = [] 44 | for features_iter in X: 45 | # maybe use self.predict_one(features_iter) instead? 46 | items = ItemSequence(features_iter, check=True) 47 | # this will just die if self.tagger has not been set 48 | self.tagger.set(items) 49 | # could also run self.probability() and self.marginal() 50 | # convert tuple (output of viterbi()) to list 51 | labels = list(self.tagger.viterbi()) 52 | y.append(labels) 53 | return y 54 | 55 | def set_params(self, **params): 56 | for name, value in params.item(): 57 | self.trainer.set(name, value) 58 | 59 | # additional fields below are not required by ClassifierI 60 | def predict_one(self, features_iter): 61 | items = ItemSequence(features_iter, check=True) 62 | self.tagger.set(items) 63 | return list(self.tagger.viterbi()) 64 | 65 | def save(self, model_filepath): 66 | logger.debug('Saving model to %s', model_filepath) 67 | # just die if self.model_filepath doesn't exist 68 | os.rename(self.model_filepath, model_filepath) 69 | self.model_filepath = model_filepath 70 | 71 | @classmethod 72 | def from_file(cls, model_filepath): 73 | '''If we are given a model_filepath that points to an existing file, use it. 74 | otherwise, create a temporary file to store the model because CRFSuite 75 | doesn't seem to allow us to create a tagger directly from a trained 76 | trainer object.''' 77 | # cls = CRF, obviously 78 | crf = cls() 79 | crf.tagger = crfsuite.Tagger() 80 | logger.debug('Loading existing model from %s', model_filepath) 81 | crf.tagger.open(model_filepath) 82 | crf.model_filepath = model_filepath 83 | 84 | return crf 85 | 86 | @classmethod 87 | def from_data(cls, data, feature_functions): 88 | '''data must be an iterable of objects with .tokens and .labels attributes.''' 89 | crf = cls() 90 | X_y = ((featurize(datum.tokens, feature_functions), datum.labels) for datum in data) 91 | X, y = izip(*X_y) 92 | # X (and y) are iterables, by the way 93 | 94 | logger.debug('Fitting CRF') 95 | crf.fit(X, y) 96 | 97 | return crf 98 | 99 | @classmethod 100 | def default(cls, feature_functions, retrain=False, limit=10000): 101 | # Is it messy to have this method here, since it depends on tweedr.models.*? 102 | # and on a specific filepath in the local filesystem? 
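        # Note: with the default limit this caches to
        # /tmp/tweedr.ml.crf.classifier-max10000.model, and an existing model file
        # always wins -- the retrain flag is not consulted below.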
103 | model_filepath = '/tmp/tweedr.ml.crf.classifier-max%d.model' % limit 104 | if os.path.exists(model_filepath): 105 | return cls.from_file(model_filepath) 106 | else: 107 | from tweedr.models import DBSession, TokenizedLabel 108 | query = DBSession.query(TokenizedLabel).limit(10000) 109 | crf = cls.from_data(query, feature_functions) 110 | crf.save(model_filepath) 111 | return crf 112 | -------------------------------------------------------------------------------- /tweedr/ml/crf/wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import crfsuite 3 | import tempfile 4 | 5 | from tweedr.ml.crf import ItemSequence 6 | from tweedr.ml.features import featurize 7 | 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Trainer(crfsuite.Trainer): 13 | """ 14 | Inherit crfsuite.Trainer to implement message() function, which receives 15 | progress messages from a training process. 16 | """ 17 | def message(self, s): 18 | logger.silly('Trainer.message: %s', s.strip()) 19 | 20 | def append_raw(self, features_iter, labels): 21 | # len(labels) = len(features_iter) = length of sentence / sequence 22 | # labels is a tuple of strings, features_iter is an tuple/list of variable-length lists of strings. 23 | # this just wraps all the data / labels with crfsuite types 24 | items = ItemSequence(features_iter) 25 | # labels = crfsuite.StringList(labels) 26 | self.append(items, tuple(labels), 0) 27 | 28 | def save(self, model_path): 29 | # Trainer.select(algorithm, type): Initialize the training algorithm and set type of graphical model 30 | # lbfgs is the default algorithm 31 | # l2sgd is L2-regularized SGD 32 | # crf1d is 1st-order dyad features. 33 | self.select('l2sgd', 'crf1d') 34 | 35 | # Set the coefficient for L2 regularization to 0.1 36 | # potential values change based on algorithm previously selected 37 | # See http://www.chokkan.org/software/crfsuite/manual.html 38 | self.set('c2', '0.1') 39 | 40 | # Start training; the training process will invoke trainer.message() 41 | # to report the progress. 42 | self.train(model_path, -1) 43 | 44 | # print 'After training: params and their values' 45 | # for name in trainer.params(): 46 | # print name, trainer.get(name), trainer.help(name) 47 | 48 | 49 | class Tagger(crfsuite.Tagger): 50 | def __init__(self, model_path): 51 | super(Tagger, self).__init__() 52 | self.open(model_path) 53 | 54 | def tag_raw(self, features_iter): 55 | ''' 56 | Obtain the label sequence predicted by the tagger. 57 | 58 | This returns a tuple of strings (label identifiers) 59 | ''' 60 | items = ItemSequence(features_iter) 61 | self.set(items) 62 | # could also run self.probability() and self.marginal() 63 | return self.viterbi() 64 | 65 | @classmethod 66 | def from_path_or_data(cls, data, feature_functions, model_filepath=None): 67 | '''If we are given a model_filepath that points to an existing file, use it. 
68 | otherwise, create a temporary file to store the model because CRFSuite 69 | doesn't seem to allow us to create a tagger directly from a trained 70 | trainer object.''' 71 | if model_filepath is None or not os.path.exists(model_filepath): 72 | if model_filepath is None: 73 | model_filepath = tempfile.NamedTemporaryFile(delete=False).name 74 | 75 | trainer = Trainer() 76 | for i, datum in enumerate(data): 77 | tokens = datum.tokens 78 | labels = datum.labels 79 | 80 | tokens_features = featurize(tokens, feature_functions) 81 | trainer.append_raw(tokens_features, labels) 82 | 83 | trainer.save(model_filepath) 84 | logger.debug('Trained on %d instances and saved to %s', i, model_filepath) 85 | else: 86 | logger.debug('Loading existing model from %s', model_filepath) 87 | 88 | return cls(model_filepath) 89 | -------------------------------------------------------------------------------- /tweedr/ml/evaluate.py: -------------------------------------------------------------------------------- 1 | # import os 2 | import argparse 3 | from colorama import Fore 4 | 5 | from sklearn import cross_validation # , metrics 6 | # from sklearn.pipeline import Pipeline 7 | # from sklearn.feature_extraction import text 8 | 9 | from tweedr.lib.text import gloss 10 | from tweedr.models import DBSession, TokenizedLabel 11 | from tweedr.ml import compare_labels # print_metrics_summary 12 | from tweedr.ml.crf.classifier import CRF 13 | from tweedr.ml.features import crf_feature_functions, featurize, featurize_adjacent 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | 18 | flatMap = lambda iterable: map(list, iterable) 19 | 20 | 21 | def evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, print_gloss=False): 22 | '''If you use print_gloss, your test_y better be lists, not iterables.''' 23 | logger.info('Training on %d, testing on %d', len(train_y), len(test_y)) 24 | classifier.fit(train_X, train_y) 25 | predicted_y = classifier.predict(test_X) 26 | 27 | if print_gloss: 28 | for tokens_features, gold_labels, predicted_labels in zip(test_X, test_y, predicted_y): 29 | print '-' * 80 30 | # hope that the first feature string is the unigram! 31 | tokens = [token_features[0] for token_features in tokens_features] 32 | print gloss(zip(tokens, gold_labels, predicted_labels), 33 | prefixes=(Fore.WHITE, Fore.YELLOW, Fore.BLUE), 34 | postfixes=(Fore.RESET, Fore.RESET, Fore.RESET)) 35 | 36 | # flatten 37 | test_y = sum(test_y, []) 38 | predicted_y = sum(predicted_y, []) 39 | counts = compare_labels(test_y, predicted_y, 'None') 40 | print 'counts', counts 41 | 42 | # sklearn metrics doesn't like string labels. 43 | # used_labels = list(set(gold_labels + predicted_labels)) 44 | # print 'used_labels', used_labels 45 | # lookup = dict((label, index) for index, label in enumerate(used_labels)) 46 | # print 'lookup', lookup 47 | # remap to integers 48 | # gold_labels = [lookup[gold_label] for gold_label in gold_labels] 49 | # predicted_labels = [lookup[predicted_label] for predicted_label in predicted_labels] 50 | 51 | # print_metrics_summary(gold_labels, predicted_labels) 52 | # classification_report requires numeric labels, apparently? 
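    # (the hand-rolled precision/recall/F-score below are computed straight from the
    # Counts object, with 'None' treated as the negative class)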
53 | # print metrics.classification_report(gold_labels, predicted_labels) 54 | 55 | precision = float(counts.true_positives) / (counts.true_positives + counts.false_positives) 56 | recall = float(counts.true_positives) / (counts.true_positives + counts.false_negatives) 57 | fscore = 2 * (precision * recall / (precision + recall)) 58 | for name, value in [('Precision', precision), ('Recall', recall), ('F-score', fscore)]: 59 | print '%s: %.4f' % (name, value) 60 | 61 | 62 | def main(): 63 | parser = argparse.ArgumentParser( 64 | description='Train CRFSuite on data from the QCRI MySQL database', 65 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 66 | parser.add_argument('-k', '--k-folds', 67 | type=int, default=10, help='How many folds of the data to test on') 68 | parser.add_argument('--max-data', 69 | type=int, default=10000, help='Maximum data points to train and test on') 70 | parser.add_argument('--adjacent', 71 | type=int, default=0, help='Set adjacent to 1 if adjacent functions want to be used') 72 | opts = parser.parse_args() 73 | 74 | # e.g., tokenized_label = 75 | # 77 | # Train and test must be iterables of objects that support CRF-ready 78 | # .tokens and .labels attributes. 79 | query = DBSession.query(TokenizedLabel).\ 80 | filter(TokenizedLabel.tweet is not None).\ 81 | filter(TokenizedLabel.tweet != '').\ 82 | limit(opts.max_data) 83 | if (opts.adjacent == 0): 84 | X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query) 85 | else: 86 | X_y = ((featurize_adjacent(item.tokens, crf_feature_functions), item.labels) for item in query) 87 | # unzip and flatten into static list 88 | X, y = zip(*X_y) 89 | # we need to read X multiple times, so make sure it's all static 90 | X = map(flatMap, X) 91 | 92 | N = len(y) 93 | for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True): 94 | # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices] 95 | train_X = [X[i] for i in train_indices] 96 | train_y = [y[i] for i in train_indices] 97 | test_X = [X[i] for i in test_indices] 98 | test_y = [y[i] for i in test_indices] 99 | classifier = CRF() 100 | # print_gloss=True 101 | evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y) 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /tweedr/ml/evaluate_combinations.py: -------------------------------------------------------------------------------- 1 | # import os 2 | import argparse 3 | from colorama import Fore 4 | import itertools 5 | from sklearn import cross_validation # , metrics 6 | # from sklearn.pipeline import Pipeline 7 | # from sklearn.feature_extraction import text 8 | 9 | from tweedr.lib.text import gloss 10 | from tweedr.models import DBSession, TokenizedLabel, Label 11 | from tweedr.ml import compare_labels # print_metrics_summary 12 | from tweedr.ml.crf.classifier import CRF 13 | from tweedr.ml.features import crf_feature_functions, featurize 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | 18 | flatMap = lambda iterable: map(list, iterable) 19 | 20 | 21 | def evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, print_gloss=False): 22 | '''If you use print_gloss, your test_y better be lists, not iterables.''' 23 | try: 24 | logger.info('Training on %d, testing on %d', len(train_y), len(test_y)) 25 | classifier.fit(train_X, train_y) 26 | predicted_y = classifier.predict(test_X) 27 | 28 | if 
print_gloss: 29 | for tokens_features, gold_labels, predicted_labels in zip(test_X, test_y, predicted_y): 30 | print '-' * 80 31 | # hope that the first feature string is the unigram! 32 | tokens = [token_features[0] for token_features in tokens_features] 33 | print gloss(zip(tokens, gold_labels, predicted_labels), 34 | fixes=(Fore.WHITE, Fore.YELLOW, Fore.BLUE), 35 | postfixes=(Fore.RESET, Fore.RESET, Fore.RESET)) 36 | 37 | # flatten 38 | test_y = sum(test_y, []) 39 | predicted_y = sum(predicted_y, []) 40 | counts = compare_labels(test_y, predicted_y, 'None') 41 | print 'counts', counts 42 | 43 | # sklearn metrics doesn't like string labels. 44 | # used_labels = list(set(gold_labels + predicted_labels)) 45 | # print 'used_labels', used_labels 46 | # lookup = dict((label, index) for index, label in enumerate(used_labels)) 47 | # print 'lookup', lookup 48 | # remap to integers 49 | # gold_labels = [lookup[gold_label] for gold_label in gold_labels] 50 | # predicted_labels = [lookup[predicted_label] for predicted_label in predicted_labels] 51 | 52 | # print_metrics_summary(gold_labels, predicted_labels) 53 | # classification_report requires numeric labels, apparently? 54 | # print metrics.classification_report(gold_labels, predicted_labels) 55 | try: 56 | precision = float(counts.true_positives) / (counts.true_positives + counts.false_positives) 57 | recall = float(counts.true_positives) / (counts.true_positives + counts.false_negatives) 58 | fscore = 2 * (precision * recall / (precision + recall)) 59 | for name, value in [('Precision', precision), ('Recall', recall), ('F-score', fscore)]: 60 | print '%s: %.4f' % (name, value) 61 | except ZeroDivisionError: 62 | pass 63 | except IOError: 64 | pass 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser( 69 | description='Train CRFSuite on data from the QCRI MySQL database', 70 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 71 | parser.add_argument('-k', '--k-folds', 72 | type=int, default=10, help='How many folds of the data to test on') 73 | parser.add_argument('--max-data', 74 | type=int, default=10000, help='Maximum data points to train and test on') 75 | opts = parser.parse_args() 76 | 77 | # e.g., tokenized_label = 78 | # 80 | # Train and test must be iterables of objects that support CRF-ready 81 | # .tokens and .labels attributes. 
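    # Note: the loops below enumerate every subset of crf_feature_functions
    # (2 ** len(crf_feature_functions) subsets, i.e. 512 with the 9 functions in
    # tweedr.ml.features.sets), and each subset is cross-validated k times.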
82 | query = DBSession.query(TokenizedLabel).limit(opts.max_data) 83 | 84 | for L in range(0, len(crf_feature_functions) + 1): 85 | for subset in itertools.combinations(crf_feature_functions, L): 86 | sub = list(subset) 87 | print sub 88 | X_y = ((featurize(item.tokens, sub), item.labels) for item in query) 89 | # unzip and flatten into static list 90 | X, y = zip(*X_y) 91 | # we need to read X multiple times, so make sure it's all static 92 | X = map(flatMap, X) 93 | categories = dict((label.id, label.text) for label in DBSession.query(Label)) 94 | print 'categories', categories 95 | 96 | N = len(y) 97 | #tests on different data sets -> k folds is set to 10 right now 98 | for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True): 99 | train_X = [X[i] for i in train_indices] 100 | train_y = [y[i] for i in train_indices] 101 | test_X = [X[i] for i in test_indices] 102 | test_y = [y[i] for i in test_indices] 103 | classifier = CRF() 104 | evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y) 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /tweedr/ml/features/__init__.py: -------------------------------------------------------------------------------- 1 | # each feature function takes an N-long document (list of strings) and returns an N-long list 2 | # of lists/tuples of features (i.e., strings) to add to the total data for that sentence. 3 | # often the list will contain lists that are 1-long 4 | from itertools import izip, chain 5 | 6 | 7 | def spacer(xs): 8 | return [' '.join(xs)] 9 | 10 | 11 | def featurize_adjacent(tokens, feature_functions): 12 | feature_functions_results = [feature_function(tokens) for feature_function in feature_functions] 13 | list_of_token_features = [] 14 | #add token features 15 | for token_featuress in izip(*feature_functions_results): 16 | list_of_token_features.append(list(chain.from_iterable(token_featuress))) 17 | #add features to the left and to the right 18 | i = 0 19 | while i < len(list_of_token_features): 20 | j = list_of_token_features[i] 21 | it = [k for k in j] 22 | if i > 0: 23 | a = list_of_token_features[i - 1] 24 | c = ['^^^' + k for k in a] 25 | try: 26 | c.pop(0) 27 | except IndexError: 28 | pass 29 | it += c 30 | 31 | if i < len(list_of_token_features) - 1: 32 | b = list_of_token_features[i + 1] 33 | d = ['$$$' + k for k in b] 34 | try: 35 | d.pop(0) 36 | except IndexError: 37 | pass 38 | it += d 39 | i = i + 1 40 | yield chain.from_iterable([it]) 41 | 42 | 43 | def featurize(tokens, feature_functions): 44 | '''Take a N-long list of strings (natural text), apply each feature function, 45 | and then unzip (transpose) and flatten so that we get a N-long list of 46 | arbitrarily-long lists of strings. 47 | ''' 48 | feature_functions_results = [feature_function(tokens) for feature_function in feature_functions] 49 | for token_featuress in izip(*feature_functions_results): 50 | yield chain.from_iterable(token_featuress) 51 | 52 | 53 | def featurize_to_dict(tokens, feature_functions): 54 | '''Take a N-long list of strings (natural text), apply each feature function, 55 | create N-long list of dicts with keys that are the names of feature functions, 56 | and values that are the joined output of those functions. 
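    Illustrative example (not a doctest), using unigrams and capitalized from
    tweedr.ml.features.ngrams and tweedr.ml.features.characters:

        list(featurize_to_dict(['red', 'cross'], [unigrams, capitalized]))
        => [{'unigrams': 'red'}, {'unigrams': 'cross'}]

    The empty 'capitalized' result for lowercase tokens is dropped entirely.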
57 | ''' 58 | feature_functions_results = [feature_function(tokens) for feature_function in feature_functions] 59 | for token_featuress in izip(*feature_functions_results): 60 | token_feature_dict = dict() 61 | for feature_function, token_features in zip(feature_functions, token_featuress): 62 | token_feature_string = ' '.join(token_features) 63 | if token_feature_string: 64 | token_feature_dict[feature_function.__name__] = token_feature_string 65 | yield token_feature_dict 66 | 67 | 68 | def main(): 69 | # example usage: 70 | # echo "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place." | python __init__.py 71 | import sys 72 | from tweedr.lib.text import token_re 73 | from tweedr.ml.features.sets import all_feature_functions 74 | for line in sys.stdin: 75 | # tokenize the document on whitespace 76 | tokens = token_re.findall(line) 77 | # apply all feature functions 78 | tokens_features = featurize(tokens, all_feature_functions) 79 | for i, token_features in enumerate(tokens_features): 80 | print i, list(token_features) 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /tweedr/ml/features/characters.py: -------------------------------------------------------------------------------- 1 | def capitalized(document): 2 | return [['CAPITALIZED'] if token[0].isupper() else [] for token in document] 3 | 4 | 5 | def plural(document): 6 | return [['PLURAL'] if token.endswith('s') else [] for token in document] 7 | 8 | 9 | def numeric(document): 10 | return [['NUMERIC'] if token.isdigit() else [] for token in document] 11 | 12 | 13 | def includes_numeric(document): 14 | return [['INCLUDES_NUMERIC'] if any(char.isdigit() for char in token) else [] for token in document] 15 | -------------------------------------------------------------------------------- /tweedr/ml/features/dbpedia.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from tweedr.ml.spotlight import annotate 4 | from tweedr.lib.text import zip_boundaries 5 | 6 | spotlight_annotate_url = '%s/rest/annotate' % os.environ.get('SPOTLIGHT', 'http://spotlight.sztaki.hu:2222') 7 | 8 | 9 | def get_pos(offset, document): 10 | doc_joined = " ".join(document) 11 | beginning = doc_joined[:offset] 12 | length = len(beginning.split(" ")) - 1 13 | return length 14 | 15 | 16 | def features(document): 17 | doc_length = len(document) 18 | doc_joined = " ".join(document) 19 | positions = [[] for x in xrange(doc_length)] 20 | try: 21 | annotations = annotate('http://tweedr.dssg.io:2222/rest/annotate', doc_joined, confidence=0.4, support=20) 22 | for a in annotations: 23 | offset = a["offset"] 24 | type = a["types"] 25 | all_types = type.split(",") 26 | dbpedia_type = all_types[0] 27 | pos = get_pos(offset, document) 28 | db = str(dbpedia_type) 29 | positions[pos] = [db.upper()] 30 | except Exception: 31 | return positions 32 | return positions 33 | 34 | 35 | def spotlight(document, confidence=0.1, support=10): 36 | document_string = u' '.join(document) 37 | r = requests.post(spotlight_annotate_url, 38 | headers=dict(Accept='application/json'), 39 | data=dict(text=document_string, confidence=confidence, support=support)) 40 | Resources = r.json().get('Resources', []) 41 | for token, token_start, token_end in zip_boundaries(document): 42 | labels = [] 43 | for Resource in Resources: 44 | entity_start = 
int(Resource['@offset']) 45 | entity_end = entity_start + len(Resource['@surfaceForm']) 46 | 47 | if entity_start <= token_start <= entity_end or entity_start <= token_end <= entity_end: 48 | entity_uri = Resource['@URI'] 49 | entity_types = Resource['@types'].split(',') 50 | labels += [entity_uri] + entity_types 51 | yield labels 52 | -------------------------------------------------------------------------------- /tweedr/ml/features/lexicons.py: -------------------------------------------------------------------------------- 1 | from tweedr.ml import wordnet, lexicon_list 2 | 3 | 4 | def is_transportation(document): 5 | return [['TRANSPORTATION'] if token in lexicon_list.transportation else [] for token in document] 6 | 7 | 8 | def is_building(document): 9 | return [['BUILDING'] if token in lexicon_list.buildings else [] for token in document] 10 | 11 | 12 | def hypernyms(document, recursive=True, depth=1): 13 | '''Iterate through all senses for all 1-away hypernyms. E.g.: 14 | 15 | print map(list, hypernyms(document)) 16 | ''' 17 | for token in document: 18 | yield wordnet.token_hypernyms(token, recursive, depth) 19 | -------------------------------------------------------------------------------- /tweedr/ml/features/ngrams.py: -------------------------------------------------------------------------------- 1 | from tweedr.ml.features import spacer 2 | 3 | 4 | def unigrams(document): 5 | return [[token] for token in document] 6 | 7 | 8 | def rbigrams(document): 9 | grams = zip(document, document[1:] + ['$$$']) 10 | return map(spacer, grams) 11 | 12 | 13 | def lbigrams(document): 14 | grams = zip(['^^^'] + document[:-1], document) 15 | return map(spacer, grams) 16 | 17 | 18 | def ctrigrams(document): 19 | grams = zip(['^^^'] + document[:-1], document, document[1:] + ['$$$']) 20 | return map(spacer, grams) 21 | 22 | 23 | def unique(document): 24 | # TODO: unique doesn't really belong here, but doesn't quite merit its own module 25 | seen = {} 26 | features = [] 27 | for token in document: 28 | features.append(['UNIQUE'] if token not in seen else []) 29 | seen[token] = 1 30 | return features 31 | -------------------------------------------------------------------------------- /tweedr/ml/features/nlp.py: -------------------------------------------------------------------------------- 1 | # the tagger is global, powered by the singleton module in tweedr.ml.ark 2 | from tweedr.ark.java.singleton import tagger 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def pos_tags(document): 9 | text = ' '.join(document) 10 | tokens_line, tags_line = tagger.tokenize_and_tag(text) 11 | tokens = tokens_line.split() 12 | tags = tags_line.split() 13 | 14 | if not (len(document) == len(tokens) == len(tags)): 15 | # TODO: make this warning unnecessary 16 | logger.critical('TwitterNLP tagger did not tokenize correctly: %s vs %s', text, tokens_line) 17 | return [[tag] for tag in tags] 18 | -------------------------------------------------------------------------------- /tweedr/ml/features/sets.py: -------------------------------------------------------------------------------- 1 | from tweedr.ml.features import characters, dbpedia, lexicons, ngrams 2 | 3 | crf_feature_functions = [ 4 | ngrams.unigrams, 5 | characters.plural, 6 | lexicons.is_transportation, 7 | lexicons.is_building, 8 | characters.capitalized, 9 | characters.numeric, 10 | ngrams.unique, 11 | lexicons.hypernyms, 12 | dbpedia.features, 13 | ] 14 | 15 | all_feature_functions = crf_feature_functions + [ 16 | 
ngrams.rbigrams, 17 | ngrams.lbigrams, 18 | ngrams.ctrigrams, 19 | ] 20 | 21 | classifier_feature_functions = [ 22 | ngrams.unigrams, 23 | ] 24 | -------------------------------------------------------------------------------- /tweedr/ml/lexicon_list.py: -------------------------------------------------------------------------------- 1 | transportation = ['aerial tramway', 'aircraft', 'aircraft carrier', 'airplane', 'ambulance', 'armored car', 'auto', 'automobile', 'baby carriage', 'balloon', 'bathyscaphe', 'barge', 'barrow', 'battleship', 'bicycle', 'bike', 'biplane', 'blimp', 'boat', 'bobsled', 'bomber', 'boxcar', 'broomstick', 'buggy', 'bulldozer', 'bullet train', 'bus', 'cab', 'cabin cruiser', 'cable car', 'caboose', 'camper', 'canoe', 'car', 'caravan', 'caravel', 'cargo ship', 'carriage', 'carrier', 'cart', 'catamaran', 'chairlift', 'chariot', 'chopper', 'clipper ship', 'clunker', 'coach', 'compact car', 'combine', 'compact car', 'Conestoga wagon', 'container ship', 'convertible', 'conveyance', 'conveyor belt', 'convoy', 'coupe', 'covered wagon', 'crane', 'crop duster', 'cruise ship', 'cruiser', 'cutter', 'cycle', 'delivery truck', 'delivery van', 'destroyer', 'diesel truck', 'dinghy', 'dirigible', 'dirt bike', 'diving bell', 'dog cart', 'dogsled', 'donkey cart', 'dray', 'driver', 'dugout canoe', 'dump truck', 'earth mover', 'eighteen-wheeler', 'electric car', 'elevated railroad', 'elevator', 'engine', 'escalator', 'express train', 'ferry', 'fireboat', 'fire engine', 'fishing boat', 'flatbed truck', 'forklift', 'four-door', 'four-wheel drive', 'four-by-four', 'freighter', 'freight train', 'frigate', 'funicular railway', 'galleon', 'garbage truck', 'glider', 'go-cart', 'golf cart', 'gondola', 'gondola lift', 'gridlock', 'handcar', 'hang glider', 'hansom cab', 'hardtop', 'harvester', 'hatchback', 'haul', 'hay wagon', 'hearse', 'helicopter', 'hook and ladder truck', 'hovercraft', 'hot-air balloon', 'hot rod', 'houseboat', 'hull', 'humvee', 'hybrid', 'hydrofoil', 'hydroplane', 'ice boat', 'ice breaker', 'jalopy', 'jeep', 'jet', 'jet boat', 'jetliner', 'journey', 'jetpack', 'jet ski', 'jumbo jet', 'junk', 'kayak', 'ketch', 'landing craft', 'lifeboat', 'life raft', 'light rail', 'limo', 'limousine', 'litter', 'locomotive', 'lorry', 'low-rider', 'magic carpet', 'maglev', 'mast', 'minesweeper', 'minibus', 'minivan', 'model T', 'monorail', 'moped', 'motor', 'motorcar', 'motorboat', 'motorcycle', 'motor home', 'mountain bike', 'narrowboat', 'oar', 'ocean liner', 'off-road vehicle', 'oil tanker', 'outboard motor', 'outrigger canoe', 'oxcart', 'paddle', 'paddlewheeler', 'parachute', 'passenger', 'patrol car', 'pedal boat', 'pickup truck', 'pilot', 'plane', 'police car', 'power boat', 'prairie schooner', 'propeller', 'PT boat', 'pumper truck', 'punt', 'push cart', 'racecar', 'racing car', 'raft', 'ragtop', 'railroad', 'railway', 'rapid transit', 'recreational vehicle', 'rickshaw', 'ride', 'riverboat', 'roadster', 'rocket', 'rover', 'rowboat', 'rudder', 'runabout', 'RV', 'sail', 'sailboat', 'satellite', 'school bus', 'schooner', 'scooter', 'scull', 'seaplane', 'sedan', 'sedan chair', 'Segway', 'semi', 'ship', 'shuttle', 'side wheeler', 'skiff', 'ski lift', 'ski tow', 'sled', 'sledge', 'sleigh', 'snow cat', 'snowmobile', 'snowplow', 'spaceship', 'space shuttle', 'speedboat', 'sports car', 'sport-utility vehicle', 'SUV', 'squad car', 'SST', 'stagecoach', 'station wagon', 'steamboat', 'steamship', 'stretch limo', 'stock car', 'stroller', 'subcompact', 'submarine', 'subway', 'surrey', 'SUV', 'tank', 
'tanker', 'taxi', 'taxicab', 'T-bar lift', 'thresher', 'tire', 'toboggan', 'town car', 'tow truck', 'tracks', 'tractor', 'tractor-trailer', 'trail bike', 'trailer', 'train', 'tram', 'tramway', 'transit', 'trawler', 'tricycle', 'trolley', 'truck', 'tugboat', 'two-door', 'van', 'vehicle', 'vespa', 'vessel', 'wagon', 'wheelchair', 'yacht'] 2 | 3 | buildings = ['abbey', 'aircraft hangar', 'airport terminal', 'amphitheater', 'apartment building', 'aqueduct', 'arch', 'arena', 'armory', 'assembly hall', 'barn', 'barracks', 'beach house', 'boathouse', 'boarding house', 'bowling alley', 'bridge', 'brownstone', 'building', 'bungalow', 'bunkhouse', 'bunker', 'cabana', 'cabin', 'capitol', 'carport', 'castle', 'catacomb', 'cathedral', 'chalet', 'chapel', 'chateau', 'church', 'cinema', 'city hall', 'clubhouse', 'college', 'compound', 'concert hall', 'condominium', 'conservatory', 'cottage', 'courthouse', 'crypt', 'depot', 'detached house', 'dock', 'dome', 'dormitory', 'double wide', 'duplex', 'dwelling', 'earth-sheltered house', 'embassy', 'exposition hall', 'factory', 'farm', 'farmhouse', 'ferry slip', 'ferry terminal', 'firehouse', 'fire station', 'folly', 'forge', 'fort', 'fortress', 'foundry', 'gallery', 'garage', 'gas station', 'gazebo', 'geodesic dome', 'granary', 'greenhouse', 'gym', 'gymnasium', 'hall', 'hangar', 'haunted house', 'headquarters', 'high-rise', 'home', 'hospital', 'hostel', 'hotel', 'hot house', 'house', 'houseboat', 'housing project', 'hunting lodge', 'hut', 'igloo', 'jail', 'kiosk', 'laboratory', 'lean-to', 'library', 'lighthouse', 'lodge', 'log cabin', 'longhouse', 'mall', 'manor', 'manse', 'mansion', 'marina', 'market', 'mausoleum', 'meeting hall', 'mill', 'minaret', 'mobile home', 'monastery', 'monument', 'mosque', 'motel', 'museum', 'nuclear power plant', 'nursing home', 'observatory', 'office building', 'opera house', 'outbuilding', 'outhouse', 'pagoda', 'palace', 'parking garage', 'parliament', 'pavilion', 'plant', 'playhouse', 'police station', 'pool house', 'post office', 'power plant', 'prefab building', 'prison', 'pump house', 'pyramid', 'quonset hut', 'railway station', 'ranch', 'rectory', 'refinery', 'residence', 'restaurant', 'roller rink', 'roundhouse', 'rowhouse', 'school', 'shack', 'shed', 'shelter', 'shopping center', 'shopping mall', 'shrine', 'silo', 'skating rink', 'skyscraper', 'skyway', 'smokestack', 'spire', 'split-level house', 'stable', 'stadium', 'state house', 'station', 'steeple', 'store', 'storehouse', 'strip mall', 'structure', 'studio', 'supermarket', 'symphony', 'synagogue', 'temple', 'tenement', 'tent', 'terminal', 'theater', 'tipi', 'toll house', 'tomb', 'tower', 'townhouse', 'treehouse', 'triplex', 'Tudor house', 'university', 'vault', 'vicarage', 'villa', 'warehouse', 'watermill', 'workshop', 'yurt'] 4 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/PreProcess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import types 6 | import string 7 | from pattern.en.wordlist import BASIC 8 | 9 | stopwords = \ 10 | 
'''im,rt,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your''' 11 | stopwords = stopwords.split(',') 12 | stopwords.extend(BASIC) 13 | 14 | 15 | def processTweet(tweet): 16 | if isinstance(tweet, types.NoneType): 17 | return ' ' 18 | tweet = str(tweet) 19 | tweet = tweet.lower() 20 | tweet = re.sub('((www\.[/s]+)|(https?://[^\s]+))', ' ', tweet) 21 | tweet = tweet.translate(None, string.punctuation) 22 | tweet = re.sub("@[^\s]+", ' ', tweet) 23 | tweet = re.sub('[\s]+', ' ', tweet) 24 | tweet = re.sub(r"#([^\s]+)", r'\1', tweet) 25 | tweet = tweet.strip('\'"') 26 | words = tweet.split() 27 | words = [word for word in words if not word in stopwords] 28 | words = ' '.join(words) 29 | words = words.strip() 30 | return words 31 | 32 | 33 | def is_ascii(tweet): 34 | for c in tweet: 35 | if ord(c) >= 128: 36 | return False 37 | return True 38 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/PySLDA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pyper import R 5 | import os 6 | from PreProcess import processTweet, is_ascii 7 | 8 | 9 | class supervisedLDA: 10 | 11 | def __init__(self, dataFileName, alpha=1.0, numtopics=5, eta=0.1, logistic=True, lamda=1.0, e_iter=10, m_iter=4, variance=0.25, cutoff=0.25): 12 | model_filename = 'model_%s.RDS' % dataFileName 13 | vocab_filename = 'vocabulary_%s.RDS' % dataFileName 14 | fullpath = os.path.realpath(__file__) 15 | (path, files) = os.path.split(fullpath) 16 | self.path = path 17 | self.params = { 18 | 'numtopics': numtopics, 19 | 'alpha': alpha, 20 | 'eta': eta, 21 | 'logistic': logistic, 22 | 'lambda': lamda, 23 | 'e_iter': e_iter, 24 | 'm_iter': m_iter, 25 | 'variance': variance, 26 | 'OutputName': dataFileName, 27 | 'model_filename': model_filename, 28 | 'vocab_filename': vocab_filename, 29 | 'test_cutoff': cutoff, 30 | } 31 | self.r = R(use_pandas=True, use_numpy=True) 32 | self.assign_R_params() 33 | 34 | def set_param(self, param_name, param_value): 35 | self.params[param_name] = param_value 36 | 37 | def get_params(self, deep=False): 38 | return self.params 39 | 40 | def assign_R_params(self): 41 | for (key, value) in self.params.iteritems(): 42 | self.r.assign(key, value) 43 | 44 | def fit(self, documents, labels): 45 | (documents, labels) = self.transform(documents, labels) 46 | self.r.assign('documents', documents) 47 | self.r.assign('labels', labels) 48 | self.r.run('source("trainLDA.R")') 49 | vocab = self.r['vocabulary'] 50 | self.set_param('vocabulary', vocab) 51 | self.assign_R_params() 52 | 53 | def transform(self, documents, labels): 54 | documents = [(tweet if is_ascii(tweet) else ' ') for tweet in 55 | documents] 56 | documents = map(lambda x: processTweet(x), documents) 57 | documents = map(lambda x: str(x).translate(None, '"'), 58 | documents) 59 | (tweets_filtered, labels_filtered) = ([], []) 60 | for (tweet, label) in zip(documents, labels): 61 | if len(tweet) > 1: 62 | 
tweets_filtered.append(tweet) 63 | labels_filtered.append(label) 64 | return (tweets_filtered, labels_filtered) 65 | 66 | def test_transform(self, documents): 67 | documents = [(tweet if is_ascii(tweet) else ' ') for tweet in 68 | documents] 69 | documents = map(lambda x: processTweet(x), documents) 70 | documents = map(lambda x: str(x).translate(None, '"'), 71 | documents) 72 | tweets_filtered = [] 73 | for tweet in documents: 74 | if len(tweet) > 1: 75 | tweets_filtered.append(tweet) 76 | return tweets_filtered 77 | 78 | def __str__(self): 79 | return 'sLDA(cut:%s)' % self.params['test_cutoff'] 80 | 81 | def predict(self, documents, gold_labels): 82 | (documents, gold_labels) = self.transform(documents, 83 | gold_labels) 84 | self.r.assign('testDocuments', documents) 85 | self.r.run('source("testLDA.R")') 86 | predictions = self.r['pred'] 87 | cutoff = self.params['test_cutoff'] 88 | predictions = map(lambda x: int(x > cutoff), predictions) 89 | return (predictions, gold_labels) 90 | 91 | def save_model(self): 92 | self.r.run('source("%s/saveModel.R")' % self.path) 93 | 94 | def load_model(self): 95 | self.r.run('source("%s/loadModel.R")' % self.path) 96 | vocab = self.r['vocab'] 97 | topics = self.r['topics'] 98 | self.set_param('vocab', vocab) 99 | self.set_param('topics', topics) 100 | self.assign_R_params() 101 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/README.md: -------------------------------------------------------------------------------- 1 | ### Supervised Latent Dirchlet Allocation 2 | 3 | This sub-module creates a python wrapper for the sLDA algorithm in R for binary classification and topic modeling. This wrapper implements methods in a fashion similar to classifiers provided in Scikit-Learn. Because there is no way to convert the R "slda" model into a Python object, the model is stored on disk. The sLDA classifier cannot currently handle non-ASCII text. 4 | 5 | * * * 6 | 7 | To create a sLDA classifier object, call supervisedLDA() and provide a filename for where you want the model to be saved. 8 | 9 | Methods: 10 | 11 | 1. Fit(documents, labels): Trains the sLDA classifier using the provided corpus and corresponding labels. 12 | 2. Predict(Document, gold_labels): Returns a vector containing the likelihood that each document in testing corpus will have a positive label. 13 | 3. SaveModel: Saves the vocabulary used in the trained sLDA model. 14 | 4. LoadModel: Loads an existing vocabulary into the sLDA model. -------------------------------------------------------------------------------- /tweedr/ml/pyslda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/ml/pyslda/__init__.py -------------------------------------------------------------------------------- /tweedr/ml/pyslda/evaluate-classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Evaluate the tweet classifier: is this tweet about damage/casualty or not? 5 | 6 | Results as of 8/8/13: 7 | 8 | INFO:eval-clf:Reading labeled tweets from database... 9 | INFO:eval-clf:Read 1045 tweets 10 | model ....f1.... pre....rec....acc....f1_std pre_std....rec_std....acc_std 11 | KNeighborsCla....0.52....0.84....0.38....0.84....0.09.... 0.10.... 0.08.... 0.04 12 | SVC(C=1, cach....0.45....0.89....0.32....0.83....0.14.... 0.07.... 0.13.... 
0.05 13 | DecisionTreeC....0.55....0.91....0.40....0.85....0.09.... 0.07.... 0.08.... 0.03 14 | MultinomialNB....0.62....0.53....0.74....0.79....0.07.... 0.08.... 0.08.... 0.03 15 | LogisticRegre....0.66....0.78....0.59....0.86....0.05.... 0.10.... 0.06.... 0.03 16 | """ 17 | 18 | import argparse 19 | import logging 20 | import numpy as np 21 | 22 | from sklearn import cross_validation # , metrics 23 | from sklearn.feature_extraction.text import CountVectorizer 24 | 25 | # from sklearn.feature_extraction.text import TfidfTransformer 26 | 27 | from sklearn import metrics 28 | from sklearn.linear_model import LogisticRegression 29 | from sklearn.neighbors import KNeighborsClassifier 30 | from sklearn.pipeline import Pipeline 31 | from sklearn.svm import SVC 32 | from sklearn.tree import DecisionTreeClassifier 33 | from sklearn.naive_bayes import MultinomialNB 34 | 35 | from tweedr.models import DamageClassification, DBSession 36 | from tweedr.ml.pyslda import PySLDA 37 | 38 | logger = logging.getLogger('eval-clf') 39 | 40 | 41 | def summarize(evals, n): 42 | '''Compute average and standard deviation for evaluation metrics''' 43 | 44 | avg = {} 45 | for key in evals[0].iterkeys(): 46 | scores = np.array([e[key] for e in evals]) 47 | avg[key] = np.average(scores) 48 | avg[key + '_std'] = np.std(scores) 49 | return avg 50 | 51 | 52 | def score(y_true, y_pred): 53 | '''Compute evaluation metrics. Note pos_label=1 parameter of 54 | f1/precision/recall. Thus, we only compute precision of the positive class 55 | (as opposed to computing the precision for both classes and taking the 56 | average).''' 57 | 58 | return { 59 | 'acc': metrics.accuracy_score(y_true, y_pred), 60 | 'f1': metrics.f1_score(y_true, y_pred, pos_label=1), 61 | 'pre': metrics.precision_score(y_true, y_pred, pos_label=1), 62 | 'rec': metrics.recall_score(y_true, y_pred, pos_label=1), 63 | } 64 | 65 | 66 | def read_tweets(): 67 | '''Read labeled tweets from database''' 68 | 69 | logger.info('Reading labeled tweets from database...') 70 | labeled_tweets = \ 71 | np.array(DBSession.query(DamageClassification).filter(DamageClassification.mturk_code 72 | == 'QCRI').limit(opts.max_data).all()) 73 | logger.info('Read %d tweets', len(labeled_tweets)) 74 | return labeled_tweets 75 | 76 | 77 | def metric_names(): 78 | '''Name of metrics. 
Gotcha: keep in sync with score function''' 79 | 80 | metric_names = ['f1', 'pre', 'rec', 'acc'] 81 | return metric_names + [m + '_std' for m in metric_names] 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = \ 86 | argparse.ArgumentParser(description='Train a classifier on data from the QCRI MySQL database', 87 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 88 | parser.add_argument('-k', '--k-folds', type=int, default=10, 89 | help='How many folds of the data to test on') 90 | parser.add_argument('--max-data', type=int, default=10000, 91 | help='Maximum data points to train and test on') 92 | opts = parser.parse_args() 93 | 94 | labeled_tweets = read_tweets() 95 | 96 | # FIXME: add features beyond bag of words 97 | 98 | pipeline = Pipeline([('vect', CountVectorizer())]) 99 | 100 | # , ('tfidf', TfidfTransformer())]) 101 | 102 | x = pipeline.fit_transform([t.text for t in 103 | labeled_tweets]).toarray() 104 | x_text = np.array([t.text.lower() for t in labeled_tweets]) 105 | y = np.array([t.label for t in labeled_tweets]) 106 | y_list = np.array([t.label for t in labeled_tweets]) 107 | 108 | classifiers = [ 109 | KNeighborsClassifier(3), 110 | SVC(gamma=2, C=1), 111 | DecisionTreeClassifier(max_depth=5), 112 | MultinomialNB(), 113 | LogisticRegression(), 114 | PySLDA.supervisedLDA('testModel01', cutoff=0.15), 115 | ] 116 | 117 | # FIXME: Write sLDA wrapper that extends ClassifierI for inclusion above 118 | 119 | last_classifier = len(classifiers) - 1 120 | cv = cross_validation.KFold(len(y), opts.k_folds, shuffle=True, 121 | random_state=1234) 122 | metric_names = metric_names() 123 | print '\t'.join(['model' + ' ' * 8] + metric_names) 124 | for (i, clf) in enumerate(classifiers): 125 | results = [] 126 | for (train, test) in cv: 127 | if i < last_classifier: 128 | truth = y[test] 129 | pred = clf.fit(x[train], y[train]).predict(x[test]) 130 | results.append(score(truth, pred)) 131 | else: 132 | truth = y_list[test] 133 | clf.fit(list(x_text[train]), list(y_list[train])) 134 | (pred, truth) = clf.predict(list(x_text[test]), 135 | list(truth)) 136 | results.append(score(truth, pred)) 137 | results_avg = summarize(results, opts.k_folds) 138 | print str(clf)[:13] + '\t' + '\t'.join(['%.2f' % results_avg[m] 139 | for m in metric_names]) 140 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/loadModel.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | 3 | vocab <- readRDS(vocab_filename) 4 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/saveModel.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | 3 | saveRDS(vocab,vocabulary_filename) 4 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/testLDA.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | require("pracma") 3 | 4 | 5 | sldaModel <- readRDS(model_filename) 6 | corpus <- lexicalize(testDocuments, lower=TRUE, vocab=vocabulary) 7 | pred <- slda.predict(corpus, sldaModel$topics, sldaModel$model, alpha = 1.0, eta = 0.1) 8 | 9 | pred <- sigmoid(pred) -------------------------------------------------------------------------------- /tweedr/ml/pyslda/trainLDA.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | corpus <- lexicalize(documents) 3 | documents <- 
corpus$documents 4 | vocabulary <- corpus$vocab 5 | 6 | params <- sample(c(-1,1), numtopics, replace=TRUE) 7 | result <- slda.em(documents=documents, K = numtopics, vocab=vocabulary, num.e.iterations = e_iter, num.m.iterations= m_iter, alpha = alpha, eta = eta, as.numeric(labels), params, variance = variance, lambda = lambda, logistic = logistic, method="sLDA") 8 | 9 | saveRDS(result,model_filename) -------------------------------------------------------------------------------- /tweedr/ml/spotlight/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python DBpedia Spotlight API Wrapper 3 | ==================================== 4 | 5 | This is just a simple interface to a Spotlight API. 6 | 7 | Tested with DBPedia Spotlight 0.5 and 0.6.5. 8 | 9 | Note that I'm trying to track Spotlight release version numbers, so you can 10 | easily see which pyspotlight version has been tested with which Spotlight 11 | release. 12 | 13 | I hope the code and the small documentation speaks for itself :-) 14 | 15 | If you should encounter any problems, feel free to contact me on github 16 | (originell). I'm happy to help out with anything related to my code. 17 | """ 18 | __version_info__ = (0, 6, 5) 19 | __version__ = '.'.join(map(str, __version_info__)) 20 | 21 | 22 | import requests 23 | 24 | 25 | class SpotlightException(Exception): 26 | """ 27 | Exception raised on Spotlight failures. 28 | 29 | Basically this exception is raised if there was no valid JSON response 30 | from Spotlight. 31 | """ 32 | pass 33 | 34 | 35 | # Some helper functions. 36 | def _convert_number(value): 37 | """ 38 | Try to convert a string to an int or float. 39 | """ 40 | if isinstance(value, bool): 41 | return value 42 | # Workaround for footnotes being put into Resources.surfaceForm and then 43 | # having them parsed by the JSON parser into a list. (issue #4) 44 | if isinstance(value, list): 45 | value = unicode(value) 46 | 47 | try: 48 | return int(value) 49 | except ValueError: 50 | try: 51 | return float(value) 52 | except ValueError: 53 | return value 54 | 55 | 56 | def _dict_cleanup(dic, dict_type=dict): 57 | """ 58 | Clean the response dictionary from ugly @ signs in keys. 59 | 60 | TODO: Make this an iteration based recursion instead of function based. 61 | That way we can avoid stack fails. 62 | """ 63 | clean = dict_type() 64 | for key, value in dic.iteritems(): 65 | if value is None: 66 | continue 67 | 68 | key = key.replace('@', '') 69 | try: 70 | try: 71 | # If this is a string or bool, 72 | # go straight to type conversion. 73 | if (isinstance(value, basestring) or 74 | isinstance(value, bool)): 75 | raise AttributeError 76 | # Test for an iterable (list, tuple, set) 77 | value[0] 78 | # Clean up each element in the iterable 79 | clean[key] = [_dict_cleanup(element, dict_type) for element in value] 80 | except KeyError: 81 | clean[key] = _dict_cleanup(value, dict_type) 82 | except AttributeError: 83 | clean[key] = _convert_number(value) 84 | return clean 85 | 86 | 87 | # Main functions. 88 | # 89 | # I was inspired to go back to a function based approach after seeing this 90 | # awesome talk by Jack Diederich: Stop Writing Classes 91 | # http://pyvideo.org/video/880/stop-writing-classes 92 | # Most of the class-based approach had the problems he described. 93 | # Embarrassing! 94 | def annotate(address, text, confidence=0.0, support=0, 95 | spotter='LingPipeSpotter', disambiguator='Default', 96 | policy='whitelist', headers={}): 97 | """ 98 | Annotate a text. 
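    For illustration only, a minimal call might look like the following (the
    address is a hypothetical local Spotlight instance, not anything this
    wrapper provides or configures):

        resources = annotate('http://localhost:2222/rest/annotate',
                             'Flooding has damaged two bridges in Joplin.',
                             confidence=0.4, support=20)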
99 | 100 | Can raise :exc:`requests.exceptions.HTTPError` or 101 | :exc:`SpotlightException`, depending on where the failure is (HTTP status 102 | code not 200 or the response not containing valid json). 103 | 104 | :param address: 105 | The absolute address of the annotate REST API. 106 | :type address: string 107 | 108 | :param text: 109 | The text to be sent. 110 | :type text: string 111 | 112 | :param confidence: 113 | Filter out annotations below a given confidence. 114 | Based on my experience I would suggest you set this to something 115 | above 0.4, however your experience might vary from text to text. 116 | :type confidence: float 117 | 118 | :param support: 119 | Only output annotations above a given prominence (support). 120 | Based on my experience I would suggest you set this to something 121 | above 20, however your experience might vary from text to text. 122 | :type support: int 123 | 124 | :param spotter: 125 | One of spotters available on your DBPedia Spotlight server. 126 | For example one of: LingPipeSpotter, AtLeastOneNounSelector, 127 | CoOccurrenceBasedSelector 128 | :type spotter: string 129 | 130 | :param disambiguator: 131 | The disambiguator to use on the annotation. 132 | :type disambiguator: string 133 | 134 | :param policy: 135 | The policy to be used. 136 | :type disambiguator: string 137 | 138 | :param headers: 139 | Additional headers to be set on the request. 140 | :type headers: dictionary 141 | 142 | :rtype: list of resources 143 | """ 144 | payload = {'confidence': confidence, 'support': support, 145 | 'spotter': spotter, 'disambiguator': disambiguator, 146 | 'policy': policy, 'text': text} 147 | reqheaders = {'accept': 'application/json'} 148 | reqheaders.update(headers) 149 | 150 | # Its better for the user to have to explicitly provide a protocl in the 151 | # URL, since transmissions might happen over HTTPS or any other secure or 152 | # faster (spdy :D) channel. 153 | if not '://' in address: 154 | raise SpotlightException('Oops. Looks like you forgot the protocol ' 155 | '(http/https) in your url (%s).' % address) 156 | 157 | response = requests.post(address, data=payload, headers=reqheaders) 158 | if response.status_code != requests.codes.ok: 159 | # Every http code besides 200 shall raise an exception. 160 | response.raise_for_status() 161 | 162 | pydict = response.json 163 | if pydict is None: 164 | raise SpotlightException("Spotlight's response did not contain valid " 165 | "JSON: %s" % response.text) 166 | 167 | if not 'Resources' in pydict: 168 | raise SpotlightException('No Resources found in spotlight response: %s' % pydict) 169 | 170 | return [_dict_cleanup(resource) for resource in pydict['Resources']] 171 | 172 | 173 | # This is more or less a duplicate of the annotate function, with just 174 | # the return line being the difference haha. 175 | def candidates(address, text, confidence=0.0, support=0, 176 | spotter='LingPipeSpotter', disambiguator='Default', 177 | policy='whitelist', headers={}): 178 | """ 179 | Get the candidates from a text. 180 | 181 | Uses the same arguments as :meth:`annotate`. 
182 | 183 | :rtype: list of surface forms 184 | """ 185 | payload = {'confidence': confidence, 'support': support, 186 | 'spotter': spotter, 'disambiguator': disambiguator, 187 | 'policy': policy, 'text': text} 188 | reqheaders = {'accept': 'application/json'} 189 | reqheaders.update(headers) 190 | response = requests.post(address, data=payload, headers=reqheaders) 191 | if response.status_code != requests.codes.ok: 192 | # Every http code besides 200 shall raise an exception. 193 | response.raise_for_status() 194 | 195 | pydict = response.json 196 | if pydict is None: 197 | raise SpotlightException("Spotlight's response did not contain valid " 198 | "JSON: %s" % response.text) 199 | 200 | if not 'annotation' in pydict: 201 | raise SpotlightException('No annotations found in spotlight response: %s' % pydict) 202 | 203 | if not 'surfaceForm' in pydict['annotation']: 204 | raise SpotlightException('No surface forms found in spotlight response: %s' % pydict) 205 | 206 | # Previously we assumed that the surfaceForm is *always* a list, however 207 | # depending on how many are returned, this does not have to be the case. 208 | # So we are doing some good ol' duck typing here. 209 | try: 210 | pydict['annotation']['surfaceForm'][0] 211 | except KeyError: 212 | # However note that we will *always* return a list. 213 | return [_dict_cleanup(pydict['annotation']['surfaceForm']), ] 214 | return [_dict_cleanup(form) 215 | for form in pydict['annotation']['surfaceForm']] 216 | -------------------------------------------------------------------------------- /tweedr/ml/wordnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pattern.en import wordnet 5 | from pattern.vector import stem 6 | 7 | 8 | def WordnetFeatures(token): 9 | synset = wordnet.synsets(token) 10 | if len(synset) > 0: 11 | synset = synset[0] 12 | hypernym = synset.hypernyms(depth=2, recursive=True) 13 | # hypernym.extend(synset.hyponyms(depth=2,recursive=True)) 14 | return [hyper.senses[0] for hyper in hypernym] 15 | else: 16 | return [] 17 | 18 | 19 | def WordNet(document): 20 | return [WordnetFeatures(token) for token in document] 21 | 22 | 23 | def token_hypernyms(token, recursive, depth): 24 | '''Stem each token using default stemmer from the pattern library (PORTER?)''' 25 | for synset in wordnet.synsets(stem(token)): 26 | for hypernym in synset.hypernyms(recursive, depth): 27 | for sense in hypernym.senses: 28 | yield sense 29 | -------------------------------------------------------------------------------- /tweedr/models/README.md: -------------------------------------------------------------------------------- 1 | ## MySQL libraries for Python 2 | 3 | psycopg2 is the (pretty clear) choice for PostgreSQL (or pg8000 if you don't have build tools). 4 | But MySQL has more options. 
5 | 6 | * MySQL for Python, by Andy Dustman 7 | - original: http://sourceforge.net/projects/mysql-python/ 8 | - docs: http://mysql-python.sourceforge.net/MySQLdb.html 9 | - github: https://github.com/farcepest/MySQLdb1 10 | - next generation: https://github.com/farcepest/moist 11 | - `easy_install MySQL-Python` 12 | - `import MySQLdb` 13 | 14 | * PyMySQL 15 | - Pure python 16 | - http://www.pymysql.org/ 17 | 18 | * OurSQL 19 | - http://pythonhosted.org/oursql/ 20 | -------------------------------------------------------------------------------- /tweedr/models/__init__.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import orm 2 | from tweedr.lib.text import token_re 3 | from tweedr.models.metadata import engine 4 | 5 | sessionmaker = orm.sessionmaker(bind=engine) 6 | DBSession = sessionmaker() 7 | 8 | # we write enhanced ORM classes directly on top of the schema originals, 9 | # so that enhancements are optional and transparent 10 | from tweedr.models.schema import ( 11 | DamageClassification, 12 | TokenizedLabel, 13 | UniformSample, 14 | Label, 15 | KeywordSample, 16 | Tweet, 17 | ) 18 | 19 | # This quiets the 'import unused' pyflakes warning 20 | __all__ = ['DamageClassification', 'TokenizedLabel', 'UniformSample', 'Label', 'KeywordSample', 'Tweet'] 21 | 22 | 23 | class DamageClassification(DamageClassification): 24 | # DamageClassification does not actually have a single FK, but references multiple tables. 25 | # so this join is actually way more complicated 26 | # tweet_object = orm.relationship(Tweet, lazy='join', 27 | # primaryjoin=orm.foreign(DamageClassification.DSSG_ID) == Tweet.dssg_id) 28 | 29 | @property 30 | def text(self): 31 | 'Run join query to get text of this labeled tweet' 32 | # FIXME: Slow. Consider join in instead. 33 | if not hasattr(self, 'text_'): 34 | if 'uniform' in self.which_sample: 35 | self.text_ = DBSession.query(UniformSample.text).filter(UniformSample.dssg_id == self.DSSG_ID).first()[0] 36 | elif 'keyword' in self.which_sample: 37 | self.text_ = DBSession.query(KeywordSample.text).filter(KeywordSample.dssg_id == self.DSSG_ID).first()[0] 38 | else: 39 | self.text_ = None 40 | return self.text_ 41 | 42 | @property 43 | def label(self): 44 | if self.Infrastructure == 1 or self.Casualty == 1: 45 | return 1. 46 | else: 47 | return 0. 
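# Illustrative sketch only (not executed): querying damage-labeled tweets with
# the enhanced class above, assuming the MYSQL_* environment variables point at
# a populated database; this mirrors the query used in
# tweedr/ml/pyslda/evaluate-classifier.py.
#
#     labeled = DBSession.query(DamageClassification) \
#         .filter(DamageClassification.mturk_code == 'QCRI').limit(10).all()
#     for row in labeled:
#         print row.text, row.label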
48 | 49 | 50 | class TokenizedLabel(TokenizedLabel): 51 | # the FK is called token_type, even though it is an identifying key and not 52 | # an actual token type, so we name the target object "token_type_object" 53 | token_type_object = orm.relationship(Label, lazy='subquery', 54 | primaryjoin=orm.foreign(TokenizedLabel.token_type) == Label.id) 55 | 56 | @property 57 | def tokens(self): 58 | return token_re.findall(unicode(self.tweet).encode('utf8')) 59 | 60 | @property 61 | def labels(self, null_label=None): 62 | labels = [] 63 | label_start, label_end = self.token_start, self.token_end 64 | for match in token_re.finditer(self.tweet): 65 | token_start, token_end = match.span() 66 | # token = match.group(0) 67 | # we want to determine if this particular token in the original tweet overlaps 68 | # with any portion of the selected label (label_span) 69 | label = null_label 70 | if label_start <= token_start <= label_end or label_start <= token_end <= label_end: 71 | label = self.token_type_object.text 72 | labels.append(label) 73 | return [unicode(label).encode('utf8') for label in labels] 74 | -------------------------------------------------------------------------------- /tweedr/models/example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tweedr.lib.text import whitespace_unicode_translations 3 | from tweedr.models import DBSession, TokenizedLabel, Label 4 | 5 | import logging 6 | logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO) 7 | 8 | 9 | def count(): 10 | print >> sys.stderr, 'Tweet count started.' 11 | print 'There are %d labels in the database.' % DBSession.query(Label).count() 12 | print 'There are %d tokenized labels in the database.' % DBSession.query(TokenizedLabel).count() 13 | 14 | 15 | def first(limit): 16 | print >> sys.stderr, 'First %d tweets.' % limit 17 | for tokenized_label in DBSession.query(TokenizedLabel).limit(limit): 18 | # print repr(tokenized_label) 19 | tokenized_label_text = unicode(tokenized_label).translate(whitespace_unicode_translations).encode('utf8') 20 | token_type_object = tokenized_label.token_type_object 21 | print token_type_object.id, '\t', token_type_object.text, '\t', tokenized_label_text 22 | 23 | if __name__ == '__main__': 24 | # py example.py | awk -F\\t '{print $1,$2}' | sort | uniq -c | sort -g 25 | first(1000) 26 | -------------------------------------------------------------------------------- /tweedr/models/metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sqlalchemy import create_engine, MetaData 4 | 5 | connection_string = 'mysql+mysqldb://%(MYSQL_USER)s:%(MYSQL_PASS)s@%(MYSQL_HOST)s/%(MYSQL_DATABASE)s' % os.environ 6 | engine = create_engine(connection_string, encoding='latin1', convert_unicode=True) 7 | # yep, it's latin1. Check it: 8 | # mysql -h host-stuff-here.rds.amazonaws.com -u ourusername -p 9 | # SHOW DATABASES; 10 | # USE THEDATABASEWITHSTUFFINIT; 11 | # SHOW VARIABLES LIKE "character_set_database"; 12 | metadata = MetaData(bind=engine) 13 | -------------------------------------------------------------------------------- /tweedr/models/schema.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file, schema.py, is generated by reflect.py but may have small manual 3 | modifications. It should only require regeneration when the database schema 4 | changes. 
It does not provide much past what the following snippet does, except 5 | that it doesn't require database calls to reflect the database on start up. 6 | 7 | class Something(Base): 8 | __table__ = Table('somethings', metadata, autoload=True) 9 | 10 | from schema import ( 11 | DamageClassification, 12 | TokenizedLabel, 13 | UniformSample, 14 | Label, 15 | KeywordSample, 16 | Tweet, 17 | ) 18 | ''' 19 | 20 | from sqlalchemy import Column 21 | from sqlalchemy.dialects import mysql 22 | from sqlalchemy.ext.declarative import declarative_base 23 | 24 | from tweedr.models.metadata import metadata 25 | 26 | 27 | class BaseMixin(object): 28 | def __json__(self): 29 | '''This method serves to both clone the record (copying its values) 30 | as well as filter out the special sqlalchemy key (_sa_instance_state) 31 | ''' 32 | return dict((k, v) for k, v in self.__dict__.items() if k != '_sa_instance_state') 33 | 34 | def __unicode__(self): 35 | type_name = self.__class__.__name__ 36 | pairs = [u'%s=%s' % (k, v) for k, v in self.__json__().items()] 37 | return u'<{type_name} {pairs}>'.format(type_name=type_name, pairs=u' '.join(pairs)) 38 | 39 | def __str__(self): 40 | return unicode(self).encode('utf-8') 41 | 42 | def __repr__(self): 43 | return str(self) 44 | 45 | Base = declarative_base(metadata=metadata, cls=BaseMixin) 46 | 47 | 48 | class DamageClassification(Base): 49 | __tablename__ = 'DamageClassification' 50 | id = Column(mysql.INTEGER(display_width=11), primary_key=True) 51 | DSSG_ID = Column(mysql.INTEGER(display_width=11)) 52 | Tweet = Column(mysql.TEXT()) 53 | Infrastructure = Column(mysql.TINYINT(display_width=1)) 54 | Casualty = Column(mysql.TINYINT(display_width=1)) 55 | mturk_code = Column(mysql.TEXT()) 56 | which_sample = Column(mysql.VARCHAR(length=10)) 57 | is_extracted = Column(mysql.INTEGER(display_width=1)) 58 | 59 | 60 | class TokenizedLabel(Base): 61 | __tablename__ = 'tokenized_labels' 62 | id = Column(mysql.INTEGER(display_width=20), primary_key=True) 63 | dssg_id = Column(mysql.INTEGER(display_width=100)) 64 | tweet = Column(mysql.VARCHAR(length=500)) 65 | token_start = Column(mysql.INTEGER(display_width=50)) 66 | token_end = Column(mysql.INTEGER(display_width=50)) 67 | token_type = Column(mysql.VARCHAR(length=500)) 68 | token = Column(mysql.VARCHAR(length=500)) 69 | mturk_code = Column(mysql.VARCHAR(length=50)) 70 | which_sample = Column(mysql.VARCHAR(length=10)) 71 | 72 | 73 | class UniformSample(Base): 74 | __tablename__ = 'uniform_sample' 75 | id = Column(mysql.INTEGER(display_width=20), primary_key=True) 76 | dssg_id = Column(mysql.INTEGER(display_width=20)) 77 | pwd = Column(mysql.VARCHAR(length=500)) 78 | text = Column(mysql.VARCHAR(length=500)) 79 | is_extracted = Column(mysql.INTEGER(display_width=11)) 80 | is_classified = Column(mysql.INTEGER(display_width=11)) 81 | type_sample = Column(mysql.VARCHAR(length=10)) 82 | 83 | 84 | class Label(Base): 85 | __tablename__ = 'labels' 86 | id = Column(mysql.VARCHAR(length=10), primary_key=True) 87 | text = Column(mysql.VARCHAR(length=100)) 88 | 89 | 90 | class KeywordSample(Base): 91 | __tablename__ = 'keyword_sample' 92 | id = Column(mysql.INTEGER(display_width=20), primary_key=True) 93 | dssg_id = Column(mysql.INTEGER(display_width=20)) 94 | pwd = Column(mysql.VARCHAR(length=500)) 95 | text = Column(mysql.VARCHAR(length=500)) 96 | is_extracted = Column(mysql.INTEGER(display_width=11)) 97 | is_classified = Column(mysql.INTEGER(display_width=11)) 98 | type_sample = Column(mysql.VARCHAR(length=10)) 99 | 100 | 101 | class 
Tweet(Base): 102 | __tablename__ = 'tweets' 103 | dssg_id = Column(mysql.INTEGER(display_width=20), primary_key=True) 104 | pwd = Column(mysql.VARCHAR(length=500)) 105 | _unit_id = Column(mysql.VARCHAR(length=500)) 106 | _golden = Column(mysql.VARCHAR(length=500)) 107 | _unit_state = Column(mysql.VARCHAR(length=500)) 108 | _trusted_judgment = Column(mysql.VARCHAR(length=500)) 109 | _last_jugment_at = Column(mysql.VARCHAR(length=500)) 110 | choose_one = Column(mysql.VARCHAR(length=500)) 111 | choose_oneconfidence = Column(mysql.VARCHAR(length=500)) 112 | choose_one_gold = Column(mysql.VARCHAR(length=500)) 113 | predicted = Column(mysql.VARCHAR(length=500)) 114 | text_no_rt = Column(mysql.VARCHAR(length=500)) 115 | tweet = Column(mysql.VARCHAR(length=500)) 116 | _trusted_judgments = Column(mysql.VARCHAR(length=500)) 117 | _last_judgment_at = Column(mysql.VARCHAR(length=500)) 118 | source = Column(mysql.VARCHAR(length=500)) 119 | type_of_advice_or_caution = Column(mysql.VARCHAR(length=500)) 120 | type_of_advice_or_cautionconfidence = Column(mysql.VARCHAR(length=500)) 121 | what = Column(mysql.VARCHAR(length=500)) 122 | when_ = Column(mysql.VARCHAR(length=500)) 123 | where_ = Column(mysql.VARCHAR(length=500)) 124 | category = Column(mysql.VARCHAR(length=500)) 125 | id = Column(mysql.VARCHAR(length=500)) 126 | retweetcount = Column(mysql.VARCHAR(length=500)) 127 | screenname = Column(mysql.VARCHAR(length=500)) 128 | source_gold = Column(mysql.VARCHAR(length=500)) 129 | text = Column(mysql.VARCHAR(length=500)) 130 | type_of_advice_or_caution_gold = Column(mysql.VARCHAR(length=500)) 131 | userid = Column(mysql.VARCHAR(length=500)) 132 | what_gold = Column(mysql.VARCHAR(length=500)) 133 | when_gold = Column(mysql.VARCHAR(length=500)) 134 | where_gold = Column(mysql.VARCHAR(length=500)) 135 | user_id = Column(mysql.VARCHAR(length=500)) 136 | how_many_injured_or_dead_if_people = Column(mysql.VARCHAR(length=500)) 137 | people_or_infrastructure = Column(mysql.VARCHAR(length=500)) 138 | people_or_infrastructureconfidence = Column(mysql.VARCHAR(length=500)) 139 | what_infrastructure_was_damaged_if_infrastructure = Column(mysql.VARCHAR(length=500)) 140 | how_many_injured_or_dead_if_people_gold = Column(mysql.VARCHAR(length=500)) 141 | people_or_infrastructure_gold = Column(mysql.VARCHAR(length=500)) 142 | intention = Column(mysql.VARCHAR(length=500)) 143 | intentionconfidence = Column(mysql.VARCHAR(length=500)) 144 | type_of_donation = Column(mysql.VARCHAR(length=500)) 145 | type_of_donationconfidence = Column(mysql.VARCHAR(length=500)) 146 | who = Column(mysql.VARCHAR(length=500)) 147 | intention_gold = Column(mysql.VARCHAR(length=500)) 148 | type_of_donation_gold = Column(mysql.VARCHAR(length=500)) 149 | who_gold = Column(mysql.VARCHAR(length=500)) 150 | type_of_message = Column(mysql.VARCHAR(length=500)) 151 | type_of_message_confidence = Column(mysql.VARCHAR(length=500)) 152 | url_or_name_of_the_stationchannel = Column(mysql.VARCHAR(length=500)) 153 | type_of_message_gold = Column(mysql.VARCHAR(length=500)) 154 | url_or_name_of_the_stationchannel_gold = Column(mysql.VARCHAR(length=500)) 155 | joplin_raw = Column(mysql.VARCHAR(length=500)) 156 | creationdate = Column(mysql.VARCHAR(length=500)) 157 | replyto = Column(mysql.VARCHAR(length=500)) 158 | replytouser = Column(mysql.VARCHAR(length=500)) 159 | replytoscreenname = Column(mysql.VARCHAR(length=500)) 160 | longitude = Column(mysql.VARCHAR(length=500)) 161 | latitude = Column(mysql.VARCHAR(length=500)) 162 | favorite = 
Column(mysql.VARCHAR(length=500)) 163 | retweet = Column(mysql.VARCHAR(length=500)) 164 | hashtags = Column(mysql.VARCHAR(length=500)) 165 | mediaurl = Column(mysql.VARCHAR(length=500)) 166 | city = Column(mysql.VARCHAR(length=500)) 167 | sandy_raw_dataset = Column(mysql.VARCHAR(length=500)) 168 | tweet__no = Column(mysql.VARCHAR(length=500)) 169 | user = Column(mysql.VARCHAR(length=500)) 170 | tweet_text = Column(mysql.VARCHAR(length=500)) 171 | url = Column(mysql.VARCHAR(length=500)) 172 | sandy_labeled = Column(mysql.VARCHAR(length=500)) 173 | type = Column(mysql.VARCHAR(length=500)) 174 | hom_many_injured_or_dead_if_people = Column(mysql.VARCHAR(length=500)) 175 | hom_many_injured_or_dead_if_people_gold = Column(mysql.VARCHAR(length=500)) 176 | what_infrastructure_was_damaged_if_infrastructure_gold = Column(mysql.VARCHAR(length=500)) 177 | the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event = Column(mysql.VARCHAR(length=500)) 178 | the_author_of_the_tweet_seems_to_be_an_eye_witness = Column(mysql.VARCHAR(length=500)) 179 | _of_the_eventconfidence = Column(mysql.VARCHAR(length=500)) 180 | the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event_ = Column(mysql.VARCHAR(length=500)) 181 | tweet_no = Column(mysql.VARCHAR(length=500)) 182 | tweet_no_rt = Column(mysql.VARCHAR(length=500)) 183 | type_of_mess = Column(mysql.VARCHAR(length=500)) 184 | age_gold = Column(mysql.VARCHAR(length=500)) 185 | _created_at = Column(mysql.VARCHAR(length=500)) 186 | _id = Column(mysql.VARCHAR(length=500)) 187 | _missed = Column(mysql.VARCHAR(length=500)) 188 | _started_at = Column(mysql.VARCHAR(length=500)) 189 | _tainted = Column(mysql.VARCHAR(length=500)) 190 | _channel = Column(mysql.VARCHAR(length=500)) 191 | _trust = Column(mysql.VARCHAR(length=500)) 192 | _worker_id = Column(mysql.VARCHAR(length=500)) 193 | _country = Column(mysql.VARCHAR(length=500)) 194 | _region = Column(mysql.VARCHAR(length=500)) 195 | _city = Column(mysql.VARCHAR(length=500)) 196 | _ip = Column(mysql.VARCHAR(length=500)) 197 | word_or_shortphrase = Column(mysql.VARCHAR(length=500)) 198 | instruction = Column(mysql.VARCHAR(length=500)) 199 | type_ = Column(mysql.VARCHAR(length=500)) 200 | of_message = Column(mysql.VARCHAR(length=500)) 201 | type_of_messageconfidence = Column(mysql.VARCHAR(length=500)) 202 | word_or_shortphrase_gold = Column(mysql.VARCHAR(length=500)) 203 | tokenized = Column(mysql.INTEGER(display_width=11)) 204 | body = Column(mysql.VARCHAR(length=500)) 205 | object_id = Column(mysql.INTEGER(display_width=50)) 206 | type_of_sampling = Column(mysql.VARCHAR(length=500)) 207 | is_random_keyword = Column(mysql.INTEGER(display_width=11)) 208 | is_categorized = Column(mysql.INTEGER(display_width=11)) 209 | -------------------------------------------------------------------------------- /tweedr/models/schema.template: -------------------------------------------------------------------------------- 1 | <%! from tweedr.lib.text import UpperCamelCase, singular %>\ 2 | <%def name="column_args(column)" filter="trim"> 3 | mysql.${repr(column.type)}${', primary_key=True' if column.primary_key else ''}${', unique=True' if column.unique else ''} 4 | \ 5 | ''' 6 | This file, schema.py, can be generated by running `tweedr-database reflect`. 7 | It should only require regeneration when the database schema changes. 
8 | It does not provide much past what the following snippet does, except 9 | that it doesn't require database calls to reflect the database on start up: 10 | 11 | class Something(Base): 12 | __table__ = Table('somethings', metadata, autoload=True) 13 | 14 | You can use the following snippet to import all mapped tables. 15 | 16 | from tweedr.models.schema import ( 17 | % for table in metadata.sorted_tables: 18 | ${singular(UpperCamelCase(table.name))}, 19 | % endfor 20 | ) 21 | 22 | ''' 23 | 24 | from sqlalchemy import Column 25 | from sqlalchemy.dialects import mysql 26 | from sqlalchemy.ext.declarative import declarative_base 27 | 28 | from tweedr.models.metadata import metadata 29 | 30 | 31 | class BaseMixin(object): 32 | def __json__(self): 33 | '''This method serves to both clone the record (copying its values) 34 | as well as filter out the special sqlalchemy key (_sa_instance_state) 35 | ''' 36 | return dict((k, v) for k, v in self.__dict__.items() if k != '_sa_instance_state') 37 | 38 | def __unicode__(self): 39 | type_name = self.__class__.__name__ 40 | pairs = [u'%s=%s' % (k, v) for k, v in self.__json__().items()] 41 | return u'<{type_name} {pairs}>'.format(type_name=type_name, pairs=u' '.join(pairs)) 42 | 43 | def __str__(self): 44 | return unicode(self).encode('utf-8') 45 | 46 | def __repr__(self): 47 | return str(self) 48 | 49 | Base = declarative_base(metadata=metadata, cls=BaseMixin) 50 | % for table in metadata.sorted_tables: 51 | 52 | 53 | class ${singular(UpperCamelCase(table.name))}(Base): 54 | __tablename__ = '${table.name}' 55 | % for column in table.columns: 56 | ${column.name} = Column(${column_args(column)}) 57 | % endfor 58 | % endfor 59 | -------------------------------------------------------------------------------- /tweedr/ui/README.md: -------------------------------------------------------------------------------- 1 | ## Configuration 2 | 3 | Make sure your environment variables are available to the process that will be serving the app. 4 | 5 | See the wiki [Environment](https://github.com/dssg/tweedr/wiki/Environment) page, particular the `MYSQL_*` variables. 6 | 7 | 8 | ## Running 9 | 10 | The `tweedr-ui` CLI gets installed when you install tweedr. Simply run it: 11 | 12 | tweedr-ui 13 | 14 | 15 | ## Browsing 16 | 17 | As you can see from the output of that call, Bottle serves the application on port 8080 by default. 18 | 19 | * http://127.0.0.1:8080/ 20 | 21 | This should redirect you to `/crf` — take a look at your developer console in the browser to see the endpoints it's hitting to load new tweets and tag them. 
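If you want to exercise those endpoints outside the browser, here is a minimal sketch using the `requests` library (assuming it is installed and the server is running on the default host and port):

    import requests
    base = 'http://127.0.0.1:8080'
    # fetch a random labeled tweet from the database as JSON
    print requests.get(base + '/tokenized_labels/sample').text
    # run the CRF tagger over arbitrary text and print the tagged sequences
    print requests.post(base + '/tagger/tag', data={'text': 'bridge collapsed in joplin'}).text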
22 | -------------------------------------------------------------------------------- /tweedr/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/ui/__init__.py -------------------------------------------------------------------------------- /tweedr/ui/crf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import bottle 5 | from bottle import request, redirect, static_file, mako_view as view 6 | 7 | import tweedr 8 | from tweedr.lib.text import token_re 9 | from tweedr.ml.crf.classifier import CRF 10 | from tweedr.ml.features import featurize 11 | from tweedr.ml.features.sets import crf_feature_functions 12 | from tweedr.models import DBSession, TokenizedLabel 13 | 14 | import logging 15 | logger = logging.getLogger(__name__) 16 | 17 | # tell bottle where to look for templates 18 | # We use Mako templates (*.mako) that are in the templates/ directory in the package root. 19 | # There are also Handlebars (*.bars) templates in there, but those are rendered on the client-side. 20 | bottle.TEMPLATE_PATH.append(os.path.join(tweedr.root, 'templates')) 21 | 22 | # this is the primary export 23 | app = bottle.Bottle() 24 | 25 | # globals are messy, but we don't to retrain a tagger for every request 26 | logger.debug('initializing %s (training or loading CRF using defaults)', __name__) 27 | GLOBALS = dict(tagger=CRF.default(crf_feature_functions)) 28 | 29 | 30 | @app.get('/') 31 | def root(): 32 | redirect('/crf') 33 | 34 | 35 | @app.get('/crf') 36 | @view('crf.mako') 37 | def index(): 38 | # effectively static; all the fun stuff happens in the template 39 | return dict() 40 | 41 | 42 | @app.get('/tokenized_labels/sample') 43 | def tokenized_labels_sample(): 44 | total = DBSession.query(TokenizedLabel).count() 45 | index = random.randrange(total) 46 | logger.debug('/tokenized_labels/sample: choosing #%d out of %d', index, total) 47 | tokenized_label = DBSession.query(TokenizedLabel).offset(index).limit(1).first() 48 | return tokenized_label.__json__() 49 | 50 | 51 | @app.post('/tagger/tag') 52 | def tagger_tag(): 53 | # For bottle >= 0.10, request.forms.xyz attributes return unicode strings 54 | # and an empty string if decoding fails. 
55 | text = request.forms.text 56 | tokens = token_re.findall(text.encode('utf8')) 57 | 58 | tokens_features = map(list, featurize(tokens, crf_feature_functions)) 59 | tagger = GLOBALS['tagger'] 60 | labels = tagger.predict([tokens_features])[0] 61 | 62 | sequences = [ 63 | {'name': 'tokens', 'values': tokens}, 64 | {'name': 'labels', 'values': labels}, 65 | ] 66 | for feature_function in crf_feature_functions: 67 | sequences.append({ 68 | 'name': feature_function.__name__, 69 | 'values': [', '.join(features) for features in feature_function(tokens)]}) 70 | 71 | return {'sequences': sequences} 72 | 73 | 74 | @app.route('/tagger/retrain') 75 | def tagger_retrain(): 76 | GLOBALS['tagger'] = CRF.default(crf_feature_functions, retrain=True) 77 | return dict(success=True) 78 | 79 | 80 | @app.route('/static/') 81 | def serve_static_file(filepath): 82 | return static_file(filepath, os.path.join(tweedr.root, 'static')) 83 | 84 | 85 | @app.route('/templates/') 86 | def serve_templates_file(filepath): 87 | return static_file(filepath, os.path.join(tweedr.root, 'templates')) 88 | -------------------------------------------------------------------------------- /tweedr/ui/middleware.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def add_duration_header(app): 5 | def call(environ, start_response): 6 | started = time.time() 7 | 8 | def wrapped_start_response(status, headers): 9 | duration = time.time() - started 10 | return start_response(status, headers + [('X-Duration', str(duration))]) 11 | 12 | return app(environ, wrapped_start_response) 13 | 14 | return call 15 | -------------------------------------------------------------------------------- /web/README.md: -------------------------------------------------------------------------------- 1 | # Extraction and Classification Web Tool 2 | 3 | This directory contains the code for the extraction and classification tool to crowdsource labeled data. 4 | 5 | ## Directories 6 | * Extraction-tool contains the extraction UI 7 | * Classification-tool contains the classification UI 8 | 9 | 10 | ## License 11 | 12 | Copyright © 2013 The University of Chicago. [MIT Licensed](LICENSE). 13 | -------------------------------------------------------------------------------- /web/extraction-tool/README.md: -------------------------------------------------------------------------------- 1 | # Extraction Tool 2 | 3 | This tool can be used to create labeled data from tweets. 4 | 5 | ## Setup Database for tweet output 6 | CREATE TABLE `tokenized_labels` ( 7 | `id` int(20) NOT NULL AUTO_INCREMENT, 8 | `dssg_id` int(100) NOT NULL, 9 | `tweet` varchar(500) DEFAULT NULL, 10 | `token_start` int(50) DEFAULT NULL, 11 | `token_end` int(50) DEFAULT NULL, 12 | `token_type` varchar(500) DEFAULT NULL, 13 | `token` varchar(500) DEFAULT NULL, 14 | `mturk_code` varchar(50) DEFAULT NULL, 15 | `which_sample` varchar(10) DEFAULT NULL, 16 | `which_disaster` varchar(60) DEFAULT NULL, 17 | PRIMARY KEY (`id`) 18 | ) ENGINE=InnoDB AUTO_INCREMENT=908 DEFAULT CHARSET=latin1; 19 | 20 | 21 | ## Contact 22 | 23 | Want to get in touch? Found a bug? Open up a [new issue](https://github.com/dssg/tweedr/issues/new) or email us at [dssg-qcri@googlegroups.com](mailto:dssg-qcri@googlegroups.com). 24 | 25 | 26 | ## License 27 | 28 | Copyright © 2013 The University of Chicago. [MIT Licensed](LICENSE). 
29 | -------------------------------------------------------------------------------- /web/extraction-tool/db.php: -------------------------------------------------------------------------------- 1 | body {background-image:url('background.png'); 17 | background-repeat:yes; 18 | font-family: fontC; 19 | color:#111; 20 | } 21 | 22 | 23 | 24 | "; 25 | $hash = $_SESSION['hash']; 26 | $id_num = $_GET['tweet_id']; 27 | $id2 = $_GET['id']; 28 | 29 | $random = $_GET['random']; 30 | print $tweet_id; 31 | $sample = ""; 32 | 33 | $result = mysql_query("SELECT * FROM DamageClassification WHERE id='$id2'"); 34 | 35 | 36 | 37 | $tweet = ""; 38 | if($result === FALSE) { 39 | die(mysql_error()); // TODO: better error handling 40 | } 41 | 42 | $row = mysql_fetch_array($result); 43 | //$tweet = $row['Tweet']; 44 | $id = $row['DSSG_ID']; 45 | $sample= $row['type_sample']; 46 | 47 | if (is_null($sample)){ 48 | $tweet = $row["Tweet"]; 49 | 50 | }else{ 51 | if (strcmp($sample, "keyword") == 0 || strcmp($sample, "by_keyword")==0){ 52 | $r = mysql_query("SELECT * FROM keyword_sample WHERE dssg_id = '$id' LIMIT 1"); 53 | $row2 = mysql_fetch_array($r); 54 | $tweet = $row2["text"]; 55 | 56 | }else if (strcmp($sample,"uniform")==0){ 57 | $r = mysql_query("SELECT * FROM uniform_sample WHERE dssg_id = '$id' LIMIT 1"); 58 | $row2 = mysql_fetch_array($r); 59 | $tweet = $row2["text"]; 60 | 61 | } 62 | 63 | } 64 | 65 | 66 | 67 | //Pass the data in the form of url page.php?variable=value and read the values as `echo $_GET['variable'] 68 | print $tweet . "
"; 69 | $tweet = mysql_real_escape_string($tweet); 70 | 71 | 72 | for ($i = 0; $i < 27; $i++){ 73 | $post_name = "i" . $i; 74 | print $post_name; 75 | 76 | $h = $_POST[$post_name]; 77 | print "bridge:"; 78 | if($h != "") { 79 | $t = explode(",_,_,", $h); 80 | foreach ($t as $token) { 81 | print $token. ", "; 82 | $token = mysql_real_escape_string($token); 83 | $e = explode(" ", $token); 84 | $token_start = $e[count($e)-2]; 85 | $token_end = $e[count($e)-1]; 86 | $gr = " " . $token_start . " " . $token_end; 87 | 88 | $ra = str_replace($gr,"",$token); 89 | print "start:" . $token_start . "
"; 90 | print "end:" . $token_end . "
"; 91 | 92 | $request= "INSERT INTO tokenized_labels values (NULL, '$id', '$tweet','$token_start', '$token_end', '$post_name', '$ra', '$hash', '$sample')"; 93 | $results = mysql_query($request, $link); 94 | 95 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 96 | 97 | 98 | 99 | } 100 | } 101 | 102 | 103 | 104 | 105 | 106 | } 107 | $bad_tweet = $_POST['bad']; 108 | 109 | if (strpos($bad_tweet, "NO ENTITIES, CLICK SUBMI") !== false){ 110 | 111 | $request = "UPDATE DamageClassification SET Infrastructure=0 WHERE id='$id2'"; 112 | $results = mysql_query($request, $link); 113 | 114 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 115 | 116 | 117 | $request = "UPDATE DamageClassification SET Casualty=0 WHERE id='$id2'"; 118 | $results = mysql_query($request, $link); 119 | 120 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 121 | 122 | 123 | 124 | } 125 | 126 | 127 | 128 | 129 | $request = "UPDATE DamageClassification SET is_extracted=is_extracted+1 WHERE id='$id2'"; 130 | $results = mysql_query($request, $link); 131 | 132 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 133 | 134 | 135 | 136 | ?> --------------------------------------------------------------------------------