├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── doc ├── README.md ├── dsc-pitch.key ├── dsc-pitch.pdf ├── kdd-pitch.pdf ├── kdd-pitch.pptx ├── kdd-poster.pdf ├── kdd-poster.pptx └── report │ ├── .gitignore │ ├── confusion_matrix.png │ ├── tweedr.bib │ ├── tweedr.pdf │ └── tweedr.tex ├── package.json ├── setup.py ├── static ├── img │ ├── logos │ │ ├── qcri.png │ │ └── uchicago.png │ └── screenshots │ │ └── crisistracker-syria-2013-08-23.png ├── lib │ ├── backbone.js │ ├── backbone.min.js │ ├── cookies.js │ ├── handlebars.js │ ├── handlebars.runtime.js │ ├── jquery.js │ ├── jquery.min.js │ ├── templating.js │ ├── underscore.js │ └── underscore.min.js ├── master.css └── master.less ├── templates ├── code.mako ├── crf.mako ├── gloss.bars └── layout.mako ├── tests ├── README.md ├── test_codebase.py └── test_libraries.py ├── tools └── git-hooks │ ├── README.md │ └── pre-commit ├── tweedr ├── README.md ├── __init__.py ├── api │ ├── README.md │ ├── __init__.py │ ├── mappers │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── ml.py │ │ ├── nlp.py │ │ └── similar.py │ ├── pipeline.py │ └── protocols.py ├── ark │ ├── __init__.py │ ├── java │ │ ├── __init__.py │ │ └── singleton.py │ └── tweetmotif │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── emoticons.py │ │ └── twokenize.py ├── cli │ ├── __init__.py │ ├── database.py │ ├── pipeline.py │ └── ui.py ├── corpora │ ├── __init__.py │ ├── qcri.py │ └── qcri_database.py ├── emr │ ├── README.md │ ├── __init__.py │ ├── gnip_geo.py │ └── gnip_wc.py ├── lib │ ├── __init__.py │ ├── readers.py │ ├── text.py │ └── timeout.py ├── ml │ ├── __init__.py │ ├── build_confusion_matrix.py │ ├── classifier.py │ ├── crf │ │ ├── __init__.py │ │ ├── classifier.py │ │ └── wrapper.py │ ├── evaluate.py │ ├── evaluate_combinations.py │ ├── features │ │ ├── __init__.py │ │ ├── characters.py │ │ ├── dbpedia.py │ │ ├── lexicons.py │ │ ├── ngrams.py │ │ ├── nlp.py │ │ └── sets.py │ ├── lexicon_list.py │ ├── pyslda │ │ ├── PreProcess.py │ │ ├── PySLDA.py │ │ ├── README.md │ │ ├── __init__.py │ │ ├── evaluate-classifier.py │ │ ├── loadModel.R │ │ ├── saveModel.R │ │ ├── testLDA.R │ │ └── trainLDA.R │ ├── spotlight │ │ └── __init__.py │ └── wordnet.py ├── models │ ├── README.md │ ├── __init__.py │ ├── example.py │ ├── metadata.py │ ├── schema.py │ └── schema.template └── ui │ ├── README.md │ ├── __init__.py │ ├── crf.py │ └── middleware.py └── web ├── README.md └── extraction-tool ├── README.md ├── db.php └── index.php /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | .DS_Store 4 | .DS_Store.orig 5 | node_modules/ 6 | /*.egg 7 | /ext/ 8 | /dist/ 9 | /build/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | virtualenv: 5 | system_site_packages: true 6 | before_install: 7 | - "sudo apt-get update -qq" 8 | - "sudo apt-get install -qq python-scipy python-nose" 9 | # python-numpy python-mako python-mysqldb python-scikits-learn python-sqlalchemy pep8 10 | env: MYSQL_HOST=dummyhost MYSQL_USER=dummyuser MYSQL_PASS=dummypass MYSQL_DATABASE=dummydatabase 11 | install: "pip install . 
--use-mirrors" 12 | script: "python setup.py nosetests -e no_ci --with-doctest" 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 The University of Chicago 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tweedr: measuring disaster damage with tweets 2 | 3 | 4 | 5 | Tweedr makes information from social media more accessible to providers of disaster relief. There are two aspects to the application: 6 | 7 | 1. An **API** / **pipeline** for applying _machine learning techniques_ and _natural language processing tools_ to analyze social media produced in response to a disaster. 8 | 2. A **user interface** for manipulating, filtering, and aggregating this enhanced social media data. 9 | 10 | Tweedr is a [Data Science for Social Good](http://dssg.io/) project, through a partnership with the [Qatar Computational Research Institute](http://qcri.qa/). 11 | 12 | ## Problem, solution, data 13 | 14 | ![web app screenshot](https://raw.github.com/dssg/dssg.github.io/master/img/posts/tweedr-screenshot.png) 15 | 16 | * For an extensive discussion of the problem and proposed solution, [visit our wiki](https://github.com/dssg/tweedr/wiki). 17 | * Get start using the tweedr api, [check out our tutorial website](http://tokens.qcri.dssg.io/tweedrtutorial/). 18 | 19 | 20 | ## Project layout 21 | 22 | * [`doc/`](doc) contains various presentations, along with accompanying slides and poster. 23 | + [`doc/report/`](doc/report) contains a more technical and extensive write-up of this project. _In progress._ 24 | * `ext/` is created by a complete install; external data sources and libraries are downloaded to this folder. 25 | * [`static/`](static) contains static (non-Javascript) files used by the web app. 26 | * [`templates/`](templates) contain templates (both server-side and client-side) used by the web app. 27 | * [`tests/`](tests) contain unittest-like tests. Use `python setup.py test` to run these. 28 | * [`tools/`](tools) holds tools to aid development (currently, only a test-running git-hook). 29 | * [`tweedr/`](tweedr) contains the main Python app and functions as a Python package (e.g., `import tweedr`). 
 30 | 31 | 32 | ## Installation guide 33 | 34 | git clone https://github.com/dssg/tweedr.git 35 | cd tweedr 36 | python setup.py develop download_ext 37 | 38 | If you want to jump straight to development, see the [Contributing](https://github.com/dssg/tweedr/wiki/Contributing) wiki page. 39 | 40 | ### Dependencies 41 | 42 | Tweedr uses a number of external libraries and resources. This is the dependency tree: 43 | 44 | * [Tweedr](https://github.com/dssg/tweedr): Primarily Python, on GitHub 45 | - [crfsuite](http://www.chokkan.org/software/crfsuite/): C/C++, from source 46 | + [libLBFGS](http://www.chokkan.org/software/liblbfgs/): C/C++, from source 47 | - [scikit-learn](http://scikit-learn.org/stable/): Python, from PyPI 48 | + [numpy](http://www.numpy.org/): Python, with C/C++ (blas/lapack), Fortran links, from PyPI or package manager 49 | + [scipy](http://www.scipy.org/): Python, with C/C++, from PyPI or package manager 50 | - [TweetNLP](http://www.ark.cs.cmu.edu/TweetNLP/): Java, from jar 51 | - [PyPer](https://pypi.python.org/pypi/PypeR/1.1.0): Python, with R, from PyPI 52 | 53 | `crfsuite` and `liblbfgs` are the only components that can't be installed directly with Python via `setuptools`. If you have trouble installing some of the packages above, you might have better luck looking for those packages in your operating system's package manager or as binaries on the projects' websites. 54 | 55 | ### Installation steps 56 | 57 | *1. Installing libLBFGS* 58 | 59 | The source code can be downloaded from the [maintainer's webpage](http://www.chokkan.org/software/liblbfgs/), though this [GitHub fork](https://github.com/chbrown/liblbfgs) (and below) attempts to simplify the install process. 60 | 61 | git clone https://github.com/chbrown/liblbfgs.git 62 | cd liblbfgs 63 | ./configure 64 | make 65 | sudo make install 66 | 67 | *2. Installing CRFsuite* 68 | 69 | Like libLBFGS, a tarball can be downloaded from the [original website](http://www.chokkan.org/software/crfsuite/), though the accompanying [fork on GitHub](https://github.com/chbrown/crfsuite) attempts to document the installation process and make compilation more automatic on both Linux and Mac OS X. 70 | 71 | git clone https://github.com/chbrown/crfsuite.git 72 | cd crfsuite 73 | ./configure 74 | make 75 | sudo make install 76 | 77 | That installs the library, but not the Python wrapper, which takes a few more steps: 78 | 79 | cd swig/python 80 | python setup.py build_ext 81 | sudo python setup.py install_lib 82 | 83 | To test whether it installed correctly, you can run the following at your terminal, which should print out the current CRFsuite version: 84 | 85 | python -c 'import crfsuite; print crfsuite.version()' 86 | > 0.12.2 87 | 88 | The [GitHub repository](https://github.com/chbrown/crfsuite) documents a few more options that might come in handy if the process above does not work for your operating system. 89 | 90 | 91 | *3. Configuring environment variables* 92 | 93 | Tweedr also connects to a number of remote resources when running live; see [[Environment]] for instructions on setting those up. 94 | 95 | 96 | *4. 
Installing Tweedr* 97 | 98 | After installing `crfsuite` and `liblbfgs`, everything else should be installable via setuptools / distutils: 99 | 100 | git clone https://github.com/dssg/tweedr.git 101 | cd tweedr 102 | python setup.py install 103 | 104 | And then to download external data requirements: 105 | 106 | python setup.py download_ext 107 | 108 | The `download_ext` command will download external data, which currently includes the following packages / sources: 109 | 110 | * [TweetNLP 0.3.2 tarball](http://ark-tweet-nlp.googlecode.com/files/ark-tweet-nlp-0.3.2.tgz) (Github repository: [ark-tweet-nlp](https://github.com/brendano/ark-tweet-nlp)) 111 | 112 | You may get an error, "IOError: cmu.arktweetnlp.RunTagger error", if you try to use some parts of Tweedr before installing this component. 113 | 114 | 115 | *5. Instantiating the database* 116 | 117 | While we are not currently able to release our data, you can easily recreate the structure of our database by running the following command: 118 | 119 | tweedr-database create 120 | 121 | This simply uses SQLAlchemy to un-reflect the database, by running `metadata.create_all()`. 122 | 123 | 124 | ### Running Tweedr 125 | 126 | At this point, you should have tools like `tweedr-ui` and `tweedr-pipeline` on your `PATH`, and you can run each of those with the `--help` flag to view the usage messages. 127 | 128 | See [the API section](https://github.com/dssg/tweedr/wiki#tweedr-api-how-it-works) of the wiki for a description of some of the fields that `tweedr-pipeline` adds. 129 | 130 | 131 | ### Troubleshooting 132 | 133 | If your installation is still missing packages, see the [manually installing](https://github.com/dssg/tweedr/wiki/Manually-installing) page of the wiki. 134 | 135 | 136 | ## Team 137 | ![Team](https://raw.github.com/dssg/dssg.github.io/761993c24ea2991170ef64048115cb805f5f13fb/img/people/teams/tweedr.png) 138 | 139 | 140 | ## Contributing to the project 141 | 142 | Want to get in touch? Found a bug? Open up a [new issue](https://github.com/dssg/tweedr/issues/new) or email us at [dssg-qcri@googlegroups.com](mailto:dssg-qcri@googlegroups.com). 143 | 144 | 145 | ## License 146 | 147 | Copyright © 2013 The University of Chicago. [MIT Licensed](LICENSE). 148 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Presentations 2 | 3 | - **Thursday, July 11, 2013**. 4 | Presented [twitter-informed-disaster-relief](http://prezi.com/83-blihpamgf/twitter-informed-disaster-relief/) (Prezi link) for the full DSSG group. 5 | - **Wednesday, July 24, 2013**. 6 | Presented [dsc-pitch.key](dsc-pitch.key) (Keynote, [PDF](dsc-pitch.pdf)) for the Data Science Chicago meetup. 
7 | -------------------------------------------------------------------------------- /doc/dsc-pitch.key: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/dsc-pitch.key -------------------------------------------------------------------------------- /doc/dsc-pitch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/dsc-pitch.pdf -------------------------------------------------------------------------------- /doc/kdd-pitch.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-pitch.pdf -------------------------------------------------------------------------------- /doc/kdd-pitch.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-pitch.pptx -------------------------------------------------------------------------------- /doc/kdd-poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-poster.pdf -------------------------------------------------------------------------------- /doc/kdd-poster.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/kdd-poster.pptx -------------------------------------------------------------------------------- /doc/report/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.aux 3 | *.bbl 4 | *.blg 5 | -------------------------------------------------------------------------------- /doc/report/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/report/confusion_matrix.png -------------------------------------------------------------------------------- /doc/report/tweedr.bib: -------------------------------------------------------------------------------- 1 | 2 | @article{lin_class-imbalanced_2012, 3 | title = {Class-imbalanced classifiers for high-dimensional data}, 4 | issn = {1467-5463, 1477-4054}, 5 | url = {http://bib.oxfordjournals.org/content/early/2012/03/08/bib.bbs006}, 6 | doi = {10.1093/bib/bbs006}, 7 | abstract = {A class-imbalanced classifier is a decision rule to predict the class membership of new samples from an available data set where the class sizes differ considerably. When the class sizes are very different, most standard classification algorithms may favor the larger (majority) class resulting in poor accuracy in the minority class prediction. A class-imbalanced classifier typically modifies a standard classifier by a correction strategy or by incorporating a new strategy in the training phase to account for differential class sizes. This article reviews and evaluates some most important methods for class prediction of high-dimensional imbalanced data. 
The evaluation addresses the fundamental issues of the class-imbalanced classification problem: imbalance ratio, small disjuncts and overlap complexity, lack of data and feature selection. Four class-imbalanced classifiers are considered. The four classifiers include three standard classification algorithms each coupled with an ensemble correction strategy and one support vector machines ({SVM)-based} correction classifier. The three algorithms are (i) diagonal linear discriminant analysis ({DLDA)}, (ii) random forests ({RFs)} and (ii) {SVMs.} The {SVM-based} correction classifier is {SVM} threshold adjustment ({SVM-THR).} A {Monte–Carlo} simulation and five genomic data sets were used to illustrate the analysis and address the issues. The {SVM-ensemble} classifier appears to perform the best when the class imbalance is not too severe. The {SVM-THR} performs well if the imbalance is severe and predictors are highly correlated. The {DLDA} with a feature selection can perform well without using the ensemble correction.}, 8 | language = {en}, 9 | urldate = {2013-09-02}, 10 | journal = {Briefings in Bioinformatics}, 11 | author = {Lin, Wei-Jiun and Chen, James J.}, 12 | month = mar, 13 | year = {2012}, 14 | note = {{PMID:} 22408190}, 15 | keywords = {class-imbalanced prediction, feature selection, lack of data, performance metrics, threshold adjustment, under-sampling ensemble}, 16 | pages = {bbs006}, 17 | file = {Snapshot:/Users/awculott/Library/Application Support/Firefox/Profiles/0zv8dy28.default/zotero/storage/BDAQWHVI/bib.html:text/html} 18 | } 19 | 20 | @inproceedings{mandel12demo, 21 | author = {Benjamin Mandel and Aron Culotta and John Boulahanis and Danielle Stark and Bonnie Lewis and Jeremy Rodrigue}, 22 | title = {A demographic analysis of online sentiment during {H}urricane {I}rene}, 23 | booktitle = {NAACL-HLT Workshop on Language in Social Media}, 24 | shortbooktitle = {HLT/NAACL}, 25 | year = {2012}, 26 | mytype = {Refereed Workshop Publications}, 27 | url = {http://www2.selu.edu/Academics/Faculty/aculotta/pubs/mandel12demo.pdf}, 28 | abstract = {We examine the response to the recent natural disaster Hurricane Irene on Twitter.com. We collect over 65,000 Twitter messages relating to Hurricane Irene from August 18th to August 31st, 2011, and group them by location and gender. We train a sentiment classifier to categorize messages based on level of concern, and then use this classifier to investigate demographic differences. We report three principal findings: (1) the number of Twitter messages related to Hurricane Irene in directly affected regions peaks around the time the hurricane hits that region; (2) the level of concern in the days leading up to the hurricane's arrival is dependent on region; and (3) the level of concern is dependent on gender, with females being more likely to express concern than males. Qualitative linguistic variations further support these differences. 
We conclude that social media analysis provides a viable, real-time complement to traditional survey methods for understanding public perception towards an impending disaster.}, 29 | } 30 | 31 | 32 | @inproceedings{imran_practical_2013, 33 | address = {Republic and Canton of Geneva, Switzerland}, 34 | series = {{WWW} '13 Companion}, 35 | title = {Practical extraction of disaster-relevant information from social media}, 36 | isbn = {978-1-4503-2038-2}, 37 | url = {http://dl.acm.org/citation.cfm?id=2487788.2488109}, 38 | abstract = {During times of disasters online users generate a significant amount of data, some of which are extremely valuable for relief efforts. In this paper, we study the nature of social-media content generated during two different natural disasters. We also train a model based on conditional random fields to extract valuable information from such content. We evaluate our techniques over our two datasets through a set of carefully designed experiments. We also test our methods over a non-disaster dataset to show that our extraction model is useful for extracting information from socially-generated content in general.}, 39 | urldate = {2013-09-02}, 40 | booktitle = {Proceedings of the 22nd international conference on World Wide Web companion}, 41 | publisher = {International World Wide Web Conferences Steering Committee}, 42 | author = {Imran, Muhammad and Elbassuoni, Shady and Castillo, Carlos and Diaz, Fernando and Meier, Patrick}, 43 | year = {2013}, 44 | keywords = {information extraction, information filtering, social media}, 45 | pages = {1021–1024} 46 | }, 47 | 48 | @inproceedings{meier_extracting_2013, 49 | title = {Extracting Information Nuggets from Disaster-Related Messages in Social Media}, 50 | booktitle = {10th International Conference on Information Systems for Crisis Response and Management}, 51 | author = {Meier, Patrick and Castillo, Carlos and Imran, Muhammad and Elbassuoni, Shady Mamoon and Diaz, Fernando}, 52 | year = {2013} 53 | }, 54 | 55 | @inproceedings{kumar_tweettracker_2011, 56 | title = {{TweetTracker:} An Analysis Tool for Humanitarian and Disaster Relief}, 57 | booktitle = {{ICWSM'11}}, 58 | author = {Kumar, Shamanth and Barbier, Geoffrey and Abbasi, Mohammad Ali and Liu, Huan}, 59 | year = {2011} 60 | }, 61 | 62 | @inproceedings{cheong_social_2011, 63 | title = {Social Media Data Mining: A Social Network Analysis Of Tweets During The 2010-2011 Australian Floods}, 64 | booktitle = {{PACIS'11}}, 65 | author = {Cheong, France and Cheong, Christopher}, 66 | year = {2011}, 67 | pages = {46--46} 68 | } 69 | 70 | 71 | @techreport{blei10supervised, 72 | type = {{arXiv} e-print}, 73 | title = {Supervised Topic Models}, 74 | number = {1003.0783}, 75 | urldate = {2013-08-23}, 76 | author = {Blei, David M. 
and {McAuliffe}, Jon D.}, 77 | month = mar, 78 | year = {2010}, 79 | keywords = {Statistics - Machine Learning}, 80 | } 81 | 82 | @book{sutton12intro, 83 | address = {Hanover, {MA}}, 84 | title = {An introduction to conditional random fields}, 85 | isbn = {9781601985736 1601985738}, 86 | url = {http://search.ebscohost.com/login.aspx?direct=true&scope=site&db=nlebk&db=nlabk&AN=593830}, 87 | language = {English}, 88 | urldate = {2013-08-23}, 89 | publisher = {Now Publishers}, 90 | author = {Sutton, Charles and {McCallum}, Andrew K}, 91 | year = 2012 92 | } 93 | 94 | 95 | 96 | @article{bloom70space, 97 | title = {Space/time trade-offs in hash coding with allowable errors}, 98 | volume = 13, 99 | issn = {0001-0782}, 100 | url = {http://doi.acm.org/10.1145/362686.362692}, 101 | doi = {10.1145/362686.362692}, 102 | number = 7, 103 | urldate = {2013-08-23}, 104 | journal = {Commun. {ACM}}, 105 | author = {Bloom, Burton H.}, 106 | month = jul, 107 | year = 1970, 108 | keywords = {hash addressing, hash coding, retrieval efficiency, retrieval trade-offs, scatter storage, searching, storage efficiency, storage layout}, 109 | pages = {422–426} 110 | } 111 | 112 | 113 | @inproceedings{charikar02similarity, 114 | address = {New York, {NY}, {USA}}, 115 | series = {{STOC} '02}, 116 | title = {Similarity estimation techniques from rounding algorithms}, 117 | isbn = {1-58113-495-9}, 118 | url = {http://doi.acm.org/10.1145/509907.509965}, 119 | doi = {10.1145/509907.509965}, 120 | urldate = {2013-08-23}, 121 | booktitle = {Proceedings of the thiry-fourth annual {ACM} symposium on Theory of computing}, 122 | publisher = {{ACM}}, 123 | author = {Charikar, Moses S.}, 124 | year = 2002, 125 | pages = {380–388} 126 | } -------------------------------------------------------------------------------- /doc/report/tweedr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/doc/report/tweedr.pdf -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.0.4", 3 | "homepage": "https://github.com/dssg/tweedr", 4 | "description": "Twitter-Informed Disaster Response", 5 | "license": "MIT", 6 | "staticDependencies": { 7 | "jquery": "*", 8 | "underscore": "*", 9 | "backbone": "*", 10 | "handlebars": "*", 11 | "misc-js": "git://github.com/chbrown/misc-js.git" 12 | }, 13 | "staticPattern": "static/lib/{file}" 14 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | '''setuptools works by triggering subcommands from higher level commands. 2 | The default commands 'install' and 'develop' trigger the following sequences: 3 | 4 | install: 5 | 1. build 6 | 2. build_py 7 | 3. install_lib 8 | 4. install_egg_info 9 | 5. egg_info 10 | 6. install_scripts 11 | 12 | develop: 13 | 1. egg_info 14 | 2. 
build_ext 15 | ''' 16 | from setuptools import setup, find_packages 17 | from distutils import log, core 18 | from distutils.dir_util import remove_tree 19 | import os 20 | import json 21 | 22 | here = os.path.dirname(__file__) or os.curdir 23 | package = json.load(open(os.path.join(here, 'package.json'))) 24 | 25 | 26 | class download_ext(core.Command): 27 | description = 'download external dependencies' 28 | user_options = [] 29 | 30 | def initialize_options(self): 31 | self.ext_path = None 32 | 33 | def finalize_options(self): 34 | self.ext_path = os.path.join(here, 'ext') 35 | 36 | def download_ark_tweet_nlp(self): 37 | import urllib 38 | import tarfile 39 | url = 'http://ark-tweet-nlp.googlecode.com/files/ark-tweet-nlp-0.3.2.tgz' 40 | log.info('Downloading %s', url) 41 | tgz_filepath, headers = urllib.urlretrieve(url) 42 | log.info('Opening %s', tgz_filepath) 43 | with tarfile.open(tgz_filepath, 'r:gz') as tgz: 44 | # pull all the jars out, flattening them 45 | for tarinfo in tgz.getmembers(): 46 | if tarinfo.name.endswith('.jar'): 47 | tarinfo_name = tarinfo.name 48 | local_filepath = os.path.join(self.ext_path, os.path.basename(tarinfo.name)) 49 | tarinfo.name = local_filepath 50 | tgz.extract(tarinfo) 51 | log.info('Extracting %s to %s', tarinfo_name, local_filepath) 52 | 53 | def run(self): 54 | self.mkpath(self.ext_path) 55 | self.download_ark_tweet_nlp() 56 | 57 | 58 | class dist_clean(core.Command): 59 | description = 'remove all files not under version control' 60 | user_options = [] 61 | 62 | def initialize_options(self): 63 | pass 64 | 65 | def finalize_options(self): 66 | # set all = True for the benefit of the "clean" subcommand 67 | self.all = True 68 | 69 | def run(self): 70 | self.run_command('clean') 71 | log.debug('removing inessential directories from root') 72 | for directory in os.listdir(here): 73 | if directory.endswith(('dist', 'ext', '.egg', '.egg-info')): 74 | remove_tree(directory, dry_run=self.dry_run) 75 | 76 | log.debug('removing inessential files from project') 77 | for dirpath, _, filenames in os.walk('.'): 78 | filepaths = [os.path.join(dirpath, filename) for filename in filenames] 79 | for filepath in filepaths: 80 | if filepath.endswith(('.pyc', '.DS_Store')): 81 | log.info('rm %s', filepath) 82 | if self.dry_run: 83 | continue 84 | os.remove(filepath) 85 | 86 | setup( 87 | name='tweedr', 88 | version=str(package['version']), 89 | url=str(package['homepage']), 90 | license=open(os.path.join(here, 'LICENSE')).read(), 91 | packages=find_packages(), 92 | install_requires=[ 93 | 'bottle', 94 | 'colorama', 95 | 'mako', 96 | 'matplotlib', 97 | 'mrjob', 98 | 'mysql-python', 99 | 'pattern', 100 | 'pybloomfiltermmap>=0.3.11', 101 | 'pyper', 102 | 'python-hashes', 103 | 'requests', 104 | 'scikit-learn', 105 | 'scipy', 106 | 'sqlalchemy', 107 | 'ujson', 108 | ], 109 | entry_points={ 110 | 'console_scripts': [ 111 | 'tweedr-ui = tweedr.cli.ui:main', 112 | 'tweedr-database = tweedr.cli.database:main', 113 | 'tweedr-pipeline = tweedr.cli.pipeline:main', 114 | ], 115 | }, 116 | cmdclass={ 117 | 'download_ext': download_ext, 118 | 'dist_clean': dist_clean, 119 | }, 120 | tests_require=[ 121 | 'nose', 122 | 'pep8', 123 | 'pyflakes', 124 | ], 125 | test_suite='nose.collector', 126 | ) 127 | -------------------------------------------------------------------------------- /static/img/logos/qcri.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/static/img/logos/qcri.png -------------------------------------------------------------------------------- /static/img/logos/uchicago.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/static/img/logos/uchicago.png -------------------------------------------------------------------------------- /static/img/screenshots/crisistracker-syria-2013-08-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/static/img/screenshots/crisistracker-syria-2013-08-23.png -------------------------------------------------------------------------------- /static/lib/cookies.js: -------------------------------------------------------------------------------- 1 | // Copyright 2012-2013, Christopher Brown , MIT Licensed 2 | // https://github.com/chbrown/misc-js :: cookies.js 3 | // "use strict"; /*jslint indent: 2 */ 4 | var cookies = (function() { 5 | function extend(target, source) { 6 | for (var key in source) { 7 | if (source.hasOwnProperty(key)) { 8 | target[key] = source[key]; 9 | } 10 | } 11 | return target; 12 | } 13 | 14 | var default_cookie = {}; 15 | 16 | function getOptions(opts) { 17 | // if it's a function, call it. 18 | var defaults = default_cookie.call ? default_cookie() : default_cookie; 19 | // opts override defaults, but defaults cannot be changed 20 | return extend(extend({}, defaults), opts); 21 | } 22 | 23 | return { 24 | setDefault: function(new_default_cookie) { 25 | default_cookie = new_default_cookie; 26 | }, 27 | get: function(name, opts) { 28 | opts = getOptions(opts); 29 | 30 | var document_cookie = document.cookie; 31 | var cookies = (document_cookie && document_cookie !== '') ? document_cookie.split(/\s*;\s*/) : []; 32 | for (var i = 0, cookie; (cookie = cookies[i]); i++) { 33 | // Does this cookie string begin with the name we want? 34 | if (cookie.slice(0, name.length + 1) == (name + '=')) { 35 | var raw = cookie.slice(name.length + 1); 36 | return opts.raw ? raw : decodeURIComponent(raw); 37 | } 38 | } 39 | }, 40 | set: function(name, value, opts) { 41 | opts = getOptions(opts); 42 | 43 | var encode = opts.raw ? 
function(s) { return s; } : encodeURIComponent; 44 | 45 | var pairs = [[encode(name), encode(value.toString())]]; 46 | if (opts.expires) pairs.push(['expires', opts.expires.toUTCString()]); 47 | if (opts.path) pairs.push(['path', opts.path]); 48 | if (opts.domain) pairs.push(['domain', opts.domain]); 49 | if (opts.secure) pairs.push(['secure']); 50 | var cookie = pairs.map(function(pair) { return pair.join('='); }).join('; '); 51 | document.cookie = cookie; 52 | return cookie; 53 | }, 54 | del: function(name, opts) { 55 | opts = getOptions(opts); 56 | 57 | this.set(name, '', {expires: -1}); 58 | } 59 | }; 60 | })(); 61 | -------------------------------------------------------------------------------- /static/lib/handlebars.runtime.js: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Copyright (C) 2011 by Yehuda Katz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | */ 24 | 25 | // lib/handlebars/browser-prefix.js 26 | var Handlebars = {}; 27 | 28 | (function(Handlebars, undefined) { 29 | ; 30 | // lib/handlebars/base.js 31 | 32 | Handlebars.VERSION = "1.0.0"; 33 | Handlebars.COMPILER_REVISION = 4; 34 | 35 | Handlebars.REVISION_CHANGES = { 36 | 1: '<= 1.0.rc.2', // 1.0.rc.2 is actually rev2 but doesn't report it 37 | 2: '== 1.0.0-rc.3', 38 | 3: '== 1.0.0-rc.4', 39 | 4: '>= 1.0.0' 40 | }; 41 | 42 | Handlebars.helpers = {}; 43 | Handlebars.partials = {}; 44 | 45 | var toString = Object.prototype.toString, 46 | functionType = '[object Function]', 47 | objectType = '[object Object]'; 48 | 49 | Handlebars.registerHelper = function(name, fn, inverse) { 50 | if (toString.call(name) === objectType) { 51 | if (inverse || fn) { throw new Handlebars.Exception('Arg not supported with multiple helpers'); } 52 | Handlebars.Utils.extend(this.helpers, name); 53 | } else { 54 | if (inverse) { fn.not = inverse; } 55 | this.helpers[name] = fn; 56 | } 57 | }; 58 | 59 | Handlebars.registerPartial = function(name, str) { 60 | if (toString.call(name) === objectType) { 61 | Handlebars.Utils.extend(this.partials, name); 62 | } else { 63 | this.partials[name] = str; 64 | } 65 | }; 66 | 67 | Handlebars.registerHelper('helperMissing', function(arg) { 68 | if(arguments.length === 2) { 69 | return undefined; 70 | } else { 71 | throw new Error("Missing helper: '" + arg + "'"); 72 | } 73 | }); 74 | 75 | Handlebars.registerHelper('blockHelperMissing', function(context, options) { 76 | var inverse = options.inverse || function() {}, fn = options.fn; 77 | 78 | var type = toString.call(context); 79 | 80 | if(type === functionType) { context = context.call(this); } 81 | 82 | if(context === true) { 83 | return fn(this); 84 | } else if(context === false || context == null) { 85 | return inverse(this); 86 | } else if(type === "[object Array]") { 87 | if(context.length > 0) { 88 | return Handlebars.helpers.each(context, options); 89 | } else { 90 | return inverse(this); 91 | } 92 | } else { 93 | return fn(context); 94 | } 95 | }); 96 | 97 | Handlebars.K = function() {}; 98 | 99 | Handlebars.createFrame = Object.create || function(object) { 100 | Handlebars.K.prototype = object; 101 | var obj = new Handlebars.K(); 102 | Handlebars.K.prototype = null; 103 | return obj; 104 | }; 105 | 106 | Handlebars.logger = { 107 | DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3, level: 3, 108 | 109 | methodMap: {0: 'debug', 1: 'info', 2: 'warn', 3: 'error'}, 110 | 111 | // can be overridden in the host environment 112 | log: function(level, obj) { 113 | if (Handlebars.logger.level <= level) { 114 | var method = Handlebars.logger.methodMap[level]; 115 | if (typeof console !== 'undefined' && console[method]) { 116 | console[method].call(console, obj); 117 | } 118 | } 119 | } 120 | }; 121 | 122 | Handlebars.log = function(level, obj) { Handlebars.logger.log(level, obj); }; 123 | 124 | Handlebars.registerHelper('each', function(context, options) { 125 | var fn = options.fn, inverse = options.inverse; 126 | var i = 0, ret = "", data; 127 | 128 | var type = toString.call(context); 129 | if(type === functionType) { context = context.call(this); } 130 | 131 | if (options.data) { 132 | data = Handlebars.createFrame(options.data); 133 | } 134 | 135 | if(context && typeof context === 'object') { 136 | if(context instanceof Array){ 137 | for(var j = context.length; i": ">", 212 | '"': """, 213 | "'": "'", 214 | "`": "`" 215 | }; 216 | 217 | var badChars = /[&<>"'`]/g; 218 | var possible = /[&<>"'`]/; 219 | 220 | var 
escapeChar = function(chr) { 221 | return escape[chr] || "&"; 222 | }; 223 | 224 | Handlebars.Utils = { 225 | extend: function(obj, value) { 226 | for(var key in value) { 227 | if(value.hasOwnProperty(key)) { 228 | obj[key] = value[key]; 229 | } 230 | } 231 | }, 232 | 233 | escapeExpression: function(string) { 234 | // don't escape SafeStrings, since they're already safe 235 | if (string instanceof Handlebars.SafeString) { 236 | return string.toString(); 237 | } else if (string == null || string === false) { 238 | return ""; 239 | } 240 | 241 | // Force a string conversion as this will be done by the append regardless and 242 | // the regex test will do this transparently behind the scenes, causing issues if 243 | // an object's to string has escaped characters in it. 244 | string = string.toString(); 245 | 246 | if(!possible.test(string)) { return string; } 247 | return string.replace(badChars, escapeChar); 248 | }, 249 | 250 | isEmpty: function(value) { 251 | if (!value && value !== 0) { 252 | return true; 253 | } else if(toString.call(value) === "[object Array]" && value.length === 0) { 254 | return true; 255 | } else { 256 | return false; 257 | } 258 | } 259 | }; 260 | ; 261 | // lib/handlebars/runtime.js 262 | 263 | Handlebars.VM = { 264 | template: function(templateSpec) { 265 | // Just add water 266 | var container = { 267 | escapeExpression: Handlebars.Utils.escapeExpression, 268 | invokePartial: Handlebars.VM.invokePartial, 269 | programs: [], 270 | program: function(i, fn, data) { 271 | var programWrapper = this.programs[i]; 272 | if(data) { 273 | programWrapper = Handlebars.VM.program(i, fn, data); 274 | } else if (!programWrapper) { 275 | programWrapper = this.programs[i] = Handlebars.VM.program(i, fn); 276 | } 277 | return programWrapper; 278 | }, 279 | merge: function(param, common) { 280 | var ret = param || common; 281 | 282 | if (param && common) { 283 | ret = {}; 284 | Handlebars.Utils.extend(ret, common); 285 | Handlebars.Utils.extend(ret, param); 286 | } 287 | return ret; 288 | }, 289 | programWithDepth: Handlebars.VM.programWithDepth, 290 | noop: Handlebars.VM.noop, 291 | compilerInfo: null 292 | }; 293 | 294 | return function(context, options) { 295 | options = options || {}; 296 | var result = templateSpec.call(container, Handlebars, context, options.helpers, options.partials, options.data); 297 | 298 | var compilerInfo = container.compilerInfo || [], 299 | compilerRevision = compilerInfo[0] || 1, 300 | currentRevision = Handlebars.COMPILER_REVISION; 301 | 302 | if (compilerRevision !== currentRevision) { 303 | if (compilerRevision < currentRevision) { 304 | var runtimeVersions = Handlebars.REVISION_CHANGES[currentRevision], 305 | compilerVersions = Handlebars.REVISION_CHANGES[compilerRevision]; 306 | throw "Template was precompiled with an older version of Handlebars than the current runtime. "+ 307 | "Please update your precompiler to a newer version ("+runtimeVersions+") or downgrade your runtime to an older version ("+compilerVersions+")."; 308 | } else { 309 | // Use the embedded version info since the runtime doesn't know about this revision yet 310 | throw "Template was precompiled with a newer version of Handlebars than the current runtime. 
"+ 311 | "Please update your runtime to a newer version ("+compilerInfo[1]+")."; 312 | } 313 | } 314 | 315 | return result; 316 | }; 317 | }, 318 | 319 | programWithDepth: function(i, fn, data /*, $depth */) { 320 | var args = Array.prototype.slice.call(arguments, 3); 321 | 322 | var program = function(context, options) { 323 | options = options || {}; 324 | 325 | return fn.apply(this, [context, options.data || data].concat(args)); 326 | }; 327 | program.program = i; 328 | program.depth = args.length; 329 | return program; 330 | }, 331 | program: function(i, fn, data) { 332 | var program = function(context, options) { 333 | options = options || {}; 334 | 335 | return fn(context, options.data || data); 336 | }; 337 | program.program = i; 338 | program.depth = 0; 339 | return program; 340 | }, 341 | noop: function() { return ""; }, 342 | invokePartial: function(partial, name, context, helpers, partials, data) { 343 | var options = { helpers: helpers, partials: partials, data: data }; 344 | 345 | if(partial === undefined) { 346 | throw new Handlebars.Exception("The partial " + name + " could not be found"); 347 | } else if(partial instanceof Function) { 348 | return partial(context, options); 349 | } else if (!Handlebars.compile) { 350 | throw new Handlebars.Exception("The partial " + name + " could not be compiled when running in runtime-only mode"); 351 | } else { 352 | partials[name] = Handlebars.compile(partial, {data: data !== undefined}); 353 | return partials[name](context, options); 354 | } 355 | } 356 | }; 357 | 358 | Handlebars.template = Handlebars.VM.template; 359 | ; 360 | // lib/handlebars/browser-suffix.js 361 | })(Handlebars); 362 | ; 363 | -------------------------------------------------------------------------------- /static/lib/templating.js: -------------------------------------------------------------------------------- 1 | // Copyright 2013, Christopher Brown , MIT Licensed 2 | // https://github.com/chbrown/misc-js :: templating.js 3 | "use strict"; /*jslint indent: 2 */ /*globals _, $, Backbone, Handlebars */ 4 | 5 | // Templates debug / caching. E.g.: 6 | // new TemplateManager({ 7 | // cache: window.DEBUG ? Handlebars.templates : {}, 8 | // url: '/templates/', 9 | // extension: '.bars', 10 | // compile: Handlebars.compile 11 | // }); 12 | function TemplateManager(opts) { 13 | this.cache = opts.cache || {}; // optional, defaults to {} 14 | this.url = opts.url || '/'; // where to look for templates 15 | this.extension = opts.extension || ''; // append to given template names 16 | this.querystring = opts.querystring || '?t=' + (new Date()).getTime(); // aka., url.search 17 | this.compile = opts.compile || null; // what to use for cache misses 18 | } 19 | TemplateManager.prototype.render = function(template_name, context) { 20 | // synchronous; returns html. 21 | var self = this; 22 | var cached_template = this.cache[template_name]; 23 | // only cache once per page load 24 | if (!cached_template) { 25 | $.ajax({ 26 | url: this.url + template_name + this.extension + this.querystring, 27 | async: false, 28 | success: function(template_src) { 29 | cached_template = self.compile(template_src); 30 | } 31 | }); 32 | // yes, the above *will* execute synchronously! 33 | this.cache[template_name] = cached_template; 34 | } 35 | return cached_template(context); 36 | }; 37 | 38 | // handlebars_manager is a global this file offers. requires `Handlebars` to be loaded. 39 | var HandlebarsTemplates = new TemplateManager({ 40 | cache: window.DEBUG ? 
Handlebars.templates : {}, 41 | url: '/templates/', 42 | extension: '.bars', 43 | compile: Handlebars.compile 44 | }); 45 | var Templates = HandlebarsTemplates; 46 | 47 | // requires Backbone and handlebars_manager 48 | var TemplatedView = Backbone.View.extend({ 49 | // TemplatedView has hooks called (pre|post)(Initialize|Render), each of which take a context 50 | // that context is just the model and whatever options the view is initialized with. 51 | // preInitialize, postInitialize, preRender, postRender 52 | initialize: function(opts) { 53 | // prePreRender 54 | var ctx = _.extend(this.model ? this.model.toJSON() : {}, opts); 55 | if (this.preInitialize) this.preInitialize(ctx); 56 | this.render(ctx); 57 | if (this.postInitialize) this.postInitialize(ctx); 58 | }, 59 | render: function(ctx) { 60 | if (this.preRender) this.preRender(ctx); 61 | this.el.innerHTML = Templates.render(this.template, ctx); 62 | if (this.postRender) this.postRender(ctx); 63 | if (ctx.replace) { 64 | // if .replace is given, it's the parent node that this new view should 65 | // attach to, replacing the old contents. 66 | ctx.replace.replaceWith(this.$el); 67 | } 68 | return this; 69 | } 70 | }); 71 | 72 | var TemplatedCollection = Backbone.Collection.extend({ 73 | renderTo: function($el, View) { 74 | var fragment = document.createDocumentFragment(); 75 | this.each(function(model) { 76 | var ctx = model.toJSON(); 77 | ctx.model = model; 78 | var view = new View(ctx); 79 | fragment.appendChild(view.el); 80 | }); 81 | $el.append(fragment); 82 | } 83 | }); 84 | -------------------------------------------------------------------------------- /static/master.css: -------------------------------------------------------------------------------- 1 | html{height:100%} 2 | body{font-family:'Helvetica Neue',Helvetica,Arial;font-weight:200;height:100%;margin:0;padding:0;box-sizing:border-box} 3 | h1,h2,h3,h4,h5,h6{margin:.25em 0 .5em} 4 | p{margin:.5em 0} 5 | label span{font-weight:bold} 6 | table{border-collapse:collapse}table td{padding:0 8px 2px 0;vertical-align:top;font-size:90%;white-space:nowrap} 7 | table.valign td{vertical-align:middle} 8 | table.gloss tr:first-child td{font-weight:bold} 9 | table.gloss td,table.gloss th{text-align:left;border:1px dotted #ccc;padding:1px 3px;word-break:normal;white-space:normal;max-width:140px} 10 | button{margin:10px 0} 11 | textarea{display:block} 12 | .control{margin:10px;float:left} 13 | .content{padding:1em} 14 | -------------------------------------------------------------------------------- /static/master.less: -------------------------------------------------------------------------------- 1 | html { 2 | height: 100%; 3 | } 4 | body { 5 | font-family: 'Helvetica Neue', Helvetica, Arial; 6 | font-weight: 200; 7 | 8 | height: 100%; 9 | margin: 0; 10 | padding: 0; 11 | box-sizing: border-box; 12 | } 13 | h1, h2, h3, h4, h5, h6 { 14 | margin: 0.25em 0 0.5em; 15 | } 16 | p { 17 | margin: 0.5em 0; 18 | } 19 | label span { 20 | font-weight: bold; 21 | } 22 | table { 23 | border-collapse: collapse; 24 | td { 25 | padding: 0 8px 2px 0; 26 | vertical-align: top; 27 | font-size: 90%; 28 | white-space: nowrap; 29 | } 30 | &.valign td { 31 | vertical-align: middle; 32 | } 33 | &.gloss { 34 | tr:first-child td { 35 | font-weight: bold; 36 | } 37 | td, th { 38 | text-align: left; 39 | border: 1px dotted #CCC; 40 | padding: 1px 3px; 41 | word-break: normal; 42 | white-space: normal; 43 | max-width: 140px; 44 | } 45 | } 46 | } 47 | button { 48 | margin: 10px 0; 49 | } 50 | textarea { 51 | 
display: block; 52 | } 53 | .control { 54 | margin: 10px; 55 | float: left; 56 | } 57 | .content { 58 | padding: 1em; 59 | } 60 | -------------------------------------------------------------------------------- /templates/code.mako: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | ${body | n}
6 | 
7 | -------------------------------------------------------------------------------- /templates/crf.mako: -------------------------------------------------------------------------------- 1 | <%inherit file="layout.mako" /> 2 | 3 |
4 |
5 | 6 |

Tagger demonstration

7 |
8 | 9 |
10 | 11 | 12 |
13 | 14 |
15 |
16 | 17 | 73 | -------------------------------------------------------------------------------- /templates/gloss.bars: -------------------------------------------------------------------------------- 1 |

Text

2 |

{{text}}

3 | 4 |

Alignments

5 | 6 | 7 | {{#each sequences}} 8 | 9 | 10 | {{#each values}} 11 | 12 | {{/each}} 13 | 14 | {{/each}} 15 | 16 |
{{name}}{{.}}
17 | -------------------------------------------------------------------------------- /templates/layout.mako: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | ${next.body()} 13 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Tests 2 | 3 | Tweedr uses [nose](http://nose.readthedocs.org/) as the test runner. 4 | 5 | Tests can be disabled on Travis CI by putting "no_ci" into the test name. 6 | 7 | There are three ways to run tests (assuming you have `nose` installed), all of which must be called from the package root directory. 8 | 9 | 1. `nosetests` 10 | 2. `python setup.py test` 11 | 3. `python setup.py nosetests` 12 | 13 | Travis CI uses the last of these because it's the only one that automatically installs packages from `tests_require` in setup.py as well as allows setting command line options (it uses `-e no_ci` to exclude tests with "no_ci" in their name). 14 | -------------------------------------------------------------------------------- /tests/test_codebase.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tweedr 4 | from tweedr.lib import walk 5 | 6 | source_endings = ('.py', '.bars', '.js', '.md', '.txt', '.mako', '.yml', '.less', '.json', '.css') 7 | 8 | 9 | def not_egg(filepath): 10 | return '.egg' not in filepath 11 | 12 | 13 | def not_git(filepath): 14 | return '/.git/' not in filepath 15 | 16 | 17 | def not_static(filepath): 18 | return '/static/lib/' not in filepath 19 | 20 | 21 | def is_source(filepath): 22 | return filepath.endswith(source_endings) 23 | 24 | 25 | def is_python(filepath): 26 | return filepath.endswith('.py') 27 | 28 | 29 | def test_pep8(): 30 | '''Running PEP-8 checks recursively in %s''' % tweedr.root 31 | import pep8 32 | ignore = [ 33 | 'E128', # E128 continuation line under-indented for visual indent 34 | 'E501', # E501 line too long (?? 
> 79 characters) 35 | ] 36 | total_errors = 0 37 | pep8style = pep8.StyleGuide(ignore=ignore) 38 | for filepath in walk(tweedr.root, not_egg, not_git, is_python): 39 | total_errors += pep8style.check_files([filepath]).total_errors 40 | 41 | assert total_errors == 0, 'Codebase does not pass PEP-8 (%d errors)' % total_errors 42 | 43 | 44 | def test_pyflakes(): 45 | '''Running pyflakes checks recursively in %s''' % tweedr.root 46 | from pyflakes import api as pyflakes 47 | total_errors = 0 48 | for filepath in walk(tweedr.root, not_egg, not_git, is_python): 49 | total_errors += pyflakes.checkPath(filepath) 50 | 51 | assert total_errors == 0, 'Codebase does not pass pyflakes (%d errors)' % total_errors 52 | 53 | 54 | def test_trailing_whitespace(): 55 | '''Running trailing whitespace checks recursively in %s''' % tweedr.root 56 | total_errors = 0 57 | for filepath in walk(tweedr.root, not_egg, not_git, not_static, is_source): 58 | with open(filepath) as fp: 59 | for line_i, raw in enumerate(fp): 60 | line = raw.rstrip('\n') 61 | if line.endswith((' ', '\t')): 62 | print >> sys.stdout, '%s:%d: trailing whitespace' % (filepath, line_i + 1) 63 | total_errors += 1 64 | 65 | assert total_errors == 0, 'Codebase has trailing whitespace (%d errors)' % total_errors 66 | 67 | 68 | def test_mysql_credentials_no_ci(): 69 | names = ['MYSQL_PASS', 'MYSQL_HOST'] 70 | values = [os.environ[name] for name in names] 71 | 72 | for filepath in walk(tweedr.root): 73 | with open(filepath) as fp: 74 | contents = fp.read() 75 | for value in values: 76 | assert value not in contents, 'Found a blacklisted credential (%s) in %s' % (value, filepath) 77 | -------------------------------------------------------------------------------- /tests/test_libraries.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def test_mysql_python_import_no_ci(): 5 | import MySQLdb 6 | assert MySQLdb is not None, 'MySQLdb should not be None.' 
7 | 8 | 9 | def test_mysql_python_no_ci(): 10 | import MySQLdb 11 | connection = MySQLdb.connect( 12 | os.environ['MYSQL_HOST'], 13 | os.environ['MYSQL_USER'], 14 | os.environ['MYSQL_PASS'], 15 | os.environ['MYSQL_DATABASE']) 16 | cursor = connection.cursor() 17 | 18 | # test version 19 | version_query = 'SELECT VERSION()' 20 | cursor.execute(version_query) 21 | version_result = cursor.fetchone()[0] 22 | assert version_result.split('.')[0] == '5', 'MySQL major version must equal 5' 23 | 24 | # test schema 25 | tables_query = 'SELECT TABLE_NAME FROM information_schema.TABLES WHERE TABLE_SCHEMA = "QCRI"' 26 | cursor.execute(tables_query) 27 | tables_result = [result[0] for result in cursor.fetchall()] 28 | table_names = ['tokenized_labels', 'tweets'] 29 | for table_name in table_names: 30 | assert table_name in tables_result, 'The table "%s" was not found in the database' % table_name 31 | 32 | connection.close() 33 | 34 | 35 | def test_sqlalchemy_no_ci(): 36 | from sqlalchemy import create_engine, MetaData 37 | 38 | connection_string = 'mysql+mysqldb://%(MYSQL_USER)s:%(MYSQL_PASS)s@%(MYSQL_HOST)s/%(MYSQL_DATABASE)s' % os.environ 39 | engine = create_engine(connection_string, convert_unicode=True) 40 | 41 | metadata = MetaData(bind=engine) 42 | metadata.reflect() 43 | 44 | table_names = ['DamageClassification', 'labels'] 45 | for table_name in table_names: 46 | assert table_name in metadata.tables, 'The table "%s" was not found in SqlAlchemy reflection results' % table_name 47 | 48 | 49 | def test_tweedr_models_no_ci(): 50 | from tweedr.models import DBSession, TokenizedLabel, Label 51 | 52 | Tables = [TokenizedLabel, Label] 53 | for Table in Tables: 54 | row_count = DBSession.query(Table).count() 55 | assert row_count > 0, 'There should be more than 0 rows in the table "%s"' % Table.name 56 | -------------------------------------------------------------------------------- /tools/git-hooks/README.md: -------------------------------------------------------------------------------- 1 | ## git hooks 2 | 3 | `pre-commit` requires all tests to pass before you commit. 4 | 5 | Here's the entire file: 6 | 7 | ```bash 8 | #!/bin/sh 9 | cd $(dirname $GIT_DIR) 10 | python setup.py test 11 | ``` 12 | 13 | Install: 14 | 15 | ```bash 16 | cd tweedr 17 | cp tools/git-hooks/pre-commit .git/hooks/pre-commit 18 | ``` 19 | -------------------------------------------------------------------------------- /tools/git-hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd $(dirname $GIT_DIR) 3 | python setup.py test 4 | -------------------------------------------------------------------------------- /tweedr/README.md: -------------------------------------------------------------------------------- 1 | ## Tweedr Python package 2 | 3 | * [`api/`](api) contains the main "pipeline" command line tool 4 | * [`corpora/`](corpora) contains scripts for reading corpora into predictable data structures from various source formats. 5 | * [`emr/`](emr) contains scripts for running jobs on Elastic Map Reduce 6 | * [`lib/`](lib) holds miscellaneous helpers or basic text manipulation tools. 7 | * [`ml/`](ml) contains all the machine learning and natural language processing tools. 8 | * [`models/`](models) holds the database schema and relationship definitions. 9 | * [`ui/`](ui) contains the web application. 10 | * [`__init__.py`](__init__.py) contains extensive log configuration. 
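To make that last point concrete, here is a minimal sketch of what the log configuration gives you once `tweedr` is imported. The logger name is invented for the example; `tweedr.SILLY`, the colored stderr handler, and the `silly()` method all come from `tweedr/__init__.py`.

```python
# Sketch: using the logging setup from tweedr/__init__.py.
# The logger name 'tweedr.example' is made up for illustration.
import logging

import tweedr  # importing installs the color formatter and the TweedrLogger class

logger = logging.getLogger('tweedr.example')
logger.info('rendered in color on stderr')

# SILLY (level 5) sits below DEBUG, so lower the level to see those messages.
logger.setLevel(tweedr.SILLY)
logger.silly('very low-priority trace output')
```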
11 | 12 | ## Use 13 | 14 | After installing, `tweedr` can be used as a Python package: 15 | 16 | import tweedr 17 | print tweedr.__version__ 18 | -------------------------------------------------------------------------------- /tweedr/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import logging 5 | from colorama import Fore, Back, Style 6 | 7 | # just resolve this file in the context of the current working directory 8 | # and find the parent of its directory 9 | root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 10 | with open(os.path.join(root, 'package.json')) as fd: 11 | package = json.load(fd) 12 | __version__ = str(package['version']) 13 | 14 | # add SILLY loglevel (above notset=0, below debug=10) 15 | SILLY = 5 16 | logging.addLevelName(SILLY, 'SILLY') 17 | 18 | 19 | class ColorFormatter(logging.Formatter): 20 | # colors: https://pypi.python.org/pypi/colorama 21 | thresholds = [ 22 | (logging.CRITICAL, (Back.RED, Back.RESET)), 23 | (logging.ERROR, (Fore.RED, Fore.RESET)), 24 | (logging.WARNING, (Back.YELLOW + Fore.BLACK, Back.RESET + Fore.RESET)), 25 | (logging.INFO, (Fore.CYAN, Fore.RESET)), 26 | (logging.DEBUG, (Fore.GREEN, Fore.RESET)), 27 | (SILLY, (Style.DIM, Style.NORMAL)), 28 | (logging.NOTSET, ('', '')), 29 | ] 30 | 31 | def format(self, record): 32 | result = super(ColorFormatter, self).format(record) 33 | 34 | for threshold, (prefix, postfix) in self.thresholds: 35 | if record.levelno >= threshold: 36 | break 37 | return prefix + result + postfix 38 | 39 | 40 | class TweedrLogger(logging.Logger): 41 | # def __init__(self, name, **kw): 42 | # super(TweedrLogger, self).__init__(name, **kw) 43 | 44 | def silly(self, msg, *args, **kwargs): 45 | if self.isEnabledFor(SILLY): 46 | self._log(SILLY, msg, args, **kwargs) 47 | 48 | def notset(self, msg, *args, **kwargs): 49 | if self.isEnabledFor(logging.NOTSET): 50 | self._log(logging.NOTSET, msg, args, **kwargs) 51 | 52 | def __repr__(self): 53 | return '<%s name=%s level=%d (effective=%d) parent=%s disabled=%d>' % (self.__class__.__name__, 54 | self.name, self.level, self.getEffectiveLevel(), self.parent, self.disabled) 55 | 56 | 57 | # the following 5 lines replace logging.basicConfig(level=default_level) 58 | # very similar effect, but with a color formatter. 59 | handler = logging.StreamHandler(sys.stderr) 60 | color_formatter = ColorFormatter(fmt='%(levelname)s:%(name)s:%(message)s') 61 | handler.setFormatter(color_formatter) 62 | logging.root.addHandler(handler) 63 | logging.root.setLevel(logging.DEBUG) 64 | 65 | logging.setLoggerClass(TweedrLogger) 66 | 67 | logger = logging.getLogger(__name__) 68 | -------------------------------------------------------------------------------- /tweedr/api/README.md: -------------------------------------------------------------------------------- 1 | ## API 2 | 3 | 4 | ### Instructions for `pipeline.py` 5 | 6 | Let's say your tweets are gzipped json files in `~/corpora/qcri/gnip_tweets/samoa/`: 7 | 8 | cat ~/corpora/qcri/gnip_tweets/samoa/*.json.gz | gunzip | tweedr-pipeline 9 | 10 | * `tweedr-pipeline` is simply an alias for `tweedr.cli.pipeline.main()` 11 | 12 | 13 | ### More examples 14 | 15 | Ignore the hapaxlegomena: 16 | 17 | ... | tweedr-pipeline | json -C count text | grep -v $'1\t' 18 | 19 | Compare bloomfilter's exact matches with simhash: 20 | 21 | ... 
| tweedr-pipeline | json -C count fuzzy_count fuzzy_votes 22 | -------------------------------------------------------------------------------- /tweedr/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/api/__init__.py -------------------------------------------------------------------------------- /tweedr/api/mappers/__init__.py: -------------------------------------------------------------------------------- 1 | from tweedr.api.protocols import DictProtocol 2 | 3 | 4 | class Mapper(object): 5 | '''Passthrough / interface''' 6 | INPUT = DictProtocol 7 | OUTPUT = DictProtocol 8 | 9 | def __call__(self, dict_): 10 | return dict_ 11 | -------------------------------------------------------------------------------- /tweedr/api/mappers/basic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | from tweedr.lib.text import whitespace_unicode_translations 5 | from tweedr.api.mappers import Mapper 6 | from tweedr.api.protocols import StringProtocol, DictProtocol, TweetDictProtocol 7 | 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class EmptyLineFilter(Mapper): 13 | INPUT = StringProtocol 14 | OUTPUT = StringProtocol 15 | 16 | def __call__(self, line): 17 | # ignore empty lines 18 | stripped_line = line.strip() 19 | if stripped_line: 20 | return stripped_line 21 | 22 | 23 | class JSONParser(Mapper): 24 | INPUT = StringProtocol 25 | OUTPUT = DictProtocol 26 | 27 | def __call__(self, line): 28 | try: 29 | return json.loads(line) 30 | except ValueError: 31 | logger.critical('Could not parse JSON: %s', line) 32 | raise 33 | 34 | 35 | class IgnoreMetadata(Mapper): 36 | INPUT = DictProtocol 37 | OUTPUT = DictProtocol 38 | 39 | def __call__(self, dict_): 40 | if 'info' not in dict_: 41 | return dict_ 42 | 43 | 44 | class TweetStandardizer(Mapper): 45 | '''Ensures that a given dict being mapped through the pipeline has basic 46 | fields that come with every tweet, coalescing them into predictable names. 47 | 48 | This is necessary because different sources of tweets (e.g., raw Twitter, 49 | GNIP) name this information a variety of different things. 50 | 51 | The fields are: 52 | 53 | * `text` Unicode The tweet's textual content 54 | * `author` Unicode The Twitter screen name of the tweet's author 55 | * `id_str` Unicode The tweet's identifying snowflake, as assigned by 56 | Twitter when originally posted. 57 | 58 | See `TweetDictProtocol`'s documentation for more details. 59 | ''' 60 | INPUT = DictProtocol 61 | OUTPUT = TweetDictProtocol 62 | 63 | def __call__(self, dict_): 64 | # ensure text. different sources call it different things. 
65 | if 'text' in dict_: 66 | dict_['text'] = dict_['text'].translate(whitespace_unicode_translations) 67 | elif 'body' in dict_: 68 | dict_['text'] = dict_.pop('body').translate(whitespace_unicode_translations) 69 | else: 70 | logger.critical('Could not find text field in %s', dict_) 71 | raise KeyError("'text' | 'body'") 72 | 73 | # ensure author 74 | if 'actor' in dict_: 75 | dict_['author'] = dict_['actor']['preferredUsername'] 76 | elif 'user' in dict_: 77 | dict_['author'] = dict_['user']['screen_name'] 78 | else: 79 | logger.critical('Could not find author field in %s', dict_) 80 | raise KeyError("'actor.preferredUsername' | 'user.screen_name'") 81 | 82 | # ensure id 83 | if 'id_str' in dict_: 84 | dict_['id'] = dict_['id_str'] 85 | else: 86 | dict_['id'] = dict_['id'].split(':')[-1] 87 | 88 | return dict_ 89 | 90 | 91 | class LineStream(Mapper): 92 | INPUT = DictProtocol 93 | OUTPUT = None 94 | 95 | def __init__(self, stream): 96 | self.stream = sys.stdout 97 | 98 | def __call__(self, dict_): 99 | json.dump(dict_, self.stream) 100 | self.stream.write(os.linesep) 101 | # flush might be unnecessary in production 102 | self.stream.flush() 103 | -------------------------------------------------------------------------------- /tweedr/api/mappers/ml.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction, pipeline 2 | from tweedr.lib.text import token_re 3 | from tweedr.ml.features import featurize, characters, lexicons, ngrams # , nlp 4 | from tweedr.api.mappers import Mapper 5 | from tweedr.api.protocols import TweetDictProtocol 6 | 7 | import logging 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CorpusClassifier(Mapper): 12 | INPUT = TweetDictProtocol 13 | OUTPUT = TweetDictProtocol 14 | 15 | feature_functions = [ 16 | ngrams.unigrams, 17 | characters.plural, 18 | lexicons.is_transportation, 19 | lexicons.is_building, 20 | characters.capitalized, 21 | characters.numeric, 22 | ngrams.unique, 23 | lexicons.hypernyms, 24 | # nlp.pos_tags, 25 | ] 26 | 27 | def tokenizer(self, text): 28 | tokens = token_re.findall(text) 29 | tokens_features = featurize(tokens, self.feature_functions) 30 | for token_features in tokens_features: 31 | for feature in token_features: 32 | yield feature 33 | 34 | def __init__(self, datasource, classifier): 35 | logger.info('Training %s on %s', classifier.__class__.__name__, datasource.__class__.__name__) 36 | 37 | # datasource yields (label, text) pairs 38 | y, X = zip(*datasource) 39 | 40 | self.name = datasource.__class__.__name__ + ':' + classifier.__class__.__name__ 41 | self.pipeline = pipeline.Pipeline([ 42 | ('dictionary', feature_extraction.text.CountVectorizer(tokenizer=self.tokenizer)), 43 | ('tfidf', feature_extraction.text.TfidfTransformer()), 44 | ('classifier', classifier), 45 | ]) 46 | 47 | self.pipeline.fit(X, y) 48 | 49 | def __call__(self, tweet): 50 | text = tweet['text'] 51 | y = self.pipeline.predict([text])[0] 52 | 53 | if 'classification' not in tweet: 54 | tweet['classification'] = [] 55 | 56 | tweet['classification'].append({ 57 | 'name': self.name, 58 | 'label': y, 59 | }) 60 | 61 | return tweet 62 | -------------------------------------------------------------------------------- /tweedr/api/mappers/nlp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import itertools 3 | import requests 4 | from tweedr.api.mappers import Mapper 5 | from tweedr.api.protocols import TweetDictProtocol 6 | from 
tweedr.lib.text import token_re, zip_boundaries 7 | from tweedr.ml.features import featurize, characters, dbpedia, lexicons, ngrams 8 | from tweedr.ark.java import TwitterNLP 9 | from tweedr.ml.crf.classifier import CRF 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class POSTagger(Mapper): 16 | INPUT = TweetDictProtocol 17 | OUTPUT = TweetDictProtocol 18 | 19 | def __init__(self): 20 | self.tagger = TwitterNLP() 21 | 22 | def __call__(self, tweet): 23 | '''Enhances the input tweet with POS tags, using only the tweet["text"] value: 24 | 25 | { 26 | ... 27 | "tokens": "@Donnie I hear ya and I hate earthquakes in Cali too ! But I still love living in LA ! :)", 28 | "pos": "@ O V O & O V N P ^ R , & O R V V P ^ ,", 29 | ... 30 | } 31 | 32 | The `tokens` and `pos` values can be split on whitespace to get equal-length lists of strings. 33 | ''' 34 | tokens, pos_tags = self.tagger.tokenize_and_tag(tweet['text']) 35 | tweet['tokens'] = tokens 36 | tweet['pos'] = pos_tags 37 | return tweet 38 | 39 | 40 | class SequenceTagger(Mapper): 41 | INPUT = TweetDictProtocol 42 | OUTPUT = TweetDictProtocol 43 | 44 | feature_functions = [ 45 | ngrams.unigrams, 46 | characters.plural, 47 | lexicons.is_transportation, 48 | lexicons.is_building, 49 | characters.capitalized, 50 | characters.numeric, 51 | ngrams.unique, 52 | lexicons.hypernyms, 53 | dbpedia.spotlight, 54 | ] 55 | 56 | def __init__(self): 57 | self.crf = CRF.default(self.feature_functions) 58 | logger.info('SequenceTagger initialized') 59 | 60 | def __call__(self, tweet): 61 | text = tweet['text'] 62 | tokens = token_re.findall(text) 63 | 64 | # tokens_features = map(list, featurize(tokens, crf_feature_functions)) 65 | tokens_features = featurize(tokens, self.feature_functions) 66 | 67 | null_label = 'None' 68 | labels = self.crf.predict([tokens_features])[0] 69 | # tweet['labels'] = labels 70 | 71 | if 'sequences' not in tweet: 72 | tweet['sequences'] = [] 73 | 74 | for sequence_label, entries in itertools.groupby(zip_boundaries(labels), lambda tup: tup[0]): 75 | if sequence_label != null_label: 76 | labels, starts, ends = zip(*entries) 77 | 78 | tweet['sequences'].append({ 79 | 'text': sequence_label, 80 | 'start': starts[0], 81 | 'end': ends[-1], 82 | }) 83 | 84 | return tweet 85 | 86 | 87 | class DBpediaSpotter(Mapper): 88 | INPUT = TweetDictProtocol 89 | OUTPUT = TweetDictProtocol 90 | 91 | def __init__(self, confidence=0.1, support=10): 92 | self.annotate_url = '%s/rest/annotate' % os.environ.get('SPOTLIGHT', 'http://spotlight.sztaki.hu:2222') 93 | self.confidence = confidence 94 | self.support = support 95 | logger.info('DBpediaSpotter initialized') 96 | 97 | def __call__(self, tweet): 98 | text = tweet['text'] 99 | 100 | if 'dbpedia' not in tweet: 101 | tweet['dbpedia'] = [] 102 | 103 | r = requests.post(self.annotate_url, 104 | headers=dict(Accept='application/json'), 105 | data=dict(text=text, confidence=self.confidence, support=self.support)) 106 | Resources = r.json().get('Resources', []) 107 | 108 | for Resource in Resources: 109 | start = int(Resource['@offset']) 110 | surface_form = Resource['@surfaceForm'] 111 | types = Resource['@types'] 112 | 113 | dbpedia_resource = { 114 | 'text': surface_form, 115 | 'start': start, 116 | 'end': start + len(surface_form), 117 | 'uri': Resource['@URI'], 118 | 'types': types.split(',') if types else [], 119 | } 120 | 121 | tweet['dbpedia'].append(dbpedia_resource) 122 | 123 | return tweet 124 | 
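The mappers above are meant to be chained; a rough usage sketch (the sample tweet dict is invented, and running it assumes the ARK TwitterNLP jar has been downloaded to `ext/` and that a DBpedia Spotlight endpoint is reachable):

```python
from tweedr.api.pipeline import Pipeline
from tweedr.api.mappers import basic, nlp

# standardize the raw dict, then add POS tags and DBpedia annotations
tag_and_spot = Pipeline(basic.TweetStandardizer(), nlp.POSTagger(), nlp.DBpediaSpotter())

tweet = tag_and_spot({
    'text': u'Bridge damaged near Christchurch after the earthquake',
    'user': {'screen_name': 'example_user'},  # invented; only screen_name is read
    'id_str': '0',
})
print tweet['pos']      # whitespace-separated POS tags from TwitterNLP
print tweet['dbpedia']  # DBpedia resources spotted in the text (possibly empty)
```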
-------------------------------------------------------------------------------- /tweedr/api/mappers/similar.py: -------------------------------------------------------------------------------- 1 | from tweedr.api.mappers import Mapper 2 | from tweedr.api.protocols import TweetDictProtocol 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | # for the bloomfilter 8 | import tempfile 9 | import pybloomfilter 10 | 11 | # for simhashing 12 | from hashes.simhash import simhash 13 | 14 | 15 | class TextCounter(Mapper): 16 | INPUT = TweetDictProtocol 17 | OUTPUT = TweetDictProtocol 18 | 19 | def __init__(self): 20 | # Use an in-memory bloomfilter for now, maybe move to pyreBloom if we need something threadsafe? 21 | bloomfilter_filepath = tempfile.NamedTemporaryFile(delete=False).name 22 | logger.debug('Saving bloomfilter to %s', bloomfilter_filepath) 23 | # pybloomfilter.BloomFilter(capacity, error_rate, filename) 24 | self.bloomfilter = pybloomfilter.BloomFilter(10000000, 0.001, bloomfilter_filepath) 25 | self.seen = dict() 26 | 27 | def __call__(self, dict_): 28 | text = dict_['text'] 29 | 30 | # bloomfilter.add(...) returns True if item is already in the filter 31 | if self.bloomfilter.add(text): 32 | # we only start to store counts when we see an item more than once 33 | self.seen[text] = dict_['count'] = self.seen.get(text, 1) + 1 34 | else: 35 | dict_['count'] = 1 36 | 37 | return dict_ 38 | 39 | 40 | class FuzzyTextCounter(Mapper): 41 | INPUT = TweetDictProtocol 42 | OUTPUT = TweetDictProtocol 43 | 44 | def __init__(self, threshold=0.97): 45 | self.threshold = threshold 46 | logger.debug('Simhash counter initialized with threshold of %0.3f', threshold) 47 | 48 | # list of all processed simhash objects 49 | self.simhashes = [] 50 | # votes is a lookup from a simhash hex to the original's id 51 | self.votes = dict() 52 | 53 | def __call__(self, dict_): 54 | text = dict_['text'] 55 | self_simhash = simhash(text) 56 | 57 | fuzzy_count = 0 58 | sum_other_votes = 0 59 | for other_simhash in self.simhashes: 60 | if self_simhash.similarity(other_simhash) > self.threshold: 61 | # increment the votes of the others 62 | other_votes = self.votes[other_simhash.hash] = self.votes.get(other_simhash.hash, 1) + 1 63 | fuzzy_count += 1 64 | sum_other_votes += other_votes 65 | 66 | # should self.votes be elevated based on fuzzy_count? 67 | self.votes[self_simhash.hash] = self.votes.get(self_simhash.hash, 0) + 1 68 | 69 | # maybe normalize based on the number of total votes? 
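        # fuzzy_count is the number of previously seen tweets whose simhash is
        # within the similarity threshold of this one; fuzzy_votes sums their
        # (incremented) vote counts, so heavily repeated texts score higher.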
70 | dict_['fuzzy_count'] = fuzzy_count 71 | dict_['fuzzy_votes'] = sum_other_votes 72 | 73 | # store simhash in global state now that we've finished processing 74 | self.simhashes.append(self_simhash) 75 | return dict_ 76 | -------------------------------------------------------------------------------- /tweedr/api/pipeline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | 4 | 5 | class Pipeline(object): 6 | def __init__(self, *mappers): 7 | logger.info('%s -> [pipeline] -> %s', mappers[0].INPUT, mappers[-1].OUTPUT) 8 | # type-check the connections between the provided mappers 9 | total_errors = 0 10 | for from_pipe, to_pipe in zip(mappers, mappers[1:]): 11 | # Python lets you use `a <= b` to say `a is a subclass of b` 12 | # SuperClass >= Class is true 13 | # Class >= Class is true 14 | # Class >= SuperClass is false 15 | if from_pipe.OUTPUT < to_pipe.INPUT: 16 | logger.error('Pipeline cannot connect mappers: %s[%s] -> %s[%s]', 17 | from_pipe.__class__.__name__, from_pipe.OUTPUT.__name__, 18 | to_pipe.__class__.__name__, to_pipe.INPUT.__name__) 19 | total_errors += 1 20 | if total_errors > 0: 21 | raise TypeError('Pipeline types do not match.') 22 | self.mappers = mappers 23 | 24 | def __call__(self, payload): 25 | logger.notset('Pipeline processing payload: %s', payload) 26 | # TODO: maybe wrap with a try-except here? 27 | for mapper in self.mappers: 28 | payload = mapper(payload) 29 | if payload is None: 30 | break 31 | return payload 32 | -------------------------------------------------------------------------------- /tweedr/api/protocols.py: -------------------------------------------------------------------------------- 1 | class StringProtocol(object): 2 | pass 3 | 4 | 5 | class DictProtocol(object): 6 | pass 7 | 8 | 9 | class TweetDictProtocol(DictProtocol): 10 | '''This merely asserts that the following fields will exist and have reasonable values: 11 | 12 | text: String 13 | id: String 14 | author: String 15 | ''' 16 | -------------------------------------------------------------------------------- /tweedr/ark/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | def main(): 8 | '''Example usage: 9 | 10 | echo "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place." 
| python -m tweedr.ark.__init__ 11 | ''' 12 | if sys.stdin.isatty(): 13 | logger.error('You must pipe in a string') 14 | exit(1) 15 | 16 | from tweedr.ark.java import TwitterNLP 17 | tagger = TwitterNLP() 18 | 19 | for line in sys.stdin: 20 | print '[input]', line.strip() 21 | tag_line = tagger.predict(line) 22 | print '[output]', tag_line 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- /tweedr/ark/java/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import Popen, PIPE 3 | 4 | import tweedr 5 | from tweedr.ml.classifier import ClassifierI 6 | from tweedr.lib.text import whitespace_unicode_translations 7 | 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | 11 | jar_path = os.path.join(tweedr.root, 'ext', 'ark-tweet-nlp-0.3.2.jar') 12 | 13 | 14 | class TwitterNLP(ClassifierI): 15 | def __init__(self, *args, **kw): 16 | self.proc = Popen(['java', '-cp', jar_path, 'cmu.arktweetnlp.RunTagger', 17 | '--input-format', 'text', '--output-format', 'pretsv'], 18 | stdin=PIPE, stdout=PIPE, stderr=PIPE) 19 | 20 | logger.info('cmu.arktweetnlp.RunTagger Java VM initialized with PID: %d', self.proc.pid) 21 | 22 | def fit(self, X, y): 23 | raise NotImplementedError('TwitterNLP is pre-trained; re-training is not supported.') 24 | 25 | def predict(self, X): 26 | # return only the labels (the POS tags) 27 | return self.parse_string(X)[1] 28 | 29 | # additional fields below are not required by ClassifierI, except that 30 | # they are called in predict 31 | def tokenize_and_tag(self, document): 32 | # only return the first two lines (tokens and labels) 33 | return self.parse_string(document)[:2] 34 | 35 | def parse_string(self, document): 36 | ''' 37 | Take a single string, remove any CR / LF / tab whitespace, and run it 38 | through TwitterNLP as an individual sequence of text. 39 | 40 | `document` String line of input 41 | 42 | Returns a tuple of strings, each of which is an equal-length (after 43 | `split`'ing) whitespace-separated sequence of tokens / POS tags / 44 | confidences. 
45 | ''' 46 | # sanitize the input and convert to bytestring 47 | if not isinstance(document, unicode): 48 | document = document.decode('utf8') 49 | string = document.translate(whitespace_unicode_translations).encode('utf8').strip() 50 | 51 | # write input with EOL marker (RunTagger won't return tags until it hits a newline) 52 | self.proc.stdin.write(string) 53 | self.proc.stdin.write('\n') 54 | 55 | # wait for output 56 | result = self.proc.stdout.readline() 57 | # no available stdout (the empty string) means there was an error 58 | if result == '': 59 | for stderr_line in self.proc.stderr: 60 | logger.error(stderr_line.rstrip()) 61 | raise IOError('cmu.arktweetnlp.RunTagger error') 62 | 63 | # output of cmu.arktweetnlp.RunTagger is TOKENSTAGSCONFIDENCESORIGINAL 64 | parts = result.split('\t') 65 | # cut off the original input, which is parts[3] 66 | return parts[0:3] 67 | -------------------------------------------------------------------------------- /tweedr/ark/java/singleton.py: -------------------------------------------------------------------------------- 1 | from tweedr.ark.java import TwitterNLP 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | logger.debug('The TwitterNLP POS tagger is being loaded as a module singleton') 7 | 8 | # simply by importing this module, the TwitterNLP tagger will be started up and 9 | # made available to other scripts. 10 | tagger = TwitterNLP() 11 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2009-2010, Brendan O'Connor, Michel Krieger, and David Ahn 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/README.md: -------------------------------------------------------------------------------- 1 | # TweetMotif 2 | 3 | The files `emoticons.py` and `twokenize.py` are originally from [TweetMotif](https://github.com/brendano/tweetmotif). 4 | 5 | Brendan O'Connor, Michel Krieger, and David Ahn. [_TweetMotif: Exploratory Search and Topic Summarization for Twitter_](http://anyall.org/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf). ICWSM-2010. 
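The tokenizer can be exercised directly from Python; a minimal check under Python 2, run from the repository root (the sample tweet is made up):

```python
from tweedr.ark.tweetmotif import twokenize

tokens = twokenize.tokenize(u"@Donnie I hear ya and I hate earthquakes in Cali too! :)")
print u' '.join(tokens)
# punctuation and the emoticon come back as separate tokens,
# roughly: @Donnie I hear ya and I hate earthquakes in Cali too ! :)
```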
6 | 7 | 8 | ## License 9 | 10 | Copyright © 2009-2010, Brendan O'Connor, Michel Krieger, and David Ahn. 11 | 12 | TweetMotif is licensed under the [Apache License 2.0](LICENSE). 13 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/ark/tweetmotif/__init__.py -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/emoticons.py: -------------------------------------------------------------------------------- 1 | """ emoticon recognition via patterns. tested on english-language twitter, but 2 | probably works for other social media dialects. """ 3 | 4 | __author__ = "Brendan O'Connor (anyall.org, brenocon@gmail.com)" 5 | __version__ = "april 2009" 6 | 7 | import re 8 | import sys 9 | 10 | mycompile = lambda pat: re.compile(pat, re.UNICODE) 11 | #SMILEY = mycompile(r'[:=].{0,1}[\)dpD]') 12 | #MULTITOK_SMILEY = mycompile(r' : [\)dp]') 13 | 14 | NormalEyes = r'[:=]' 15 | Wink = r'[;]' 16 | 17 | NoseArea = r'(|o|O|-)' # rather tight precision, \S might be reasonable... 18 | 19 | HappyMouths = r'[D\)\]]' 20 | SadMouths = r'[\(\[]' 21 | Tongue = r'[pP]' 22 | OtherMouths = r'[doO/\\]' # remove forward slash if http://'s aren't cleaned 23 | 24 | Happy_RE = mycompile('(\^_\^|' + NormalEyes + NoseArea + HappyMouths + ')') 25 | Sad_RE = mycompile(NormalEyes + NoseArea + SadMouths) 26 | 27 | Wink_RE = mycompile(Wink + NoseArea + HappyMouths) 28 | Tongue_RE = mycompile(NormalEyes + NoseArea + Tongue) 29 | Other_RE = mycompile('(' + NormalEyes + '|' + Wink + ')' + NoseArea + OtherMouths) 30 | 31 | Emoticon = ( 32 | "(" + NormalEyes + "|" + Wink + ")" + NoseArea + 33 | "(" + Tongue + "|" + OtherMouths + 34 | "|" + SadMouths + "|" + HappyMouths + ")" 35 | ) 36 | Emoticon_RE = mycompile(Emoticon) 37 | 38 | #Emoticon_RE = "|".join([Happy_RE,Sad_RE,Wink_RE,Tongue_RE,Other_RE]) 39 | #Emoticon_RE = mycompile(Emoticon_RE) 40 | 41 | 42 | def analyze_tweet(text): 43 | h = Happy_RE.search(text) 44 | s = Sad_RE.search(text) 45 | if h and s: 46 | return "BOTH_HS" 47 | if h: 48 | return "HAPPY" 49 | if s: 50 | return "SAD" 51 | return "NA" 52 | 53 | # more complex & harder, so disabled for now 54 | #w= Wink_RE.search(text) 55 | #t= Tongue_RE.search(text) 56 | #a= Other_RE.search(text) 57 | #h,w,s,t,a = [bool(x) for x in [h,w,s,t,a]] 58 | # if sum([h,w,s,t,a])>1: return "MULTIPLE" 59 | # if sum([h,w,s,t,a])==1: 60 | # if h: return "HAPPY" 61 | # if s: return "SAD" 62 | # if w: return "WINK" 63 | # if a: return "OTHER" 64 | # if t: return "TONGUE" 65 | # return "NA" 66 | 67 | if __name__ == '__main__': 68 | for line in sys.stdin: 69 | import sane_re 70 | sane_re._S(line[:-1]).show_match(Emoticon_RE, numbers=False) 71 | #print(analyze_tweet(line.strip()), line.strip(), sep="\t") 72 | -------------------------------------------------------------------------------- /tweedr/ark/tweetmotif/twokenize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ tokenizer for tweets! might be appropriate for other social media dialects too. 3 | general philosophy is to throw as little out as possible. 4 | development philosophy: every time you change a rule, do a diff of this 5 | program's output on ~100k tweets. 
if you iterate through many possible rules 6 | and only accept the ones that seeem to result in good diffs, it's a sort of 7 | statistical learning with in-the-loop human evaluation :) 8 | """ 9 | 10 | __author__ = "brendan o'connor (anyall.org)" 11 | 12 | import re 13 | import sys 14 | import emoticons 15 | mycompile = lambda pat: re.compile(pat, re.UNICODE) 16 | 17 | 18 | def regex_or(*items): 19 | r = '|'.join(items) 20 | r = '(' + r + ')' 21 | return r 22 | 23 | 24 | def pos_lookahead(r): 25 | return '(?=' + r + ')' 26 | 27 | 28 | def neg_lookahead(r): 29 | return '(?!' + r + ')' 30 | 31 | 32 | def optional(r): 33 | return '(%s)?' % r 34 | 35 | 36 | PunctChars = r'''['“".?!,:;]''' 37 | Punct = '%s+' % PunctChars 38 | Entity = '&(amp|lt|gt|quot);' 39 | 40 | # one-liner URL recognition: 41 | #Url = r'''https?://\S+''' 42 | 43 | # more complex version: 44 | UrlStart1 = regex_or('https?://', r'www\.') 45 | CommonTLDs = regex_or('com', 'co\\.uk', 'org', 'net', 'info', 'ca') 46 | UrlStart2 = r'[a-z0-9\.-]+?' + r'\.' + CommonTLDs + pos_lookahead(r'[/ \W\b]') 47 | # * not + for case of: "go to bla.com." -- don't want period 48 | UrlBody = r'[^ \t\r\n<>]*?' 49 | UrlExtraCrapBeforeEnd = '%s+?' % regex_or(PunctChars, Entity) 50 | UrlEnd = regex_or(r'\.\.+', r'[<>]', r'\s', '$') 51 | Url = (r'\b' + regex_or(UrlStart1, UrlStart2) + UrlBody + pos_lookahead(optional(UrlExtraCrapBeforeEnd) + UrlEnd)) 52 | 53 | Url_RE = re.compile("(%s)" % Url, re.U | re.I) 54 | 55 | Timelike = r'\d+:\d+' 56 | NumNum = r'\d+\.\d+' 57 | NumberWithCommas = r'(\d+,)+?\d{3}' + pos_lookahead(regex_or('[^,]', '$')) 58 | 59 | Abbrevs1 = ['am', 'pm', 'us', 'usa', 'ie', 'eg'] 60 | 61 | 62 | def regexify_abbrev(a): 63 | chars = list(a) 64 | icase = ["[%s%s]" % (c, c.upper()) for c in chars] 65 | dotted = [r'%s\.' 
% x for x in icase] 66 | return "".join(dotted) 67 | Abbrevs = [regexify_abbrev(a) for a in Abbrevs1] 68 | 69 | BoundaryNotDot = regex_or(r'\s', '[“"?!,:;]', Entity) 70 | aa1 = r'''([A-Za-z]\.){2,}''' + pos_lookahead(BoundaryNotDot) 71 | aa2 = r'''([A-Za-z]\.){1,}[A-Za-z]''' + pos_lookahead(BoundaryNotDot) 72 | ArbitraryAbbrev = regex_or(aa1, aa2) 73 | 74 | assert '-' != '―' 75 | Separators = regex_or('--+', '―') 76 | Decorations = r' [ ♫ ]+ '.replace(' ', '') 77 | 78 | EmbeddedApostrophe = r"\S+'\S+" 79 | 80 | ProtectThese = [ 81 | emoticons.Emoticon, 82 | Url, 83 | Entity, 84 | Timelike, 85 | NumNum, 86 | NumberWithCommas, 87 | Punct, 88 | ArbitraryAbbrev, 89 | Separators, 90 | Decorations, 91 | EmbeddedApostrophe, 92 | ] 93 | Protect_RE = mycompile(regex_or(*ProtectThese)) 94 | 95 | 96 | class Tokenization(list): 97 | " list of tokens, plus extra info " 98 | 99 | def __init__(self): 100 | self.alignments = [] 101 | self.text = "" 102 | 103 | def subset(self, tok_inds): 104 | new = Tokenization() 105 | new += [self[i] for i in tok_inds] 106 | new.alignments = [self.alignments[i] for i in tok_inds] 107 | new.text = self.text 108 | return new 109 | 110 | def assert_consistent(t): 111 | assert len(t) == len(t.alignments) 112 | assert [t.text[t.alignments[i]: (t.alignments[i] + len(t[i]))] 113 | for i in range(len(t))] == list(t) 114 | 115 | 116 | def align(toks, orig): 117 | s_i = 0 118 | alignments = [None] * len(toks) 119 | for tok_i in range(len(toks)): 120 | while True: 121 | L = len(toks[tok_i]) 122 | if orig[s_i:(s_i + L)] == toks[tok_i]: 123 | alignments[tok_i] = s_i 124 | s_i += L 125 | break 126 | s_i += 1 127 | if s_i >= len(orig): 128 | raise AlignmentFailed((orig, toks, alignments)) 129 | #if orig[s_i] != ' ': raise AlignmentFailed("nonspace advance: %s" % ((s_i,orig),)) 130 | if any(a is None for a in alignments): 131 | raise AlignmentFailed((orig, toks, alignments)) 132 | 133 | return alignments 134 | 135 | 136 | class AlignmentFailed(Exception): 137 | pass 138 | 139 | 140 | def unicodify(s, encoding='utf8', *args): 141 | if isinstance(s, unicode): 142 | return s 143 | if isinstance(s, str): 144 | return s.decode(encoding, *args) 145 | return unicode(s) 146 | 147 | 148 | def tokenize(tweet): 149 | text = unicodify(tweet) 150 | text = squeeze_whitespace(text) 151 | t = Tokenization() 152 | t += simple_tokenize(text) 153 | t.text = text 154 | t.alignments = align(t, text) 155 | return t 156 | 157 | 158 | def simple_tokenize(text): 159 | s = text 160 | s = edge_punct_munge(s) 161 | 162 | # strict alternating ordering through the string. first and last are goods. 
163 | # good bad good bad good bad good 164 | goods = [] 165 | bads = [] 166 | i = 0 167 | if Protect_RE.search(s): 168 | for m in Protect_RE.finditer(s): 169 | goods.append((i, m.start())) 170 | bads.append(m.span()) 171 | i = m.end() 172 | goods.append((m.end(), len(s))) 173 | else: 174 | goods = [(0, len(s))] 175 | assert len(bads) + 1 == len(goods) 176 | 177 | goods = [s[i:j] for i, j in goods] 178 | bads = [s[i:j] for i, j in bads] 179 | # print goods 180 | # print bads 181 | goods = [unprotected_tokenize(x) for x in goods] 182 | res = [] 183 | for i in range(len(bads)): 184 | res += goods[i] 185 | res.append(bads[i]) 186 | res += goods[-1] 187 | 188 | res = post_process(res) 189 | return res 190 | 191 | AposS = mycompile(r"(\S+)('s)$") 192 | 193 | 194 | def post_process(pre_toks): 195 | # hacky: further splitting of certain tokens 196 | post_toks = [] 197 | for tok in pre_toks: 198 | m = AposS.search(tok) 199 | if m: 200 | post_toks += m.groups() 201 | else: 202 | post_toks.append(tok) 203 | return post_toks 204 | 205 | WS_RE = mycompile(r'\s+') 206 | 207 | 208 | def squeeze_whitespace(s): 209 | new_string = WS_RE.sub(" ", s) 210 | return new_string.strip() 211 | 212 | # fun: copy and paste outta http://en.wikipedia.org/wiki/Smart_quotes 213 | EdgePunct = r"""[ ' " “ ” ‘ ’ < > « » { } ( \) [ \] ]""".replace(' ', '') 214 | # NotEdgePunct = r"""[^'"([\)\]]""" # alignment failures? 215 | NotEdgePunct = r"""[a-zA-Z0-9]""" 216 | EdgePunctLeft = r"""(\s|^)(%s+)(%s)""" % (EdgePunct, NotEdgePunct) 217 | EdgePunctRight = r"""(%s)(%s+)(\s|$)""" % (NotEdgePunct, EdgePunct) 218 | EdgePunctLeft_RE = mycompile(EdgePunctLeft) 219 | EdgePunctRight_RE = mycompile(EdgePunctRight) 220 | 221 | 222 | def edge_punct_munge(s): 223 | s = EdgePunctLeft_RE.sub(r"\1\2 \3", s) 224 | s = EdgePunctRight_RE.sub(r"\1 \2\3", s) 225 | return s 226 | 227 | 228 | def unprotected_tokenize(s): 229 | return s.split() 230 | 231 | if __name__ == '__main__': 232 | for line in sys.stdin: 233 | print u" ".join(tokenize(line[:-1])).encode('utf-8') 234 | # print "CUR\t" + " ".join(tokenize(line[:-1])) 235 | # print "WS\t" + " ".join(line[:-1].split()) 236 | # print ansi.color(line.strip(),'red') 237 | # print ansi.color(" ".join(tokenize(line.strip())),'blue','bold') 238 | -------------------------------------------------------------------------------- /tweedr/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/cli/__init__.py -------------------------------------------------------------------------------- /tweedr/cli/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import mako.template 5 | 6 | 7 | def reflect(**kw): 8 | from tweedr.models import metadata 9 | schema_filepath = os.path.join(os.path.dirname(metadata.__file__), 'schema.py') 10 | schema_template_filepath = os.path.join(os.path.dirname(metadata.__file__), 'schema.template') 11 | 12 | template = mako.template.Template(filename=schema_template_filepath) 13 | metadata.metadata.reflect() 14 | schema = template.render(metadata=metadata.metadata) 15 | 16 | if kw.get('in_place'): 17 | with open(schema_filepath, 'w') as out: 18 | out.write(schema) 19 | else: 20 | sys.stdout.write(schema) 21 | 22 | print >> sys.stderr, '\nDone printing schema' 23 | 24 | 25 | def create(**kw): 26 | from tweedr.models.schema import metadata 27 | 
metadata.create_all() 28 | 29 | 30 | commands = dict(reflect=reflect, create=create) 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser(description='Tweedr database tools') 35 | parser.add_argument('command', choices=commands, help='Command to run') 36 | parser.add_argument('--in-place', action='store_true', help='Whether or not to update the schema.py file in place') 37 | 38 | opts = parser.parse_args() 39 | commands[opts.command](**vars(opts)) 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /tweedr/cli/pipeline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from tweedr.api import pipeline 4 | from tweedr.api.mappers import basic, similar, nlp, ml 5 | from tweedr.corpora.qcri import a126730_datasource, a121571_datasource, a126728_datasource, a122047_datasource 6 | 7 | from sklearn import linear_model, naive_bayes, neighbors, svm 8 | 9 | import logging 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser(description='Run tweets from STDIN through the tweedr pipeline, output to STDOUT.') 15 | parser.add_argument('-v', '--verbose', action='store_true', help='Log extra output') 16 | opts = parser.parse_args() 17 | 18 | # bump threshold down to show info=20, debug=10, and silly=5 if --verbose is set 19 | if opts.verbose: 20 | logger.setLevel('SILLY') 21 | 22 | if sys.stdin.isatty(): 23 | raise IOError('You must provide input via STDIN') 24 | 25 | cli_pipeline = pipeline.Pipeline( 26 | basic.EmptyLineFilter(), 27 | basic.JSONParser(), 28 | basic.IgnoreMetadata(), 29 | basic.TweetStandardizer(), 30 | similar.TextCounter(), 31 | similar.FuzzyTextCounter(), 32 | nlp.POSTagger(), 33 | nlp.SequenceTagger(), 34 | nlp.DBpediaSpotter(), 35 | 36 | ml.CorpusClassifier(a126730_datasource(), naive_bayes.MultinomialNB()), 37 | ml.CorpusClassifier(a121571_datasource(), svm.SVC(gamma=2, C=1)), 38 | ml.CorpusClassifier(a126728_datasource(), neighbors.KNeighborsClassifier(3)), 39 | ml.CorpusClassifier(a122047_datasource(), linear_model.LogisticRegression()), 40 | 41 | basic.LineStream(sys.stdout), 42 | ) 43 | 44 | logger.debug('Pipeline created') 45 | 46 | try: 47 | for i, line in enumerate(sys.stdin): 48 | cli_pipeline(line) 49 | except KeyboardInterrupt: 50 | logger.critical('SIGINT received; Exiting.') 51 | 52 | logger.info('Processed %d lines', i) 53 | logger.debug('Pipeline exited') 54 | 55 | 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /tweedr/cli/ui.py: -------------------------------------------------------------------------------- 1 | from bottle import run 2 | from tweedr.ui import middleware, crf 3 | 4 | 5 | def main(): 6 | '''This is called by the package's console_scripts entry point "tweedr-ui" 7 | 8 | The reloader is slow and only handles python module changes. 
9 | I recommend using 3rd party restarter, say, node_restarter: 10 | node_restarter **/*.py **/*.css **/*.mako 'python tweedr/cli/ui.py' 11 | ''' 12 | app = middleware.add_duration_header(crf.app) 13 | run(app) 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /tweedr/corpora/__init__.py: -------------------------------------------------------------------------------- 1 | class DatasourceI(object): 2 | '''As usual, a reference, as opposed to an interface you actually have to implement''' 3 | 4 | def __init__(self): 5 | pass 6 | 7 | def __iter__(self): 8 | '''This should yield tuples of label-document (basestring, basestring) pairs.''' 9 | raise NotImplementedError(__doc__) 10 | -------------------------------------------------------------------------------- /tweedr/corpora/qcri.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from tweedr.corpora import DatasourceI 4 | from tweedr.lib import globfirst 5 | 6 | corpora_root = os.path.expanduser(os.environ.get('CORPORA', '~/corpora')) 7 | 8 | 9 | class CSVDatasouce(DatasourceI): 10 | filepath = None 11 | label_column = 'category' 12 | text_column = 'text' 13 | 14 | def __iter__(self): 15 | with open(self.filepath) as fp: 16 | for row in csv.DictReader(fp): 17 | yield row[self.label_column], row[self.text_column] 18 | 19 | 20 | class a131709_datasource(CSVDatasouce): 21 | # from joplin/ 22 | ''' 23 | Counts: 24 | 25 | 94 Other 26 | 265 Informative (Direct) 27 | 469 Informative (Indirect) 28 | 762 Informative (Direct or Indirect) 29 | 794 Personal only 30 | ''' 31 | filepath = globfirst('**/a131709.csv', root=corpora_root) 32 | label_column = 'choose_one' 33 | text_column = 'tweet' 34 | 35 | 36 | class a121571_datasource(CSVDatasouce): 37 | # from joplin/ 38 | ''' 39 | Counts: 40 | 41 | 46 People missing, found or seen 42 | 130 Unknown 43 | 137 Casualties and damage 44 | 204 Donations of money, goods or services 45 | 280 Information source 46 | 436 Caution and advice 47 | ''' 48 | # TODO: come up with a better name 49 | filepath = globfirst('**/a121571.csv', root=corpora_root) 50 | label_column = 'choose_one' 51 | text_column = 'text' 52 | 53 | 54 | class a122047_datasource(CSVDatasouce): 55 | # from joplin/ 56 | ''' 57 | Counts: 58 | 59 | 3 A shelter is open or available 60 | 27 A siren has been heard 61 | 99 A tornado sighting/touchdown has been reported 62 | 102 Other 63 | 207 A tornado/thunderstorm warning has been issued or has been lifted 64 | ''' 65 | filepath = globfirst('**/a122047.csv', root=corpora_root) 66 | label_column = 'type_of_advice_or_caution' 67 | text_column = 'text' 68 | 69 | 70 | class a126730_datasource(CSVDatasouce): 71 | # from joplin/ 72 | ''' 73 | Counts: 74 | 75 | 1 Both people and infrastructure 76 | 1 People: injured 77 | 2 People: injured and dead 78 | 12 Not damage-related 79 | 17 Infrastructure (building, bridge, road, etc.) damaged 80 | 47 Not specified (maybe people or infrastructure) 81 | 58 People: dead 82 | ''' 83 | filepath = globfirst('**/a126730.csv', root=corpora_root) 84 | label_column = 'people_or_infrastructure' 85 | text_column = 'text' 86 | 87 | 88 | class a126728_datasource(CSVDatasouce): 89 | # from joplin/ 90 | ''' 91 | Counts: 92 | 93 | 2 Discount (rebate/special offer) 94 | 3 Blood 95 | 3 Equipment (machine/generator/pump/etc.) 
96 | 6 Food 97 | 7 Shelter 98 | 11 Volunteers/work 99 | 53 Money 100 | 119 Other, or not specified 101 | ''' 102 | filepath = globfirst('**/a126728.csv', root=corpora_root) 103 | label_column = 'type_of_donation' 104 | text_column = 'text' 105 | 106 | 107 | class a122582_datasource(CSVDatasouce): 108 | # from joplin/ 109 | ''' 110 | Counts: 111 | 112 | 4 Tune to this radio station (or: I am listening to this station) 113 | 10 Watch this TV channel (or: I am watching this channel) 114 | 33 None of the above 115 | 35 Look at this photo or these photos 116 | 58 Look at this video or these videos 117 | 139 Look at this web site/page 118 | ''' 119 | filepath = globfirst('**/a122582.csv', root=corpora_root) 120 | label_column = 'type_of_message' 121 | text_column = 'text' 122 | 123 | 124 | class a143145_datasource(CSVDatasouce): 125 | # from sandy/ 126 | ''' 127 | Counts: 128 | 129 | 78 Informative (Direct) 130 | 79 Informative (Direct or Indirect) 131 | 161 Other 132 | 296 Personal Only 133 | 386 Informative (Indirect) 134 | ''' 135 | filepath = globfirst('**/a143145.csv', root=corpora_root) 136 | label_column = 'choose_one' 137 | text_column = 'tweet' 138 | 139 | 140 | class a144267_datasource(CSVDatasouce): 141 | # from sandy/ 142 | ''' 143 | Counts: 144 | 145 | 32 Donations of money, goods or services 146 | 72 Information Source 147 | 125 Unknown 148 | 144 Caution and advice 149 | 170 Casualties and damage 150 | ''' 151 | filepath = globfirst('**/a144267.csv', root=corpora_root) 152 | label_column = 'choose_one' 153 | text_column = 'tweet' 154 | 155 | 156 | class a146283_datasource(CSVDatasouce): 157 | # from sandy/ 158 | ''' 159 | Counts: 160 | 161 | 6 A shelter is open or available 162 | 20 A hurricane warning has been issued or has been lifted 163 | 23 A hurricane sighting has been reported 164 | 77 Other 165 | ''' 166 | filepath = globfirst('**/a146283.csv', root=corpora_root) 167 | label_column = 'type_of_advice_or_caution' 168 | text_column = 'tweet' 169 | 170 | 171 | class a146281_datasource(CSVDatasouce): 172 | # from sandy/ 173 | ''' 174 | Counts: 175 | 176 | 1 People: injured 177 | 3 People: injured and dead 178 | 12 Not specified (maybe people or infrastructure) 179 | 13 Both people and infrastructure 180 | 16 Not damage-related 181 | 34 People: dead 182 | 91 Infrastructure (building, bridge, road, etc.) damage 183 | ''' 184 | filepath = globfirst('**/a146281.csv', root=corpora_root) 185 | label_column = 'people_or_infrastructure' 186 | text_column = 'tweet' 187 | 188 | 189 | if __name__ == '__main__': 190 | '''You may need to get these files from S3, something like: 191 | 192 | mkdir -p ~/corpora/tweedr 193 | cd ~/corpora/tweedr 194 | s3cmd sync s3://qcri/joplin/labeled/ . 195 | s3cmd sync s3://qcri/sandy/labeled/ . 
196 | ''' 197 | for label, text in a121571_datasource(): 198 | print label, '\t', text 199 | -------------------------------------------------------------------------------- /tweedr/corpora/qcri_database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | from sets import Set 5 | 6 | from tweedr.models import DBSession, DamageClassification 7 | 8 | 9 | class DamageClassifiedCorpus(object): 10 | 11 | def __init__(self): 12 | labeled_tweets = \ 13 | np.array(DBSession.query(DamageClassification).filter(DamageClassification.mturk_code 14 | == 'QCRI').limit(1000).all()) 15 | labeled_tweets = map(lambda x: (x.text, int(x.label)), labeled_tweets) 16 | self.dataset = labeled_tweets 17 | 18 | def __iter__(self): 19 | return iter(self.dataset) 20 | 21 | 22 | 23 | test_set = DamageClassifiedCorpus() 24 | -------------------------------------------------------------------------------- /tweedr/emr/README.md: -------------------------------------------------------------------------------- 1 | ## Examples for the wordcounter 2 | 3 | Run the counter locally (using some data that's on the qcri machine): 4 | 5 | python gnip_wc.py /home/chbrown/data/gnip/christchurch/2011-02-28*.json.gz 6 | 7 | That particular example glob is about 15MB compressed, 107MB uncompressed, 59704 lines (= tweets). 8 | 9 | It runs in about 2m 30s locally, on the qcri AWS machine. 10 | 11 | Or run on EMR, using the same glob, but from S3. 12 | 13 | python gnip_wc.py -r emr s3://qcri/gnip/christchurch/2011-02-28*.json.gz --output-dir "s3://qcri/tmp-`date +%s`" 14 | 15 | This took about 21m, without specifying any numbers. From the docs: 16 | 17 | > By default, **mrjob** runs a single `m1.small`, which is a cheap but not very powerful instance type. 18 | 19 | Trying a few more instances at once: 20 | 21 | !! --num-ec2-instances 2 22 | 23 | Hmm. Still took 21 minutes. 24 | 25 | !! --num-ec2-instances 4 26 | 27 | Better! Took 13m. I think most of this is overhead in starting the cluster. 28 | 29 | !! --num-ec2-instances 8 30 | 31 | Overkill, apparently. Took 13m again. 32 | 33 | Could also try some different types: 34 | 35 | --ec2_instance_type c1.medium 36 | 37 | (Umm... later.) 38 | 39 | How about all of christchurch? 40 | 41 | python gnip_wc.py -r emr --num-ec2-instances 8 s3://qcri/gnip/christchurch/*.json.gz 42 | 43 | That's 757,382 tweets, 270MB compressed = 1.7GB uncompressed, simple word count took 1h 12m (apparently it only took 53m, as billed by AWS, so $(.12+.03) * 8 = $1.20), produced 12 output files for a total of ~12MB (uncompressed). 44 | 45 | ## Running a full geolocation task: 46 | 47 | cd ~/src/qcri/emr 48 | output_dir="s3://qcri/tmp-`date +%s`" 49 | echo Using $output_dir as our output directory 50 | time python gnip_geo.py --containers gnip_containers.geojson \ 51 | s3://qcri/gnip/*/*.json.gz --output-dir $output_dir -r emr --num-ec2-instances 5 52 | 53 | ## Examples 54 | 55 | Some of the fields from a geolocated tweet might look like this. Note that the coordinates are `[lat,lng]`. 56 | 57 | { 58 | ... 59 | "gnip": { 60 | "matching_rules": [ 61 | { 62 | "value": "has:geo", 63 | "tag": "westtx:geo" 64 | } 65 | ], 66 | ... 67 | }, 68 | ... 69 | "geo": { 70 | "type": "Point", 71 | "coordinates": [ 72 | 31.4119232, 73 | -86.1200234 74 | ] 75 | }, 76 | ... 
77 | } 78 | 79 | And here's some GeoJSON, for reference (because Polygons are weird, allowing inner rings): 80 | 81 | { "type": "Polygon", 82 | "coordinates": [ 83 | [ 84 | [100.0, 0.0], [101.0, 0.0], [101.0, 1.0], [100.0, 1.0], [100.0, 0.0] 85 | ] 86 | ] 87 | } 88 | 89 | But Points are easier: 90 | 91 | { "type": "Point", "coordinates": [100.0, 0.0] } 92 | -------------------------------------------------------------------------------- /tweedr/emr/__init__.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | from mrjob.protocol import _ClassBasedKeyCachingProtocol 3 | 4 | 5 | class UltraJSONProtocol(_ClassBasedKeyCachingProtocol): 6 | @classmethod 7 | def load_from_string(cls, value): 8 | return ujson.loads(value) 9 | 10 | @classmethod 11 | def dump_to_string(cls, value): 12 | return ujson.dumps(value) 13 | 14 | 15 | class UltraJSONValueProtocol(object): 16 | @classmethod 17 | def read(cls, line): 18 | return (None, ujson.loads(line)) 19 | 20 | @classmethod 21 | def write(cls, key, value): 22 | return ujson.dumps(value) 23 | -------------------------------------------------------------------------------- /tweedr/emr/gnip_geo.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | from mrjob.protocol import JSONValueProtocol 3 | import json 4 | 5 | 6 | def bbox_contains(bbox, longitude, latitude): 7 | sw_lon, sw_lat, ne_lon, ne_lat = bbox 8 | return (sw_lon <= longitude <= ne_lon) and (sw_lat <= latitude <= ne_lat) 9 | 10 | 11 | class GeoExtract(MRJob): 12 | INPUT_PROTOCOL = JSONValueProtocol 13 | 14 | def configure_options(self): 15 | super(GeoExtract, self).configure_options() 16 | # add_file_option: http://mrjob.readthedocs.org/en/latest/guides/writing-mrjobs.html 17 | self.add_file_option('--containers', help='.geojson feature collection to filter for') 18 | 19 | def mapper_init(self): 20 | with open(self.options.containers) as fp: 21 | self.feature_collection = json.load(fp) 22 | 23 | def mapper(self, _, line): 24 | # Ignore metadata / reports 25 | if 'info' in line and line['info']['message'] == 'Replay Request Completed': 26 | return 27 | 28 | # if any(rule['value'] == 'has:geo' line['gnip']['matching_rules']): 29 | if 'geo' in line and line['geo'].get('type') == 'Point': 30 | latitude, longitude = line['geo']['coordinates'] 31 | for feature in self.feature_collection['features']: 32 | if bbox_contains(feature['bbox'], longitude, latitude): 33 | yield feature['properties']['name'], line 34 | 35 | 36 | if __name__ == '__main__': 37 | # Maybe run the whole thing in a try-catch-finally with counters for error logging? 38 | # might make it easier to debug than pulling down the whole bucket of attempts and 39 | # browsing through the stderr files to find the tracebacks 40 | # http://pythonhosted.org/mrjob/guides/writing-mrjobs.html#counters 41 | GeoExtract.run() 42 | -------------------------------------------------------------------------------- /tweedr/emr/gnip_wc.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | from mrjob.protocol import JSONValueProtocol 3 | import json 4 | 5 | 6 | class WordCount(MRJob): 7 | ''' 8 | The default MRJob.INPUT_PROTOCOL is `RawValueProtocol`, but we are reading tweets, 9 | so we'll add a parser before we even get to the mapper. 
10 | ''' 11 | # incoming line needs to be parsed (I think), so we set a protocol to do so 12 | INPUT_PROTOCOL = JSONValueProtocol 13 | 14 | def mapper(self, key, line): 15 | '''The key to the first mapper in the step-pipeline is always None.''' 16 | 17 | # GNIP-style streams sometimes have metadata lines, but we can just ignore them 18 | if 'info' in line and line['info']['message'] == 'Replay Request Completed': 19 | return 20 | 21 | # GNIP-style tweets have the tweet text in {'body': '...'} instead of the standard {'text': '...'} 22 | if 'body' not in line: 23 | raise Exception('Missing body field in tweet:\n ' + json.dumps(line)) 24 | 25 | text = line['body'] 26 | yield '~~~TOTAL~~~', 1 27 | for token in text.split(): 28 | yield token.lower(), 1 29 | 30 | def combiner(self, key, value_iter): 31 | yield key, sum(value_iter) 32 | 33 | def reducer(self, key, value_iter): 34 | yield key, sum(value_iter) 35 | 36 | 37 | if __name__ == '__main__': 38 | WordCount.run() 39 | -------------------------------------------------------------------------------- /tweedr/lib/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import fnmatch 4 | import random 5 | import subprocess 6 | from copy import copy 7 | 8 | 9 | def mapList(iterable): 10 | return map(list, iterable) 11 | 12 | 13 | def stderr(s): 14 | sys.stderr.write(s) 15 | sys.stderr.flush() 16 | 17 | 18 | def stderrn(s=''): 19 | stderr(str(s) + os.linesep) 20 | 21 | 22 | def stdout(s): 23 | sys.stdout.write(s) 24 | sys.stdout.flush() 25 | 26 | 27 | def stdoutn(s=''): 28 | stdout(str(s) + os.linesep) 29 | 30 | 31 | def tty_size(): 32 | height, width = subprocess.check_output(['stty', 'size']).split() 33 | return (int(height), int(width)) 34 | 35 | 36 | def uniq(xs): 37 | # order preserving. 
From http://www.peterbe.com/plog/uniqifiers-benchmark 38 | seen = {} 39 | checked = [] 40 | for x in xs: 41 | if x in seen: 42 | continue 43 | seen[x] = 1 44 | checked.append(x) 45 | return checked 46 | 47 | 48 | def iglob(pattern, root='.'): 49 | for dirpath, dirnames, filenames in os.walk(root): 50 | filepaths = [os.path.join(dirpath, filename) for filename in filenames] 51 | for filepath in fnmatch.filter(filepaths, pattern): 52 | yield filepath 53 | 54 | 55 | def globfirst(pattern, root='.'): 56 | try: 57 | return iglob(pattern, root).next() 58 | except StopIteration: 59 | return None 60 | 61 | 62 | def walk(top, *predicates): 63 | # predicate(filepath) must be True for each predicate, for each filepath to be returned 64 | for dirpath, dirnames, filenames in os.walk(top): 65 | filepaths = [os.path.join(dirpath, filename) for filename in filenames] 66 | for filepath in filepaths: 67 | if all(predicate(filepath) for predicate in predicates): 68 | yield filepath 69 | 70 | 71 | def bifurcate(xs, ratio, shuffle=False): 72 | ''' 73 | Takes a list like [b, c, a, m, n] and ratio like 0.6 and returns two lists: [b, c, a], [m, n] 74 | 75 | E.g., 76 | 77 | test, train = bifurcate(tokenized_labels, test_proportion, shuffle=True) 78 | ''' 79 | length = len(xs) 80 | pivot = int(ratio * length) 81 | if shuffle: 82 | xs = copy(xs) 83 | random.shuffle(xs) 84 | 85 | return (xs[:pivot], xs[pivot:]) 86 | 87 | 88 | class Counts(object): 89 | def __init__(self): 90 | object.__setattr__(self, '_store', {}) 91 | 92 | def __getattr__(self, name): 93 | return self._store.get(name, 0) 94 | 95 | def __setattr__(self, name, value): 96 | self._store[name] = value 97 | 98 | def empty_copy(self): 99 | other = Counts() 100 | other._store = dict((name, 0) for name in self._store) 101 | return other 102 | 103 | def add(self, other): 104 | for name, value in other._store.items(): 105 | self._store[name] = self._store.get(name, 0) + value 106 | 107 | def __repr__(self): 108 | return '' % ' '.join('%s=%d' % (name, value) for name, value in self._store.items()) 109 | -------------------------------------------------------------------------------- /tweedr/lib/readers.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from StringIO import StringIO 3 | 4 | 5 | def infer(s): 6 | if s.isdigit(): 7 | return int(s) 8 | elif s.isalpha(): 9 | return s 10 | return float(s) 11 | 12 | 13 | def read_simple_csv(path): 14 | rows = [] 15 | with open(path) as csv_fp: 16 | for line in csv_fp: 17 | rows.append([infer(cell) for cell in line.strip().split(',')]) 18 | return rows 19 | 20 | 21 | def read_until(readable, marks): 22 | '''Could stall if mark never happens before the EOF''' 23 | stdout_buffer = StringIO() 24 | while True: 25 | # read in bytes one-by-one because we have to break as soon as we hit 26 | # any `mark` character 27 | byte = readable.read(1) 28 | if byte in marks: 29 | output = stdout_buffer.getvalue() 30 | stdout_buffer.close() 31 | return output 32 | stdout_buffer.write(byte) 33 | 34 | 35 | class SniffingDictReader(csv.DictReader, object): 36 | # csv.DictReader(csvfile, fieldnames=None, restkey=None, restval=None, dialect='excel', *args, **kwds) 37 | def __init__(self, csvfile, restkey=None, restval=None): 38 | sniffer = csv.Sniffer() 39 | # sniff the first line 40 | sample = csvfile.readline() 41 | dialect = sniffer.sniff(sample) 42 | # rewind 43 | csvfile.seek(0) 44 | 45 | super(SniffingDictReader, self).__init__(csvfile, restkey=restkey, restval=restval, 
dialect=dialect) 46 | -------------------------------------------------------------------------------- /tweedr/lib/text.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import re 3 | 4 | token_re = re.compile('\S+') 5 | 6 | punctuation_deletions = [u"'"] 7 | punctuation_elisions = [u'-', u',', u'.', u',', u';', u':', u'|', u'&'] 8 | 9 | punctuation_translations = dict( 10 | [(ord(char), None) for char in punctuation_deletions] + 11 | [(ord(char), u' ') for char in punctuation_elisions]) 12 | 13 | whitespace_unicode_translations = {ord('\t'): u' ', ord('\n'): u' ', ord('\r'): u''} 14 | 15 | 16 | def UpperCamelCase(name): 17 | return re.sub('(^|-|_)(.)', lambda g: g.group(2).upper(), name) 18 | 19 | 20 | def underscore(name): 21 | return re.sub('([A-Z]+)', r'_\1', name).strip('_').lower() 22 | 23 | 24 | def singular(name): 25 | return re.sub('s$', '', name) 26 | 27 | 28 | def utf8str(s): 29 | if isinstance(s, unicode): 30 | return s.encode('utf8') 31 | return s 32 | 33 | 34 | def zip_boundaries(xs, space_len=1): 35 | '''Take a list of strings and iterate through them along with boundary indices. 36 | 37 | >>> tokens = 'Into the void .'.split() 38 | >>> list(zip_boundaries(tokens)) 39 | [('Into', 0, 4), ('the', 5, 8), ('void', 9, 13), ('.', 14, 15)] 40 | ''' 41 | start = 0 42 | for x in xs: 43 | x_len = len(x) 44 | yield x, start, start + x_len 45 | start += x_len + space_len 46 | 47 | 48 | def gloss(alignments, prefixes=None, postfixes=None, width=None, toksep=' ', linesep='\n', groupsep='\n'): 49 | ''' 50 | Creates an interlinear gloss. 51 | 52 | Take a list of [('a', 'DET'), ('beluga', 'N')] and return a string covering multiples lines, like: 53 | a beluga 54 | DET N 55 | each item in `alignments` should have the same length, N 56 | `prefixes`, if provided, should be N-long 57 | `postfixes`, if provided, should be N-long 58 | ''' 59 | if width is None: 60 | width = int(subprocess.check_output(['tput', 'cols'])) 61 | toksep_len = len(toksep) 62 | 63 | # a "group" is a N-line string, each line of which is at most `width` characters 64 | # `groups` is a list of such groups 65 | groups = [] 66 | 67 | def flush_buffer(line_buffer): 68 | if len(line_buffer) > 0: 69 | lines = [toksep.join(tokens) for tokens in line_buffer] 70 | if prefixes: 71 | lines = [prefix + line for prefix, line in zip(prefixes, lines)] 72 | if postfixes: 73 | lines = [line + postfix for postfix, line in zip(postfixes, lines)] 74 | groups.append(linesep.join(lines)) 75 | return [[] for _ in alignments[0]] 76 | 77 | # the line_buffer is an N-long list of lists of tokens (strings) 78 | # [[e1, e2, e3], [f1, f2, f3], [g1, g2, g3]] 79 | line_buffer = flush_buffer([]) 80 | # the line_buffer_width is just the cumulative width of the current line_buffer 81 | line_buffer_width = 0 82 | 83 | for aligned in alignments: 84 | aligned = map(str, aligned) 85 | length = max(map(len, aligned)) 86 | line_buffer_width += toksep_len + length 87 | if line_buffer_width >= width: 88 | line_buffer = flush_buffer(line_buffer) 89 | line_buffer_width = length 90 | for i, token in enumerate(aligned): 91 | line_buffer[i].append(token.ljust(length)) 92 | 93 | flush_buffer(line_buffer) 94 | 95 | return groupsep.join(groups) 96 | -------------------------------------------------------------------------------- /tweedr/lib/timeout.py: -------------------------------------------------------------------------------- 1 | '''This module is mostly from github.com/chbrown/remoting 2 | See that 
repository's readme and /remoting/timeout.py 3 | 4 | This is basically how timeouts in Python work: 5 | 6 | Use `signal.signal` to queue up a function to run after a specified amount of 7 | time. This function's sole purpose is to raise an exception. 8 | 9 | You run your target method, the `func` arg to this decorate() method. 10 | Two things can happen from here: 11 | a. Your function finishes before the timeout period. In that case, immediately tell `signal.signal` "just kidding, dont run that function after all." We cancel the scheduled signal from step 1, and put the old handler back in place. 12 | b. Your function does not finish in time, TimeoutError is raised, and you have to catch it somewhere upstream. 13 | 14 | ''' 15 | import signal 16 | 17 | 18 | class TimeoutError(Exception): 19 | def __call__(self, signum, frame): 20 | self.args 21 | raise self 22 | 23 | def __repr__(self): 24 | return '%s(%s)' % (self.__class__.__name__, self.message) 25 | 26 | 27 | def timeout_after(seconds): 28 | '''Closures in python are so beautiful.''' 29 | def decorate(func): 30 | def wrapper(*args, **kw): 31 | new_ALRM = TimeoutError('Timed out after %d seconds.' % seconds) 32 | old_ALRM = signal.signal(signal.SIGALRM, new_ALRM) 33 | signal.alarm(seconds) 34 | try: 35 | result = func(*args, **kw) 36 | # we don't handle the error here 37 | finally: 38 | # but we do put the old handler back in place 39 | signal.signal(signal.SIGALRM, old_ALRM) 40 | signal.alarm(0) 41 | return result 42 | wrapper.func_name = func.__name__ 43 | return wrapper 44 | return decorate 45 | 46 | 47 | def example(): 48 | '''Usage example. Should be doctests?''' 49 | import os 50 | 51 | @timeout_after(5) 52 | def waiter_task(seconds): 53 | os.system('sleep %d' % seconds) 54 | return 'Waited %ds successfully' % seconds 55 | 56 | print waiter_task(2) # --> prints 'Waited 2s successfully' 57 | print waiter_task(7) # --> throws 58 | 59 | 60 | if __name__ == '__main__': 61 | print example() 62 | -------------------------------------------------------------------------------- /tweedr/ml/__init__.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | from tweedr.lib import Counts 3 | 4 | 5 | def print_metrics_summary(gold_labels, predicted_labels, sample=0): 6 | print ''' Accuracy: {accuracy} 7 | P/R: {precision:.4f}/{recall:.4f} 8 | F1: {fscore:.4f}'''.format( 9 | accuracy=metrics.accuracy_score(gold_labels, predicted_labels), 10 | precision=metrics.precision_score(gold_labels, predicted_labels), 11 | recall=metrics.recall_score(gold_labels, predicted_labels), 12 | fscore=metrics.f1_score(gold_labels, predicted_labels) 13 | ) 14 | 15 | if sample > 0: 16 | print 'Sample of classifications ' 17 | for _, gold, predicted in zip(xrange(sample), gold_labels, predicted_labels): 18 | print ' gold: {gold}, predicted: {predicted}'.format(gold=gold, predicted=predicted) 19 | 20 | 21 | def compare_labels(gold_labels, predicted_labels, null_label): 22 | # produces a Counts object with values: 23 | # .true_positives 24 | # .false_negatives 25 | # .true_negatives 26 | # .false_positives 27 | # .comparisons = SUM of the others 28 | counts = Counts() 29 | for gold_label, predicted_label in zip(gold_labels, predicted_labels): 30 | counts.comparisons += 1 31 | if gold_label != null_label: 32 | if predicted_label == gold_label: 33 | counts.true_positives += 1 34 | else: 35 | counts.false_negatives += 1 36 | 37 | if gold_label == null_label: 38 | if predicted_label == gold_label: 39 | 
counts.true_negatives += 1 40 | else: 41 | counts.false_positives += 1 42 | 43 | return counts 44 | -------------------------------------------------------------------------------- /tweedr/ml/build_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | # import os 2 | import argparse 3 | 4 | from sklearn import cross_validation # , metrics 5 | # from sklearn.pipeline import Pipeline 6 | # from sklearn.feature_extraction import text 7 | 8 | import pylab as pl 9 | from tweedr.models import DBSession, TokenizedLabel, Label 10 | from tweedr.ml.crf.classifier import CRF 11 | from tweedr.ml.features import crf_feature_functions, featurize 12 | from sklearn.metrics import confusion_matrix 13 | 14 | import logging 15 | logger = logging.getLogger(__name__) 16 | 17 | flatMap = lambda iterable: map(list, iterable) 18 | 19 | 20 | def evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts): 21 | '''If you use print_gloss, your test_y better be lists, not iterables.''' 22 | logger.info('Training on %d, testing on %d', len(train_y), len(test_y)) 23 | classifier.fit(train_X, train_y) 24 | predicted_y = classifier.predict(test_X) 25 | # flatten 26 | test_y = sum(test_y, []) 27 | predicted_y = sum(predicted_y, []) 28 | #counts = compare_labels(test_y, predicted_y, 'None') 29 | 30 | gold_labels = [] 31 | predicted_labels = [] 32 | i = 0 33 | diction = {} 34 | 35 | j = 0 36 | 37 | while j < len(test_y): 38 | try: 39 | diction[test_y[j]] += 1 40 | except KeyError: 41 | diction[test_y[j]] = 1 42 | j = j + 1 43 | 44 | if (opts.include_none == 0): 45 | while i < len(test_y): 46 | if (test_y[i] == "None" and predicted_y[i] == "None"): 47 | pass 48 | else: 49 | try: 50 | if diction[test_y[i]] > opts.threshold: 51 | gold_labels.append(test_y[i]) 52 | predicted_labels.append(predicted_y[i]) 53 | except KeyError: 54 | pass 55 | i = i + 1 56 | 57 | cm = confusion_matrix(gold_labels, predicted_labels) 58 | print "Confusion Matrix" 59 | print cm 60 | pl.matshow(cm) 61 | pl.title('Confusion Matrix') 62 | pl.colorbar() 63 | pl.savefig("confusion_matrix" + str(index) + '.png', format='png') 64 | pl.clf() 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser( 69 | description='Train CRFSuite on data from the QCRI MySQL database', 70 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 71 | parser.add_argument('-k', '--k-folds', 72 | type=int, default=10, help='How many folds of the data to test on') 73 | parser.add_argument('--max-data', 74 | type=int, default=10000, help='Maximum data points to train and test on') 75 | parser.add_argument('--include-none', type=int, default=0, help='Include None in Confusion Matrix.') 76 | parser.add_argument('-threshold', type=int, default=10, help='Threshold for number of gold labels classified.') 77 | opts = parser.parse_args() 78 | 79 | # e.g., tokenized_label = 80 | # 82 | # Train and test must be iterables of objects that support CRF-ready 83 | # .tokens and .labels attributes. 
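    # Illustrative shape only (not real data): featurize() yields one feature list
    # per token, so a two-token tweet might come out as
    #   [['bridge', 'BUILDING', 'UNIQUE', ...], ['collapsed', 'UNIQUE', ...]]
    # and the matching item.labels is a same-length list of label strings.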
84 | query = DBSession.query(TokenizedLabel).limit(opts.max_data) 85 | X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query) 86 | # unzip and flatten into static list 87 | X, y = zip(*X_y) 88 | # we need to read X multiple times, so make sure it's all static 89 | X = map(flatMap, X) 90 | 91 | categories = dict((label.id, label.text) for label in DBSession.query(Label)) 92 | print 'categories', categories 93 | 94 | N = len(y) 95 | index = 0 96 | for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True): 97 | # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices] 98 | train_X = [X[i] for i in train_indices] 99 | train_y = [y[i] for i in train_indices] 100 | test_X = [X[i] for i in test_indices] 101 | test_y = [y[i] for i in test_indices] 102 | classifier = CRF() 103 | # print_gloss=True 104 | index = index + 1 105 | evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, index, opts) 106 | 107 | if __name__ == '__main__': 108 | main() 109 | -------------------------------------------------------------------------------- /tweedr/ml/classifier.py: -------------------------------------------------------------------------------- 1 | '''I recommend emulating the scikit-learn interface, with or without 2 | ClassifierI because fit and predict are more descriptive names than append_raw 3 | and save, etc. In the CRF case, it's less transparent how to access the 4 | underlying tagger/trainer but I think as long as it's following the sklearn 5 | paradigm, an opaque wrapper is okay. 6 | ''' 7 | from sklearn import base 8 | 9 | import logging 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class ClassifierI(base.ClassifierMixin): 14 | ''' 15 | Interface to emulate sklearn classifiers. 16 | 17 | * `X`: an iterable of data points, each of which might be a point in many-dimensional space, a list of strings, etc. 18 | * `y`: an iterable of discrete labels, each of which may be a string, or a True/False value, or just an integer (not a float). 
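    A minimal subclass sketch (illustrative only), showing the expected contract:

        class MajorityClassifier(ClassifierI):
            def fit(self, X, y):
                labels = list(y)
                self.majority = max(set(labels), key=labels.count)

            def predict(self, X):
                return [self.majority for _ in X]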
19 | ''' 20 | def __init__(self, *args, **kw): 21 | pass 22 | 23 | def fit(self, X, y): 24 | '''Fit the model according to the given training data.''' 25 | raise NotImplementedError(__doc__) 26 | 27 | def fit_transform(self, X, y=None): 28 | '''Fit to some data, then transform it''' 29 | self.fit(X, y) 30 | return self.transform(X) 31 | 32 | def get_params(self, deep=False): 33 | '''Get parameters for the estimator''' 34 | raise NotImplementedError(__doc__) 35 | 36 | def predict(self, X): 37 | '''Predict class labels for samples in X.''' 38 | raise NotImplementedError(__doc__) 39 | 40 | # def score(self, X, y): 41 | # '''Returns the mean accuracy on the given test data and labels.''' 42 | # raise NotImplementedError(__doc__) 43 | 44 | def set_params(self, **params): 45 | '''Set the parameters of the estimator.''' 46 | raise NotImplementedError(__doc__) 47 | 48 | def transform(self, X, threshold=None): 49 | '''Reduce X to its most important features.''' 50 | raise NotImplementedError(__doc__) 51 | -------------------------------------------------------------------------------- /tweedr/ml/crf/__init__.py: -------------------------------------------------------------------------------- 1 | from tweedr.lib.text import utf8str 2 | import crfsuite 3 | 4 | 5 | class ItemSequence(crfsuite.ItemSequence): 6 | def __init__(self, features_iter, check=False): 7 | '''Create new ItemSequence, typedef std::vector based on the 8 | given iterable of iterable of 2-tuples or strings. 9 | If check=True, any unicode present in the given features_iter 10 | will be encoded into a bytestring as utf8.''' 11 | super(ItemSequence, self).__init__() 12 | self.append_raw(features_iter, check=check) 13 | 14 | def append_raw(self, features_iter, check=False): 15 | ''' 16 | @features_iter is an iterable of iterables, of tuples or strings. 17 | type: [[(str, float) | str]], where [] is an iterable 18 | ''' 19 | for features in features_iter: 20 | if check: 21 | features = map(utf8str, features) 22 | item = crfsuite.Item() 23 | for feature in features: 24 | if isinstance(feature, tuple): 25 | attribute = crfsuite.Attribute(*feature) 26 | else: 27 | attribute = crfsuite.Attribute(feature) 28 | item.append(attribute) 29 | self.append(item) 30 | -------------------------------------------------------------------------------- /tweedr/ml/crf/classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import crfsuite 4 | from tweedr.ml.crf import ItemSequence 5 | from tweedr.ml.classifier import ClassifierI 6 | from tweedr.ml.features import featurize 7 | 8 | from itertools import izip 9 | 10 | import logging 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class CRF(ClassifierI): 15 | ''' 16 | Doesn't fit entirely within the classifier paradigm, due to the hierarchy of data: 17 | Sentences have each token labeled, but each sentence is an individual entity. 
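    Rough usage sketch (variable names are illustrative):

        crf = CRF()
        crf.fit(train_X, train_y)          # X: per-sentence lists of token feature lists
        predicted_y = crf.predict(test_X)  # returns per-sentence lists of label strings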
18 | ''' 19 | def __init__(self, algorithm='l2sgd', type_='crf1d'): 20 | self.trainer = crfsuite.Trainer() 21 | self.trainer.select(algorithm, type_) 22 | # default parameters: 23 | self.trainer.set('c2', '0.1') 24 | 25 | def fit(self, X, y): 26 | # For a CRF, X is an iterable of lists of lists of features (=strings) 27 | # and y is a list of list of token labels (=strings) 28 | for features_iter, labels in zip(X, y): 29 | items = ItemSequence(features_iter, check=True) 30 | self.trainer.append(items, tuple(labels), 0) 31 | 32 | self.model_filepath = tempfile.NamedTemporaryFile(delete=False).name 33 | self.trainer.train(self.model_filepath, -1) 34 | # persist to file and pull it back out. 35 | self.tagger = crfsuite.Tagger() 36 | self.tagger.open(self.model_filepath) 37 | 38 | def get_params(self, help=False): 39 | params = self.trainer.params() 40 | return dict((name, self.trainer.help(name) if help else self.trainer.get(name)) for name in params) 41 | 42 | def predict(self, X): 43 | y = [] 44 | for features_iter in X: 45 | # maybe use self.predict_one(features_iter) instead? 46 | items = ItemSequence(features_iter, check=True) 47 | # this will just die if self.tagger has not been set 48 | self.tagger.set(items) 49 | # could also run self.probability() and self.marginal() 50 | # convert tuple (output of viterbi()) to list 51 | labels = list(self.tagger.viterbi()) 52 | y.append(labels) 53 | return y 54 | 55 | def set_params(self, **params): 56 | for name, value in params.item(): 57 | self.trainer.set(name, value) 58 | 59 | # additional fields below are not required by ClassifierI 60 | def predict_one(self, features_iter): 61 | items = ItemSequence(features_iter, check=True) 62 | self.tagger.set(items) 63 | return list(self.tagger.viterbi()) 64 | 65 | def save(self, model_filepath): 66 | logger.debug('Saving model to %s', model_filepath) 67 | # just die if self.model_filepath doesn't exist 68 | os.rename(self.model_filepath, model_filepath) 69 | self.model_filepath = model_filepath 70 | 71 | @classmethod 72 | def from_file(cls, model_filepath): 73 | '''If we are given a model_filepath that points to an existing file, use it. 74 | otherwise, create a temporary file to store the model because CRFSuite 75 | doesn't seem to allow us to create a tagger directly from a trained 76 | trainer object.''' 77 | # cls = CRF, obviously 78 | crf = cls() 79 | crf.tagger = crfsuite.Tagger() 80 | logger.debug('Loading existing model from %s', model_filepath) 81 | crf.tagger.open(model_filepath) 82 | crf.model_filepath = model_filepath 83 | 84 | return crf 85 | 86 | @classmethod 87 | def from_data(cls, data, feature_functions): 88 | '''data must be an iterable of objects with .tokens and .labels attributes.''' 89 | crf = cls() 90 | X_y = ((featurize(datum.tokens, feature_functions), datum.labels) for datum in data) 91 | X, y = izip(*X_y) 92 | # X (and y) are iterables, by the way 93 | 94 | logger.debug('Fitting CRF') 95 | crf.fit(X, y) 96 | 97 | return crf 98 | 99 | @classmethod 100 | def default(cls, feature_functions, retrain=False, limit=10000): 101 | # Is it messy to have this method here, since it depends on tweedr.models.*? 102 | # and on a specific filepath in the local filesystem? 
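        # Note: with the default limit this caches to
        # /tmp/tweedr.ml.crf.classifier-max10000.model, and an existing model file
        # always wins -- the retrain flag is not consulted below.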
103 | model_filepath = '/tmp/tweedr.ml.crf.classifier-max%d.model' % limit 104 | if os.path.exists(model_filepath): 105 | return cls.from_file(model_filepath) 106 | else: 107 | from tweedr.models import DBSession, TokenizedLabel 108 | query = DBSession.query(TokenizedLabel).limit(10000) 109 | crf = cls.from_data(query, feature_functions) 110 | crf.save(model_filepath) 111 | return crf 112 | -------------------------------------------------------------------------------- /tweedr/ml/crf/wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import crfsuite 3 | import tempfile 4 | 5 | from tweedr.ml.crf import ItemSequence 6 | from tweedr.ml.features import featurize 7 | 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Trainer(crfsuite.Trainer): 13 | """ 14 | Inherit crfsuite.Trainer to implement message() function, which receives 15 | progress messages from a training process. 16 | """ 17 | def message(self, s): 18 | logger.silly('Trainer.message: %s', s.strip()) 19 | 20 | def append_raw(self, features_iter, labels): 21 | # len(labels) = len(features_iter) = length of sentence / sequence 22 | # labels is a tuple of strings, features_iter is an tuple/list of variable-length lists of strings. 23 | # this just wraps all the data / labels with crfsuite types 24 | items = ItemSequence(features_iter) 25 | # labels = crfsuite.StringList(labels) 26 | self.append(items, tuple(labels), 0) 27 | 28 | def save(self, model_path): 29 | # Trainer.select(algorithm, type): Initialize the training algorithm and set type of graphical model 30 | # lbfgs is the default algorithm 31 | # l2sgd is L2-regularized SGD 32 | # crf1d is 1st-order dyad features. 33 | self.select('l2sgd', 'crf1d') 34 | 35 | # Set the coefficient for L2 regularization to 0.1 36 | # potential values change based on algorithm previously selected 37 | # See http://www.chokkan.org/software/crfsuite/manual.html 38 | self.set('c2', '0.1') 39 | 40 | # Start training; the training process will invoke trainer.message() 41 | # to report the progress. 42 | self.train(model_path, -1) 43 | 44 | # print 'After training: params and their values' 45 | # for name in trainer.params(): 46 | # print name, trainer.get(name), trainer.help(name) 47 | 48 | 49 | class Tagger(crfsuite.Tagger): 50 | def __init__(self, model_path): 51 | super(Tagger, self).__init__() 52 | self.open(model_path) 53 | 54 | def tag_raw(self, features_iter): 55 | ''' 56 | Obtain the label sequence predicted by the tagger. 57 | 58 | This returns a tuple of strings (label identifiers) 59 | ''' 60 | items = ItemSequence(features_iter) 61 | self.set(items) 62 | # could also run self.probability() and self.marginal() 63 | return self.viterbi() 64 | 65 | @classmethod 66 | def from_path_or_data(cls, data, feature_functions, model_filepath=None): 67 | '''If we are given a model_filepath that points to an existing file, use it. 
68 | otherwise, create a temporary file to store the model because CRFSuite 69 | doesn't seem to allow us to create a tagger directly from a trained 70 | trainer object.''' 71 | if model_filepath is None or not os.path.exists(model_filepath): 72 | if model_filepath is None: 73 | model_filepath = tempfile.NamedTemporaryFile(delete=False).name 74 | 75 | trainer = Trainer() 76 | for i, datum in enumerate(data): 77 | tokens = datum.tokens 78 | labels = datum.labels 79 | 80 | tokens_features = featurize(tokens, feature_functions) 81 | trainer.append_raw(tokens_features, labels) 82 | 83 | trainer.save(model_filepath) 84 | logger.debug('Trained on %d instances and saved to %s', i, model_filepath) 85 | else: 86 | logger.debug('Loading existing model from %s', model_filepath) 87 | 88 | return cls(model_filepath) 89 | -------------------------------------------------------------------------------- /tweedr/ml/evaluate.py: -------------------------------------------------------------------------------- 1 | # import os 2 | import argparse 3 | from colorama import Fore 4 | 5 | from sklearn import cross_validation # , metrics 6 | # from sklearn.pipeline import Pipeline 7 | # from sklearn.feature_extraction import text 8 | 9 | from tweedr.lib.text import gloss 10 | from tweedr.models import DBSession, TokenizedLabel 11 | from tweedr.ml import compare_labels # print_metrics_summary 12 | from tweedr.ml.crf.classifier import CRF 13 | from tweedr.ml.features import crf_feature_functions, featurize, featurize_adjacent 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | 18 | flatMap = lambda iterable: map(list, iterable) 19 | 20 | 21 | def evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, print_gloss=False): 22 | '''If you use print_gloss, your test_y better be lists, not iterables.''' 23 | logger.info('Training on %d, testing on %d', len(train_y), len(test_y)) 24 | classifier.fit(train_X, train_y) 25 | predicted_y = classifier.predict(test_X) 26 | 27 | if print_gloss: 28 | for tokens_features, gold_labels, predicted_labels in zip(test_X, test_y, predicted_y): 29 | print '-' * 80 30 | # hope that the first feature string is the unigram! 31 | tokens = [token_features[0] for token_features in tokens_features] 32 | print gloss(zip(tokens, gold_labels, predicted_labels), 33 | prefixes=(Fore.WHITE, Fore.YELLOW, Fore.BLUE), 34 | postfixes=(Fore.RESET, Fore.RESET, Fore.RESET)) 35 | 36 | # flatten 37 | test_y = sum(test_y, []) 38 | predicted_y = sum(predicted_y, []) 39 | counts = compare_labels(test_y, predicted_y, 'None') 40 | print 'counts', counts 41 | 42 | # sklearn metrics doesn't like string labels. 43 | # used_labels = list(set(gold_labels + predicted_labels)) 44 | # print 'used_labels', used_labels 45 | # lookup = dict((label, index) for index, label in enumerate(used_labels)) 46 | # print 'lookup', lookup 47 | # remap to integers 48 | # gold_labels = [lookup[gold_label] for gold_label in gold_labels] 49 | # predicted_labels = [lookup[predicted_label] for predicted_label in predicted_labels] 50 | 51 | # print_metrics_summary(gold_labels, predicted_labels) 52 | # classification_report requires numeric labels, apparently? 
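    # (the hand-rolled precision/recall/F-score below are computed straight from the
    # Counts object, with 'None' treated as the negative class)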
53 | # print metrics.classification_report(gold_labels, predicted_labels) 54 | 55 | precision = float(counts.true_positives) / (counts.true_positives + counts.false_positives) 56 | recall = float(counts.true_positives) / (counts.true_positives + counts.false_negatives) 57 | fscore = 2 * (precision * recall / (precision + recall)) 58 | for name, value in [('Precision', precision), ('Recall', recall), ('F-score', fscore)]: 59 | print '%s: %.4f' % (name, value) 60 | 61 | 62 | def main(): 63 | parser = argparse.ArgumentParser( 64 | description='Train CRFSuite on data from the QCRI MySQL database', 65 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 66 | parser.add_argument('-k', '--k-folds', 67 | type=int, default=10, help='How many folds of the data to test on') 68 | parser.add_argument('--max-data', 69 | type=int, default=10000, help='Maximum data points to train and test on') 70 | parser.add_argument('--adjacent', 71 | type=int, default=0, help='Set adjacent to 1 if adjacent functions want to be used') 72 | opts = parser.parse_args() 73 | 74 | # e.g., tokenized_label = 75 | # 77 | # Train and test must be iterables of objects that support CRF-ready 78 | # .tokens and .labels attributes. 79 | query = DBSession.query(TokenizedLabel).\ 80 | filter(TokenizedLabel.tweet is not None).\ 81 | filter(TokenizedLabel.tweet != '').\ 82 | limit(opts.max_data) 83 | if (opts.adjacent == 0): 84 | X_y = ((featurize(item.tokens, crf_feature_functions), item.labels) for item in query) 85 | else: 86 | X_y = ((featurize_adjacent(item.tokens, crf_feature_functions), item.labels) for item in query) 87 | # unzip and flatten into static list 88 | X, y = zip(*X_y) 89 | # we need to read X multiple times, so make sure it's all static 90 | X = map(flatMap, X) 91 | 92 | N = len(y) 93 | for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True): 94 | # train, test = tokenized_labels[train_indices], tokenized_labels[test_indices] 95 | train_X = [X[i] for i in train_indices] 96 | train_y = [y[i] for i in train_indices] 97 | test_X = [X[i] for i in test_indices] 98 | test_y = [y[i] for i in test_indices] 99 | classifier = CRF() 100 | # print_gloss=True 101 | evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y) 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /tweedr/ml/evaluate_combinations.py: -------------------------------------------------------------------------------- 1 | # import os 2 | import argparse 3 | from colorama import Fore 4 | import itertools 5 | from sklearn import cross_validation # , metrics 6 | # from sklearn.pipeline import Pipeline 7 | # from sklearn.feature_extraction import text 8 | 9 | from tweedr.lib.text import gloss 10 | from tweedr.models import DBSession, TokenizedLabel, Label 11 | from tweedr.ml import compare_labels # print_metrics_summary 12 | from tweedr.ml.crf.classifier import CRF 13 | from tweedr.ml.features import crf_feature_functions, featurize 14 | 15 | import logging 16 | logger = logging.getLogger(__name__) 17 | 18 | flatMap = lambda iterable: map(list, iterable) 19 | 20 | 21 | def evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y, print_gloss=False): 22 | '''If you use print_gloss, your test_y better be lists, not iterables.''' 23 | try: 24 | logger.info('Training on %d, testing on %d', len(train_y), len(test_y)) 25 | classifier.fit(train_X, train_y) 26 | predicted_y = classifier.predict(test_X) 27 | 28 | if 
print_gloss: 29 | for tokens_features, gold_labels, predicted_labels in zip(test_X, test_y, predicted_y): 30 | print '-' * 80 31 | # hope that the first feature string is the unigram! 32 | tokens = [token_features[0] for token_features in tokens_features] 33 | print gloss(zip(tokens, gold_labels, predicted_labels), 34 | fixes=(Fore.WHITE, Fore.YELLOW, Fore.BLUE), 35 | postfixes=(Fore.RESET, Fore.RESET, Fore.RESET)) 36 | 37 | # flatten 38 | test_y = sum(test_y, []) 39 | predicted_y = sum(predicted_y, []) 40 | counts = compare_labels(test_y, predicted_y, 'None') 41 | print 'counts', counts 42 | 43 | # sklearn metrics doesn't like string labels. 44 | # used_labels = list(set(gold_labels + predicted_labels)) 45 | # print 'used_labels', used_labels 46 | # lookup = dict((label, index) for index, label in enumerate(used_labels)) 47 | # print 'lookup', lookup 48 | # remap to integers 49 | # gold_labels = [lookup[gold_label] for gold_label in gold_labels] 50 | # predicted_labels = [lookup[predicted_label] for predicted_label in predicted_labels] 51 | 52 | # print_metrics_summary(gold_labels, predicted_labels) 53 | # classification_report requires numeric labels, apparently? 54 | # print metrics.classification_report(gold_labels, predicted_labels) 55 | try: 56 | precision = float(counts.true_positives) / (counts.true_positives + counts.false_positives) 57 | recall = float(counts.true_positives) / (counts.true_positives + counts.false_negatives) 58 | fscore = 2 * (precision * recall / (precision + recall)) 59 | for name, value in [('Precision', precision), ('Recall', recall), ('F-score', fscore)]: 60 | print '%s: %.4f' % (name, value) 61 | except ZeroDivisionError: 62 | pass 63 | except IOError: 64 | pass 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser( 69 | description='Train CRFSuite on data from the QCRI MySQL database', 70 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 71 | parser.add_argument('-k', '--k-folds', 72 | type=int, default=10, help='How many folds of the data to test on') 73 | parser.add_argument('--max-data', 74 | type=int, default=10000, help='Maximum data points to train and test on') 75 | opts = parser.parse_args() 76 | 77 | # e.g., tokenized_label = 78 | # 80 | # Train and test must be iterables of objects that support CRF-ready 81 | # .tokens and .labels attributes. 
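    # Note: the loops below enumerate every subset of crf_feature_functions
    # (2 ** len(crf_feature_functions) subsets, i.e. 512 with the 9 functions in
    # tweedr.ml.features.sets), and each subset is cross-validated k times.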
82 | query = DBSession.query(TokenizedLabel).limit(opts.max_data) 83 | 84 | for L in range(0, len(crf_feature_functions) + 1): 85 | for subset in itertools.combinations(crf_feature_functions, L): 86 | sub = list(subset) 87 | print sub 88 | X_y = ((featurize(item.tokens, sub), item.labels) for item in query) 89 | # unzip and flatten into static list 90 | X, y = zip(*X_y) 91 | # we need to read X multiple times, so make sure it's all static 92 | X = map(flatMap, X) 93 | categories = dict((label.id, label.text) for label in DBSession.query(Label)) 94 | print 'categories', categories 95 | 96 | N = len(y) 97 | #tests on different data sets -> k folds is set to 10 right now 98 | for train_indices, test_indices in cross_validation.KFold(N, opts.k_folds, shuffle=True): 99 | train_X = [X[i] for i in train_indices] 100 | train_y = [y[i] for i in train_indices] 101 | test_X = [X[i] for i in test_indices] 102 | test_y = [y[i] for i in test_indices] 103 | classifier = CRF() 104 | evaluateSequenceClassifier(classifier, train_X, train_y, test_X, test_y) 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /tweedr/ml/features/__init__.py: -------------------------------------------------------------------------------- 1 | # each feature function takes an N-long document (list of strings) and returns an N-long list 2 | # of lists/tuples of features (i.e., strings) to add to the total data for that sentence. 3 | # often the list will contain lists that are 1-long 4 | from itertools import izip, chain 5 | 6 | 7 | def spacer(xs): 8 | return [' '.join(xs)] 9 | 10 | 11 | def featurize_adjacent(tokens, feature_functions): 12 | feature_functions_results = [feature_function(tokens) for feature_function in feature_functions] 13 | list_of_token_features = [] 14 | #add token features 15 | for token_featuress in izip(*feature_functions_results): 16 | list_of_token_features.append(list(chain.from_iterable(token_featuress))) 17 | #add features to the left and to the right 18 | i = 0 19 | while i < len(list_of_token_features): 20 | j = list_of_token_features[i] 21 | it = [k for k in j] 22 | if i > 0: 23 | a = list_of_token_features[i - 1] 24 | c = ['^^^' + k for k in a] 25 | try: 26 | c.pop(0) 27 | except IndexError: 28 | pass 29 | it += c 30 | 31 | if i < len(list_of_token_features) - 1: 32 | b = list_of_token_features[i + 1] 33 | d = ['$$$' + k for k in b] 34 | try: 35 | d.pop(0) 36 | except IndexError: 37 | pass 38 | it += d 39 | i = i + 1 40 | yield chain.from_iterable([it]) 41 | 42 | 43 | def featurize(tokens, feature_functions): 44 | '''Take a N-long list of strings (natural text), apply each feature function, 45 | and then unzip (transpose) and flatten so that we get a N-long list of 46 | arbitrarily-long lists of strings. 47 | ''' 48 | feature_functions_results = [feature_function(tokens) for feature_function in feature_functions] 49 | for token_featuress in izip(*feature_functions_results): 50 | yield chain.from_iterable(token_featuress) 51 | 52 | 53 | def featurize_to_dict(tokens, feature_functions): 54 | '''Take a N-long list of strings (natural text), apply each feature function, 55 | create N-long list of dicts with keys that are the names of feature functions, 56 | and values that are the joined output of those functions. 
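    Illustrative example (not a doctest), using unigrams and capitalized from
    tweedr.ml.features.ngrams and tweedr.ml.features.characters:

        list(featurize_to_dict(['red', 'cross'], [unigrams, capitalized]))
        => [{'unigrams': 'red'}, {'unigrams': 'cross'}]

    The empty 'capitalized' result for lowercase tokens is dropped entirely.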
57 | ''' 58 | feature_functions_results = [feature_function(tokens) for feature_function in feature_functions] 59 | for token_featuress in izip(*feature_functions_results): 60 | token_feature_dict = dict() 61 | for feature_function, token_features in zip(feature_functions, token_featuress): 62 | token_feature_string = ' '.join(token_features) 63 | if token_feature_string: 64 | token_feature_dict[feature_function.__name__] = token_feature_string 65 | yield token_feature_dict 66 | 67 | 68 | def main(): 69 | # example usage: 70 | # echo "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place." | python __init__.py 71 | import sys 72 | from tweedr.lib.text import token_re 73 | from tweedr.ml.features.sets import all_feature_functions 74 | for line in sys.stdin: 75 | # tokenize the document on whitespace 76 | tokens = token_re.findall(line) 77 | # apply all feature functions 78 | tokens_features = featurize(tokens, all_feature_functions) 79 | for i, token_features in enumerate(tokens_features): 80 | print i, list(token_features) 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /tweedr/ml/features/characters.py: -------------------------------------------------------------------------------- 1 | def capitalized(document): 2 | return [['CAPITALIZED'] if token[0].isupper() else [] for token in document] 3 | 4 | 5 | def plural(document): 6 | return [['PLURAL'] if token.endswith('s') else [] for token in document] 7 | 8 | 9 | def numeric(document): 10 | return [['NUMERIC'] if token.isdigit() else [] for token in document] 11 | 12 | 13 | def includes_numeric(document): 14 | return [['INCLUDES_NUMERIC'] if any(char.isdigit() for char in token) else [] for token in document] 15 | -------------------------------------------------------------------------------- /tweedr/ml/features/dbpedia.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from tweedr.ml.spotlight import annotate 4 | from tweedr.lib.text import zip_boundaries 5 | 6 | spotlight_annotate_url = '%s/rest/annotate' % os.environ.get('SPOTLIGHT', 'http://spotlight.sztaki.hu:2222') 7 | 8 | 9 | def get_pos(offset, document): 10 | doc_joined = " ".join(document) 11 | beginning = doc_joined[:offset] 12 | length = len(beginning.split(" ")) - 1 13 | return length 14 | 15 | 16 | def features(document): 17 | doc_length = len(document) 18 | doc_joined = " ".join(document) 19 | positions = [[] for x in xrange(doc_length)] 20 | try: 21 | annotations = annotate('http://tweedr.dssg.io:2222/rest/annotate', doc_joined, confidence=0.4, support=20) 22 | for a in annotations: 23 | offset = a["offset"] 24 | type = a["types"] 25 | all_types = type.split(",") 26 | dbpedia_type = all_types[0] 27 | pos = get_pos(offset, document) 28 | db = str(dbpedia_type) 29 | positions[pos] = [db.upper()] 30 | except Exception: 31 | return positions 32 | return positions 33 | 34 | 35 | def spotlight(document, confidence=0.1, support=10): 36 | document_string = u' '.join(document) 37 | r = requests.post(spotlight_annotate_url, 38 | headers=dict(Accept='application/json'), 39 | data=dict(text=document_string, confidence=confidence, support=support)) 40 | Resources = r.json().get('Resources', []) 41 | for token, token_start, token_end in zip_boundaries(document): 42 | labels = [] 43 | for Resource in Resources: 44 | entity_start = 
int(Resource['@offset']) 45 | entity_end = entity_start + len(Resource['@surfaceForm']) 46 | 47 | if entity_start <= token_start <= entity_end or entity_start <= token_end <= entity_end: 48 | entity_uri = Resource['@URI'] 49 | entity_types = Resource['@types'].split(',') 50 | labels += [entity_uri] + entity_types 51 | yield labels 52 | -------------------------------------------------------------------------------- /tweedr/ml/features/lexicons.py: -------------------------------------------------------------------------------- 1 | from tweedr.ml import wordnet, lexicon_list 2 | 3 | 4 | def is_transportation(document): 5 | return [['TRANSPORTATION'] if token in lexicon_list.transportation else [] for token in document] 6 | 7 | 8 | def is_building(document): 9 | return [['BUILDING'] if token in lexicon_list.buildings else [] for token in document] 10 | 11 | 12 | def hypernyms(document, recursive=True, depth=1): 13 | '''Iterate through all senses for all 1-away hypernyms. E.g.: 14 | 15 | print map(list, hypernyms(document)) 16 | ''' 17 | for token in document: 18 | yield wordnet.token_hypernyms(token, recursive, depth) 19 | -------------------------------------------------------------------------------- /tweedr/ml/features/ngrams.py: -------------------------------------------------------------------------------- 1 | from tweedr.ml.features import spacer 2 | 3 | 4 | def unigrams(document): 5 | return [[token] for token in document] 6 | 7 | 8 | def rbigrams(document): 9 | grams = zip(document, document[1:] + ['$$$']) 10 | return map(spacer, grams) 11 | 12 | 13 | def lbigrams(document): 14 | grams = zip(['^^^'] + document[:-1], document) 15 | return map(spacer, grams) 16 | 17 | 18 | def ctrigrams(document): 19 | grams = zip(['^^^'] + document[:-1], document, document[1:] + ['$$$']) 20 | return map(spacer, grams) 21 | 22 | 23 | def unique(document): 24 | # TODO: unique doesn't really belong here, but doesn't quite merit its own module 25 | seen = {} 26 | features = [] 27 | for token in document: 28 | features.append(['UNIQUE'] if token not in seen else []) 29 | seen[token] = 1 30 | return features 31 | -------------------------------------------------------------------------------- /tweedr/ml/features/nlp.py: -------------------------------------------------------------------------------- 1 | # the tagger is global, powered by the singleton module in tweedr.ml.ark 2 | from tweedr.ark.java.singleton import tagger 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def pos_tags(document): 9 | text = ' '.join(document) 10 | tokens_line, tags_line = tagger.tokenize_and_tag(text) 11 | tokens = tokens_line.split() 12 | tags = tags_line.split() 13 | 14 | if not (len(document) == len(tokens) == len(tags)): 15 | # TODO: make this warning unnecessary 16 | logger.critical('TwitterNLP tagger did not tokenize correctly: %s vs %s', text, tokens_line) 17 | return [[tag] for tag in tags] 18 | -------------------------------------------------------------------------------- /tweedr/ml/features/sets.py: -------------------------------------------------------------------------------- 1 | from tweedr.ml.features import characters, dbpedia, lexicons, ngrams 2 | 3 | crf_feature_functions = [ 4 | ngrams.unigrams, 5 | characters.plural, 6 | lexicons.is_transportation, 7 | lexicons.is_building, 8 | characters.capitalized, 9 | characters.numeric, 10 | ngrams.unique, 11 | lexicons.hypernyms, 12 | dbpedia.features, 13 | ] 14 | 15 | all_feature_functions = crf_feature_functions + [ 16 | 
ngrams.rbigrams, 17 | ngrams.lbigrams, 18 | ngrams.ctrigrams, 19 | ] 20 | 21 | classifier_feature_functions = [ 22 | ngrams.unigrams, 23 | ] 24 | -------------------------------------------------------------------------------- /tweedr/ml/lexicon_list.py: -------------------------------------------------------------------------------- 1 | transportation = ['aerial tramway', 'aircraft', 'aircraft carrier', 'airplane', 'ambulance', 'armored car', 'auto', 'automobile', 'baby carriage', 'balloon', 'bathyscaphe', 'barge', 'barrow', 'battleship', 'bicycle', 'bike', 'biplane', 'blimp', 'boat', 'bobsled', 'bomber', 'boxcar', 'broomstick', 'buggy', 'bulldozer', 'bullet train', 'bus', 'cab', 'cabin cruiser', 'cable car', 'caboose', 'camper', 'canoe', 'car', 'caravan', 'caravel', 'cargo ship', 'carriage', 'carrier', 'cart', 'catamaran', 'chairlift', 'chariot', 'chopper', 'clipper ship', 'clunker', 'coach', 'compact car', 'combine', 'compact car', 'Conestoga wagon', 'container ship', 'convertible', 'conveyance', 'conveyor belt', 'convoy', 'coupe', 'covered wagon', 'crane', 'crop duster', 'cruise ship', 'cruiser', 'cutter', 'cycle', 'delivery truck', 'delivery van', 'destroyer', 'diesel truck', 'dinghy', 'dirigible', 'dirt bike', 'diving bell', 'dog cart', 'dogsled', 'donkey cart', 'dray', 'driver', 'dugout canoe', 'dump truck', 'earth mover', 'eighteen-wheeler', 'electric car', 'elevated railroad', 'elevator', 'engine', 'escalator', 'express train', 'ferry', 'fireboat', 'fire engine', 'fishing boat', 'flatbed truck', 'forklift', 'four-door', 'four-wheel drive', 'four-by-four', 'freighter', 'freight train', 'frigate', 'funicular railway', 'galleon', 'garbage truck', 'glider', 'go-cart', 'golf cart', 'gondola', 'gondola lift', 'gridlock', 'handcar', 'hang glider', 'hansom cab', 'hardtop', 'harvester', 'hatchback', 'haul', 'hay wagon', 'hearse', 'helicopter', 'hook and ladder truck', 'hovercraft', 'hot-air balloon', 'hot rod', 'houseboat', 'hull', 'humvee', 'hybrid', 'hydrofoil', 'hydroplane', 'ice boat', 'ice breaker', 'jalopy', 'jeep', 'jet', 'jet boat', 'jetliner', 'journey', 'jetpack', 'jet ski', 'jumbo jet', 'junk', 'kayak', 'ketch', 'landing craft', 'lifeboat', 'life raft', 'light rail', 'limo', 'limousine', 'litter', 'locomotive', 'lorry', 'low-rider', 'magic carpet', 'maglev', 'mast', 'minesweeper', 'minibus', 'minivan', 'model T', 'monorail', 'moped', 'motor', 'motorcar', 'motorboat', 'motorcycle', 'motor home', 'mountain bike', 'narrowboat', 'oar', 'ocean liner', 'off-road vehicle', 'oil tanker', 'outboard motor', 'outrigger canoe', 'oxcart', 'paddle', 'paddlewheeler', 'parachute', 'passenger', 'patrol car', 'pedal boat', 'pickup truck', 'pilot', 'plane', 'police car', 'power boat', 'prairie schooner', 'propeller', 'PT boat', 'pumper truck', 'punt', 'push cart', 'racecar', 'racing car', 'raft', 'ragtop', 'railroad', 'railway', 'rapid transit', 'recreational vehicle', 'rickshaw', 'ride', 'riverboat', 'roadster', 'rocket', 'rover', 'rowboat', 'rudder', 'runabout', 'RV', 'sail', 'sailboat', 'satellite', 'school bus', 'schooner', 'scooter', 'scull', 'seaplane', 'sedan', 'sedan chair', 'Segway', 'semi', 'ship', 'shuttle', 'side wheeler', 'skiff', 'ski lift', 'ski tow', 'sled', 'sledge', 'sleigh', 'snow cat', 'snowmobile', 'snowplow', 'spaceship', 'space shuttle', 'speedboat', 'sports car', 'sport-utility vehicle', 'SUV', 'squad car', 'SST', 'stagecoach', 'station wagon', 'steamboat', 'steamship', 'stretch limo', 'stock car', 'stroller', 'subcompact', 'submarine', 'subway', 'surrey', 'SUV', 'tank', 
'tanker', 'taxi', 'taxicab', 'T-bar lift', 'thresher', 'tire', 'toboggan', 'town car', 'tow truck', 'tracks', 'tractor', 'tractor-trailer', 'trail bike', 'trailer', 'train', 'tram', 'tramway', 'transit', 'trawler', 'tricycle', 'trolley', 'truck', 'tugboat', 'two-door', 'van', 'vehicle', 'vespa', 'vessel', 'wagon', 'wheelchair', 'yacht'] 2 | 3 | buildings = ['abbey', 'aircraft hangar', 'airport terminal', 'amphitheater', 'apartment building', 'aqueduct', 'arch', 'arena', 'armory', 'assembly hall', 'barn', 'barracks', 'beach house', 'boathouse', 'boarding house', 'bowling alley', 'bridge', 'brownstone', 'building', 'bungalow', 'bunkhouse', 'bunker', 'cabana', 'cabin', 'capitol', 'carport', 'castle', 'catacomb', 'cathedral', 'chalet', 'chapel', 'chateau', 'church', 'cinema', 'city hall', 'clubhouse', 'college', 'compound', 'concert hall', 'condominium', 'conservatory', 'cottage', 'courthouse', 'crypt', 'depot', 'detached house', 'dock', 'dome', 'dormitory', 'double wide', 'duplex', 'dwelling', 'earth-sheltered house', 'embassy', 'exposition hall', 'factory', 'farm', 'farmhouse', 'ferry slip', 'ferry terminal', 'firehouse', 'fire station', 'folly', 'forge', 'fort', 'fortress', 'foundry', 'gallery', 'garage', 'gas station', 'gazebo', 'geodesic dome', 'granary', 'greenhouse', 'gym', 'gymnasium', 'hall', 'hangar', 'haunted house', 'headquarters', 'high-rise', 'home', 'hospital', 'hostel', 'hotel', 'hot house', 'house', 'houseboat', 'housing project', 'hunting lodge', 'hut', 'igloo', 'jail', 'kiosk', 'laboratory', 'lean-to', 'library', 'lighthouse', 'lodge', 'log cabin', 'longhouse', 'mall', 'manor', 'manse', 'mansion', 'marina', 'market', 'mausoleum', 'meeting hall', 'mill', 'minaret', 'mobile home', 'monastery', 'monument', 'mosque', 'motel', 'museum', 'nuclear power plant', 'nursing home', 'observatory', 'office building', 'opera house', 'outbuilding', 'outhouse', 'pagoda', 'palace', 'parking garage', 'parliament', 'pavilion', 'plant', 'playhouse', 'police station', 'pool house', 'post office', 'power plant', 'prefab building', 'prison', 'pump house', 'pyramid', 'quonset hut', 'railway station', 'ranch', 'rectory', 'refinery', 'residence', 'restaurant', 'roller rink', 'roundhouse', 'rowhouse', 'school', 'shack', 'shed', 'shelter', 'shopping center', 'shopping mall', 'shrine', 'silo', 'skating rink', 'skyscraper', 'skyway', 'smokestack', 'spire', 'split-level house', 'stable', 'stadium', 'state house', 'station', 'steeple', 'store', 'storehouse', 'strip mall', 'structure', 'studio', 'supermarket', 'symphony', 'synagogue', 'temple', 'tenement', 'tent', 'terminal', 'theater', 'tipi', 'toll house', 'tomb', 'tower', 'townhouse', 'treehouse', 'triplex', 'Tudor house', 'university', 'vault', 'vicarage', 'villa', 'warehouse', 'watermill', 'workshop', 'yurt'] 4 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/PreProcess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import types 6 | import string 7 | from pattern.en.wordlist import BASIC 8 | 9 | stopwords = \ 10 | 
'''im,rt,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your''' 11 | stopwords = stopwords.split(',') 12 | stopwords.extend(BASIC) 13 | 14 | 15 | def processTweet(tweet): 16 | if isinstance(tweet, types.NoneType): 17 | return ' ' 18 | tweet = str(tweet) 19 | tweet = tweet.lower() 20 | tweet = re.sub('((www\.[/s]+)|(https?://[^\s]+))', ' ', tweet) 21 | tweet = tweet.translate(None, string.punctuation) 22 | tweet = re.sub("@[^\s]+", ' ', tweet) 23 | tweet = re.sub('[\s]+', ' ', tweet) 24 | tweet = re.sub(r"#([^\s]+)", r'\1', tweet) 25 | tweet = tweet.strip('\'"') 26 | words = tweet.split() 27 | words = [word for word in words if not word in stopwords] 28 | words = ' '.join(words) 29 | words = words.strip() 30 | return words 31 | 32 | 33 | def is_ascii(tweet): 34 | for c in tweet: 35 | if ord(c) >= 128: 36 | return False 37 | return True 38 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/PySLDA.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pyper import R 5 | import os 6 | from PreProcess import processTweet, is_ascii 7 | 8 | 9 | class supervisedLDA: 10 | 11 | def __init__(self, dataFileName, alpha=1.0, numtopics=5, eta=0.1, logistic=True, lamda=1.0, e_iter=10, m_iter=4, variance=0.25, cutoff=0.25): 12 | model_filename = 'model_%s.RDS' % dataFileName 13 | vocab_filename = 'vocabulary_%s.RDS' % dataFileName 14 | fullpath = os.path.realpath(__file__) 15 | (path, files) = os.path.split(fullpath) 16 | self.path = path 17 | self.params = { 18 | 'numtopics': numtopics, 19 | 'alpha': alpha, 20 | 'eta': eta, 21 | 'logistic': logistic, 22 | 'lambda': lamda, 23 | 'e_iter': e_iter, 24 | 'm_iter': m_iter, 25 | 'variance': variance, 26 | 'OutputName': dataFileName, 27 | 'model_filename': model_filename, 28 | 'vocab_filename': vocab_filename, 29 | 'test_cutoff': cutoff, 30 | } 31 | self.r = R(use_pandas=True, use_numpy=True) 32 | self.assign_R_params() 33 | 34 | def set_param(self, param_name, param_value): 35 | self.params[param_name] = param_value 36 | 37 | def get_params(self, deep=False): 38 | return self.params 39 | 40 | def assign_R_params(self): 41 | for (key, value) in self.params.iteritems(): 42 | self.r.assign(key, value) 43 | 44 | def fit(self, documents, labels): 45 | (documents, labels) = self.transform(documents, labels) 46 | self.r.assign('documents', documents) 47 | self.r.assign('labels', labels) 48 | self.r.run('source("trainLDA.R")') 49 | vocab = self.r['vocabulary'] 50 | self.set_param('vocabulary', vocab) 51 | self.assign_R_params() 52 | 53 | def transform(self, documents, labels): 54 | documents = [(tweet if is_ascii(tweet) else ' ') for tweet in 55 | documents] 56 | documents = map(lambda x: processTweet(x), documents) 57 | documents = map(lambda x: str(x).translate(None, '"'), 58 | documents) 59 | (tweets_filtered, labels_filtered) = ([], []) 60 | for (tweet, label) in zip(documents, labels): 61 | if len(tweet) > 1: 62 | 
tweets_filtered.append(tweet) 63 | labels_filtered.append(label) 64 | return (tweets_filtered, labels_filtered) 65 | 66 | def test_transform(self, documents): 67 | documents = [(tweet if is_ascii(tweet) else ' ') for tweet in 68 | documents] 69 | documents = map(lambda x: processTweet(x), documents) 70 | documents = map(lambda x: str(x).translate(None, '"'), 71 | documents) 72 | tweets_filtered = [] 73 | for tweet in documents: 74 | if len(tweet) > 1: 75 | tweets_filtered.append(tweet) 76 | return tweets_filtered 77 | 78 | def __str__(self): 79 | return 'sLDA(cut:%s)' % self.params['test_cutoff'] 80 | 81 | def predict(self, documents, gold_labels): 82 | (documents, gold_labels) = self.transform(documents, 83 | gold_labels) 84 | self.r.assign('testDocuments', documents) 85 | self.r.run('source("testLDA.R")') 86 | predictions = self.r['pred'] 87 | cutoff = self.params['test_cutoff'] 88 | predictions = map(lambda x: int(x > cutoff), predictions) 89 | return (predictions, gold_labels) 90 | 91 | def save_model(self): 92 | self.r.run('source("%s/saveModel.R")' % self.path) 93 | 94 | def load_model(self): 95 | self.r.run('source("%s/loadModel.R")' % self.path) 96 | vocab = self.r['vocab'] 97 | topics = self.r['topics'] 98 | self.set_param('vocab', vocab) 99 | self.set_param('topics', topics) 100 | self.assign_R_params() 101 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/README.md: -------------------------------------------------------------------------------- 1 | ### Supervised Latent Dirchlet Allocation 2 | 3 | This sub-module creates a python wrapper for the sLDA algorithm in R for binary classification and topic modeling. This wrapper implements methods in a fashion similar to classifiers provided in Scikit-Learn. Because there is no way to convert the R "slda" model into a Python object, the model is stored on disk. The sLDA classifier cannot currently handle non-ASCII text. 4 | 5 | * * * 6 | 7 | To create a sLDA classifier object, call supervisedLDA() and provide a filename for where you want the model to be saved. 8 | 9 | Methods: 10 | 11 | 1. Fit(documents, labels): Trains the sLDA classifier using the provided corpus and corresponding labels. 12 | 2. Predict(Document, gold_labels): Returns a vector containing the likelihood that each document in testing corpus will have a positive label. 13 | 3. SaveModel: Saves the vocabulary used in the trained sLDA model. 14 | 4. LoadModel: Loads an existing vocabulary into the sLDA model. -------------------------------------------------------------------------------- /tweedr/ml/pyslda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/ml/pyslda/__init__.py -------------------------------------------------------------------------------- /tweedr/ml/pyslda/evaluate-classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Evaluate the tweet classifier: is this tweet about damage/casualty or not? 5 | 6 | Results as of 8/8/13: 7 | 8 | INFO:eval-clf:Reading labeled tweets from database... 9 | INFO:eval-clf:Read 1045 tweets 10 | model ....f1.... pre....rec....acc....f1_std pre_std....rec_std....acc_std 11 | KNeighborsCla....0.52....0.84....0.38....0.84....0.09.... 0.10.... 0.08.... 0.04 12 | SVC(C=1, cach....0.45....0.89....0.32....0.83....0.14.... 0.07.... 0.13.... 
0.05 13 | DecisionTreeC....0.55....0.91....0.40....0.85....0.09.... 0.07.... 0.08.... 0.03 14 | MultinomialNB....0.62....0.53....0.74....0.79....0.07.... 0.08.... 0.08.... 0.03 15 | LogisticRegre....0.66....0.78....0.59....0.86....0.05.... 0.10.... 0.06.... 0.03 16 | """ 17 | 18 | import argparse 19 | import logging 20 | import numpy as np 21 | 22 | from sklearn import cross_validation # , metrics 23 | from sklearn.feature_extraction.text import CountVectorizer 24 | 25 | # from sklearn.feature_extraction.text import TfidfTransformer 26 | 27 | from sklearn import metrics 28 | from sklearn.linear_model import LogisticRegression 29 | from sklearn.neighbors import KNeighborsClassifier 30 | from sklearn.pipeline import Pipeline 31 | from sklearn.svm import SVC 32 | from sklearn.tree import DecisionTreeClassifier 33 | from sklearn.naive_bayes import MultinomialNB 34 | 35 | from tweedr.models import DamageClassification, DBSession 36 | from tweedr.ml.pyslda import PySLDA 37 | 38 | logger = logging.getLogger('eval-clf') 39 | 40 | 41 | def summarize(evals, n): 42 | '''Compute average and standard deviation for evaluation metrics''' 43 | 44 | avg = {} 45 | for key in evals[0].iterkeys(): 46 | scores = np.array([e[key] for e in evals]) 47 | avg[key] = np.average(scores) 48 | avg[key + '_std'] = np.std(scores) 49 | return avg 50 | 51 | 52 | def score(y_true, y_pred): 53 | '''Compute evaluation metrics. Note pos_label=1 parameter of 54 | f1/precision/recall. Thus, we only compute precision of the positive class 55 | (as opposed to computing the precision for both classes and taking the 56 | average).''' 57 | 58 | return { 59 | 'acc': metrics.accuracy_score(y_true, y_pred), 60 | 'f1': metrics.f1_score(y_true, y_pred, pos_label=1), 61 | 'pre': metrics.precision_score(y_true, y_pred, pos_label=1), 62 | 'rec': metrics.recall_score(y_true, y_pred, pos_label=1), 63 | } 64 | 65 | 66 | def read_tweets(): 67 | '''Read labeled tweets from database''' 68 | 69 | logger.info('Reading labeled tweets from database...') 70 | labeled_tweets = \ 71 | np.array(DBSession.query(DamageClassification).filter(DamageClassification.mturk_code 72 | == 'QCRI').limit(opts.max_data).all()) 73 | logger.info('Read %d tweets', len(labeled_tweets)) 74 | return labeled_tweets 75 | 76 | 77 | def metric_names(): 78 | '''Name of metrics. 
Gotcha: keep in sync with score function''' 79 | 80 | metric_names = ['f1', 'pre', 'rec', 'acc'] 81 | return metric_names + [m + '_std' for m in metric_names] 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = \ 86 | argparse.ArgumentParser(description='Train a classifier on data from the QCRI MySQL database', 87 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 88 | parser.add_argument('-k', '--k-folds', type=int, default=10, 89 | help='How many folds of the data to test on') 90 | parser.add_argument('--max-data', type=int, default=10000, 91 | help='Maximum data points to train and test on') 92 | opts = parser.parse_args() 93 | 94 | labeled_tweets = read_tweets() 95 | 96 | # FIXME: add features beyond bag of words 97 | 98 | pipeline = Pipeline([('vect', CountVectorizer())]) 99 | 100 | # , ('tfidf', TfidfTransformer())]) 101 | 102 | x = pipeline.fit_transform([t.text for t in 103 | labeled_tweets]).toarray() 104 | x_text = np.array([t.text.lower() for t in labeled_tweets]) 105 | y = np.array([t.label for t in labeled_tweets]) 106 | y_list = np.array([t.label for t in labeled_tweets]) 107 | 108 | classifiers = [ 109 | KNeighborsClassifier(3), 110 | SVC(gamma=2, C=1), 111 | DecisionTreeClassifier(max_depth=5), 112 | MultinomialNB(), 113 | LogisticRegression(), 114 | PySLDA.supervisedLDA('testModel01', cutoff=0.15), 115 | ] 116 | 117 | # FIXME: Write sLDA wrapper that extends ClassifierI for inclusion above 118 | 119 | last_classifier = len(classifiers) - 1 120 | cv = cross_validation.KFold(len(y), opts.k_folds, shuffle=True, 121 | random_state=1234) 122 | metric_names = metric_names() 123 | print '\t'.join(['model' + ' ' * 8] + metric_names) 124 | for (i, clf) in enumerate(classifiers): 125 | results = [] 126 | for (train, test) in cv: 127 | if i < last_classifier: 128 | truth = y[test] 129 | pred = clf.fit(x[train], y[train]).predict(x[test]) 130 | results.append(score(truth, pred)) 131 | else: 132 | truth = y_list[test] 133 | clf.fit(list(x_text[train]), list(y_list[train])) 134 | (pred, truth) = clf.predict(list(x_text[test]), 135 | list(truth)) 136 | results.append(score(truth, pred)) 137 | results_avg = summarize(results, opts.k_folds) 138 | print str(clf)[:13] + '\t' + '\t'.join(['%.2f' % results_avg[m] 139 | for m in metric_names]) 140 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/loadModel.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | 3 | vocab <- readRDS(vocab_filename) 4 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/saveModel.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | 3 | saveRDS(vocab,vocabulary_filename) 4 | -------------------------------------------------------------------------------- /tweedr/ml/pyslda/testLDA.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | require("pracma") 3 | 4 | 5 | sldaModel <- readRDS(model_filename) 6 | corpus <- lexicalize(testDocuments, lower=TRUE, vocab=vocabulary) 7 | pred <- slda.predict(corpus, sldaModel$topics, sldaModel$model, alpha = 1.0, eta = 0.1) 8 | 9 | pred <- sigmoid(pred) -------------------------------------------------------------------------------- /tweedr/ml/pyslda/trainLDA.R: -------------------------------------------------------------------------------- 1 | require("lda") 2 | corpus <- lexicalize(documents) 3 | documents <- 
corpus$documents 4 | vocabulary <- corpus$vocab 5 | 6 | params <- sample(c(-1,1), numtopics, replace=TRUE) 7 | result <- slda.em(documents=documents, K = numtopics, vocab=vocabulary, num.e.iterations = e_iter, num.m.iterations= m_iter, alpha = alpha, eta = eta, as.numeric(labels), params, variance = variance, lambda = lambda, logistic = logistic, method="sLDA") 8 | 9 | saveRDS(result,model_filename) -------------------------------------------------------------------------------- /tweedr/ml/spotlight/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python DBpedia Spotlight API Wrapper 3 | ==================================== 4 | 5 | This is just a simple interface to a Spotlight API. 6 | 7 | Tested with DBPedia Spotlight 0.5 and 0.6.5. 8 | 9 | Note that I'm trying to track Spotlight release version numbers, so you can 10 | easily see which pyspotlight version has been tested with which Spotlight 11 | release. 12 | 13 | I hope the code and the small documentation speaks for itself :-) 14 | 15 | If you should encounter any problems, feel free to contact me on github 16 | (originell). I'm happy to help out with anything related to my code. 17 | """ 18 | __version_info__ = (0, 6, 5) 19 | __version__ = '.'.join(map(str, __version_info__)) 20 | 21 | 22 | import requests 23 | 24 | 25 | class SpotlightException(Exception): 26 | """ 27 | Exception raised on Spotlight failures. 28 | 29 | Basically this exception is raised if there was no valid JSON response 30 | from Spotlight. 31 | """ 32 | pass 33 | 34 | 35 | # Some helper functions. 36 | def _convert_number(value): 37 | """ 38 | Try to convert a string to an int or float. 39 | """ 40 | if isinstance(value, bool): 41 | return value 42 | # Workaround for footnotes being put into Resources.surfaceForm and then 43 | # having them parsed by the JSON parser into a list. (issue #4) 44 | if isinstance(value, list): 45 | value = unicode(value) 46 | 47 | try: 48 | return int(value) 49 | except ValueError: 50 | try: 51 | return float(value) 52 | except ValueError: 53 | return value 54 | 55 | 56 | def _dict_cleanup(dic, dict_type=dict): 57 | """ 58 | Clean the response dictionary from ugly @ signs in keys. 59 | 60 | TODO: Make this an iteration based recursion instead of function based. 61 | That way we can avoid stack fails. 62 | """ 63 | clean = dict_type() 64 | for key, value in dic.iteritems(): 65 | if value is None: 66 | continue 67 | 68 | key = key.replace('@', '') 69 | try: 70 | try: 71 | # If this is a string or bool, 72 | # go straight to type conversion. 73 | if (isinstance(value, basestring) or 74 | isinstance(value, bool)): 75 | raise AttributeError 76 | # Test for an iterable (list, tuple, set) 77 | value[0] 78 | # Clean up each element in the iterable 79 | clean[key] = [_dict_cleanup(element, dict_type) for element in value] 80 | except KeyError: 81 | clean[key] = _dict_cleanup(value, dict_type) 82 | except AttributeError: 83 | clean[key] = _convert_number(value) 84 | return clean 85 | 86 | 87 | # Main functions. 88 | # 89 | # I was inspired to go back to a function based approach after seeing this 90 | # awesome talk by Jack Diederich: Stop Writing Classes 91 | # http://pyvideo.org/video/880/stop-writing-classes 92 | # Most of the class-based approach had the problems he described. 93 | # Embarrassing! 94 | def annotate(address, text, confidence=0.0, support=0, 95 | spotter='LingPipeSpotter', disambiguator='Default', 96 | policy='whitelist', headers={}): 97 | """ 98 | Annotate a text. 
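    For illustration only, a minimal call might look like the following (the
    address is a hypothetical local Spotlight instance, not anything this
    wrapper provides or configures):

        resources = annotate('http://localhost:2222/rest/annotate',
                             'Flooding has damaged two bridges in Joplin.',
                             confidence=0.4, support=20)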
99 | 100 | Can raise :exc:`requests.exceptions.HTTPError` or 101 | :exc:`SpotlightException`, depending on where the failure is (HTTP status 102 | code not 200 or the response not containing valid json). 103 | 104 | :param address: 105 | The absolute address of the annotate REST API. 106 | :type address: string 107 | 108 | :param text: 109 | The text to be sent. 110 | :type text: string 111 | 112 | :param confidence: 113 | Filter out annotations below a given confidence. 114 | Based on my experience I would suggest you set this to something 115 | above 0.4, however your experience might vary from text to text. 116 | :type confidence: float 117 | 118 | :param support: 119 | Only output annotations above a given prominence (support). 120 | Based on my experience I would suggest you set this to something 121 | above 20, however your experience might vary from text to text. 122 | :type support: int 123 | 124 | :param spotter: 125 | One of spotters available on your DBPedia Spotlight server. 126 | For example one of: LingPipeSpotter, AtLeastOneNounSelector, 127 | CoOccurrenceBasedSelector 128 | :type spotter: string 129 | 130 | :param disambiguator: 131 | The disambiguator to use on the annotation. 132 | :type disambiguator: string 133 | 134 | :param policy: 135 | The policy to be used. 136 | :type disambiguator: string 137 | 138 | :param headers: 139 | Additional headers to be set on the request. 140 | :type headers: dictionary 141 | 142 | :rtype: list of resources 143 | """ 144 | payload = {'confidence': confidence, 'support': support, 145 | 'spotter': spotter, 'disambiguator': disambiguator, 146 | 'policy': policy, 'text': text} 147 | reqheaders = {'accept': 'application/json'} 148 | reqheaders.update(headers) 149 | 150 | # Its better for the user to have to explicitly provide a protocl in the 151 | # URL, since transmissions might happen over HTTPS or any other secure or 152 | # faster (spdy :D) channel. 153 | if not '://' in address: 154 | raise SpotlightException('Oops. Looks like you forgot the protocol ' 155 | '(http/https) in your url (%s).' % address) 156 | 157 | response = requests.post(address, data=payload, headers=reqheaders) 158 | if response.status_code != requests.codes.ok: 159 | # Every http code besides 200 shall raise an exception. 160 | response.raise_for_status() 161 | 162 | pydict = response.json 163 | if pydict is None: 164 | raise SpotlightException("Spotlight's response did not contain valid " 165 | "JSON: %s" % response.text) 166 | 167 | if not 'Resources' in pydict: 168 | raise SpotlightException('No Resources found in spotlight response: %s' % pydict) 169 | 170 | return [_dict_cleanup(resource) for resource in pydict['Resources']] 171 | 172 | 173 | # This is more or less a duplicate of the annotate function, with just 174 | # the return line being the difference haha. 175 | def candidates(address, text, confidence=0.0, support=0, 176 | spotter='LingPipeSpotter', disambiguator='Default', 177 | policy='whitelist', headers={}): 178 | """ 179 | Get the candidates from a text. 180 | 181 | Uses the same arguments as :meth:`annotate`. 
182 | 183 | :rtype: list of surface forms 184 | """ 185 | payload = {'confidence': confidence, 'support': support, 186 | 'spotter': spotter, 'disambiguator': disambiguator, 187 | 'policy': policy, 'text': text} 188 | reqheaders = {'accept': 'application/json'} 189 | reqheaders.update(headers) 190 | response = requests.post(address, data=payload, headers=reqheaders) 191 | if response.status_code != requests.codes.ok: 192 | # Every http code besides 200 shall raise an exception. 193 | response.raise_for_status() 194 | 195 | pydict = response.json 196 | if pydict is None: 197 | raise SpotlightException("Spotlight's response did not contain valid " 198 | "JSON: %s" % response.text) 199 | 200 | if not 'annotation' in pydict: 201 | raise SpotlightException('No annotations found in spotlight response: %s' % pydict) 202 | 203 | if not 'surfaceForm' in pydict['annotation']: 204 | raise SpotlightException('No surface forms found in spotlight response: %s' % pydict) 205 | 206 | # Previously we assumed that the surfaceForm is *always* a list, however 207 | # depending on how many are returned, this does not have to be the case. 208 | # So we are doing some good ol' duck typing here. 209 | try: 210 | pydict['annotation']['surfaceForm'][0] 211 | except KeyError: 212 | # However note that we will *always* return a list. 213 | return [_dict_cleanup(pydict['annotation']['surfaceForm']), ] 214 | return [_dict_cleanup(form) 215 | for form in pydict['annotation']['surfaceForm']] 216 | -------------------------------------------------------------------------------- /tweedr/ml/wordnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from pattern.en import wordnet 5 | from pattern.vector import stem 6 | 7 | 8 | def WordnetFeatures(token): 9 | synset = wordnet.synsets(token) 10 | if len(synset) > 0: 11 | synset = synset[0] 12 | hypernym = synset.hypernyms(depth=2, recursive=True) 13 | # hypernym.extend(synset.hyponyms(depth=2,recursive=True)) 14 | return [hyper.senses[0] for hyper in hypernym] 15 | else: 16 | return [] 17 | 18 | 19 | def WordNet(document): 20 | return [WordnetFeatures(token) for token in document] 21 | 22 | 23 | def token_hypernyms(token, recursive, depth): 24 | '''Stem each token using default stemmer from the pattern library (PORTER?)''' 25 | for synset in wordnet.synsets(stem(token)): 26 | for hypernym in synset.hypernyms(recursive, depth): 27 | for sense in hypernym.senses: 28 | yield sense 29 | -------------------------------------------------------------------------------- /tweedr/models/README.md: -------------------------------------------------------------------------------- 1 | ## MySQL libraries for Python 2 | 3 | psycopg2 is the (pretty clear) choice for PostgreSQL (or pg8000 if you don't have build tools). 4 | But MySQL has more options. 
5 | 6 | * MySQL for Python, by Andy Dustman 7 | - original: http://sourceforge.net/projects/mysql-python/ 8 | - docs: http://mysql-python.sourceforge.net/MySQLdb.html 9 | - github: https://github.com/farcepest/MySQLdb1 10 | - next generation: https://github.com/farcepest/moist 11 | - `easy_install MySQL-Python` 12 | - `import MySQLdb` 13 | 14 | * PyMySQL 15 | - Pure python 16 | - http://www.pymysql.org/ 17 | 18 | * OurSQL 19 | - http://pythonhosted.org/oursql/ 20 | -------------------------------------------------------------------------------- /tweedr/models/__init__.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import orm 2 | from tweedr.lib.text import token_re 3 | from tweedr.models.metadata import engine 4 | 5 | sessionmaker = orm.sessionmaker(bind=engine) 6 | DBSession = sessionmaker() 7 | 8 | # we write enhanced ORM classes directly on top of the schema originals, 9 | # so that enhancements are optional and transparent 10 | from tweedr.models.schema import ( 11 | DamageClassification, 12 | TokenizedLabel, 13 | UniformSample, 14 | Label, 15 | KeywordSample, 16 | Tweet, 17 | ) 18 | 19 | # This quiets the 'import unused' pyflakes warning 20 | __all__ = ['DamageClassification', 'TokenizedLabel', 'UniformSample', 'Label', 'KeywordSample', 'Tweet'] 21 | 22 | 23 | class DamageClassification(DamageClassification): 24 | # DamageClassification does not actually have a single FK, but references multiple tables. 25 | # so this join is actually way more complicated 26 | # tweet_object = orm.relationship(Tweet, lazy='join', 27 | # primaryjoin=orm.foreign(DamageClassification.DSSG_ID) == Tweet.dssg_id) 28 | 29 | @property 30 | def text(self): 31 | 'Run join query to get text of this labeled tweet' 32 | # FIXME: Slow. Consider join in instead. 33 | if not hasattr(self, 'text_'): 34 | if 'uniform' in self.which_sample: 35 | self.text_ = DBSession.query(UniformSample.text).filter(UniformSample.dssg_id == self.DSSG_ID).first()[0] 36 | elif 'keyword' in self.which_sample: 37 | self.text_ = DBSession.query(KeywordSample.text).filter(KeywordSample.dssg_id == self.DSSG_ID).first()[0] 38 | else: 39 | self.text_ = None 40 | return self.text_ 41 | 42 | @property 43 | def label(self): 44 | if self.Infrastructure == 1 or self.Casualty == 1: 45 | return 1. 46 | else: 47 | return 0. 
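# Illustrative sketch only (not executed): querying damage-labeled tweets with
# the enhanced class above, assuming the MYSQL_* environment variables point at
# a populated database; this mirrors the query used in
# tweedr/ml/pyslda/evaluate-classifier.py.
#
#     labeled = DBSession.query(DamageClassification) \
#         .filter(DamageClassification.mturk_code == 'QCRI').limit(10).all()
#     for row in labeled:
#         print row.text, row.label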
48 | 49 | 50 | class TokenizedLabel(TokenizedLabel): 51 | # the FK is called token_type, even though it is an identifying key and not 52 | # an actual token type, so we name the target object "token_type_object" 53 | token_type_object = orm.relationship(Label, lazy='subquery', 54 | primaryjoin=orm.foreign(TokenizedLabel.token_type) == Label.id) 55 | 56 | @property 57 | def tokens(self): 58 | return token_re.findall(unicode(self.tweet).encode('utf8')) 59 | 60 | @property 61 | def labels(self, null_label=None): 62 | labels = [] 63 | label_start, label_end = self.token_start, self.token_end 64 | for match in token_re.finditer(self.tweet): 65 | token_start, token_end = match.span() 66 | # token = match.group(0) 67 | # we want to determine if this particular token in the original tweet overlaps 68 | # with any portion of the selected label (label_span) 69 | label = null_label 70 | if label_start <= token_start <= label_end or label_start <= token_end <= label_end: 71 | label = self.token_type_object.text 72 | labels.append(label) 73 | return [unicode(label).encode('utf8') for label in labels] 74 | -------------------------------------------------------------------------------- /tweedr/models/example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from tweedr.lib.text import whitespace_unicode_translations 3 | from tweedr.models import DBSession, TokenizedLabel, Label 4 | 5 | import logging 6 | logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO) 7 | 8 | 9 | def count(): 10 | print >> sys.stderr, 'Tweet count started.' 11 | print 'There are %d labels in the database.' % DBSession.query(Label).count() 12 | print 'There are %d tokenized labels in the database.' % DBSession.query(TokenizedLabel).count() 13 | 14 | 15 | def first(limit): 16 | print >> sys.stderr, 'First %d tweets.' % limit 17 | for tokenized_label in DBSession.query(TokenizedLabel).limit(limit): 18 | # print repr(tokenized_label) 19 | tokenized_label_text = unicode(tokenized_label).translate(whitespace_unicode_translations).encode('utf8') 20 | token_type_object = tokenized_label.token_type_object 21 | print token_type_object.id, '\t', token_type_object.text, '\t', tokenized_label_text 22 | 23 | if __name__ == '__main__': 24 | # py example.py | awk -F\\t '{print $1,$2}' | sort | uniq -c | sort -g 25 | first(1000) 26 | -------------------------------------------------------------------------------- /tweedr/models/metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sqlalchemy import create_engine, MetaData 4 | 5 | connection_string = 'mysql+mysqldb://%(MYSQL_USER)s:%(MYSQL_PASS)s@%(MYSQL_HOST)s/%(MYSQL_DATABASE)s' % os.environ 6 | engine = create_engine(connection_string, encoding='latin1', convert_unicode=True) 7 | # yep, it's latin1. Check it: 8 | # mysql -h host-stuff-here.rds.amazonaws.com -u ourusername -p 9 | # SHOW DATABASES; 10 | # USE THEDATABASEWITHSTUFFINIT; 11 | # SHOW VARIABLES LIKE "character_set_database"; 12 | metadata = MetaData(bind=engine) 13 | -------------------------------------------------------------------------------- /tweedr/models/schema.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file, schema.py, is generated by reflect.py but may have small manual 3 | modifications. It should only require regeneration when the database schema 4 | changes. 
It does not provide much past what the following snippet does, except 5 | that it doesn't require database calls to reflect the database on start up. 6 | 7 | class Something(Base): 8 | __table__ = Table('somethings', metadata, autoload=True) 9 | 10 | from schema import ( 11 | DamageClassification, 12 | TokenizedLabel, 13 | UniformSample, 14 | Label, 15 | KeywordSample, 16 | Tweet, 17 | ) 18 | ''' 19 | 20 | from sqlalchemy import Column 21 | from sqlalchemy.dialects import mysql 22 | from sqlalchemy.ext.declarative import declarative_base 23 | 24 | from tweedr.models.metadata import metadata 25 | 26 | 27 | class BaseMixin(object): 28 | def __json__(self): 29 | '''This method serves to both clone the record (copying its values) 30 | as well as filter out the special sqlalchemy key (_sa_instance_state) 31 | ''' 32 | return dict((k, v) for k, v in self.__dict__.items() if k != '_sa_instance_state') 33 | 34 | def __unicode__(self): 35 | type_name = self.__class__.__name__ 36 | pairs = [u'%s=%s' % (k, v) for k, v in self.__json__().items()] 37 | return u'<{type_name} {pairs}>'.format(type_name=type_name, pairs=u' '.join(pairs)) 38 | 39 | def __str__(self): 40 | return unicode(self).encode('utf-8') 41 | 42 | def __repr__(self): 43 | return str(self) 44 | 45 | Base = declarative_base(metadata=metadata, cls=BaseMixin) 46 | 47 | 48 | class DamageClassification(Base): 49 | __tablename__ = 'DamageClassification' 50 | id = Column(mysql.INTEGER(display_width=11), primary_key=True) 51 | DSSG_ID = Column(mysql.INTEGER(display_width=11)) 52 | Tweet = Column(mysql.TEXT()) 53 | Infrastructure = Column(mysql.TINYINT(display_width=1)) 54 | Casualty = Column(mysql.TINYINT(display_width=1)) 55 | mturk_code = Column(mysql.TEXT()) 56 | which_sample = Column(mysql.VARCHAR(length=10)) 57 | is_extracted = Column(mysql.INTEGER(display_width=1)) 58 | 59 | 60 | class TokenizedLabel(Base): 61 | __tablename__ = 'tokenized_labels' 62 | id = Column(mysql.INTEGER(display_width=20), primary_key=True) 63 | dssg_id = Column(mysql.INTEGER(display_width=100)) 64 | tweet = Column(mysql.VARCHAR(length=500)) 65 | token_start = Column(mysql.INTEGER(display_width=50)) 66 | token_end = Column(mysql.INTEGER(display_width=50)) 67 | token_type = Column(mysql.VARCHAR(length=500)) 68 | token = Column(mysql.VARCHAR(length=500)) 69 | mturk_code = Column(mysql.VARCHAR(length=50)) 70 | which_sample = Column(mysql.VARCHAR(length=10)) 71 | 72 | 73 | class UniformSample(Base): 74 | __tablename__ = 'uniform_sample' 75 | id = Column(mysql.INTEGER(display_width=20), primary_key=True) 76 | dssg_id = Column(mysql.INTEGER(display_width=20)) 77 | pwd = Column(mysql.VARCHAR(length=500)) 78 | text = Column(mysql.VARCHAR(length=500)) 79 | is_extracted = Column(mysql.INTEGER(display_width=11)) 80 | is_classified = Column(mysql.INTEGER(display_width=11)) 81 | type_sample = Column(mysql.VARCHAR(length=10)) 82 | 83 | 84 | class Label(Base): 85 | __tablename__ = 'labels' 86 | id = Column(mysql.VARCHAR(length=10), primary_key=True) 87 | text = Column(mysql.VARCHAR(length=100)) 88 | 89 | 90 | class KeywordSample(Base): 91 | __tablename__ = 'keyword_sample' 92 | id = Column(mysql.INTEGER(display_width=20), primary_key=True) 93 | dssg_id = Column(mysql.INTEGER(display_width=20)) 94 | pwd = Column(mysql.VARCHAR(length=500)) 95 | text = Column(mysql.VARCHAR(length=500)) 96 | is_extracted = Column(mysql.INTEGER(display_width=11)) 97 | is_classified = Column(mysql.INTEGER(display_width=11)) 98 | type_sample = Column(mysql.VARCHAR(length=10)) 99 | 100 | 101 | class 
Tweet(Base): 102 | __tablename__ = 'tweets' 103 | dssg_id = Column(mysql.INTEGER(display_width=20), primary_key=True) 104 | pwd = Column(mysql.VARCHAR(length=500)) 105 | _unit_id = Column(mysql.VARCHAR(length=500)) 106 | _golden = Column(mysql.VARCHAR(length=500)) 107 | _unit_state = Column(mysql.VARCHAR(length=500)) 108 | _trusted_judgment = Column(mysql.VARCHAR(length=500)) 109 | _last_jugment_at = Column(mysql.VARCHAR(length=500)) 110 | choose_one = Column(mysql.VARCHAR(length=500)) 111 | choose_oneconfidence = Column(mysql.VARCHAR(length=500)) 112 | choose_one_gold = Column(mysql.VARCHAR(length=500)) 113 | predicted = Column(mysql.VARCHAR(length=500)) 114 | text_no_rt = Column(mysql.VARCHAR(length=500)) 115 | tweet = Column(mysql.VARCHAR(length=500)) 116 | _trusted_judgments = Column(mysql.VARCHAR(length=500)) 117 | _last_judgment_at = Column(mysql.VARCHAR(length=500)) 118 | source = Column(mysql.VARCHAR(length=500)) 119 | type_of_advice_or_caution = Column(mysql.VARCHAR(length=500)) 120 | type_of_advice_or_cautionconfidence = Column(mysql.VARCHAR(length=500)) 121 | what = Column(mysql.VARCHAR(length=500)) 122 | when_ = Column(mysql.VARCHAR(length=500)) 123 | where_ = Column(mysql.VARCHAR(length=500)) 124 | category = Column(mysql.VARCHAR(length=500)) 125 | id = Column(mysql.VARCHAR(length=500)) 126 | retweetcount = Column(mysql.VARCHAR(length=500)) 127 | screenname = Column(mysql.VARCHAR(length=500)) 128 | source_gold = Column(mysql.VARCHAR(length=500)) 129 | text = Column(mysql.VARCHAR(length=500)) 130 | type_of_advice_or_caution_gold = Column(mysql.VARCHAR(length=500)) 131 | userid = Column(mysql.VARCHAR(length=500)) 132 | what_gold = Column(mysql.VARCHAR(length=500)) 133 | when_gold = Column(mysql.VARCHAR(length=500)) 134 | where_gold = Column(mysql.VARCHAR(length=500)) 135 | user_id = Column(mysql.VARCHAR(length=500)) 136 | how_many_injured_or_dead_if_people = Column(mysql.VARCHAR(length=500)) 137 | people_or_infrastructure = Column(mysql.VARCHAR(length=500)) 138 | people_or_infrastructureconfidence = Column(mysql.VARCHAR(length=500)) 139 | what_infrastructure_was_damaged_if_infrastructure = Column(mysql.VARCHAR(length=500)) 140 | how_many_injured_or_dead_if_people_gold = Column(mysql.VARCHAR(length=500)) 141 | people_or_infrastructure_gold = Column(mysql.VARCHAR(length=500)) 142 | intention = Column(mysql.VARCHAR(length=500)) 143 | intentionconfidence = Column(mysql.VARCHAR(length=500)) 144 | type_of_donation = Column(mysql.VARCHAR(length=500)) 145 | type_of_donationconfidence = Column(mysql.VARCHAR(length=500)) 146 | who = Column(mysql.VARCHAR(length=500)) 147 | intention_gold = Column(mysql.VARCHAR(length=500)) 148 | type_of_donation_gold = Column(mysql.VARCHAR(length=500)) 149 | who_gold = Column(mysql.VARCHAR(length=500)) 150 | type_of_message = Column(mysql.VARCHAR(length=500)) 151 | type_of_message_confidence = Column(mysql.VARCHAR(length=500)) 152 | url_or_name_of_the_stationchannel = Column(mysql.VARCHAR(length=500)) 153 | type_of_message_gold = Column(mysql.VARCHAR(length=500)) 154 | url_or_name_of_the_stationchannel_gold = Column(mysql.VARCHAR(length=500)) 155 | joplin_raw = Column(mysql.VARCHAR(length=500)) 156 | creationdate = Column(mysql.VARCHAR(length=500)) 157 | replyto = Column(mysql.VARCHAR(length=500)) 158 | replytouser = Column(mysql.VARCHAR(length=500)) 159 | replytoscreenname = Column(mysql.VARCHAR(length=500)) 160 | longitude = Column(mysql.VARCHAR(length=500)) 161 | latitude = Column(mysql.VARCHAR(length=500)) 162 | favorite = 
Column(mysql.VARCHAR(length=500)) 163 | retweet = Column(mysql.VARCHAR(length=500)) 164 | hashtags = Column(mysql.VARCHAR(length=500)) 165 | mediaurl = Column(mysql.VARCHAR(length=500)) 166 | city = Column(mysql.VARCHAR(length=500)) 167 | sandy_raw_dataset = Column(mysql.VARCHAR(length=500)) 168 | tweet__no = Column(mysql.VARCHAR(length=500)) 169 | user = Column(mysql.VARCHAR(length=500)) 170 | tweet_text = Column(mysql.VARCHAR(length=500)) 171 | url = Column(mysql.VARCHAR(length=500)) 172 | sandy_labeled = Column(mysql.VARCHAR(length=500)) 173 | type = Column(mysql.VARCHAR(length=500)) 174 | hom_many_injured_or_dead_if_people = Column(mysql.VARCHAR(length=500)) 175 | hom_many_injured_or_dead_if_people_gold = Column(mysql.VARCHAR(length=500)) 176 | what_infrastructure_was_damaged_if_infrastructure_gold = Column(mysql.VARCHAR(length=500)) 177 | the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event = Column(mysql.VARCHAR(length=500)) 178 | the_author_of_the_tweet_seems_to_be_an_eye_witness = Column(mysql.VARCHAR(length=500)) 179 | _of_the_eventconfidence = Column(mysql.VARCHAR(length=500)) 180 | the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event_ = Column(mysql.VARCHAR(length=500)) 181 | tweet_no = Column(mysql.VARCHAR(length=500)) 182 | tweet_no_rt = Column(mysql.VARCHAR(length=500)) 183 | type_of_mess = Column(mysql.VARCHAR(length=500)) 184 | age_gold = Column(mysql.VARCHAR(length=500)) 185 | _created_at = Column(mysql.VARCHAR(length=500)) 186 | _id = Column(mysql.VARCHAR(length=500)) 187 | _missed = Column(mysql.VARCHAR(length=500)) 188 | _started_at = Column(mysql.VARCHAR(length=500)) 189 | _tainted = Column(mysql.VARCHAR(length=500)) 190 | _channel = Column(mysql.VARCHAR(length=500)) 191 | _trust = Column(mysql.VARCHAR(length=500)) 192 | _worker_id = Column(mysql.VARCHAR(length=500)) 193 | _country = Column(mysql.VARCHAR(length=500)) 194 | _region = Column(mysql.VARCHAR(length=500)) 195 | _city = Column(mysql.VARCHAR(length=500)) 196 | _ip = Column(mysql.VARCHAR(length=500)) 197 | word_or_shortphrase = Column(mysql.VARCHAR(length=500)) 198 | instruction = Column(mysql.VARCHAR(length=500)) 199 | type_ = Column(mysql.VARCHAR(length=500)) 200 | of_message = Column(mysql.VARCHAR(length=500)) 201 | type_of_messageconfidence = Column(mysql.VARCHAR(length=500)) 202 | word_or_shortphrase_gold = Column(mysql.VARCHAR(length=500)) 203 | tokenized = Column(mysql.INTEGER(display_width=11)) 204 | body = Column(mysql.VARCHAR(length=500)) 205 | object_id = Column(mysql.INTEGER(display_width=50)) 206 | type_of_sampling = Column(mysql.VARCHAR(length=500)) 207 | is_random_keyword = Column(mysql.INTEGER(display_width=11)) 208 | is_categorized = Column(mysql.INTEGER(display_width=11)) 209 | -------------------------------------------------------------------------------- /tweedr/models/schema.template: -------------------------------------------------------------------------------- 1 | <%! from tweedr.lib.text import UpperCamelCase, singular %>\ 2 | <%def name="column_args(column)" filter="trim"> 3 | mysql.${repr(column.type)}${', primary_key=True' if column.primary_key else ''}${', unique=True' if column.unique else ''} 4 | \ 5 | ''' 6 | This file, schema.py, can be generated by running `tweedr-database reflect`. 7 | It should only require regeneration when the database schema changes. 
8 | It does not provide much past what the following snippet does, except 9 | that it doesn't require database calls to reflect the database on start up: 10 | 11 | class Something(Base): 12 | __table__ = Table('somethings', metadata, autoload=True) 13 | 14 | You can use the following snippet to import all mapped tables. 15 | 16 | from tweedr.models.schema import ( 17 | % for table in metadata.sorted_tables: 18 | ${singular(UpperCamelCase(table.name))}, 19 | % endfor 20 | ) 21 | 22 | ''' 23 | 24 | from sqlalchemy import Column 25 | from sqlalchemy.dialects import mysql 26 | from sqlalchemy.ext.declarative import declarative_base 27 | 28 | from tweedr.models.metadata import metadata 29 | 30 | 31 | class BaseMixin(object): 32 | def __json__(self): 33 | '''This method serves to both clone the record (copying its values) 34 | as well as filter out the special sqlalchemy key (_sa_instance_state) 35 | ''' 36 | return dict((k, v) for k, v in self.__dict__.items() if k != '_sa_instance_state') 37 | 38 | def __unicode__(self): 39 | type_name = self.__class__.__name__ 40 | pairs = [u'%s=%s' % (k, v) for k, v in self.__json__().items()] 41 | return u'<{type_name} {pairs}>'.format(type_name=type_name, pairs=u' '.join(pairs)) 42 | 43 | def __str__(self): 44 | return unicode(self).encode('utf-8') 45 | 46 | def __repr__(self): 47 | return str(self) 48 | 49 | Base = declarative_base(metadata=metadata, cls=BaseMixin) 50 | % for table in metadata.sorted_tables: 51 | 52 | 53 | class ${singular(UpperCamelCase(table.name))}(Base): 54 | __tablename__ = '${table.name}' 55 | % for column in table.columns: 56 | ${column.name} = Column(${column_args(column)}) 57 | % endfor 58 | % endfor 59 | -------------------------------------------------------------------------------- /tweedr/ui/README.md: -------------------------------------------------------------------------------- 1 | ## Configuration 2 | 3 | Make sure your environment variables are available to the process that will be serving the app. 4 | 5 | See the wiki [Environment](https://github.com/dssg/tweedr/wiki/Environment) page, particular the `MYSQL_*` variables. 6 | 7 | 8 | ## Running 9 | 10 | The `tweedr-ui` CLI gets installed when you install tweedr. Simply run it: 11 | 12 | tweedr-ui 13 | 14 | 15 | ## Browsing 16 | 17 | As you can see from the output of that call, Bottle serves the application on port 8080 by default. 18 | 19 | * http://127.0.0.1:8080/ 20 | 21 | This should redirect you to `/crf` — take a look at your developer console in the browser to see the endpoints it's hitting to load new tweets and tag them. 
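If you want to exercise those endpoints outside the browser, here is a minimal sketch using the `requests` library (assuming it is installed and the server is running on the default host and port):

    import requests
    base = 'http://127.0.0.1:8080'
    # fetch a random labeled tweet from the database as JSON
    print requests.get(base + '/tokenized_labels/sample').text
    # run the CRF tagger over arbitrary text and print the tagged sequences
    print requests.post(base + '/tagger/tag', data={'text': 'bridge collapsed in joplin'}).text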
22 | -------------------------------------------------------------------------------- /tweedr/ui/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dssg/tweedr/88991d7448accb381646376bc3932185f87a7d3f/tweedr/ui/__init__.py -------------------------------------------------------------------------------- /tweedr/ui/crf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import bottle 5 | from bottle import request, redirect, static_file, mako_view as view 6 | 7 | import tweedr 8 | from tweedr.lib.text import token_re 9 | from tweedr.ml.crf.classifier import CRF 10 | from tweedr.ml.features import featurize 11 | from tweedr.ml.features.sets import crf_feature_functions 12 | from tweedr.models import DBSession, TokenizedLabel 13 | 14 | import logging 15 | logger = logging.getLogger(__name__) 16 | 17 | # tell bottle where to look for templates 18 | # We use Mako templates (*.mako) that are in the templates/ directory in the package root. 19 | # There are also Handlebars (*.bars) templates in there, but those are rendered on the client-side. 20 | bottle.TEMPLATE_PATH.append(os.path.join(tweedr.root, 'templates')) 21 | 22 | # this is the primary export 23 | app = bottle.Bottle() 24 | 25 | # globals are messy, but we don't to retrain a tagger for every request 26 | logger.debug('initializing %s (training or loading CRF using defaults)', __name__) 27 | GLOBALS = dict(tagger=CRF.default(crf_feature_functions)) 28 | 29 | 30 | @app.get('/') 31 | def root(): 32 | redirect('/crf') 33 | 34 | 35 | @app.get('/crf') 36 | @view('crf.mako') 37 | def index(): 38 | # effectively static; all the fun stuff happens in the template 39 | return dict() 40 | 41 | 42 | @app.get('/tokenized_labels/sample') 43 | def tokenized_labels_sample(): 44 | total = DBSession.query(TokenizedLabel).count() 45 | index = random.randrange(total) 46 | logger.debug('/tokenized_labels/sample: choosing #%d out of %d', index, total) 47 | tokenized_label = DBSession.query(TokenizedLabel).offset(index).limit(1).first() 48 | return tokenized_label.__json__() 49 | 50 | 51 | @app.post('/tagger/tag') 52 | def tagger_tag(): 53 | # For bottle >= 0.10, request.forms.xyz attributes return unicode strings 54 | # and an empty string if decoding fails. 
55 | text = request.forms.text 56 | tokens = token_re.findall(text.encode('utf8')) 57 | 58 | tokens_features = map(list, featurize(tokens, crf_feature_functions)) 59 | tagger = GLOBALS['tagger'] 60 | labels = tagger.predict([tokens_features])[0] 61 | 62 | sequences = [ 63 | {'name': 'tokens', 'values': tokens}, 64 | {'name': 'labels', 'values': labels}, 65 | ] 66 | for feature_function in crf_feature_functions: 67 | sequences.append({ 68 | 'name': feature_function.__name__, 69 | 'values': [', '.join(features) for features in feature_function(tokens)]}) 70 | 71 | return {'sequences': sequences} 72 | 73 | 74 | @app.route('/tagger/retrain') 75 | def tagger_retrain(): 76 | GLOBALS['tagger'] = CRF.default(crf_feature_functions, retrain=True) 77 | return dict(success=True) 78 | 79 | 80 | @app.route('/static/') 81 | def serve_static_file(filepath): 82 | return static_file(filepath, os.path.join(tweedr.root, 'static')) 83 | 84 | 85 | @app.route('/templates/') 86 | def serve_templates_file(filepath): 87 | return static_file(filepath, os.path.join(tweedr.root, 'templates')) 88 | -------------------------------------------------------------------------------- /tweedr/ui/middleware.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def add_duration_header(app): 5 | def call(environ, start_response): 6 | started = time.time() 7 | 8 | def wrapped_start_response(status, headers): 9 | duration = time.time() - started 10 | return start_response(status, headers + [('X-Duration', str(duration))]) 11 | 12 | return app(environ, wrapped_start_response) 13 | 14 | return call 15 | -------------------------------------------------------------------------------- /web/README.md: -------------------------------------------------------------------------------- 1 | # Extraction and Classification Web Tool 2 | 3 | This directory contains the code for the extraction and classification tool to crowdsource labeled data. 4 | 5 | ## Directories 6 | * Extraction-tool contains the extraction UI 7 | * Classification-tool contains the classification UI 8 | 9 | 10 | ## License 11 | 12 | Copyright © 2013 The University of Chicago. [MIT Licensed](LICENSE). 13 | -------------------------------------------------------------------------------- /web/extraction-tool/README.md: -------------------------------------------------------------------------------- 1 | # Extraction Tool 2 | 3 | This tool can be used to create labeled data from tweets. 4 | 5 | ## Setup Database for tweet output 6 | CREATE TABLE `tokenized_labels` ( 7 | `id` int(20) NOT NULL AUTO_INCREMENT, 8 | `dssg_id` int(100) NOT NULL, 9 | `tweet` varchar(500) DEFAULT NULL, 10 | `token_start` int(50) DEFAULT NULL, 11 | `token_end` int(50) DEFAULT NULL, 12 | `token_type` varchar(500) DEFAULT NULL, 13 | `token` varchar(500) DEFAULT NULL, 14 | `mturk_code` varchar(50) DEFAULT NULL, 15 | `which_sample` varchar(10) DEFAULT NULL, 16 | `which_disaster` varchar(60) DEFAULT NULL, 17 | PRIMARY KEY (`id`) 18 | ) ENGINE=InnoDB AUTO_INCREMENT=908 DEFAULT CHARSET=latin1; 19 | 20 | 21 | ## Contact 22 | 23 | Want to get in touch? Found a bug? Open up a [new issue](https://github.com/dssg/tweedr/issues/new) or email us at [dssg-qcri@googlegroups.com](mailto:dssg-qcri@googlegroups.com). 24 | 25 | 26 | ## License 27 | 28 | Copyright © 2013 The University of Chicago. [MIT Licensed](LICENSE). 
29 | -------------------------------------------------------------------------------- /web/extraction-tool/db.php: -------------------------------------------------------------------------------- 1 | body {background-image:url('background.png'); 17 | background-repeat:yes; 18 | font-family: fontC; 19 | color:#111; 20 | } 21 | 22 | 23 | 24 | "; 25 | $hash = $_SESSION['hash']; 26 | $id_num = $_GET['tweet_id']; 27 | $id2 = $_GET['id']; 28 | 29 | $random = $_GET['random']; 30 | print $tweet_id; 31 | $sample = ""; 32 | 33 | $result = mysql_query("SELECT * FROM DamageClassification WHERE id='$id2'"); 34 | 35 | 36 | 37 | $tweet = ""; 38 | if($result === FALSE) { 39 | die(mysql_error()); // TODO: better error handling 40 | } 41 | 42 | $row = mysql_fetch_array($result); 43 | //$tweet = $row['Tweet']; 44 | $id = $row['DSSG_ID']; 45 | $sample= $row['type_sample']; 46 | 47 | if (is_null($sample)){ 48 | $tweet = $row["Tweet"]; 49 | 50 | }else{ 51 | if (strcmp($sample, "keyword") == 0 || strcmp($sample, "by_keyword")==0){ 52 | $r = mysql_query("SELECT * FROM keyword_sample WHERE dssg_id = '$id' LIMIT 1"); 53 | $row2 = mysql_fetch_array($r); 54 | $tweet = $row2["text"]; 55 | 56 | }else if (strcmp($sample,"uniform")==0){ 57 | $r = mysql_query("SELECT * FROM uniform_sample WHERE dssg_id = '$id' LIMIT 1"); 58 | $row2 = mysql_fetch_array($r); 59 | $tweet = $row2["text"]; 60 | 61 | } 62 | 63 | } 64 | 65 | 66 | 67 | //Pass the data in the form of url page.php?variable=value and read the values as `echo $_GET['variable'] 68 | print $tweet . "
"; 69 | $tweet = mysql_real_escape_string($tweet); 70 | 71 | 72 | for ($i = 0; $i < 27; $i++){ 73 | $post_name = "i" . $i; 74 | print $post_name; 75 | 76 | $h = $_POST[$post_name]; 77 | print "bridge:"; 78 | if($h != "") { 79 | $t = explode(",_,_,", $h); 80 | foreach ($t as $token) { 81 | print $token. ", "; 82 | $token = mysql_real_escape_string($token); 83 | $e = explode(" ", $token); 84 | $token_start = $e[count($e)-2]; 85 | $token_end = $e[count($e)-1]; 86 | $gr = " " . $token_start . " " . $token_end; 87 | 88 | $ra = str_replace($gr,"",$token); 89 | print "start:" . $token_start . "
"; 90 | print "end:" . $token_end . "
"; 91 | 92 | $request= "INSERT INTO tokenized_labels values (NULL, '$id', '$tweet','$token_start', '$token_end', '$post_name', '$ra', '$hash', '$sample')"; 93 | $results = mysql_query($request, $link); 94 | 95 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 96 | 97 | 98 | 99 | } 100 | } 101 | 102 | 103 | 104 | 105 | 106 | } 107 | $bad_tweet = $_POST['bad']; 108 | 109 | if (strpos($bad_tweet, "NO ENTITIES, CLICK SUBMI") !== false){ 110 | 111 | $request = "UPDATE DamageClassification SET Infrastructure=0 WHERE id='$id2'"; 112 | $results = mysql_query($request, $link); 113 | 114 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 115 | 116 | 117 | $request = "UPDATE DamageClassification SET Casualty=0 WHERE id='$id2'"; 118 | $results = mysql_query($request, $link); 119 | 120 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 121 | 122 | 123 | 124 | } 125 | 126 | 127 | 128 | 129 | $request = "UPDATE DamageClassification SET is_extracted=is_extracted+1 WHERE id='$id2'"; 130 | $results = mysql_query($request, $link); 131 | 132 | echo mysql_errno($link) . ": " . mysql_error($link) . "\n"; 133 | 134 | 135 | 136 | ?> --------------------------------------------------------------------------------