├── .Makefile.d
│   ├── README.txt
│   ├── bin
│   │   └── run-then-sigkill
│   ├── command.mk
│   ├── path.mk
│   ├── procfile.mk
│   ├── vendor.mk
│   └── wait.mk
├── .gitignore
├── .nixd
│   ├── .gitignore
│   ├── bin
│   │   └── nixd-bootstrap
│   ├── lib
│   │   ├── apache.sh
│   │   └── java.sh
│   └── sbin
│       ├── elasticsearch
│       ├── kafka
│       ├── poorman
│       └── python2.7
├── LICENSE
├── Makefile
├── README.rst
├── config.json
├── docs
│   ├── Makefile
│   ├── Simple Simulated Stream.ipynb
│   ├── Twitter API Demo.ipynb
│   ├── api.rst
│   ├── birding-topology-sketch.svg
│   ├── conf.py
│   ├── config.rst
│   ├── gnip.rst
│   ├── index.rst
│   ├── make.bat
│   ├── production.rst
│   ├── solo.rst
│   ├── topology.rst
│   └── tour.rst
├── fabfile.py
├── project.clj
├── setup.py
├── src
│   └── birding
│       ├── __init__.py
│       ├── bolt.py
│       ├── config.py
│       ├── follow.py
│       ├── gnip.py
│       ├── search.py
│       ├── shelf.py
│       ├── spout.py
│       ├── twitter.py
│       └── version.py
├── tasks.py
├── topologies
│   └── birding.clj
└── virtualenvs
    └── birding.txt

/.Makefile.d/README.txt:
--------------------------------------------------------------------------------
1 | make recipes based on github.com/rduplain/Makefile.d
--------------------------------------------------------------------------------
/.Makefile.d/bin/run-then-sigkill:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Wrap command line, sending a kill -9 when receiving a signal.
3 | 
4 | "$@" &
5 | pid=$!
6 | 
7 | trap "trap - INT TERM EXIT; kill -9 $pid" INT TERM EXIT
8 | wait $pid
--------------------------------------------------------------------------------
/.Makefile.d/command.mk:
--------------------------------------------------------------------------------
1 | %-command:
2 | 	@which $* >/dev/null || ( echo "Requires '$*' command."; false )
3 | 
4 | .PHONY: %-command
--------------------------------------------------------------------------------
/.Makefile.d/path.mk:
--------------------------------------------------------------------------------
1 | DIR := $(dir $(lastword $(MAKEFILE_LIST)))
2 | PROJECT_ROOT := $(abspath $(dir $(abspath $(DIR))))
3 | 
4 | export PROJECT_ROOT
--------------------------------------------------------------------------------
/.Makefile.d/procfile.mk:
--------------------------------------------------------------------------------
1 | DIR := $(dir $(lastword $(MAKEFILE_LIST)))
2 | 
3 | include $(DIR)/path.mk
4 | 
5 | Procfile := $(PROJECT_ROOT)/Procfile
6 | 
7 | # The `proc` target resets the procfile.
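# Usage sketch (the process names are illustrative, not taken from this
# repository's Makefile):
#
#     make proc proc-web proc-worker
#
# truncates the Procfile, then appends one entry per `proc-%` goal, e.g.
# "web: make --no-print-directory run-web", assuming the including
# Makefile provides a matching `run-web` target.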
8 | proc: 9 | @rm -f $(Procfile) 10 | @touch $(Procfile) 11 | 12 | proc-%: 13 | @echo "$*: make --no-print-directory run-$*" >> $(Procfile) 14 | -------------------------------------------------------------------------------- /.Makefile.d/vendor.mk: -------------------------------------------------------------------------------- 1 | __FILE__ := $(lastword $(MAKEFILE_LIST)) 2 | DIR := $(dir $(lastword $(MAKEFILE_LIST))) 3 | 4 | include $(DIR)/path.mk 5 | 6 | NIXD_VERSION := fa4fc39e7fd8c9a9183684e349c81931326d7523 7 | NIXD_SHA1SUM := b30e0e2927fef8b492223c1daf35e9f818584065 8 | 9 | VENDOR := $(PROJECT_ROOT)/.nixd 10 | nixd := $(VENDOR)/bin/nixd 11 | 12 | export VENDOR 13 | 14 | $(nixd): $(__FILE__) 15 | @rm -f $@ 16 | @$(VENDOR)/bin/nixd-bootstrap $(nixd) $(NIXD_VERSION) sha1 $(NIXD_SHA1SUM) 17 | 18 | vendor-%: $(nixd) 19 | @$(nixd) install $* 20 | -------------------------------------------------------------------------------- /.Makefile.d/wait.mk: -------------------------------------------------------------------------------- 1 | DIR := $(dir $(lastword $(MAKEFILE_LIST))) 2 | 3 | include $(DIR)/command.mk 4 | 5 | wait-tcp-%: nc-command 6 | @bash -c "while ! nc -z $(HOST) $*; do sleep 0.25; done" 7 | 8 | wait-tcp-%: HOST := localhost 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .#* 2 | *~ 3 | \#* 4 | *.aux 5 | birding.yml 6 | _build 7 | *.codekit 8 | .coverage 9 | *.db 10 | *.db-journal 11 | /.develop 12 | /dist/ 13 | .DS_Store 14 | *.egg 15 | *.egg-info 16 | *.eggs 17 | /.emacs* 18 | .ipynb_checkpoints 19 | *.lock 20 | *.log 21 | *.log.[0-9]* 22 | __MACOSX 23 | *.out 24 | *.pdf 25 | /Procfile 26 | *.pyc 27 | /_resources 28 | *.sqlite 29 | *.sqlite-journal 30 | *.sw* 31 | Untitled*.ipynb 32 | -------------------------------------------------------------------------------- /.nixd/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/nixd 2 | /opt 3 | /src 4 | /usr 5 | /var 6 | -------------------------------------------------------------------------------- /.nixd/bin/nixd-bootstrap: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Bootstrap a nixd project by downloading nixd and verifying checksum. 3 | # 4 | # Note: This uses downloaded nixd to verify checksum; use trusted https. 5 | # This file is provided by the nixd project. 6 | 7 | # Exit immediately if a command error or non-zero return occurs. 8 | set -e 9 | 10 | GITHUB_REPO=${GITHUB_REPO:-rduplain/nixd} 11 | GITHUB_BASE=${GITHUB_BASE:-https://raw.githubusercontent.com} 12 | GITHUB_PATTERN=${GITHUB_PATTERN:-$GITHUB_BASE/$GITHUB_REPO/COMMIT/bin/nixd} 13 | 14 | usage() { 15 | # Print program usage to stderr and return 2. 16 | 17 | exec >&2 # Redirect all further stdout to stderr. 18 | 19 | if [ $# -gt 0 ]; then 20 | # Print message argument, if given. 21 | echo "$@" 22 | echo 23 | fi 24 | 25 | echo "usage: $0 DESTINATION NIXD_GIT_COMMIT_OR_URL sha1 CHECKSUM" 26 | echo 27 | echo "DESTINATION - destination filepath of resulting download" 28 | echo "NIXD_GIT_COMMIT_OR_URL - nixd git commit hash or full download URL" 29 | echo "CHECKSUM - checksum of nixd executable file" 30 | return 2 31 | } 32 | 33 | main() { 34 | # Go get nixd as directed. 
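    #
    # Example invocation, as wired up by .Makefile.d/vendor.mk:
    #
    #     nixd-bootstrap .nixd/bin/nixd $(NIXD_VERSION) sha1 $(NIXD_SHA1SUM)
    #
    # Arguments: destination filepath, git commit (or full URL), hash
    # function name, expected checksum. Note the TEST probe further below:
    # it feeds a bogus checksum to the downloaded nixd, which must fail --
    # proof that the download actually verifies checksums.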
35 | 
36 |     if [ $# -ne 4 ]; then
37 |         usage
38 |     fi
39 | 
40 |     local nixd="$1"
41 |     local git_commit_or_url=$2
42 |     local hash_function=$3
43 |     local hash_value=$4
44 |     shift 4
45 | 
46 |     if ! which curl > /dev/null; then
47 |         echo "Requires 'curl' program, but could not find it in PATH." >&2
48 |         return 3
49 |     fi
50 | 
51 |     if [ -e "$nixd" ]; then
52 |         echo "Abort. File already exists: $nixd" >&2
53 |         return 2
54 |     fi
55 | 
56 |     trap "if [ -e '$nixd' ]; then mv -v '$nixd' '$nixd'.rej; fi" INT TERM EXIT
57 | 
58 |     local url
59 |     case "$git_commit_or_url" in
60 |         [A-Fa-f0-9]*)
61 |             url=${GITHUB_PATTERN/COMMIT/$git_commit_or_url}
62 |             ;;
63 |         *)
64 |             url=$git_commit_or_url
65 |             ;;
66 |     esac
67 | 
68 |     local dir=$( dirname "$nixd" )
69 |     mkdir -p "$dir"
70 | 
71 |     curl -L -o "$nixd" "$url"
72 | 
73 |     if ! grep -q 'checksum() {' "$nixd"; then
74 |         echo "Abort. Download is invalid. Is the commit hash / URL valid?" >&2
75 |         return 1
76 |     fi
77 | 
78 |     chmod 775 "$nixd"
79 | 
80 |     if "$nixd" checksum "$nixd" "$hash_function" TEST > /dev/null 2>&1; then
81 |         echo "Abort. Download appears to accept any checksum." >&2
82 |         return 1
83 |     fi
84 | 
85 |     if "$nixd" checksum "$nixd" "$hash_function" "$hash_value"; then
86 |         trap - INT TERM EXIT
87 |         echo "Successfully bootstrapped nixd at $nixd"
88 |         return 0
89 |     else
90 |         echo "Abort. File checksum is invalid: $nixd" >&2
91 |         return 1
92 |     fi
93 | }
94 | 
95 | main "$@"
96 | 
--------------------------------------------------------------------------------
/.nixd/lib/apache.sh:
--------------------------------------------------------------------------------
1 | # Utilities for installations using Apache (apache.org) projects.
2 | 
3 | locate_apache_mirror() {
4 |     # Print to stdout a full URL matching the closest mirror of given filepath.
5 | 
6 |     curl "https://www.apache.org/dyn/closer.cgi?path=$1" |\
7 |         grep -i -A 10 'we suggest the following' |\
8 |         grep href | head -1 | cut -f 2 -d '"'
9 | }
--------------------------------------------------------------------------------
/.nixd/lib/java.sh:
--------------------------------------------------------------------------------
1 | # Utilities for installations using Java/JVM.
2 | 
3 | check_java_version() {
4 |     # Check if `java` is available and is version 1.7+.
5 | 
6 |     if ! java_version=$(java -version 2>&1); then
7 |         nixd_error "No java found."
8 |         if [ -n "$java_version" ]; then
9 |             nixd_error "$java_version"
10 |         fi
11 |         return 1
12 |     fi
13 | 
14 |     if [ -z "$java_version" ]; then
15 |         nixd_echo '`java -version` is empty.'
16 |         return 1
17 |     elif echo "$java_version" | grep -q '\bversion "1.7'; then
18 |         nixd_echo "Found Java 1.7."
19 |     elif echo "$java_version" | grep -q '\bversion "1.8'; then
20 |         nixd_echo "Found Java 1.8."
21 |     else
22 |         nixd_error "Java version is unsupported:"
23 |         nixd_error "$java_version"
24 |         return 1
25 |     fi
26 | }
--------------------------------------------------------------------------------
/.nixd/sbin/elasticsearch:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | . $NIXD_LIB/java.sh
4 | 
5 | UNPACKED=elasticsearch-1.7.1
6 | ARCHIVE=$UNPACKED.tar.gz
7 | SHA1SUM=0984ae27624e57c12c33d4a559c3ebae25e74508
8 | 
9 | BIN=$NIXD_PREFIX/bin
10 | EXE_BASENAME=elasticsearch
11 | DESTINATION=$NIXD_OPT/elasticsearch
12 | ES_YML=$DESTINATION/config/elasticsearch.yml
13 | LOGGING_YML=$DESTINATION/config/logging.yml
14 | ES_DATA=$NIXD_VAR/lib/elasticsearch
15 | 
16 | # Space-delimited list of elasticsearch plugin identifiers.
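# Identifiers take the GitHub "user/repo" form accepted by the bundled
# `bin/plugin --install` tool (see install_plugins below); for example,
# royrusso/elasticsearch-HQ provides the HQ management/monitoring UI.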
17 | PLUGINS="royrusso/elasticsearch-HQ" 18 | 19 | # Log full JSON DSL detail when queries are above this threshold. 20 | SLOW_THRESHOLD=10s 21 | 22 | check() { 23 | nixd_ls $BIN/$EXE_BASENAME $DESTINATION 24 | nixd_newer_than $SYNONYMS_SRC 25 | } 26 | 27 | resources() { 28 | local url=https://download.elastic.co/elasticsearch/elasticsearch/$ARCHIVE 29 | echo $url $ARCHIVE sha1 $SHA1SUM 30 | } 31 | 32 | pretest() { 33 | check_java_version 34 | } 35 | 36 | install() { 37 | rm -fr $UNPACKED $DESTINATION 38 | tar -xzf $ARCHIVE 39 | mv -v $UNPACKED $DESTINATION 40 | mkdir -p $BIN 41 | cd $BIN 42 | ln -sf $DESTINATION/bin/$EXE_BASENAME $EXE_BASENAME 43 | generate_elasticsearch_yml > $ES_YML 44 | generate_logging_yml > $LOGGING_YML 45 | install_plugins $PLUGINS 46 | } 47 | 48 | install_plugins() { 49 | for plugin in "$@"; do 50 | nixd_echo "Installing plugin: $plugin ..." 51 | $DESTINATION/bin/plugin --install $plugin 52 | done 53 | } 54 | 55 | # It's dangerous to go alone. 56 | # elasticsearch silently handles yml that does not make any sense. 57 | 58 | generate_elasticsearch_yml() { 59 | # https://www.elastic.co/guide/en/elasticsearch/guide/current/_important_configuration_changes.html 60 | 61 | echo 'cluster.name: birding' 62 | if [ -n "$HOSTNAME" ]; then 63 | echo "node.name: $HOSTNAME" 64 | fi 65 | cat -< $KAFKA_CONF_PATH 43 | generate_zookeeper_properties > $ZOOKEEPER_CONF_PATH 44 | } 45 | 46 | generate_kafka_properties() { 47 | # Print to stdout a properties configuration file for kafka. 48 | 49 | cat -<`_ 35 | * `Stable `_ 36 | 37 | 38 | Interaction 39 | ----------- 40 | 41 | Given a checkout of birding, to run birding in place, see `Downloading and 42 | running birding `_. 43 | 44 | 45 | Python Support 46 | -------------- 47 | 48 | The birding project uses Python 2.7 and will also support Python 3.4+ when 49 | underlying dependencies support Python 3. 50 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "library": "", 3 | "topology_specs": "topologies/", 4 | "virtualenv_specs": "virtualenvs/", 5 | "envs": { 6 | "prod": { 7 | "nimbus": "localhost", 8 | "workers": ["localhost"], 9 | "log": { 10 | "path": "", 11 | "max_bytes": 1000000, 12 | "backup_count": 10, 13 | "level": "info" 14 | }, 15 | "virtualenv_root": "./" 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
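# ALLSPHINXOPTS composes, in order: the doctree cache location (-d), any
# paper-size options selected via PAPER=a4 or PAPER=letter, user-supplied
# SPHINXOPTS, and "." as the source directory handed to sphinx-build.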
19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/birding.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/birding.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. 
 You won't be able to view it unless you put it in" \
99 | 	      "~/Library/Documentation/Help or install it in your application" \
100 | 	      "bundle."
101 | 
102 | devhelp:
103 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
104 | 	@echo
105 | 	@echo "Build finished."
106 | 	@echo "To view the help file:"
107 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/birding"
108 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/birding"
109 | 	@echo "# devhelp"
110 | 
111 | epub:
112 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
113 | 	@echo
114 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
115 | 
116 | latex:
117 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
118 | 	@echo
119 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
120 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
121 | 	      "(use \`make latexpdf' here to do that automatically)."
122 | 
123 | latexpdf:
124 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
125 | 	@echo "Running LaTeX files through pdflatex..."
126 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
127 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
128 | 
129 | latexpdfja:
130 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
131 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
132 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
133 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
134 | 
135 | text:
136 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
137 | 	@echo
138 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
139 | 
140 | man:
141 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
142 | 	@echo
143 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
144 | 
145 | texinfo:
146 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
147 | 	@echo
148 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
149 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
150 | 	      "(use \`make info' here to do that automatically)."
151 | 
152 | info:
153 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
154 | 	@echo "Running Texinfo files through makeinfo..."
155 | 	make -C $(BUILDDIR)/texinfo info
156 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
157 | 
158 | gettext:
159 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
160 | 	@echo
161 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
162 | 
163 | changes:
164 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
165 | 	@echo
166 | 	@echo "The overview file is in $(BUILDDIR)/changes."
167 | 
168 | linkcheck:
169 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
170 | 	@echo
171 | 	@echo "Link check complete; look for any errors in the above output " \
172 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
173 | 
174 | doctest:
175 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
176 | 	@echo "Testing of doctests in the sources finished, look at the " \
177 | 	      "results in $(BUILDDIR)/doctest/output.txt."
178 | 
179 | coverage:
180 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
181 | 	@echo "Testing of coverage in the sources finished, look at the " \
182 | 	      "results in $(BUILDDIR)/coverage/python.txt."
183 | 
184 | xml:
185 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
186 | 	@echo
187 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/Simple Simulated Stream.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Generate sample data.\n", 12 | "\n", 13 | "import datetime\n", 14 | "import random\n", 15 | "\n", 16 | "URLS = [\n", 17 | " 'http://time.com/3945677/greece-referendum-vote/',\n", 18 | " 'http://fortune.com/this-week-in-photos-062715/',\n", 19 | " 'http://www.bbc.com/news/technology-33379571',\n", 20 | "]\n", 21 | "\n", 22 | "class StepDatetimeGenerator(object):\n", 23 | " def __init__(self, now=None, step=None):\n", 24 | " if now is None:\n", 25 | " now = datetime.datetime.now()\n", 26 | " if step is None:\n", 27 | " step = datetime.timedelta(seconds=0.25)\n", 28 | " self.currently = now\n", 29 | " self._step = step\n", 30 | "\n", 31 | " def step(self):\n", 32 | " self.currently, dt = self.currently + self._step, self.currently\n", 33 | " return dt\n", 34 | "\n", 35 | "def generate_url(urls=None):\n", 36 | " if urls is None:\n", 37 | " urls = URLS\n", 38 | " return random.choice(urls)\n", 39 | "\n", 40 | "def generate_tuples(dt_gen=None, count=10):\n", 41 | " if dt_gen is None:\n", 42 | " start = datetime.datetime.now() - datetime.timedelta(seconds=1800)\n", 43 | " dt_gen = StepDatetimeGenerator()\n", 44 | " i = 0\n", 45 | " while i < count:\n", 46 | " yield generate_url(), dt_gen.step().isoformat()\n", 47 | " i += 1" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "# Define processing steps.\n", 59 | "\n", 60 | "from birding import Twitter, TwitterSearchManager\n", 61 | "manager = TwitterSearchManager(Twitter.from_oauth_file())\n", 62 | "\n", 63 | "def search_url(*tup):\n", 64 | " url = tup[0]\n", 65 | " return manager.search(q=url)\n", 66 | "\n", 67 | "def lookup_tweets(search_result):\n", 68 | " return manager.lookup_search_result(search_result)\n", 69 | "\n", 70 | "def dump_lookup_result(lookup_result):\n", 71 | " return manager.dump(lookup_result)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "@onnoh -- https://twitter.com/onnoh\n", 86 | "BBC News - Reddit in uproar after staff sacking http://t.co/I1EJX0NVEe\n", 87 | "\n", 88 | "@wopot -- https://twitter.com/wopot\n", 89 | "#Reddit in uproar after staff sacking - BBC News\n", 90 | "http://t.co/0aLr16ftFZ\n", 91 | "\n", 92 | "@PaulineGrantTO -- https://twitter.com/PaulineGrantTO\n", 93 | "Reddit in uproar after staff sacking - BBC News http://t.co/wVE75UHN47\n", 94 | "\n", 95 | "@FactsVsOpinion -- https://twitter.com/FactsVsOpinion\n", 96 | "RT @PaulineGrantTO: Reddit in uproar after staff sacking - BBC News http://t.co/wVE75UHN47\n", 97 | "\n", 98 | "@kupathrak -- https://twitter.com/kupathrak\n", 99 | "RT @wopot: #Reddit in uproar after staff sacking - BBC News\n", 100 | "http://t.co/0aLr16ftFZ\n", 101 | "\n", 102 | "@awrd -- https://twitter.com/awrd\n", 103 | "Will be very interesting to 
see what happens. 'Reddit in uproar after staff sacking' http://t.co/EotOibST8N\n", 104 | "\n", 105 | "@lexinerus -- https://twitter.com/lexinerus\n", 106 | "ReTw 8mw: It's amazing that reddit makes headlines like this and there are people who still don't even know what r… http://t.co/z5YC0iLdIp\n", 107 | "\n", 108 | "@Narconavax -- https://twitter.com/Narconavax\n", 109 | "BBC News - Reddit in uproar after staff sacking http://t.co/rENnPHOpx4\n", 110 | "\n", 111 | "@elliotecweb -- https://twitter.com/elliotecweb\n", 112 | "The world is turning upside-down: \"Reddit in uproar after staff sacking\" http://t.co/VJDYPhc7Tm\n", 113 | "\n", 114 | "@jaszhix -- https://twitter.com/jaszhix\n", 115 | "Reddit in uproar after staff sacking http://t.co/7VsgZakrst\n", 116 | "\n", 117 | "@JenkoSchmidt -- https://twitter.com/JenkoSchmidt\n", 118 | "You did this, somehow, didn't you, @TonyAbbottMHR? #Reddit #Auspol #RedditRevolt http://t.co/BIxtUb7rht\n", 119 | "\n", 120 | "@8mw -- https://twitter.com/8mw\n", 121 | "It's amazing that reddit makes headlines like this and there are people who still don't even know what reddit is. … http://t.co/8LxkVLsJKC\n", 122 | "\n", 123 | "@Hideki_Manga -- https://twitter.com/Hideki_Manga\n", 124 | "“Reddit in uproar after staff sacking” http://t.co/uA2WJX8EF0 HAHAHAHAA!\n", 125 | "\n", 126 | "@lexinerus -- https://twitter.com/lexinerus\n", 127 | "ReTw lexinerus: ReTw 8mw: It's amazing that reddit makes headlines like this and there are people who still don't … http://t.co/z5YC0iLdIp\n", 128 | "\n", 129 | "@MatthewCallaway -- https://twitter.com/MatthewCallaway\n", 130 | "Reddit in uproar after staff sacking - BBC News http://t.co/HxZ2UHe9S9 #RedditRevolt\n", 131 | "@tradingoptions2 -- https://twitter.com/tradingoptions2\n", 132 | "This week in photos, June 27 &#8211; July 3,&nbsp;2015via @FortuneMagazine http://t.co/nQBRmUxWbJ\n", 133 | "\n", 134 | "@editorialiste -- https://twitter.com/editorialiste\n", 135 | "This week in photos, June 27 - July 3, 2015: http://t.co/GLyxPqJCpB\n", 136 | "\n", 137 | "@tailoredapp1 -- https://twitter.com/tailoredapp1\n", 138 | "RT @markgongloff: This week in photos http://t.co/SZnqTs2uHy via @FortuneMagazine\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "# Put it all together.\n", 144 | "\n", 145 | "for tup in generate_tuples(count=2):\n", 146 | " print(dump_lookup_result(lookup_tweets(search_url(*tup))))" 147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 2", 153 | "language": "python", 154 | "name": "python2" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 2 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython2", 166 | "version": "2.7.10" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 0 171 | } 172 | -------------------------------------------------------------------------------- /docs/Twitter API Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from birding.twitter import Twitter\n", 12 | "twitter = Twitter.from_oauth_file()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": false, 20 | "scrolled": false 21 | }, 22 | "outputs": [ 
23 | { 24 | "data": { 25 | "text/plain": [ 26 | "{u'completed_in': 0.118,\n", 27 | " u'count': 100,\n", 28 | " u'max_id': 617018231207895040,\n", 29 | " u'max_id_str': u'617018231207895040',\n", 30 | " u'query': u'http%3A%2F%2Fwww.amazon.com%2FAmazon-SK705DI-Echo%2Fdp%2FB00X4WHP5E',\n", 31 | " u'refresh_url': u'?since_id=617018231207895040&q=http%3A%2F%2Fwww.amazon.com%2FAmazon-SK705DI-Echo%2Fdp%2FB00X4WHP5E&include_entities=1',\n", 32 | " u'since_id': 0,\n", 33 | " u'since_id_str': u'0'}" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "url = 'http://www.amazon.com/Amazon-SK705DI-Echo/dp/B00X4WHP5E' # Amazon Echo product page\n", 43 | "search_result = twitter.search.tweets(q=url, count=100)\n", 44 | "search_result['search_metadata']" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "{u'contributors': None,\n", 58 | " u'coordinates': None,\n", 59 | " u'created_at': u'Fri Jul 03 17:13:04 +0000 2015',\n", 60 | " u'entities': {u'hashtags': [{u'indices': [17, 24], u'text': u'Amazon'},\n", 61 | " {u'indices': [25, 30], u'text': u'echo'},\n", 62 | " {u'indices': [101, 106], u'text': u'tech'}],\n", 63 | " u'media': [{u'display_url': u'pic.twitter.com/TM0F66XVcY',\n", 64 | " u'expanded_url': u'http://twitter.com/SXEElectronics/status/617018231207895040/photo/1',\n", 65 | " u'id': 617018231157587968,\n", 66 | " u'id_str': u'617018231157587968',\n", 67 | " u'indices': [107, 129],\n", 68 | " u'media_url': u'http://pbs.twimg.com/media/CJAWz-gWgAAvEZD.jpg',\n", 69 | " u'media_url_https': u'https://pbs.twimg.com/media/CJAWz-gWgAAvEZD.jpg',\n", 70 | " u'sizes': {u'large': {u'h': 652, u'resize': u'fit', u'w': 416},\n", 71 | " u'medium': {u'h': 652, u'resize': u'fit', u'w': 416},\n", 72 | " u'small': {u'h': 532, u'resize': u'fit', u'w': 340},\n", 73 | " u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}},\n", 74 | " u'type': u'photo',\n", 75 | " u'url': u'http://t.co/TM0F66XVcY'}],\n", 76 | " u'symbols': [],\n", 77 | " u'urls': [{u'display_url': u'buff.ly/1GTLSex',\n", 78 | " u'expanded_url': u'http://buff.ly/1GTLSex',\n", 79 | " u'indices': [78, 100],\n", 80 | " u'url': u'http://t.co/eT1C4t205A'}],\n", 81 | " u'user_mentions': [{u'id': 176774540,\n", 82 | " u'id_str': u'176774540',\n", 83 | " u'indices': [59, 67],\n", 84 | " u'name': u'CGP Grey',\n", 85 | " u'screen_name': u'cgpgrey'}]},\n", 86 | " u'favorite_count': 0,\n", 87 | " u'favorited': False,\n", 88 | " u'geo': None,\n", 89 | " u'id': 617018231207895040,\n", 90 | " u'id_str': u'617018231207895040',\n", 91 | " u'in_reply_to_screen_name': None,\n", 92 | " u'in_reply_to_status_id': None,\n", 93 | " u'in_reply_to_status_id_str': None,\n", 94 | " u'in_reply_to_user_id': None,\n", 95 | " u'in_reply_to_user_id_str': None,\n", 96 | " u'is_quote_status': False,\n", 97 | " u'lang': u'en',\n", 98 | " u'metadata': {u'iso_language_code': u'en', u'result_type': u'recent'},\n", 99 | " u'place': None,\n", 100 | " u'possibly_sensitive': False,\n", 101 | " u'retweet_count': 0,\n", 102 | " u'retweeted': False,\n", 103 | " u'source': u'Buffer',\n", 104 | " u'text': u'Has anyone tried #Amazon #echo yet? Reviews look good, but @cgpgrey hated it. 
http://t.co/eT1C4t205A #tech http://t.co/TM0F66XVcY',\n", 105 | " u'truncated': False,\n", 106 | " u'user': {u'contributors_enabled': False,\n", 107 | " u'created_at': u'Wed Jun 03 11:44:15 +0000 2015',\n", 108 | " u'default_profile': False,\n", 109 | " u'default_profile_image': False,\n", 110 | " u'description': u'Better living through technology. Welcome to the future of the clock.',\n", 111 | " u'entities': {u'description': {u'urls': []},\n", 112 | " u'url': {u'urls': [{u'display_url': u'sxeelectronics.com',\n", 113 | " u'expanded_url': u'http://www.sxeelectronics.com',\n", 114 | " u'indices': [0, 22],\n", 115 | " u'url': u'http://t.co/PIt61hmI6Y'}]}},\n", 116 | " u'favourites_count': 14,\n", 117 | " u'follow_request_sent': False,\n", 118 | " u'followers_count': 45,\n", 119 | " u'following': False,\n", 120 | " u'friends_count': 395,\n", 121 | " u'geo_enabled': True,\n", 122 | " u'has_extended_profile': False,\n", 123 | " u'id': 3307022770,\n", 124 | " u'id_str': u'3307022770',\n", 125 | " u'is_translation_enabled': False,\n", 126 | " u'is_translator': False,\n", 127 | " u'lang': u'en',\n", 128 | " u'listed_count': 1,\n", 129 | " u'location': u'New York, USA',\n", 130 | " u'name': u'SXE Electronics',\n", 131 | " u'notifications': False,\n", 132 | " u'profile_background_color': u'000000',\n", 133 | " u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',\n", 134 | " u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',\n", 135 | " u'profile_background_tile': False,\n", 136 | " u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/3307022770/1434378229',\n", 137 | " u'profile_image_url': u'http://pbs.twimg.com/profile_images/610447596641890304/wrCuxdI0_normal.jpg',\n", 138 | " u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/610447596641890304/wrCuxdI0_normal.jpg',\n", 139 | " u'profile_link_color': u'D31145',\n", 140 | " u'profile_sidebar_border_color': u'000000',\n", 141 | " u'profile_sidebar_fill_color': u'000000',\n", 142 | " u'profile_text_color': u'000000',\n", 143 | " u'profile_use_background_image': False,\n", 144 | " u'protected': False,\n", 145 | " u'screen_name': u'SXEElectronics',\n", 146 | " u'statuses_count': 29,\n", 147 | " u'time_zone': None,\n", 148 | " u'url': u'http://t.co/PIt61hmI6Y',\n", 149 | " u'utc_offset': None,\n", 150 | " u'verified': False}}" 151 | ] 152 | }, 153 | "execution_count": 3, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "assert len(search_result['statuses']) > 0, 'Nothing matches search.'\n", 160 | "search_result['statuses'][0]" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 4, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "@SXEElectronics -- https://twitter.com/SXEElectronics\n", 175 | "Has anyone tried #Amazon #echo yet? Reviews look good, but @cgpgrey hated it. http://t.co/eT1C4t205A #tech http://t.co/TM0F66XVcY\n", 176 | "\n", 177 | "@WildCougConfess -- https://twitter.com/WildCougConfess\n", 178 | "http://t.co/BgkUw1tqDV\n", 179 | "\n", 180 | "@tomdebaere -- https://twitter.com/tomdebaere\n", 181 | "Amazon Echo: Always Ready, Connected, and Fast. 
#cool #wantone http://t.co/ONkIcTHMgN\n", 182 | "\n", 183 | "@cortanabot -- https://twitter.com/cortanabot\n", 184 | "Cortana & Siri done right: AmazonEcho http://t.co/tC0NOrIEuw\n", 185 | "\n", 186 | "@pcolazurdo -- https://twitter.com/pcolazurdo\n", 187 | "#Cortana & #Siri done right: #AmazonEcho http://t.co/uSuXHT2S1n #Iwantone\n", 188 | "\n", 189 | "@assistapps -- https://twitter.com/assistapps\n", 190 | "A smart assistant who can hear you despite disturbance. \n", 191 | "Alexa when is bringing Echo home http://t.co/9vyr7elBS3\n", 192 | "\n", 193 | "@assistapps -- https://twitter.com/assistapps\n", 194 | "have you tried? Amazon Echo http://t.co/9vyr7elBS3\n", 195 | "\n", 196 | "@virgiliocorrado -- https://twitter.com/virgiliocorrado\n", 197 | "@MarilynDenisCTV @darrenblamb @1045CHUMFM @RDMCHUMFM have you tried? Amazon Echo http://t.co/9fVBMvjFMu\n", 198 | "\n", 199 | "@kelvinaiesec -- https://twitter.com/kelvinaiesec\n", 200 | "小精靈就是了\n", 201 | "\n", 202 | " http://t.co/1LeA5GXD3h\n", 203 | "\n", 204 | "@alanskyy -- https://twitter.com/alanskyy\n", 205 | "#Amazon want's to know what goes on in your house, embrace #AmazonEcho, ease dropping at no extra charge . - http://t.co/QyeCwi21vB\n", 206 | "\n", 207 | "@ShirlLacsamana -- https://twitter.com/ShirlLacsamana\n", 208 | "RT @Dutchcowboy: really, i did not just order 2 more @amazonecho ’s ... 8-) http://t.co/2GCzBbqwJk\n", 209 | "\n", 210 | "@Dutchcowboy -- https://twitter.com/Dutchcowboy\n", 211 | "really, i did not just order 2 more @amazonecho ’s ... 8-) http://t.co/2GCzBbqwJk\n", 212 | "\n", 213 | "@assistapps -- https://twitter.com/assistapps\n", 214 | "A smart assistant who can hear you despite disturbance. \n", 215 | "Alexa when is bringing Echo home http://t.co/9vyr7elBS3\n", 216 | "\n", 217 | "@stiver_ufo -- https://twitter.com/stiver_ufo\n", 218 | "Amazon Echo is here http://t.co/gpEpeG81Hz\n", 219 | "\n", 220 | "@Rachel__Ray -- https://twitter.com/Rachel__Ray\n", 221 | "@notstevieb I need this for the @audible addiction http://t.co/YE3IFWOnVc @amazonecho\n", 222 | "\n", 223 | "@marceloyamada -- https://twitter.com/marceloyamada\n", 224 | "Mais um sinal de que o futuro já chegou. Assistente doméstico comandado por voz, disponível comercialmente por 180 d…https://t.co/im0eC80CVr\n", 225 | "\n", 226 | "@Henrikop -- https://twitter.com/Henrikop\n", 227 | "High expectations: Amazon Echo. IoT next level. AWS Lambda can however extend it to revolutionary heights...\n", 228 | "http://t.co/spVYx9CDp7\n", 229 | "\n", 230 | "@NEALDOG2007 -- https://twitter.com/NEALDOG2007\n", 231 | "Amazon Echo: Always Ready, Connected, and Fast. http://t.co/oHuDRKolgP\n", 232 | "\n", 233 | "@RunLove -- https://twitter.com/RunLove\n", 234 | "Amazon Echo. Basically like Rosie from TheJetsons. But creepier... @MrMikeCalta @AngryGalvin @gregrahm @SeniorSpanish http://t.co/ltc7NiMmLD\n", 235 | "\n", 236 | "@cz516 -- https://twitter.com/cz516\n", 237 | "is @amazon's ECHO the next BIGGGG thing?\n", 238 | "\n", 239 | "http://t.co/WBZsTBT7nx\n", 240 | "\n", 241 | "@matthew1471 -- https://twitter.com/matthew1471\n", 242 | "7 levels of creepy : http://t.co/XLWcNF6Z84\n", 243 | "\n", 244 | "@aughban -- https://twitter.com/aughban\n", 245 | "hey @AmazonUK when are you going to sell this http://t.co/12O23nMXMj in the UK?? Why don't you want my money?? :((((\n", 246 | "\n", 247 | "@jserna -- https://twitter.com/jserna\n", 248 | "Amazon Echo integration with WeMo and Hue interesting. But I'm waiting for HomeKit. 
http://t.co/0bGmTFXUgH\n", 249 | "\n", 250 | "@madebyAi -- https://twitter.com/madebyAi\n", 251 | "Will @Amazon Echo finally kick-off the revolution in voice-powered interfaces? http://t.co/M49QYXlmgL\n", 252 | "\n", 253 | "@itchypaws -- https://twitter.com/itchypaws\n", 254 | "Nice - you can use Amazon Echo to control your home when it integrates with WeMo http://t.co/mwW3slx5RF\n", 255 | "\n", 256 | "@TeriRadichel -- https://twitter.com/TeriRadichel\n", 257 | "Amazon Echo ~ Intrigued http://t.co/A8ecZynEHm\n", 258 | "\n", 259 | "@klotzbrocken -- https://twitter.com/klotzbrocken\n", 260 | "Amazon Echo ist in USA verfügbar. Der Retail muss sich einiges gefallen lassen #retailproblems http://t.co/UFW3SqHoMi http://t.co/eK3jZb4WVF\n", 261 | "\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "import textwrap\n", 267 | "for status in search_result['statuses']:\n", 268 | " print(textwrap.dedent(u\"\"\"\n", 269 | " @{screen_name} -- https://twitter.com/{screen_name}\n", 270 | " {text}\n", 271 | " \"\"\").strip().format(\n", 272 | " screen_name=status['user']['screen_name'],\n", 273 | " text=status['text'],\n", 274 | " ))\n", 275 | " print('')" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 5, 281 | "metadata": { 282 | "collapsed": false, 283 | "scrolled": true 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "{u'contributors': None,\n", 290 | " u'coordinates': None,\n", 291 | " u'created_at': u'Fri Jul 03 17:13:04 +0000 2015',\n", 292 | " u'entities': {u'hashtags': [{u'indices': [17, 24], u'text': u'Amazon'},\n", 293 | " {u'indices': [25, 30], u'text': u'echo'},\n", 294 | " {u'indices': [101, 106], u'text': u'tech'}],\n", 295 | " u'media': [{u'display_url': u'pic.twitter.com/TM0F66XVcY',\n", 296 | " u'expanded_url': u'http://twitter.com/SXEElectronics/status/617018231207895040/photo/1',\n", 297 | " u'id': 617018231157587968,\n", 298 | " u'id_str': u'617018231157587968',\n", 299 | " u'indices': [107, 129],\n", 300 | " u'media_url': u'http://pbs.twimg.com/media/CJAWz-gWgAAvEZD.jpg',\n", 301 | " u'media_url_https': u'https://pbs.twimg.com/media/CJAWz-gWgAAvEZD.jpg',\n", 302 | " u'sizes': {u'large': {u'h': 652, u'resize': u'fit', u'w': 416},\n", 303 | " u'medium': {u'h': 652, u'resize': u'fit', u'w': 416},\n", 304 | " u'small': {u'h': 532, u'resize': u'fit', u'w': 340},\n", 305 | " u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}},\n", 306 | " u'type': u'photo',\n", 307 | " u'url': u'http://t.co/TM0F66XVcY'}],\n", 308 | " u'symbols': [],\n", 309 | " u'urls': [{u'display_url': u'buff.ly/1GTLSex',\n", 310 | " u'expanded_url': u'http://buff.ly/1GTLSex',\n", 311 | " u'indices': [78, 100],\n", 312 | " u'url': u'http://t.co/eT1C4t205A'}],\n", 313 | " u'user_mentions': [{u'id': 176774540,\n", 314 | " u'id_str': u'176774540',\n", 315 | " u'indices': [59, 67],\n", 316 | " u'name': u'CGP Grey',\n", 317 | " u'screen_name': u'cgpgrey'}]},\n", 318 | " u'extended_entities': {u'media': [{u'display_url': u'pic.twitter.com/TM0F66XVcY',\n", 319 | " u'expanded_url': u'http://twitter.com/SXEElectronics/status/617018231207895040/photo/1',\n", 320 | " u'id': 617018231157587968,\n", 321 | " u'id_str': u'617018231157587968',\n", 322 | " u'indices': [107, 129],\n", 323 | " u'media_url': u'http://pbs.twimg.com/media/CJAWz-gWgAAvEZD.jpg',\n", 324 | " u'media_url_https': u'https://pbs.twimg.com/media/CJAWz-gWgAAvEZD.jpg',\n", 325 | " u'sizes': {u'large': {u'h': 652, u'resize': u'fit', u'w': 416},\n", 326 | " u'medium': {u'h': 652, u'resize': u'fit', u'w': 
416},\n", 327 | " u'small': {u'h': 532, u'resize': u'fit', u'w': 340},\n", 328 | " u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}},\n", 329 | " u'type': u'photo',\n", 330 | " u'url': u'http://t.co/TM0F66XVcY'}]},\n", 331 | " u'favorite_count': 0,\n", 332 | " u'favorited': False,\n", 333 | " u'geo': None,\n", 334 | " u'id': 617018231207895040,\n", 335 | " u'id_str': u'617018231207895040',\n", 336 | " u'in_reply_to_screen_name': None,\n", 337 | " u'in_reply_to_status_id': None,\n", 338 | " u'in_reply_to_status_id_str': None,\n", 339 | " u'in_reply_to_user_id': None,\n", 340 | " u'in_reply_to_user_id_str': None,\n", 341 | " u'is_quote_status': False,\n", 342 | " u'lang': u'en',\n", 343 | " u'place': None,\n", 344 | " u'possibly_sensitive': False,\n", 345 | " u'possibly_sensitive_appealable': False,\n", 346 | " u'retweet_count': 0,\n", 347 | " u'retweeted': False,\n", 348 | " u'source': u'Buffer',\n", 349 | " u'text': u'Has anyone tried #Amazon #echo yet? Reviews look good, but @cgpgrey hated it. http://t.co/eT1C4t205A #tech http://t.co/TM0F66XVcY',\n", 350 | " u'truncated': False,\n", 351 | " u'user': {u'contributors_enabled': False,\n", 352 | " u'created_at': u'Wed Jun 03 11:44:15 +0000 2015',\n", 353 | " u'default_profile': False,\n", 354 | " u'default_profile_image': False,\n", 355 | " u'description': u'Better living through technology. Welcome to the future of the clock.',\n", 356 | " u'entities': {u'description': {u'urls': []},\n", 357 | " u'url': {u'urls': [{u'display_url': u'sxeelectronics.com',\n", 358 | " u'expanded_url': u'http://www.sxeelectronics.com',\n", 359 | " u'indices': [0, 22],\n", 360 | " u'url': u'http://t.co/PIt61hmI6Y'}]}},\n", 361 | " u'favourites_count': 14,\n", 362 | " u'follow_request_sent': False,\n", 363 | " u'followers_count': 45,\n", 364 | " u'following': False,\n", 365 | " u'friends_count': 395,\n", 366 | " u'geo_enabled': True,\n", 367 | " u'has_extended_profile': False,\n", 368 | " u'id': 3307022770,\n", 369 | " u'id_str': u'3307022770',\n", 370 | " u'is_translation_enabled': False,\n", 371 | " u'is_translator': False,\n", 372 | " u'lang': u'en',\n", 373 | " u'listed_count': 1,\n", 374 | " u'location': u'New York, USA',\n", 375 | " u'name': u'SXE Electronics',\n", 376 | " u'notifications': False,\n", 377 | " u'profile_background_color': u'000000',\n", 378 | " u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',\n", 379 | " u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',\n", 380 | " u'profile_background_tile': False,\n", 381 | " u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/3307022770/1434378229',\n", 382 | " u'profile_image_url': u'http://pbs.twimg.com/profile_images/610447596641890304/wrCuxdI0_normal.jpg',\n", 383 | " u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/610447596641890304/wrCuxdI0_normal.jpg',\n", 384 | " u'profile_link_color': u'D31145',\n", 385 | " u'profile_sidebar_border_color': u'000000',\n", 386 | " u'profile_sidebar_fill_color': u'000000',\n", 387 | " u'profile_text_color': u'000000',\n", 388 | " u'profile_use_background_image': False,\n", 389 | " u'protected': False,\n", 390 | " u'screen_name': u'SXEElectronics',\n", 391 | " u'statuses_count': 29,\n", 392 | " u'time_zone': None,\n", 393 | " u'url': u'http://t.co/PIt61hmI6Y',\n", 394 | " u'utc_offset': None,\n", 395 | " u'verified': False}}" 396 | ] 397 | }, 398 | "execution_count": 5, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | 
"source": [ 404 | "result_id = search_result['statuses'][0]['id']\n", 405 | "twitter.statuses.show(_id=result_id)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 6, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "result_id_pack = ','.join([status['id_str'] for status in search_result['statuses']])" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 7, 422 | "metadata": { 423 | "collapsed": false 424 | }, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "text/plain": [ 429 | "(twitter.api.TwitterListResponse, 27)" 430 | ] 431 | }, 432 | "execution_count": 7, 433 | "metadata": {}, 434 | "output_type": "execute_result" 435 | } 436 | ], 437 | "source": [ 438 | "lookup_result = twitter.statuses.lookup(_id=result_id_pack)\n", 439 | "assert len(lookup_result) > 0, 'No statuses returned.'\n", 440 | "type(lookup_result), len(lookup_result)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 8, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "{u'contributors': None,\n", 454 | " u'coordinates': None,\n", 455 | " u'created_at': u'Fri Jun 26 05:15:22 +0000 2015',\n", 456 | " u'entities': {u'hashtags': [],\n", 457 | " u'symbols': [],\n", 458 | " u'urls': [{u'display_url': u'amazon.com/Amazon-SK705DI\\u2026',\n", 459 | " u'expanded_url': u'http://www.amazon.com/Amazon-SK705DI-Echo/dp/B00X4WHP5E',\n", 460 | " u'indices': [48, 70],\n", 461 | " u'url': u'http://t.co/oHuDRKolgP'}],\n", 462 | " u'user_mentions': []},\n", 463 | " u'favorite_count': 0,\n", 464 | " u'favorited': False,\n", 465 | " u'geo': None,\n", 466 | " u'id': 614300900107141120,\n", 467 | " u'id_str': u'614300900107141120',\n", 468 | " u'in_reply_to_screen_name': None,\n", 469 | " u'in_reply_to_status_id': None,\n", 470 | " u'in_reply_to_status_id_str': None,\n", 471 | " u'in_reply_to_user_id': None,\n", 472 | " u'in_reply_to_user_id_str': None,\n", 473 | " u'is_quote_status': False,\n", 474 | " u'lang': u'en',\n", 475 | " u'place': None,\n", 476 | " u'possibly_sensitive': True,\n", 477 | " u'retweet_count': 0,\n", 478 | " u'retweeted': False,\n", 479 | " u'source': u'Twitter Web Client',\n", 480 | " u'text': u'Amazon Echo: Always Ready, Connected, and Fast. 
http://t.co/oHuDRKolgP',\n", 481 | " u'truncated': False,\n", 482 | " u'user': {u'contributors_enabled': False,\n", 483 | " u'created_at': u'Fri Nov 08 20:28:47 +0000 2013',\n", 484 | " u'default_profile': True,\n", 485 | " u'default_profile_image': False,\n", 486 | " u'description': u'',\n", 487 | " u'entities': {u'description': {u'urls': []},\n", 488 | " u'url': {u'urls': [{u'display_url': u'about.me/jerry.neal',\n", 489 | " u'expanded_url': u'http://about.me/jerry.neal',\n", 490 | " u'indices': [0, 22],\n", 491 | " u'url': u'http://t.co/VoUWmeCNiE'}]}},\n", 492 | " u'favourites_count': 5,\n", 493 | " u'follow_request_sent': False,\n", 494 | " u'followers_count': 191,\n", 495 | " u'following': False,\n", 496 | " u'friends_count': 1774,\n", 497 | " u'geo_enabled': False,\n", 498 | " u'has_extended_profile': False,\n", 499 | " u'id': 2182891302,\n", 500 | " u'id_str': u'2182891302',\n", 501 | " u'is_translation_enabled': False,\n", 502 | " u'is_translator': False,\n", 503 | " u'lang': u'en',\n", 504 | " u'listed_count': 6,\n", 505 | " u'location': u'',\n", 506 | " u'name': u'Jerry Neal',\n", 507 | " u'notifications': False,\n", 508 | " u'profile_background_color': u'C0DEED',\n", 509 | " u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',\n", 510 | " u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',\n", 511 | " u'profile_background_tile': False,\n", 512 | " u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/2182891302/1387166654',\n", 513 | " u'profile_image_url': u'http://pbs.twimg.com/profile_images/460656132449509376/HgxE0bQO_normal.jpeg',\n", 514 | " u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/460656132449509376/HgxE0bQO_normal.jpeg',\n", 515 | " u'profile_link_color': u'0084B4',\n", 516 | " u'profile_sidebar_border_color': u'C0DEED',\n", 517 | " u'profile_sidebar_fill_color': u'DDEEF6',\n", 518 | " u'profile_text_color': u'333333',\n", 519 | " u'profile_use_background_image': True,\n", 520 | " u'protected': False,\n", 521 | " u'screen_name': u'NEALDOG2007',\n", 522 | " u'statuses_count': 3576,\n", 523 | " u'time_zone': u'Central Time (US & Canada)',\n", 524 | " u'url': u'http://t.co/VoUWmeCNiE',\n", 525 | " u'utc_offset': -18000,\n", 526 | " u'verified': False}}" 527 | ] 528 | }, 529 | "execution_count": 8, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "lookup_result[0]" 536 | ] 537 | } 538 | ], 539 | "metadata": { 540 | "kernelspec": { 541 | "display_name": "Python 2", 542 | "language": "python", 543 | "name": "python2" 544 | }, 545 | "language_info": { 546 | "codemirror_mode": { 547 | "name": "ipython", 548 | "version": 2 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython2", 555 | "version": "2.7.10" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 0 560 | } 561 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API 4 | === 5 | 6 | .. Use ClassName() to remove __init__ function signature from autoclass, as 7 | component classes are not instantiated directly. 8 | 9 | .. module:: birding.spout 10 | 11 | .. autofunction:: DispatchSpout() 12 | 13 | .. autoclass:: TermCycleSpout() 14 | :members: 15 | 16 | 17 | .. module:: birding.bolt 18 | 19 | .. 
autoclass:: TwitterSearchBolt()
20 |    :members:
21 | 
22 | .. autoclass:: TwitterLookupBolt()
23 |    :members:
24 | 
25 | .. autoclass:: ElasticsearchIndexBolt()
26 |    :members:
27 | 
28 | .. autoclass:: ResultTopicBolt()
29 |    :members:
30 | 
31 | 
32 | .. module:: birding.search
33 | 
34 | .. autofunction:: search_manager_from_config()
35 | 
36 | .. autoclass:: SearchManager
37 |    :members:
38 | 
39 | 
40 | .. module:: birding.twitter
41 | 
42 | .. autoclass:: Twitter
43 |    :members:
44 | 
45 | .. autoclass:: TwitterSearchManager
46 |    :members:
47 | 
48 | .. autofunction:: TwitterSearchManagerFromOAuth()
49 | 
50 | 
51 | .. module:: birding.gnip
52 | 
53 | .. autoclass:: Gnip
54 |    :members:
55 | 
56 | .. autoclass:: GnipSearchManager
57 |    :members:
58 | 
59 | 
60 | .. autofunction:: birding.config.get_config
61 | 
62 | 
63 | .. module:: birding.shelf
64 | 
65 | .. autofunction:: shelf_from_config()
66 | 
67 | .. autoclass:: Shelf
68 |    :members:
69 | 
70 | .. autoclass:: FreshPacker
71 |    :members:
72 | 
73 | .. autoclass:: LRUShelf
74 |    :members:
75 | 
76 | .. autoclass:: FreshLRUShelf
77 |    :members:
78 | 
79 | .. autoclass:: ElasticsearchShelf
80 |    :members:
81 | 
82 | .. autoclass:: FreshElasticsearchShelf
83 |    :members:
84 | 
--------------------------------------------------------------------------------
/docs/birding-topology-sketch.svg:
--------------------------------------------------------------------------------
1 | [SVG image: sketch of the birding topology]
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # birding documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Aug 24 11:23:00 2015.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 | 
15 | import sys
16 | import os
17 | import shlex
18 | 
19 | # If extensions (or modules to document with autodoc) are in another directory,
20 | # add these directories to sys.path here. If the directory is relative to the
21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
22 | #sys.path.insert(0, os.path.abspath('.'))
23 | 
24 | # -- General configuration ------------------------------------------------
25 | 
26 | # If your documentation needs a minimal Sphinx version, state it here.
27 | #needs_sphinx = '1.0'
28 | 
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = [
33 |     'sphinx.ext.autodoc',
34 |     'sphinx.ext.coverage',
35 |     'sphinx.ext.doctest',
36 |     'sphinx.ext.intersphinx',
37 |     'sphinx.ext.napoleon',
38 |     'sphinx.ext.todo',
39 |     'sphinx.ext.viewcode',
40 | ]
41 | 
42 | # Add any paths that contain templates here, relative to this directory.
43 | templates_path = ['_templates']
44 | 
45 | # The suffix(es) of source filenames.
46 | # You can specify multiple suffix as a list of string:
47 | # source_suffix = ['.rst', '.md']
48 | source_suffix = '.rst'
49 | 
50 | # The encoding of source files.
51 | #source_encoding = 'utf-8-sig'
52 | 
53 | # The master toctree document.
54 | master_doc = 'index'
55 | 
56 | # General information about the project.
57 | project = u'birding' 58 | copyright = u'2015, Parse.ly' 59 | author = u'Parse.ly' 60 | 61 | # The version info for the project you're documenting, acts as replacement for 62 | # |version| and |release|, also used in various other places throughout the 63 | # built documents. 64 | # 65 | # The short X.Y version. 66 | version = '0.0' 67 | # The full version, including alpha/beta/rc tags. 68 | release = '0.0' 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # There are two options for replacing |today|: either, you set today to some 78 | # non-false value, then it is used: 79 | #today = '' 80 | # Else, today_fmt is used as the format for a strftime call. 81 | #today_fmt = '%B %d, %Y' 82 | 83 | # List of patterns, relative to source directory, that match files and 84 | # directories to ignore when looking for source files. 85 | exclude_patterns = ['_build'] 86 | 87 | # The reST default role (used for this markup: `text`) to use for all 88 | # documents. 89 | #default_role = None 90 | 91 | # If true, '()' will be appended to :func: etc. cross-reference text. 92 | #add_function_parentheses = True 93 | 94 | # If true, the current module name will be prepended to all description 95 | # unit titles (such as .. function::). 96 | #add_module_names = True 97 | 98 | # If true, sectionauthor and moduleauthor directives will be shown in the 99 | # output. They are ignored by default. 100 | #show_authors = False 101 | 102 | # The name of the Pygments (syntax highlighting) style to use. 103 | pygments_style = 'sphinx' 104 | 105 | # A list of ignored prefixes for module index sorting. 106 | #modindex_common_prefix = [] 107 | 108 | # If true, keep warnings as "system message" paragraphs in the built documents. 109 | #keep_warnings = False 110 | 111 | # If true, `todo` and `todoList` produce output, else they produce nothing. 112 | todo_include_todos = True 113 | 114 | 115 | # -- Options for HTML output ---------------------------------------------- 116 | 117 | # The theme to use for HTML and HTML Help pages. See the documentation for 118 | # a list of builtin themes. 119 | html_theme = 'alabaster' 120 | 121 | # Theme options are theme-specific and customize the look and feel of a theme 122 | # further. For a list of options available for each theme, see the 123 | # documentation. 124 | #html_theme_options = {} 125 | 126 | # Add any paths that contain custom themes here, relative to this directory. 127 | #html_theme_path = [] 128 | 129 | # The name for this set of Sphinx documents. If None, it defaults to 130 | # " v documentation". 131 | #html_title = None 132 | 133 | # A shorter title for the navigation bar. Default is the same as html_title. 134 | #html_short_title = None 135 | 136 | # The name of an image file (relative to this directory) to place at the top 137 | # of the sidebar. 138 | #html_logo = None 139 | 140 | # The name of an image file (within the static path) to use as favicon of the 141 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 142 | # pixels large. 143 | #html_favicon = None 144 | 145 | # Add any paths that contain custom static files (such as style sheets) here, 146 | # relative to this directory. 
They are copied after the builtin static files, 147 | # so a file named "default.css" will overwrite the builtin "default.css". 148 | html_static_path = ['_static'] 149 | 150 | # Add any extra paths that contain custom files (such as robots.txt or 151 | # .htaccess) here, relative to this directory. These files are copied 152 | # directly to the root of the documentation. 153 | #html_extra_path = [] 154 | 155 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 156 | # using the given strftime format. 157 | #html_last_updated_fmt = '%b %d, %Y' 158 | 159 | # If true, SmartyPants will be used to convert quotes and dashes to 160 | # typographically correct entities. 161 | #html_use_smartypants = True 162 | 163 | # Custom sidebar templates, maps document names to template names. 164 | #html_sidebars = {} 165 | 166 | # Additional templates that should be rendered to pages, maps page names to 167 | # template names. 168 | #html_additional_pages = {} 169 | 170 | # If false, no module index is generated. 171 | #html_domain_indices = True 172 | 173 | # If false, no index is generated. 174 | #html_use_index = True 175 | 176 | # If true, the index is split into individual pages for each letter. 177 | #html_split_index = False 178 | 179 | # If true, links to the reST sources are added to the pages. 180 | #html_show_sourcelink = True 181 | 182 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 183 | #html_show_sphinx = True 184 | 185 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 186 | #html_show_copyright = True 187 | 188 | # If true, an OpenSearch description file will be output, and all pages will 189 | # contain a tag referring to it. The value of this option must be the 190 | # base URL from which the finished HTML is served. 191 | #html_use_opensearch = '' 192 | 193 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 194 | #html_file_suffix = None 195 | 196 | # Language to be used for generating the HTML full-text search index. 197 | # Sphinx supports the following languages: 198 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 199 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 200 | #html_search_language = 'en' 201 | 202 | # A dictionary with options for the search language support, empty by default. 203 | # Now only 'ja' uses this config value 204 | #html_search_options = {'type': 'default'} 205 | 206 | # The name of a javascript file (relative to the configuration directory) that 207 | # implements a search results scorer. If empty, the default will be used. 208 | #html_search_scorer = 'scorer.js' 209 | 210 | # Output file base name for HTML help builder. 211 | htmlhelp_basename = 'birdingdoc' 212 | 213 | # -- Options for LaTeX output --------------------------------------------- 214 | 215 | latex_elements = { 216 | # The paper size ('letterpaper' or 'a4paper'). 217 | #'papersize': 'letterpaper', 218 | 219 | # The font size ('10pt', '11pt' or '12pt'). 220 | #'pointsize': '10pt', 221 | 222 | # Additional stuff for the LaTeX preamble. 223 | #'preamble': '', 224 | 225 | # Latex figure (float) alignment 226 | #'figure_align': 'htbp', 227 | } 228 | 229 | # Grouping the document tree into LaTeX files. List of tuples 230 | # (source start file, target name, title, 231 | # author, documentclass [howto, manual, or own class]). 
232 | latex_documents = [ 233 | (master_doc, 'birding.tex', u'birding Documentation', 234 | u'Parse.ly', 'manual'), 235 | ] 236 | 237 | # The name of an image file (relative to this directory) to place at the top of 238 | # the title page. 239 | #latex_logo = None 240 | 241 | # For "manual" documents, if this is true, then toplevel headings are parts, 242 | # not chapters. 243 | #latex_use_parts = False 244 | 245 | # If true, show page references after internal links. 246 | #latex_show_pagerefs = False 247 | 248 | # If true, show URL addresses after external links. 249 | #latex_show_urls = False 250 | 251 | # Documents to append as an appendix to all manuals. 252 | #latex_appendices = [] 253 | 254 | # If false, no module index is generated. 255 | #latex_domain_indices = True 256 | 257 | 258 | # -- Options for manual page output --------------------------------------- 259 | 260 | # One entry per manual page. List of tuples 261 | # (source start file, name, description, authors, manual section). 262 | man_pages = [ 263 | (master_doc, 'birding', u'birding Documentation', 264 | [author], 1) 265 | ] 266 | 267 | # If true, show URL addresses after external links. 268 | #man_show_urls = False 269 | 270 | 271 | # -- Options for Texinfo output ------------------------------------------- 272 | 273 | # Grouping the document tree into Texinfo files. List of tuples 274 | # (source start file, target name, title, author, 275 | # dir menu entry, description, category) 276 | texinfo_documents = [ 277 | (master_doc, 'birding', u'birding Documentation', 278 | author, 'birding', 'One line description of project.', 279 | 'Miscellaneous'), 280 | ] 281 | 282 | # Documents to append as an appendix to all manuals. 283 | #texinfo_appendices = [] 284 | 285 | # If false, no module index is generated. 286 | #texinfo_domain_indices = True 287 | 288 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 289 | #texinfo_show_urls = 'footnote' 290 | 291 | # If true, do not generate a @detailmenu in the "Top" node's menu. 292 | #texinfo_no_detailmenu = False 293 | 294 | 295 | # -- Options for Epub output ---------------------------------------------- 296 | 297 | # Bibliographic Dublin Core info. 298 | epub_title = project 299 | epub_author = author 300 | epub_publisher = author 301 | epub_copyright = copyright 302 | 303 | # The basename for the epub file. It defaults to the project name. 304 | #epub_basename = project 305 | 306 | # The HTML theme for the epub output. Since the default themes are not optimized 307 | # for small screen space, using the same theme for HTML and epub output is 308 | # usually not wise. This defaults to 'epub', a theme designed to save visual 309 | # space. 310 | #epub_theme = 'epub' 311 | 312 | # The language of the text. It defaults to the language option 313 | # or 'en' if the language is not set. 314 | #epub_language = '' 315 | 316 | # The scheme of the identifier. Typical schemes are ISBN or URL. 317 | #epub_scheme = '' 318 | 319 | # The unique identifier of the text. This can be a ISBN number 320 | # or the project homepage. 321 | #epub_identifier = '' 322 | 323 | # A unique identification for the text. 324 | #epub_uid = '' 325 | 326 | # A tuple containing the cover image and cover page html template filenames. 327 | #epub_cover = () 328 | 329 | # A sequence of (type, uri, title) tuples for the guide element of content.opf. 330 | #epub_guide = () 331 | 332 | # HTML files that should be inserted before the pages created by sphinx. 
333 | # The format is a list of tuples containing the path and title. 334 | #epub_pre_files = [] 335 | 336 | # HTML files that should be inserted after the pages created by sphinx. 337 | # The format is a list of tuples containing the path and title. 338 | #epub_post_files = [] 339 | 340 | # A list of files that should not be packed into the epub file. 341 | epub_exclude_files = ['search.html'] 342 | 343 | # The depth of the table of contents in toc.ncx. 344 | #epub_tocdepth = 3 345 | 346 | # Allow duplicate toc entries. 347 | #epub_tocdup = True 348 | 349 | # Choose between 'default' and 'includehidden'. 350 | #epub_tocscope = 'default' 351 | 352 | # Fix unsupported image types using Pillow. 353 | #epub_fix_images = False 354 | 355 | # Scale large images. 356 | #epub_max_image_width = 0 357 | 358 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 359 | #epub_show_urls = 'inline' 360 | 361 | # If false, no index is generated. 362 | #epub_use_index = True 363 | 364 | intersphinx_mapping = { 365 | 'python': ('https://docs.python.org/dev', None), 366 | 'pykafka': ('http://pykafka.readthedocs.org/en/stable/', None), 367 | 'streamparse': ('http://streamparse.readthedocs.org/en/stable/', None), 368 | } 369 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | .. _config: 2 | 3 | Configuring `birding` 4 | ===================== 5 | 6 | .. automodule:: birding.config 7 | -------------------------------------------------------------------------------- /docs/gnip.rst: -------------------------------------------------------------------------------- 1 | .. _gnip: 2 | 3 | Searching Gnip 4 | ============== 5 | 6 | `Gnip `_ is Twitter's enterprise API platform, which birding 7 | supports for projects seeking to search at higher rates than allowed in the 8 | public API. The configuration snippet below uses Gnip's APIs instead of 9 | Twitter's. See :ref:`config` for how to configure birding. 10 | 11 | .. code-block:: yaml 12 | 13 | SearchManager: 14 | class: birding.gnip.GnipSearchManager 15 | init: 16 | base_url: https://search.gnip.com/accounts/Example 17 | stream: prod.json 18 | username: admin@example.org 19 | password: This.yml.file.should.be.untracked. 20 | 21 | The birding API docs for :class:`~birding.gnip.Gnip` and 22 | :class:`~birding.gnip.GnipSearchManager` describe the underlying behavior, 23 | which is minimal. 24 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _index: 2 | 3 | `birding`: streamparse/kafka demo 4 | ================================= 5 | 6 | `birding` is an `open source project`__ to produce a stream of recent twitter_ 7 | activity based on a sequence of search terms, using only `twitter's public 8 | APIs`_. It serves as both a standalone project and a demo of distributed 9 | real-time computing with Python_ using Storm_/streamparse_ and Kafka_/pykafka_. 10 | 11 | __ https://github.com/Parsely/birding 12 | 13 | :ref:`topology` describes the problem and how it fits into a topology. 14 | :ref:`solo` describes how to interact with birding for development, demo, or 15 | light usage. :ref:`tour` provides a light introduction to internals. 16 | :ref:`production` discusses how birding is packaged for production use in an 17 | existing streamparse project. 
:ref:`config` discusses various options for 18 | birding behavior when running locally or in production. 19 | 20 | .. toctree:: 21 | :maxdepth: 2 22 | 23 | topology 24 | solo 25 | tour 26 | production 27 | config 28 | gnip 29 | api 30 | 31 | To discuss this project, join the `streamparse user group`_. 32 | 33 | :ref:`Documentation Index <genindex>` 34 | 35 | .. _twitter: https://twitter.com 36 | .. _`twitter's public APIs`: https://dev.twitter.com/rest/public 37 | .. _Storm: http://storm.apache.org 38 | .. _Python: http://python.org 39 | .. _Kafka: http://kafka.apache.org 40 | .. _streamparse: https://github.com/Parsely/streamparse 41 | .. _pykafka: https://github.com/Parsely/pykafka 42 | .. _`streamparse user group`: https://github.com/Parsely/streamparse#user-group 43 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 
67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\birding.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\birding.qhc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 
185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/production.rst: -------------------------------------------------------------------------------- 1 | .. _production: 2 | 3 | Using `birding` in production 4 | ============================= 5 | 6 | .. note:: birding is currently alpha software. 7 | 8 | If birding itself satisfies project requirements, see the streamparse project's 9 | discussion of `remote deployment`_ and use ``sparse submit`` from a checkout of 10 | the birding repository. Otherwise, birding is available on the `Python Package 11 | Index `_, which projects can use as a 12 | dependency:: 13 | 14 | pip install birding 15 | 16 | Once installed in the Python environment, birding references are available to 17 | the topology definition. A project's topology can include 18 | ``python-spout-spec`` and ``python-bolt-spec`` declarations which have class 19 | references to ``birding.spout`` and ``birding.bolt`` namespaces, respectively. 20 | The snippet below illustrates this. The :ref:`storm-topology` section has more 21 | detail. 22 | 23 | .. 
code-block:: clojure 24 | 25 | "search-bolt" (python-bolt-spec 26 | options 27 | {"term-spout" ["term"]} 28 | "birding.bolt.TwitterSearchBolt" 29 | ["term" "timestamp" "search_result"] 30 | :p 2) 31 | 32 | The streamparse project discusses `remote deployment`_ using the ``sparse 33 | submit`` command. :ref:`config` discusses the ``birding.yml`` file which is 34 | located by the ``BIRDING_CONF`` environment variable. Projects using birding 35 | should include its configuration file as part of host configuration management 36 | or a streamparse submit hook, and likewise set the ``BIRDING_CONF`` variable 37 | accordingly. 38 | 39 | Next, go to :ref:`config`. 40 | 41 | .. _`remote deployment`: 42 | http://streamparse.readthedocs.org/en/master/quickstart.html#remote-deployment 43 | -------------------------------------------------------------------------------- /docs/solo.rst: -------------------------------------------------------------------------------- 1 | .. _solo: 2 | 3 | Downloading and running `birding` 4 | ================================= 5 | 6 | .. note:: 7 | 8 | Existing streamparse projects should include the `birding Python package 9 | `_ instead of cloning the birding 10 | repository, which is described in :ref:`production`. 11 | 12 | The birding project fully automates dependencies for the purposes of 13 | development, demo, or light usage. In a terminal on a Unix-like system, clone 14 | the birding repository:: 15 | 16 | git clone https://github.com/Parsely/birding.git 17 | cd birding 18 | 19 | Then run:: 20 | 21 | make run 22 | 23 | The birding project makes every effort to detect if an underlying dependency is 24 | unmet. If `make run` fails, look for messages indicating what is missing or 25 | what went wrong. If an error message says that an address is in use, look for 26 | other processes on the system which are currently using the referenced network 27 | port, then shut them down in order to run birding. If an error is unclear, 28 | `submit an issue `_ including a 29 | build log and mention your operating system. To create a `build.log`:: 30 | 31 | make run 2>&1 | tee build.log 32 | 33 | When birding is running, its console output is verbose as it includes all 34 | output of zookeeper, kafka, storm, and streamparse. Note that -- as with all 35 | streamparse projects -- output from the birding code itself ends up in the 36 | ``logs/`` directory and not in the console. To stop running birding, issue a 37 | keyboard interrupt in the console with Control-C:: 38 | 39 | Control-C 40 | 41 | Using `make run` will pick up `birding.yml` as the project configuration file 42 | if it exists in the root directory next to the `Makefile`. See :ref:`config`. 43 | This simple `birding.yml` sets the search terms used by birding:: 44 | 45 | TermCycleSpout: 46 | terms: 47 | - mocking bird 48 | - carrier pigeon 49 | 50 | Data for the project ends up in a directory relative to the project root. Clean 51 | runtime data with:: 52 | 53 | make clean-data 54 | 55 | Build docs with ``make docs`` and check for Python errors by static analysis 56 | with ``make flakes``. Make allows multiple targets at once:: 57 | 58 | make clean-data flakes run 59 | 60 | Next, go to one of: 61 | 62 | * :ref:`tour` 63 | * :ref:`production` 64 | * :ref:`config` 65 | -------------------------------------------------------------------------------- /docs/topology.rst: -------------------------------------------------------------------------------- 1 | .. 
_topology: 2 | 3 | Problem statement & topology 4 | ============================ 5 | 6 | Problem Statement 7 | ----------------- 8 | 9 | Take as input a sequence of terms and timestamps and produce a "filtered 10 | firehose" of twitter_ activity using only `twitter's public APIs`_, without 11 | requiring special API access to twitter or any third party. 12 | 13 | 14 | Specifics 15 | --------- 16 | 17 | * Input is in the format of (term, timestamp), where `term` is any string and 18 | `timestamp` is a date/time value in an ISO 8601 format, 19 | e.g. ``2015-06-25T08:00Z``. 20 | * The motivating use-case: 21 | 22 | * provides this input as a Kafka_ topic 23 | * prefers output be sent to a Kafka topic & include full twitter API results 24 | * prefers the solution be implemented in Python_ 25 | 26 | 27 | Observations 28 | ------------ 29 | 30 | Twitter provides `GET search/tweets`_ to get relevant Tweets_ (status updates) 31 | matching a specified query. Any detail not provided in the search results can 32 | be accessed with `GET statuses/lookup`_, looking up multiple status updates in 33 | a batched request. 34 | 35 | The problem has potentially unbounded streams of data, which makes Storm_ a 36 | relevant technology for the solution. Given that the motivating use-case 37 | prefers Python with Kafka I/O, streamparse_ and pykafka_ are relevant. 38 | 39 | Topology 40 | -------- 41 | 42 | Given the problem statement, a streaming solution looks something like: 43 | 44 | .. Anyone with the link should be able to access / fork this drawing. 45 | .. image:: birding-topology-sketch.svg 46 | :target: https://docs.google.com/drawings/d/1dijNLPjn_96Q2VyPaiGYUfrnO6jXA0sBcIEKcnNERjE/edit 47 | 48 | 49 | Other Goals 50 | ----------- 51 | 52 | The solution should: 53 | 54 | * Encode best practices about how to use Storm_/streamparse_ and 55 | Kafka_/pykafka_. 56 | * Be fully public & open source to serve as an example project, so it should 57 | not depend on anything specific to a particular company/organization. Depending 58 | on the publicly scrutable Twitter API is, of course, okay. 59 | * Include basic command-line tools for testing the topology with data and ways 60 | to configure things like Twitter authentication credentials. 61 | 62 | Next, go to one of: 63 | 64 | * :ref:`solo` 65 | * :ref:`tour` 66 | 67 | .. _twitter: https://twitter.com 68 | .. _`twitter's public APIs`: https://dev.twitter.com/rest/public 69 | .. _Kafka: http://kafka.apache.org 70 | .. _Python: http://python.org 71 | .. _`GET search/tweets`: 72 | https://dev.twitter.com/rest/reference/get/search/tweets 73 | .. _`Tweets`: https://dev.twitter.com/overview/api/tweets 74 | .. _`GET statuses/lookup`: 75 | https://dev.twitter.com/rest/reference/get/statuses/lookup 76 | .. _Storm: http://storm.apache.org 77 | .. _streamparse: https://github.com/Parsely/streamparse 78 | .. _pykafka: https://github.com/Parsely/pykafka 79 | -------------------------------------------------------------------------------- /docs/tour.rst: -------------------------------------------------------------------------------- 1 | .. _tour: 2 | 3 | A tour of `birding`'s implementation 4 | ==================================== 5 | 6 | Python Twitter Client 7 | --------------------- 8 | 9 | There are many `Python packages for Twitter`__. The `Python Twitter Tools`_ 10 | project (``pip install twitter``) is of interest because: 11 | 12 | 1. 
It has a command-line application to get twitter_ activity which includes a 13 | straightforward authentication workflow to log into twitter and get OAuth_ 14 | credentials, using a PIN-Based_ workflow. 15 | 2. It provides APIs in Python which bind to `twitter's public APIs`_ in a 16 | dynamic and predictable way, where Python attribute and method names 17 | translate to URL paths, e.g. ``twitter.statuses.friends_timeline()`` 18 | retrieves data from ``http://twitter.com/statuses/friends_timeline.json``. 19 | 3. The OAuth credentials saved by the command-line tool can be readily used 20 | when making API calls using the package. 21 | 22 | __ https://pypi.python.org/pypi?%3Aaction=search&term=twitter&submit=search 23 | 24 | 25 | Twitter API 26 | ----------- 27 | 28 | To ease configuration, birding adds a 29 | :meth:`~birding.twitter.Twitter.from_oauth_file` method which creates a 30 | `Twitter` binding using the OAuth credential file created by the ``twitter`` 31 | command-line application. The ``twitter`` command need only be run once to 32 | create this file, which is saved in the user home directory at 33 | ``~/.twitter_oauth``. Once that file is in place, twitter API interactions look 34 | like this: 35 | 36 | * `Twitter API Demo `_ 37 | 38 | 39 | Search Manager 40 | -------------- 41 | 42 | It is useful to solve the problem itself before being concerned with details 43 | about the topology. birding's :class:`~birding.twitter.TwitterSearchManager` 44 | composes the `Twitter` object into higher-level method signatures which perform 45 | the processing steps needed for the given :ref:`topology`. A full interaction 46 | before applying Storm looks like this (see ``In[2]``): 47 | 48 | * `Simple Simulated Stream `_ 49 | 50 | 51 | Storm Bolts 52 | ----------- 53 | 54 | With APIs in place to do the work, Bolt_ classes provide Storm components: 55 | 56 | * :class:`~birding.bolt.TwitterSearchBolt` searches the input terms. 57 | * :class:`~birding.bolt.TwitterLookupBolt` expands search results into full 58 | tweets. 59 | * :class:`~birding.bolt.ElasticsearchIndexBolt` indexes the lookup results in 60 | elasticsearch. 61 | * :class:`~birding.bolt.ResultTopicBolt` publishes the lookup results to Kafka. 62 | 63 | 64 | Storm Spouts 65 | ------------ 66 | 67 | Spout_ classes provide Storm components which take birding's input and serve as 68 | the source of streams in the topology: 69 | 70 | * :func:`~birding.spout.DispatchSpout` dispatches the spout class based on 71 | config. See :ref:`config`. 72 | * :class:`~birding.spout.TermCycleSpout` cycles through a static list of terms. 73 | 74 | 75 | .. _storm-topology: 76 | 77 | Storm Topology 78 | -------------- 79 | 80 | With Storm components ready for streamparse, a topology can pull it all 81 | together. birding's topology uses the `Clojure DSL`_; the `streamparse 82 | discussion of topologies`_ has more detail. In the topology definition below, 83 | note the class references ``"birding.bolt.TwitterSearchBolt"``, 84 | ``"birding.bolt.TwitterLookupBolt"``, and 85 | ``"birding.bolt.ResultTopicBolt"``. These are full Python namespace references 86 | to the birding classes. The names given in the DSL can then be used to wire the 87 | components together. For example, the definition of ``"search-bolt" 88 | (python-bolt-spec ...)`` allows ``"search-bolt"`` to be used as input in 89 | another bolt, ``"lookup-bolt" (python-bolt-spec ... {"search-bolt" :shuffle} 90 | ... )``. 91 | 92 | .. 
literalinclude:: ../topologies/birding.clj 93 | :language: clojure 94 | 95 | Next, go to one of: 96 | 97 | * :ref:`solo` 98 | * :ref:`production` 99 | * :ref:`config` 100 | 101 | .. _`Python Twitter Tools`: http://mike.verdone.ca/twitter/ 102 | .. _twitter: https://twitter.com 103 | .. _OAuth: https://dev.twitter.com/oauth 104 | .. _PIN-Based: https://dev.twitter.com/oauth/pin-based 105 | .. _`twitter's public APIs`: https://dev.twitter.com/rest/public 106 | .. _Bolt: https://storm.apache.org/documentation/Concepts.html#bolts 107 | .. _Spout: https://storm.apache.org/documentation/Concepts.html#spouts 108 | .. _`Clojure DSL`: http://storm.apache.org/documentation/Clojure-DSL.html 109 | .. _`streamparse discussion of topologies`: 110 | http://streamparse.readthedocs.org/en/master/topologies.html 111 | -------------------------------------------------------------------------------- /fabfile.py: -------------------------------------------------------------------------------- 1 | def pre_submit(topology_name, env_name, env_config): 2 | """Override this function to perform custom actions prior to topology 3 | submission. No SSH tunnels will be active when this function is called.""" 4 | pass 5 | 6 | 7 | def post_submit(topo_name, env_name, env_config): 8 | """Override this function to perform custom actions after topology 9 | submission. Note that the SSH tunnel to Nimbus will still be active 10 | when this function is called.""" 11 | pass 12 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject birding "0.0.1-SNAPSHOT" 2 | :source-paths ["topologies"] 3 | :resource-paths ["_resources"] 4 | :target-path "_build" 5 | :min-lein-version "2.0.0" 6 | :jvm-opts ["-client"] 7 | :dependencies [[org.apache.storm/storm-core "0.9.5"] 8 | [com.parsely/streamparse "0.0.4-SNAPSHOT"] 9 | ] 10 | :jar-exclusions [#"log4j\.properties" #"backtype" #"trident" #"META-INF" #"meta-inf" #"\.yaml"] 11 | :uberjar-exclusions [#"log4j\.properties" #"backtype" #"trident" #"META-INF" #"meta-inf" #"\.yaml"] 12 | ) 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Stream twitter searches using the public API. 3 | 4 | Copyright 2015 Parsely, Inc. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | """ 18 | 19 | import os 20 | import re 21 | import sys 22 | 23 | from setuptools import setup, find_packages 24 | 25 | 26 | classifiers = [ 27 | 'Development Status :: 3 - Alpha', 28 | # TODO: 'Development Status :: 4 - Beta', 29 | # TODO: 'Development Status :: 5 - Production/Stable', 30 | 'Intended Audience :: Developers', 31 | 'License :: OSI Approved :: Apache Software License', 32 | 'Programming Language :: Python', 33 | 'Programming Language :: Python', 34 | 'Programming Language :: Python :: 2', 35 | 'Programming Language :: Python :: 2.7', 36 | # TODO: 'Programming Language :: Python :: 3', 37 | # TODO: 'Programming Language :: Python :: 3.4', 38 | # TODO: 'Programming Language :: Python :: 3.5', 39 | ] 40 | 41 | # Update virtualenvs/birding.txt when editing this list. 42 | install_requires = [ 43 | 'elasticsearch==1.7.0', 44 | 'pykafka==1.1.0', 45 | 'pyyaml==3.11', 46 | 'repoze.lru==0.6', 47 | 'requests==2.8.1', 48 | 'six==1.9.0', 49 | 'streamparse==2.0.2', 50 | 'travesty==0.1.1', 51 | 'twitter==1.17.0', 52 | ] 53 | 54 | lint_requires = [ 55 | 'pep8', 56 | 'pyflakes', 57 | ] 58 | 59 | tests_require = [ 60 | 'mock', 61 | 'nose', 62 | ] 63 | 64 | dependency_links = [] 65 | 66 | setup_requires = [] 67 | 68 | if 'nosetests' in sys.argv[1:]: 69 | setup_requires.extend(tests_require) 70 | 71 | 72 | def get_version(filepath='src/birding/version.py'): 73 | """Get version without import, which avoids dependency issues.""" 74 | with open(get_abspath(filepath)) as version_file: 75 | return re.search( 76 | r"""__version__\s+=\s+(['"])(?P.+?)\1""", 77 | version_file.read()).group('version') 78 | 79 | 80 | def readme(filepath='README.rst'): 81 | """Return project README.rst contents as str.""" 82 | with open(get_abspath(filepath)) as fd: 83 | return fd.read() 84 | 85 | 86 | def description(doc=__doc__): 87 | """Return project description from first line of doc.""" 88 | for line in doc.splitlines(): 89 | return line.strip() 90 | 91 | 92 | def get_abspath(filepath): 93 | if os.path.isabs(filepath): 94 | return filepath 95 | setup_py = os.path.abspath(__file__) 96 | project_dir = os.path.dirname(setup_py) 97 | return os.path.abspath(os.path.join(project_dir, filepath)) 98 | 99 | 100 | setup( 101 | name='birding', 102 | version=get_version(), 103 | author='Parsely, Inc.', 104 | author_email='hello@parsely.com', 105 | url='https://github.com/Parsely/birding', 106 | description=description(), 107 | long_description=readme(), 108 | license='Apache License 2.0', 109 | packages=find_packages('src'), 110 | package_dir={'': 'src'}, 111 | entry_points={ 112 | 'console_scripts': [ 113 | 114 | ] 115 | }, 116 | install_requires=install_requires, 117 | tests_require=tests_require, 118 | setup_requires=setup_requires, 119 | extras_require={ 120 | 'test': tests_require, 121 | 'all': install_requires + tests_require, 122 | 'lint': lint_requires 123 | }, 124 | dependency_links=dependency_links, 125 | zip_safe=False, 126 | test_suite='nose.collector', 127 | include_package_data=True, 128 | ) 129 | -------------------------------------------------------------------------------- /src/birding/__init__.py: -------------------------------------------------------------------------------- 1 | """Namespace of all modules in the birding package.""" 2 | 3 | import logging 4 | 5 | from . 
import bolt, config, follow, gnip, search, shelf, spout, twitter 6 | from .version import VERSION, __version__ 7 | from .version import __doc__ as __license__ 8 | 9 | 10 | __all__ = [ 11 | 'VERSION', 12 | '__license__', 13 | '__version__', 14 | 'bolt', 15 | 'config', 16 | 'follow', 17 | 'gnip', 18 | 'search', 19 | 'shelf', 20 | 'spout', 21 | 'twitter', 22 | ] 23 | 24 | 25 | # Configure the logger. No logger configuration is exposed by birding itself. A 26 | # project using birding can change the log level after importing `birding` 27 | # with: 28 | # 29 | # logging.getLogger('birding').setLevel(logging.DEBUG) 30 | # 31 | logger = logging.getLogger('birding') 32 | logger.addHandler(logging.StreamHandler()) 33 | logger.setLevel(logging.WARNING) 34 | -------------------------------------------------------------------------------- /src/birding/bolt.py: -------------------------------------------------------------------------------- 1 | """Storm Bolt classes.""" 2 | 3 | from __future__ import print_function 4 | 5 | import functools 6 | import json 7 | import sys 8 | 9 | from streamparse.bolt import Bolt 10 | 11 | from .config import get_config, import_name 12 | from .search import search_manager_from_config 13 | from .shelf import shelf_from_config 14 | 15 | 16 | def fault_barrier(fn): 17 | """Method decorator to catch and log errors, then send fail message.""" 18 | @functools.wraps(fn) 19 | def process(self, tup): 20 | try: 21 | return fn(self, tup) 22 | except KeyboardInterrupt: 23 | # KeyboardInterrupt subclasses BaseException, not Exception: pass through quietly. 24 | return 25 | except Exception as e: 26 | print(str(e), file=sys.stderr) 27 | self.fail(tup) 28 | return process 29 | 30 | def get_search_manager(config=None, **default_init): 31 | if config is None: 32 | config = get_config()['SearchManager'] 33 | return search_manager_from_config(config, **default_init) 34 | 35 | 36 | class TwitterSearchBolt(Bolt): 37 | def initialize(self, conf, ctx): 38 | """Initialization steps: 39 | 40 | 1. Get :func:`~birding.search.search_manager_from_config`. 41 | 2. Prepare to track searched terms so as to avoid redundant searches. 42 | """ 43 | self.manager = get_search_manager() 44 | config = get_config()['TwitterSearchBolt'] 45 | self.term_shelf = shelf_from_config(config) 46 | 47 | @fault_barrier 48 | def process(self, tup): 49 | """Process steps: 50 | 51 | 1. Stream in (term, timestamp). 52 | 2. Perform :meth:`~birding.search.SearchManager.search` on term. 53 | 3. Emit (term, timestamp, search_result). 54 | """ 55 | term, timestamp = tup.values 56 | if term not in self.term_shelf: 57 | self.log( 58 | 'search: {term}, {timestamp}' 59 | .format(term=term, timestamp=timestamp)) 60 | search_result = self.manager.search(q=term) 61 | self.emit([term, timestamp, search_result]) 62 | self.term_shelf[term] = timestamp 63 | 64 | 65 | class TwitterLookupBolt(Bolt): 66 | def initialize(self, conf, ctx): 67 | """Initialization steps: 68 | 69 | 1. Get :func:`~birding.search.search_manager_from_config`. 70 | """ 71 | self.manager = get_search_manager() 72 | 73 | @fault_barrier 74 | def process(self, tup): 75 | """Process steps: 76 | 77 | 1. Stream in (term, timestamp, search_result). 78 | 2. Perform :meth:`~birding.search.SearchManager.lookup_search_result`. 79 | 3. Emit (term, timestamp, lookup_result). 
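
Emitted values are plain Python data. For illustration only (the shape
follows the (term, timestamp) input format in :ref:`topology`; the
statuses are whatever the search provider returns)::

    ('apache storm', '2015-06-25T08:00Z', [status, status, ...])
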
80 | """ 81 | term, timestamp, search_result = tup.values 82 | self.log( 83 | 'lookup: {term}, {timestamp}' 84 | .format(term=term, timestamp=timestamp)) 85 | lookup_result = self.manager.lookup_search_result(search_result) 86 | self.emit([term, timestamp, lookup_result]) 87 | 88 | 89 | class ElasticsearchIndexBolt(Bolt): 90 | def initialize(self, conf, ctx): 91 | """Initialization steps: 92 | 93 | 1. Prepare elasticsearch connection, including details for indexing. 94 | """ 95 | config = get_config()['ElasticsearchIndexBolt'] 96 | elasticsearch_class = import_name(config['elasticsearch_class']) 97 | self.es = elasticsearch_class(**config['elasticsearch_init']) 98 | self.index = config['index'] 99 | self.doc_type = config['doc_type'] 100 | 101 | @fault_barrier 102 | def process(self, tup): 103 | """Process steps: 104 | 105 | 1. Index third positional value from input to elasticsearch. 106 | """ 107 | self.es.bulk( 108 | self.generate_bulk_body(tup.values[2]), 109 | index=self.index, 110 | doc_type=self.doc_type) 111 | 112 | @staticmethod 113 | def generate_bulk_body(statuses): 114 | for status in statuses: 115 | yield {'index': {'_id': str(status['id'])}} 116 | yield status 117 | 118 | 119 | class ResultTopicBolt(Bolt): 120 | def initialize(self, conf, ctx): 121 | """Initialization steps: 122 | 123 | 1. Connect to Kafka. 124 | 2. Prepare Kafka producer for `tweet` topic. 125 | 3. Prepare to track tweets published to topic, to avoid redundant data. 126 | """ 127 | config = get_config()['ResultTopicBolt'] 128 | kafka_class = import_name(config['kafka_class']) 129 | self.client = kafka_class(**config['kafka_init']) 130 | self.topic = self.client.topics[config['topic']] 131 | self.producer = self.topic.get_producer() 132 | 133 | # Use own default index value while still allowing user config. 134 | self.tweet_shelf = shelf_from_config(config, index='pre_kafka_shelf') 135 | 136 | @fault_barrier 137 | def process(self, tup): 138 | """Process steps: 139 | 140 | 1. Stream third positional value from input into Kafka topic. 141 | """ 142 | status_seq = self.iter_using_shelf(tup.values[2], self.tweet_shelf) 143 | # This could be more efficient by passing the result from twitter 144 | # straight through to the producer, instead of deserializing and 145 | # reserializing json. 146 | self.producer.produce(json.dumps(status) for status in status_seq) 147 | 148 | @staticmethod 149 | def iter_using_shelf(statuses, shelf): 150 | for status in statuses: 151 | id_str = str(status['id']) 152 | if id_str in shelf: 153 | continue 154 | yield status 155 | shelf[id_str] = None 156 | -------------------------------------------------------------------------------- /src/birding/config.py: -------------------------------------------------------------------------------- 1 | """birding uses a validated configuration file for runtime details. 2 | 3 | Configuration files use a `YAML `_ format. All values have a 4 | default (below) and accept values of the same name in the configuration file, 5 | which has a default path of ``birding.yml`` in the current working 6 | directory. If needed, the ``BIRDING_CONF`` environment variable can point to 7 | the filepath of the configuration file. 8 | 9 | The scope of the configuration file is limited to details of birding itself, 10 | not of Storm-related topics. Storm details are in the project topology 11 | definition. 12 | 13 | When a configuration value is a Python dotted name, it is a string reference to 14 | the Python object to import. 
In general, when the value is just an object name 15 | without a full namespace, it's assumed to be in the relevant birding namespace, 16 | e.g. ``LRUShelf`` is assumed to be ``birding.shelf.LRUShelf``. Respective 17 | ``*_init`` configuration values specify keyword (not positional) arguments to 18 | be passed to the class constructor. 19 | 20 | See :ref:`production` for further discussion on configuration in production 21 | environments. 22 | 23 | For advanced API usage, see :func:`get_config`. The config includes an 24 | `Appendix` to support any additional values not known to birding, such that 25 | these values are available in ``config['Appendix']`` and bypass any 26 | validation. This is useful for code which uses birding's config loader and 27 | needs to define additional values. 28 | 29 | Defaults:: 30 | 31 | Spout: TermCycleSpout 32 | TermCycleSpout: 33 | terms: 34 | - real-time analytics 35 | - apache storm 36 | - pypi 37 | SearchManager: 38 | class: birding.twitter.TwitterSearchManagerFromOAuth 39 | init: {} 40 | TwitterSearchBolt: 41 | shelf_class: FreshLRUShelf 42 | shelf_init: {} 43 | shelf_expiration: 300 44 | ElasticsearchIndexBolt: 45 | elasticsearch_class: elasticsearch.Elasticsearch 46 | elasticsearch_init: 47 | hosts: 48 | - localhost: 9200 49 | index: tweet 50 | doc_type: tweet 51 | ResultTopicBolt: 52 | kafka_class: pykafka.KafkaClient 53 | kafka_init: 54 | hosts: 127.0.0.1:9092 # comma-separated list of hosts 55 | topic: tweet 56 | shelf_class: ElasticsearchShelf 57 | shelf_init: {} 58 | shelf_expiration: null 59 | Appendix: {} 60 | 61 | """ 62 | 63 | import importlib 64 | import logging 65 | import os 66 | import textwrap 67 | from collections import Mapping 68 | from io import StringIO 69 | 70 | import travesty as tv 71 | import yaml 72 | from repoze.lru import LRUCache 73 | 74 | 75 | BIRDING_CONF_DEFAULT = 'birding.yml' 76 | BIRDING_CONF = os.environ.get('BIRDING_CONF', BIRDING_CONF_DEFAULT) 77 | 78 | 79 | SCHEMA = tv.SchemaMapping().of( 80 | Spout = tv.String(), 81 | TermCycleSpout = tv.SchemaMapping().of( 82 | terms = tv.List().of(tv.String())), 83 | SearchManager = tv.SchemaMapping().of(**{ 84 | 'class': tv.String(), 85 | 'init': tv.StrMapping().of(tv.Passthrough())}), 86 | TwitterSearchBolt = tv.SchemaMapping().of( 87 | shelf_class = tv.String(), 88 | shelf_init = tv.StrMapping().of(tv.Passthrough()), 89 | shelf_expiration = tv.Optional(tv.Int())), 90 | ElasticsearchIndexBolt = tv.SchemaMapping().of( 91 | elasticsearch_class = tv.String(), 92 | elasticsearch_init = tv.StrMapping().of(tv.Passthrough()), 93 | index = tv.String(), 94 | doc_type = tv.String()), 95 | ResultTopicBolt = tv.SchemaMapping().of( 96 | kafka_class = tv.String(), 97 | kafka_init = tv.StrMapping().of(tv.Passthrough()), 98 | topic = tv.String(), 99 | shelf_class = tv.String(), 100 | shelf_init = tv.StrMapping().of(tv.Passthrough()), 101 | shelf_expiration = tv.Optional(tv.Int())), 102 | Appendix = tv.Passthrough()) 103 | 104 | 105 | CACHE = LRUCache(16)  # size: max number of cached configurations 106 | 107 | 108 | def get_config(filepath=None, default_loader=None, on_missing=None): 109 | """Get a dict for the current birding configuration. 110 | 111 | The resulting dictionary is fully populated with defaults, such that all 112 | valid keys will resolve to valid values. Invalid and extra values in the 113 | configuration result in an exception. 114 | 115 | See :ref:`config` (module-level docstring) for discussion on how birding 116 | configuration works, including filepath loading. 
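
For illustration, a minimal use that sketches reading the defaults
(assumes no ``birding.yml`` is present, so packaged defaults apply)::

    from birding.config import get_config

    config = get_config()
    config['Spout']                    # 'TermCycleSpout'
    config['TermCycleSpout']['terms']  # ['real-time analytics', ...]
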
Note that a non-default 117 | filepath set via env results in a :py:exc:`OSError` when the file is 118 | missing, but the default filepath is ignored when missing. 119 | 120 | This function caches its return values as to only parse configuration once 121 | per set of inputs. As such, treat the resulting dictionary as read-only as 122 | not to accidentally write values which will be seen by other handles of the 123 | dictionary. 124 | 125 | Args: 126 | filepath (str): path to birding configuration YAML file. 127 | default_loader (callable): 128 | callable which returns file descriptor with YAML data of default 129 | configuration values 130 | on_missing (callable): callback to call when file is missing. 131 | Returns: 132 | dict: dict of current birding configuration; treat as read-only. 133 | 134 | """ 135 | # Handle cache lookup explicitly in order to support keyword arguments. 136 | cache_key = (filepath, default_loader, on_missing) 137 | if CACHE.get(cache_key) is not None: 138 | return CACHE.get(cache_key) 139 | 140 | logger = logging.getLogger('birding') 141 | 142 | if filepath is None: 143 | filepath = BIRDING_CONF 144 | if default_loader is None: 145 | default_loader = get_defaults_file 146 | if on_missing is None: 147 | on_missing = logger.info 148 | 149 | logger.info( 150 | 'Looking for configuration file: {}'.format(os.path.abspath(filepath))) 151 | if not os.path.exists(filepath): 152 | # Log a message if filepath is default; raise error if not default. 153 | on_missing('No {} configuration file found.'.format(filepath)) 154 | if filepath != BIRDING_CONF_DEFAULT: 155 | # Stat the missing file to result in OSError. 156 | os.stat(filepath) 157 | 158 | config = yaml.safe_load(default_loader()) 159 | tv.validate(SCHEMA, config) 160 | if os.path.exists(filepath): 161 | file_config = yaml.safe_load(open(filepath)) 162 | if file_config: 163 | config = overlay(file_config, config) 164 | tv.validate(SCHEMA, config) 165 | 166 | CACHE.put(cache_key, config) 167 | 168 | return config 169 | 170 | 171 | def get_defaults_file(*a, **kw): 172 | """Get a file object with YAML data of configuration defaults. 173 | 174 | Arguments are passed through to :func:`get_defaults_str`. 175 | """ 176 | fd = StringIO() 177 | fd.write(get_defaults_str(*a, **kw)) 178 | fd.seek(0) 179 | return fd 180 | 181 | 182 | def get_defaults_str(raw=None, after='Defaults::'): 183 | """Get the string YAML representation of configuration defaults.""" 184 | if raw is None: 185 | raw = __doc__ 186 | return unicode(textwrap.dedent(raw.split(after)[-1]).strip()) 187 | 188 | 189 | def overlay(upper, lower): 190 | """Return the overlay of `upper` dict onto `lower` dict. 191 | 192 | This operation is similar to `dict.update`, but recurses when it encounters 193 | a dict/mapping, as to allow nested leaf values in the lower collection 194 | which are not in the upper collection. Whenever the upper collection has a 195 | value, its value is used. 196 | 197 | >>> overlay({'a': 0}, {}) 198 | {'a': 0} 199 | >>> abc = {'a': 0, 'b': 1, 'c': 2} 200 | >>> abc == overlay({'a': 0, 'c': 2}, {'a': None, 'b': 1}) 201 | True 202 | >>> result = {' ': None, '_': abc} 203 | >>> result == overlay( 204 | ... {'_': {'a': 0, 'c': 2}, ' ': None}, 205 | ... {'_': {'a': None, 'b': 1}}) 206 | ... 
207 | True 208 | >>> 209 | """ 210 | result = {} 211 | for key in upper: 212 | if is_mapping(upper[key]): 213 | lower_value = lower.get(key, {}) 214 | if not is_mapping(lower_value): 215 | msg = 'Attempting to overlay a mapping on a non-mapping: {}' 216 | raise ValueError(msg.format(key)) 217 | result[key] = overlay(upper[key], lower_value) 218 | else: 219 | result[key] = upper[key] 220 | for key in lower: 221 | if key in result: 222 | continue 223 | result[key] = lower[key] 224 | return result 225 | 226 | 227 | def is_mapping(x): 228 | return isinstance(x, Mapping) or isinstance(x, dict) 229 | 230 | 231 | def import_name(name, default_ns=None): 232 | """Import an object based on the dotted string. 233 | 234 | >>> import_name('textwrap') # doctest: +ELLIPSIS 235 | <module 'textwrap' from '...'> 236 | >>> import_name('birding.config') # doctest: +ELLIPSIS 237 | <module 'birding.config' from '...'> 238 | >>> import_name('birding.config.get_config') # doctest: +ELLIPSIS 239 | <function get_config at 0x...> 240 | >>> 241 | 242 | If `default_ns` is provided, use it as the namespace if `name` does not have a dot. 243 | 244 | >>> ns = 'birding.config' 245 | >>> x = import_name('birding.config.get_config') 246 | >>> x # doctest: +ELLIPSIS 247 | <function get_config at 0x...> 248 | >>> x == import_name('get_config', default_ns=ns) 249 | True 250 | >>> x == import_name('birding.config.get_config', default_ns=ns) 251 | True 252 | >>> 253 | """ 254 | if '.' not in name: 255 | if default_ns is None: 256 | return importlib.import_module(name) 257 | else: 258 | name = default_ns + '.' + name 259 | module_name, object_name = name.rsplit('.', 1) 260 | module = importlib.import_module(module_name) 261 | return getattr(module, object_name) 262 | 263 | 264 | if __name__ == '__main__': 265 | import pprint 266 | import sys 267 | 268 | if '--yaml' in sys.argv: 269 | print(yaml.safe_dump(get_config(), default_flow_style=False)) 270 | else: 271 | pprint.pprint(get_config()) 272 | -------------------------------------------------------------------------------- /src/birding/follow.py: -------------------------------------------------------------------------------- 1 | """Tool to follow output of birding.""" 2 | 3 | from __future__ import print_function 4 | 5 | import codecs 6 | import json 7 | import sys 8 | from contextlib import contextmanager 9 | from time import sleep 10 | 11 | from pykafka.exceptions import KafkaException 12 | 13 | from .bolt import get_search_manager 14 | from .config import get_config, import_name 15 | 16 | 17 | # Force unicode stdio, avoid ASCII encoding errors. 18 | reload(sys) 19 | sys.stdout = codecs.getwriter('utf-8')(sys.stdout) 20 | sys.stdin = codecs.getreader('utf-8')(sys.stdin) 21 | 22 | 23 | def follow_topic_from_config(): 24 | """Read kafka config, then dispatch to `follow_topic`.""" 25 | config = get_config()['ResultTopicBolt'] 26 | kafka_class = import_name(config['kafka_class']) 27 | return follow_topic(kafka_class, config['topic'], **config['kafka_init']) 28 | 29 | 30 | def follow_topic(kafka_class, name, retry_interval=1, **kafka_init): 31 | """Dump each message from kafka topic to stdio.""" 32 | while True: 33 | try: 34 | client = kafka_class(**kafka_init) 35 | topic = client.topics[name] 36 | consumer = topic.get_simple_consumer(reset_offset_on_start=True) 37 | except Exception as e: 38 | if not should_try_kafka_again(e): 39 | raise 40 | with flushing(sys.stderr): 41 | print( 42 | 'Failed attempt to connect to Kafka. 
Will retry ...', 43 | file=sys.stderr) 44 | sleep(retry_interval) 45 | else: 46 | with flushing(sys.stdout): 47 | print('Connected to Kafka.') 48 | break 49 | 50 | dump = Dump() 51 | for message in consumer: 52 | with flushing(sys.stdout, sys.stderr): 53 | status = load(message.value) 54 | if status: 55 | dump(status) 56 | 57 | 58 | def follow_fd(fd): 59 | """Dump each line of input to stdio.""" 60 | dump = Dump() 61 | for line in fd: 62 | if not line.strip(): 63 | continue 64 | 65 | with flushing(sys.stdout, sys.stderr): 66 | status = load(line) 67 | if status: 68 | dump(status) 69 | 70 | 71 | def load(message): 72 | try: 73 | return json.loads(message) 74 | except Exception as e: 75 | print(str(e), file=sys.stderr) 76 | 77 | 78 | class Dump(object): 79 | def __init__(self, *a, **kw): 80 | self.manager = get_search_manager(*a, **kw) 81 | 82 | def __call__(self, *statuses): 83 | try: 84 | print(self.manager.dump(statuses)) 85 | print('') 86 | except UnicodeEncodeError as e: 87 | print(str(e), file=sys.stderr) 88 | 89 | 90 | @contextmanager 91 | def flushing(*fds): 92 | yield 93 | for fd in fds: 94 | fd.flush() 95 | 96 | 97 | def should_try_kafka_again(error): 98 | """Determine if the error means to retry or fail, True to retry.""" 99 | msg = 'Unable to retrieve' 100 | return isinstance(error, KafkaException) and str(error).startswith(msg) 101 | 102 | 103 | if __name__ == '__main__': 104 | follow_topic_from_config() 105 | -------------------------------------------------------------------------------- /src/birding/gnip.py: -------------------------------------------------------------------------------- 1 | """Minimal Gnip API using HTTP requests.""" 2 | 3 | import textwrap 4 | 5 | import requests 6 | 7 | from .search import SearchManager 8 | 9 | 10 | class Gnip(object): 11 | """Simple binding to Gnip search API.""" 12 | 13 | session_class = requests.Session 14 | 15 | default_params = { 16 | 'publisher': 'twitter', 17 | 'maxResults': 500, 18 | } 19 | 20 | def __init__(self, base_url, stream, username, password, **params): 21 | """Prepare HTTP session for gnip searches.""" 22 | self.base_url = base_url 23 | self.stream = stream 24 | self.username = username 25 | self.password = password 26 | 27 | self.params = {} # Use on every search. 28 | self.params.update(self.default_params) 29 | self.params.update(params) 30 | 31 | self.session = self.start_session() 32 | 33 | def start_session(self): 34 | session = self.session_class() 35 | session.auth = (self.username, self.password) 36 | return session 37 | 38 | def search(self, q, **kw): 39 | """Search Gnip for given query, returning deserialized response.""" 40 | url = '{base_url}/search/{stream}'.format(**vars(self)) 41 | 42 | params = { 43 | 'q': q, 44 | } 45 | params.update(self.params) 46 | params.update(kw) 47 | 48 | response = self.session.get(url, params=params) 49 | response.raise_for_status() 50 | return response.json() 51 | 52 | 53 | class GnipSearchManager(SearchManager): 54 | """Service object to provide fully-hydrated tweets given a search query.""" 55 | 56 | def __init__(self, *a, **kw): 57 | self.gnip = Gnip(*a, **kw) 58 | 59 | def search(self, q, **kw): 60 | """Search gnip for ``q``, return `results`__ directly from gnip. 
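
For illustration, a sketch of direct use; the account details mirror
:ref:`gnip` and the credentials are assumed::

    manager = GnipSearchManager(
        base_url='https://search.gnip.com/accounts/Example',
        stream='prod.json',
        username=username,
        password=password)
    response = manager.search('apache storm')
    statuses = manager.lookup_search_result(response)
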
61 | 62 | __ http://support.gnip.com/apis/search_api/api_reference.html 63 | """ 64 | return self.gnip.search(q, **kw) 65 | 66 | def lookup_search_result(self, result, **kw): 67 | """Do almost nothing; just pass results through.""" 68 | return result['results'] 69 | 70 | def lookup(self, id_list, **kw): 71 | """Not implemented.""" 72 | raise NotImplementedError('gnip does not have a lookup API.') 73 | 74 | @staticmethod 75 | def dump(result): 76 | """Dump result into a string, useful for debugging.""" 77 | if isinstance(result, dict): 78 | # Result is a search result. 79 | statuses = result['results'] 80 | else: 81 | # Result is a lookup result. 82 | statuses = result 83 | status_str_list = [] 84 | for status in statuses: 85 | status_str_list.append(textwrap.dedent(u""" 86 | @{screen_name} -- https://twitter.com/{screen_name} 87 | {text} 88 | """).strip().format( 89 | screen_name=status['actor']['preferredUsername'], 90 | text=status['body'])) 91 | return u'\n\n'.join(status_str_list) 92 | -------------------------------------------------------------------------------- /src/birding/search.py: -------------------------------------------------------------------------------- 1 | """Search. Get tweets.""" 2 | 3 | from abc import ABCMeta, abstractmethod 4 | 5 | from .config import import_name 6 | 7 | 8 | def search_manager_from_config(config, **default_init): 9 | """Get a `SearchManager` instance dynamically based on config. 10 | 11 | `config` is a dictionary containing ``class`` and ``init`` keys as defined 12 | in :mod:`birding.config`. 13 | """ 14 | manager_cls = import_name(config['class'], default_ns='birding.search') 15 | init = {} 16 | init.update(default_init) 17 | init.update(config['init']) 18 | manager = manager_cls(**init) 19 | return manager 20 | 21 | 22 | class SearchManager(object): 23 | """Abstract base class for a service object to search for tweets.""" 24 | 25 | __metaclass__ = ABCMeta 26 | 27 | @abstractmethod 28 | def search(self, q=None, **kw): 29 | """Search for ``q``, return results directly from source.""" 30 | 31 | @abstractmethod 32 | def lookup_search_result(self, result, **kw): 33 | """Perform :meth:`lookup` on return value of :meth:`search`.""" 34 | 35 | @abstractmethod 36 | def lookup(self, id_list, **kw): 37 | """Look up a list of statuses, return results directly from source. 38 | 39 | Input can be any sequence of numeric or string values representing 40 | status IDs. 41 | """ 42 | -------------------------------------------------------------------------------- /src/birding/shelf.py: -------------------------------------------------------------------------------- 1 | """Track terms using a simple dict-like interface.""" 2 | 3 | import abc 4 | import collections 5 | import time 6 | 7 | import elasticsearch 8 | from repoze.lru import LRUCache 9 | 10 | from .config import import_name 11 | 12 | 13 | UNSET = object() 14 | 15 | 16 | def shelf_from_config(config, **default_init): 17 | """Get a `Shelf` instance dynamically based on config. 18 | 19 | `config` is a dictionary containing ``shelf_*`` keys as defined in 20 | :mod:`birding.config`. 
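For example, a config mapping like the following (values are
illustrative) builds an in-memory LRU shelf whose entries go stale
after 300 seconds:

    {'shelf_class': 'FreshLRUShelf',
     'shelf_init': {'maxsize': 1000},
     'shelf_expiration': 300}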
21 | """ 22 | shelf_cls = import_name(config['shelf_class'], default_ns='birding.shelf') 23 | init = {} 24 | init.update(default_init) 25 | init.update(config['shelf_init']) 26 | shelf = shelf_cls(**init) 27 | if hasattr(shelf, 'set_expiration') and 'shelf_expiration' in config: 28 | shelf.set_expiration(config['shelf_expiration']) 29 | return shelf 30 | 31 | 32 | class Shelf(collections.MutableMapping): 33 | """Abstract base class for a shelf to track -- but not iterate -- values. 34 | 35 | Provides a dict-interface. 36 | """ 37 | 38 | __metaclass__ = abc.ABCMeta 39 | 40 | @abc.abstractmethod 41 | def getitem(self, key): 42 | """Get an item's value from the shelf or raise KeyError(key).""" 43 | 44 | @abc.abstractmethod 45 | def setitem(self, key, value): 46 | """Set an item on the shelf, with the given value.""" 47 | 48 | @abc.abstractmethod 49 | def delitem(self, key): 50 | """Remove an item from the shelf.""" 51 | 52 | @abc.abstractmethod 53 | def clear(self): 54 | """Remove all items from the shelf.""" 55 | 56 | def unpack(self, key, value): 57 | """Unpack value from ``getitem``. 58 | 59 | This is useful for `Shelf` implementations which require metadata be 60 | stored with the shelved values, in which case ``pack`` should implement 61 | the inverse operation. By default, the value is simply passed through 62 | without modification. The ``unpack`` implementation is called on 63 | ``__getitem__`` and therefore can raise `KeyError` if packed metadata 64 | indicates that a value is invalid. 65 | """ 66 | return value 67 | 68 | def pack(self, key, value): 69 | """Pack value given to ``setitem``, inverse of ``unpack``.""" 70 | return value 71 | 72 | def __getitem__(self, key): 73 | return self.unpack(key, self.getitem(self.__keytransform__(key))) 74 | 75 | def __setitem__(self, key, value): 76 | self.setitem(self.__keytransform__(key), self.pack(key, value)) 77 | 78 | def __delitem__(self, key): 79 | self.delitem(self.__keytransform__(key)) 80 | 81 | def __keytransform__(self, key): 82 | return key 83 | 84 | def __iter__(self): 85 | raise NotImplementedError('Shelf instances do not support iteration.') 86 | 87 | def __len__(self): 88 | raise NotImplementedError('Shelf instances do not support iteration.') 89 | 90 | 91 | class FreshPacker(object): 92 | """Mixin for pack/unpack implementation to expire shelf content.""" 93 | 94 | #: Values are no longer fresh after this value, in seconds. 
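#: A value of None disables expiration entirely (see is_fresh below).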
95 | expire_after = 5 * 60 96 | 97 | def unpack(self, key, value): 98 | """Unpack and return value only if it is fresh.""" 99 | value, freshness = value 100 | if not self.is_fresh(freshness): 101 | raise KeyError('{} (stale)'.format(key)) 102 | return value 103 | 104 | def pack(self, key, value): 105 | """Pack value with metadata on its freshness.""" 106 | return value, self.freshness() 107 | 108 | def set_expiration(self, expire_after): 109 | """Set a new expiration for freshness of all unpacked values.""" 110 | self.expire_after = expire_after 111 | 112 | def freshness(self): 113 | """Clock function to use for freshness packing/unpacking.""" 114 | return time.time() 115 | 116 | def is_fresh(self, freshness): 117 | """Return False if given freshness value has expired, else True.""" 118 | if self.expire_after is None: 119 | return True 120 | return self.freshness() - freshness <= self.expire_after 121 | 122 | 123 | class LRUShelf(Shelf): 124 | """An in-memory Least-Recently Used shelf holding up to `maxsize` items.""" 125 | 126 | def __init__(self, maxsize=1000): 127 | self.store = LRUCache(int(maxsize)) 128 | 129 | def getitem(self, key): 130 | value = self.store.get(key, UNSET) 131 | if value is UNSET: 132 | raise KeyError(key) 133 | return value 134 | 135 | def setitem(self, key, value): 136 | self.store.put(key, value) 137 | 138 | def delitem(self, key): 139 | self.store.invalidate(key) 140 | 141 | def clear(self): 142 | self.store.clear() 143 | 144 | 145 | class FreshLRUShelf(FreshPacker, LRUShelf): 146 | """A Least-Recently Used shelf which expires values.""" 147 | 148 | 149 | class ElasticsearchShelf(Shelf): 150 | """A shelf implemented using an elasticsearch index.""" 151 | 152 | def __init__(self, index='shelf', doc_type='shelf', **elasticsearch_init): 153 | self.es = elasticsearch.Elasticsearch(**elasticsearch_init) 154 | self.index_client = elasticsearch.client.IndicesClient(self.es) 155 | self.index = index 156 | self.doc_type = doc_type 157 | 158 | def getitem(self, key): 159 | try: 160 | doc = self.es.get(index=self.index, doc_type=self.doc_type, id=key) 161 | except elasticsearch.exceptions.NotFoundError: 162 | raise KeyError(key) 163 | 164 | if not doc: 165 | raise KeyError(key) 166 | 167 | try: 168 | value = doc['_source']['value'] 169 | except KeyError: 170 | raise KeyError('{} (malformed data)'.format(key)) 171 | 172 | return value 173 | 174 | def setitem(self, key, value): 175 | self.es.index( 176 | index=self.index, 177 | doc_type=self.doc_type, 178 | id=key, 179 | body={'value': value}, 180 | refresh=True) 181 | 182 | def delitem(self, key): 183 | self.es.delete(index=self.index, doc_type=self.doc_type, id=key) 184 | 185 | def clear(self): 186 | self.index_client.delete(self.index) 187 | 188 | 189 | class FreshElasticsearchShelf(FreshPacker, ElasticsearchShelf): 190 | """A shelf implementation backed by elasticsearch which expires values.""" 191 | -------------------------------------------------------------------------------- /src/birding/spout.py: -------------------------------------------------------------------------------- 1 | """Storm Spout classes.""" 2 | 3 | import datetime 4 | import itertools 5 | 6 | from streamparse.spout import Spout 7 | 8 | from .config import get_config, import_name 9 | 10 | 11 | def DispatchSpout(*a, **kw): 12 | """Factory to dispatch spout class based on config.""" 13 | spout_class_name = get_config()['Spout'] 14 | spout_class = import_name(spout_class_name, default_ns='birding.spout') 15 | return spout_class(*a, **kw) 16 | 17 | 18 | class 
TermMethods(object): 19 | @staticmethod 20 | def pack_tup_id(term, timestamp): 21 | """Pack term, timestamp into a tuple ID suitable for Storm. 22 | 23 | Example: 24 | 25 | >>> TermMethods.pack_tup_id('search it!', '2015-09-24T14:39:53.429183') 26 | 'search it! 2015-09-24T14:39:53.429183' 27 | >>> 28 | """ 29 | return '{} {}'.format(term, timestamp) 30 | 31 | @staticmethod 32 | def parse_tup_id(tup_id): 33 | """Parse a `pack_tup_id`-packed tuple ID into term, timestamp. 34 | 35 | Example: 36 | 37 | >>> TermMethods.parse_tup_id('search it! 2015-09-24T14:39:53.429183') 38 | ('search it!', '2015-09-24T14:39:53.429183') 39 | >>> 40 | """ 41 | return tuple(tup_id.rsplit(' ', 1)) 42 | 43 | 44 | class TermCycleSpout(Spout, TermMethods): 45 | def initialize(self, stormconf, context): 46 | """Initialization steps: 47 | 48 | 1. Prepare sequence of terms based on config: TermCycleSpout/terms. 49 | """ 50 | self.terms = get_config()['TermCycleSpout']['terms'] 51 | self.term_seq = itertools.cycle(self.terms) 52 | 53 | def next_tuple(self): 54 | """Next tuple steps: 55 | 56 | 1. Emit (term, timestamp) for next term in sequence w/current UTC time. 57 | """ 58 | term = next(self.term_seq) 59 | timestamp = datetime.datetime.utcnow().isoformat() 60 | self.emit([term, timestamp], tup_id=self.pack_tup_id(term, timestamp)) 61 | -------------------------------------------------------------------------------- /src/birding/twitter.py: -------------------------------------------------------------------------------- 1 | """Minimal twitter API shim using http://mike.verdone.ca/twitter/.""" 2 | 3 | from __future__ import absolute_import 4 | 5 | import os 6 | import textwrap 7 | 8 | from twitter.api import Twitter as BaseTwitter 9 | from twitter.cmdline import CONSUMER_KEY, CONSUMER_SECRET 10 | from twitter.oauth import OAuth, read_token_file 11 | 12 | from .search import SearchManager 13 | 14 | 15 | class Twitter(BaseTwitter): 16 | @classmethod 17 | def from_oauth_file(cls, filepath=None): 18 | """Get an object bound to the Twitter API using your own credentials. 19 | 20 | The `twitter` library ships with a `twitter` command that uses PIN 21 | OAuth. Generate your own OAuth credentials by running `twitter` from 22 | the shell, which will open a browser window to authenticate you. Once 23 | successfully run, even just one time, you will have a credential file 24 | at ~/.twitter_oauth. 25 | 26 | This factory function reuses your credential file to get a `Twitter` 27 | object. (Really, this code is just lifted from the `twitter.cmdline` 28 | module to minimize OAuth dancing.) 29 | """ 30 | if filepath is None: 31 | # Use default OAuth filepath from `twitter` command-line program. 32 | home = os.environ.get('HOME', os.environ.get('USERPROFILE', '')) 33 | filepath = os.path.join(home, '.twitter_oauth') 34 | 35 | oauth_token, oauth_token_secret = read_token_file(filepath) 36 | 37 | twitter = cls( 38 | auth=OAuth( 39 | oauth_token, oauth_token_secret, CONSUMER_KEY, CONSUMER_SECRET), 40 | api_version='1.1', 41 | domain='api.twitter.com') 42 | 43 | return twitter 44 | 45 | 46 | class TwitterSearchManager(SearchManager): 47 | """Service object to provide fully-hydrated tweets given a search query.""" 48 | 49 | def __init__(self, twitter): 50 | self.twitter = twitter 51 | 52 | def search(self, q=None, **kw): 53 | """Search twitter for ``q``, return `results`__ directly from twitter. 
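A minimal sketch, assuming a valid ~/.twitter_oauth credential file
exists (see Twitter.from_oauth_file above); it calls the live API, so
it is marked to skip under doctest:

>>> manager = TwitterSearchManagerFromOAuth()  # doctest: +SKIP
>>> result = manager.search('python')  # doctest: +SKIP
>>> statuses = manager.lookup_search_result(result)  # doctest: +SKIP
>>> print(manager.dump(statuses))  # doctest: +SKIP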
54 | 55 | __ https://dev.twitter.com/rest/reference/get/search/tweets 56 | """ 57 | if q is None: 58 | raise ValueError('No search query provided for `q` keyword.') 59 | return self.twitter.search.tweets(q=q, **kw) 60 | 61 | def lookup_search_result(self, result, **kw): 62 | """Perform :meth:`lookup` on return value of :meth:`search`.""" 63 | return self.lookup((s['id_str'] for s in result['statuses']), **kw) 64 | 65 | def lookup(self, id_list, **kw): 66 | """Look up a list of statuses, return `results`__ directly from twitter. 67 | 68 | Input can be any sequence of numeric or string values representing 69 | twitter status IDs. 70 | 71 | __ https://dev.twitter.com/rest/reference/get/statuses/lookup 72 | """ 73 | result_id_pack = ','.join([str(_id) for _id in id_list]) 74 | if not result_id_pack: 75 | return [] 76 | return self.twitter.statuses.lookup(_id=result_id_pack) 77 | 78 | @staticmethod 79 | def dump(result): 80 | """Dump result into a string, useful for debugging.""" 81 | if isinstance(result, dict): 82 | # Result is a search result. 83 | statuses = result['statuses'] 84 | else: 85 | # Result is a lookup result. 86 | statuses = result 87 | status_str_list = [] 88 | for status in statuses: 89 | status_str_list.append(textwrap.dedent(u""" 90 | @{screen_name} -- https://twitter.com/{screen_name} 91 | {text} 92 | """).strip().format( 93 | screen_name=status['user']['screen_name'], 94 | text=status['text'])) 95 | return u'\n\n'.join(status_str_list) 96 | 97 | 98 | def TwitterSearchManagerFromOAuth(*a, **kw): 99 | """Build :class:`TwitterSearchManager` from user OAuth file. 100 | 101 | Arguments are passed to :meth:`birding.twitter.Twitter.from_oauth_file`. 102 | """ 103 | return TwitterSearchManager(Twitter.from_oauth_file(*a, **kw)) 104 | 105 | 106 | def main(): 107 | """Do the default action of `twitter` command.""" 108 | from twitter.cmdline import Action, OPTIONS 109 | twitter = Twitter.from_oauth_file() 110 | Action()(twitter, OPTIONS) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /src/birding/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2015 Parsely, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | __version__ = '0.3-dev' 18 | VERSION = (0, 3, 'dev') 19 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | def pre_submit(topology_name, env_name, env_config): 2 | """Override this function to perform custom actions prior to topology 3 | submission. No SSH tunnels will be active when this function is called.""" 4 | pass 5 | 6 | 7 | def post_submit(topo_name, env_name, env_config): 8 | """Override this function to perform custom actions after topology 9 | submission. 
Note that the SSH tunnel to Nimbus will still be active 10 | when this function is called.""" 11 | pass 12 | -------------------------------------------------------------------------------- /topologies/birding.clj: -------------------------------------------------------------------------------- 1 | (ns birding 2 | (:use [streamparse.specs]) 3 | (:gen-class)) 4 | 5 | (defn birding [options] 6 | [ 7 | ;; spout configuration 8 | {"term-spout" (python-spout-spec 9 | options 10 | ; Dispatch class based on birding.yml. 11 | "birding.spout.DispatchSpout" 12 | ["term" "timestamp"] 13 | :conf {"topology.max.spout.pending", 8} 14 | ) 15 | } 16 | ;; bolt configuration 17 | {"search-bolt" (python-bolt-spec 18 | options 19 | ; Use field grouping on term to support in-memory caching. 20 | {"term-spout" ["term"]} 21 | "birding.bolt.TwitterSearchBolt" 22 | ["term" "timestamp" "search_result"] 23 | :p 2 24 | ) 25 | "lookup-bolt" (python-bolt-spec 26 | options 27 | {"search-bolt" :shuffle} 28 | "birding.bolt.TwitterLookupBolt" 29 | ["term" "timestamp" "lookup_result"] 30 | :p 2 31 | ) 32 | "elasticsearch-index-bolt" (python-bolt-spec 33 | options 34 | {"lookup-bolt" :shuffle} 35 | "birding.bolt.ElasticsearchIndexBolt" 36 | [] 37 | :p 1 38 | ) 39 | "result-topic-bolt" (python-bolt-spec 40 | options 41 | {"lookup-bolt" :shuffle} 42 | "birding.bolt.ResultTopicBolt" 43 | [] 44 | :p 1 45 | ) 46 | } 47 | ] 48 | ) 49 | -------------------------------------------------------------------------------- /virtualenvs/birding.txt: -------------------------------------------------------------------------------- 1 | # Update setup.py install_requires when editing this list. 2 | elasticsearch==1.7.0 3 | pykafka==1.1.0 4 | pyyaml==3.11 5 | repoze.lru==0.6 6 | requests==2.8.1 7 | six==1.9.0 8 | streamparse==2.0.2 9 | travesty==0.1.1 10 | twitter==1.17.0 11 | --------------------------------------------------------------------------------
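# Usage sketch (a sketch only, assuming birding is installed with a valid
# birding.yml and a reachable Kafka): several modules above double as
# scripts via their __main__ blocks, e.g.:
#
#     python -m birding.config --yaml   # print the effective config as YAML
#     python -m birding.follow          # tail the Kafka result topic
#     python -m birding.twitter         # run the default `twitter` action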