├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── dnflow.cfg.template
├── json2csv.py
├── luigi.cfg
├── queue_tasks.py
├── requirements.txt
├── schema.sql
├── static
│   ├── css
│   │   └── style.css
│   └── js
│       └── index.js
├── summarize.py
├── templates
│   ├── base.html
│   ├── feed.xml
│   ├── index.html
│   ├── robots.txt
│   ├── summary.html
│   └── summary_compare.html
└── ui.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | # IPython Notebook
62 | .ipynb_checkpoints
63 | 
64 | 
65 | .python-version
66 | *.swp
67 | ENV
68 | data/
69 | *.sqlite*
70 | luigi-state.pickle
71 | .DS_Store
72 | *.rdb
73 | node_modules/
74 | dnflow.cfg
75 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2016 Washington University in St Louis
4 | Copyright (c) 2016 University of California at Riverside
5 | Copyright (c) 2016 University of Maryland.
6 | 
7 | Permission is hereby granted, free of charge, to any person obtaining a copy
8 | of this software and associated documentation files (the "Software"), to deal
9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # dnflow
2 | 
3 | An early experiment in automating a series of actions with Twitter
4 | data for DocNow.
If you don't want to install and set up dnflow manually,
5 | give our
6 | [Ansible playbook](https://github.com/docnow/dnflow-ansible) a try.
7 | 
8 | Uses [Luigi](http://luigi.readthedocs.org/) for workflow automation.
9 | 
10 | 
11 | ## running it for yourself
12 | 
13 | First create your dnflow configuration file, and add your Twitter application
14 | keys to it:
15 | 
16 |     cp dnflow.cfg.template dnflow.cfg
17 | 
18 | If you are running on a non-standard HTTP port, such as the flask default,
19 | `localhost:5000`, be sure to include the port number in the value of
20 | `HOSTNAME`, e.g.:
21 | 
22 |     HOSTNAME = 'localhost:5000'
23 | 
24 | The current `summarize.py` is set up to collect a handful of tweets
25 | based on a search, then execute a series of counts against it. This
26 | will result in one data file (the source tweets) and several count
27 | files (with the same name under `data/`, but with extensions like
28 | `-urls` and `-hashtags` added on).
29 | 
30 | Assuming you have an activated virtualenv or similar sandbox,
31 | install the requirements first:
32 | ```
33 | % pip install -r requirements.txt
34 | ```
35 | 
36 | Start the `luigid` central scheduler, best done in another terminal:
37 | ```
38 | % luigid
39 | ```
40 | 
41 | To test the workflow, run the following to kick it off (substituting a
42 | search term of interest):
43 | ```
44 | % python -m luigi --module summarize RunFlow --term lahoreblast
45 | ```
46 | 
47 | It may take a moment to execute the search, which will require repeated
48 | calls to the Twitter API. As soon as it completes, you should have all
49 | the mentioned files in your `data/` directory. The naming scheme isn't
50 | well thought out yet; this is only a test.
51 | 
52 | While you're at it, take a look at the web UI for luigi's scheduler at:
53 | 
54 |     http://localhost:8082/
55 | 
56 | (Assuming you didn't change the port when you started luigid.)
57 | 
58 | 
59 | ## adding the flask UI
60 | 
61 | `ui.py` contains a simple web app that lets a search be specified
62 | through the web, queues workflows to execute in the background, and
63 | shows workflow status along with links to completed summaries.
64 | Running the web UI takes a few more steps.
65 | 
66 | * Install and run [Redis](http://redis.io/)
67 | 
68 | Redis can be run without configuration changes, best done in another
69 | terminal:
70 | 
71 | ```
72 | % redis-server
73 | ```
74 | 
75 | * Start a [Redis Queue](http://python-rq.org/) worker
76 | 
77 | RQ requires a running instance of Redis and one or more workers, also
78 | best done in another terminal:
79 | 
80 | ```
81 | % rq worker
82 | ```
83 | 
84 | * Create the flask UI database
85 | 
86 | A simple SQLite3 database tracks the searches you create and their
87 | workflow status. Within your dnflow virtual environment:
88 | 
89 | ```
90 | % sqlite3 db.sqlite3 < schema.sql
91 | ```
92 | 
93 | * Start the [flask](http://flask.pocoo.org/) UI
94 | 
95 | The flask UI shows a list of existing searches, lets you add new ones,
96 | and links to completed search summaries. Again, within your dnflow
97 | virtual environment, and probably in yet another terminal window:
98 | 
99 | ```
100 | % python ui.py
101 | ```
102 | 
103 | 
104 | ### The flow, for now
105 | 
106 | The luigi workflow is not automated; it needs to be invoked explicitly.
107 | The web UI is the wrong place to invoke the workflow because the
108 | workflow can run for a long time, yet the UI needs to remain
109 | responsive.
For these reasons, the workflow is handed off through
110 | the queue.
111 | 
112 | When a search is added, dnflow adds a job to the queue: a small
113 | function that invokes the luigi workflow from the command line in a
114 | Python subprocess. RQ stores this task for later processing. If one
115 | or more RQ workers are available, the job is assigned and begins.
116 | Because enqueueing the job is very fast, dnflow can return an updated
117 | view promptly.
118 | 
119 | The luigi workflow takes as long as it needs, generating static files
120 | in a distinct directory for each requested search.
121 | 
122 | Integration between the web UI and workflows occurs in the UI's
123 | SQLite database, where search terms are stored with a job id. When
124 | the workflow is assigned to an RQ worker, that search record is
125 | updated through an HTTP PUT to the web app at the URL `/job`, with
126 | a reference to the job id and its output directory. Each individual
127 | task within the workflow further updates this same URL with additional
128 | PUTs upon task start, success, or failure. This is handled with
129 | [Luigi's event
130 | model](http://luigi.readthedocs.io/en/stable/api/luigi.event.html);
131 | these HTTP callbacks to the UI keep the integration between the two
132 | pieces of the environment simple. During a workflow, the most recent
133 | task status is recorded in the database, where it is available for
134 | display in the UI.
135 | 
136 | With these pieces in place, several requests for new searches can
137 | be added rapidly within the UI. Each search will be run by the
138 | next available RQ worker process: if only one worker is available,
139 | searches execute in succession, but with more than one worker running,
140 | multiple workflows can run in parallel. The main limitation here
141 | is the rate limit on Twitter's API.
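
The wiring for those PUTs is small. Here is an abridged sketch of the
`EventfulTask` base class in `summarize.py` below (the full version also
reports success and failure, and supports optional basic auth; `config`
is the parsed `dnflow.cfg`):

```python
import luigi
import requests

class EventfulTask(luigi.Task):

    @staticmethod
    def update_job(date_path, job_id=None, status=None):
        # PUT the latest status to the flask UI, which records it in SQLite
        data = {'date_path': date_path}
        if job_id:
            data['job_id'] = job_id
        if status:
            data['status'] = status
        requests.put('http://%s/job/' % config['HOSTNAME'], data=data)

    @luigi.Task.event_handler(luigi.Event.START)
    def start(task):
        # fires for every task in the flow, e.g. "STARTED: CountHashtags"
        EventfulTask.update_job(date_path=task.search['date_path'],
                                status='STARTED: %s' % task.task_family)
```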
142 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocNow/dnflow/3e43cb4a0062af2cdaf3d4948725aaddfcbd84f8/__init__.py
--------------------------------------------------------------------------------
/dnflow.cfg.template:
--------------------------------------------------------------------------------
1 | HOSTNAME = 'localhost'
2 | DEBUG = True
3 | DATABASE = 'db.sqlite3'
4 | SECRET_KEY = 'a super secret key'
5 | STATIC_URL_PATH = '/static'
6 | DATA_DIR = 'data'
7 | REDIS_HOST = 'localhost'
8 | REDIS_PORT = 6379
9 | REDIS_DB = 4
10 | TWITTER_CONSUMER_KEY = 'YOUR_TWITTER_CONSUMER_KEY_HERE'
11 | TWITTER_CONSUMER_SECRET = 'YOUR_TWITTER_CONSUMER_SECRET_HERE'
12 | MAX_TIMEOUT = 24 * 60 * 60
13 | 
14 | # set the following two variables to non-empty values to add
15 | # basic auth for PUT updates on /job
16 | HTTP_BASICAUTH_USER = ''
17 | HTTP_BASICAUTH_PASS = ''
18 | 
--------------------------------------------------------------------------------
/json2csv.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import sys
3 | import json
4 | import fileinput
5 | 
6 | def main():
7 |     sheet = csv.writer(sys.stdout)  # csv.writer takes no encoding arg in Python 3; it writes str
8 |     sheet.writerow(get_headings())
9 |     for line in fileinput.input():
10 |         tweet = json.loads(line)
11 |         sheet.writerow(get_row(tweet))
12 | 
13 | def get_headings():
14 |     return [
15 |         'coordinates',
16 |         'created_at',
17 |         'hashtags',
18 |         'media',
19 |         'urls',
20 |         'favorite_count',
21 |         'id',
22 |         'in_reply_to_screen_name',
23 |         'in_reply_to_status_id',
24 |         'in_reply_to_user_id',
25 |         'lang',
26 |         'place',
27 |         'possibly_sensitive',
28 |         'retweet_count',
29 |         'retweet_id',
30 |         'retweet_screen_name',
31 |         'source',
32 |         'text',
33 |         'tweet_url',
34 |         'user_created_at',
35 |         'user_screen_name',
36 |         'user_default_profile_image',
37 |         'user_description',
38 |         'user_favourites_count',
39 |         'user_followers_count',
40 |         'user_friends_count',
41 |         'user_listed_count',
42 |         'user_location',
43 |         'user_name',
44 |         'user_screen_name',
45 |         'user_statuses_count',
46 |         'user_time_zone',
47 |         'user_urls',
48 |         'user_verified',
49 |     ]
50 | 
51 | def get_row(t):
52 |     get = t.get
53 |     user = t.get('user').get
54 |     row = [
55 |         coordinates(t),
56 |         get('created_at'),
57 |         hashtags(t),
58 |         media(t),
59 |         urls(t),
60 |         get('favorite_count'),
61 |         get('id_str'),
62 |         get('in_reply_to_screen_name'),
63 |         get('in_reply_to_status_id'),
64 |         get('in_reply_to_user_id'),
65 |         get('lang'),
66 |         place(t),
67 |         get('possibly_sensitive'),
68 |         get('retweet_count'),
69 |         retweet_id(t),
70 |         retweet_screen_name(t),
71 |         get('source'),
72 |         get('text'),
73 |         tweet_url(t),
74 |         user('created_at'),
75 |         user('screen_name'),
76 |         user('default_profile_image'),
77 |         user('description'),
78 |         user('favourites_count'),
79 |         user('followers_count'),
80 |         user('friends_count'),
81 |         user('listed_count'),
82 |         user('location'),
83 |         user('name'),
84 |         user('screen_name'),
85 |         user('statuses_count'),
86 |         user('time_zone'),
87 |         user_urls(t),
88 |         user('verified'),
89 |     ]
90 |     return row
91 | 
92 | def coordinates(t):
93 |     if 'coordinates' in t and t['coordinates']:
94 |         return '%f %f' % tuple(t['coordinates']['coordinates'])
95 |     return None
96 | 
97 | def hashtags(t):
98 |     return ' '.join([h['text'] for h in t['entities']['hashtags']])
99 | 
100 | def media(t):
101 |     if 'media' in t['entities']:
102 |         return ' '.join([h['expanded_url'] for h in t['entities']['media']])
103 |     else:
104 |         return None
105 | 
106 | def urls(t):
107 |     return ' '.join([h['expanded_url'] for h in t['entities']['urls']])
108 | 
109 | def place(t):
110 |     if t['place']:
111 |         return t['place']['full_name']
112 | 
113 | def retweet_id(t):
114 |     if 'retweeted_status' in t and t['retweeted_status']:
115 |         return t['retweeted_status']['id_str']
116 | 
117 | def retweet_screen_name(t):
118 |     if 'retweeted_status' in t and t['retweeted_status']:
119 |         return t['retweeted_status']['user']['screen_name']
120 | 
121 | def tweet_url(t):
122 |     return "https://twitter.com/%s/status/%s" % (t['user']['screen_name'], t['id_str'])
123 | 
124 | def user_urls(t):
125 |     u = t.get('user')
126 |     if not u:
127 |         return None
128 |     urls = []
129 |     if 'entities' in u and 'url' in u['entities'] and 'urls' in u['entities']['url']:
130 |         for url in u['entities']['url']['urls']:
131 |             if url['expanded_url']:
132 |                 urls.append(url['expanded_url'])
133 |     return ' '.join(urls)
134 | 
135 | 
136 | if __name__ == "__main__":
137 |     main()
138 | 
--------------------------------------------------------------------------------
/luigi.cfg:
--------------------------------------------------------------------------------
1 | [core]
2 | parallel-scheduling = True
3 | 
4 | [scheduler]
5 | record_task_history = True
6 | state_path = luigi-state.pickle
7 | 
8 | [task_history]
9 | db_connection = sqlite:///history.sqlite.db
10 | 
--------------------------------------------------------------------------------
/queue_tasks.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | 
4 | def run_flow(text, job_id, count, token, secret):
5 |     subprocess.run([
6 |         'python',
7 |         '-m',
8 |         'luigi',
9 |         '--module',
10 |         'summarize',
11 |         'RunFlow',
12 |         '--term',
13 |         text,
14 |         '--jobid',
15 |         str(job_id),
16 |         '--count',
17 |         str(count),
18 |         '--token',
19 |         str(token),
20 |         '--secret',
21 |         str(secret)
22 |     ])
23 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask
2 | flask-oauthlib
3 | imagehash
4 | Jinja2
5 | luigi
6 | networkx
7 | pandas
8 | redis
9 | rq
10 | sqlalchemy
11 | twarc
12 | tweepy
13 | 
--------------------------------------------------------------------------------
/schema.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS searches;
2 | CREATE TABLE searches (
3 |     id INTEGER PRIMARY KEY AUTOINCREMENT,
4 |     text TEXT NOT NULL,
5 |     date_path TEXT NOT NULL,
6 |     user TEXT NOT NULL,
7 |     status TEXT,
8 |     created DATETIME DEFAULT CURRENT_TIMESTAMP,
9 |     published DATETIME
10 | );
11 | 
--------------------------------------------------------------------------------
/static/css/style.css:
--------------------------------------------------------------------------------
1 | body {
2 |   margin: 1em;
3 | }
4 | 
5 | .searchList {
6 | }
7 | 
8 | table {
9 |   text-align: left;
10 |   width: 1200px;
11 | }
12 | 
13 | th {
14 |   padding: 5px;
15 |   width: 15%;
16 | }
17 | 
18 | th.status {
19 |   width: 20%;
20 | }
21 | 
22 | th.actions {
23 |   width: 20%;
24 | }
25 | 
26 | td {
27 |   padding-right: 10px;
28 |   padding-bottom: 5px;
29 | }
30 | 
31 | .disclaimer {
32 |   max-width: 600px;
33 |   background-color: #eee;
34 |   padding: 5px;
35 |   border: thin solid #ccc;
36 | }
37 | 
38 | .searchForm {
39 |   padding: 5px;
40 | }
41 | 
42 | #includePublished
{ 43 | font-size: 12pt; 44 | font-style: italic; 45 | padding: 3px; 46 | margin-bottom: 5px; 47 | } 48 | 49 | button { 50 | width: 90px; 51 | margin-right: 5px; 52 | outline: none; 53 | } 54 | 55 | button.delete { 56 | background-color: pink; 57 | } 58 | 59 | button.publish { 60 | background-color: lightgreen; 61 | } 62 | 63 | button.unpublish { 64 | background-color: yellow; 65 | } 66 | -------------------------------------------------------------------------------- /static/js/index.js: -------------------------------------------------------------------------------- 1 | var Search = React.createClass({ 2 | put: function(change) { 3 | $.ajax({ 4 | type: 'PUT', 5 | url: '/api/search/' + this.props.id, 6 | data: JSON.stringify(change), 7 | dataType: 'json', 8 | contentType: 'application/json' 9 | }); 10 | }, 11 | unpublish: function() { 12 | this.put({id: this.props.id, published: false}); 13 | }, 14 | publish: function() { 15 | this.put({id: this.props.id, published: true}); 16 | }, 17 | remove: function() { 18 | $.ajax({ 19 | type: 'DELETE', 20 | url: '/api/search/' + this.props.id, 21 | data: JSON.stringify({}), 22 | dataType: 'json', 23 | contentType: 'application/json' 24 | }); 25 | }, 26 | render: function() { 27 | var link = {this.props.text}; 28 | if (! (this.props.status == "FINISHED: RunFlow")) { 29 | link = this.props.text; 30 | } 31 | 32 | if (this.props.canModify) { 33 | if (this.props.published) { 34 | var publishButton = 35 | ; 38 | } else { 39 | var publishButton = 40 | ; 43 | } 44 | var buttons = 45 | 46 | { publishButton } 47 | 50 | 51 | } 52 | 53 | return ( 54 | 55 | { formatDateTime(this.props.created) } 56 | {link} 57 | {this.props.user} 58 | { formatDateTime(this.props.published) } 59 | {this.props.status} 60 | { buttons } 61 | 62 | ); 63 | } 64 | }); 65 | 66 | var SearchList = React.createClass({ 67 | render: function() { 68 | var user = this.props.user; 69 | var includePublished = this.props.includePublished; 70 | var searches = this.props.searches.filter(function(search) { 71 | return (search.user == user) || (search.published && includePublished); 72 | }); 73 | var searchNodes = searches.map(function(search) { 74 | return ( 75 | 84 | 85 | 86 | ); 87 | }); 88 | return ( 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | {searchNodes} 102 | 103 |
<th>Created</th> <th>Search</th> <th>Creator</th> <th>Published</th> <th className="status">Job Status</th> <th className="actions">Actions</th>
104 | ); 105 | } 106 | }); 107 | 108 | var SearchForm = React.createClass({ 109 | getInitialState: function() { 110 | return {text: '', count: '1000'}; 111 | }, 112 | handleTextChange: function(e) { 113 | this.setState({text: e.target.value}); 114 | }, 115 | handleCountChange: function(e) { 116 | this.setState({count: e.target.value}); 117 | }, 118 | handleSubmit: function(e) { 119 | e.preventDefault(); 120 | var text = this.state.text.trim(); 121 | var count = this.state.count.trim(); 122 | if (!text || !count) { 123 | return; 124 | } 125 | this.props.onSearchSubmit({text: text, count: count}); 126 | this.setState({text: '', count: '1000'}); 127 | }, 128 | render: function() { 129 | return ( 130 |
131 | search: 134 |   135 | num tweets: 140 |   141 | 142 |
143 | ); 144 | } 145 | }); 146 | 147 | var SearchBox = React.createClass({ 148 | loadSearchesFromServer: function() { 149 | $.ajax({ 150 | url: this.props.url, 151 | dataType: 'json', 152 | cache: false, 153 | success: function(data) { 154 | this.setState({searches: data.searches, user: data.user}); 155 | }.bind(this), 156 | error: function(xhr, status, err) { 157 | console.error(this.props.url, status, err.toString()); 158 | }.bind(this) 159 | }); 160 | }, 161 | handleIncludePublishedChange: function(e) { 162 | this.setState({includePublished: e.target.checked}); 163 | }, 164 | handleSearchSubmit: function(search) { 165 | $.ajax({ 166 | // url: this.props.url, 167 | url: '/searches/', 168 | dataType: 'json', 169 | type: 'POST', 170 | data: search, 171 | success: function(data) { 172 | this.setState({searches: data.searches, user: data.user}); 173 | }.bind(this), 174 | error: function(xhr, status, err) { 175 | if (xhr.responseJSON) { 176 | this.setState({error: xhr.responseJSON.error}); 177 | } 178 | }.bind(this) 179 | }); 180 | }, 181 | getInitialState: function() { 182 | return {searches: [], user: null, includePublished: true}; 183 | }, 184 | componentDidMount: function() { 185 | this.loadSearchesFromServer(); 186 | setInterval(this.loadSearchesFromServer, this.props.pollInterval); 187 | }, 188 | render: function() { 189 | if (this.state.user) { 190 | var includePublished = 191 |
192 | include datasets published by others? 193 |   194 | 197 |
198 | } else { 199 | var includePublished = null; 200 | } 201 | return ( 202 |
203 |

{ this.state.error }

204 | 205 |
206 | { includePublished } 207 | 211 |
212 |     );
213 |   }
214 | });
215 | 
216 | function formatDateTime(t) {
217 |   if (t) {
218 |     return $.format.date(new Date(t), 'yyyy-MM-dd HH:mm:ss');
219 |   } else {
220 |     return null;
221 |   }
222 | }
223 | 
224 | ReactDOM.render(
225 |   ,
226 |   document.getElementById('searches')
227 | );
228 | 
--------------------------------------------------------------------------------
/summarize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | summarize.py - initial attempt at automating dn flows using luigi
4 | """
5 | 
6 | import bisect
7 | from collections import Counter
8 | import csv
9 | import hashlib
10 | import json
11 | import logging
12 | import math
13 | import os
14 | import time
15 | import zipfile
16 | import tempfile
17 | from urllib.parse import urlparse
18 | import numpy as np
19 | 
20 | import imagehash
21 | from jinja2 import Environment, PackageLoader
22 | import luigi
23 | from luigi.contrib import redis_store
24 | import networkx as nx
25 | from PIL import Image
26 | from flask.config import Config
27 | import requests
28 | import twarc
29 | 
30 | import json2csv
31 | 
32 | 
33 | config = Config(os.path.dirname(__file__))
34 | config.from_pyfile('dnflow.cfg')
35 | 
36 | logging.getLogger().setLevel(logging.WARN)
37 | # (getLogger('') returns the same root logger, so one call suffices)
38 | logging.getLogger('luigi-interface').setLevel(logging.WARN)
39 | 
40 | 
41 | def time_hash(digits=6):
42 |     """Generate an arbitrary hash based on the current time for filenames."""
43 |     hash = hashlib.sha1()
44 |     hash.update(str(time.time()).encode())
45 |     t = time.localtime()
46 |     dt = '%s%02d%02d%02d%02d' % (t.tm_year, t.tm_mon, t.tm_mday,
47 |                                  t.tm_hour, t.tm_min)
48 |     return '%s-%s' % (dt, hash.hexdigest()[:digits])
49 | 
50 | 
51 | def url_filename(url, include_extension=True):
52 |     """Given a full URL, return just the filename after the last slash."""
53 |     parsed_url = urlparse(url)
54 |     fname = parsed_url.path.split('/')[-1]
55 |     if not include_extension:
56 |         fname = fname.split('.')[0]
57 |     return fname
58 | 
59 | 
60 | def generate_md5(fname, block_size=2**16):
61 |     m = hashlib.md5()
62 |     with open(fname, 'rb') as f:
63 |         while True:
64 |             buf = f.read(block_size)
65 |             if not buf:
66 |                 break
67 |             m.update(buf)
68 |     return m.hexdigest()
69 | 
70 | 
71 | def get_block_size(n, d=1, default=100):
72 |     """
73 |     returns a block size to use when sending ui updates for a job.
74 |     uses the number of items (n) and a dampening value (d), which is
75 |     useful for tasks that can take longer and require more updates
76 |     """
77 |     # this shouldn't happen but in case it does
78 |     if n <= 0:
79 |         return default
80 |     r = math.ceil(math.log10(n)) * d
81 |     if r == 0:
82 |         return default
83 |     block_size = int(n / r)
84 |     if block_size > 0:
85 |         return block_size
86 |     return default
87 | 
88 | 
89 | class EventfulTask(luigi.Task):
90 | 
91 |     @staticmethod
92 |     def update_job(date_path, job_id=None, status=None):
93 |         data = {}
94 |         if job_id:
95 |             data['job_id'] = job_id
96 |         if date_path:
97 |             data['date_path'] = date_path
98 |         if status:
99 |             data['status'] = status
100 |         url = 'http://%s/job/' % config['HOSTNAME']
101 | 
102 |         # TODO: basic auth is only used during hackish masking of the prototype
103 |         # on the public internet. Eventually we'll want to come up with some
104 |         # secure way of doing this PUT update to /job
105 |         # https://github.com/DocNow/dnflow/issues/24
106 | 
107 |         if config.get('HTTP_BASICAUTH_USER') and config.get('HTTP_BASICAUTH_PASS'):  # truthy check: the template ships these as ''
108 |             auth = requests.auth.HTTPBasicAuth(
109 |                 config['HTTP_BASICAUTH_USER'],
110 |                 config['HTTP_BASICAUTH_PASS']
111 |             )
112 |             r = requests.put(url, data=data, auth=auth)
113 |         else:
114 |             r = requests.put(url, data=data)
115 |         if r.status_code in [200, 302]:
116 |             return True
117 |         return False
118 | 
119 |     @luigi.Task.event_handler(luigi.Event.START)
120 |     def start(task):
121 |         print('### START ###: %s' % task)
122 |         EventfulTask.update_job(date_path=task.search['date_path'],
123 |                                 status='STARTED: %s' % task.task_family)
124 | 
125 |     @luigi.Task.event_handler(luigi.Event.SUCCESS)
126 |     def success(task):
127 |         print('### SUCCESS ###: %s' % task)
128 |         EventfulTask.update_job(date_path=task.search['date_path'],
129 |                                 status='FINISHED: %s' % task.task_family)
130 | 
131 |     @luigi.Task.event_handler(luigi.Event.PROCESSING_TIME)
132 |     def processing_time(task, processing_time):
133 |         print('### PROCESSING TIME ###: %s, %s' % (task, processing_time))
134 | 
135 |     @luigi.Task.event_handler(luigi.Event.FAILURE)
136 |     def failure(task, exc):
137 |         print('### FAILURE ###: %s, %s' % (task, exc))
138 |         EventfulTask.update_job(date_path=task.search['date_path'],
139 |                                 status='FAILED: %s' % task.task_family)
140 | 
141 | 
142 | class FetchTweets(EventfulTask):
143 |     search = luigi.DictParameter()
144 | 
145 |     def output(self):
146 |         fname = 'data/%s/tweets.json' % self.search['date_path']
147 |         return luigi.LocalTarget(fname)
148 | 
149 |     def run(self):
150 |         term = self.search['term']
151 |         lang = self.search['lang']  # currently unused; not passed to the search call
152 |         count = self.search['count']
153 |         t = twarc.Twarc(
154 |             consumer_key=config['TWITTER_CONSUMER_KEY'],
155 |             consumer_secret=config['TWITTER_CONSUMER_SECRET'],
156 |             access_token=self.search['token'],
157 |             access_token_secret=self.search['secret']
158 |         )
159 |         with self.output().open('w') as fh:
160 |             i = 0
161 |             for tweet in t.search(term):
162 |                 i += 1
163 |                 if i > count:
164 |                     break
165 |                 if i % 500 == 0:
166 |                     self.update_job(
167 |                         date_path=self.search['date_path'],
168 |                         status="STARTED: %s - %s/%s" %
169 |                                (self.task_family, i, count)
170 |                     )
171 |                 fh.write(json.dumps(tweet) + '\n')
172 | 
173 | 
174 | class CountHashtags(EventfulTask):
175 |     search = luigi.DictParameter()
176 | 
177 |     def requires(self):
178 |         return FetchTweets(search=self.search)
179 | 
180 |     def output(self):
181 |         fname = self.input().fn.replace('tweets.json', 'count-hashtags.csv')
182 |         return luigi.LocalTarget(fname)
183 | 
184 |     def run(self):
185 |         c = Counter()
186 |         for tweet_str in self.input().open('r'):
187 |             tweet = json.loads(tweet_str)
188 |             c.update([ht['text'].lower()
189 |                       for ht in tweet['entities']['hashtags']])
190 |         with self.output().open('w') as fp_counts:
191 |             writer = csv.DictWriter(fp_counts, delimiter=',',
192 |                                     quoting=csv.QUOTE_MINIMAL,
193 |                                     fieldnames=['hashtag', 'count'])
194 |             writer.writeheader()
195 |             for ht, count in c.items():
196 |                 writer.writerow({'hashtag': ht, 'count': count})
197 | 
198 | 
199 | class EdgelistHashtags(EventfulTask):
200 |     search = luigi.DictParameter()
201 | 
202 |     def requires(self):
203 |         return FetchTweets(search=self.search)
204 | 
205 |     def output(self):
206 |         fname = self.input().fn.replace('tweets.json', 'edgelist-hashtags.csv')
207 |         return luigi.LocalTarget(fname)
208 | 
209 |     def run(self):
210 |         """Each edge is a tuple
containing (screen_name, mentioned_hashtag)""" 211 | with self.output().open('w') as fp_csv: 212 | writer = csv.DictWriter(fp_csv, delimiter=',', 213 | quoting=csv.QUOTE_MINIMAL, 214 | fieldnames=['user', 'hashtag']) 215 | writer.writeheader() 216 | for tweet_str in self.input().open('r'): 217 | tweet = json.loads(tweet_str) 218 | for ht in tweet['entities']['hashtags']: 219 | writer.writerow({'user': tweet['user']['screen_name'], 220 | 'hashtag': ht['text'].lower()}) 221 | 222 | 223 | class CountUrls(EventfulTask): 224 | search = luigi.DictParameter() 225 | 226 | def requires(self): 227 | return FetchTweets(search=self.search) 228 | 229 | def output(self): 230 | fname = self.input().fn.replace('tweets.json', 'count-urls.csv') 231 | return luigi.LocalTarget(fname) 232 | 233 | def run(self): 234 | c = Counter() 235 | for tweet_str in self.input().open('r'): 236 | tweet = json.loads(tweet_str) 237 | c.update([url['expanded_url'] for url in tweet['entities']['urls']]) 238 | with self.output().open('w') as fp_counts: 239 | writer = csv.DictWriter(fp_counts, delimiter=',', 240 | quoting=csv.QUOTE_MINIMAL, 241 | fieldnames=['url', 'count']) 242 | writer.writeheader() 243 | for url, count in c.items(): 244 | writer.writerow({'url': url, 'count': count}) 245 | 246 | 247 | class CountDomains(EventfulTask): 248 | search = luigi.DictParameter() 249 | 250 | def requires(self): 251 | return FetchTweets(search=self.search) 252 | 253 | def output(self): 254 | fname = self.input().fn.replace('tweets.json', 'count-domains.csv') 255 | return luigi.LocalTarget(fname) 256 | 257 | def run(self): 258 | c = Counter() 259 | for tweet_str in self.input().open('r'): 260 | tweet = json.loads(tweet_str) 261 | c.update([urlparse(url['expanded_url']).netloc.lower() 262 | for url in tweet['entities']['urls']]) 263 | with self.output().open('w') as fp_counts: 264 | writer = csv.DictWriter(fp_counts, delimiter=',', 265 | quoting=csv.QUOTE_MINIMAL, 266 | fieldnames=['url', 'count']) 267 | writer.writeheader() 268 | for url, count in c.items(): 269 | writer.writerow({'url': url, 'count': count}) 270 | 271 | 272 | class CountMentions(EventfulTask): 273 | search = luigi.DictParameter() 274 | 275 | def requires(self): 276 | return FetchTweets(search=self.search) 277 | 278 | def output(self): 279 | fname = self.input().fn.replace('tweets.json', 'count-mentions.csv') 280 | return luigi.LocalTarget(fname) 281 | 282 | def run(self): 283 | c = Counter() 284 | for tweet_str in self.input().open('r'): 285 | tweet = json.loads(tweet_str) 286 | c.update([m['screen_name'].lower() 287 | for m in tweet['entities']['user_mentions']]) 288 | with self.output().open('w') as fp_counts: 289 | writer = csv.DictWriter(fp_counts, delimiter=',', 290 | quoting=csv.QUOTE_MINIMAL, 291 | fieldnames=['screen_name', 'count']) 292 | writer.writeheader() 293 | for screen_name, count in c.items(): 294 | writer.writerow({'screen_name': screen_name, 295 | 'count': count}) 296 | 297 | 298 | class EdgelistMentions(EventfulTask): 299 | search = luigi.DictParameter() 300 | 301 | def requires(self): 302 | return FetchTweets(search=self.search) 303 | 304 | def output(self): 305 | fname = self.input().fn.replace('tweets.json', 'edgelist-mentions.csv') 306 | return luigi.LocalTarget(fname) 307 | 308 | def run(self): 309 | """Each edge is a tuple containing (screen_name, 310 | mentioned_screen_name)""" 311 | with self.output().open('w') as fp_csv: 312 | writer = csv.DictWriter(fp_csv, delimiter=',', 313 | fieldnames=('from_user', 'to_user')) 314 | writer.writeheader() 
315 | for tweet_str in self.input().open('r'): 316 | tweet = json.loads(tweet_str) 317 | for mention in tweet['entities']['user_mentions']: 318 | writer.writerow({'from_user': tweet['user']['screen_name'], 319 | 'to_user': mention['screen_name']}) 320 | 321 | 322 | class CountMedia(EventfulTask): 323 | search = luigi.DictParameter() 324 | 325 | def requires(self): 326 | return FetchTweets(search=self.search) 327 | 328 | def output(self): 329 | fname = self.input().fn.replace('tweets.json', 'count-media.csv') 330 | return luigi.LocalTarget(fname) 331 | 332 | def run(self): 333 | c = Counter() 334 | for tweet_str in self.input().open('r'): 335 | tweet = json.loads(tweet_str) 336 | c.update([m['media_url'] 337 | for m in tweet['entities'].get('media', []) 338 | if m['type'] == 'photo']) 339 | with self.output().open('w') as fp_counts: 340 | writer = csv.DictWriter(fp_counts, delimiter=',', 341 | quoting=csv.QUOTE_MINIMAL, 342 | fieldnames=['url', 'file', 'count']) 343 | writer.writeheader() 344 | for url, count in c.items(): 345 | writer.writerow({'url': url, 'file': url_filename(url), 346 | 'count': count}) 347 | 348 | 349 | class FetchMedia(EventfulTask): 350 | search = luigi.DictParameter() 351 | 352 | def requires(self): 353 | return CountMedia(search=self.search) 354 | 355 | def output(self): 356 | # ensure only one successful fetch for each url 357 | # unless FetchTweets is called again with a new hash 358 | fname = self.input().fn.replace('count-media.csv', 359 | 'media-checksums-md5.txt') 360 | return luigi.LocalTarget(fname) 361 | 362 | def run(self): 363 | # get a count of files to fetch for updates 364 | with self.input().open('r') as countfile: 365 | # drop one for headers 366 | count = sum(1 for line in countfile) - 1 367 | # determine update block size 368 | update_block_size = get_block_size(count, 5) 369 | 370 | dirname = 'data/%s/media' % self.search['date_path'] 371 | os.makedirs(dirname, exist_ok=True) 372 | # lots of hits to same server, so pool connections 373 | session = requests.Session() 374 | hashes = [] 375 | with self.input().open('r') as csvfile: 376 | reader = csv.DictReader(csvfile, delimiter=',') 377 | for row in reader: 378 | fname = url_filename(row['url']) 379 | if len(fname) == 0: 380 | continue 381 | r = session.get(row['url']) 382 | if r.ok: 383 | full_name = '%s/%s' % (dirname, fname) 384 | with open(full_name, 'wb') as media_file: 385 | media_file.write(r.content) 386 | md5 = generate_md5(full_name) 387 | hashes.append((md5, full_name)) 388 | if len(hashes) % update_block_size == 0: 389 | self.update_job( 390 | date_path=self.search['date_path'], 391 | status="STARTED: %s - %s/%s" % 392 | (self.task_family, len(hashes), count) 393 | ) 394 | with self.output().open('w') as f: 395 | for md5, h in hashes: 396 | f.write('%s %s\n' % (md5, h)) 397 | 398 | 399 | class MatchMedia(EventfulTask): 400 | search = luigi.DictParameter() 401 | 402 | def requires(self): 403 | return FetchMedia(search=self.search) 404 | 405 | def output(self): 406 | fname = self.input().fn.replace('media-checksums-md5.txt', 407 | 'media-graph.json') 408 | return luigi.LocalTarget(fname) 409 | 410 | def run(self): 411 | date_path = self.search['date_path'] 412 | files = sorted(os.listdir('data/%s/media' % date_path)) 413 | hashes = {} 414 | matches = [] 415 | g = nx.Graph() 416 | update_block_size = get_block_size(len(files), 5) 417 | for i in range(len(files)): 418 | f = files[i] 419 | fn = 'data/%s/media/%s' % (date_path, f) 420 | ahash = imagehash.average_hash(Image.open(fn)) 421 | 
dhash = imagehash.dhash(Image.open(fn))
422 |             phash = imagehash.phash(Image.open(fn))
423 |             hashes[f] = {'ahash': ahash, 'dhash': dhash, 'phash': phash}
424 |             for j in range(0, i):
425 |                 f2name = files[j]
426 |                 f2 = hashes[f2name]
427 |                 sumhash = sum([ahash - f2['ahash'],
428 |                                dhash - f2['dhash'],
429 |                                phash - f2['phash']])
430 |                 # FIXME: 40 is a hard-coded arbitrary (eyeballed) threshold
431 |                 if sumhash <= 40:
432 |                     matches.append([f, files[j],
433 |                                     ahash - f2['ahash'],
434 |                                     dhash - f2['dhash'],
435 |                                     phash - f2['phash'],
436 |                                     sumhash])
437 |                     g.add_edge(f, f2name)
438 |             if i % update_block_size == 0:
439 |                 self.update_job(
440 |                     date_path=self.search['date_path'],
441 |                     status="STARTED: %s - %s/%s" %
442 |                            (self.task_family, i, len(files))
443 |                 )
444 |         with self.output().open('w') as fp_graph:
445 |             components = list(nx.connected_components(g))
446 |             # Note: sets are not JSON serializable
447 |             d = []
448 |             for s in components:
449 |                 d.append(list(s))
450 |             json.dump(d, fp_graph, indent=2)
451 | 
452 | 
453 | class CountFollowers(EventfulTask):
454 |     search = luigi.DictParameter()
455 | 
456 |     def requires(self):
457 |         return FetchTweets(search=self.search)
458 | 
459 |     def output(self):
460 |         fname = self.input().fn.replace('tweets.json', 'count-followers.csv')
461 |         return luigi.LocalTarget(fname)
462 | 
463 |     def run(self):
464 |         users = {}
465 |         for tweet_str in self.input().open('r'):
466 |             tweet = json.loads(tweet_str)
467 |             user = tweet['user']['screen_name']
468 |             followers = tweet['user']['followers_count']
469 |             users[user] = followers
470 | 
471 |         with self.output().open('w') as fp_counts:
472 |             writer = csv.DictWriter(fp_counts, delimiter=',',
473 |                                     quoting=csv.QUOTE_MINIMAL,
474 |                                     fieldnames=['user', 'count'])
475 |             writer.writeheader()
476 |             for user, count in users.items():
477 |                 writer.writerow({'user': user, 'count': count})
478 | 
479 | 
480 | class FollowRatio(EventfulTask):
481 |     search = luigi.DictParameter()
482 | 
483 |     def requires(self):
484 |         return FetchTweets(search=self.search)
485 | 
486 |     def output(self):
487 |         fname = self.input().fn.replace('tweets.json', 'follow-ratio.csv')
488 |         return luigi.LocalTarget(fname)
489 | 
490 |     def run(self):
491 |         users = {}
492 |         for tweet_str in self.input().open('r'):
493 |             tweet = json.loads(tweet_str)
494 |             user = tweet['user']['screen_name']
495 |             followers = int(tweet['user']['followers_count'])
496 |             following = int(tweet['user']['friends_count'])
497 |             if following > 0:
498 |                 r = followers / float(following)
499 |                 users[user] = r
500 | 
501 |         with self.output().open('w') as fp_counts:
502 |             writer = csv.DictWriter(fp_counts, delimiter=',',
503 |                                     quoting=csv.QUOTE_MINIMAL,
504 |                                     fieldnames=['user', 'count'])
505 |             writer.writeheader()
506 |             for user, r in users.items():
507 |                 writer.writerow({'user': user, 'count': r})
508 | 
509 | 
510 | class SummaryHTML(EventfulTask):
511 |     search = luigi.DictParameter()
512 | 
513 |     def requires(self):
514 |         return FetchTweets(search=self.search)
515 | 
516 |     def output(self):
517 |         fname = 'data/%s/summary.html' % self.search['date_path']
518 |         return luigi.LocalTarget(fname)
519 | 
520 |     def run(self):
521 |         env = Environment(loader=PackageLoader('web'))  # (task is not wired into RunFlow below)
522 |         t = env.get_template('summary.html')
523 |         title = 'Summary for search "%s"' % self.search['term']
524 |         t.stream(title=title).dump(self.output().fn)
525 | 
526 | 
527 | class SummaryJSON(EventfulTask):
528 |     search = luigi.DictParameter()
529 | 
530 |     def requires(self):
531 |         return FetchTweets(search=self.search)
532 | 
533 |     def output(self):
534 |         fname = self.input().fn.replace('tweets.json', 'summary.json')
535 |         return luigi.LocalTarget(fname)
536 | 
537 |     def run(self):
538 |         c = Counter()
539 |         num_tweets = 0
540 |         for tweet_str in self.input().open('r'):
541 |             num_tweets += 1
542 |             tweet = json.loads(tweet_str)
543 |             c.update([m['media_url']
544 |                       for m in tweet['entities'].get('media', [])
545 |                       if m['type'] == 'photo'])
546 |         summary = {
547 |             'id': self.search['job_id'],
548 |             'path': self.search['date_path'],
549 |             'date': time.strftime('%Y-%m-%dT%H:%M:%SZ',
550 |                                   time.gmtime()),
551 |             'num_tweets': num_tweets,
552 |             'term': self.search['term']
553 |         }
554 |         with self.output().open('w') as fp_summary:
555 |             json.dump(summary, fp_summary)
556 | 
557 | 
558 | class PopulateRedis(EventfulTask):
559 |     search = luigi.DictParameter()
560 | 
561 |     def _get_target(self):
562 |         return redis_store.RedisTarget(host=config['REDIS_HOST'],
563 |                                        port=config['REDIS_PORT'],
564 |                                        db=config['REDIS_DB'],
565 |                                        update_id=self.search['date_path'])
566 | 
567 |     def requires(self):
568 |         return MatchMedia(search=self.search)
569 | 
570 |     def output(self):
571 |         return self._get_target()
572 | 
573 |     def run(self):
574 |         date_path = self.search['date_path']
575 |         r = redis_store.redis.StrictRedis(host=config['REDIS_HOST'], port=config['REDIS_PORT'])  # default db, which is what ui.py reads
576 |         # Assume tweets.json exists, earlier dependencies require it
577 |         tweet_fname = 'data/%s/tweets.json' % date_path
578 |         for tweet_str in open(tweet_fname, 'r'):
579 |             tweet = json.loads(tweet_str)
580 |             pipe = r.pipeline()
581 |             # baseline data
582 |             pipe.sadd('tweets:%s' % date_path, tweet['id'])
583 |             for hashtag in [ht['text'].lower() for ht in
584 |                             tweet['entities']['hashtags']]:
585 |                 pipe.zincrby('count:hashtags:%s' % date_path,
586 |                              hashtag, 1)
587 |                 pipe.sadd('hashtag:%s:%s' % (hashtag, date_path),
588 |                           tweet['id'])
589 |             for mention in [m['screen_name'].lower() for m in
590 |                             tweet['entities']['user_mentions']]:
591 |                 pipe.zincrby('count:mentions:%s' % date_path,
592 |                              mention, 1)
593 |                 pipe.sadd('mention:%s:%s' % (mention, date_path),
594 |                           tweet['id'])
595 |             for photo_url in [m['media_url']
596 |                               for m in tweet['entities'].get('media', [])
597 |                               if m['type'] == 'photo']:
598 |                 photo_id = url_filename(photo_url, include_extension=False)
599 |                 pipe.zincrby('count:photos:%s' % date_path, photo_id, 1)
600 |                 pipe.sadd('photo:%s:%s' % (photo_id, date_path),
601 |                           tweet['id'])
602 |             pipe.execute()
603 | 
604 |         photo_matches_fname = 'data/%s/media-graph.json' % date_path
605 |         photo_matches = json.load(open(photo_matches_fname))
606 |         if photo_matches:
607 |             pipe = r.pipeline()
608 |             # each set of related images
609 |             for photo_match in photo_matches:
610 |                 photo_ids = [pm.split('.')[0] for pm in photo_match]
611 |                 # each id in the set needs a lookup key
612 |                 for i in range(len(photo_match)):
613 |                     photo_id = photo_ids[i]
614 |                     pipe.sadd('photomatch:%s:%s' % (photo_id, date_path),
615 |                               *photo_ids)
616 |             pipe.execute()
617 |         r.sadd('cacheproc', date_path)
618 |         target = self._get_target()
619 |         target.touch()
620 | 
621 |     def complete(self):
622 |         target = self._get_target()
623 |         return target.exists()
624 | 
625 | 
626 | class ExtractTweetIds(EventfulTask):
627 |     search = luigi.DictParameter()
628 | 
629 |     def requires(self):
630 |         return FetchTweets(search=self.search)
631 | 
632 |     def output(self):
633 |         fname = self.input().fn.replace('tweets.json', 'tweet-ids.txt')
634 |         return luigi.LocalTarget(fname)
635 | 
636 |     def run(self):
637 |         with self.output().open('w') as fh:
638 |             for tweet_str in self.input().open('r'):
639 |                 tweet = json.loads(tweet_str)
640 |                 fh.write(tweet['id_str'] + "\n")
641 | 
642 | 
643 | class BagIt(EventfulTask):
644 |     search = luigi.DictParameter()
645 | 
646 |     def requires(self):
647 |         return PopulateRedis(search=self.search)
648 | 
649 |     def output(self):
650 |         date_path = self.search['date_path']
651 |         zip_fn = "data/%s/%s.zip" % (date_path, date_path)
652 |         return luigi.LocalTarget(zip_fn)
653 | 
654 |     def run(self):
655 |         data_dir = 'data/%s/' % self.search['date_path']
656 |         ziph = tempfile.NamedTemporaryFile(mode='wb', delete=False)  # delete=False: the file is renamed into place below
657 |         z = zipfile.ZipFile(ziph, 'w')
658 |         for root, dirs, files in os.walk(data_dir):
659 |             for fn in files:
660 |                 if fn == "tweets.json":
661 |                     continue
662 |                 src = str(os.path.join(root, fn))
663 |                 dst = src.replace("data/", "")
664 |                 z.write(src, dst)
665 |         z.close()
666 |         os.rename(ziph.name, self.output().path)
667 | 
668 | 
669 | class CountRetweets(EventfulTask):
670 |     search = luigi.DictParameter()
671 | 
672 |     def requires(self):
673 |         return FetchTweets(search=self.search)
674 | 
675 |     def output(self):
676 |         fname = self.input().fn.replace('tweets.json', 'retweets.csv')
677 |         return luigi.LocalTarget(fname)
678 | 
679 |     def run(self):
680 | 
681 |         class Retweet(object):
682 |             def __init__(self, id, count):
683 |                 self.id = id
684 |                 self.count = count
685 |             def __lt__(self, other):
686 |                 # a trick to have bisect reverse sort
687 |                 return self.count > other.count
688 |             def __repr__(self):
689 |                 return "%s [%s]" % (self.id, self.count)
690 | 
691 |         retweet_ids = set()
692 |         retweets = []
693 | 
694 |         for tweet_str in self.input().open('r'):
695 |             tweet = json.loads(tweet_str)
696 |             retweet_count = tweet.get('retweet_count', 0)
697 |             if retweet_count == 0:
698 |                 continue
699 | 
700 |             if 'retweeted_status' in tweet:
701 |                 tweet_id = tweet['retweeted_status']['id_str']
702 |             else:
703 |                 tweet_id = tweet['id_str']
704 | 
705 |             # ignore duplicate tweets
706 |             # NOTE: this only works for search data!
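#     (search results arrive newest-first, so the first status seen for
#     a given id carries the most recent retweet_count; later duplicates
#     embedding the same retweeted_status can safely be skipped)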
707 |             if tweet_id in retweet_ids:
708 |                 continue
709 | 
710 |             bisect.insort_right(
711 |                 retweets,
712 |                 Retweet(tweet_id, retweet_count)
713 |             )
714 | 
715 |             retweet_ids.add(tweet_id)
716 |             if len(retweets) > 100:
717 |                 rt = retweets.pop()
718 |                 retweet_ids.remove(rt.id)
719 | 
720 |         with self.output().open('w') as fh:
721 |             writer = csv.DictWriter(fh, delimiter=',',
722 |                                     quoting=csv.QUOTE_MINIMAL,
723 |                                     fieldnames=['tweet_id', 'count'])
724 |             writer.writeheader()
725 |             for rt in retweets:
726 |                 writer.writerow({'tweet_id': rt.id, 'count': rt.count})
727 | 
728 | 
729 | class CreateCsv(EventfulTask):
730 |     search = luigi.DictParameter()
731 | 
732 |     def requires(self):
733 |         return FetchTweets(search=self.search)
734 | 
735 |     def output(self):
736 |         fname = self.input().fn.replace('tweets.json', 'tweets.csv')
737 |         return luigi.LocalTarget(fname)
738 | 
739 |     def run(self):
740 |         with self.output().open('w') as fh:
741 |             writer = csv.writer(fh)
742 |             writer.writerow(json2csv.get_headings())
743 |             for line in self.input().open('r'):
744 |                 tweet = json.loads(line)
745 |                 writer.writerow(json2csv.get_row(tweet))
746 | 
747 | class Sampler(EventfulTask):
748 |     search = luigi.DictParameter()
749 | 
750 |     def requires(self):
751 |         return FetchTweets(search=self.search)
752 | 
753 |     def output(self):
754 |         fname = self.input().fn.replace('tweets.json', 'sample.csv')
755 |         return luigi.LocalTarget(fname)
756 | 
757 |     def run(self):
758 |         sample_size = 10
759 |         #sample_size = self.search['sample_size']
760 |         count = self.search['count']
761 |         index = np.random.choice(count, min(sample_size, count), replace=False)  # distinct indices; random_integers could repeat and exceed the range
762 |         counter = 0
763 |         with self.output().open('w') as fh:
764 |             writer = csv.writer(fh)
765 |             writer.writerow(json2csv.get_headings())
766 |             for line in self.input().open('r'):
767 |                 if counter in index:
768 |                     tweet = json.loads(line)
769 |                     writer.writerow(json2csv.get_row(tweet))
770 |                 counter += 1
771 | 
772 | 
773 | 
774 | class RunFlow(EventfulTask):
775 |     date_path = time_hash()  # NOTE: evaluated once at class definition, so one value per worker process
776 |     jobid = luigi.IntParameter()
777 |     term = luigi.Parameter()
778 |     count = luigi.IntParameter(default=1000)
779 |     token = luigi.Parameter()
780 |     secret = luigi.Parameter()
781 | 
782 |     def requires(self):
783 |         search = {
784 |             "date_path": self.date_path,
785 |             "job_id": self.jobid,
786 |             "term": self.term,
787 |             "count": self.count,
788 |             "token": self.token,
789 |             "secret": self.secret,
790 |             "lang": "en"
791 |         }
792 |         self.search = search
793 |         EventfulTask.update_job(job_id=search['job_id'],
794 |                                 date_path=search['date_path'])
795 |         yield CountHashtags(search=search)
796 |         yield SummaryJSON(search=search)
797 |         yield EdgelistHashtags(search=search)
798 |         yield CountUrls(search=search)
799 |         yield CountDomains(search=search)
800 |         yield CountMentions(search=search)
801 |         yield CountFollowers(search=search)
802 |         yield CountRetweets(search=search)
803 |         yield FollowRatio(search=search)
804 |         yield EdgelistMentions(search=search)
805 |         yield PopulateRedis(search=search)
806 |         yield MatchMedia(search=search)
807 |         yield ExtractTweetIds(search=search)
808 |         yield CreateCsv(search=search)
809 |         yield Sampler(search=search)
810 |         yield BagIt(search=search)
811 | 
--------------------------------------------------------------------------------
/templates/base.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | {{ title }}
8 | {% block style_css %}
9 | 
10 | 
11 | 
12 | 
13 | 
31 | 
32 | {% block javascript %}
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 | {% endblock javascript %}
42 | 43 | 44 | 45 |
46 |
47 |

48 | home 49 |   |   50 | {% if twitter_user %} 51 | logout 52 | {% else %} 53 | login 54 | {% endif %} 55 |

56 |
57 |
58 | {% block content %} 59 | {% endblock content %} 60 |
61 |
62 | 63 | {% block javascript_extra %} 64 | {% endblock javascript_extra %} 65 | {% if google_analytics %} 66 | 76 | {% endif %} 77 | 78 | 79 | -------------------------------------------------------------------------------- /templates/feed.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | dnflow 4 | {{ feed_url }} 5 | {{ updated }} 6 | 7 | 8 | dnflow 9 | {{ site_url }} 10 | 11 | {% for search in searches %}{% if search.status == "FINISHED: RunFlow" %} 12 | 13 | {{ search.url }} 14 | 15 | {{ search.text }} 16 | {{ search.user }} created a collection for {{ search.text }} 17 | {{ search.published }} 18 | 19 | {% endif %}{% endfor %} 20 | 21 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 | 5 |

hi there 6 | {% if twitter_user %} 7 | {{ twitter_user }} 8 | {% endif %} 9 |

10 | 11 |

12 | This is a design prototype for the Documenting the Now project. Please understand that data created here can disappear at any time. By using this application you are agreeing to our code of conduct. If you have any questions please let us know what you think in our Slack channel, or by emailing info@docnow.io 13 |

14 | 15 |
16 |
17 | 
18 | 
--------------------------------------------------------------------------------
/templates/summary.html:
--------------------------------------------------------------------------------
86 | 
87 | 
351 | {% endblock javascript_extra %}
352 | 
353 | {% block content %}
354 |

  {{ search.text }} by {{ search.user }}

355 |
356 |

357 | 358 | 359 | tweets finished at 360 | 361 |

362 |
363 |
364 |

Modify your search:

365 |
366 |
367 | 368 |
369 | search: 370 | # tweets: 371 | 372 |
373 |
374 |
375 |
376 |
377 |

Popular hashtags

378 |
379 |
380 |

Most mentioned users

381 |
382 |
383 |
384 |
385 |

Common domains

386 |
387 |
388 |

Most referenced URLs

389 |
390 |
391 |
392 |
393 |

Most Followers

394 |
395 |
396 |

Follow Ratio (friends / following)

397 |
398 |
399 | 400 |
401 |
402 | 403 | 404 |
405 |
406 | 407 |
408 |
409 |

Raw data

410 |

411 | Download the raw data behind these charts as 412 | one complete package 413 | or pick them out individually: 414 |

415 | 428 |
429 |
430 | 431 |
432 |

Top Retweets

433 |
434 | 435 |
436 |
437 |

Common images

438 |
439 |
440 |
441 |
442 |

Matching images

443 |
444 |
445 | 446 |
447 |

Sample

448 |
449 | 450 |
451 |
452 |

Create a sample of tweets

453 | Sample size: 454 | 455 |
456 |
457 | 458 | {% endblock content %} 459 | -------------------------------------------------------------------------------- /templates/summary_compare.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block style_css_extra %} 4 | #hashtags { 5 | font: 13px sans-serif; 6 | } 7 | 8 | .axis path, 9 | .axis line { 10 | fill: none; 11 | stroke: #000; 12 | shape-rendering: crispEdges; 13 | } 14 | 15 | .bar { 16 | fill: steelblue; 17 | } 18 | 19 | .x.axis path { 20 | display: none; 21 | } 22 | 23 | .hashtag { 24 | stroke-width: 1; 25 | stroke: #eee; 26 | } 27 | 28 | .legend { 29 | stroke-width: 1; 30 | stroke: #888; 31 | } 32 | 33 | {% endblock style_css_extra %} 34 | 35 | {% block javascript_extra %} 36 | 184 | {% endblock javascript_extra %} 185 | 186 | {% block content %} 187 |
188 |

Comparing {{ search.text }} with

189 |

190 | 191 | 192 |

193 |
194 |
195 |
196 |

Hashtags

197 |
198 |
199 | {% endblock content %}
200 | 
--------------------------------------------------------------------------------
/ui.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sqlite3
3 | 
4 | from flask_oauthlib.client import OAuth
5 | from flask import g, jsonify, request, redirect, session, flash, make_response
6 | from flask import Flask, render_template, url_for, send_from_directory, abort
7 | import pandas as pd
8 | import redis
9 | from rq import Queue
10 | import numpy as np
11 | from queue_tasks import run_flow
12 | 
13 | import json
14 | import csv
15 | import json2csv
16 | from numpy.random import shuffle
17 | 
18 | 
19 | # configure application
20 | 
21 | app = Flask(__name__)
22 | app.config.from_pyfile('dnflow.cfg')
23 | 
24 | redis_conn = redis.StrictRedis(
25 |     host=app.config['REDIS_HOST'],
26 |     port=app.config['REDIS_PORT'],
27 |     charset='utf-8',
28 |     decode_responses=True
29 | )
30 | 
31 | q = Queue(connection=redis_conn)
32 | 
33 | logging.getLogger().setLevel(logging.DEBUG)
34 | 
35 | 
36 | # twitter authentication
37 | 
38 | 
39 | oauth = OAuth()
40 | twitter = oauth.remote_app('twitter',
41 |     base_url='https://api.twitter.com/1/',
42 |     request_token_url='https://api.twitter.com/oauth/request_token',
43 |     access_token_url='https://api.twitter.com/oauth/access_token',
44 |     authorize_url='https://api.twitter.com/oauth/authenticate',
45 |     access_token_method='GET',
46 |     consumer_key=app.config['TWITTER_CONSUMER_KEY'],
47 |     consumer_secret=app.config['TWITTER_CONSUMER_SECRET']
48 | )
49 | 
50 | 
51 | @app.route('/login')
52 | def login():
53 |     next = request.args.get('next') or request.referrer or None
54 |     callback_url = 'http://' + app.config['HOSTNAME'] + url_for('oauth_authorized', next=next)
55 |     return twitter.authorize(callback=callback_url)
56 | 
57 | 
58 | @app.route('/logout')
59 | def logout():
60 |     session.pop('twitter_token', None)  # pop() avoids a KeyError if already logged out
61 |     session.pop('twitter_user', None)
62 |     return redirect('/')
63 | 
64 | 
65 | @app.route('/oauth-authorized')
66 | def oauth_authorized():
67 |     next_url = request.args.get('next') or url_for('index')
68 |     resp = twitter.authorized_response()
69 |     if resp is None:
70 |         flash(u'You denied the request to sign in.')
71 |         return redirect(next_url)
72 |     session['twitter_token'] = (
73 |         resp['oauth_token'],
74 |         resp['oauth_token_secret']
75 |     )
76 |     session['twitter_user'] = resp['screen_name']
77 |     flash('You were signed in as %s' % resp['screen_name'])
78 |     return redirect(next_url)
79 | 
80 | 
81 | @twitter.tokengetter
82 | def get_twitter_token(token=None):
83 |     return session.get('twitter_token')
84 | 
85 | 
86 | # webapp routes
87 | 
88 | 
89 | @app.route('/static/<path:path>')
90 | def send_static(path):
91 |     return send_from_directory('static', path)
92 | 
93 | @app.errorhandler(404)
94 | def page_not_found(error):
95 |     return 'This route does not exist {}'.format(request.url), 404
96 | 
97 | 
98 | @app.before_request
99 | def before_request():
100 |     g.db = connect_db()
101 |     g.db.row_factory = sqlite3.Row
102 | 
103 | 
104 | def connect_db():
105 |     return sqlite3.connect(app.config['DATABASE'])
106 | 
107 | 
108 | def query(sql, args=(), one=False, json=False):
109 |     c = g.db.execute(sql, args)
110 |     rv = c.fetchall()
111 |     c.close()
112 |     if json:
113 |         return [{k: r[k] for k in r.keys()} for r in rv]
114 |     return (dict(rv[0]) if rv else None) if one else rv
115 | 
116 | 
117 | @app.teardown_request
118 | def teardown_request(exception):
119 |     db = getattr(g, 'db', None)
120 |     if db is not None:
121 |         db.close()
122 | 
123 | 
124 | @app.context_processor
125 | def inject_user():
126 |     return dict(twitter_user=session.get('twitter_user', None))
127 | 
128 | 
129 | @app.context_processor
130 | def inject_analytics():
131 |     return dict(google_analytics=app.config.get('GOOGLE_ANALYTICS'))
132 | 
133 | 
134 | @app.route('/', methods=['GET'])
135 | def index():
136 |     return render_template('index.html', title='dnflow prototype home')
137 | 
138 | 
139 | @app.route('/searches/', methods=['POST'])
140 | def add_search():
141 |     text = request.form.get('text', None)
142 |     user = session.get('twitter_user', None)
143 |     if not user:
144 |         response = jsonify({"error": "✋ please login first, thanks!"})
145 |         response.status_code = 403
146 |         return response
147 |     try:
148 |         count = request.form.get('count', None)
149 |         count = int(count)
150 |     except (TypeError, ValueError):  # missing or non-numeric count
151 |         count = 1000
152 |     if text:
153 |         sql = '''
154 |             INSERT INTO searches (text, date_path, user)
155 |             VALUES (?, ?, ?)
156 |         '''
157 |         query(sql, [text, '', user])
158 |         g.db.commit()
159 |         r = query(sql='SELECT last_insert_rowid() AS job_id FROM searches',
160 |                   one=True)
161 |         job_id = r['job_id']
162 |         job = q.enqueue_call(
163 |             run_flow,
164 |             args=(
165 |                 text,
166 |                 job_id,
167 |                 count,
168 |                 session['twitter_token'][0],
169 |                 session['twitter_token'][1]
170 |             ),
171 |             timeout=app.config['MAX_TIMEOUT']
172 |         )
173 |         logging.debug('job: %s' % job)
174 |     return redirect(url_for('index'))
175 | 
176 | 
177 | @app.route('/job/', methods=['PUT'])
178 | def job():
179 |     job_id = request.form.get('job_id', None)
180 |     date_path = request.form.get('date_path', None)
181 |     status = request.form.get('status', None)
182 | 
183 |     # A job is starting, we want the date_path
184 |     if job_id and date_path:
185 |         query('UPDATE searches SET date_path = ? WHERE id = ?',
186 |               [date_path, job_id])
187 |         logging.debug('update date_path=%s where id=%s' % (date_path, job_id))
188 |         g.db.commit()
189 |     # A job is in progress, we want the status
190 |     if date_path and status:
191 |         query('UPDATE searches SET status = ? WHERE date_path = ?',
192 |               [status, date_path])
193 |         logging.debug('update status=%s where date_path=%s' % (status,
194 |                       date_path))
195 |         g.db.commit()
196 |     return redirect(url_for('index'))
197 | 
198 | 
199 | @app.route('/summary/<date_path>/', methods=['GET'])
200 | def summary(date_path):
201 |     user = session.get('twitter_user', None)
202 |     search = query('SELECT * FROM searches WHERE date_path = ?', [date_path],
203 |                    one=True)
204 |     if not search:
205 |         abort(404)
206 |     if not search['published'] and user != search['user']:
207 |         abort(401)
208 | 
209 |     return render_template('summary.html', title=search['text'], search=search)
210 | 
211 | 
212 | @app.route('/summary/<date_path>/<file_name>', methods=['GET'])
213 | def summary_static_proxy(date_path, file_name):
214 |     user = session.get('twitter_user', None)
215 |     search = query('SELECT * FROM searches WHERE date_path = ?', [date_path],
216 |                    one=True)
217 |     if not search or (not search['published'] and user != search['user']):
218 |         abort(401)
219 | 
220 |     fname = '%s/%s' % (date_path, file_name)
221 |     return send_from_directory(app.config['DATA_DIR'], fname, cache_timeout=-1)
222 | 
223 | 
224 | @app.route('/summary/<search_id>/compare', methods=['GET'])
225 | def summary_compare(search_id):
226 |     search = query('SELECT * FROM searches WHERE id = ?', [search_id],
227 |                    one=True)
228 |     compare_ids = request.args.getlist('id')
229 |     return render_template('summary_compare.html', search=search,
230 |                            compare_ids=compare_ids)
231 | 
232 | 
233 | @app.route('/summary/<date_path>/sample/', methods=['POST'])
234 | def sample(date_path):
235 |     try:
236 |         sample_size = int(request.form.get('sample_size', None))
237 |     except (TypeError, ValueError):  # TypeError covers a missing form field
238 |         return redirect(url_for('summary', date_path=date_path))
239 |     summary = json.load(open('data/%s/summary.json' % date_path, 'r'))
240 |     num_tweets = summary['num_tweets']
241 |     tweet_index = np.arange(num_tweets)
242 |     shuffle(tweet_index)
243 |     tweet_index = tweet_index[0:sample_size]
244 |     counter = 0
245 |     with open('data/%s/sample.csv' % date_path, 'w') as sample_file:
246 |         writer = csv.writer(sample_file)
247 |         writer.writerow(json2csv.get_headings())
248 |         with open('data/%s/tweets.json' % date_path, 'r') as tweets_file:
249 |             for line in tweets_file:
250 |                 tweet = json.loads(line)
251 |                 if counter in tweet_index:
252 |                     writer.writerow(json2csv.get_row(tweet))
253 |                 counter += 1
254 |     return redirect(url_for('summary', date_path=date_path))
255 | 
256 | 
257 | @app.route('/feed/')
258 | def feed():
259 |     searches = query(
260 |         '''
261 |         SELECT * FROM searches
262 |         WHERE published IS NOT NULL
263 |         ORDER BY id DESC
264 |         ''', json=True)
265 |     site_url = 'http://' + app.config['HOSTNAME']
266 |     feed_url = site_url + '/feed/'
267 |     def add_url(s):
268 |         s['url'] = site_url + '/summary/' + s['date_path'] + '/'
269 |         return s
270 |     searches = map(_date_format, searches)
271 |     searches = list(map(add_url, searches))
272 |     resp = make_response(
273 |         render_template(
274 |             'feed.xml',
275 |             updated=searches[0]['created'],
276 |             site_url=site_url,
277 |             feed_url=feed_url,
278 |             searches=searches
279 |         )
280 |     )
281 |     resp.headers['Content-Type'] = 'application/atom+xml'
282 |     return resp
283 | 
284 | 
285 | @app.route('/robots.txt')
286 | def robots():
287 |     resp = make_response(render_template('robots.txt'))
288 |     resp.headers['Content-Type'] = 'text/plain'
289 |     return resp
290 | 
291 | 
292 | # api routes for getting data
293 | 
294 | @app.route('/api/searches/', methods=['GET'])
295 | def api_searches():
296 |     user = session.get('twitter_user', None)
297 |     q = '''
298 |         SELECT *
299 |         FROM searches
300 |         WHERE user = ?
301 |         OR published IS NOT NULL
302 |         ORDER BY id DESC
303 |     '''
304 |     searches = query(q, [user], json=True)
305 |     searches = {
306 |         "user": user,
307 |         "searches": list(map(_date_format, searches))
308 |     }
309 |     return jsonify(searches)
310 | 
311 | 
312 | @app.route('/api/search/<search_id>', methods=["GET", "PUT", "DELETE"])
313 | def search(search_id):
314 |     search = query('SELECT * FROM searches WHERE id = ?', [search_id], one=True)
315 |     if not search:
316 |         abort(404)
317 | 
318 |     # they must own the search to modify it
319 |     user = session.get('twitter_user', None)
320 |     if request.method in ['PUT', 'DELETE'] and search['user'] != user:
321 |         abort(401)
322 | 
323 |     if request.method == 'PUT':
324 |         new_search = request.get_json()
325 |         if new_search['published']:
326 |             query("UPDATE searches SET published = CURRENT_TIMESTAMP WHERE id = ? AND published IS NULL", [search_id])
327 |         elif not new_search['published']:
328 |             query("UPDATE searches SET published = NULL WHERE id = ?",
329 |                   [search_id])
330 |         g.db.commit()
331 |     elif request.method == 'DELETE':
332 |         query("DELETE FROM searches WHERE id = ?", [search_id])
333 |         g.db.commit()
334 | 
335 |     return jsonify(_date_format(search))
336 | 
337 | 
338 | @app.route('/api/hashtags/<search_id>/', methods=['GET'])
339 | def hashtags_multi(search_id):
340 |     ids = [search_id]
341 |     ids.extend(request.args.getlist('id'))
342 |     in_clause = ','.join(str(int(i)) for i in ids)  # int() guards the interpolated IN clause
343 |     searches = query("""
344 |         SELECT id, date_path, text
345 |         FROM searches WHERE id in (%s)
346 |     """ % in_clause)
347 |     summary = []
348 |     search = searches[0]
349 |     summary.append({'id': search['id'], 'date_path': search['date_path'],
350 |                     'text': search['text'],
351 |                     'colname': 'count_%s' % search['id']})
352 |     d = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
353 |     d = d.rename(columns={'count': 'count_%s' % search['id']})
354 |     for search in searches[1:]:
355 |         summary.append({'id': search['id'], 'date_path': search['date_path'],
356 |                         'text': search['text'],
357 |                         'colname': 'count_%s' % search['id']})
358 |         e = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
359 |         e = e.rename(columns={'count': 'count_%s' % search['id']})
360 |         d = pd.merge(d, e, on='hashtag', how='outer').fillna(0)
361 |     d.sort_values(by='count_%s' % search_id, inplace=True, ascending=False)
362 |     result = {'summary': summary, 'hashtags': d.to_dict(orient='records')}
363 |     return jsonify(result)
364 | 
365 | 
366 | @app.route('/api/searches/<date_path>/hashtags/', methods=['GET'])
367 | def hashtags(date_path):
368 |     d = _count_entities(date_path, 'hashtags', 'hashtag')
369 |     return jsonify(d)
370 | 
371 | 
372 | @app.route('/api/searches/<date_path>/mentions/', methods=['GET'])
373 | def mentions(date_path):
374 |     d = _count_entities(date_path, 'mentions', 'screen_name')
375 |     return jsonify(d)
376 | 
377 | 
378 | def _count_entities(date_path, entity, attrname):
379 |     try:
380 |         # range query is 0-indexed
381 |         num = int(request.args.get('num', 24)) - 1
382 |     except (TypeError, ValueError):
383 |         num = 24
384 |     counts = redis_conn.zrevrange('count:%s:%s' % (entity, date_path), 0, num,
385 |                                   True)
386 |     return [{attrname: e, 'count': c} for e, c in counts]
387 | 
388 | 
389 | def _date_format(row):
390 |     for name in ['created', 'published']:
391 |         t = row[name]
392 |         if t:
393 |             t = t.replace(' ', 'T')
394 |             t += 'Z'
395 |             row[name] = t
396 |     return row
397 | 
398 | 
399 | if __name__ == '__main__':
400 |     app.run(debug=app.config['DEBUG'])
401 | 
--------------------------------------------------------------------------------
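
One detail that ties `ui.py` back to `summarize.py`: `_count_entities` reads the
same sorted sets (`count:hashtags:<date_path>`, `count:mentions:<date_path>`,
`count:photos:<date_path>`) that `PopulateRedis` fills. A minimal sketch of that
read path, assuming a default local Redis and a made-up `date_path`:

```python
import redis

# mirrors the redis_conn settings in ui.py (REDIS_HOST/REDIS_PORT in dnflow.cfg)
r = redis.StrictRedis(host='localhost', port=6379,
                      charset='utf-8', decode_responses=True)

date_path = '201607011200-abc123'  # hypothetical; real values come from time_hash()

# the query behind /api/searches/<date_path>/hashtags/: top 24 hashtags by count
for hashtag, count in r.zrevrange('count:hashtags:%s' % date_path, 0, 23,
                                  withscores=True):
    print(hashtag, int(count))
```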