├── requirements.txt ├── github.sh ├── harvest.sh ├── .gitignore ├── generate.sh ├── harvest.py ├── reportprofile.py ├── d3times.py ├── d3graph.py ├── templates ├── reportprofile.txt ├── timebar.html ├── reportprofile.html ├── wordcloud.html └── graph.html ├── d3output.py ├── d3wordcloud.py ├── d3cotags.py ├── LICENSE ├── stopwords └── stop-words_english_6_en.txt ├── README.md ├── assets └── d3.layout.cloud.js └── profiler.py /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dateutil 2 | pytz 3 | tzlocal 4 | pysparklines 5 | requests_oauthlib 6 | twarc 7 | networkx 8 | humanize 9 | mako 10 | 11 | -------------------------------------------------------------------------------- /github.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ./github-credentials 4 | 5 | PROJECTDIR=$1 6 | source ./projects/$PROJECTDIR/github-repo 7 | 8 | DATE=`date "+%Y-%m-%dT%H-%M-%S"` 9 | 10 | cd projects/$PROJECTDIR/html 11 | git add . 12 | git commit -m "$DATE update" 13 | git push https://$GITHUB_TOKEN@github.com/pbinkley/$GITHUB_REPO.git 14 | echo "Pushed commit: $DATE" 15 | -------------------------------------------------------------------------------- /harvest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source venv3/bin/activate 3 | export PATH=~/.local/bin:$PATH 4 | 5 | PROJECTDIR=$1 6 | if [ -n "$PROJECTDIR" ]; then 7 | SEARCH=`cat projects/$PROJECTDIR/metadata.json | jq -r ".search"` 8 | 9 | OUTPUT=projects/$PROJECTDIR/data/tweets/tweets-$(date -d "today" +"%Y%m%d%H%M").json 10 | LASTID=`cat projects/$PROJECTDIR/data/tweets/last-id` 11 | echo Lastid: $LASTID Search: $SEARCH 12 | echo Output to $OUTPUT 13 | twarc --since_id $LASTID search "$SEARCH" > $OUTPUT 14 | NEWLASTID=`cat $OUTPUT | head -1 | jq -r ".id_str"` 15 | 16 | if [[ ! -z $NEWLASTID ]]; then 17 | echo $NEWLASTID > projects/$PROJECTDIR/data/tweets/last-id 18 | fi 19 | 20 | echo "Harvested `wc -l $OUTPUT | cut -d " " -f 1` tweets" 21 | 22 | # generate html 23 | ./generate.sh $PROJECTDIR 24 | 25 | else 26 | echo "Provide project directory name" 27 | fi 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # twarc harvest project directories 2 | projects/ 3 | credentials 4 | github-credentials 5 | venv/ 6 | venv3/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | -------------------------------------------------------------------------------- /generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source venv3/bin/activate 3 | 4 | PROJECTDIR=$1 5 | 6 | # generate html 7 | twarc/utils/wordcloud.py projects/$PROJECTDIR/data/tweets/*.json > projects/$PROJECTDIR/html/wordcloud.html 8 | twarc/utils/sort_by_id.py projects/$PROJECTDIR/data/tweets/*.json | twarc/utils/deduplicate.py > projects/$PROJECTDIR/data/amalgamated.json 9 | #twarc/utils/network.py --users projects/$PROJECTDIR/data/amalgamated.json projects/$PROJECTDIR/html/network-users.html 10 | #twarc/utils/network.py projects/$PROJECTDIR/data/amalgamated.json projects/$PROJECTDIR/html/network.html 11 | #twarc/utils/wall.py projects/$PROJECTDIR/data/amalgamated.json > projects/$PROJECTDIR/wall.html 12 | ./d3cotags.py -e $PROJECTDIR projects/$PROJECTDIR > projects/$PROJECTDIR/html/cotags.html 13 | ./d3graph.py --mode mentions projects/$PROJECTDIR > projects/$PROJECTDIR/html/mentionsgraph.html 14 | ./d3graph.py --mode retweets projects/$PROJECTDIR > projects/$PROJECTDIR/html/retweetsgraph.html 15 | ./d3graph.py --mode replies projects/$PROJECTDIR > projects/$PROJECTDIR/html/repliesgraph.html 16 | ./d3times.py -a -t "America/Edmonton" -i 3H projects/$PROJECTDIR > projects/$PROJECTDIR/html/timebargraph.html 17 | ./reportprofile.py -o html projects/$PROJECTDIR/data/amalgamated.json > projects/$PROJECTDIR/html/index.html 18 | ./reportprofile.py projects/$PROJECTDIR 19 | twarc dehydrate projects/$PROJECTDIR/data/amalgamated.json | uniq > projects/$PROJECTDIR/html/tweet-ids.txt 20 | -------------------------------------------------------------------------------- /harvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import json 7 | import errno 8 | 9 | def make_sure_path_exists(path): 10 | try: 11 | os.makedirs(path) 12 | except OSError as exception: 13 | if exception.errno != errno.EEXIST: 14 | raise 15 | 16 | parser = argparse.ArgumentParser("harvest") 17 | parser.add_argument("archive_dir", action="store", 18 | help="a directory where results are stored") 19 | args = parser.parse_args() 20 | 21 | if not os.path.isdir(args.archive_dir): 22 | sys.exit("Directory " + args.archive_dir + " does not exist.") 23 | 24 | data_dir = os.path.join(args.archive_dir, "data") 25 | make_sure_path_exists(data_dir) 26 | tweets_dir = os.path.join(data_dir, "tweets") 27 | make_sure_path_exists(tweets_dir) 28 | 29 | metadatafile = os.path.join(args.archive_dir, "metadata.json") 30 | try: 31 | with open(metadatafile) as json_data: 32 | metadata = json.load(json_data) 33 | json_data.close() 34 | except: 35 | sys.exit("Cannot read metadata file " + metadatafile) 36 | 37 | sys.argv = ["", metadata["search"], tweets_dir] 38 | 39 | # find twarc-archive.py on system path 40 | for dirname in os.environ["PATH"].split(os.pathsep): 41 | candidate = os.path.join(dirname, "twarc-archive.py") 42 | print(candidate) 43 | if os.path.isfile(candidate): 44 | 
break
45 | else:
46 |     # for/else: this branch runs only if the loop finished without a break
47 |     candidate = ""
48 | try:
49 |     # execfile() was removed in Python 3; read and exec the script's source instead
50 |     exec(open(candidate).read())
51 | except Exception:
52 |     sys.exit("Cannot run twarc-archive.py")
--------------------------------------------------------------------------------
/reportprofile.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import json
4 | import optparse
5 | from profiler import Profiler # local module
6 | from dateutil import parser
7 | import sparkline
8 | import os
9 | from mako.template import Template
10 | import glob
11 | import sys
12 | import re
13 | import humanize
14 | 
15 | opt_parser = optparse.OptionParser()
16 | opt_parser.add_option("-o", "--output", dest="output", type="str",
17 |                       help="text | json | html (default: text)", default="text")
18 | opts, args = opt_parser.parse_args()
19 | 
20 | profiler = Profiler({"extended": True, "blocks": ["all"]})
21 | 
22 | profiler.gettweets(opts, args)
23 | 
24 | data = profiler.report()
25 | 
26 | if opts.output == "json":
27 |     print(json.dumps(data))
28 | elif opts.output == "html":
29 |     metadata_file = os.path.join(os.path.dirname(args[0]), "../metadata.json")
30 |     with open(metadata_file) as json_data:
31 |         metadata = json.load(json_data)
32 | 
33 |     data['title'] = metadata['title']
34 |     data['search'] = metadata['search']
35 | 
36 |     # gather names and sizes of html files
37 |     data['reports'] = []
38 |     p = re.compile(r'.*/html/(.*)\.html')
39 |     for report in sorted(glob.glob(os.path.join(os.path.dirname(args[0]), "../html/*.html"))):
40 |         m = p.match(report)
41 |         size = os.path.getsize(report)
42 |         if m:  # skip any path that doesn't match the pattern
43 |             data['reports'].append({'report': m[1], 'size': humanize.naturalsize(size)})
44 | 
45 |     mytemplate = Template(filename='templates/reportprofile.html')
46 |     print(mytemplate.render(data = data))
47 | else:
48 |     mytemplate = Template(filename='templates/reportprofile.txt')
49 |     print(mytemplate.render(data = data))
--------------------------------------------------------------------------------
/d3times.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import optparse
4 | import pytz # $ pip install pytz
5 | from tzlocal import get_localzone # $ pip install tzlocal
6 | import d3output # local module
7 | from profiler import TimeProfiler # local module
8 | 
9 | opt_parser = optparse.OptionParser()
10 | opt_parser.add_option("-t", "--timezone", type=str, default="",
11 |                       help="output timezone (e.g. 'America/New_York' or 'local'; default: UTC)")
12 | opt_parser.add_option('-a', '--aggregate', action='store_true', default=False,
13 |                       help="Aggregate the values to produce key-value pairs with counts")
14 | opt_parser.add_option("-o", "--output", dest="output", type="str",
15 |                       help="html | csv | json (default: html)", default="html")
16 | opt_parser.add_option("-p", "--template", dest="template", type="str",
17 |                       help="name of template in utils/template (default: timebar.html)", default="timebar.html")
18 | opt_parser.add_option("-i", "--interval", dest="intervalStr", type="str",
19 |                       help="interval for grouping timestamps, in seconds, minutes or hours, e.g.
15M (default: 1S)", default="1S") 20 | 21 | opts, args = opt_parser.parse_args() 22 | 23 | aggregate = opts.aggregate 24 | tzname = opts.timezone 25 | 26 | # determine output time zone 27 | if tzname == "": 28 | tz = pytz.UTC 29 | elif tzname == "local": 30 | tz = get_localzone() # system timezone, from tzlocal 31 | else: 32 | tz = pytz.timezone(tzname) 33 | 34 | # if an interval is provided in the options, use it; otherwise 35 | # determine the interval from the datetime format 36 | intervalStr = opts.intervalStr # e.g. 15M 37 | 38 | profiler = TimeProfiler({ 39 | "tz": tz, 40 | "output": opts.output, 41 | "aggregate": aggregate, 42 | "intervalStr": intervalStr}) 43 | 44 | profiler.gettweets(opts, args) 45 | 46 | data = profiler.report() 47 | 48 | if opts.output == "html": 49 | d3output.embed(opts.template, data) 50 | else: 51 | print(data) 52 | -------------------------------------------------------------------------------- /d3graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import optparse 4 | import d3output # local module 5 | from profiler import Profiler # local module 6 | from profiler import LinkNodesProfiler # local module 7 | from collections import Counter 8 | 9 | opt_parser = optparse.OptionParser() 10 | opt_parser.add_option("-m", "--mode", dest="mode", help="retweets (default) | mentions | replies", 11 | default="retweets") 12 | opt_parser.add_option("-t", "--threshold", dest="threshold", type="int", 13 | help="minimum links to qualify for inclusion (default: 1)", default=1) 14 | opt_parser.add_option("-o", "--output", dest="output", type="str", 15 | help="html | json (default: html)", default="html") 16 | opt_parser.add_option("-p", "--template", dest="template", type="str", 17 | help="name of template in utils/template (default: graph.html)", default="graph.html") 18 | 19 | opts, args = opt_parser.parse_args() 20 | 21 | output = opts.output 22 | 23 | class DirectedProfiler(LinkNodesProfiler): 24 | def __init__(self, opts): 25 | LinkNodesProfiler.__init__(self, opts) 26 | 27 | def process(self, tweet): 28 | Profiler.process(self, tweet) 29 | 30 | def adduser(self, user, tweet): 31 | if self.mode == "mentions": 32 | if "user_mentions" in tweet["entities"]: 33 | for mention in tweet["entities"]["user_mentions"]: 34 | self.addlink(user, str(mention["screen_name"])) 35 | elif self.mode == "replies": 36 | if not(tweet["in_reply_to_screen_name"] == None): 37 | self.addlink(tweet["in_reply_to_screen_name"], user) 38 | else: # default mode: retweets 39 | if "retweeted_status" in tweet: 40 | self.addlink(user, tweet["retweeted_status"]["user"]["screen_name"]) 41 | # add to tweet count for this tag 42 | if not user in self.nodes: 43 | self.addsingle(user) 44 | self.nodes[user]["tweetcount"] += 1 45 | 46 | def report(self): 47 | return LinkNodesProfiler.report(self) 48 | 49 | profiler = DirectedProfiler({ 50 | "mode": opts.mode, 51 | "graph": "directed", 52 | "field": "user"}) 53 | 54 | profiler.gettweets(opts, args) 55 | 56 | data = profiler.report() 57 | 58 | profile = data["profile"] 59 | nodes = data["nodes"] 60 | 61 | if output == "csv": 62 | print(d3output.nodeslinkcsv(nodes)) 63 | elif output == "json": 64 | values = d3output.nodeslinktrees(profile, nodes) 65 | print({"profile": profile, "values": values}) 66 | elif output == "html": 67 | print(d3output.embed(opts.template, d3output.nodeslinktrees(profile, nodes))) 68 | 69 | -------------------------------------------------------------------------------- 
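The graph scripts above and d3cotags.py below share one output contract: d3output.nodeslinktrees() returns a dict with "profile", "nodes" and "links" keys, which the graph.html template feeds to a D3 force layout. A minimal sketch of consuming a JSON dump of that structure follows; "graph.json" is a hypothetical file, and the assumption that each link's source/target index into the nodes list (the usual D3 force-layout shape) is inferred from the template usage, not confirmed by profiler.py.

#!/usr/bin/env python
# Sketch: read a {"profile": ..., "nodes": [...], "links": [...]} structure as
# emitted for the D3 templates, and list the heaviest edges.
# Assumptions: "graph.json" is a hypothetical dump of that structure, and link
# source/target values index into the nodes list (D3 force-layout convention).
import json

with open("graph.json") as f:
    data = json.load(f)

nodes = data["nodes"]  # [{"name": ..., "title": ...}, ...]
links = data["links"]  # [{"source": i, "target": j, "value": n}, ...]

# list the ten heaviest edges, resolving node indices back to names
for link in sorted(links, key=lambda l: l["value"], reverse=True)[:10]:
    print(nodes[link["source"]]["name"], "->",
          nodes[link["target"]]["name"], link["value"])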
/templates/reportprofile.txt: -------------------------------------------------------------------------------- 1 | ## -*- coding: utf-8 -*- 2 | <%! import sparkline %> 3 | <%! from dateutil import parser %> 4 | 5 | <%def name="percentage(numerator, denominator)" filter="trim"> 6 | ${str("%.2f" % (float(numerator) / float(denominator) * 100.0))}% 7 | 8 | 9 | Count: ${data['count']} 10 | 11 | Users: ${data['usercount']} 12 | User percentiles: ${sparkline.sparkify(data["userspercentiles"])} 13 | ${str(data["userspercentiles"])} 14 | 15 | Has hashtag: ${"{:>9}".format(str(data["hashtagcount"]))} (${percentage(data["hashtagcount"], data["count"])}) 16 | Hashtags: ${"{:>9}".format(str(data["hashtags"]))} 17 | Hashtags percentiles: ${sparkline.sparkify(data["hashtagspercentiles"])} 18 | ${str(data["hashtagspercentiles"])} 19 | 20 | Has URL: ${"{:>9}".format(str(data["urlcount"]))} (${percentage(data["urlcount"], data["count"])}) 21 | URLs: ${"{:>9}".format(str(data["urls"]))} 22 | URLs percentiles: ${sparkline.sparkify(data["urlspercentiles"])} 23 | ${str(data["urlspercentiles"])} 24 | 25 | Has Image URL: ${"{:>9}".format(str(data["imageurlcount"]))} (${percentage(data["imageurlcount"], data["count"])}) 26 | Image URLs: ${"{:>9}".format(str(data["imageurls"]))} 27 | Image URLs percentiles: ${sparkline.sparkify(data["imageurlspercentiles"])} 28 | ${str(data["imageurlspercentiles"])} 29 | 30 | Originals: ${"{:>9}".format(str(data["originalcount"]))} (${percentage(data["originalcount"], data["count"])}) 31 | Retweets: ${"{:>9}".format(str(data["retweetcount"]))} (${percentage(data["retweetcount"], data["count"])}) 32 | Quotes: ${"{:>9}".format(str(data["quotecount"]))} (${percentage(data["quotecount"], data["count"])}) 33 | Replies: ${"{:>9}".format(str(data["replycount"]))} (${percentage(data["replycount"], data["count"])}) 34 | Geo: ${"{:>9}".format(str(data["geocount"]))} (${percentage(data["geocount"], data["count"])}) 35 | Earliest: ${str(data["earliest"])} 36 | Latest: ${str(data["latest"])} 37 | Duration: ${str(parser.parse(data["latest"]) - parser.parse(data["earliest"]))} 38 | Top users: ${sparkline.sparkify([u["value"] for u in data["topusers"]])} 39 | % for user in data["topusers"]: 40 | ${user["value"]} ${user["name"]} 41 | % endfor 42 | Top hashtags: ${sparkline.sparkify([u["value"] for u in data["tophashtags"]])} 43 | % for hashtag in data["tophashtags"]: 44 | ${hashtag["value"]} ${hashtag["name"]} 45 | % endfor 46 | Top URLs: ${sparkline.sparkify([u["value"] for u in data["topurls"]])} 47 | % for url in data["topurls"]: 48 | ${url["value"]} ${url["name"]} 49 | % endfor 50 | Top Image URLs: ${sparkline.sparkify([u["value"] for u in data["topimageurls"]])} 51 | % for imageurl in data["topimageurls"]: 52 | ${imageurl["value"]} ${imageurl["name"]} 53 | % endfor 54 | -------------------------------------------------------------------------------- /templates/timebar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | $TITLE$ 4 | 5 | 31 | 32 |
33 | 34 | 127 |
--------------------------------------------------------------------------------
/d3output.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import os
4 | import sys
5 | import json
6 | import csv
7 | import io
8 | 
9 | def nodeslinks(threshold):
10 | 
11 |     nodes = []
12 |     links = []
13 | 
14 |     # lines look like "nodeA,nodeB,123"
15 |     for line in sys.stdin:
16 |         tokens = line.split(",")
17 |         # skip the header line (and any malformed row)
18 |         try:
19 |             if int(tokens[2]) >= threshold:
20 |                 if not tokens[0] in nodes:
21 |                     nodes.append(tokens[0])
22 |                 if not tokens[1] in nodes:
23 |                     nodes.append(tokens[1])
24 |                 links.append({"source": nodes.index(tokens[0]),
25 |                               "target": nodes.index(tokens[1]),
26 |                               "value": int(tokens[2])})
27 |         except (IndexError, ValueError):
28 |             continue
29 | 
30 |     nodelist = []
31 |     for node in nodes:
32 |         nodelist.append({"name": node})
33 | 
34 |     # json.dumps() returns the string; json.dump() writes to a file and returns None
35 |     print(json.dumps({"nodes": nodelist, "links": links}))
36 | 
37 | def nodeslinktrees(profile, nodes):
38 |     # generate nodes json
39 |     nodesoutput = []
40 |     linksoutput = []
41 |     # profile["opts"] is a dict, so test for the key (hasattr is always False here)
42 |     if "graph" in profile["opts"]:
43 |         graph = profile["opts"]["graph"]
44 |     else:
45 |         graph = ""
46 |     for node in nodes:
47 |         if graph == "directed":
48 |             title = " (" + str(node["tweetcount"]) + " tweet"
49 |             if node["tweetcount"] != 1:
50 |                 title += "s"
51 |             title += ": " + str(node["source"]) + " out/" + str(node["target"]) + " in)"
52 |         else:
53 |             title = " (" + str(node["tweetcount"]) + " tweet"
54 |             if node["tweetcount"] != 1:
55 |                 title += "s"
56 |             title += ")"
57 |         nodesoutput.append({"name": node["name"],
58 |                             "title": node["name"] + title})
59 | 
60 |         # generate links
61 |         for targetname in node["links"].keys():
62 |             target = node["links"][targetname]
63 |             if target["count"] >= profile["opts"]["threshold"]:
64 |                 linksoutput.append({
65 |                     "source": node["id"],
66 |                     "target": target["id"],
67 |                     "value": target["count"]
68 |                 })
69 | 
70 |     return {"profile": profile, "nodes": nodesoutput, "links": linksoutput}
71 | 
72 | def namevaluecsv(data):
73 |     csvout = io.StringIO()
74 |     csvwriter = csv.writer(csvout)
75 |     csvwriter.writerow(["name", "value"])
76 |     for key, value in sorted(data.items()):
77 |         csvwriter.writerow([key, value])
78 |     return csvout.getvalue()
79 | 
80 | def valuecsv(data):
81 |     csvout = io.StringIO()
82 |     csvwriter = csv.writer(csvout)
83 |     csvwriter.writerow(["value"])
84 |     for d in data:
85 |         csvwriter.writerow([d])
86 |     return csvout.getvalue()
87 | 
88 | def nodeslinkcsv(data):
89 |     # convert link-nodes objects into csv
90 |     # e.g. {"A": {"B": 3, "C": 7}} to A,B,3 and A,C,7
91 |     csvout = io.StringIO()
92 |     csvwriter = csv.writer(csvout)
93 |     csvwriter.writerow(["source", "target", "value"])
94 |     for node in data:
95 |         source = node["name"]
96 |         # generate csv rows (iterkeys() was Python 2 only)
97 |         for targetname in node["links"].keys():
98 |             csvwriter.writerow([source, targetname, node["links"][targetname]["count"]])
99 |     return csvout.getvalue()
100 | 
101 | def namevaluejson(data):
102 |     output = []
103 |     for key, value in sorted(data.items()):
104 |         output.append({"name": key, "value": value})
105 |     return output
106 | 
107 | def valuejson(data):
108 |     output = []
109 |     for d in data:
110 |         output.append(d)
111 |     return output
112 | 
113 | def embed(template, d3json):
114 |     # load metadata.json if present
115 |     # d3json["args"] contains filenames passed in, with wildcards resolved
116 |     if d3json["profile"].get("metadatafile"):
117 |         metadata_file = d3json["profile"]["metadatafile"]
118 |     else:
119 |         metadata_file = os.path.join(os.path.dirname(d3json["profile"]["args"][0]), "metadata.json")
120 |     try:
121 |         with open(metadata_file) as json_data:
122 |             metadata = json.load(json_data)
123 |     except (OSError, ValueError):
124 |         # no readable metadata: fall back to a title built from the first input file
125 |         metadata = {"title": d3json["profile"]["args"][0]
126 |                     + (" (+)" if len(d3json["profile"]["args"]) > 1 else "")}
127 |     d3json["metadata"] = metadata
128 |     # generate html by replacing tokens in the template
129 |     template_file = os.path.join(os.path.dirname(__file__), "templates", template)
130 |     with open(template_file, "r") as template:
131 |         output = template.read()
132 |     output = output.replace("$TITLE$", metadata["title"])
133 |     output = output.replace("$DATA$", json.dumps(d3json))
134 |     print(output)
--------------------------------------------------------------------------------
/templates/reportprofile.html:
--------------------------------------------------------------------------------
1 | ## -*- coding: utf-8 -*-
2 | <%! import sparkline %>
3 | <%! from dateutil import parser %>
4 | 
5 | <%def name="percentage(numerator, denominator)" filter="trim">
6 | ${str("%.2f" % (float(numerator) / float(denominator) * 100.0))}%
7 | </%def>
8 | 
9 | <!DOCTYPE html>
10 | <html>
11 | <head>
12 | <title>${data['title']}</title>
13 | <meta charset="utf-8"/>
14 | <style>
15 | body { font-family: sans-serif; }
16 | table { border-collapse: collapse; }
17 | td { padding: 2px 8px; vertical-align: top; }
18 | img { border: none; }
19 | </style>
20 | </head>
21 | 
22 | <body>
23 | 
24 | <h1>${data['title']}</h1>
25 | 
26 | <p>Search: <code>${data['search']}</code></p>
27 | 
28 | <h2>Reports (note: some are large)</h2>
29 | 
30 | <ul>
31 | % for report in data['reports']:
32 | <li><a href="${report['report']}.html">${report['report']}</a> (${report['size']})</li>
33 | % endfor
34 | </ul>
35 | 
36 | <p><a href="tweet-ids.txt">Download tweet ids</a> (See <a href="https://github.com/DocNow/twarc">twarc</a> for instructions on how to "hydrate" this list of ids to retrieve the original tweets)</p>
37 | 
38 | <h2>Harvest statistics</h2>
39 | 
40 | <table>
41 | <tr><td>Count:</td><td>${data['count']}</td></tr>
42 | <tr><td>Users:</td><td>${data['usercount']}</td></tr>
43 | <tr><td>User percentiles:</td><td>${sparkline.sparkify(data["userspercentiles"])}<br/>
44 | ${str(data["userspercentiles"])}</td></tr>
45 | <tr><td>Has hashtag:</td><td>${"{:>9}".format(str(data["hashtagcount"]))} (${percentage(data["hashtagcount"], data["count"])})</td></tr>
46 | <tr><td>Hashtags:</td><td>${"{:>9}".format(str(data["hashtags"]))}</td></tr>
47 | <tr><td>Hashtags percentiles:</td><td>${sparkline.sparkify(data["hashtagspercentiles"])}<br/>
48 | ${str(data["hashtagspercentiles"])}</td></tr>
49 | <tr><td>Has URL:</td><td>${"{:>9}".format(str(data["urlcount"]))} (${percentage(data["urlcount"], data["count"])})</td></tr>
50 | <tr><td>URLs:</td><td>${"{:>9}".format(str(data["urls"]))}</td></tr>
51 | <tr><td>URLs percentiles:</td><td>${sparkline.sparkify(data["urlspercentiles"])}<br/>
52 | ${str(data["urlspercentiles"])}</td></tr>
53 | <tr><td>Has Image URL:</td><td>${"{:>9}".format(str(data["imageurlcount"]))} (${percentage(data["imageurlcount"], data["count"])})</td></tr>
54 | <tr><td>Image URLs:</td><td>${"{:>9}".format(str(data["imageurls"]))}</td></tr>
55 | <tr><td>Image URLs percentiles:</td><td>${sparkline.sparkify(data["imageurlspercentiles"])}<br/>
56 | ${str(data["imageurlspercentiles"])}</td></tr>
57 | <tr><td>Originals:</td><td>${"{:>9}".format(str(data["originalcount"]))} (${percentage(data["originalcount"], data["count"])})</td></tr>
58 | <tr><td>Retweets:</td><td>${"{:>9}".format(str(data["retweetcount"]))} (${percentage(data["retweetcount"], data["count"])})</td></tr>
59 | <tr><td>Quotes:</td><td>${"{:>9}".format(str(data["quotecount"]))} (${percentage(data["quotecount"], data["count"])})</td></tr>
60 | <tr><td>Replies:</td><td>${"{:>9}".format(str(data["replycount"]))} (${percentage(data["replycount"], data["count"])})</td></tr>
61 | <tr><td>Geo:</td><td>${"{:>9}".format(str(data["geocount"]))} (${percentage(data["geocount"], data["count"])})</td></tr>
62 | <tr><td>Earliest:</td><td>${str(data["earliest"])}</td></tr>
63 | <tr><td>Latest:</td><td>${str(data["latest"])}</td></tr>
64 | <tr><td>Duration:</td><td>${str(parser.parse(data["latest"]) - parser.parse(data["earliest"]))}</td></tr>
65 | <tr><td>Top users:</td><td>${sparkline.sparkify([u["value"] for u in data["topusers"]])}</td></tr>
66 | % for user in data["topusers"]:
67 | <tr><td>${user["value"]}</td><td>${user["name"]}</td></tr>
68 | % endfor
69 | <tr><td>Top hashtags:</td><td>${sparkline.sparkify([u["value"] for u in data["tophashtags"]])}</td></tr>
70 | % for hashtag in data["tophashtags"]:
71 | <tr><td>${hashtag["value"]}</td><td>${hashtag["name"]}</td></tr>
72 | % endfor
73 | <tr><td>Top URLs:</td><td>${sparkline.sparkify([u["value"] for u in data["topurls"]])}</td></tr>
74 | % for url in data["topurls"]:
75 | <tr><td>${url["value"]}</td><td><a href="${url["name"]}">${url["name"]}</a></td></tr>
76 | % endfor
77 | <tr><td>Top Image URLs:</td><td>${sparkline.sparkify([u["value"] for u in data["topimageurls"]])}<br/>
78 | (Note: Firefox may not display these images, because of content blocking. You can turn off Enhanced Tracking Protection for this domain to allow the thumbnails to load. The links to the images should still work even when the thumbnails don't.)</td></tr>
79 | % for imageurl in data["topimageurls"]:
80 | <tr><td>${imageurl["value"]}</td><td><a href="${imageurl["name"]}"><img src="${imageurl["name"]}" height="60"/></a></td></tr>
81 | % endfor
82 | </table>
83 | 
84 | </body>
85 | </html>
--------------------------------------------------------------------------------
/d3wordcloud.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import optparse
4 | import re
5 | import dateutil.parser
6 | from profiler import TimeProfiler
7 | from tzlocal import get_localzone  # needed for the "local" timezone option below
8 | import pytz
9 | import d3output
10 | 
11 | opt_parser = optparse.OptionParser()
12 | opt_parser.add_option("-t", "--timezone", type=str, default="",
13 |                       help="output timezone (e.g. 'America/New_York' or 'local'; default: UTC)")
14 | opt_parser.add_option("-w", "--maxwords", dest="maxwords", type="int",
15 |                       help="maximum number of words to display (default: 25)", default=25)
16 | opt_parser.add_option("-i", "--interval", dest="intervalStr", type="str",
17 |                       help="interval for grouping timestamps, in seconds, minutes or hours, e.g. 15M (default: 1H)",
18 |                       default="1H")
19 | opt_parser.add_option("-s", "--start", type=str, default=None,
20 |                       help="start date/time")
21 | opt_parser.add_option("-e", "--end", type=str, default=None,
22 |                       help="end date/time")
23 | opt_parser.add_option("-o", "--output", dest="output", type="str",
24 |                       help="html | csv | json (default: html)", default="html")
25 | opt_parser.add_option("-p", "--template", dest="template", type="str",
26 |                       help="name of template in utils/template (default: wordcloud.html)", default="wordcloud.html")
27 | 
28 | opts, args = opt_parser.parse_args()
29 | 
30 | tzname = opts.timezone
31 | # determine output time zone
32 | if tzname == "":
33 |     tz = pytz.UTC
34 | elif tzname == "local":
35 |     tz = get_localzone()  # system timezone, from tzlocal
36 | else:
37 |     tz = pytz.timezone(tzname)
38 | 
39 | maxwords = opts.maxwords
40 | intervalStr = opts.intervalStr
41 | output = opts.output
42 | 
43 | # pad partial date/time strings out to full timestamps
44 | start = opts.start
45 | end = opts.end
46 | if opts.start:
47 |     start = tz.localize(dateutil.parser.parse(start + "0001-01-01 00:00:00"[len(start):]))
48 | if opts.end:
49 |     end = tz.localize(dateutil.parser.parse(end + "9999-12-31 23:11:59"[len(end):]))
50 | 
51 | # from https://gist.github.com/uogbuji/705383
52 | # (ur'' literals are a syntax error in Python 3; a plain raw string works,
53 | # since re handles \x and \u escapes in str patterns)
54 | GRUBER_URLINTEXT_PAT = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
55 | 
56 | class WordcloudTimeProfiler(TimeProfiler):
57 |     def __init__(self, opts):
58 |         TimeProfiler.__init__(self, opts)
59 |         self.timeslices = {}
60 |         self.stop_words = set(line.strip().lower() for line in open("stopwords/stop-words_english_6_en.txt"))
61 | 
62 |     def process(self, tweet):
63 |         created_at = dateutil.parser.parse(tweet["created_at"])
64 |         if ((self.start is None) or (created_at >= self.start)) and ((self.end is None)
65 |                 or (created_at <= self.end)):
66 |             timeslice = TimeProfiler.process(self, tweet)
67 |             if not timeslice in self.timeslices:
68 |                 self.timeslices[timeslice] = {}
69 |             word_counts = self.timeslices[timeslice]
70 |             text = tweet["text"]
71 |             # remove hashtags and user names
72 |             text = re.sub(r"(^|[^\w])[@#]\w*", r"\g<1>", text)
73 |             # remove urls
74 |             text = re.sub(GRUBER_URLINTEXT_PAT, " ", text)
75 |             # trim punctuation next to space (pass flags=, not the count argument)
76 |             text = re.sub(r"[^\w\s]+(\s|$)|(^|\s)[^\w\s]+", " ", text, flags=re.UNICODE)
77 |             # replace internal punctuation, except apostrophes
78 |             text = re.sub(r"[^\w\s\']", " ", text, flags=re.UNICODE)
79 |             for word in text.split():
80 |                 word = word.lower()
81 |                 if len(word) < 3: continue
82 |                 if len(word) > 15: continue
83 |                 if word in self.stop_words: continue
84 |                 if word.startswith("rt"): continue
85 |                 if not re.match("^[a-z]", word, re.IGNORECASE): continue
86 |                 # remove final 's
87 |                 word = re.sub(r"'s$", "", word)
88 |                 if len(word) > 0:
89 |                     word_counts[word] = word_counts.get(word, 0) + 1
90 | 
91 |     def report(self):
92 |         data = TimeProfiler.report(self)
93 |         data["profile"]["start"] = str(self.start)
94 |         data["profile"]["end"] = str(self.end)
95 |         for value in data["values"]:
96 |             thisslice = self.timeslices[value["name"]]
97 |             # sort words by descending count (dict views can't be sorted in
98 |             # place, and cmp() is gone in Python 3)
99 |             sorted_words = sorted(thisslice, key=thisslice.get, reverse=True)
100 |             top_words = sorted_words[0:maxwords]
101 |             words = []
102 |             for word in top_words:
103 |                 words.append({
104 |                     "text": word,
105 |                     "count": thisslice[word]
106 |                 })
107 |             value["words"] = words
108 |         return data
109 | 
110 | profiler = WordcloudTimeProfiler({
111 |     "tz": tz,
112 |     "output": "json",
113 |     "aggregate": True,
114 |     "intervalStr": intervalStr,
115 |     "start": start,
116 |     "end": end})
117 | 
118 | profiler.gettweets(opts, args)
119 | 
120 | data = profiler.report()
121 | 
122 | if opts.output == "html":
123 |     d3output.embed(opts.template, data)
124 | else:
125 |     print(data)
--------------------------------------------------------------------------------
/d3cotags.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import json
4 | import itertools
5 | import optparse
6 | import d3output # local module
7 | from profiler import Profiler # local module
8 | from profiler import LinkNodesProfiler # local module
9 | from collections import Counter
10 | 
11 | class CotagsProfiler(LinkNodesProfiler):
12 |     def __init__(self, opts):
13 |         LinkNodesProfiler.__init__(self, opts)
14 |         self.savetweets = []
15 |         self.counts = Counter()
16 |         self.keepers = set()
17 | 
18 |     def process(self, tweet):
19 |         Profiler.process(self, tweet)
20 |         # gather a list of the tags in this tweet, lowercased
21 |         savetweet = []
22 |         for tag in tweet["entities"]["hashtags"]:
23 |             t = tag["text"].lower()
24 |             savetweet.append(t)
25 |             # and increment count for this tag
26 |             self.counts[t] += 1
27 |         # add tag list to savetweets
28 |         self.savetweets.append(savetweet)
29 | 
30 |     def report(self):
31 |         # for tags below the threshold, replace with "-OTHER"
32 |         # which is not necessary if threshold is 0
33 |         if self.threshold > 0:
34 |             # copy the keys so we can delete from the Counter while iterating
35 |             countkeys = list(self.counts.keys())
36 |             for countkey in countkeys:
37 |                 if self.counts[countkey] < self.threshold:
38 |                     # for a tag whose count is below the threshold, transfer its
39 |                     # count to tag "-OTHER" and delete it
40 |                     if self.keepother:
41 |                         self.counts["-OTHER"] += self.counts[countkey]
42 |                     del self.counts[countkey]
43 |                 else:
44 |                     # otherwise add it to the list of keepers
45 |                     self.keepers.add(countkey)
46 |             if self.keepother:
47 |                 self.keepers.add("-OTHER")
48 |         # keepers now has a complete set of surviving tags
49 | 
50 |         # now process hashtags in tweets again, replacing any tag not in keepers with -OTHER
51 |         self.counts = Counter()
52 |         for savetweet in self.savetweets:
53 | 
54 |             # cleantags gathers unique, lower-cased tags for this tweet
55 |             cleantags = set()
56 | 
57 |             for tag in savetweet:
58 |                 if self.threshold == 0 or tag in self.keepers:
59 |                     cleantags.add(tag)
60 |                 else:
61 |                     if self.keepother:
62 |                         cleantags.add("-OTHER")
63 | 
64 |             # sort tags and remove tags that are in the exclude set
65 |             cleantags = sorted(cleantags.difference(self.exclude))
66 | 
67 |             # generate all pairs
 67
| for c in itertools.combinations(cleantags, 2): 68 | self.addlink(c[0], c[1]) 69 | if self.reciprocal: 70 | self.addlink(c[1], c[0]) 71 | 72 | # if this tag is the only one we're including from this tweet, 73 | # then there won't be any combinations, and so it won't have 74 | # been added to self.nodes by addlink: so add it. 75 | 76 | # add to tweet count for this tag 77 | for tag in cleantags: 78 | if tag in self.nodes: 79 | self.nodes[tag]["tweetcount"] += 1 80 | else: 81 | self.addsingle(tag) 82 | 83 | data = LinkNodesProfiler.report(self) 84 | return data; 85 | 86 | 87 | opt_parser = optparse.OptionParser() 88 | opt_parser.add_option("-o", "--output", dest="output", type="str", 89 | help="html | json (default: html)", default="html") 90 | opt_parser.add_option("-e", "--exclude", type=str, default="", 91 | help="comma-separated list of hashtags to exclude") 92 | opt_parser.add_option("-t", "--threshold", type=int, default=0, 93 | help="threshold below which to treat hashtags as 'other'") 94 | opt_parser.add_option("-r", "--reciprocal", action="store_true", default=False, 95 | help="add reciprocal links for each pair") 96 | opt_parser.add_option("-p", "--template", dest="template", type="str", 97 | help="name of template in utils/template (default: graph.html)", default="graph.html") 98 | opt_parser.add_option("-k", "--keepother", action="store_true", default=False, 99 | help="include -OTHER tag in output for tags below threshold") 100 | 101 | opts, args = opt_parser.parse_args() 102 | 103 | threshold = opts.threshold 104 | exclude = set(opts.exclude.lower().split(",")) 105 | reciprocal = opts.reciprocal 106 | keepother = opts.keepother 107 | output = opts.output 108 | 109 | profiler = CotagsProfiler({ 110 | "threshold": threshold, 111 | "exclude": exclude, 112 | "reciprocal": reciprocal, 113 | "keepother": keepother, 114 | "graph": "undirected", 115 | "field": "hashtag"}) 116 | 117 | profiler.gettweets(opts, args) 118 | 119 | data = profiler.report() 120 | 121 | profile = data["profile"] 122 | nodes = data["nodes"] 123 | 124 | if output == "csv": 125 | print(d3output.nodeslinkcsv(nodes)) 126 | elif output == "json": 127 | values = d3output.nodeslinktrees(profile, nodes) 128 | print({"profile": profile, "values": values}) 129 | elif output == "html": 130 | print(d3output.embed(opts.template, d3output.nodeslinktrees(profile, nodes))) 131 | 132 | 133 | -------------------------------------------------------------------------------- /templates/wordcloud.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 12 | 168 | 169 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 
9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 
114 | 115 | For more information, please see 116 | 117 | -------------------------------------------------------------------------------- /stopwords/stop-words_english_6_en.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | abst 6 | accordance 7 | according 8 | accordingly 9 | across 10 | act 11 | actually 12 | added 13 | adj 14 | adopted 15 | affected 16 | affecting 17 | affects 18 | after 19 | afterwards 20 | again 21 | against 22 | ah 23 | all 24 | almost 25 | alone 26 | along 27 | already 28 | also 29 | although 30 | always 31 | am 32 | among 33 | amongst 34 | amp 35 | an 36 | and 37 | announce 38 | another 39 | any 40 | anybody 41 | anyhow 42 | anymore 43 | anyone 44 | anything 45 | anyway 46 | anyways 47 | anywhere 48 | apparently 49 | approximately 50 | are 51 | aren 52 | arent 53 | arise 54 | around 55 | as 56 | aside 57 | ask 58 | asking 59 | at 60 | auth 61 | available 62 | away 63 | awfully 64 | b 65 | back 66 | be 67 | became 68 | because 69 | become 70 | becomes 71 | becoming 72 | been 73 | before 74 | beforehand 75 | begin 76 | beginning 77 | beginnings 78 | begins 79 | behind 80 | being 81 | believe 82 | below 83 | beside 84 | besides 85 | between 86 | beyond 87 | biol 88 | both 89 | brief 90 | briefly 91 | but 92 | by 93 | c 94 | ca 95 | came 96 | can 97 | cannot 98 | can't 99 | cause 100 | causes 101 | certain 102 | certainly 103 | co 104 | com 105 | come 106 | comes 107 | contain 108 | containing 109 | contains 110 | could 111 | couldn't 112 | couldn 113 | could've 114 | d 115 | date 116 | did 117 | didn't 118 | didn 119 | different 120 | do 121 | does 122 | doesn't 123 | doesn 124 | doing 125 | done 126 | don't 127 | don 128 | down 129 | downwards 130 | due 131 | during 132 | e 133 | each 134 | ed 135 | edu 136 | effect 137 | eg 138 | eight 139 | eighty 140 | either 141 | else 142 | elsewhere 143 | end 144 | ending 145 | enough 146 | especially 147 | et 148 | et-al 149 | etc 150 | even 151 | ever 152 | every 153 | everybody 154 | everyone 155 | everything 156 | everywhere 157 | ex 158 | except 159 | f 160 | far 161 | few 162 | ff 163 | fifth 164 | first 165 | five 166 | fix 167 | followed 168 | following 169 | follows 170 | for 171 | former 172 | formerly 173 | forth 174 | found 175 | four 176 | from 177 | further 178 | furthermore 179 | g 180 | gave 181 | get 182 | gets 183 | getting 184 | give 185 | given 186 | gives 187 | giving 188 | go 189 | goes 190 | gone 191 | got 192 | gotten 193 | h 194 | had 195 | happens 196 | hardly 197 | has 198 | hasn't 199 | hasn 200 | have 201 | haven't 202 | haven 203 | having 204 | he 205 | he'll 206 | he'd 207 | hence 208 | her 209 | here 210 | hereafter 211 | hereby 212 | herein 213 | here's 214 | hereupon 215 | hers 216 | herself 217 | he's 218 | hi 219 | hid 220 | him 221 | himself 222 | his 223 | hither 224 | home 225 | how 226 | howbeit 227 | however 228 | hundred 229 | i 230 | i'd 231 | ie 232 | if 233 | i'll 234 | i'm 235 | immediate 236 | immediately 237 | importance 238 | important 239 | in 240 | inc 241 | indeed 242 | index 243 | information 244 | instead 245 | into 246 | invention 247 | inward 248 | is 249 | isn't 250 | isn 251 | it 252 | it'd 253 | it'll 254 | its 255 | it's 256 | itself 257 | i've 258 | j 259 | just 260 | k 261 | keep 262 | keeps 263 | kept 264 | keys 265 | kg 266 | km 267 | know 268 | known 269 | knows 270 | l 271 | largely 272 | last 273 | lately 274 | later 275 | latter 276 | latterly 277 | least 278 | less 279 | lest 280 | let 
281 | lets 282 | let's 283 | like 284 | liked 285 | likely 286 | line 287 | little 288 | 'll 289 | look 290 | looking 291 | looks 292 | ltd 293 | m 294 | made 295 | mainly 296 | make 297 | makes 298 | many 299 | may 300 | maybe 301 | me 302 | mean 303 | means 304 | meantime 305 | meanwhile 306 | merely 307 | mg 308 | might 309 | million 310 | miss 311 | ml 312 | more 313 | moreover 314 | most 315 | mostly 316 | mr 317 | mrs 318 | much 319 | mug 320 | must 321 | my 322 | myself 323 | n 324 | na 325 | name 326 | namely 327 | nay 328 | nd 329 | near 330 | nearly 331 | necessarily 332 | necessary 333 | need 334 | needs 335 | neither 336 | never 337 | nevertheless 338 | new 339 | next 340 | nine 341 | ninety 342 | no 343 | nobody 344 | non 345 | none 346 | nonetheless 347 | noone 348 | nor 349 | normally 350 | nos 351 | not 352 | noted 353 | nothing 354 | now 355 | nowhere 356 | o 357 | obtain 358 | obtained 359 | obviously 360 | of 361 | off 362 | often 363 | oh 364 | ok 365 | okay 366 | old 367 | omitted 368 | on 369 | once 370 | one 371 | ones 372 | only 373 | onto 374 | or 375 | ord 376 | other 377 | others 378 | otherwise 379 | ought 380 | our 381 | ours 382 | ourselves 383 | out 384 | outside 385 | over 386 | overall 387 | owing 388 | own 389 | p 390 | page 391 | pages 392 | part 393 | particular 394 | particularly 395 | past 396 | per 397 | perhaps 398 | placed 399 | please 400 | plus 401 | poorly 402 | possible 403 | possibly 404 | potentially 405 | pp 406 | predominantly 407 | present 408 | previously 409 | primarily 410 | probably 411 | promptly 412 | proud 413 | provides 414 | put 415 | q 416 | que 417 | quickly 418 | quite 419 | qv 420 | r 421 | ran 422 | rather 423 | rd 424 | re 425 | readily 426 | really 427 | recent 428 | recently 429 | ref 430 | refs 431 | regarding 432 | regardless 433 | regards 434 | related 435 | relatively 436 | research 437 | respectively 438 | resulted 439 | resulting 440 | results 441 | right 442 | run 443 | s 444 | said 445 | same 446 | saw 447 | say 448 | saying 449 | says 450 | sec 451 | section 452 | see 453 | seeing 454 | seem 455 | seemed 456 | seeming 457 | seems 458 | seen 459 | self 460 | selves 461 | sent 462 | seven 463 | several 464 | shall 465 | she 466 | she'd 467 | she'll 468 | she's 469 | should 470 | shouldn't 471 | shouldn 472 | show 473 | showed 474 | shown 475 | shows 476 | significant 477 | significantly 478 | similar 479 | similarly 480 | since 481 | six 482 | slightly 483 | so 484 | some 485 | somebody 486 | somehow 487 | someone 488 | something 489 | sometime 490 | sometimes 491 | somewhat 492 | somewhere 493 | soon 494 | sorry 495 | specifically 496 | specified 497 | specify 498 | specifying 499 | state 500 | states 501 | still 502 | stop 503 | strongly 504 | sub 505 | substantially 506 | successfully 507 | such 508 | sufficiently 509 | suggest 510 | sup 511 | sure 512 | t 513 | take 514 | taken 515 | taking 516 | tell 517 | tends 518 | th 519 | than 520 | thank 521 | thanks 522 | thanx 523 | that 524 | that'll 525 | that's 526 | that've 527 | the 528 | their 529 | theirs 530 | them 531 | themselves 532 | then 533 | thence 534 | there 535 | thereafter 536 | thereby 537 | there'd 538 | therefore 539 | therein 540 | there'll 541 | thereof 542 | there're 543 | there's 544 | thereto 545 | thereupon 546 | there've 547 | these 548 | they 549 | they'd 550 | they'll 551 | they're 552 | they've 553 | think 554 | this 555 | those 556 | thou 557 | though 558 | thousand 559 | throug 560 | through 561 | throughout 562 | thru 563 | thus 564 | 
til 565 | tip 566 | to 567 | together 568 | too 569 | took 570 | toward 571 | towards 572 | tried 573 | tries 574 | truly 575 | try 576 | trying 577 | ts 578 | twice 579 | two 580 | u 581 | un 582 | under 583 | unfortunately 584 | unless 585 | unlike 586 | unlikely 587 | until 588 | unto 589 | up 590 | upon 591 | ups 592 | us 593 | use 594 | used 595 | useful 596 | usefully 597 | usefulness 598 | uses 599 | using 600 | usually 601 | v 602 | value 603 | various 604 | 've 605 | very 606 | via 607 | viz 608 | vol 609 | vols 610 | vs 611 | w 612 | want 613 | wants 614 | was 615 | wasn't 616 | wasn 617 | way 618 | we 619 | wed 620 | welcome 621 | we'll 622 | went 623 | were 624 | we're 625 | weren't 626 | weren 627 | we've 628 | what 629 | whatever 630 | what'll 631 | whats 632 | when 633 | whence 634 | whenever 635 | where 636 | whereafter 637 | whereas 638 | whereby 639 | wherein 640 | where's 641 | whereupon 642 | wherever 643 | whether 644 | which 645 | while 646 | whim 647 | whither 648 | who 649 | who'd 650 | whoever 651 | whole 652 | who'll 653 | whom 654 | whomever 655 | whos 656 | whose 657 | why 658 | widely 659 | will 660 | willing 661 | wish 662 | with 663 | within 664 | without 665 | won't 666 | words 667 | world 668 | would 669 | wouldn 670 | wouldn't 671 | would've 672 | www 673 | x 674 | y 675 | yes 676 | yet 677 | you 678 | you'd 679 | you'll 680 | your 681 | you're 682 | yours 683 | yourself 684 | yourselves 685 | you've 686 | z 687 | zero 688 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # twarc-report 2 | Data conversions and examples for generating reports from [twarc](https://github.com/DocNow/twarc) collections using tools such as D3.js 3 | 4 | - [Requirements](#user-content-requirements) 5 | - [Getting Started](#user-content-getting-started) 6 | - [Recommended Directory Structure](#user-content-recommended-directory-structure) 7 | - [Harvest](#user-content-harvest) 8 | - [Profile](#user-content-profile) 9 | - [D3 Visualizations](#user-content-d3-visualizations) 10 | - [Exploring D3 Examples](#user-content-exploring-d3-examples) 11 | - [Adding Scripts](#user-content-adding-scripts) 12 | - [License](#user-content-license) 13 | 14 | These utilities accept a Twitter json file (as fetched by twarc), 15 | analyze it various ways, and output a json or csv file. The initial 16 | purpose is to feed data into D3.js for various visualizations, but the 17 | intention is to make the outputs generic enough to serve other uses as 18 | well. Each utility has a D3 example template, which it can use to 19 | generate a self-contained html file. It can also generate csv or json 20 | output, and there is a [worked example](#user-content-exploring-d3-examples) of how 21 | to use csv in a pre-existing D3 chart. 22 | 23 | The d3graph.py utility was originally added to the twarc repo as 24 | directed.py but is moving here for consistency. 25 | 26 | ## Requirements 27 | 28 | All requirements may be installed with `pip install -r requirements.txt` 29 | 30 | * dateutil - `python-dateutil` 31 | * pytz - `pip install pytz` 32 | * tzlocal - `pip install tzlocal` 33 | * pysparklines - `pip install pysparklines` 34 | * requests_oauthlib - `pip install requests_oauthlib` 35 | 36 | Install `twarc` according to its instructions, i.e. with `pip install twarc`. 37 | Run `twarc.py` once so 38 | that it can ask for your access token etc. (see twarc's readme). 
Make sure that `twarc-archive.py`
39 | is on the system path.
40 | 
41 | ## Getting Started
42 | 
43 | - clone twarc-report to a local directory with your favorite Git client
44 | - install the requirements and twarc, as above (`generate.sh` also expects a twarc checkout in a `twarc` subdirectory, for its utility scripts)
45 | - create a `projects` subdirectory under twarc-report
46 | - create a project directory under `projects`, named appropriately
47 | - in the project directory create `metadata.json` and fill in the search you want to track
48 | - in twarc-report, run `./harvest.py projects/[yourproject]` to harvest your tweets (this may take some time - hours or days for very large searches)
49 | - run `./reportprofile.py projects/[yourproject]` to see a summary of your harvest
50 | - run other scripts to generate various visualizations (see below)
51 | - run `./harvest.py projects/[yourproject]` whenever you want to update your harvest.
52 | 
53 | Note that only tweets from the last 7 days or so are available from Twitter at
54 | any given time, so be sure to update your harvest accordingly to avoid gaps.
55 | 
56 | ## Recommended Directory Structure
57 | 
58 | ```
59 | twarc-report/          # local clone
60 |   projects/
61 |     assets/            # copy of twarc-report/assets/
62 |     projectA/
63 |       data/            # created by harvest.py
64 |         tweets/        # populated with tweet*.json files by harvest.py
65 |       metadata.json
66 |       timeline.html    # generated by a twarc-report script
67 |       ...
68 |     projectB/
69 |       ...
70 | ```
71 | 
72 | Metadata about the project, including the search query, is kept in
73 | `metadata.json`. This file is created by the user and should be in
74 | this form:
75 | 
76 | ```
77 | {"search": "#ferguson",
78 | "title": "Ferguson Tweets",
79 | "creator": "Peter Binkley"}
80 | ```
81 | 
82 | (Currently only the `search` value is used but other metadata fields will
83 | be used to populate HTML output in future releases.)
84 | 
85 | The harvested tweets
86 | and other source data are stored in the `data` subdirectory, with the
87 | tweets going into the `tweets` directory. These directories are created by
88 | `harvest.py` if they don't exist.
89 | 
90 | Generated HTML files use relative paths like `../assets/d3.vs.min.js` to call
91 | shared libraries from the `assets` directory. They can be created in
92 | the project directories (`ProjectA` etc.). This
93 | allows you to publish the output by syncing the project and assets
94 | directories to a web server while excluding the `data` subdirectory. You
95 | can also run python's SimpleHTTPServer in the `projects` directory to
96 | load examples you've created in the project directories:
97 | 
98 | ```
99 | python -m SimpleHTTPServer 8000
100 | ```
101 | 
102 | And then visit e.g. `http://localhost:8000/ProjectA/projectA-timebar.html`.
103 | 
104 | ## Harvest
105 | 
106 | The script `harvest.py` uses twarc's `twarc-archive.py` to start or update a harvest for a given
107 | search, storing the results in a given directory. The directory path is passed as the only parameter:
108 | 
109 | ```
110 | ./harvest.py projects/ProjectA
111 | ```
112 | 
113 | The search is read from the `metadata.json` file, and tweets are stored
114 | in `data/tweets`.
115 | 
116 | ## Profile
117 | 
118 | Running `reportprofile.py` on a tweet collection with the flag `-o text` will generate a summary
119 | profile of the collection, with some basic stats (number of tweets, retweets, users, etc.) and some
120 | possibly interesting sparklines.
121 | 
122 | ```
123 | Count: 25100
124 | Users: 5779
125 | User percentiles: █▂▁▁▁▁▁▁▁▁
126 | [62, 12, 6, 5, 3, 2, 2, 2, 2, 2]
127 | ```
128 | 
129 | That indicates that the top 10 percent of users accounted for 62% of the tweets, while the bottom
130 | 10% accounted for 2% of the tweets. This gives a quick sense of whether the collection is
131 | dominated by a few voices or has broad participation. The profile also includes the top 10 users
132 | and top 10 shared URLs, with similar sparklines.
133 | 
134 | Note: the sparklines are generated by [pysparklines](https://pypi.python.org/pypi/pysparklines),
135 | using Unicode block characters. If they have an
136 | uneven baseline, it's the fault of the font. On a Mac, I find that Menlo Regular gives a
137 | good presentation in the terminal.
138 | 
139 | ## D3 visualizations
140 | 
141 | Some utilities to generate [D3.js](https://d3js.org/) visualizations of aspects of a collection
142 | of tweets are provided. Use "--output=json" or "--output=csv" to output the data for use with
143 | other D3 examples, or "--help" for other options.
144 | 
145 | ### d3graph.py
146 | 
147 | A directed graph of mentions or retweets, in which nodes are users and
148 | arrows point from the original user to the user who mentions or retweets
149 | them:
150 | 
151 |     % d3graph.py --mode mentions projects/nasa > projects/nasa/nasa-directed-mentions.html
152 |     % d3graph.py --mode retweets projects/nasa > projects/nasa/nasa-directed-retweets.html
153 |     % d3graph.py --mode replies projects/nasa > projects/nasa/nasa-directed-replies.html
154 | 
155 | ### d3cotags.py
156 | 
157 | An undirected graph of co-occurring hashtags:
158 | 
159 |     % d3cotags.py projects/nasa > projects/nasa/nasa-cotags.html
160 | 
161 | A threshold can be specified with "-t": hashtags whose number of
162 | occurrences falls below this will not be linked. Instead, if "-k" is set,
163 | they will be replaced with the pseudo-hashtag "-OTHER". Hashtags can be
164 | excluded with "-e" (takes a comma-delimited list). If the tweets were
165 | harvested by a search for a single hashtag then it's a good idea to
166 | exclude that tag, since every other tag will link to it.
167 | 
168 | ### d3times.py
169 | 
170 | A bar chart timeline with arbitrary intervals, here five minutes:
171 | 
172 |     % d3times.py -a -t local -i 5M projects/nasa > projects/nasa/nasa-timebargraph.html
173 | 
174 | [Examples](https://www.wallandbinkley.com/twarc/bill10/)
175 | 
176 | The output timezone is specified by "-t"; the interval is specified by "-i",
177 | using the [standard abbreviations](https://docs.python.org/2/library/time.html#time.strftime):
178 | seconds = S, minutes = M, hours = H, days = d, months = m, years = Y.
179 | 
180 | The example above uses five-minute intervals. Output may be aggregated
181 | using "-a": each row has a time value and a count. Note that if you are
182 | generating the html example, you must use "-a".
183 | 
184 | ### d3wordcloud.py
185 | 
186 | An animated wordcloud, in which words are added and removed according to
187 | changes in frequency over time:
188 | 
189 |     % d3wordcloud.py -t local -i 1H projects/nasa > projects/nasa/nasa-wordcloud.html
190 | 
191 | [Example](https://www.wallandbinkley.com/twarc/c4l15/animatedwordcloud.html)
192 | 
193 | The optional "-t" controls the timezone and "-i" the interval, as in `d3times.py`. Start and end
194 | timestamps may be set with "-s" and "-e".
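Both `d3times.py` and `d3wordcloud.py` group tweets by flooring each timestamp to the start of its interval before counting. A conceptual sketch of that grouping follows; it is an illustration only, not the actual `TimeProfiler` logic, and the timestamps are made up:

```python
# Sketch of interval bucketing as used by the -i option (illustration only;
# the real grouping lives in profiler.TimeProfiler).
from datetime import datetime

def floor_to_interval(dt, minutes):
    # floor a datetime to the start of its N-minute bucket
    bucket = (dt.minute // minutes) * minutes
    return dt.replace(minute=bucket, second=0, microsecond=0)

counts = {}
for ts in ["2019-05-01 10:03:12", "2019-05-01 10:04:59", "2019-05-01 10:07:30"]:
    dt = datetime.strptime(ts, "%Y-%m-%d %H:%M:%S")
    key = floor_to_interval(dt, 5)  # -i 5M
    counts[key] = counts.get(key, 0) + 1

for key in sorted(counts):
    print(key.strftime("%Y-%m-%d %H:%M"), counts[key])
# -> the 10:00 bucket has 2 tweets, the 10:05 bucket has 1
```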
`d3wordcloud.py` calls a [fork](https://github.com/pbinkley/d3-cloud) of Jason
Davies' [d3-cloud](https://github.com/jasondavies/d3-cloud) project. The
forked version attempts to keep the carried-over words close to their previous
positions during transitions.

## Exploring D3 Examples

The json and csv outputs can be used to view your data in D3 example
visualizations with minimal fuss. There are many examples to be explored;
Mike Bostock's [Gallery](https://github.com/mbostock/d3/wiki/Gallery) is a
good place to start. Here's a worked example, using Bostock's [Zoomable
Timeline Area
Chart](https://mbostock.github.io/d3/talk/20111018/area-gradient.html).
It assumes no knowledge of D3.

First, look at the data input. In line 137, the example loads a csv file:

    d3.csv("flights-departed.csv", function(data) {

The [csv file](https://mbostock.github.io/d3/talk/20111018/flights-departed.csv) looks like this:

    date,value
    1988-01-01,12681
    ...

We can easily generate a csv file that matches that format:

    % ./d3times.py -a -i 1d -o csv projects/ProjectA

(i.e. aggregate, one-day interval, csv output). We then just need to edit the
output to make the column headers match the original csv, i.e. change them to
"date,value".

We also need to check the way the example loads scripts and css assets,
especially the D3 library: the `<script>` and `<link>` elements near the top
of the example's source expect local copies. Either change those links to
point to the original location, or save local copies. (Note that if you're
going to put your example online you'll want local copies of the scripts,
since the [same-origin policy](https://en.wikipedia.org/wiki/Same-origin_policy)
may prevent them from being loaded from the source.)

Once you've matched your data to the example and made sure it can load the
D3.js library, the example may work. In this case it doesn't: it shows an
empty chart. The title "U.S. Commercial Flights, 1999-2001" and the
horizontal scale explain why: it expects dates within a certain (pre-Twitter)
range, and the x domain is hard-coded accordingly. The setting is easy to
find, in line 146:

    x.domain([new Date(1999, 0, 1), new Date(2003, 0, 0)]);

Change those dates to include the date range of your data, and the example
should work. Don't worry about matching your dates closely: the chart is
zoomable, after all. Alternatively, you could borrow a snippet from the
template timebar.html to set the domain to match the earliest and latest
dates in your data:

```
x.domain([
  d3.min(values, function(d) {return d.name}),
  d3.max(values, function(d) {return d.name})
]);
```

A typical twarc harvest gets you a few days' worth of tweets, so the
day-level display of this example probably isn't very interesting. We're not
bound by the time format of the example, however. We can see it in line 63:

    parse = d3.time.format("%Y-%m-%d").parse,

We can change that to parse at the minute level, "%Y-%m-%d %H:%M", and
generate our csv at the same interval with "-i 1M". With those changes we can
zoom in until bars represent a minute's worth of tweets.
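Putting the two edits together, the relevant lines of the example end up
looking something like this (the dates here are hypothetical placeholders;
substitute your own harvest's range):

    // line 63: parse timestamps at minute resolution
    parse = d3.time.format("%Y-%m-%d %H:%M").parse,
    ...
    // line 146: widen the hard-coded x domain to cover your data
    x.domain([new Date(2015, 6, 1), new Date(2015, 6, 4)]);

The matching csv is regenerated with `./d3times.py -a -i 1M -o csv
projects/ProjectA`, with its header changed to "date,value" as before.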
This example doesn't work perfectly: I see some odd artifacts around the
bottom of the chart, as if the baseline were slightly above the x axis and
small values were presented as negative. And it doesn't render in Chrome at
all (Firefox and Safari are fine). The example is from 2011 and uses an older
version of the D3 library, and with some tinkering it could probably be
updated and made fully functional. It serves to demonstrate, though, that
only small changes, and no knowledge of the complexities of D3, are needed to
fit your data into an existing D3 example.

## Adding Scripts

The heart of twarc-report is the `Profiler` class in `profiler.py`. The
scripts pass json records from the twarc harvests to this class, and it
tabulates some basic properties: number of tweets and authors, earliest and
latest timestamp, etc. Each script defines its own profiler that inherits
from this class and processes whatever extra fields that script needs. To add
a new script, start by working out its profiler class: collect the data it
needs from each tweet in the process() method, and organize the output in the
report() method.

The various output formats are generated by functions in `d3output.py`.

License
-------

* CC0

-------------------------------------------------------------------------------- /assets/d3.layout.cloud.js: --------------------------------------------------------------------------------
1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/
2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf
3 | 
4 | var previouswords;
5 | 
6 | (function() {
7 |   function cloud() {
8 |     var size = [256, 256],
9 |         text = cloudText,
10 |         font = cloudFont,
11 |         fontSize = cloudFontSize,
12 |         fontStyle = cloudFontNormal,
13 |         fontWeight = cloudFontNormal,
14 |         rotate = cloudRotate,
15 |         padding = cloudPadding,
16 |         previousword = cloudPreviousword,
17 |         spiral = archimedeanSpiral,
18 |         words = [],
19 |         timeInterval = Infinity,
20 |         event = d3.dispatch("word", "end"),
21 |         timer = null,
22 |         cloud = {};
23 | 
24 |     cloud.start = function() {
25 |       var board = zeroArray((size[0] >> 5) * size[1]),
26 |           bounds = null,
27 |           n = words.length,
28 |           i = -1,
29 |           tags = [],
30 |           data = words.map(function(d, i) {
31 |             d.text = text.call(this, d, i);
32 |             d.font = font.call(this, d, i);
33 |             d.style = fontStyle.call(this, d, i);
34 |             d.weight = fontWeight.call(this, d, i);
35 |             d.rotate = rotate.call(this, d, i);
36 |             d.size = ~~fontSize.call(this, d, i);
37 |             d.padding = padding.call(this, d, i);
38 |             d.previousword = previousword.call(this, d, i);
39 |             return d;
40 |           })
41 |           // sort update words to insert first
42 |           .sort(function(a, b) {
43 |             // the expression must start on the return line: a bare "return" triggers ASI and returns undefined
44 |             return (a.previousword && b.previousword) ? b.previousword.size - a.previousword.size :
45 |               (a.previousword && !b.previousword) ? -1 :
46 |               (!a.previousword && b.previousword) ?
1 : 47 | b.size - a.size; 48 | }); 49 | 50 | if (timer) clearInterval(timer); 51 | timer = setInterval(step, 0); 52 | step(); 53 | previouswords = words; 54 | return cloud; 55 | 56 | function step() { 57 | var start = +new Date, 58 | d; 59 | while (+new Date - start < timeInterval && ++i < n && timer) { 60 | d = data[i]; 61 | // look for word in previouswords; if it's there, use d.x and d.y 62 | if (d.previousword) { 63 | d.x = (size[0] + d.previousword.x) >> 1; 64 | d.y = (size[1] + d.previousword.y) >> 1; 65 | } 66 | else { 67 | d.x = (size[0] * (Math.random() + .5)) >> 1; 68 | d.y = (size[1] * (Math.random() + .5)) >> 1; 69 | } 70 | cloudSprite(d, data, i); 71 | if (d.hasText && place(board, d, bounds)) { 72 | tags.push(d); 73 | event.word(d); 74 | if (bounds) cloudBounds(bounds, d); 75 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}]; 76 | // Temporary hack 77 | d.x -= size[0] >> 1; 78 | d.y -= size[1] >> 1; 79 | } 80 | } 81 | if (i >= n) { 82 | cloud.stop(); 83 | event.end(tags, bounds); 84 | } 85 | } 86 | } 87 | 88 | cloud.stop = function() { 89 | if (timer) { 90 | clearInterval(timer); 91 | timer = null; 92 | } 93 | return cloud; 94 | }; 95 | 96 | cloud.timeInterval = function(x) { 97 | if (!arguments.length) return timeInterval; 98 | timeInterval = x == null ? Infinity : x; 99 | return cloud; 100 | }; 101 | 102 | function place(board, tag, bounds) { 103 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}], 104 | startX = tag.x, 105 | startY = tag.y, 106 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]), 107 | s = spiral(size), 108 | dt = Math.random() < .5 ? 1 : -1, 109 | t = -dt, 110 | dxdy, 111 | dx, 112 | dy; 113 | 114 | while (dxdy = s(t += dt)) { 115 | dx = ~~dxdy[0]; 116 | dy = ~~dxdy[1]; 117 | 118 | if (Math.min(dx, dy) > maxDelta) break; 119 | 120 | tag.x = startX + dx; 121 | tag.y = startY + dy; 122 | 123 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 || 124 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue; 125 | // TODO only check for collisions within current bounds. 126 | if (!bounds || !cloudCollide(tag, board, size[0])) { 127 | if (!bounds || collideRects(tag, bounds)) { 128 | var sprite = tag.sprite, 129 | w = tag.width >> 5, 130 | sw = size[0] >> 5, 131 | lx = tag.x - (w << 4), 132 | sx = lx & 0x7f, 133 | msx = 32 - sx, 134 | h = tag.y1 - tag.y0, 135 | x = (tag.y + tag.y0) * sw + (lx >> 5), 136 | last; 137 | for (var j = 0; j < h; j++) { 138 | last = 0; 139 | for (var i = 0; i <= w; i++) { 140 | board[x + i] |= (last << msx) | (i < w ? 
(last = sprite[j * w + i]) >>> sx : 0); 141 | } 142 | x += sw; 143 | } 144 | delete tag.sprite; 145 | return true; 146 | } 147 | } 148 | } 149 | return false; 150 | } 151 | 152 | cloud.words = function(x) { 153 | if (!arguments.length) return words; 154 | words = x; 155 | return cloud; 156 | }; 157 | 158 | cloud.size = function(x) { 159 | if (!arguments.length) return size; 160 | size = [+x[0], +x[1]]; 161 | return cloud; 162 | }; 163 | 164 | cloud.font = function(x) { 165 | if (!arguments.length) return font; 166 | font = d3.functor(x); 167 | return cloud; 168 | }; 169 | 170 | cloud.fontStyle = function(x) { 171 | if (!arguments.length) return fontStyle; 172 | fontStyle = d3.functor(x); 173 | return cloud; 174 | }; 175 | 176 | cloud.fontWeight = function(x) { 177 | if (!arguments.length) return fontWeight; 178 | fontWeight = d3.functor(x); 179 | return cloud; 180 | }; 181 | 182 | cloud.rotate = function(x) { 183 | if (!arguments.length) return rotate; 184 | rotate = d3.functor(x); 185 | return cloud; 186 | }; 187 | 188 | cloud.text = function(x) { 189 | if (!arguments.length) return text; 190 | text = d3.functor(x); 191 | return cloud; 192 | }; 193 | 194 | cloud.spiral = function(x) { 195 | if (!arguments.length) return spiral; 196 | spiral = spirals[x + ""] || x; 197 | return cloud; 198 | }; 199 | 200 | cloud.fontSize = function(x) { 201 | if (!arguments.length) return fontSize; 202 | fontSize = d3.functor(x); 203 | return cloud; 204 | }; 205 | 206 | cloud.padding = function(x) { 207 | if (!arguments.length) return padding; 208 | padding = d3.functor(x); 209 | return cloud; 210 | }; 211 | 212 | cloud.previousword = function(x) { 213 | if (!arguments.length) return previousword; 214 | previousword = d3.functor(x); 215 | return cloud; 216 | } 217 | 218 | return d3.rebind(cloud, event, "on"); 219 | } 220 | 221 | function cloudText(d) { 222 | return d.text; 223 | } 224 | 225 | function cloudFont() { 226 | return "serif"; 227 | } 228 | 229 | function cloudFontNormal() { 230 | return "normal"; 231 | } 232 | 233 | function cloudFontSize(d) { 234 | return Math.sqrt(d.value); 235 | } 236 | 237 | function cloudRotate() { 238 | return (~~(Math.random() * 6) - 3) * 30; 239 | } 240 | 241 | function cloudPadding() { 242 | return 1; 243 | } 244 | 245 | function cloudPreviousword() { 246 | // look up previousword in previouswords 247 | if (previouswords) { 248 | var thisword = arguments[0]["text"]; 249 | var result = previouswords.filter(function(o){return o.text == thisword;} ); 250 | } 251 | return result? result[0] : null; // or undefined 252 | } 253 | 254 | // Fetches a monochrome sprite bitmap for the specified text. 255 | // Load in batches for speed. 
256 | function cloudSprite(d, data, di) { 257 | if (d.sprite) return; 258 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio); 259 | var x = 0, 260 | y = 0, 261 | maxh = 0, 262 | n = data.length; 263 | --di; 264 | while (++di < n) { 265 | d = data[di]; 266 | c.save(); 267 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font; 268 | var w = c.measureText(d.text + "m").width * ratio, 269 | h = d.size << 1; 270 | if (d.rotate) { 271 | var sr = Math.sin(d.rotate * cloudRadians), 272 | cr = Math.cos(d.rotate * cloudRadians), 273 | wcr = w * cr, 274 | wsr = w * sr, 275 | hcr = h * cr, 276 | hsr = h * sr; 277 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5; 278 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr)); 279 | } else { 280 | w = (w + 0x1f) >> 5 << 5; 281 | } 282 | if (h > maxh) maxh = h; 283 | if (x + w >= (cw << 5)) { 284 | x = 0; 285 | y += maxh; 286 | maxh = 0; 287 | } 288 | if (y + h >= ch) break; 289 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio); 290 | if (d.rotate) c.rotate(d.rotate * cloudRadians); 291 | c.fillText(d.text, 0, 0); 292 | if (d.padding) c.lineWidth = 2 * d.padding, c.strokeText(d.text, 0, 0); 293 | c.restore(); 294 | d.width = w; 295 | d.height = h; 296 | d.xoff = x; 297 | d.yoff = y; 298 | d.x1 = w >> 1; 299 | d.y1 = h >> 1; 300 | d.x0 = -d.x1; 301 | d.y0 = -d.y1; 302 | d.hasText = true; 303 | x += w; 304 | } 305 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data, 306 | sprite = []; 307 | while (--di >= 0) { 308 | d = data[di]; 309 | if (!d.hasText) continue; 310 | var w = d.width, 311 | w32 = w >> 5, 312 | h = d.y1 - d.y0; 313 | // Zero the buffer 314 | for (var i = 0; i < h * w32; i++) sprite[i] = 0; 315 | x = d.xoff; 316 | if (x == null) return; 317 | y = d.yoff; 318 | var seen = 0, 319 | seenRow = -1; 320 | for (var j = 0; j < h; j++) { 321 | for (var i = 0; i < w; i++) { 322 | var k = w32 * j + (i >> 5), 323 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0; 324 | sprite[k] |= m; 325 | seen |= m; 326 | } 327 | if (seen) seenRow = j; 328 | else { 329 | d.y0++; 330 | h--; 331 | j--; 332 | y++; 333 | } 334 | } 335 | d.y1 = d.y0 + seenRow; 336 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32); 337 | } 338 | } 339 | 340 | // Use mask-based collision detection. 341 | function cloudCollide(tag, board, sw) { 342 | sw >>= 5; 343 | var sprite = tag.sprite, 344 | w = tag.width >> 5, 345 | lx = tag.x - (w << 4), 346 | sx = lx & 0x7f, 347 | msx = 32 - sx, 348 | h = tag.y1 - tag.y0, 349 | x = (tag.y + tag.y0) * sw + (lx >> 5), 350 | last; 351 | for (var j = 0; j < h; j++) { 352 | last = 0; 353 | for (var i = 0; i <= w; i++) { 354 | if (((last << msx) | (i < w ? 
(last = sprite[j * w + i]) >>> sx : 0)) 355 | & board[x + i]) return true; 356 | } 357 | x += sw; 358 | } 359 | return false; 360 | } 361 | 362 | function cloudBounds(bounds, d) { 363 | var b0 = bounds[0], 364 | b1 = bounds[1]; 365 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0; 366 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0; 367 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1; 368 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1; 369 | } 370 | 371 | function collideRects(a, b) { 372 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y; 373 | } 374 | 375 | function archimedeanSpiral(size) { 376 | var e = size[0] / size[1]; 377 | return function(t) { 378 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)]; 379 | }; 380 | } 381 | 382 | function rectangularSpiral(size) { 383 | var dy = 4, 384 | dx = dy * size[0] / size[1], 385 | x = 0, 386 | y = 0; 387 | return function(t) { 388 | var sign = t < 0 ? -1 : 1; 389 | // See triangular numbers: T_n = n * (n + 1) / 2. 390 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) { 391 | case 0: x += dx; break; 392 | case 1: y += dy; break; 393 | case 2: x -= dx; break; 394 | default: y -= dy; break; 395 | } 396 | return [x, y]; 397 | }; 398 | } 399 | 400 | // TODO reuse arrays? 401 | function zeroArray(n) { 402 | var a = [], 403 | i = -1; 404 | while (++i < n) a[i] = 0; 405 | return a; 406 | } 407 | 408 | var cloudRadians = Math.PI / 180, 409 | cw = 1 << 11 >> 5, 410 | ch = 1 << 11, 411 | canvas, 412 | ratio = 1; 413 | 414 | if (typeof document !== "undefined") { 415 | canvas = document.createElement("canvas"); 416 | canvas.width = 1; 417 | canvas.height = 1; 418 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2); 419 | canvas.width = (cw << 5) / ratio; 420 | canvas.height = ch / ratio; 421 | } else { 422 | // Attempt to use node-canvas. 
423 | canvas = new Canvas(cw << 5, ch); 424 | } 425 | 426 | var c = canvas.getContext("2d"), 427 | spirals = { 428 | archimedean: archimedeanSpiral, 429 | rectangular: rectangularSpiral 430 | }; 431 | c.fillStyle = c.strokeStyle = "red"; 432 | c.textAlign = "center"; 433 | 434 | if (typeof module === "object" && module.exports) module.exports = cloud; 435 | else (d3.layout || (d3.layout = {})).cloud = cloud; 436 | })(); 437 | -------------------------------------------------------------------------------- /profiler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from dateutil import parser 3 | import datetime 4 | import pytz # $ pip install pytz 5 | from collections import Counter 6 | import operator 7 | import re 8 | import d3output 9 | import fileinput 10 | import json 11 | import os 12 | import glob 13 | import ast 14 | 15 | class Profiler: 16 | def __init__(self, opts): 17 | for k, v in opts.items(): 18 | setattr(self, k, v) 19 | 20 | # set defaults 21 | if not("labelFormat" in opts): 22 | self.labelFormat = "%Y-%m-%d %H:%M:%S %Z" 23 | if not("tz" in opts): 24 | self.tz = pytz.UTC 25 | if not("extended" in opts): 26 | self.extended = False 27 | if not("blocks" in opts): 28 | self.blocks = ["all"] 29 | if "all" in self.blocks: 30 | self.blocks.extend(["topusers", "tophashtags", "topurls", "topimageurls", "urls", 31 | "imageurls"]) 32 | 33 | # initialize 34 | self.count = 0 35 | self.typecounts = {"original": 0, "retweet": 0, "quote": 0, "reply": 0} 36 | self.originalcount = 0 37 | self.retweetcount = 0 38 | self.quotecount = 0 39 | self.quoteandretweetcount = 0 40 | self.replycount = 0 41 | self.geocount = 0 42 | self.earliest = "" 43 | self.latest = "" 44 | self.users = Counter() 45 | if self.extended: 46 | if "tophashtags" in self.blocks: 47 | self.hashtags = Counter() 48 | self.hashtagcount = 0 49 | if "urls" in self.blocks or "topurls" in self.blocks: 50 | self.urls = Counter() 51 | self.urlcount = 0 52 | if "imageurls" in self.blocks or "topimageurls" in self.blocks: 53 | self.imageurls = Counter() 54 | self.imageurlcount = 0 55 | 56 | 57 | def adduser(self, user, tweet): 58 | self.users[user] += 1 59 | 60 | def addurl(self, url): 61 | self.urls[url] += 1 62 | 63 | def addhashtag(self, hashtag): 64 | self.hashtags[hashtag] += 1 65 | 66 | def addimageurl(self, imageurl): 67 | self.imageurls[imageurl] += 1 68 | 69 | def process(self, tweet): 70 | self.count += 1 71 | tweettype = "" 72 | if "retweeted_status" in tweet: 73 | tweettype = "retweet" 74 | elif tweet["is_quote_status"]: 75 | tweettype = "quote" 76 | elif tweet["in_reply_to_status_id"] != None: 77 | tweettype = "reply" 78 | else: tweettype = "original" 79 | if tweet.get("geo") != None: 80 | self.geocount += 1 81 | self.typecounts[tweettype] += 1 82 | 83 | self.created_at = parser.parse(tweet["created_at"]) 84 | if self.earliest == "" or self.earliest > self.created_at: 85 | self.earliest = self.created_at 86 | if self.latest == "" or self.latest < self.created_at: 87 | self.latest = self.created_at 88 | user = tweet["user"]["screen_name"] 89 | self.adduser(user, tweet) 90 | if self.extended: 91 | # handle urls 92 | if "urls" in self.blocks or "topurls" in self.blocks: 93 | if len(tweet["entities"]["urls"]) > 0: 94 | for url in tweet["entities"]["urls"]: 95 | self.addurl(url["expanded_url"]) 96 | self.urlcount += 1 97 | 98 | # handle hashtags 99 | if "hashtags" in self.blocks or "tophashtags" in self.blocks: 100 | if len(tweet["entities"]["hashtags"]) > 0: 101 | for tag 
in tweet["entities"]["hashtags"]: 102 | # hashtags are not case sensitive, so lower() to dedupe 103 | # or just leave it and accept dupes? 104 | self.addhashtag(tag["text"].lower()) 105 | self.hashtagcount += 1 106 | 107 | # handle imageurls 108 | if "imageurls" in self.blocks or "topimageurls" in self.blocks: 109 | if "media" in tweet["entities"]: 110 | hasimageurl = False 111 | for media in tweet["entities"]["media"]: 112 | if media["type"] == "photo": 113 | self.addimageurl(media["media_url"]) 114 | hasimageurl = True 115 | if hasimageurl: 116 | self.imageurlcount += 1 117 | 118 | def gettweets(self, opts, args): 119 | # prepare to serialize opts and args as json 120 | # converting opts to str produces string with single quotes, 121 | # but json requires double quotes 122 | self.optsdict = ast.literal_eval(str(opts)) 123 | self.argsdict = ast.literal_eval(str(args)) 124 | 125 | # if args has one value, check whether it's a directory 126 | if len(args) == 1 and os.path.isdir(args[0]): 127 | # add path to metadata file and tweets 128 | self.metadatafile = os.path.join(args[0] , "metadata.json") 129 | args = glob.glob(os.path.join(args[0], "data/tweets/tweets-*.json")) 130 | else: 131 | # args must be files, so calculate path to metadata file based on 132 | # dir of first input file 133 | self.metadatafile = os.path.join(os.path.dirname(args[0]), "metadata.json") 134 | for line in fileinput.input(args): 135 | try: 136 | tweet = json.loads(line) 137 | self.process(tweet) 138 | except ValueError as e: 139 | sys.stderr.write("uhoh: %s\n" % e) 140 | 141 | def tops(self, list, title): 142 | # given a list of name-value pairs, return the top 10 pairs by value, 143 | # and a list of integers representing the percent of total value 144 | # held by each of 10 slices 145 | 146 | totalcount = len(list) 147 | totalvalue = int(sum(list.values())) 148 | sorted = list.most_common() 149 | 150 | top = sorted[:10] 151 | top_result = [] 152 | for name, value in top: 153 | top_result.append({"name": name, "value": value}) 154 | 155 | step = float(totalcount) / 10 156 | percentiles = [] 157 | for i in range(0, 10): 158 | start = int(i * step) 159 | end = int((i + 1) * step) 160 | slicecount = end - start 161 | if slicecount > 0: 162 | # weight the slice value as if the slice were an even 10th of the list 163 | weight = 10 / (float(slicecount) / totalcount) 164 | slicevalue = sum(v for k,v in sorted[start:end]) 165 | percentile = int(round(float(slicevalue) / totalvalue * weight)) 166 | else: 167 | percentile = 0 168 | percentiles.append(percentile) 169 | return {"top" + title: top_result, title+"percentiles": percentiles} 170 | 171 | def report(self): 172 | local_earliest = self.tz.normalize(self.earliest.astimezone(self.tz)).strftime(self.labelFormat) 173 | local_latest = self.tz.normalize(self.latest.astimezone(self.tz)).strftime(self.labelFormat) 174 | result = {"count": self.count, 175 | "originalcount": self.typecounts["original"], 176 | "retweetcount": self.typecounts["retweet"], 177 | "quotecount": self.typecounts["quote"], 178 | "replycount": self.typecounts["reply"], 179 | "geocount": self.geocount, 180 | "earliest": local_earliest, 181 | "latest": local_latest, 182 | "usercount": len(self.users), 183 | "opts": self.optsdict, 184 | "args": self.argsdict, 185 | "metadatafile": self.metadatafile} 186 | if self.extended: 187 | if "topusers" in self.blocks: 188 | result.update(self.tops(self.users, "users")) 189 | if "tophashtags" in self.blocks: 190 | result.update(self.tops(self.hashtags, "hashtags")) 
191 | if "topurls" in self.blocks: 192 | result.update(self.tops(self.urls, "urls")) 193 | if "urls" in self.blocks: 194 | result.update({"urlcount": self.urlcount, "urls": len(self.urls), 195 | "imageurlcount": self.imageurlcount, "imageurls": len(self.imageurls), 196 | "hashtagcount": self.hashtagcount, "hashtags": len(self.hashtags)}) 197 | if "topimageurls" in self.blocks: 198 | result.update(self.tops(self.imageurls, "imageurls")) 199 | if "imageurls" in self.blocks: 200 | result.update({"imageurlslist": self.imageurls}) 201 | return result 202 | 203 | class LinkNodesProfiler(Profiler): 204 | def __init__(self, opts): 205 | Profiler.__init__(self, opts) 206 | self.nodes = {} 207 | self.nodeid = 0 208 | 209 | # nodes will end up as 210 | # {"userA": 211 | # {"id": 27, 212 | # "source": 0, 213 | # "target": 1, 214 | # "links": { 215 | # "userB": 3, 216 | # "userC": 1 217 | # } 218 | # 219 | # Meaning that userA mentions userB 3 times, and userB mentions userA once. 220 | # We gather the nodes in a dictionary so that we can look up terms to update 221 | # counts, but at the end we convert the dictionary into a list sorted by id 222 | # so that the positions in the list correspond to the ids, as D3 requires. 223 | 224 | def addlink(self, source, target): 225 | if not source in self.nodes: 226 | self.nodes[source] = {"name": source, "id": self.nodeid, "tweetcount": 0, 227 | "source": 1, "target": 0, "links": {}} 228 | self.nodeid += 1 229 | else: 230 | self.nodes[source]["source"] += 1 231 | 232 | if not target in self.nodes: 233 | targetid = self.nodeid 234 | self.nodes[target] = {"name": target, "id": self.nodeid, "tweetcount": 0, 235 | "source": 0, "target": 1, "links": {}} 236 | self.nodeid += 1 237 | else: 238 | self.nodes[target]["target"] += 1 239 | targetid = self.nodes[target]["id"] 240 | 241 | linklist = self.nodes[source]["links"] 242 | if not target in linklist: 243 | linklist[target] = {"count": 1, "id": targetid} 244 | else: 245 | linklist[target]["count"] += 1 246 | 247 | def addsingle(self, name): 248 | if not name in self.nodes: 249 | self.nodes[name] = {"name": name, "id": self.nodeid, "tweetcount": 1, 250 | "source": 0, "target": 0, "links": {}} 251 | self.nodeid += 1 252 | 253 | def report(self): 254 | if hasattr(self, "graph"): 255 | self.optsdict["graph"] = self.graph 256 | if hasattr(self, "field"): 257 | self.optsdict["field"] = self.field 258 | profile = Profiler.report(self) 259 | # convert nodes dictionary to a list, sorted by id 260 | nodelistkeys = sorted(self.nodes, key=lambda w: self.nodes[w]["id"]) 261 | nodelist = [] 262 | for key in nodelistkeys: 263 | nodelist.append(self.nodes[key]) 264 | return {"profile": profile, "nodes": nodelist} 265 | 266 | class TimeProfiler(Profiler): 267 | # interval, in milliseconds 268 | intervalFormats = { 269 | "S": {"name": "second", "format": "%Y-%m-%d %H:%M:%S", "interval": 1000}, 270 | "M": {"name": "minute", "format": "%Y-%m-%d %H:%M", "interval": 1000 * 60}, 271 | "H": {"name": "hour", "format": "%Y-%m-%d %H", "interval": 1000 * 60 * 60}, 272 | "d": {"name": "day", "format": "%Y-%m-%d", "interval": 1000 * 60 * 60 * 24}, 273 | "m": {"name": "month", "format": "%Y-%m", "interval": 1000 * 60 * 60 * 24 * 28}, 274 | "Y": {"name": "year", "format": "%Y-%m", "interval": 1000 * 60 * 60 * 24 * 365} 275 | } 276 | def __init__(self, opts): 277 | Profiler.__init__(self, opts) 278 | try: 279 | self.intervalParts = re.search("([0-9]*)([^0-9]*)", self.intervalStr) 280 | if self.intervalParts.group(1) == "": 281 | self.intervalCount = 
1
282 |         else:
283 |             self.intervalCount = int(self.intervalParts.group(1))
284 |         self.intervalUnit = self.intervalParts.group(2)
285 |         self.interval = self.intervalCount * self.intervalFormats[self.intervalUnit]["interval"]
286 |         self.format = self.intervalFormats[self.intervalUnit]["format"]
287 |         self.intervalLabel = str(self.intervalCount) + " " + self.intervalFormats[self.intervalUnit]["name"]
288 |         if self.intervalCount > 1:
289 |             self.intervalLabel += "s"
290 | 
291 |     except ValueError as e:
292 |         sys.stderr.write("uhoh: %s\n" % e)
293 | 
294 |     # gather in a dict with count if aggregating, otherwise in a list
295 |     if self.aggregate:
296 |         self.items = {}
297 |     else:
298 |         self.items = []
299 | 
300 |     def process(self, tweet):
301 |         Profiler.process(self, tweet)
302 |         created_at = parser.parse(tweet["created_at"])
303 |         local_dt = self.tz.normalize(created_at.astimezone(self.tz))
304 |         if self.intervalStr != "":
305 |             if self.intervalUnit == "S":
306 |                 local_dt = local_dt - datetime.timedelta(seconds=local_dt.second % int(self.intervalCount))
307 |             elif self.intervalUnit == "M":
308 |                 local_dt = local_dt - datetime.timedelta(minutes=local_dt.minute % int(self.intervalCount))
309 |             elif self.intervalUnit == "H":
310 |                 local_dt = local_dt - datetime.timedelta(hours=local_dt.hour % int(self.intervalCount))
311 |             # otherwise use format to aggregate values - though this treats intervalCount as 1
312 |         result = local_dt.strftime(self.format)
313 |         if self.aggregate:
314 |             self.items[result] = self.items.get(result, 0) + 1
315 |         else:
316 |             self.items.append(result)
317 |         # return the time slice label
318 |         return result
319 | 
320 |     def report(self):
321 |         self.optsdict["interval"] = self.interval
322 |         self.optsdict["format"] = self.format
323 |         self.optsdict["intervalLabel"] = self.intervalLabel
324 |         profile = Profiler.report(self)
325 |         if self.output == "csv":
326 |             if self.aggregate:
327 |                 values = d3output.namevaluecsv(self.items)
328 |             else:
329 |                 values = d3output.valuecsv(self.items)
330 |             return values
331 |         else:
332 |             if self.aggregate:
333 |                 values = d3output.namevaluejson(self.items)
334 |             else:
335 |                 values = d3output.valuejson(self.items)
336 |             return {"profile": profile, "values": values}
337 | 
-------------------------------------------------------------------------------- /templates/graph.html: --------------------------------------------------------------------------------
[HTML template for the D3 graph pages (~480 lines); the markup did not survive extraction, and only the `$TITLE$` placeholder, which supplies the page title, is recoverable.]
--------------------------------------------------------------------------------