├── requirements.txt
├── github.sh
├── harvest.sh
├── .gitignore
├── generate.sh
├── harvest.py
├── reportprofile.py
├── d3times.py
├── d3graph.py
├── templates
    ├── reportprofile.txt
    ├── timebar.html
    ├── reportprofile.html
    ├── wordcloud.html
    └── graph.html
├── d3output.py
├── d3wordcloud.py
├── d3cotags.py
├── LICENSE
├── stopwords
    └── stop-words_english_6_en.txt
├── README.md
├── assets
    └── d3.layout.cloud.js
└── profiler.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | python-dateutil
 2 | pytz
 3 | tzlocal
 4 | pysparklines
 5 | requests_oauthlib
 6 | twarc
 7 | networkx
 8 | humanize
 9 | mako
10 | 
11 | 


--------------------------------------------------------------------------------
/github.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | source ./github-credentials
 4 | 
 5 | PROJECTDIR=$1
 6 | source ./projects/$PROJECTDIR/github-repo
 7 | 
 8 | DATE=`date "+%Y-%m-%dT%H-%M-%S"`
 9 | 
10 | cd projects/$PROJECTDIR/html
11 | git add .
12 | git commit -m "$DATE update"
13 | git push https://$GITHUB_TOKEN@github.com/pbinkley/$GITHUB_REPO.git
14 | echo "Pushed commit: $DATE"
15 | 


--------------------------------------------------------------------------------
/harvest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | source venv3/bin/activate
 3 | export PATH=~/.local/bin:$PATH
 4 | 
 5 | PROJECTDIR=$1
 6 | if [ -n "$PROJECTDIR" ]; then
 7 | SEARCH=`cat projects/$PROJECTDIR/metadata.json | jq -r ".search"`
 8 | 
 9 | OUTPUT=projects/$PROJECTDIR/data/tweets/tweets-$(date -d "today" +"%Y%m%d%H%M").json
10 | LASTID=`cat projects/$PROJECTDIR/data/tweets/last-id`
11 | echo Lastid: $LASTID Search: $SEARCH
12 | echo Output to $OUTPUT
13 | twarc --since_id $LASTID search "$SEARCH" > $OUTPUT
14 | NEWLASTID=`cat $OUTPUT | head -1 | jq -r ".id_str"`
15 | 
16 | if [[ ! -z $NEWLASTID ]]; then
17 |   echo $NEWLASTID > projects/$PROJECTDIR/data/tweets/last-id
18 | fi
19 | 
20 | echo "Harvested `wc -l $OUTPUT | cut -d " " -f 1` tweets"
21 | 
22 | # generate html
23 | ./generate.sh $PROJECTDIR
24 | 
25 | else
26 | 	echo "Provide project directory name"
27 | fi
28 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # twarc harvest project directories
 2 | projects/
 3 | credentials
 4 | github-credentials
 5 | venv/
 6 | venv3/
 7 | 
 8 | # Byte-compiled / optimized / DLL files
 9 | __pycache__/
10 | *.py[cod]
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | .Python
17 | env/
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | 
32 | # PyInstaller
33 | #  Usually these files are written by a python script from a template
34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | 
57 | # Sphinx documentation
58 | docs/_build/
59 | 
60 | # PyBuilder
61 | target/
62 | 


--------------------------------------------------------------------------------
/generate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | source venv3/bin/activate
 3 | 
 4 | PROJECTDIR=$1
 5 | 
 6 | # generate html
 7 | twarc/utils/wordcloud.py projects/$PROJECTDIR/data/tweets/*.json > projects/$PROJECTDIR/html/wordcloud.html
 8 | twarc/utils/sort_by_id.py projects/$PROJECTDIR/data/tweets/*.json | twarc/utils/deduplicate.py > projects/$PROJECTDIR/data/amalgamated.json
 9 | #twarc/utils/network.py --users projects/$PROJECTDIR/data/amalgamated.json projects/$PROJECTDIR/html/network-users.html
10 | #twarc/utils/network.py projects/$PROJECTDIR/data/amalgamated.json projects/$PROJECTDIR/html/network.html
11 | #twarc/utils/wall.py projects/$PROJECTDIR/data/amalgamated.json > projects/$PROJECTDIR/wall.html
12 | ./d3cotags.py -e $PROJECTDIR projects/$PROJECTDIR > projects/$PROJECTDIR/html/cotags.html
13 | ./d3graph.py --mode mentions projects/$PROJECTDIR > projects/$PROJECTDIR/html/mentionsgraph.html
14 | ./d3graph.py --mode retweets projects/$PROJECTDIR > projects/$PROJECTDIR/html/retweetsgraph.html
15 | ./d3graph.py --mode replies projects/$PROJECTDIR > projects/$PROJECTDIR/html/repliesgraph.html
16 | ./d3times.py -a -t "America/Edmonton" -i 3H projects/$PROJECTDIR > projects/$PROJECTDIR/html/timebargraph.html
17 | ./reportprofile.py -o html projects/$PROJECTDIR/data/amalgamated.json  > projects/$PROJECTDIR/html/index.html 
18 | ./reportprofile.py projects/$PROJECTDIR
19 | twarc dehydrate projects/$PROJECTDIR/data/amalgamated.json | uniq > projects/$PROJECTDIR/html/tweet-ids.txt
20 | 


--------------------------------------------------------------------------------
/harvest.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import sys
 5 | import argparse
 6 | import json
 7 | import errno
 8 | 
 9 | def make_sure_path_exists(path):
10 |     try:
11 |         os.makedirs(path)
12 |     except OSError as exception:
13 |         if exception.errno != errno.EEXIST:
14 |             raise
15 | 
16 | parser = argparse.ArgumentParser("harvest")
17 | parser.add_argument("archive_dir", action="store",
18 |                     help="a directory where results are stored")
19 | args = parser.parse_args()
20 | 
21 | if not os.path.isdir(args.archive_dir):
22 |     sys.exit("Directory " + args.archive_dir + " does not exist.")
23 |     
24 | data_dir = os.path.join(args.archive_dir, "data")
25 | make_sure_path_exists(data_dir)
26 | tweets_dir = os.path.join(data_dir, "tweets")
27 | make_sure_path_exists(tweets_dir)
28 | 
29 | metadatafile = os.path.join(args.archive_dir, "metadata.json")
30 | try:
31 |     with open(metadatafile) as json_data:
32 |         metadata = json.load(json_data)
33 |         json_data.close()
34 | except:
35 |     sys.exit("Cannot read metadata file " + metadatafile)
36 | 
37 | sys.argv = ["", metadata["search"], tweets_dir]
38 | 
39 | # find twarc-archive.py on system path
40 | for dirname in os.environ["PATH"].split(os.pathsep):
41 |     candidate = os.path.join(dirname, "twarc-archive.py")
42 |     print(candidate)
43 |     if os.path.isfile(candidate):
44 |         break 
45 |     else:
46 |         candidate = ""
47 | try:
48 |     execfile(candidate)
49 | except:
50 |     sys.exit("Cannot run twarc-archive.py")
51 | 


--------------------------------------------------------------------------------
/reportprofile.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python            
 2 | 
 3 | import json
 4 | import optparse
 5 | from profiler import Profiler # local module
 6 | from dateutil import parser
 7 | import sparkline
 8 | import os
 9 | from mako.template import Template
10 | import glob
11 | import sys
12 | import re
13 | import humanize
14 | 
15 | opt_parser = optparse.OptionParser()
16 | opt_parser.add_option("-o", "--output", dest="output", type="str", 
17 |     help="text | json | html (default: text)", default="text")
18 | opts, args = opt_parser.parse_args()
19 | 
20 | profiler = Profiler({"extended": True, "blocks": ["all"]})
21 | 
22 | profiler.gettweets(opts, args)
23 | 
24 | data = profiler.report()
25 | 
26 | if (opts.output == "json"):
27 |     print(json.dumps(data))
28 | elif (opts.output == "html"):
29 |     metadata_file = os.path.join(os.path.dirname(args[0]), "../metadata.json")
30 |     with open(metadata_file) as json_data:
31 |         metadata = json.load(json_data)
32 |         json_data.close()
33 | 
34 |     data['title'] = metadata['title']
35 |     data['search'] = metadata['search']
36 | 
37 |     # gather names and sizes of html files
38 |     data['reports'] = []
39 |     p = re.compile('.*\/html\/(.*)\.html')
40 |     for report in sorted(glob.glob(os.path.join(os.path.dirname(args[0]), "../html/*.html"))):
41 |         m = p.match(report)
42 |         size = os.path.getsize(report)
43 |         data['reports'].append({'report': m[1], 'size': humanize.naturalsize(size)})
44 | 
45 |     mytemplate = Template(filename='templates/reportprofile.html')
46 |     print(mytemplate.render(data = data))
47 | else:
48 |     mytemplate = Template(filename='templates/reportprofile.txt')
49 |     print(mytemplate.render(data = data))
50 | 


--------------------------------------------------------------------------------
/d3times.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import optparse
 4 | import pytz # $ pip install pytz
 5 | from tzlocal import get_localzone # $ pip install tzlocal
 6 | import d3output # local module
 7 | from profiler import TimeProfiler # local module
 8 | 
 9 | opt_parser = optparse.OptionParser()
10 | opt_parser.add_option("-t", "--timezone", type=str, default="",
11 |     help="output timezone (e.g. 'America/New_York' or 'local'; default: UTC)")
12 | opt_parser.add_option('-a', '--aggregate', action='store_true', default=False,
13 |     help="Aggregate the values to produce key-value pairs with counts")
14 | opt_parser.add_option("-o", "--output", dest="output", type="str",
15 |     help="html | csv | json (default: html)", default="html")
16 | opt_parser.add_option("-p", "--template", dest="template", type="str",
17 |     help="name of template in utils/template (default: timebar.html)", default="timebar.html")
18 | opt_parser.add_option("-i", "--interval", dest="intervalStr", type="str",
19 |     help="interval for grouping timestamps, in seconds, minutes or hours, e.g. 15M (default: 1S)", default="1S")
20 | 
21 | opts, args = opt_parser.parse_args()
22 | 
23 | aggregate = opts.aggregate
24 | tzname = opts.timezone
25 | 
26 | # determine output time zone
27 | if tzname == "":
28 |     tz = pytz.UTC
29 | elif tzname == "local":
30 |     tz = get_localzone() # system timezone, from tzlocal
31 | else:
32 |     tz = pytz.timezone(tzname)
33 | 
34 | # if an interval is provided in the options, use it; otherwise
35 | # determine the interval from the datetime format
36 | intervalStr = opts.intervalStr # e.g. 15M
37 | 
38 | profiler = TimeProfiler({
39 |     "tz": tz,
40 |     "output": opts.output,
41 |     "aggregate": aggregate,
42 |     "intervalStr": intervalStr})
43 | 
44 | profiler.gettweets(opts, args)
45 | 
46 | data = profiler.report()
47 | 
48 | if opts.output == "html":
49 |     d3output.embed(opts.template, data)
50 | else:
51 |     print(data)
52 | 


--------------------------------------------------------------------------------
/d3graph.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import optparse
 4 | import d3output # local module
 5 | from profiler import Profiler # local module
 6 | from profiler import LinkNodesProfiler # local module
 7 | from collections import Counter
 8 | 
 9 | opt_parser = optparse.OptionParser()
10 | opt_parser.add_option("-m", "--mode", dest="mode", help="retweets (default) | mentions | replies",
11 |     default="retweets")
12 | opt_parser.add_option("-t", "--threshold", dest="threshold", type="int", 
13 |     help="minimum links to qualify for inclusion (default: 1)", default=1)
14 | opt_parser.add_option("-o", "--output", dest="output", type="str", 
15 |     help="html | json (default: html)", default="html")
16 | opt_parser.add_option("-p", "--template", dest="template", type="str", 
17 |     help="name of template in utils/template (default: graph.html)", default="graph.html")
18 |     
19 | opts, args = opt_parser.parse_args()
20 | 
21 | output = opts.output
22 | 
23 | class DirectedProfiler(LinkNodesProfiler):
24 |     def __init__(self, opts):
25 |         LinkNodesProfiler.__init__(self, opts)
26 | 
27 |     def process(self, tweet):
28 |         Profiler.process(self, tweet)
29 | 
30 |     def adduser(self, user, tweet):
31 |         if self.mode == "mentions":
32 |             if "user_mentions" in tweet["entities"]:
33 |                 for mention in tweet["entities"]["user_mentions"]:
34 |                     self.addlink(user, str(mention["screen_name"]))
35 |         elif self.mode == "replies":
36 |             if not(tweet["in_reply_to_screen_name"] == None):
37 |                 self.addlink(tweet["in_reply_to_screen_name"], user)
38 |         else: # default mode: retweets
39 |             if "retweeted_status" in tweet:
40 |                 self.addlink(user, tweet["retweeted_status"]["user"]["screen_name"])
41 |         # add to tweet count for this tag
42 |         if not user in self.nodes:
43 |             self.addsingle(user)
44 |         self.nodes[user]["tweetcount"] += 1
45 | 
46 |     def report(self):
47 |         return LinkNodesProfiler.report(self)
48 | 
49 | profiler = DirectedProfiler({
50 |             "mode": opts.mode,
51 |             "graph": "directed",
52 |             "field": "user"})
53 | 
54 | profiler.gettweets(opts, args)
55 | 
56 | data = profiler.report()
57 | 
58 | profile = data["profile"]
59 | nodes = data["nodes"]
60 | 
61 | if output == "csv":
62 |     print(d3output.nodeslinkcsv(nodes))
63 | elif output == "json":
64 |     values = d3output.nodeslinktrees(profile, nodes)
65 |     print({"profile": profile, "values": values})
66 | elif output == "html":
67 |     print(d3output.embed(opts.template, d3output.nodeslinktrees(profile, nodes)))
68 | 
69 | 


--------------------------------------------------------------------------------
/templates/reportprofile.txt:
--------------------------------------------------------------------------------
 1 | ## -*- coding: utf-8 -*-
 2 | <%! import sparkline %>
 3 | <%! from dateutil import parser %>
 4 | 
 5 | <%def name="percentage(numerator, denominator)" filter="trim">
 6 |     ${str("%.2f" % (float(numerator) / float(denominator) * 100.0))}%
 7 | </%def>
 8 | 
 9 | Count:                ${data['count']}
10 | 
11 | Users:                ${data['usercount']}
12 | User percentiles:     ${sparkline.sparkify(data["userspercentiles"])}
13 |                       ${str(data["userspercentiles"])}
14 | 
15 | Has hashtag:          ${"{:>9}".format(str(data["hashtagcount"]))} (${percentage(data["hashtagcount"], data["count"])})
16 | Hashtags:             ${"{:>9}".format(str(data["hashtags"]))}
17 | Hashtags percentiles: ${sparkline.sparkify(data["hashtagspercentiles"])}
18 |                   ${str(data["hashtagspercentiles"])}
19 | 
20 | Has URL:              ${"{:>9}".format(str(data["urlcount"]))} (${percentage(data["urlcount"], data["count"])})
21 | URLs:                 ${"{:>9}".format(str(data["urls"]))}
22 | URLs percentiles:     ${sparkline.sparkify(data["urlspercentiles"])}
23 |                       ${str(data["urlspercentiles"])}
24 | 
25 | Has Image URL:        ${"{:>9}".format(str(data["imageurlcount"]))} (${percentage(data["imageurlcount"], data["count"])})
26 | Image URLs:           ${"{:>9}".format(str(data["imageurls"]))}
27 | Image URLs percentiles: ${sparkline.sparkify(data["imageurlspercentiles"])}
28 |                         ${str(data["imageurlspercentiles"])}
29 | 
30 | Originals:            ${"{:>9}".format(str(data["originalcount"]))} (${percentage(data["originalcount"], data["count"])})
31 | Retweets:             ${"{:>9}".format(str(data["retweetcount"]))} (${percentage(data["retweetcount"], data["count"])})
32 | Quotes:               ${"{:>9}".format(str(data["quotecount"]))} (${percentage(data["quotecount"], data["count"])})
33 | Replies:              ${"{:>9}".format(str(data["replycount"]))} (${percentage(data["replycount"], data["count"])})
34 | Geo:                  ${"{:>9}".format(str(data["geocount"]))} (${percentage(data["geocount"], data["count"])})
35 | Earliest:             ${str(data["earliest"])}
36 | Latest:               ${str(data["latest"])}
37 | Duration:             ${str(parser.parse(data["latest"]) - parser.parse(data["earliest"]))}
38 | Top users:            ${sparkline.sparkify([u["value"] for u in data["topusers"]])}
39 |     % for user in data["topusers"]:
40 |       ${user["value"]} ${user["name"]}
41 |     % endfor
42 | Top hashtags:         ${sparkline.sparkify([u["value"] for u in data["tophashtags"]])}
43 |     % for hashtag in data["tophashtags"]:
44 |       ${hashtag["value"]} ${hashtag["name"]}
45 |     % endfor
46 | Top URLs:             ${sparkline.sparkify([u["value"] for u in data["topurls"]])}
47 |     % for url in data["topurls"]:
48 |       ${url["value"]} ${url["name"]}
49 |     % endfor
50 | Top Image URLs:       ${sparkline.sparkify([u["value"] for u in data["topimageurls"]])}
51 |     % for imageurl in data["topimageurls"]:
52 |       ${imageurl["value"]} ${imageurl["name"]}
53 |     % endfor
54 | 


--------------------------------------------------------------------------------
/templates/timebar.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <meta charset="utf-8">
  3 | <title>$TITLE$</title>
  4 | <script src="https://d3js.org/d3.v3.min.js"></script>
  5 | <style>
  6 | 
  7 | .axis text {
  8 |   font: 10px sans-serif;
  9 | }
 10 | 
 11 | .axis path, .axis line {
 12 |   fill: none;
 13 |   stroke: #000;
 14 |   shape-rendering: crispEdges;
 15 | }
 16 | 
 17 | #chart {
 18 |   width: 100%;
 19 |   height: 100%;
 20 |   position: absolute;
 21 | }
 22 | 
 23 | #metadata {
 24 |   position: fixed;
 25 |   left: 10px;
 26 |   top: 10px;
 27 |   border: 1px solid black;
 28 |   padding: 5px;
 29 | }
 30 | </style>
 31 | <body>
 32 | <div id="metadata"></div>
 33 | <svg id="chart"></svg>
 34 | <script>
 35 | 
 36 | var data = $DATA$;
 37 | var metadata = data["metadata"];
 38 | var values = data.values;
 39 | var profile = data.profile;
 40 | var opts = profile.opts;
 41 | var tformat = opts.format;
 42 | var tformatter = d3.time.format(tformat);
 43 | var timezone = opts.timezone;
 44 | var template = opts.template;
 45 | var intervalStr = opts.intervalStr;
 46 | var intervalLabel = opts.intervalLabel;
 47 | 
 48 | var html = "<strong>" + metadata["title"] + "</strong><br/>Search: " + metadata["search"] + " | Timezone: " + timezone + " | Template: " + template + " | Interval: " + intervalLabel + "<br/>";
 49 | html += profile.count + " tweets from " + profile.usercount + " users, including " + profile.retweetcount + " retweets (" + (profile.retweetcount / profile.count * 100).toFixed(2) + "%), ";
 50 | html += "and " + profile.geocount + " with geo (" + (profile.geocount / profile.count * 100).toFixed(2) + "%): ";
 51 | html += profile.earliest + " to " + profile.latest;
 52 | d3.select("#metadata").html(html);
 53 | 
 54 | function type(d) { 
 55 |     d.name = d3.time.format(tformat).parse(d.name);
 56 | }
 57 | 
 58 | var vlen = values.length;
 59 | for (var i = 0; i < vlen; i++) {
 60 |     type(values[i]);
 61 | }
 62 | 
 63 | var margin = {top: 100, right: 100, bottom: 100, left: 100},
 64 |     width = parseInt(d3.select("#chart").style("width")) - margin.left - margin.right,
 65 |     height = parseInt(d3.select("#chart").style("height")) - margin.top - margin.bottom;
 66 | 
 67 | var svg = d3.select("#chart")
 68 |     .attr("width", width + margin.left + margin.right)
 69 |     .attr("height", height + margin.top + margin.bottom)
 70 |   .append("g")
 71 |     .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
 72 | 
 73 | var x = d3.time.scale()
 74 |     .domain([
 75 |         d3.min(values, function(d) {return d.name}), 
 76 |         d3.max(values, function(d) {return d.name})
 77 |      ])
 78 |     .nice(d3.time.day)
 79 |     .range([0, width]);
 80 | 
 81 | var y = d3.scale.linear()
 82 |     .domain([0, d3.max(values, function(d) { return d.value; })])
 83 |     .range([height, 0]);
 84 | 
 85 | var timespan = x.invert(width) - x.invert(0) // time span of x, in milliseconds
 86 | var interval = opts.interval; // interval of datapoints, in milliseconds
 87 | var intervalCount = timespan / interval; // how many intervals in x
 88 | var barWidth = width / intervalCount; // width of one interval, in pixels
 89 | 
 90 | // x-axis
 91 | svg.append("g")
 92 |   .attr("class", "x axis")
 93 |   .attr("transform", "translate(0," + height + ")")
 94 |   .call(d3.svg.axis().scale(x).orient("bottom"))
 95 | .append("text")
 96 |   .attr("class", "label")
 97 |   .attr("x", width)
 98 |   .attr("y", 30)
 99 |   .style("text-anchor", "end")
100 |   .text("date");
101 |     
102 | // y-axis
103 | svg.append("g")
104 |   .attr("class", "y axis")
105 |   .call(d3.svg.axis().scale(y).orient("left"))
106 | .append("text")
107 |   .attr("class", "label")
108 |   .attr("transform", "rotate(-90)")
109 |   .attr("y", -70)
110 |   .attr("dy", ".71em")
111 |   .style("text-anchor", "end")
112 |   .text("tweets");
113 | 
114 | var bar = svg.selectAll(".bar")
115 |   .data(values)
116 | .enter().append("g")
117 |   .append("rect")
118 |   .attr("class", "bar")
119 |   .attr("x", function(d) { return x(d.name); })
120 |   .attr("width", barWidth)
121 |   .attr("y", function(d) { return y(d.value); })
122 |   .attr("height", function(d) { return height - y(d.value); })
123 |   .append("svg:title")
124 |   .text(function(d) { return tformatter(d.name) + ": " + d.value + " tweets"; });
125 | 
126 | </script>
127 | </body>


--------------------------------------------------------------------------------
/d3output.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import sys
  5 | import json
  6 | import csv
  7 | import io
  8 | 
  9 | def nodeslinks(threshold):
 10 | 
 11 |     nodes = []
 12 |     links = []
 13 |     
 14 |     # lines look like "nodeA,nodeB,123"
 15 |     for line in sys.stdin:
 16 |         tokens = line.split(",")
 17 |         # use try to ignore header line
 18 |         try:
 19 |             if int(tokens[2]) >= threshold:
 20 |                 if not tokens[0] in nodes:
 21 |                     nodes.append(tokens[0])
 22 |                 if not tokens[1] in nodes:
 23 |                     nodes.append(tokens[1])
 24 |                 links.append({"source": nodes.index(tokens[0]), 
 25 |                 "target": nodes.index(tokens[1]), 
 26 |                 "value": int(tokens[2])}) 
 27 |         except:
 28 |             continue
 29 |     
 30 |     nodelist = []
 31 |     for node in nodes:
 32 |         nodelist.append({"name": node})
 33 |     
 34 |     print(json.dump({"nodes": nodelist, "links": links}))
 35 | 
 36 | def nodeslinktrees(profile, nodes):
 37 |     # generate nodes json
 38 |     nodesoutput = []
 39 |     linksoutput = []
 40 |     if hasattr(profile["opts"], "graph"):
 41 |         graph = profile["opts"]["graph"]
 42 |     else:
 43 |         graph = ""
 44 |     for node in nodes:
 45 |         if graph == "directed":
 46 |             title = " (" + str(node["tweetcount"]) + " tweet"
 47 |             if node["tweetcount"] != 1:
 48 |                 title += "s"
 49 |             title += ": " + unicode(node["source"]) + " out/" + unicode(node["target"]) + " in)"
 50 |         else:
 51 |             title = " (" + str(node["tweetcount"]) + " tweet"
 52 |             if node["tweetcount"] != 1:
 53 |                 title += "s"
 54 |             title += ")"
 55 |         nodesoutput.append({"name": node["name"], 
 56 |             "title": node["name"] + title})
 57 |        
 58 |         # generate links
 59 |         for targetname in node["links"].keys():
 60 |             target = node["links"][targetname]
 61 |             if target["count"] >= profile["opts"]["threshold"]:
 62 |                 linksoutput.append({
 63 |                     "source": node["id"], 
 64 |                     "target": target["id"], 
 65 |                     "value": target["count"]
 66 |                 })
 67 | 
 68 |     return {"profile": profile, "nodes": nodesoutput, "links": linksoutput}
 69 | 
 70 | def namevaluecsv(data):
 71 |     csvout = io.StringIO()
 72 |     csvwriter = csv.writer(csvout)
 73 |     csvwriter.writerow(["name", "value"])
 74 |     for key, value in sorted(data.items()):
 75 |         csvwriter.writerow([key, value])
 76 |     return csvout.getvalue()
 77 |     
 78 | def valuecsv(data):
 79 |     csvout = io.StringIO()
 80 |     csvwriter = csv.writer(csvout)
 81 |     csvwriter.writerow(["value"])
 82 |     for d in data:
 83 |         csvwriter.writerow([d])
 84 |     return csvout.getvalue()
 85 |     
 86 | def nodeslinkcsv(data):
 87 |     # convert link-nodes objects into csv
 88 |     # e.g. {"A": {"B": 3, "C": 7}} to A,B,3 and A,C,7
 89 |     csvout = io.StringIO()
 90 |     csvwriter = csv.writer(csvout)
 91 |     csvwriter.writerow(["source", "target", "value"])
 92 |     for node in data:
 93 |         source = node["name"]
 94 |         # generate csv rows
 95 |         for targetname in node["links"].iterkeys():
 96 |             csvwriter.writerow([source, targetname, node["links"][targetname]["count"]])
 97 |     return csvout.getvalue()
 98 | 
 99 | def namevaluejson(data):
100 |     output = []
101 |     for key, value in sorted(data.items()):
102 |         output.append({"name": key, "value": value})
103 |     return output
104 |     
105 | def valuejson(data):
106 |     output = []
107 |     for d in data:
108 |         output.append(d)
109 |     return output
110 | 
111 | def embed(template, d3json):
112 |     # load metadata.json if present
113 |     # d3json["args"] contains filenams passed in, with wildcards resolved
114 |     if d3json["profile"]["metadatafile"]:
115 |         metadata_file = d3json["profile"]["metadatafile"]
116 |     else:
117 |         metadata_file = os.path.join(os.path.dirname(d3json["profile"]["args"][0]), "metadata.json")
118 |     try:
119 |         with open(metadata_file) as json_data:
120 |             metadata = json.load(json_data)
121 |             json_data.close()
122 |     except:
123 |         #sys.exit("Cannot read metadata file " + metadata_file)
124 |         metadata = {"title": d3json["profile"]["args"][0] 
125 |                     + (" (+)" if len(d3json["profile"]["args"]) > 1 else "") }
126 |     d3json["metadata"] = metadata
127 |     # generate html by replacing token in template
128 |     template_file = os.path.join(os.path.dirname(__file__), "templates", template)
129 |     with open (template_file, "r") as template:
130 |         output = template.read()
131 |         output = output.replace("$TITLE$", metadata["title"])
132 |         output = output.replace("$DATA$", json.dumps(d3json))
133 |         print(output)
134 | 


--------------------------------------------------------------------------------
/templates/reportprofile.html:
--------------------------------------------------------------------------------
 1 | ## -*- coding: utf-8 -*-
 2 | <%! import sparkline %>
 3 | <%! from dateutil import parser %>
 4 | 
 5 | <%def name="percentage(numerator, denominator)" filter="trim">
 6 |   ${str("%.2f" % (float(numerator) / float(denominator) * 100.0))}%
 7 | </%def>
 8 | 
 9 | <!DOCTYPE html>
10 | <head>
11 |   <meta charset="utf-8">
12 | <title>${data['title']}</title>
13 | <script src="https://d3js.org/d3.v3.min.js"></script>
14 | <style type="text/css">
15 |   td img {
16 |     max-width: 300px;
17 |   }
18 |   td:first-child {
19 |     text-align: right;
20 |     vertical-align: text-top;
21 |   }
22 | </style>
23 | </head>
24 | <body>
25 | 
26 |   <h1>${data['title']}</h1>
27 | 
28 |   <p>Search: ${data['search']}</p>
29 | 
30 | <h2>Reports (note: some are large)</h2>
31 | 
32 | <ul>
33 | % for report in data["reports"]:
34 |   % if report["report"] != "index":
35 |   <li><a href="${report['report']}.html">${report['report']}</a> (${report['size']})</li>
36 |   % endif
37 | % endfor
38 | </ul>
39 | 
40 | <p><a href="tweet-ids.txt">Download tweet ids</a> (See <a href="https://github.com/DocNow/twarc#user-content-hydrate">twarc</a> for instructions on how to "hydrate" this list of ids to retrieve the original tweets)</p>
41 | 
42 | <h2>Harvest statistics</h2>
43 | 
44 | <table>
45 | <tr><td>Count:</td><td>${data['count']}</td></tr>
46 | 
47 | <tr><td>Users:</td><td>${data['usercount']}</td></tr>
48 | <tr><td>User percentiles:</td><td>${sparkline.sparkify(data["userspercentiles"])}</td></tr>
49 | <tr><td></td><td>${str(data["userspercentiles"])}</td></tr>
50 | 
51 | <tr><td>Has hashtag:</td><td>${"{:>9}".format(str(data["hashtagcount"]))} (${percentage(data["hashtagcount"], data["count"])})</td></tr>
52 | <tr><td>Hashtags:</td><td>${"{:>9}".format(str(data["hashtags"]))}</td></tr>
53 | <tr><td>Hashtags percentiles:</td><td>${sparkline.sparkify(data["hashtagspercentiles"])}</td></tr>
54 | <tr><td></td><td>${str(data["hashtagspercentiles"])}</td></tr>
55 | 
56 | <tr><td>Has URL:</td><td>${"{:>9}".format(str(data["urlcount"]))} (${percentage(data["urlcount"], data["count"])})</td></tr>
57 | <tr><td>URLs:</td><td>${"{:>9}".format(str(data["urls"]))}</td></tr>
58 | <tr><td>URLs percentiles:</td><td>${sparkline.sparkify(data["urlspercentiles"])}</td></tr>
59 | <tr><td></td><td>${str(data["urlspercentiles"])}</td></tr>
60 | 
61 | <tr><td>Has Image URL:</td><td>${"{:>9}".format(str(data["imageurlcount"]))} (${percentage(data["imageurlcount"], data["count"])})</td></tr>
62 | <tr><td>Image URLs:</td><td>${"{:>9}".format(str(data["imageurls"]))}</td></tr>
63 | <tr><td>Image URLs percentiles:</td><td>${sparkline.sparkify(data["imageurlspercentiles"])}</td></tr>
64 | <tr><td></td><td>${str(data["imageurlspercentiles"])}</td></tr>
65 | 
66 | <tr><td>Originals:</td><td>${"{:>9}".format(str(data["originalcount"]))} (${percentage(data["originalcount"], data["count"])})</td></tr>
67 | <tr><td>Retweets:</td><td>${"{:>9}".format(str(data["retweetcount"]))} (${percentage(data["retweetcount"], data["count"])})</td></tr>
68 | <tr><td>Quotes:</td><td>${"{:>9}".format(str(data["quotecount"]))} (${percentage(data["quotecount"], data["count"])})</td></tr>
69 | <tr><td>Replies:</td><td>${"{:>9}".format(str(data["replycount"]))} (${percentage(data["replycount"], data["count"])})</td></tr>
70 | <tr><td>Geo:</td><td>${"{:>9}".format(str(data["geocount"]))} (${percentage(data["geocount"], data["count"])})</td></tr>
71 | <tr><td>Earliest:</td><td>${str(data["earliest"])}</td></tr>
72 | <tr><td>Latest:</td><td>${str(data["latest"])}</td></tr>
73 | <tr><td>Duration:</td><td>${str(parser.parse(data["latest"]) - parser.parse(data["earliest"]))}</td></tr>
74 | <tr><td>Top users:</td><td>${sparkline.sparkify([u["value"] for u in data["topusers"]])}</td></tr>
75 | % for user in data["topusers"]:
76 | <tr><td>${user["value"]}</td><td><a href='https://twitter.com/${user["name"]}'>${user["name"]}</a></td></tr>
77 | % endfor
78 | <tr><td>Top hashtags:</td><td>${sparkline.sparkify([u["value"] for u in data["tophashtags"]])}</td></tr>
79 | % for hashtag in data["tophashtags"]:
80 | <tr><td>${hashtag["value"]}</td><td><a href='https://twitter.com/hashtags/${hashtag["name"]}'>${hashtag["name"]}</a></td></tr>
81 | % endfor
82 | <tr><td>Top URLs:</td><td>${sparkline.sparkify([u["value"] for u in data["topurls"]])}</td></tr>
83 | % for url in data["topurls"]:
84 | <tr><td>${url["value"]}</td><td><a href='${url["name"]}'>${url["name"]}</a></td></tr>
85 | % endfor
86 | <tr><td>Top Image URLs:</td><td>${sparkline.sparkify([u["value"] for u in data["topimageurls"]])}
87 | <br>(Note: Firefox may not display these images, because of content blocking. You can turn off <a href="https://support.mozilla.org/en-US/kb/enhanced-tracking-protection-firefox-desktop">Enhanced Tracking Protection</a> for this domain to allow the thumbnails to load. The links to the images should still work even when the thumbnails don't.)</td></tr>
88 | % for imageurl in data["topimageurls"]:
89 | <tr><td>${imageurl["value"]}</td><td><a href='${imageurl["name"]}'><img src='${imageurl["name"].replace("http:", "https:")}'></a></td></tr>
90 | % endfor
91 | </table>
92 | </body>
93 | 


--------------------------------------------------------------------------------
/d3wordcloud.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import optparse
  4 | import re
  5 | import dateutil.parser
  6 | from profiler import TimeProfiler
  7 | import pytz
  8 | import d3output
  9 | 
 10 | opt_parser = optparse.OptionParser()
 11 | opt_parser.add_option("-t", "--timezone", type=str, default="", 
 12 |     help="output timezone (e.g. 'America/New_York' or 'local'; default: UTC)")
 13 | opt_parser.add_option("-w", "--maxwords", dest="maxwords", type="int", 
 14 |     help="maximum number of words to display (default: 25)", default=25)
 15 | opt_parser.add_option("-i", "--interval", dest="intervalStr", type="str", 
 16 |     help="interval for grouping timestamps, in seconds, minutes or hours, e.g. 15M (default: 1H)", 
 17 |     default="1H")
 18 | opt_parser.add_option("-s", "--start", type=str, default=None, 
 19 |     help="start date/time")
 20 | opt_parser.add_option("-e", "--end", type=str, default=None, 
 21 |     help="end date/time")
 22 | opt_parser.add_option("-o", "--output", dest="output", type="str", 
 23 |     help="html | csv | json (default: html)", default="html")
 24 | opt_parser.add_option("-p", "--template", dest="template", type="str", 
 25 |     help="name of template in utils/template (default: wordcloud.html)", default="wordcloud.html")
 26 | 
 27 | opts, args = opt_parser.parse_args()
 28 | 
 29 | tzname = opts.timezone
 30 | # determine output time zone
 31 | if tzname == "":
 32 |     tz = pytz.UTC
 33 | elif tzname == "local":
 34 |     tz = get_localzone() # system timezone, from tzlocal
 35 | else: 
 36 |     tz = pytz.timezone(tzname)
 37 | 
 38 | maxwords = opts.maxwords
 39 | intervalStr = opts.intervalStr
 40 | output = opts.output
 41 | 
 42 | start = opts.start
 43 | end = opts.end
 44 | if opts.start:
 45 |     start = tz.localize(dateutil.parser.parse(start + "0001-01-01 00:00:00"[len(start):]))
 46 | if opts.end:
 47 |     end = tz.localize(dateutil.parser.parse(end + "9999-12-31 23:11:59"[len(end):]))
 48 | 
 49 | # from https://gist.github.com/uogbuji/705383
 50 | GRUBER_URLINTEXT_PAT = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
 51 | 
 52 | class WordcloudTimeProfiler(TimeProfiler):
 53 |     def __init__(self, opts):
 54 |         TimeProfiler.__init__(self, opts)
 55 |         self.timeslices = {}
 56 |         self.stop_words = set(line.strip().lower() for line in open("stopwords/stop-words_english_6_en.txt"))
 57 | 
 58 |     def process(self, tweet):
 59 |         created_at = dateutil.parser.parse(tweet["created_at"])
 60 |         if ((self.start is None) or (created_at >= self.start)) and ((self.end is None) 
 61 |                 or (created_at <= self.end)):
 62 |             timeslice = TimeProfiler.process(self, tweet)
 63 |             if not timeslice in self.timeslices:
 64 |                 self.timeslices[timeslice] = {}
 65 |             word_counts = self.timeslices[timeslice]
 66 |             text = tweet["text"]
 67 |             # remove hashtags and user names
 68 |             text = re.sub("(^|[^\w])[@#]\w*", "\g<1>", text)
 69 |             # remove urls
 70 |             text = re.sub(GRUBER_URLINTEXT_PAT, " ", text)
 71 |             # trim punctuation next to space
 72 |             text = re.sub(ur"[^\w\s]+(\s|$)|(^|\s)[^\w\s]+", " ", text, re.UNICODE)
 73 |             # replace internal punctuation, except apostrophes
 74 |             text = re.sub(ur"[^\w\s\']", " ", text, re.UNICODE)
 75 |             for word in text.split():
 76 |                 word = word.lower()
 77 |                 if len(word) < 3: continue
 78 |                 if len(word) > 15: continue
 79 |                 if word in self.stop_words: continue
 80 |                 if word.startswith("rt"): continue
 81 |                 if not re.match("^[a-z]", word, re.IGNORECASE): continue
 82 |                 # remove final 's
 83 |                 word = re.sub("\'s$", "", word)
 84 |                 if len(word) > 0:
 85 |                     word_counts[word] = word_counts.get(word, 0) + 1
 86 | 
 87 |     def report(self):
 88 |         data = TimeProfiler.report(self)
 89 |         data["profile"]["start"] = str(self.start)
 90 |         data["profile"]["end"] = str(self.end)
 91 |         for value in data["values"]:
 92 |             thisslice = self.timeslices[value["name"]]
 93 |             # sort words by value
 94 |             sorted_words = thisslice.keys()
 95 |             sorted_words.sort(lambda a, b: cmp(thisslice[b], thisslice[a]))
 96 |             top_words = sorted_words[0:maxwords]
 97 |             words = []
 98 |             for word in top_words:
 99 |                 words.append({
100 |                     "text": word,
101 |                     "count": thisslice[word]
102 |                 })
103 |             value["words"] = words
104 |         return data
105 | 
106 | profiler = WordcloudTimeProfiler({
107 |     "tz": tz, 
108 |     "output": "json", 
109 |     "aggregate": True, 
110 |     "intervalStr": intervalStr,
111 |     "start": start,
112 |     "end": end})
113 | 
114 | profiler.gettweets(opts, args)
115 | 
116 | data = profiler.report()
117 | 
118 | if opts.output == "html":
119 |     d3output.embed(opts.template, data)
120 | else:
121 |     print(data)
122 | 


--------------------------------------------------------------------------------
/d3cotags.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import json
  4 | import itertools
  5 | import optparse
  6 | import d3output # local module
  7 | from profiler import Profiler # local module
  8 | from profiler import LinkNodesProfiler # local module
  9 | from collections import Counter
 10 | 
 11 | class CotagsProfiler(LinkNodesProfiler):
 12 |     def __init__(self, opts):
 13 |         LinkNodesProfiler.__init__(self, opts)
 14 |         self.savetweets = []
 15 |         self.counts = Counter()
 16 |         self.keepers = set()
 17 | 
 18 |     def process(self, tweet):
 19 |         Profiler.process(self, tweet)
 20 |         # gather a list of the tags in this tweet, lowercased
 21 |         savetweet = []
 22 |         for tag in tweet["entities"]["hashtags"]:
 23 |             t = tag["text"].lower()
 24 |             savetweet.append(t)
 25 |             # and increment count for this tag
 26 |             self.counts[t] += 1
 27 |         # add tag list to savetweets
 28 |         self.savetweets.append(savetweet)
 29 | 
 30 |     def report(self):
 31 |         # for tags below the threshold, replace with "-OTHER"
 32 |         # which is not necessary if threshold is 0
 33 |         if self.threshold > 0:
 34 |             countkeys = self.counts.keys()
 35 |             for countkey in countkeys:
 36 |                 if self.counts[countkey] < self.threshold:
 37 |                     # for a tag whose count is below the threshold, transfer its
 38 |                     # count to tag "-OTHER" and delete it
 39 |                     if self.keepother:
 40 |                         self.counts["-OTHER"] += self.counts[countkey]
 41 |                     del self.counts[countkey]
 42 |                 else:
 43 |                     # otherwise add it to list of keepers
 44 |                     self.keepers.add(countkey)
 45 |             if self.keepother:
 46 |                 self.keepers.add("-OTHER")
 47 |             # keepers now has a complete set of surviving tags
 48 | 
 49 |         # now process hashtags in tweets again, replacing any tag not in keepers with -OTHER
 50 |         self.counts = Counter()
 51 |         for savetweet in self.savetweets:
 52 |         
 53 |             # cleantags gathers unique, lower-cased tags for this tweet
 54 |             cleantags = set()
 55 |         
 56 |             for tag in savetweet:
 57 |                 if self.threshold == 0 or tag in self.keepers:
 58 |                     cleantags.add(tag)
 59 |                 else:
 60 |                     if self.keepother:
 61 |                         cleantags.add("-OTHER")
 62 |                 
 63 |             # sort tags and remove tags that are in the exclude set 
 64 |             cleantags = sorted(cleantags.difference(self.exclude))
 65 |             
 66 |             # generate all pairs
 67 |             for c in itertools.combinations(cleantags, 2):
 68 |                 self.addlink(c[0], c[1])
 69 |                 if self.reciprocal:
 70 |                     self.addlink(c[1], c[0])
 71 |             
 72 |             # if this tag is the only one we're including from this tweet,
 73 |             # then there won't be any combinations, and so it won't have
 74 |             # been added to self.nodes by addlink: so add it.
 75 | 
 76 |             # add to tweet count for this tag
 77 |             for tag in cleantags:
 78 |                 if tag in self.nodes:
 79 |                     self.nodes[tag]["tweetcount"] += 1
 80 |                 else:
 81 |                     self.addsingle(tag)
 82 |                 
 83 |         data = LinkNodesProfiler.report(self)
 84 |         return data;        
 85 |             
 86 | 
 87 | opt_parser = optparse.OptionParser()
 88 | opt_parser.add_option("-o", "--output", dest="output", type="str", 
 89 |     help="html | json (default: html)", default="html")
 90 | opt_parser.add_option("-e", "--exclude", type=str, default="", 
 91 |     help="comma-separated list of hashtags to exclude")
 92 | opt_parser.add_option("-t", "--threshold", type=int, default=0, 
 93 |     help="threshold below which to treat hashtags as 'other'")
 94 | opt_parser.add_option("-r", "--reciprocal", action="store_true", default=False, 
 95 |     help="add reciprocal links for each pair")
 96 | opt_parser.add_option("-p", "--template", dest="template", type="str", 
 97 |     help="name of template in utils/template (default: graph.html)", default="graph.html")
 98 | opt_parser.add_option("-k", "--keepother", action="store_true", default=False, 
 99 |     help="include -OTHER tag in output for tags below threshold")
100 | 
101 | opts, args = opt_parser.parse_args()
102 | 
103 | threshold = opts.threshold
104 | exclude = set(opts.exclude.lower().split(","))
105 | reciprocal = opts.reciprocal
106 | keepother = opts.keepother
107 | output = opts.output
108 | 
109 | profiler = CotagsProfiler({
110 |     "threshold": threshold,
111 |     "exclude": exclude,
112 |     "reciprocal": reciprocal,
113 |     "keepother": keepother,
114 |     "graph": "undirected",
115 |     "field": "hashtag"})
116 |     
117 | profiler.gettweets(opts, args)
118 | 
119 | data = profiler.report()
120 | 
121 | profile = data["profile"]
122 | nodes = data["nodes"]
123 | 
124 | if output == "csv":
125 |     print(d3output.nodeslinkcsv(nodes))
126 | elif output == "json":
127 |     values = d3output.nodeslinktrees(profile, nodes)
128 |     print({"profile": profile, "values": values})
129 | elif output == "html":
130 |     print(d3output.embed(opts.template, d3output.nodeslinktrees(profile, nodes)))
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/templates/wordcloud.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <meta charset="utf-8">
  3 | <body>
  4 |   <script src="https://d3js.org/d3.v3.min.js"></script>
  5 |   <script src="https://d3js.org/d3.layout.cloud.js"></script>
  6 |   <style>
  7 |   .label {
  8 |   font: 48px "Helvetica Neue";
  9 |   fill: #ddd;
 10 | }
 11 | </style>
 12 |   <script>
 13 | 
 14 | //Simple animated example of d3-cloud - https://github.com/jasondavies/d3-cloud
 15 | //Based on https://github.com/jasondavies/d3-cloud/blob/master/examples/simple.html
 16 | 
 17 | var w = window,
 18 |     d = document,
 19 |     e = d.documentElement,
 20 |     g = d.getElementsByTagName('body')[0],
 21 |     width = w.innerWidth || e.clientWidth || g.clientWidth,
 22 |     height = w.innerHeight|| e.clientHeight|| g.clientHeight,
 23 |     currentlabel = "";
 24 | 
 25 | var data = $DATA$;
 26 | var values = data["values"];
 27 |     
 28 | // Encapsulate the word cloud functionality
 29 | function wordCloud(selector) {
 30 | 
 31 |     // get highest count, to serve as high of pointsize domain
 32 |     var max = d3.max(d3.entries(values), function(d) {
 33 |         return (d.value["words"].length > 0) ? d.value["words"][0]["count"] : 0;
 34 |     });
 35 | 
 36 |     // linear scale to convert counts to point sizes
 37 |     var pointsize = d3.scale.log()
 38 |         .domain([1, max])
 39 |         .range([24, 60]);
 40 |         
 41 |     //Construct the word cloud's SVG element
 42 |     var svg = d3.select(selector).append("svg")
 43 |         .attr("width", width)
 44 |         .attr("height", height)
 45 |         .append("g")
 46 |         .attr("transform", "translate(" + Math.round(width/2) + "," + Math.round(height/2) + ")");
 47 | 
 48 | // Add the timeslice label; the value is set on transition.
 49 | var label = svg.append("text")
 50 |     .attr("class", "label")
 51 |     .attr("text-anchor", "start")
 52 |     .attr("y", Math.round(height/2) * -1 +48)
 53 |     .attr("x", Math.round(width/2) * -1 +24)
 54 |     .text("");
 55 | 
 56 |     //Draw the word cloud
 57 |     function draw(words) {
 58 |         // transition timing modelled on http://stackoverflow.com/questions/26421388/d3-js-subtransition-with-exit-and-enter
 59 |         label.text(currentlabel);
 60 | 
 61 |         var cloud = svg.selectAll("g text.word")
 62 |                         .data(words, function(d) { return d.text; })
 63 |         var transition=d3.transition().duration(2000);
 64 |         
 65 |         // remove exiting words 
 66 |         if (!cloud.exit().empty()) 
 67 |             transition = transition.each(function(){
 68 |                 cloud.exit()
 69 |                     .transition()
 70 |                         .duration(1000)
 71 |                         .style("fill", "gray")
 72 |                         .style('fill-opacity', 1e-6)
 73 |                         .attr('font-size', 1)
 74 |                         .remove();
 75 |             }).transition();
 76 | 
 77 |         // Resize and reposition updating words
 78 |         if (!cloud.empty()) {
 79 | 
 80 |             transition = transition.each(function(){
 81 |                 cloud
 82 |                     .transition()
 83 |                         .duration(1000)
 84 |                         .style("fill", "black")
 85 |                         .style("font-size", function(d) { return d.size + "px"; })
 86 |                         .attr("transform", function(d) {
 87 |                                 return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
 88 |                             })
 89 |                         .style("fill-opacity", 1);
 90 |             }).transition();
 91 |         }
 92 |         
 93 |         //Entering words
 94 |         if (!cloud.enter().empty()) 
 95 |             transition=transition.each(function(){
 96 |                 cloud.enter()
 97 |                     .append("text")
 98 |                     .style("font-family", "Futura")
 99 |                     .style("fill", "red")
100 |                     .attr("class", "word")
101 |                     .attr("text-anchor", "middle")
102 |                     .attr('font-size', 0)
103 |                     .text(function(d) { return d.text; })
104 |                         .attr("transform", function(d) {
105 |                                 return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
106 |                             })
107 |                     .transition()
108 |                         .duration(1000)
109 |                         .style("font-size", function(d) { return d.size + "px"; })
110 |                         .style("fill-opacity", 1);
111 |             }).transition();
112 | 
113 |     }
114 | 
115 | 
116 |     //Use the module pattern to encapsulate the visualisation code. We'll
117 |     // expose only the parts that need to be public.
118 |     return {
119 | 
120 |         //Recompute the word cloud for a new set of words. This method will
121 |         // asycnhronously call draw when the layout has been computed.
122 |         //The outside world will need to call this function, so make it part
123 |         // of the wordCloud return value.
124 |         update: function(words) {
125 |             d3.layout.cloud().size([width, height])
126 |                 .words(words)
127 |                 .padding(5)
128 |                 .rotate(20)
129 |                 .font("Futura")
130 |                 .fontSize(function(d) { return pointsize(d.size); })
131 |                 .on("end", draw)
132 |                 .start();
133 |         }
134 |     }
135 | 
136 | }
137 | 
138 | function getWords(words) {
139 |     return words.map(function(d) {
140 |         return {text: d.text, size: d.count};
141 |     })
142 | }
143 | 
144 | //Create a new instance of the word cloud visualisation.
145 | var myWordCloud = wordCloud('body');
146 | 
147 | //This method tells the word cloud to redraw with a new set of words.
148 | //In reality the new words would probably come from a server request,
149 | // user input or some other source.
150 | 
151 | 
152 | function showNewWords(x) {
153 |     value = values[x];
154 |     // TODO format label for different intervals
155 |     currentlabel = value.name + ":00 : " + value.value + " tweet" + ((value.value > 1) ? "s" : "");
156 |     myWordCloud.update(getWords(value["words"]));
157 |     if (x != values.length) {
158 |         setTimeout(function() { showNewWords(x + 1)}, 7000)
159 |     }
160 | }
161 | 
162 | //Start cycling through the data
163 | showNewWords(0);
164 | 
165 | 
166 | 
167 | </script>
168 | 
169 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | CC0 1.0 Universal
  2 | 
  3 | Statement of Purpose
  4 | 
  5 | The laws of most jurisdictions throughout the world automatically confer
  6 | exclusive Copyright and Related Rights (defined below) upon the creator and
  7 | subsequent owner(s) (each and all, an "owner") of an original work of
  8 | authorship and/or a database (each, a "Work").
  9 | 
 10 | Certain owners wish to permanently relinquish those rights to a Work for the
 11 | purpose of contributing to a commons of creative, cultural and scientific
 12 | works ("Commons") that the public can reliably and without fear of later
 13 | claims of infringement build upon, modify, incorporate in other works, reuse
 14 | and redistribute as freely as possible in any form whatsoever and for any
 15 | purposes, including without limitation commercial purposes. These owners may
 16 | contribute to the Commons to promote the ideal of a free culture and the
 17 | further production of creative, cultural and scientific works, or to gain
 18 | reputation or greater distribution for their Work in part through the use and
 19 | efforts of others.
 20 | 
 21 | For these and/or other purposes and motivations, and without any expectation
 22 | of additional consideration or compensation, the person associating CC0 with a
 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
 25 | and publicly distribute the Work under its terms, with knowledge of his or her
 26 | Copyright and Related Rights in the Work and the meaning and intended legal
 27 | effect of CC0 on those rights.
 28 | 
 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be
 30 | protected by copyright and related or neighboring rights ("Copyright and
 31 | Related Rights"). Copyright and Related Rights include, but are not limited
 32 | to, the following:
 33 | 
 34 |   i. the right to reproduce, adapt, distribute, perform, display, communicate,
 35 |   and translate a Work;
 36 | 
 37 |   ii. moral rights retained by the original author(s) and/or performer(s);
 38 | 
 39 |   iii. publicity and privacy rights pertaining to a person's image or likeness
 40 |   depicted in a Work;
 41 | 
 42 |   iv. rights protecting against unfair competition in regards to a Work,
 43 |   subject to the limitations in paragraph 4(a), below;
 44 | 
 45 |   v. rights protecting the extraction, dissemination, use and reuse of data in
 46 |   a Work;
 47 | 
 48 |   vi. database rights (such as those arising under Directive 96/9/EC of the
 49 |   European Parliament and of the Council of 11 March 1996 on the legal
 50 |   protection of databases, and under any national implementation thereof,
 51 |   including any amended or successor version of such directive); and
 52 | 
 53 |   vii. other similar, equivalent or corresponding rights throughout the world
 54 |   based on applicable law or treaty, and any national implementations thereof.
 55 | 
 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of,
 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
 59 | and Related Rights and associated claims and causes of action, whether now
 60 | known or unknown (including existing as well as future claims and causes of
 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum
 62 | duration provided by applicable law or treaty (including future time
 63 | extensions), (iii) in any current or future medium and for any number of
 64 | copies, and (iv) for any purpose whatsoever, including without limitation
 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
 66 | the Waiver for the benefit of each member of the public at large and to the
 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver
 68 | shall not be subject to revocation, rescission, cancellation, termination, or
 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work
 70 | by the public as contemplated by Affirmer's express Statement of Purpose.
 71 | 
 72 | 3. Public License Fallback. Should any part of the Waiver for any reason be
 73 | judged legally invalid or ineffective under applicable law, then the Waiver
 74 | shall be preserved to the maximum extent permitted taking into account
 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
 76 | is so judged Affirmer hereby grants to each affected person a royalty-free,
 77 | non transferable, non sublicensable, non exclusive, irrevocable and
 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in
 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration
 80 | provided by applicable law or treaty (including future time extensions), (iii)
 81 | in any current or future medium and for any number of copies, and (iv) for any
 82 | purpose whatsoever, including without limitation commercial, advertising or
 83 | promotional purposes (the "License"). The License shall be deemed effective as
 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the
 85 | License for any reason be judged legally invalid or ineffective under
 86 | applicable law, such partial invalidity or ineffectiveness shall not
 87 | invalidate the remainder of the License, and in such case Affirmer hereby
 88 | affirms that he or she will not (i) exercise any of his or her remaining
 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims
 90 | and causes of action with respect to the Work, in either case contrary to
 91 | Affirmer's express Statement of Purpose.
 92 | 
 93 | 4. Limitations and Disclaimers.
 94 | 
 95 |   a. No trademark or patent rights held by Affirmer are waived, abandoned,
 96 |   surrendered, licensed or otherwise affected by this document.
 97 | 
 98 |   b. Affirmer offers the Work as-is and makes no representations or warranties
 99 |   of any kind concerning the Work, express, implied, statutory or otherwise,
100 |   including without limitation warranties of title, merchantability, fitness
101 |   for a particular purpose, non infringement, or the absence of latent or
102 |   other defects, accuracy, or the present or absence of errors, whether or not
103 |   discoverable, all to the greatest extent permissible under applicable law.
104 | 
105 |   c. Affirmer disclaims responsibility for clearing rights of other persons
106 |   that may apply to the Work or any use thereof, including without limitation
107 |   any person's Copyright and Related Rights in the Work. Further, Affirmer
108 |   disclaims responsibility for obtaining any necessary consents, permissions
109 |   or other rights required for any use of the Work.
110 | 
111 |   d. Affirmer understands and acknowledges that Creative Commons is not a
112 |   party to this document and has no duty or obligation with respect to this
113 |   CC0 or use of the Work.
114 | 
115 | For more information, please see
116 | <https://creativecommons.org/publicdomain/zero/1.0/>
117 | 


--------------------------------------------------------------------------------
/stopwords/stop-words_english_6_en.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | able
  3 | about
  4 | above
  5 | abst
  6 | accordance
  7 | according
  8 | accordingly
  9 | across
 10 | act
 11 | actually
 12 | added
 13 | adj
 14 | adopted
 15 | affected
 16 | affecting
 17 | affects
 18 | after
 19 | afterwards
 20 | again
 21 | against
 22 | ah
 23 | all
 24 | almost
 25 | alone
 26 | along
 27 | already
 28 | also
 29 | although
 30 | always
 31 | am
 32 | among
 33 | amongst
 34 | amp
 35 | an
 36 | and
 37 | announce
 38 | another
 39 | any
 40 | anybody
 41 | anyhow
 42 | anymore
 43 | anyone
 44 | anything
 45 | anyway
 46 | anyways
 47 | anywhere
 48 | apparently
 49 | approximately
 50 | are
 51 | aren
 52 | arent
 53 | arise
 54 | around
 55 | as
 56 | aside
 57 | ask
 58 | asking
 59 | at
 60 | auth
 61 | available
 62 | away
 63 | awfully
 64 | b
 65 | back
 66 | be
 67 | became
 68 | because
 69 | become
 70 | becomes
 71 | becoming
 72 | been
 73 | before
 74 | beforehand
 75 | begin
 76 | beginning
 77 | beginnings
 78 | begins
 79 | behind
 80 | being
 81 | believe
 82 | below
 83 | beside
 84 | besides
 85 | between
 86 | beyond
 87 | biol
 88 | both
 89 | brief
 90 | briefly
 91 | but
 92 | by
 93 | c
 94 | ca
 95 | came
 96 | can
 97 | cannot
 98 | can't
 99 | cause
100 | causes
101 | certain
102 | certainly
103 | co
104 | com
105 | come
106 | comes
107 | contain
108 | containing
109 | contains
110 | could
111 | couldn't
112 | couldn
113 | could've
114 | d
115 | date
116 | did
117 | didn't
118 | didn
119 | different
120 | do
121 | does
122 | doesn't
123 | doesn
124 | doing
125 | done
126 | don't
127 | don
128 | down
129 | downwards
130 | due
131 | during
132 | e
133 | each
134 | ed
135 | edu
136 | effect
137 | eg
138 | eight
139 | eighty
140 | either
141 | else
142 | elsewhere
143 | end
144 | ending
145 | enough
146 | especially
147 | et
148 | et-al
149 | etc
150 | even
151 | ever
152 | every
153 | everybody
154 | everyone
155 | everything
156 | everywhere
157 | ex
158 | except
159 | f
160 | far
161 | few
162 | ff
163 | fifth
164 | first
165 | five
166 | fix
167 | followed
168 | following
169 | follows
170 | for
171 | former
172 | formerly
173 | forth
174 | found
175 | four
176 | from
177 | further
178 | furthermore
179 | g
180 | gave
181 | get
182 | gets
183 | getting
184 | give
185 | given
186 | gives
187 | giving
188 | go
189 | goes
190 | gone
191 | got
192 | gotten
193 | h
194 | had
195 | happens
196 | hardly
197 | has
198 | hasn't
199 | hasn
200 | have
201 | haven't
202 | haven
203 | having
204 | he
205 | he'll
206 | he'd
207 | hence
208 | her
209 | here
210 | hereafter
211 | hereby
212 | herein
213 | here's
214 | hereupon
215 | hers
216 | herself
217 | he's
218 | hi
219 | hid
220 | him
221 | himself
222 | his
223 | hither
224 | home
225 | how
226 | howbeit
227 | however
228 | hundred
229 | i
230 | i'd
231 | ie
232 | if
233 | i'll
234 | i'm
235 | immediate
236 | immediately
237 | importance
238 | important
239 | in
240 | inc
241 | indeed
242 | index
243 | information
244 | instead
245 | into
246 | invention
247 | inward
248 | is
249 | isn't
250 | isn
251 | it
252 | it'd
253 | it'll
254 | its
255 | it's
256 | itself
257 | i've
258 | j
259 | just
260 | k
261 | keep
262 | keeps
263 | kept
264 | keys
265 | kg
266 | km
267 | know
268 | known
269 | knows
270 | l
271 | largely
272 | last
273 | lately
274 | later
275 | latter
276 | latterly
277 | least
278 | less
279 | lest
280 | let
281 | lets
282 | let's
283 | like
284 | liked
285 | likely
286 | line
287 | little
288 | 'll
289 | look
290 | looking
291 | looks
292 | ltd
293 | m
294 | made
295 | mainly
296 | make
297 | makes
298 | many
299 | may
300 | maybe
301 | me
302 | mean
303 | means
304 | meantime
305 | meanwhile
306 | merely
307 | mg
308 | might
309 | million
310 | miss
311 | ml
312 | more
313 | moreover
314 | most
315 | mostly
316 | mr
317 | mrs
318 | much
319 | mug
320 | must
321 | my
322 | myself
323 | n
324 | na
325 | name
326 | namely
327 | nay
328 | nd
329 | near
330 | nearly
331 | necessarily
332 | necessary
333 | need
334 | needs
335 | neither
336 | never
337 | nevertheless
338 | new
339 | next
340 | nine
341 | ninety
342 | no
343 | nobody
344 | non
345 | none
346 | nonetheless
347 | noone
348 | nor
349 | normally
350 | nos
351 | not
352 | noted
353 | nothing
354 | now
355 | nowhere
356 | o
357 | obtain
358 | obtained
359 | obviously
360 | of
361 | off
362 | often
363 | oh
364 | ok
365 | okay
366 | old
367 | omitted
368 | on
369 | once
370 | one
371 | ones
372 | only
373 | onto
374 | or
375 | ord
376 | other
377 | others
378 | otherwise
379 | ought
380 | our
381 | ours
382 | ourselves
383 | out
384 | outside
385 | over
386 | overall
387 | owing
388 | own
389 | p
390 | page
391 | pages
392 | part
393 | particular
394 | particularly
395 | past
396 | per
397 | perhaps
398 | placed
399 | please
400 | plus
401 | poorly
402 | possible
403 | possibly
404 | potentially
405 | pp
406 | predominantly
407 | present
408 | previously
409 | primarily
410 | probably
411 | promptly
412 | proud
413 | provides
414 | put
415 | q
416 | que
417 | quickly
418 | quite
419 | qv
420 | r
421 | ran
422 | rather
423 | rd
424 | re
425 | readily
426 | really
427 | recent
428 | recently
429 | ref
430 | refs
431 | regarding
432 | regardless
433 | regards
434 | related
435 | relatively
436 | research
437 | respectively
438 | resulted
439 | resulting
440 | results
441 | right
442 | run
443 | s
444 | said
445 | same
446 | saw
447 | say
448 | saying
449 | says
450 | sec
451 | section
452 | see
453 | seeing
454 | seem
455 | seemed
456 | seeming
457 | seems
458 | seen
459 | self
460 | selves
461 | sent
462 | seven
463 | several
464 | shall
465 | she
466 | she'd
467 | she'll
468 | she's
469 | should
470 | shouldn't
471 | shouldn
472 | show
473 | showed
474 | shown
475 | shows
476 | significant
477 | significantly
478 | similar
479 | similarly
480 | since
481 | six
482 | slightly
483 | so
484 | some
485 | somebody
486 | somehow
487 | someone
488 | something
489 | sometime
490 | sometimes
491 | somewhat
492 | somewhere
493 | soon
494 | sorry
495 | specifically
496 | specified
497 | specify
498 | specifying
499 | state
500 | states
501 | still
502 | stop
503 | strongly
504 | sub
505 | substantially
506 | successfully
507 | such
508 | sufficiently
509 | suggest
510 | sup
511 | sure
512 | t
513 | take
514 | taken
515 | taking
516 | tell
517 | tends
518 | th
519 | than
520 | thank
521 | thanks
522 | thanx
523 | that
524 | that'll
525 | that's
526 | that've
527 | the
528 | their
529 | theirs
530 | them
531 | themselves
532 | then
533 | thence
534 | there
535 | thereafter
536 | thereby
537 | there'd
538 | therefore
539 | therein
540 | there'll
541 | thereof
542 | there're
543 | there's
544 | thereto
545 | thereupon
546 | there've
547 | these
548 | they
549 | they'd
550 | they'll
551 | they're
552 | they've
553 | think
554 | this
555 | those
556 | thou
557 | though
558 | thousand
559 | throug
560 | through
561 | throughout
562 | thru
563 | thus
564 | til
565 | tip
566 | to
567 | together
568 | too
569 | took
570 | toward
571 | towards
572 | tried
573 | tries
574 | truly
575 | try
576 | trying
577 | ts
578 | twice
579 | two
580 | u
581 | un
582 | under
583 | unfortunately
584 | unless
585 | unlike
586 | unlikely
587 | until
588 | unto
589 | up
590 | upon
591 | ups
592 | us
593 | use
594 | used
595 | useful
596 | usefully
597 | usefulness
598 | uses
599 | using
600 | usually
601 | v
602 | value
603 | various
604 | 've
605 | very
606 | via
607 | viz
608 | vol
609 | vols
610 | vs
611 | w
612 | want
613 | wants
614 | was
615 | wasn't
616 | wasn
617 | way
618 | we
619 | wed
620 | welcome
621 | we'll
622 | went
623 | were
624 | we're
625 | weren't
626 | weren
627 | we've
628 | what
629 | whatever
630 | what'll
631 | whats
632 | when
633 | whence
634 | whenever
635 | where
636 | whereafter
637 | whereas
638 | whereby
639 | wherein
640 | where's
641 | whereupon
642 | wherever
643 | whether
644 | which
645 | while
646 | whim
647 | whither
648 | who
649 | who'd
650 | whoever
651 | whole
652 | who'll
653 | whom
654 | whomever
655 | whos
656 | whose
657 | why
658 | widely
659 | will
660 | willing
661 | wish
662 | with
663 | within
664 | without
665 | won't
666 | words
667 | world
668 | would
669 | wouldn
670 | wouldn't
671 | would've
672 | www
673 | x
674 | y
675 | yes
676 | yet
677 | you
678 | you'd
679 | you'll
680 | your
681 | you're
682 | yours
683 | yourself
684 | yourselves
685 | you've
686 | z
687 | zero
688 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # twarc-report
  2 | Data conversions and examples for generating reports from [twarc](https://github.com/DocNow/twarc) collections using tools such as D3.js
  3 | 
  4 | - [Requirements](#user-content-requirements)
  5 | - [Getting Started](#user-content-getting-started)
  6 | - [Recommended Directory Structure](#user-content-recommended-directory-structure)
  7 | - [Harvest](#user-content-harvest)
  8 | - [Profile](#user-content-profile)
  9 | - [D3 Visualizations](#user-content-d3-visualizations)
 10 | - [Exploring D3 Examples](#user-content-exploring-d3-examples)
 11 | - [Adding Scripts](#user-content-adding-scripts)
 12 | - [License](#user-content-license)
 13 | 
 14 | These utilities accept a Twitter json file (as fetched by twarc),
 15 | analyze it various ways, and output a json or csv file. The initial
 16 | purpose is to feed data into D3.js for various visualizations, but the
 17 | intention is to make the outputs generic enough to serve other uses as
 18 | well. Each utility has a D3 example template, which it can use to
 19 | generate a self-contained html file. It can also generate csv or json
 20 | output, and there is a [worked example](#user-content-exploring-d3-examples) of how
 21 | to use csv in a pre-existing D3 chart.
 22 | 
 23 | The d3graph.py utility was originally added to the twarc repo as
 24 | directed.py but is moving here for consistency.
 25 | 
 26 | ## Requirements
 27 | 
 28 | All requirements may be installed with `pip install -r requirements.txt`
 29 | 
 30 | * dateutil - `python-dateutil`
 31 | * pytz - `pip install pytz`
 32 | * tzlocal - `pip install tzlocal`
 33 | * pysparklines - `pip install pysparklines`
 34 | * requests_oauthlib - `pip install requests_oauthlib`
 35 | 
 36 | Install `twarc` according to its instructions, i.e. with `pip install twarc`. 
 37 | Run `twarc.py` once so
 38 | that it can ask for your access token etc. (see twarc's readme). Make sure that `twarc-archive.py` 
 39 | is on the system path.
 40 | 
 41 | ## Getting Started
 42 | 
 43 | - clone twarc-report to a local directory with your favorite Git client
 44 | - install the requirements and populate the twarc submodule, as above
 45 | - create a `projects` subdirectory under twarc-report
 46 | - create a project directory under `projects`, named appropriately
 47 | - in the project directory create `metadata.json` and fill in the search you want to track
 48 | - in twarc-report, run `./harvest.py projects/[yourproject]` to harvest your tweets (this may take some time - hours or days for very large searches)
 49 | - run `./reportprofile.py projects/[yourproject]` to see a summary of your harvest
 50 | - run other scripts to generate various visualizations (see below)
 51 | - run `./harvest.py projects/[yourproject]` whenever you want to update your harvest.
 52 | 
 53 | Note that only tweets from the last 7 days or so are available from Twitter at 
 54 | any given time, so be sure to update your harvest accordingly to avoid gaps.
 55 | 
 56 | ## Recommended Directory Structure
 57 | 
 58 | ```
 59 | twarc-report/ # local clone
 60 |     projects/
 61 |         assets/ # copy of twarc-report/assets/
 62 |         projectA/
 63 |             data/ # created by harvest.py
 64 |                 tweets/ # populated with tweet*.json files by harvest.py
 65 |             metadata.json
 66 |             timeline.html # generated by a twarc-report script
 67 |             ...
 68 |         projectB/
 69 |         ...
 70 | ```
 71 | 
 72 | Metadata about the project, including the search query, is kept in 
 73 | `metadata.json`. The `metadata.json` file is created by the user and contains metadata 
 74 | for the harvest. It should be in this form:
 75 | 
 76 | ```
 77 | {"search": "#ferguson",
 78 | "title": "Ferguson Tweets",
 79 | "creator": "Peter Binkley"}
 80 | ```
 81 | 
 82 | (Currently only the `search` value is used but other metadata fields will 
 83 | be used to populate HTML output in future releases.)
 84 | 
 85 | The harvested tweets
 86 | and other source data are stored in the `data` subdirectory, with the 
 87 | tweets going the `tweets` directory. These directories are created by 
 88 | `harvest.py` if they don't exist.
 89 | 
 90 | Generated HTML files use relative paths like `../assets/d3.vs.min.js` to call 
 91 | shared libraries from the `assets` directory. They can be created in 
 92 | the project directories (`ProjectA` etc.). This 
 93 | allows you to publish the output by syncing the project and assets 
 94 | directories to a web server while exclusing the `data` subdirectory. You 
 95 | can also run python's SimpleHTTPServer in the `projects` directory to
 96 | load examples you've created in the project directories:
 97 | 
 98 | ```
 99 | python -m SimpleHTTPServer 8000
100 | ```
101 | 
102 | And then visit e.g. `http://localhost:8000/ProjectA/projectA-timebar.html`.
103 | 
104 | ## Harvest
105 | 
106 | The script `harvest.py` will use twarc's `twarc-archive.py` to start or update a harvest using a given
107 | search and stored in a given directory. The directory path is passed as the only parameter:
108 | 
109 | ```
110 | ./harvest.py projects/ProjectA
111 | ```
112 | 
113 | The search is read from the `metadata.json` file, and tweets are stored
114 | in `data/tweets`.
115 |  
116 | ## Profile
117 | 
118 | Running `reportprofiler.py` on a tweet collection with the flag `-o text` will generate a summary 
119 | profile of the collection, with some basic stats (number of tweets, retweets, users, etc.) and some 
120 | possibly interesting sparklines.
121 | 
122 | ```
123 | Count:        25100
124 | Users:         5779
125 | User percentiles: █▂▁▁▁▁▁▁▁▁
126 |                   [62, 12, 6, 5, 3, 2, 2, 2, 2, 2]
127 | ```
128 | 
129 | That indicates that the top 10 percent of users accounted for 62% of the tweets, while the bottom 
130 | 10% accounted for 2% of the tweets. This will give a quick sense of whether the collection is 
131 | dominated by a few voices or has broad participation. The profile also includes the top 10 users 
132 | and top 10 shared urls, with similar sparklines.
133 | 
134 | Note: the sparklines are generated by [pysparklines](https://pypi.python.org/pypi/pysparklines), 
135 | using Unicode block characters. If they have an
136 | uneven baseline, it's the fault of the font. On a Mac, I find that Menlo Regular gives a
137 | good presentation in the terminal.
138 | 
139 | ## D3 visualizations
140 | 
141 | Some utilities to generate [D3.js](https://d3js.org/) visualizations of aspects of a collection
142 | of tweets are provided. Use "--output=json" or "--output=csv" to output the data for use with 
143 | other D3 examples, or "--help" for other options.
144 | 
145 | ### d3graph.py
146 | 
147 | A directed graph of mentions or retweets, in which nodes are users and
148 | arrows point from the original user to the user who mentions or retweets
149 | them:
150 | 
151 |     % d3graph.py --mode mentions projects/nasa > projects/nasa/nasa-directed-mentions.html
152 |     % d3graph.py --mode retweets projects/nasa > projects/nasa/nasa-directed-retweets.html
153 |     % d3graph.py --mode replies projects/nasa > projects/nasa/nasa-directed-replies.html
154 |     
155 | ### d3cotag.py
156 | 
157 | An undirected graph of co-occurring hashtags:
158 | 
159 |     % d3cotag.py projects/nasa > projects/nasa/nasa-cotags.html
160 |     
161 | A threshold can be specified with "-t": hashtags whose number of
162 | occurrences falls below this will not be linked. Instead, if "-k" is set,
163 | they will be replaced with the pseudo-hashtag "-OTHER". Hashtags can be
164 | excluded with "-e" (takes a comma-delimited list). If the tweets were
165 | harvested by a search for a single hashtag then it's a good idea to
166 | exclude that tag, since every other tag will link to it.
167 |     
168 | ### d3timebar.py
169 | 
170 | A bar chart timeline with arbitrary intervals, here five minutes:
171 | 
172 |     % d3times.py -a -t local -i 5M projects/nasa > projects/nasa/nasa-timebargraph.html
173 | 
174 | [Examples](https://www.wallandbinkley.com/twarc/bill10/)
175 | 
176 | The output timezone is specified by "-t"; the interval is specified by "-i",
177 | using the [standard
178 | abbreviations](https://docs.python.org/2/library/time.html#time.strftime
179 | ): seconds = S, minutes = M, hours = H, days = d, months = m, years = Y.
180 | The example above uses five-minute intervals. Output may be aggregated
181 | using "-a": each row has a time value and a count. Note that if you are
182 | generating the html example, you must use "-a".
183 | 
184 | ### d3wordcloud.py
185 | 
186 | An animated wordcloud, in which words are added and removed according to 
187 | changes in frequency over time. 
188 | 
189 |     % d3wordcloud.py -t local -i 1H projects/nasa > projects/nasa/nasa-wordcloud.html
190 |     
191 | [Example](https://www.wallandbinkley.com/twarc/c4l15/animatedwordcloud.html)
192 |     
193 | The optional "-t" control timezone and "-i" controls interval, as in `d3timebar.py`. Start and end 
194 | timestamps may be set with "-s" and "-e". 
195 | 
196 | This script calls a [fork](https://github.com/pbinkley/d3-cloud) of Jason Davies' 
197 | [d3-cloud](https://github.com/jasondavies/d3-cloud) project. The forked version attempts
198 | to keep the carried-over words in transitions close to their previous position. 
199 | 
200 | ## Exploring D3 Examples
201 | 
202 | The json and csv outputs can be used to view your data in D3 example
203 | visualizations with minimal fuss. There are many many examples to be
204 | explored; Mike Bostock's
205 | [Gallery](https://github.com/mbostock/d3/wiki/Gallery) is a good place
206 | to start. Here's a worked example, using Bostock's [Zoomable Timeline
207 | Area
208 | Chart](https://mbostock.github.io/d3/talk/20111018/area-gradient.html).
209 | It assumes no knowledge of D3.
210 | 
211 | First, look at the data input. In line 137 this example loads a csv file 
212 | 
213 |     d3.csv("flights-departed.csv", function(data) {
214 | 
215 | The [csv file](https://mbostock.github.io/d3/talk/20111018/flights-departed.csv) looks like this: 
216 | 
217 |     date,value
218 |     1988-01-01,12681
219 |     ...
220 | 
221 | We can easily generate a csv file that matches that format:
222 | 
223 |     % ./d3times.py -a -i 1d -o csv
224 | 
225 | (I.e. aggregate, one-day interval, output csv). We then just need to edit the
226 | output to make the column headers match the original csv,
227 | i.e. change them to "date,value".
228 | 
229 | We also need to check the way the example loads scripts and css assets,
230 | especially the D3 library. In this case it expects a local copy:
231 | 
232 |     <script type="text/javascript" src="d3/d3.js"></script>
233 |     <script type="text/javascript" src="d3/d3.csv.js"></script>
234 |     <script type="text/javascript" src="d3/d3.time.js"></script>
235 |     <link type="text/css" rel="stylesheet" href="style.css"/>
236 |     
237 | Either change those links to point to the original location, or save a
238 | local copy. (Note that if you're going to put your example online you'll
239 | want local copies of scripts, since the [same-origin policy](https://en.wikipedia.org/wiki/Same-origin_policy) 
240 | will prevent them from being loaded from the source).
241 | 
242 | Once you've matched your data to the example and made sure it can load the
243 | D3.js library, the example may work. In this case it doesn't - it shows
244 | an empty chart. The title "U.S. Commercial Flights, 1999-2001" and the
245 | horizontal scale explain why: it expects dates within a certain
246 | (pre-Twitter) range, and the x domain is hard-coded accordingly. The
247 | setting is easy to find, in line 146:
248 | 
249 |     x.domain([new Date(1999, 0, 1), new Date(2003, 0, 0)]);
250 |     
251 | Change those dates to include the date range of your data, and the
252 | example should work. Don't worry about matching your dates closely: the
253 | chart is zoomable, after all. Alternatively, you could borrow a snippet from the 
254 | template timebar.html to set the domain to match the earliest and latest
255 | dates in your data:
256 | 
257 | ```
258 | x.domain([
259 |     d3.min(values, function(d) {return d.name}), 
260 |     d3.max(values, function(d) {return d.name})
261 |  ]);
262 | ```
263 | 
264 | A typical Twarc harvest gets you a few days worth of tweets, so the
265 | day-level display of this example probably isn't very interesting. We're
266 | not bound by the time format of the example, however. We can see it in
267 | line 63:
268 | 
269 |     parse = d3.time.format("%Y-%m-%d").parse,
270 |     
271 | We can change that to parse at the minute interval: "%Y-%m-%d %H:%M",
272 | and generate our csv at the same interval with "-i 1M". With those
273 | changes we can zoom in until bars represent a minute's worth of tweets.
274 | 
275 | This example doesn't work perfectly: I see some odd artifacts around the
276 | bottom of the chart, as if the baseline were slightly above the x axis
277 | and small values are presented as negative. And it doesn't render in
278 | Chrome at all (Firefox and Safari are fine). The example is from 2011
279 | and uses an older version of the D3 library, and with some tinkering it
280 | could probably be updated and made functional. It serves to demonstrate,
281 | though, that only small changes and no knowledge of the complexities of D3 
282 | are needed to fit your data into an existing D3 example.
283 | 
284 | ## Adding Scripts
285 | 
286 | The heart of twarc-report is the `Profiler` class in `profiler.py`. The
287 | scripts pass json records from the twarc harvests to this class, and it
288 | tabulates some basic properties: number of tweets and authors, earliest
289 | and latest timestamp, etc. The scripts create their own profilers that
290 | inherit from this class and that process the extra fields etc. needed by
291 | the script. To add a new script, start by working out its profiler class
292 | to collect the data it needs from each tweet in the process() method,
293 | and to organize the output in the report() method.
294 | 
295 | The various output formats are generated by functions in `d3output.py`. 
296 | 
297 | License
298 | -------
299 | 
300 | * CC0
301 | 
302 | 


--------------------------------------------------------------------------------
/assets/d3.layout.cloud.js:
--------------------------------------------------------------------------------
  1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/
  2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf
  3 | 
  4 | var previouswords;
  5 | 
  6 | (function() {
  7 |   function cloud() {
  8 |     var size = [256, 256],
  9 |         text = cloudText,
 10 |         font = cloudFont,
 11 |         fontSize = cloudFontSize,
 12 |         fontStyle = cloudFontNormal,
 13 |         fontWeight = cloudFontNormal,
 14 |         rotate = cloudRotate,
 15 |         padding = cloudPadding,
 16 |         previousword = cloudPreviousword,
 17 |         spiral = archimedeanSpiral,
 18 |         words = [],
 19 |         timeInterval = Infinity,
 20 |         event = d3.dispatch("word", "end"),
 21 |         timer = null,
 22 |         cloud = {};
 23 | 
 24 |     cloud.start = function() {
 25 |       var board = zeroArray((size[0] >> 5) * size[1]),
 26 |           bounds = null,
 27 |           n = words.length,
 28 |           i = -1,
 29 |           tags = [],
 30 |           data = words.map(function(d, i) {
 31 |             d.text = text.call(this, d, i);
 32 |             d.font = font.call(this, d, i);
 33 |             d.style = fontStyle.call(this, d, i);
 34 |             d.weight = fontWeight.call(this, d, i);
 35 |             d.rotate = rotate.call(this, d, i);
 36 |             d.size = ~~fontSize.call(this, d, i);
 37 |             d.padding = padding.call(this, d, i);
 38 |             d.previousword = previousword.call(this, d, i);
 39 |             return d;
 40 |           })
 41 |           // sort update words to insert first
 42 |           .sort(function(a, b) {
 43 |             return 
 44 |                 (a.previousword && b.previousword) ? b.previousword.size - a.previousword.size : 
 45 |                 (a.previousword && !b.previousword) ? -1 : 
 46 |                 (!a.previousword && b.previousword) ? 1 : 
 47 |                     b.size - a.size;
 48 |         });
 49 | 
 50 |       if (timer) clearInterval(timer);
 51 |       timer = setInterval(step, 0);
 52 |       step();
 53 |       previouswords = words;
 54 |       return cloud;
 55 | 
 56 |       function step() {
 57 |         var start = +new Date,
 58 |             d;
 59 |         while (+new Date - start < timeInterval && ++i < n && timer) {
 60 |           d = data[i];
 61 |           // look for word in previouswords; if it's there, use d.x and d.y
 62 |           if (d.previousword) {
 63 |               d.x = (size[0] + d.previousword.x) >> 1;
 64 |               d.y = (size[1] + d.previousword.y) >> 1;
 65 |           }
 66 |           else {
 67 |               d.x = (size[0] * (Math.random() + .5)) >> 1;
 68 |               d.y = (size[1] * (Math.random() + .5)) >> 1;
 69 |           }
 70 |           cloudSprite(d, data, i);
 71 |           if (d.hasText && place(board, d, bounds)) {
 72 |             tags.push(d);
 73 |             event.word(d);
 74 |             if (bounds) cloudBounds(bounds, d);
 75 |             else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}];
 76 |             // Temporary hack
 77 |             d.x -= size[0] >> 1;
 78 |             d.y -= size[1] >> 1;
 79 |           }
 80 |         }
 81 |         if (i >= n) {
 82 |           cloud.stop();
 83 |           event.end(tags, bounds);
 84 |         }
 85 |       }
 86 |     }
 87 | 
 88 |     cloud.stop = function() {
 89 |       if (timer) {
 90 |         clearInterval(timer);
 91 |         timer = null;
 92 |       }
 93 |       return cloud;
 94 |     };
 95 | 
 96 |     cloud.timeInterval = function(x) {
 97 |       if (!arguments.length) return timeInterval;
 98 |       timeInterval = x == null ? Infinity : x;
 99 |       return cloud;
100 |     };
101 | 
102 |     function place(board, tag, bounds) {
103 |       var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}],
104 |           startX = tag.x,
105 |           startY = tag.y,
106 |           maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]),
107 |           s = spiral(size),
108 |           dt = Math.random() < .5 ? 1 : -1,
109 |           t = -dt,
110 |           dxdy,
111 |           dx,
112 |           dy;
113 | 
114 |       while (dxdy = s(t += dt)) {
115 |         dx = ~~dxdy[0];
116 |         dy = ~~dxdy[1];
117 | 
118 |         if (Math.min(dx, dy) > maxDelta) break;
119 | 
120 |         tag.x = startX + dx;
121 |         tag.y = startY + dy;
122 | 
123 |         if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 ||
124 |             tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue;
125 |         // TODO only check for collisions within current bounds.
126 |         if (!bounds || !cloudCollide(tag, board, size[0])) {
127 |           if (!bounds || collideRects(tag, bounds)) {
128 |             var sprite = tag.sprite,
129 |                 w = tag.width >> 5,
130 |                 sw = size[0] >> 5,
131 |                 lx = tag.x - (w << 4),
132 |                 sx = lx & 0x7f,
133 |                 msx = 32 - sx,
134 |                 h = tag.y1 - tag.y0,
135 |                 x = (tag.y + tag.y0) * sw + (lx >> 5),
136 |                 last;
137 |             for (var j = 0; j < h; j++) {
138 |               last = 0;
139 |               for (var i = 0; i <= w; i++) {
140 |                 board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0);
141 |               }
142 |               x += sw;
143 |             }
144 |             delete tag.sprite;
145 |             return true;
146 |           }
147 |         }
148 |       }
149 |       return false;
150 |     }
151 | 
152 |     cloud.words = function(x) {
153 |       if (!arguments.length) return words;
154 |       words = x;
155 |       return cloud;
156 |     };
157 | 
158 |     cloud.size = function(x) {
159 |       if (!arguments.length) return size;
160 |       size = [+x[0], +x[1]];
161 |       return cloud;
162 |     };
163 | 
164 |     cloud.font = function(x) {
165 |       if (!arguments.length) return font;
166 |       font = d3.functor(x);
167 |       return cloud;
168 |     };
169 | 
170 |     cloud.fontStyle = function(x) {
171 |       if (!arguments.length) return fontStyle;
172 |       fontStyle = d3.functor(x);
173 |       return cloud;
174 |     };
175 | 
176 |     cloud.fontWeight = function(x) {
177 |       if (!arguments.length) return fontWeight;
178 |       fontWeight = d3.functor(x);
179 |       return cloud;
180 |     };
181 | 
182 |     cloud.rotate = function(x) {
183 |       if (!arguments.length) return rotate;
184 |       rotate = d3.functor(x);
185 |       return cloud;
186 |     };
187 | 
188 |     cloud.text = function(x) {
189 |       if (!arguments.length) return text;
190 |       text = d3.functor(x);
191 |       return cloud;
192 |     };
193 | 
194 |     cloud.spiral = function(x) {
195 |       if (!arguments.length) return spiral;
196 |       spiral = spirals[x + ""] || x;
197 |       return cloud;
198 |     };
199 | 
200 |     cloud.fontSize = function(x) {
201 |       if (!arguments.length) return fontSize;
202 |       fontSize = d3.functor(x);
203 |       return cloud;
204 |     };
205 | 
206 |     cloud.padding = function(x) {
207 |       if (!arguments.length) return padding;
208 |       padding = d3.functor(x);
209 |       return cloud;
210 |     };
211 | 
212 |     cloud.previousword = function(x) {
213 |       if (!arguments.length) return previousword;
214 |       previousword = d3.functor(x);
215 |       return cloud;
216 |     }
217 | 
218 |     return d3.rebind(cloud, event, "on");
219 |   }
220 | 
221 |   function cloudText(d) {
222 |     return d.text;
223 |   }
224 | 
225 |   function cloudFont() {
226 |     return "serif";
227 |   }
228 | 
229 |   function cloudFontNormal() {
230 |     return "normal";
231 |   }
232 | 
233 |   function cloudFontSize(d) {
234 |     return Math.sqrt(d.value);
235 |   }
236 | 
237 |   function cloudRotate() {
238 |     return (~~(Math.random() * 6) - 3) * 30;
239 |   }
240 | 
241 |   function cloudPadding() {
242 |     return 1;
243 |   }
244 | 
245 |   function cloudPreviousword() {
246 |     // look up previousword in previouswords
247 |     if (previouswords) {
248 |         var thisword = arguments[0]["text"];
249 |         var result  = previouswords.filter(function(o){return o.text == thisword;} );
250 |     }
251 |     return result? result[0] : null; // or undefined
252 |   }
253 | 
254 |   // Fetches a monochrome sprite bitmap for the specified text.
255 |   // Load in batches for speed.
256 |   function cloudSprite(d, data, di) {
257 |     if (d.sprite) return;
258 |     c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio);
259 |     var x = 0,
260 |         y = 0,
261 |         maxh = 0,
262 |         n = data.length;
263 |     --di;
264 |     while (++di < n) {
265 |       d = data[di];
266 |       c.save();
267 |       c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font;
268 |       var w = c.measureText(d.text + "m").width * ratio,
269 |           h = d.size << 1;
270 |       if (d.rotate) {
271 |         var sr = Math.sin(d.rotate * cloudRadians),
272 |             cr = Math.cos(d.rotate * cloudRadians),
273 |             wcr = w * cr,
274 |             wsr = w * sr,
275 |             hcr = h * cr,
276 |             hsr = h * sr;
277 |         w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5;
278 |         h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr));
279 |       } else {
280 |         w = (w + 0x1f) >> 5 << 5;
281 |       }
282 |       if (h > maxh) maxh = h;
283 |       if (x + w >= (cw << 5)) {
284 |         x = 0;
285 |         y += maxh;
286 |         maxh = 0;
287 |       }
288 |       if (y + h >= ch) break;
289 |       c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio);
290 |       if (d.rotate) c.rotate(d.rotate * cloudRadians);
291 |       c.fillText(d.text, 0, 0);
292 |       if (d.padding) c.lineWidth = 2 * d.padding, c.strokeText(d.text, 0, 0);
293 |       c.restore();
294 |       d.width = w;
295 |       d.height = h;
296 |       d.xoff = x;
297 |       d.yoff = y;
298 |       d.x1 = w >> 1;
299 |       d.y1 = h >> 1;
300 |       d.x0 = -d.x1;
301 |       d.y0 = -d.y1;
302 |       d.hasText = true;
303 |       x += w;
304 |     }
305 |     var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data,
306 |         sprite = [];
307 |     while (--di >= 0) {
308 |       d = data[di];
309 |       if (!d.hasText) continue;
310 |       var w = d.width,
311 |           w32 = w >> 5,
312 |           h = d.y1 - d.y0;
313 |       // Zero the buffer
314 |       for (var i = 0; i < h * w32; i++) sprite[i] = 0;
315 |       x = d.xoff;
316 |       if (x == null) return;
317 |       y = d.yoff;
318 |       var seen = 0,
319 |           seenRow = -1;
320 |       for (var j = 0; j < h; j++) {
321 |         for (var i = 0; i < w; i++) {
322 |           var k = w32 * j + (i >> 5),
323 |               m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0;
324 |           sprite[k] |= m;
325 |           seen |= m;
326 |         }
327 |         if (seen) seenRow = j;
328 |         else {
329 |           d.y0++;
330 |           h--;
331 |           j--;
332 |           y++;
333 |         }
334 |       }
335 |       d.y1 = d.y0 + seenRow;
336 |       d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32);
337 |     }
338 |   }
339 | 
340 |   // Use mask-based collision detection.
341 |   function cloudCollide(tag, board, sw) {
342 |     sw >>= 5;
343 |     var sprite = tag.sprite,
344 |         w = tag.width >> 5,
345 |         lx = tag.x - (w << 4),
346 |         sx = lx & 0x7f,
347 |         msx = 32 - sx,
348 |         h = tag.y1 - tag.y0,
349 |         x = (tag.y + tag.y0) * sw + (lx >> 5),
350 |         last;
351 |     for (var j = 0; j < h; j++) {
352 |       last = 0;
353 |       for (var i = 0; i <= w; i++) {
354 |         if (((last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0))
355 |             & board[x + i]) return true;
356 |       }
357 |       x += sw;
358 |     }
359 |     return false;
360 |   }
361 | 
362 |   function cloudBounds(bounds, d) {
363 |     var b0 = bounds[0],
364 |         b1 = bounds[1];
365 |     if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0;
366 |     if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0;
367 |     if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1;
368 |     if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1;
369 |   }
370 | 
371 |   function collideRects(a, b) {
372 |     return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y;
373 |   }
374 | 
375 |   function archimedeanSpiral(size) {
376 |     var e = size[0] / size[1];
377 |     return function(t) {
378 |       return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)];
379 |     };
380 |   }
381 | 
382 |   function rectangularSpiral(size) {
383 |     var dy = 4,
384 |         dx = dy * size[0] / size[1],
385 |         x = 0,
386 |         y = 0;
387 |     return function(t) {
388 |       var sign = t < 0 ? -1 : 1;
389 |       // See triangular numbers: T_n = n * (n + 1) / 2.
390 |       switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) {
391 |         case 0:  x += dx; break;
392 |         case 1:  y += dy; break;
393 |         case 2:  x -= dx; break;
394 |         default: y -= dy; break;
395 |       }
396 |       return [x, y];
397 |     };
398 |   }
399 | 
400 |   // TODO reuse arrays?
401 |   function zeroArray(n) {
402 |     var a = [],
403 |         i = -1;
404 |     while (++i < n) a[i] = 0;
405 |     return a;
406 |   }
407 | 
408 |   var cloudRadians = Math.PI / 180,
409 |       cw = 1 << 11 >> 5,
410 |       ch = 1 << 11,
411 |       canvas,
412 |       ratio = 1;
413 | 
414 |   if (typeof document !== "undefined") {
415 |     canvas = document.createElement("canvas");
416 |     canvas.width = 1;
417 |     canvas.height = 1;
418 |     ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2);
419 |     canvas.width = (cw << 5) / ratio;
420 |     canvas.height = ch / ratio;
421 |   } else {
422 |     // Attempt to use node-canvas.
423 |     canvas = new Canvas(cw << 5, ch);
424 |   }
425 | 
426 |   var c = canvas.getContext("2d"),
427 |       spirals = {
428 |         archimedean: archimedeanSpiral,
429 |         rectangular: rectangularSpiral
430 |       };
431 |   c.fillStyle = c.strokeStyle = "red";
432 |   c.textAlign = "center";
433 | 
434 |   if (typeof module === "object" && module.exports) module.exports = cloud;
435 |   else (d3.layout || (d3.layout = {})).cloud = cloud;
436 | })();
437 | 


--------------------------------------------------------------------------------
/profiler.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from dateutil import parser
  3 | import datetime
  4 | import pytz # $ pip install pytz
  5 | from collections import Counter
  6 | import operator
  7 | import re
  8 | import d3output
  9 | import fileinput
 10 | import json
 11 | import os
 12 | import glob
 13 | import ast
 14 | 
 15 | class Profiler:
 16 |     def __init__(self, opts):
 17 |         for k, v in opts.items():
 18 |             setattr(self, k, v)
 19 | 
 20 |         # set defaults
 21 |         if not("labelFormat" in opts):
 22 |             self.labelFormat = "%Y-%m-%d %H:%M:%S %Z"
 23 |         if not("tz" in opts):
 24 |             self.tz = pytz.UTC
 25 |         if not("extended" in opts):
 26 |             self.extended = False
 27 |         if not("blocks" in opts):
 28 |             self.blocks = ["all"]
 29 |         if "all" in self.blocks:
 30 |             self.blocks.extend(["topusers", "tophashtags", "topurls", "topimageurls", "urls", 
 31 |                 "imageurls"])
 32 |             
 33 |         # initialize 
 34 |         self.count = 0
 35 |         self.typecounts = {"original": 0, "retweet": 0, "quote": 0, "reply": 0}
 36 |         self.originalcount = 0
 37 |         self.retweetcount = 0
 38 |         self.quotecount = 0
 39 |         self.quoteandretweetcount = 0
 40 |         self.replycount = 0
 41 |         self.geocount = 0
 42 |         self.earliest = ""
 43 |         self.latest = ""
 44 |         self.users = Counter()
 45 |         if self.extended:
 46 |             if "tophashtags" in self.blocks: 
 47 |                 self.hashtags = Counter()
 48 |                 self.hashtagcount = 0
 49 |             if "urls" in self.blocks or "topurls" in self.blocks:
 50 |                 self.urls = Counter()
 51 |                 self.urlcount = 0
 52 |             if "imageurls" in self.blocks or "topimageurls" in self.blocks:
 53 |                 self.imageurls = Counter()
 54 |                 self.imageurlcount = 0
 55 |             
 56 |         
 57 |     def adduser(self, user, tweet):
 58 |         self.users[user] += 1
 59 |         
 60 |     def addurl(self, url):
 61 |         self.urls[url] += 1
 62 |         
 63 |     def addhashtag(self, hashtag):
 64 |         self.hashtags[hashtag] += 1
 65 | 
 66 |     def addimageurl(self, imageurl):
 67 |         self.imageurls[imageurl] += 1
 68 | 
 69 |     def process(self, tweet):
 70 |         self.count += 1
 71 |         tweettype = ""
 72 |         if "retweeted_status" in tweet:
 73 |             tweettype = "retweet"
 74 |         elif tweet["is_quote_status"]:
 75 |             tweettype = "quote"
 76 |         elif tweet["in_reply_to_status_id"] != None:
 77 |             tweettype = "reply"
 78 |         else: tweettype = "original"
 79 |         if tweet.get("geo") != None:
 80 |             self.geocount += 1
 81 |         self.typecounts[tweettype] += 1
 82 | 
 83 |         self.created_at = parser.parse(tweet["created_at"])
 84 |         if self.earliest == "" or self.earliest > self.created_at:
 85 |             self.earliest = self.created_at
 86 |         if self.latest == "" or self.latest < self.created_at:
 87 |             self.latest = self.created_at
 88 |         user = tweet["user"]["screen_name"]
 89 |         self.adduser(user, tweet)
 90 |         if self.extended:
 91 |             # handle urls
 92 |             if "urls" in self.blocks or "topurls" in self.blocks:
 93 |                 if len(tweet["entities"]["urls"]) > 0:
 94 |                     for url in tweet["entities"]["urls"]:
 95 |                         self.addurl(url["expanded_url"])
 96 |                     self.urlcount += 1
 97 |                 
 98 |             # handle hashtags
 99 |             if "hashtags" in self.blocks or "tophashtags" in self.blocks:
100 |                 if len(tweet["entities"]["hashtags"]) > 0:
101 |                     for tag in tweet["entities"]["hashtags"]:
102 |                         # hashtags are not case sensitive, so lower() to dedupe
103 |                         # or just leave it and accept dupes?
104 |                         self.addhashtag(tag["text"].lower())
105 |                     self.hashtagcount += 1
106 |             
107 |             # handle imageurls
108 |             if "imageurls" in self.blocks or "topimageurls" in self.blocks:
109 |                 if "media" in tweet["entities"]:
110 |                     hasimageurl = False
111 |                     for media in tweet["entities"]["media"]:
112 |                         if media["type"] == "photo":
113 |                             self.addimageurl(media["media_url"])
114 |                             hasimageurl = True
115 |                     if hasimageurl:
116 |                         self.imageurlcount += 1
117 | 
118 |     def gettweets(self, opts, args):
119 |         # prepare to serialize opts and args as json
120 |         # converting opts to str produces string with single quotes,
121 |         # but json requires double quotes
122 |         self.optsdict = ast.literal_eval(str(opts))
123 |         self.argsdict = ast.literal_eval(str(args))
124 | 
125 |         # if args has one value, check whether it's a directory
126 |         if len(args) == 1 and os.path.isdir(args[0]):
127 |             # add path to metadata file and tweets
128 |             self.metadatafile = os.path.join(args[0] , "metadata.json")
129 |             args = glob.glob(os.path.join(args[0], "data/tweets/tweets-*.json"))
130 |         else:
131 |             # args must be files, so calculate path to metadata file based on 
132 |             # dir of first input file
133 |             self.metadatafile = os.path.join(os.path.dirname(args[0]), "metadata.json")
134 |         for line in fileinput.input(args):
135 |             try:
136 |                 tweet = json.loads(line)
137 |                 self.process(tweet)
138 |             except ValueError as e:
139 |                 sys.stderr.write("uhoh: %s\n" % e)
140 |      
141 |     def tops(self, list, title):
142 |         # given a list of name-value pairs, return the top 10 pairs by value,
143 |         # and a list of integers representing the percent of total value
144 |         # held by each of 10 slices
145 |         
146 |         totalcount = len(list)
147 |         totalvalue = int(sum(list.values()))
148 |         sorted = list.most_common()
149 |         
150 |         top = sorted[:10]
151 |         top_result = []
152 |         for name, value in top:
153 |             top_result.append({"name": name, "value": value})
154 | 
155 |         step = float(totalcount) / 10
156 |         percentiles = []
157 |         for i in range(0, 10):
158 |             start = int(i * step)
159 |             end = int((i + 1) * step)
160 |             slicecount = end - start
161 |             if slicecount > 0:
162 |                 # weight the slice value as if the slice were an even 10th of the list
163 |                 weight = 10 / (float(slicecount) / totalcount)
164 |                 slicevalue = sum(v for k,v in sorted[start:end])
165 |                 percentile = int(round(float(slicevalue) / totalvalue * weight))
166 |             else:
167 |                 percentile = 0
168 |             percentiles.append(percentile)
169 |         return {"top" + title: top_result, title+"percentiles": percentiles}
170 |     
171 |     def report(self):
172 |         local_earliest = self.tz.normalize(self.earliest.astimezone(self.tz)).strftime(self.labelFormat)
173 |         local_latest = self.tz.normalize(self.latest.astimezone(self.tz)).strftime(self.labelFormat)
174 |         result = {"count": self.count, 
175 |             "originalcount": self.typecounts["original"], 
176 |             "retweetcount": self.typecounts["retweet"], 
177 |             "quotecount": self.typecounts["quote"], 
178 |             "replycount": self.typecounts["reply"], 
179 |             "geocount": self.geocount,
180 |             "earliest": local_earliest, 
181 |             "latest": local_latest, 
182 |             "usercount": len(self.users),
183 |             "opts": self.optsdict,
184 |             "args": self.argsdict,
185 |             "metadatafile": self.metadatafile}
186 |         if self.extended:
187 |             if "topusers" in self.blocks:
188 |                 result.update(self.tops(self.users, "users"))
189 |             if "tophashtags" in self.blocks:
190 |                 result.update(self.tops(self.hashtags, "hashtags"))
191 |             if "topurls" in self.blocks:
192 |                 result.update(self.tops(self.urls, "urls"))
193 |             if "urls" in self.blocks:
194 |                 result.update({"urlcount": self.urlcount, "urls": len(self.urls), 
195 |                     "imageurlcount": self.imageurlcount, "imageurls": len(self.imageurls),
196 |                     "hashtagcount": self.hashtagcount, "hashtags": len(self.hashtags)})
197 |             if "topimageurls" in self.blocks:
198 |                 result.update(self.tops(self.imageurls, "imageurls"))
199 |             if "imageurls" in self.blocks:
200 |                 result.update({"imageurlslist": self.imageurls})
201 |         return result
202 |             
203 | class LinkNodesProfiler(Profiler):
204 |     def __init__(self, opts):
205 |         Profiler.__init__(self, opts)
206 |         self.nodes = {}
207 |         self.nodeid = 0
208 | 
209 | # nodes will end up as 
210 | #  {"userA": 
211 | #    {"id": 27,
212 | #    "source": 0, 
213 | #    "target": 1,
214 | #    "links": {
215 | #        "userB": 3,
216 | #        "userC": 1
217 | #    }
218 | #    
219 | # Meaning that userA mentions userB 3 times, and userB mentions userA once.
220 | # We gather the nodes in a dictionary so that we can look up terms to update 
221 | # counts, but at the end we convert the dictionary into a list sorted by id
222 | # so that the positions in the list correspond to the ids, as D3 requires.
223 | 
224 |     def addlink(self, source, target):
225 |         if not source in self.nodes:
226 |             self.nodes[source] = {"name": source, "id": self.nodeid, "tweetcount": 0, 
227 |                 "source": 1, "target": 0, "links": {}}
228 |             self.nodeid += 1
229 |         else:
230 |             self.nodes[source]["source"] += 1
231 | 
232 |         if not target in self.nodes:
233 |             targetid = self.nodeid
234 |             self.nodes[target] = {"name": target, "id": self.nodeid, "tweetcount": 0, 
235 |                 "source": 0, "target": 1, "links": {}}
236 |             self.nodeid += 1
237 |         else:            
238 |             self.nodes[target]["target"] += 1
239 |             targetid = self.nodes[target]["id"]
240 | 
241 |         linklist = self.nodes[source]["links"]
242 |         if not target in linklist:
243 |             linklist[target] = {"count": 1, "id": targetid}
244 |         else:
245 |             linklist[target]["count"] += 1
246 |             
247 |     def addsingle(self, name):
248 |         if not name in self.nodes:
249 |             self.nodes[name] = {"name": name, "id": self.nodeid, "tweetcount": 1, 
250 |                 "source": 0, "target": 0, "links": {}}
251 |             self.nodeid += 1        
252 | 
253 |     def report(self):
254 |         if hasattr(self, "graph"):
255 |             self.optsdict["graph"] = self.graph
256 |         if hasattr(self, "field"):
257 |             self.optsdict["field"] = self.field
258 |         profile = Profiler.report(self)
259 |         # convert nodes dictionary to a list, sorted by id
260 |         nodelistkeys = sorted(self.nodes, key=lambda w: self.nodes[w]["id"])
261 |         nodelist = []
262 |         for key in nodelistkeys:
263 |             nodelist.append(self.nodes[key])
264 |         return {"profile": profile, "nodes": nodelist}
265 | 
266 | class TimeProfiler(Profiler):
267 |     # interval, in milliseconds
268 |     intervalFormats = {
269 |         "S": {"name": "second", "format": "%Y-%m-%d %H:%M:%S", "interval": 1000},
270 |         "M": {"name": "minute", "format": "%Y-%m-%d %H:%M", "interval":  1000 * 60},
271 |         "H": {"name": "hour", "format": "%Y-%m-%d %H", "interval":  1000 * 60 * 60},
272 |         "d": {"name": "day", "format": "%Y-%m-%d", "interval":  1000 * 60 * 60 * 24},
273 |         "m": {"name": "month", "format": "%Y-%m", "interval":  1000 * 60 * 60 * 24 * 28},
274 |         "Y": {"name": "year", "format": "%Y-%m", "interval":  1000 * 60 * 60 * 24 * 365}
275 |     }
276 |     def __init__(self, opts):
277 |         Profiler.__init__(self, opts)
278 |         try:
279 |             self.intervalParts = re.search("([0-9]*)([^0-9]*)", self.intervalStr)
280 |             if self.intervalParts.group(1) == "":
281 |                 self.intervalCount = 1
282 |             else:
283 |                 self.intervalCount = int(self.intervalParts.group(1))
284 |             self.intervalUnit = self.intervalParts.group(2)
285 |             self.interval = self.intervalCount * self.intervalFormats[self.intervalUnit]["interval"]
286 |             self.format = self.intervalFormats[self.intervalUnit]["format"]
287 |             self.intervalLabel = str(self.intervalCount) + " " + self.intervalFormats[self.intervalUnit]["name"]
288 |             if self.intervalCount > 1:
289 |                 self.intervalLabel += "s"
290 | 
291 |         except ValueError as e:
292 |             sys.stderr.write("uhoh: %s\n" % e)
293 | 
294 |         # gather in a dict with count if aggregating, otherwise in a list
295 |         if self.aggregate:
296 |             self.items = {}
297 |         else:
298 |             self.items = []
299 | 
300 |     def process(self, tweet):
301 |         Profiler.process(self, tweet)
302 |         created_at = parser.parse(tweet["created_at"])
303 |         local_dt = self.tz.normalize(created_at.astimezone(self.tz))
304 |         if self.intervalStr != "":
305 |             if self.intervalUnit == "S":
306 |                 local_dt = local_dt - datetime.timedelta(seconds=local_dt.second % int(self.intervalCount))
307 |             elif self.intervalUnit == "M":
308 |                 local_dt = local_dt - datetime.timedelta(minutes=local_dt.minute % int(self.intervalCount))
309 |             elif self.intervalUnit == "H":
310 |                 local_dt = local_dt - datetime.timedelta(hours=local_dt.hour % int(self.intervalCount))
311 |         # otherwise use format to aggregate values - though this treats intervalCount as 1
312 |         result = local_dt.strftime(self.format)
313 |         if self.aggregate:
314 |             self.items[result] = self.items.get(result, 0) + 1
315 |         else:
316 |             self.items.append(result)
317 |         # return the time slice label
318 |         return result
319 |             
320 |     def report(self):
321 |         self.optsdict["interval"] = self.interval
322 |         self.optsdict["format"] = self.format
323 |         self.optsdict["intervalLabel"] = self.intervalLabel
324 |         profile = Profiler.report(self)
325 |         if self.output == "csv":
326 |             if self.aggregate:
327 |                 values = d3output.namevaluecsv(self.items)
328 |             else:
329 |                 values = d3output.valuecsv(self.items)
330 |             return values
331 |         else:
332 |             if self.aggregate:
333 |                 values = d3output.namevaluejson(self.items)
334 |             else:
335 |                 values = d3output.valuejson(self.items)
336 |             return {"profile": profile, "values": values}
337 | 


--------------------------------------------------------------------------------
/templates/graph.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <meta charset="utf-8">
  3 | <title>$TITLE$</title>
  4 | <script src="https://d3js.org/d3.v3.min.js"></script>
  5 | <style type="text/css">
  6 | 
  7 | div#sidebar {
  8 |     position:fixed;
  9 |     top:0px;
 10 |     left:0px;
 11 |     width: 250px;
 12 |     height:400px;
 13 |     overflow:scroll;
 14 | 
 15 |     border: 1px solid black;
 16 |     background: #eee;
 17 |     padding: 5px;
 18 |     }
 19 | 
 20 | div#sidebar_lists {
 21 |     font-size: 75%;
 22 | }
 23 | #sidebar_term {
 24 |     clear: both;
 25 |     }
 26 |     
 27 | #termselect {
 28 |     width: 100%;
 29 | }
 30 | div.sidebar_list {
 31 |     width: 50%;
 32 |     float:left;
 33 | }
 34 | 
 35 | div#argsopts {
 36 |     float: right;
 37 | }
 38 | div#metadata {
 39 |     position: fixed;
 40 |     top: 0;
 41 |     right: 0;
 42 |     border: 1px solid black;
 43 |     background: #eee;
 44 |     padding: 5px;
 45 | }
 46 | div.metadata {
 47 |     font-size: 75%;
 48 | }
 49 | 
 50 | path.link {
 51 |   fill: none;
 52 |   stroke: #666;
 53 |   stroke-width: 1.5px;
 54 | }
 55 | 
 56 | marker#plain {
 57 |   fill: #666;
 58 | }
 59 | 
 60 | marker#in {
 61 |   fill: #44f;
 62 | }
 63 | 
 64 | marker#out {
 65 |   fill: #0f0;
 66 | }
 67 | 
 68 | path.link.hlOut {
 69 |     stroke: #0f0;
 70 |     stroke-width: 2px;
 71 | }
 72 | path.link.hlIn, path.link.hlUndirected {
 73 |     stroke: #44f;
 74 |     stroke-width: 2px;
 75 | }
 76 | 
 77 | 
 78 | circle {
 79 |   fill: #eee;
 80 |   stroke: #333;
 81 |   stroke-width: 1.5px;
 82 | }
 83 | circle.hl {
 84 |     stroke: #f00;
 85 |     fill: #f00;
 86 | }
 87 | circle.hlIn, circle.hlUndirected.hlUndirected, circle.hlUndirected {
 88 |     stroke: #0f0;
 89 |     fill: #cfc;
 90 | }
 91 | circle.hlOut {
 92 |     stroke: #66f;
 93 |     fill: #ccf;
 94 | }
 95 | circle.hlout.hlIn {
 96 |     stroke: #66f;
 97 |     fill: #cfc;
 98 | }
 99 | 
100 | text {
101 |   font: 10px sans-serif;
102 |   pointer-events: none;
103 | }
104 | 
105 | text.shad {
106 |   stroke: #fff;
107 |   stroke-width: 3px;
108 |   stroke-opacity: .8;
109 | }
110 | g.hl text.shad {
111 |     stroke: #fcc;
112 |       stroke-opacity: 1;
113 | }
114 | g.hlIn text.shad, g.hlUndirected text.shad {
115 |     stroke: #cfc;
116 |       stroke-opacity: 1;
117 | }
118 | g.hlOut text.shad {
119 |     stroke: #ddf;
120 |       stroke-opacity: 1;
121 | }
122 | 
123 | 
124 | ul { margin: 0; padding: 0; list-style-type: none;}
125 |     </style>
126 |   </head>
127 |   <body>
128 |   
129 |     <div id="sidebar">
130 |         <div id="sidebar_selectdiv"></div>
131 |         <h2 id="sidebar_term"></h2>
132 |         <div id="sidebar_lists">
133 |             <div class="sidebar_list" id="sidebar_inlist"></div>
134 |             <div class="sidebar_list" id="sidebar_outlist"></div>
135 |         </div>
136 |     </div>
137 |     <svg id="chart"></svg>
138 |     <div id="metadata"></div>
139 |   
140 |     <script type="text/javascript">
141 |     var data = $DATA$;
142 |     var metadata = data.metadata;
143 |     var profile = data.profile;
144 |     var opts = profile.opts;
145 |     
146 |     var mode = opts["mode"];
147 |     var threshold = opts["threshold"];
148 |     var graph = opts["graph"];
149 |     var field = opts["field"];
150 |     var links = data["links"];
151 |     var nodes = data["nodes"];
152 |     
153 | 
154 |     html = "<strong>" + metadata["title"] + "</strong> | Search: " + metadata["search"];
155 |         html += " | Threshold: " + threshold;
156 |         if (mode) html += " | Mode: " + mode;
157 |         html += "<br/>";
158 |     html += profile.count + " tweets from " + profile.usercount + " users, including " + profile.retweetcount + " retweets (" + (profile.retweetcount / profile.count * 100).toFixed(2) + "%), "
159 |     html += "and " + profile.geocount + " with geo (" + (profile.geocount / profile.count * 100).toFixed(2) + "%): "
160 |     html += profile.earliest + " to " + profile.latest
161 |     d3.select("#metadata").html(html);
162 | 
163 |     
164 |     switch (field) {
165 |         case "user": twitterlinkroot = "https://twitter.com/";
166 |             break;
167 |         case "hashtag": twitterlinkroot = "https://twitter.com/hashtag/";
168 |             break;
169 |     }
170 |     
171 |     // in directed graph we want different styles for in and out links, but
172 |     // in undirected they should be the same.
173 |     var inLinkClass = (graph == 'directed') ? "In" : "Undirected";
174 |     var outLinkClass = (graph == 'directed') ? "Out" : "Undirected";
175 |     
176 |     // hide the outlink div if this is an undirected graph
177 |     if (graph == 'undirected')
178 |          d3.select("#sidebar_outlist").remove();
179 |     
180 |     function twitterlink(n) {
181 |         return "<a href='" + twitterlinkroot + n + "'>" + n + "</n>";
182 |     }
183 |     
184 |     function selectlink(n) {
185 |         return "<a href='javascript:termselect(\"" + n + "\")'>" + n + "</n>";
186 |     }
187 |     
188 |     function listsort(l) {
189 |         sortl = [];
190 |         for (var a = 0; a < l.length; a++) {
191 |             sortl.push(l[a].toLowerCase() + " " + l[a]);
192 |         }
193 |         sortl = sortl.sort();
194 |         for (var a = 0; a < sortl.length; a++) {
195 |             sortl[a] = sortl[a].split(" ")[1];
196 |         }
197 |         return sortl;
198 |     }
199 |     
200 |     function listhtml(l) {
201 |         l = listsort(l);
202 |         var html = "<ul>";
203 |         for (var a = 0; a < l.length; a++) {
204 |             html += "<li>" + selectlink(l[a]) + "</li>";
205 |         }
206 |         html += "</ul>";
207 |         return html;
208 |     }
209 |     
210 |     function selecthtml(l) {
211 |         l = listsort(l);
212 |         html = "<select id='termselect' onchange='termselect(this.value)'>";
213 |         for (var a = 0; a < l.length; a++) {
214 |             html += "<option>" + l[a] + "</option>";
215 |         }
216 |         html += "</select>";
217 |         return html;
218 |     }
219 |     
220 |     function termselect(name) {
221 |            // working with:
222 |        //    this:
223 |        //    <circle class="" transform="translate(974.0229211897631,951.403518091633)" id="circle_RobertCBinkley" r="6">
224 |     //     <title>RobertCBinkley (1/1)</title>
225 |     //  </circle>
226 |     //  d:
227 |     //  {"name": "RobertCBinkley", "title": "RobertCBinkley (1/1)"}
228 |     //  path:
229 |     //        <path d="M952.2..." marker-end="url(#plain)" in="pabinkley" out="RobertCBinkley" 
230 |     //            class="link suit out_RobertCBinkley in_pabinkley hlOut"/>
231 |          
232 |            root = d3.select("#circle_" + name);
233 |            // show selected term in sidebar
234 |            d3.select("#sidebar_term").html(twitterlink(name));
235 | 
236 |            
237 |            pathsfg = d3.select("#paths-fg");
238 |            circlesfg = d3.select("#circles-fg");
239 |            textsfg = d3.select("#texts-fg");
240 |            paths = d3.select("#paths");
241 |            circles = d3.select("#circles");
242 |            texts = d3.select("#texts");
243 |            
244 |            // clear previous selections
245 |            d3.selectAll(".hl").classed("hl", false);
246 |            d3.selectAll(".hl"+inLinkClass).classed("hl"+inLinkClass, false);
247 |            d3.selectAll(".hl"+outLinkClass).classed("hl"+outLinkClass, false);
248 |            // put foregrounded paths etc. (if any) back in background
249 |            var p = pathsfg.selectAll("path");
250 |            // add arrowheads if directed graph
251 |            if (graph == 'directed') 
252 |                p.attr("marker-end", "url(#plain)");
253 |            if (p.size() > 0)
254 |                d3.select("#paths").each(function() {
255 |                    f = this;
256 |                    p.each(function() { 
257 |                        f.appendChild(this) 
258 |                    });
259 |             })
260 |            var p = circlesfg.selectAll("circle");
261 |            if (p.size() > 0)
262 |                d3.select("#circles").each(function() {
263 |                    f = this;
264 |                    p.each(function() { 
265 |                        f.appendChild(this) 
266 |                    });
267 |             })
268 |            var p = textsfg.selectAll("g");
269 |            if (p.size() > 0)
270 |                d3.select("#texts").each(function() {
271 |                    f = this;
272 |                    p.each(function() { 
273 |                        f.appendChild(this) 
274 |                    });
275 |             })
276 |            
277 |         // highlight current node, paths, branch nodes
278 |            root.classed("hl", true);
279 |            d3.select("#text_" + name).classed("hl", true);
280 |            // highlight paths
281 |            var inlist = [];
282 |            var outlist = [];
283 |            var alllist = {}; // may have to handle duplicates, from in and out lists
284 |            p = d3.selectAll("path.out_" + name).classed("hl"+outLinkClass, true);
285 |            if (graph == 'directed')
286 |                p.attr("marker-end", "url(#out)")
287 |             p.each(function(d){
288 |                if (d.target.name != name) {
289 |                    if (graph == 'directed')
290 |                        outlist.push(d.target.name);
291 |                    else
292 |                        alllist[d.target.name] = true;
293 |                    d3.select("#circle_" + d.target.name).classed("hl"+inLinkClass, true); 
294 |                    d3.select("#text_" + d.target.name).classed("hl"+inLinkClass, true);
295 |                };
296 |            });
297 |            p = d3.selectAll("path.in_" + name).classed("hl"+inLinkClass, true);
298 |            if (graph == 'directed')
299 |                p.attr("marker-end", "url(#in)");
300 |            p.each(function(d){
301 |                if (d.source.name != name) {
302 |                    if (graph == 'directed')
303 |                        inlist.push(d.source.name);
304 |                    else
305 |                        alllist[d.source.name] = true;
306 |                    d3.select("#circle_" + d.source.name).classed("hl"+outLinkClass, true); 
307 |                    d3.select("#text_" + d.source.name).classed("hl"+outLinkClass, true);
308 |                    if (graph == 'directed')
309 |                        this["marker-end"] = "url(#out)";
310 |                };
311 |            });
312 |         var inlabel = {"retweets": "Retweeted by", "replies": "Replies to this", "mentions": "Mentioned"};
313 |         var outlabel = {"retweets": "Retweeted", "replies": "Replies from this", "mentions": "Mentioned by"};
314 |         if (graph == 'directed') {
315 |             d3.select("#sidebar_inlist").html("<h3>" + inlabel[mode] + "</h3>" + listhtml(inlist));
316 |             d3.select("#sidebar_outlist").html("<h3>" + outlabel[mode] + "</h3>" + listhtml(outlist));
317 |         }
318 |         else
319 |             d3.select("#sidebar_inlist").html("<h3>Co-Tags</h3>" + listhtml(Object.keys(alllist).sort()));
320 | 
321 |         // move path to a different group: based on https://groups.google.com/forum/#!topic/d3-js/KEiqiUPZHUs
322 |            var p = paths.selectAll("path.out_" + name +", path.in_" + name);
323 |            if (p.size() > 0)
324 |                d3.select("#paths-fg").each(function() {
325 |                    f = this;
326 |                    p.each(function() { 
327 |                        f.appendChild(this) 
328 |                    });
329 |             })
330 |            var p = circles.selectAll(".hl, .hlIn, .hlOut, .hlUndirected");
331 |            if (p.size() > 0)
332 |                d3.select("#circles-fg").each(function() {
333 |                    f = this;
334 |                    p.each(function() { 
335 |                        f.appendChild(this) 
336 |                    });
337 |             })
338 |            var p = texts.selectAll(".hl, .hlIn, .hlOut, .hlUndirected");
339 |            if (p.size() > 0)
340 |                d3.select("#texts-fg").each(function() {
341 |                    f = this;
342 |                    p.each(function() { 
343 |                        f.appendChild(this) 
344 |                    });
345 |             })
346 | 
347 | 
348 |     }
349 |     
350 | 
351 | // set box size to window size
352 | var win = window,
353 |     d = document,
354 |     e = d.documentElement,
355 |     g = d.getElementsByTagName('body')[0]
356 | 
357 | // prepare list of terms for dropdown
358 | var terms = [];
359 | for (var a = 0; a < nodes.length; a++) {
360 |     terms.push(nodes[a]["name"]);
361 | }
362 | 
363 | d3.select("#sidebar_selectdiv").html(selecthtml(terms));
364 |     
365 | // get viewport dimensions
366 | var portw = win.innerWidth || e.clientWidth || g.clientWidth,
367 |     porth = win.innerHeight|| e.clientHeight|| g.clientHeight;
368 | 
369 | // make the svg really really big
370 | w = 2000;
371 | h = 2000;
372 | 
373 | // but we'll want to scroll so that it is centred
374 | scrollx = (w >= portw) ? Math.round((w - portw)/2) : 0;
375 | scrolly = (h >= porth) ? Math.round((h - porth)/2) : 0;
376 | 
377 | var force = d3.layout.force()
378 |     .nodes(d3.values(nodes))
379 |     .links(links)
380 |     .size([w, h])
381 |     .linkDistance(60)
382 |     .charge(-300)
383 |     .on("tick", tick)
384 |     .start();
385 | 
386 | var svg = d3.select("#chart")
387 |     .attr("width", w)
388 |     .attr("height", h);
389 | 
390 | // Per-type markers, as they don't inherit styles.
391 | svg.append("svg:defs").selectAll("marker")
392 |     .data(["plain", "in", "out"])
393 |   .enter().append("svg:marker")
394 |     .attr("id", String)
395 |     .attr("viewBox", "0 -5 10 10")
396 |     .attr("refX", 15)
397 |     .attr("refY", -1.5)
398 |     .attr("markerWidth", 6)
399 |     .attr("markerHeight", 6)
400 |     .attr("orient", "auto")
401 |   .append("svg:path")
402 |     .attr("d", "M0,-5L10,0L0,5");
403 | 
404 | var path = svg.append("svg:g")
405 |     .attr("id", "paths")
406 |   .selectAll("path")
407 |     .data(force.links())
408 |   .enter().append("svg:path")
409 |     .attr("class", function(d) { return "link " + d.type + " out_" + d.source.name + " in_" + d.target.name; })
410 |     .attr("out", function(d) { return d.source.name; })
411 |     .attr("in", function(d) { return d.target.name; });
412 | // add arrowheads if directed graph
413 | if (graph == 'directed')
414 |     path.attr("marker-end", "url(#plain)");
415 | 
416 | var circle = svg.append("svg:g")
417 |     .attr("id", "circles")
418 |     .selectAll("circle")
419 |     .data(force.nodes())
420 |   .enter().append("svg:circle")
421 |     .attr("r", 6)
422 |     .attr("id", function(d) {return "circle_" + d.name;})
423 |    .on("click", function(d) {
424 |         termselect(d.name);
425 |            })
426 |    .call(force.drag);
427 | 
428 | // tooltips
429 |  circle.append("title")
430 |       .text(function(d) { return d.title; });
431 | 
432 | var text = svg.append("svg:g")
433 |     .attr("id", "texts")
434 |     .selectAll("g")
435 |     .data(force.nodes())
436 |   .enter().append("svg:g")
437 |   .attr("id", function(d) {return "text_" + d.name;});
438 | 
439 | // A copy of the text with a thick white stroke for legibility.
440 | text.append("svg:text")
441 |     .attr("x", 8)
442 |     .attr("y", ".31em")
443 |     .attr("class", "shad")
444 |     
445 |     .text(function(d) { return d.name; });
446 | 
447 | text.append("svg:text")
448 |     .attr("x", 8)
449 |     .attr("y", ".31em")
450 |     .text(function(d) { return d.name; });
451 | 
452 | // foreground groups: we move highlighted objects here, to bring them in front of the rest
453 | var pathsfg = svg.append("svg:g")
454 |     .attr("id", "paths-fg");
455 | var circlesfg = svg.append("svg:g")
456 |     .attr("id", "circles-fg");
457 | var textsfg = svg.append("svg:g")
458 |     .attr("id", "texts-fg");
459 | 
460 | // Use elliptical arc path segments to doubly-encode directionality.
461 | function tick() {
462 |   path.attr("d", function(d) {
463 |     var dx = d.target.x - d.source.x,
464 |         dy = d.target.y - d.source.y,
465 |         dr = Math.sqrt(dx * dx + dy * dy);
466 |     return "M" + d.source.x + "," + d.source.y + "A" + dr + "," + dr + " 0 0,1 " + d.target.x + "," + d.target.y;
467 |   });
468 | 
469 |   circle.attr("transform", function(d) {
470 |     return "translate(" + d.x + "," + d.y + ")";
471 |   });
472 | 
473 |   text.attr("transform", function(d) {
474 |     return "translate(" + d.x + "," + d.y + ")";
475 |   });
476 | }
477 | //    });
478 | 
479 | window.scrollTo(scrollx, scrolly);
480 | </script>
481 | </body>
482 | 


--------------------------------------------------------------------------------