${sparkline.sparkify([u["value"] for u in data["topimageurls"]])}
87 | (Note: Firefox may not display these images, because of content blocking. You can turn off Enhanced Tracking Protection for this domain to allow the thumbnails to load. The links to the images should still work even when the thumbnails don't.)
88 | % for imageurl in data["topimageurls"]:
89 |
${imageurl["value"]}
90 | % endfor
91 |
92 |
93 |
--------------------------------------------------------------------------------
/d3wordcloud.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import optparse
4 | import re
5 | import dateutil.parser
6 | from profiler import TimeProfiler
7 | import pytz
8 | import d3output
9 |
10 | opt_parser = optparse.OptionParser()
11 | opt_parser.add_option("-t", "--timezone", type=str, default="",
12 | help="output timezone (e.g. 'America/New_York' or 'local'; default: UTC)")
13 | opt_parser.add_option("-w", "--maxwords", dest="maxwords", type="int",
14 | help="maximum number of words to display (default: 25)", default=25)
15 | opt_parser.add_option("-i", "--interval", dest="intervalStr", type="str",
16 | help="interval for grouping timestamps, in seconds, minutes or hours, e.g. 15M (default: 1H)",
17 | default="1H")
18 | opt_parser.add_option("-s", "--start", type=str, default=None,
19 | help="start date/time")
20 | opt_parser.add_option("-e", "--end", type=str, default=None,
21 | help="end date/time")
22 | opt_parser.add_option("-o", "--output", dest="output", type="str",
23 | help="html | csv | json (default: html)", default="html")
24 | opt_parser.add_option("-p", "--template", dest="template", type="str",
25 | help="name of template in utils/template (default: wordcloud.html)", default="wordcloud.html")
26 |
27 | opts, args = opt_parser.parse_args()
28 |
29 | tzname = opts.timezone
30 | # determine output time zone
31 | if tzname == "":
32 | tz = pytz.UTC
33 | elif tzname == "local":
34 | tz = get_localzone() # system timezone, from tzlocal
35 | else:
36 | tz = pytz.timezone(tzname)
37 |
38 | maxwords = opts.maxwords
39 | intervalStr = opts.intervalStr
40 | output = opts.output
41 |
42 | start = opts.start
43 | end = opts.end
44 | if opts.start:
45 | start = tz.localize(dateutil.parser.parse(start + "0001-01-01 00:00:00"[len(start):]))
46 | if opts.end:
47 | end = tz.localize(dateutil.parser.parse(end + "9999-12-31 23:11:59"[len(end):]))
48 |
49 | # from https://gist.github.com/uogbuji/705383
50 | GRUBER_URLINTEXT_PAT = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
51 |
52 | class WordcloudTimeProfiler(TimeProfiler):
53 | def __init__(self, opts):
54 | TimeProfiler.__init__(self, opts)
55 | self.timeslices = {}
56 | self.stop_words = set(line.strip().lower() for line in open("stopwords/stop-words_english_6_en.txt"))
57 |
58 | def process(self, tweet):
59 | created_at = dateutil.parser.parse(tweet["created_at"])
60 | if ((self.start is None) or (created_at >= self.start)) and ((self.end is None)
61 | or (created_at <= self.end)):
62 | timeslice = TimeProfiler.process(self, tweet)
63 | if not timeslice in self.timeslices:
64 | self.timeslices[timeslice] = {}
65 | word_counts = self.timeslices[timeslice]
66 | text = tweet["text"]
67 | # remove hashtags and user names
68 | text = re.sub("(^|[^\w])[@#]\w*", "\g<1>", text)
69 | # remove urls
70 | text = re.sub(GRUBER_URLINTEXT_PAT, " ", text)
71 | # trim punctuation next to space
72 | text = re.sub(ur"[^\w\s]+(\s|$)|(^|\s)[^\w\s]+", " ", text, re.UNICODE)
73 | # replace internal punctuation, except apostrophes
74 | text = re.sub(ur"[^\w\s\']", " ", text, re.UNICODE)
75 | for word in text.split():
76 | word = word.lower()
77 | if len(word) < 3: continue
78 | if len(word) > 15: continue
79 | if word in self.stop_words: continue
80 | if word.startswith("rt"): continue
81 | if not re.match("^[a-z]", word, re.IGNORECASE): continue
82 | # remove final 's
83 | word = re.sub("\'s$", "", word)
84 | if len(word) > 0:
85 | word_counts[word] = word_counts.get(word, 0) + 1
86 |
87 | def report(self):
88 | data = TimeProfiler.report(self)
89 | data["profile"]["start"] = str(self.start)
90 | data["profile"]["end"] = str(self.end)
91 | for value in data["values"]:
92 | thisslice = self.timeslices[value["name"]]
93 | # sort words by value
94 | sorted_words = thisslice.keys()
95 | sorted_words.sort(lambda a, b: cmp(thisslice[b], thisslice[a]))
96 | top_words = sorted_words[0:maxwords]
97 | words = []
98 | for word in top_words:
99 | words.append({
100 | "text": word,
101 | "count": thisslice[word]
102 | })
103 | value["words"] = words
104 | return data
105 |
# run the profiler over the harvested tweets and emit the report
profiler = WordcloudTimeProfiler({
    "tz": tz,
    "output": "json",
    "aggregate": True,
    "intervalStr": intervalStr,
    "start": start,
    "end": end})

profiler.gettweets(opts, args)

data = profiler.report()

if opts.output == "html":
    # fix: d3output.embed returns the rendered html (see the html branch of
    # d3cotags.py, which prints it); previously the return value was
    # discarded, so html mode produced no output
    print(d3output.embed(opts.template, data))
else:
    print(data)
122 |
--------------------------------------------------------------------------------
/d3cotags.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import itertools
5 | import optparse
6 | import d3output # local module
7 | from profiler import Profiler # local module
8 | from profiler import LinkNodesProfiler # local module
9 | from collections import Counter
10 |
class CotagsProfiler(LinkNodesProfiler):
    """Profiler that builds an undirected graph of co-occurring hashtags,
    optionally collapsing rare tags into a pseudo-tag "-OTHER"."""

    def __init__(self, opts):
        LinkNodesProfiler.__init__(self, opts)
        # one entry per tweet: the list of lowercased hashtags it contained
        self.savetweets = []
        # total occurrence count per hashtag
        self.counts = Counter()
        # hashtags that survive the threshold filter
        self.keepers = set()

    def process(self, tweet):
        """Record one tweet's hashtags (lowercased) and update tag counts."""
        Profiler.process(self, tweet)
        # gather a list of the tags in this tweet, lowercased
        savetweet = []
        for tag in tweet["entities"]["hashtags"]:
            t = tag["text"].lower()
            savetweet.append(t)
            # and increment count for this tag
            self.counts[t] += 1
        # add tag list to savetweets
        self.savetweets.append(savetweet)

    def report(self):
        """Build the co-tag links from the saved per-tweet tag lists and
        return the base profiler report."""
        # for tags below the threshold, replace with "-OTHER"
        # which is not necessary if threshold is 0
        if self.threshold > 0:
            # fix: snapshot the keys with list() — we delete entries while
            # iterating, which is only accidentally safe on Python 2
            for countkey in list(self.counts.keys()):
                if self.counts[countkey] < self.threshold:
                    # for a tag whose count is below the threshold, transfer
                    # its count to tag "-OTHER" and delete it
                    if self.keepother:
                        self.counts["-OTHER"] += self.counts[countkey]
                    del self.counts[countkey]
                else:
                    # otherwise add it to list of keepers
                    self.keepers.add(countkey)
            if self.keepother:
                self.keepers.add("-OTHER")
        # keepers now has a complete set of surviving tags

        # now process hashtags in tweets again, replacing any tag not in
        # keepers with -OTHER
        self.counts = Counter()
        for savetweet in self.savetweets:

            # cleantags gathers unique, lower-cased tags for this tweet
            cleantags = set()
            for tag in savetweet:
                if self.threshold == 0 or tag in self.keepers:
                    cleantags.add(tag)
                elif self.keepother:
                    cleantags.add("-OTHER")

            # sort tags and remove tags that are in the exclude set
            cleantags = sorted(cleantags.difference(self.exclude))

            # generate all pairs
            for a, b in itertools.combinations(cleantags, 2):
                self.addlink(a, b)
                if self.reciprocal:
                    self.addlink(b, a)

            # if this tag is the only one we're including from this tweet,
            # then there won't be any combinations, and so it won't have
            # been added to self.nodes by addlink: so add it.

            # add to tweet count for this tag
            for tag in cleantags:
                if tag in self.nodes:
                    self.nodes[tag]["tweetcount"] += 1
                else:
                    self.addsingle(tag)

        data = LinkNodesProfiler.report(self)
        return data
85 |
86 |
opt_parser = optparse.OptionParser()
# fix: csv is a supported output mode (handled below) but was missing
# from the help text
opt_parser.add_option("-o", "--output", dest="output", type="str",
                      help="html | csv | json (default: html)", default="html")
opt_parser.add_option("-e", "--exclude", type=str, default="",
                      help="comma-separated list of hashtags to exclude")
opt_parser.add_option("-t", "--threshold", type=int, default=0,
                      help="threshold below which to treat hashtags as 'other'")
opt_parser.add_option("-r", "--reciprocal", action="store_true", default=False,
                      help="add reciprocal links for each pair")
opt_parser.add_option("-p", "--template", dest="template", type="str",
                      help="name of template in utils/template (default: graph.html)", default="graph.html")
opt_parser.add_option("-k", "--keepother", action="store_true", default=False,
                      help="include -OTHER tag in output for tags below threshold")

opts, args = opt_parser.parse_args()

threshold = opts.threshold
# fix: drop the empty string that "".split(",") yields when -e is unset
exclude = set(t for t in opts.exclude.lower().split(",") if t)
reciprocal = opts.reciprocal
keepother = opts.keepother
output = opts.output

profiler = CotagsProfiler({
    "threshold": threshold,
    "exclude": exclude,
    "reciprocal": reciprocal,
    "keepother": keepother,
    "graph": "undirected",
    "field": "hashtag"})

profiler.gettweets(opts, args)

data = profiler.report()

profile = data["profile"]
nodes = data["nodes"]

if output == "csv":
    print(d3output.nodeslinkcsv(nodes))
elif output == "json":
    values = d3output.nodeslinktrees(profile, nodes)
    # fix: emit real JSON (json was imported but unused); previously this
    # printed a Python dict repr, which is not valid JSON
    print(json.dumps({"profile": profile, "values": values}))
elif output == "html":
    print(d3output.embed(opts.template, d3output.nodeslinktrees(profile, nodes)))
131 |
132 |
133 |
--------------------------------------------------------------------------------
/templates/wordcloud.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
12 |
168 |
169 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | CC0 1.0 Universal
2 |
3 | Statement of Purpose
4 |
5 | The laws of most jurisdictions throughout the world automatically confer
6 | exclusive Copyright and Related Rights (defined below) upon the creator and
7 | subsequent owner(s) (each and all, an "owner") of an original work of
8 | authorship and/or a database (each, a "Work").
9 |
10 | Certain owners wish to permanently relinquish those rights to a Work for the
11 | purpose of contributing to a commons of creative, cultural and scientific
12 | works ("Commons") that the public can reliably and without fear of later
13 | claims of infringement build upon, modify, incorporate in other works, reuse
14 | and redistribute as freely as possible in any form whatsoever and for any
15 | purposes, including without limitation commercial purposes. These owners may
16 | contribute to the Commons to promote the ideal of a free culture and the
17 | further production of creative, cultural and scientific works, or to gain
18 | reputation or greater distribution for their Work in part through the use and
19 | efforts of others.
20 |
21 | For these and/or other purposes and motivations, and without any expectation
22 | of additional consideration or compensation, the person associating CC0 with a
23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
25 | and publicly distribute the Work under its terms, with knowledge of his or her
26 | Copyright and Related Rights in the Work and the meaning and intended legal
27 | effect of CC0 on those rights.
28 |
29 | 1. Copyright and Related Rights. A Work made available under CC0 may be
30 | protected by copyright and related or neighboring rights ("Copyright and
31 | Related Rights"). Copyright and Related Rights include, but are not limited
32 | to, the following:
33 |
34 | i. the right to reproduce, adapt, distribute, perform, display, communicate,
35 | and translate a Work;
36 |
37 | ii. moral rights retained by the original author(s) and/or performer(s);
38 |
39 | iii. publicity and privacy rights pertaining to a person's image or likeness
40 | depicted in a Work;
41 |
42 | iv. rights protecting against unfair competition in regards to a Work,
43 | subject to the limitations in paragraph 4(a), below;
44 |
45 | v. rights protecting the extraction, dissemination, use and reuse of data in
46 | a Work;
47 |
48 | vi. database rights (such as those arising under Directive 96/9/EC of the
49 | European Parliament and of the Council of 11 March 1996 on the legal
50 | protection of databases, and under any national implementation thereof,
51 | including any amended or successor version of such directive); and
52 |
53 | vii. other similar, equivalent or corresponding rights throughout the world
54 | based on applicable law or treaty, and any national implementations thereof.
55 |
56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of,
57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
59 | and Related Rights and associated claims and causes of action, whether now
60 | known or unknown (including existing as well as future claims and causes of
61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum
62 | duration provided by applicable law or treaty (including future time
63 | extensions), (iii) in any current or future medium and for any number of
64 | copies, and (iv) for any purpose whatsoever, including without limitation
65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
66 | the Waiver for the benefit of each member of the public at large and to the
67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver
68 | shall not be subject to revocation, rescission, cancellation, termination, or
69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work
70 | by the public as contemplated by Affirmer's express Statement of Purpose.
71 |
72 | 3. Public License Fallback. Should any part of the Waiver for any reason be
73 | judged legally invalid or ineffective under applicable law, then the Waiver
74 | shall be preserved to the maximum extent permitted taking into account
75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
76 | is so judged Affirmer hereby grants to each affected person a royalty-free,
77 | non transferable, non sublicensable, non exclusive, irrevocable and
78 | unconditional license to exercise Affirmer's Copyright and Related Rights in
79 | the Work (i) in all territories worldwide, (ii) for the maximum duration
80 | provided by applicable law or treaty (including future time extensions), (iii)
81 | in any current or future medium and for any number of copies, and (iv) for any
82 | purpose whatsoever, including without limitation commercial, advertising or
83 | promotional purposes (the "License"). The License shall be deemed effective as
84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the
85 | License for any reason be judged legally invalid or ineffective under
86 | applicable law, such partial invalidity or ineffectiveness shall not
87 | invalidate the remainder of the License, and in such case Affirmer hereby
88 | affirms that he or she will not (i) exercise any of his or her remaining
89 | Copyright and Related Rights in the Work or (ii) assert any associated claims
90 | and causes of action with respect to the Work, in either case contrary to
91 | Affirmer's express Statement of Purpose.
92 |
93 | 4. Limitations and Disclaimers.
94 |
95 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
96 | surrendered, licensed or otherwise affected by this document.
97 |
98 | b. Affirmer offers the Work as-is and makes no representations or warranties
99 | of any kind concerning the Work, express, implied, statutory or otherwise,
100 | including without limitation warranties of title, merchantability, fitness
101 | for a particular purpose, non infringement, or the absence of latent or
102 | other defects, accuracy, or the present or absence of errors, whether or not
103 | discoverable, all to the greatest extent permissible under applicable law.
104 |
105 | c. Affirmer disclaims responsibility for clearing rights of other persons
106 | that may apply to the Work or any use thereof, including without limitation
107 | any person's Copyright and Related Rights in the Work. Further, Affirmer
108 | disclaims responsibility for obtaining any necessary consents, permissions
109 | or other rights required for any use of the Work.
110 |
111 | d. Affirmer understands and acknowledges that Creative Commons is not a
112 | party to this document and has no duty or obligation with respect to this
113 | CC0 or use of the Work.
114 |
115 | For more information, please see
116 |
117 |
--------------------------------------------------------------------------------
/stopwords/stop-words_english_6_en.txt:
--------------------------------------------------------------------------------
1 | a
2 | able
3 | about
4 | above
5 | abst
6 | accordance
7 | according
8 | accordingly
9 | across
10 | act
11 | actually
12 | added
13 | adj
14 | adopted
15 | affected
16 | affecting
17 | affects
18 | after
19 | afterwards
20 | again
21 | against
22 | ah
23 | all
24 | almost
25 | alone
26 | along
27 | already
28 | also
29 | although
30 | always
31 | am
32 | among
33 | amongst
34 | amp
35 | an
36 | and
37 | announce
38 | another
39 | any
40 | anybody
41 | anyhow
42 | anymore
43 | anyone
44 | anything
45 | anyway
46 | anyways
47 | anywhere
48 | apparently
49 | approximately
50 | are
51 | aren
52 | arent
53 | arise
54 | around
55 | as
56 | aside
57 | ask
58 | asking
59 | at
60 | auth
61 | available
62 | away
63 | awfully
64 | b
65 | back
66 | be
67 | became
68 | because
69 | become
70 | becomes
71 | becoming
72 | been
73 | before
74 | beforehand
75 | begin
76 | beginning
77 | beginnings
78 | begins
79 | behind
80 | being
81 | believe
82 | below
83 | beside
84 | besides
85 | between
86 | beyond
87 | biol
88 | both
89 | brief
90 | briefly
91 | but
92 | by
93 | c
94 | ca
95 | came
96 | can
97 | cannot
98 | can't
99 | cause
100 | causes
101 | certain
102 | certainly
103 | co
104 | com
105 | come
106 | comes
107 | contain
108 | containing
109 | contains
110 | could
111 | couldn't
112 | couldn
113 | could've
114 | d
115 | date
116 | did
117 | didn't
118 | didn
119 | different
120 | do
121 | does
122 | doesn't
123 | doesn
124 | doing
125 | done
126 | don't
127 | don
128 | down
129 | downwards
130 | due
131 | during
132 | e
133 | each
134 | ed
135 | edu
136 | effect
137 | eg
138 | eight
139 | eighty
140 | either
141 | else
142 | elsewhere
143 | end
144 | ending
145 | enough
146 | especially
147 | et
148 | et-al
149 | etc
150 | even
151 | ever
152 | every
153 | everybody
154 | everyone
155 | everything
156 | everywhere
157 | ex
158 | except
159 | f
160 | far
161 | few
162 | ff
163 | fifth
164 | first
165 | five
166 | fix
167 | followed
168 | following
169 | follows
170 | for
171 | former
172 | formerly
173 | forth
174 | found
175 | four
176 | from
177 | further
178 | furthermore
179 | g
180 | gave
181 | get
182 | gets
183 | getting
184 | give
185 | given
186 | gives
187 | giving
188 | go
189 | goes
190 | gone
191 | got
192 | gotten
193 | h
194 | had
195 | happens
196 | hardly
197 | has
198 | hasn't
199 | hasn
200 | have
201 | haven't
202 | haven
203 | having
204 | he
205 | he'll
206 | he'd
207 | hence
208 | her
209 | here
210 | hereafter
211 | hereby
212 | herein
213 | here's
214 | hereupon
215 | hers
216 | herself
217 | he's
218 | hi
219 | hid
220 | him
221 | himself
222 | his
223 | hither
224 | home
225 | how
226 | howbeit
227 | however
228 | hundred
229 | i
230 | i'd
231 | ie
232 | if
233 | i'll
234 | i'm
235 | immediate
236 | immediately
237 | importance
238 | important
239 | in
240 | inc
241 | indeed
242 | index
243 | information
244 | instead
245 | into
246 | invention
247 | inward
248 | is
249 | isn't
250 | isn
251 | it
252 | it'd
253 | it'll
254 | its
255 | it's
256 | itself
257 | i've
258 | j
259 | just
260 | k
261 | keep
262 | keeps
263 | kept
264 | keys
265 | kg
266 | km
267 | know
268 | known
269 | knows
270 | l
271 | largely
272 | last
273 | lately
274 | later
275 | latter
276 | latterly
277 | least
278 | less
279 | lest
280 | let
281 | lets
282 | let's
283 | like
284 | liked
285 | likely
286 | line
287 | little
288 | 'll
289 | look
290 | looking
291 | looks
292 | ltd
293 | m
294 | made
295 | mainly
296 | make
297 | makes
298 | many
299 | may
300 | maybe
301 | me
302 | mean
303 | means
304 | meantime
305 | meanwhile
306 | merely
307 | mg
308 | might
309 | million
310 | miss
311 | ml
312 | more
313 | moreover
314 | most
315 | mostly
316 | mr
317 | mrs
318 | much
319 | mug
320 | must
321 | my
322 | myself
323 | n
324 | na
325 | name
326 | namely
327 | nay
328 | nd
329 | near
330 | nearly
331 | necessarily
332 | necessary
333 | need
334 | needs
335 | neither
336 | never
337 | nevertheless
338 | new
339 | next
340 | nine
341 | ninety
342 | no
343 | nobody
344 | non
345 | none
346 | nonetheless
347 | noone
348 | nor
349 | normally
350 | nos
351 | not
352 | noted
353 | nothing
354 | now
355 | nowhere
356 | o
357 | obtain
358 | obtained
359 | obviously
360 | of
361 | off
362 | often
363 | oh
364 | ok
365 | okay
366 | old
367 | omitted
368 | on
369 | once
370 | one
371 | ones
372 | only
373 | onto
374 | or
375 | ord
376 | other
377 | others
378 | otherwise
379 | ought
380 | our
381 | ours
382 | ourselves
383 | out
384 | outside
385 | over
386 | overall
387 | owing
388 | own
389 | p
390 | page
391 | pages
392 | part
393 | particular
394 | particularly
395 | past
396 | per
397 | perhaps
398 | placed
399 | please
400 | plus
401 | poorly
402 | possible
403 | possibly
404 | potentially
405 | pp
406 | predominantly
407 | present
408 | previously
409 | primarily
410 | probably
411 | promptly
412 | proud
413 | provides
414 | put
415 | q
416 | que
417 | quickly
418 | quite
419 | qv
420 | r
421 | ran
422 | rather
423 | rd
424 | re
425 | readily
426 | really
427 | recent
428 | recently
429 | ref
430 | refs
431 | regarding
432 | regardless
433 | regards
434 | related
435 | relatively
436 | research
437 | respectively
438 | resulted
439 | resulting
440 | results
441 | right
442 | run
443 | s
444 | said
445 | same
446 | saw
447 | say
448 | saying
449 | says
450 | sec
451 | section
452 | see
453 | seeing
454 | seem
455 | seemed
456 | seeming
457 | seems
458 | seen
459 | self
460 | selves
461 | sent
462 | seven
463 | several
464 | shall
465 | she
466 | she'd
467 | she'll
468 | she's
469 | should
470 | shouldn't
471 | shouldn
472 | show
473 | showed
474 | shown
475 | shows
476 | significant
477 | significantly
478 | similar
479 | similarly
480 | since
481 | six
482 | slightly
483 | so
484 | some
485 | somebody
486 | somehow
487 | someone
488 | something
489 | sometime
490 | sometimes
491 | somewhat
492 | somewhere
493 | soon
494 | sorry
495 | specifically
496 | specified
497 | specify
498 | specifying
499 | state
500 | states
501 | still
502 | stop
503 | strongly
504 | sub
505 | substantially
506 | successfully
507 | such
508 | sufficiently
509 | suggest
510 | sup
511 | sure
512 | t
513 | take
514 | taken
515 | taking
516 | tell
517 | tends
518 | th
519 | than
520 | thank
521 | thanks
522 | thanx
523 | that
524 | that'll
525 | that's
526 | that've
527 | the
528 | their
529 | theirs
530 | them
531 | themselves
532 | then
533 | thence
534 | there
535 | thereafter
536 | thereby
537 | there'd
538 | therefore
539 | therein
540 | there'll
541 | thereof
542 | there're
543 | there's
544 | thereto
545 | thereupon
546 | there've
547 | these
548 | they
549 | they'd
550 | they'll
551 | they're
552 | they've
553 | think
554 | this
555 | those
556 | thou
557 | though
558 | thousand
559 | throug
560 | through
561 | throughout
562 | thru
563 | thus
564 | til
565 | tip
566 | to
567 | together
568 | too
569 | took
570 | toward
571 | towards
572 | tried
573 | tries
574 | truly
575 | try
576 | trying
577 | ts
578 | twice
579 | two
580 | u
581 | un
582 | under
583 | unfortunately
584 | unless
585 | unlike
586 | unlikely
587 | until
588 | unto
589 | up
590 | upon
591 | ups
592 | us
593 | use
594 | used
595 | useful
596 | usefully
597 | usefulness
598 | uses
599 | using
600 | usually
601 | v
602 | value
603 | various
604 | 've
605 | very
606 | via
607 | viz
608 | vol
609 | vols
610 | vs
611 | w
612 | want
613 | wants
614 | was
615 | wasn't
616 | wasn
617 | way
618 | we
619 | wed
620 | welcome
621 | we'll
622 | went
623 | were
624 | we're
625 | weren't
626 | weren
627 | we've
628 | what
629 | whatever
630 | what'll
631 | whats
632 | when
633 | whence
634 | whenever
635 | where
636 | whereafter
637 | whereas
638 | whereby
639 | wherein
640 | where's
641 | whereupon
642 | wherever
643 | whether
644 | which
645 | while
646 | whim
647 | whither
648 | who
649 | who'd
650 | whoever
651 | whole
652 | who'll
653 | whom
654 | whomever
655 | whos
656 | whose
657 | why
658 | widely
659 | will
660 | willing
661 | wish
662 | with
663 | within
664 | without
665 | won't
666 | words
667 | world
668 | would
669 | wouldn
670 | wouldn't
671 | would've
672 | www
673 | x
674 | y
675 | yes
676 | yet
677 | you
678 | you'd
679 | you'll
680 | your
681 | you're
682 | yours
683 | yourself
684 | yourselves
685 | you've
686 | z
687 | zero
688 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # twarc-report
2 | Data conversions and examples for generating reports from [twarc](https://github.com/DocNow/twarc) collections using tools such as D3.js
3 |
4 | - [Requirements](#user-content-requirements)
5 | - [Getting Started](#user-content-getting-started)
6 | - [Recommended Directory Structure](#user-content-recommended-directory-structure)
7 | - [Harvest](#user-content-harvest)
8 | - [Profile](#user-content-profile)
9 | - [D3 Visualizations](#user-content-d3-visualizations)
10 | - [Exploring D3 Examples](#user-content-exploring-d3-examples)
11 | - [Adding Scripts](#user-content-adding-scripts)
12 | - [License](#user-content-license)
13 |
14 | These utilities accept a Twitter json file (as fetched by twarc),
15 | analyze it various ways, and output a json or csv file. The initial
16 | purpose is to feed data into D3.js for various visualizations, but the
17 | intention is to make the outputs generic enough to serve other uses as
18 | well. Each utility has a D3 example template, which it can use to
19 | generate a self-contained html file. It can also generate csv or json
20 | output, and there is a [worked example](#user-content-exploring-d3-examples) of how
21 | to use csv in a pre-existing D3 chart.
22 |
23 | The d3graph.py utility was originally added to the twarc repo as
24 | directed.py but is moving here for consistency.
25 |
26 | ## Requirements
27 |
28 | All requirements may be installed with `pip install -r requirements.txt`
29 |
30 | * dateutil - `python-dateutil`
31 | * pytz - `pip install pytz`
32 | * tzlocal - `pip install tzlocal`
33 | * pysparklines - `pip install pysparklines`
34 | * requests_oauthlib - `pip install requests_oauthlib`
35 |
36 | Install `twarc` according to its instructions, i.e. with `pip install twarc`.
37 | Run `twarc.py` once so
38 | that it can ask for your access token etc. (see twarc's readme). Make sure that `twarc-archive.py`
39 | is on the system path.
40 |
41 | ## Getting Started
42 |
43 | - clone twarc-report to a local directory with your favorite Git client
44 | - install the requirements and populate the twarc submodule, as above
45 | - create a `projects` subdirectory under twarc-report
46 | - create a project directory under `projects`, named appropriately
47 | - in the project directory create `metadata.json` and fill in the search you want to track
48 | - in twarc-report, run `./harvest.py projects/[yourproject]` to harvest your tweets (this may take some time - hours or days for very large searches)
49 | - run `./reportprofile.py projects/[yourproject]` to see a summary of your harvest
50 | - run other scripts to generate various visualizations (see below)
51 | - run `./harvest.py projects/[yourproject]` whenever you want to update your harvest.
52 |
53 | Note that only tweets from the last 7 days or so are available from Twitter at
54 | any given time, so be sure to update your harvest accordingly to avoid gaps.
55 |
56 | ## Recommended Directory Structure
57 |
58 | ```
59 | twarc-report/ # local clone
60 | projects/
61 | assets/ # copy of twarc-report/assets/
62 | projectA/
63 | data/ # created by harvest.py
64 | tweets/ # populated with tweet*.json files by harvest.py
65 | metadata.json
66 | timeline.html # generated by a twarc-report script
67 | ...
68 | projectB/
69 | ...
70 | ```
71 |
72 | Metadata about the project, including the search query, is kept in
73 | `metadata.json`. The `metadata.json` file is created by the user and contains metadata
74 | for the harvest. It should be in this form:
75 |
76 | ```
77 | {"search": "#ferguson",
78 | "title": "Ferguson Tweets",
79 | "creator": "Peter Binkley"}
80 | ```
81 |
82 | (Currently only the `search` value is used but other metadata fields will
83 | be used to populate HTML output in future releases.)
84 |
85 | The harvested tweets
86 | and other source data are stored in the `data` subdirectory, with the
87 | tweets going into the `tweets` directory. These directories are created by
88 | `harvest.py` if they don't exist.
89 |
90 | Generated HTML files use relative paths like `../assets/d3.vs.min.js` to call
91 | shared libraries from the `assets` directory. They can be created in
92 | the project directories (`ProjectA` etc.). This
93 | allows you to publish the output by syncing the project and assets
94 | directories to a web server while excluding the `data` subdirectory. You
95 | can also run python's SimpleHTTPServer in the `projects` directory to
96 | load examples you've created in the project directories:
97 |
98 | ```
99 | python -m SimpleHTTPServer 8000
100 | ```
101 |
102 | And then visit e.g. `http://localhost:8000/ProjectA/projectA-timebar.html`.
103 |
104 | ## Harvest
105 |
106 | The script `harvest.py` will use twarc's `twarc-archive.py` to start or update a harvest using a given
107 | search and stored in a given directory. The directory path is passed as the only parameter:
108 |
109 | ```
110 | ./harvest.py projects/ProjectA
111 | ```
112 |
113 | The search is read from the `metadata.json` file, and tweets are stored
114 | in `data/tweets`.
115 |
116 | ## Profile
117 |
118 | Running `reportprofiler.py` on a tweet collection with the flag `-o text` will generate a summary
119 | profile of the collection, with some basic stats (number of tweets, retweets, users, etc.) and some
120 | possibly interesting sparklines.
121 |
122 | ```
123 | Count: 25100
124 | Users: 5779
125 | User percentiles: █▂▁▁▁▁▁▁▁▁
126 | [62, 12, 6, 5, 3, 2, 2, 2, 2, 2]
127 | ```
128 |
129 | That indicates that the top 10 percent of users accounted for 62% of the tweets, while the bottom
130 | 10% accounted for 2% of the tweets. This will give a quick sense of whether the collection is
131 | dominated by a few voices or has broad participation. The profile also includes the top 10 users
132 | and top 10 shared urls, with similar sparklines.
133 |
134 | Note: the sparklines are generated by [pysparklines](https://pypi.python.org/pypi/pysparklines),
135 | using Unicode block characters. If they have an
136 | uneven baseline, it's the fault of the font. On a Mac, I find that Menlo Regular gives a
137 | good presentation in the terminal.
138 |
139 | ## D3 visualizations
140 |
141 | Some utilities to generate [D3.js](https://d3js.org/) visualizations of aspects of a collection
142 | of tweets are provided. Use "--output=json" or "--output=csv" to output the data for use with
143 | other D3 examples, or "--help" for other options.
144 |
145 | ### d3graph.py
146 |
147 | A directed graph of mentions or retweets, in which nodes are users and
148 | arrows point from the original user to the user who mentions or retweets
149 | them:
150 |
151 | % d3graph.py --mode mentions projects/nasa > projects/nasa/nasa-directed-mentions.html
152 | % d3graph.py --mode retweets projects/nasa > projects/nasa/nasa-directed-retweets.html
153 | % d3graph.py --mode replies projects/nasa > projects/nasa/nasa-directed-replies.html
154 |
155 | ### d3cotag.py
156 |
157 | An undirected graph of co-occurring hashtags:
158 |
159 | % d3cotag.py projects/nasa > projects/nasa/nasa-cotags.html
160 |
161 | A threshold can be specified with "-t": hashtags whose number of
162 | occurrences falls below this will not be linked. Instead, if "-k" is set,
163 | they will be replaced with the pseudo-hashtag "-OTHER". Hashtags can be
164 | excluded with "-e" (takes a comma-delimited list). If the tweets were
165 | harvested by a search for a single hashtag then it's a good idea to
166 | exclude that tag, since every other tag will link to it.
167 |
168 | ### d3timebar.py
169 |
170 | A bar chart timeline with arbitrary intervals, here five minutes:
171 |
172 | % d3times.py -a -t local -i 5M projects/nasa > projects/nasa/nasa-timebargraph.html
173 |
174 | [Examples](https://www.wallandbinkley.com/twarc/bill10/)
175 |
176 | The output timezone is specified by "-t"; the interval is specified by "-i",
177 | using the [standard abbreviations](https://docs.python.org/2/library/time.html#time.strftime): seconds = S, minutes = M, hours = H, days = d, months = m, years = Y.
180 | The example above uses five-minute intervals. Output may be aggregated
181 | using "-a": each row has a time value and a count. Note that if you are
182 | generating the html example, you must use "-a".
183 |
184 | ### d3wordcloud.py
185 |
186 | An animated wordcloud, in which words are added and removed according to
187 | changes in frequency over time.
188 |
189 | % d3wordcloud.py -t local -i 1H projects/nasa > projects/nasa/nasa-wordcloud.html
190 |
191 | [Example](https://www.wallandbinkley.com/twarc/c4l15/animatedwordcloud.html)
192 |
193 | The optional "-t" controls the timezone and "-i" controls the interval, as in `d3timebar.py`. Start and end
194 | timestamps may be set with "-s" and "-e".
195 |
196 | This script calls a [fork](https://github.com/pbinkley/d3-cloud) of Jason Davies'
197 | [d3-cloud](https://github.com/jasondavies/d3-cloud) project. The forked version attempts
198 | to keep the carried-over words in transitions close to their previous position.
199 |
200 | ## Exploring D3 Examples
201 |
202 | The json and csv outputs can be used to view your data in D3 example
203 | visualizations with minimal fuss. There are many many examples to be
204 | explored; Mike Bostock's
205 | [Gallery](https://github.com/mbostock/d3/wiki/Gallery) is a good place
206 | to start. Here's a worked example, using Bostock's [Zoomable Timeline
207 | Area
208 | Chart](https://mbostock.github.io/d3/talk/20111018/area-gradient.html).
209 | It assumes no knowledge of D3.
210 |
211 | First, look at the data input. In line 137 this example loads a csv file
212 |
213 | d3.csv("flights-departed.csv", function(data) {
214 |
215 | The [csv file](https://mbostock.github.io/d3/talk/20111018/flights-departed.csv) looks like this:
216 |
217 | date,value
218 | 1988-01-01,12681
219 | ...
220 |
221 | We can easily generate a csv file that matches that format:
222 |
223 | % ./d3times.py -a -i 1d -o csv
224 |
225 | (I.e. aggregate, one-day interval, output csv). We then just need to edit the
226 | output to make the column headers match the original csv,
227 | i.e. change them to "date,value".
228 |
229 | We also need to check the way the example loads scripts and css assets,
230 | especially the D3 library. In this case it expects a local copy:
231 |
232 |
233 |
234 |
235 |
236 |
237 | Either change those links to point to the original location, or save a
238 | local copy. (Note that if you're going to put your example online you'll
239 | want local copies of scripts, since the [same-origin policy](https://en.wikipedia.org/wiki/Same-origin_policy)
240 | will prevent them from being loaded from the source).
241 |
242 | Once you've matched your data to the example and made sure it can load the
243 | D3.js library, the example may work. In this case it doesn't - it shows
244 | an empty chart. The title "U.S. Commercial Flights, 1999-2001" and the
245 | horizontal scale explain why: it expects dates within a certain
246 | (pre-Twitter) range, and the x domain is hard-coded accordingly. The
247 | setting is easy to find, in line 146:
248 |
249 | x.domain([new Date(1999, 0, 1), new Date(2003, 0, 0)]);
250 |
251 | Change those dates to include the date range of your data, and the
252 | example should work. Don't worry about matching your dates closely: the
253 | chart is zoomable, after all. Alternatively, you could borrow a snippet from the
254 | template timebar.html to set the domain to match the earliest and latest
255 | dates in your data:
256 |
257 | ```
258 | x.domain([
259 | d3.min(values, function(d) {return d.name}),
260 | d3.max(values, function(d) {return d.name})
261 | ]);
262 | ```
263 |
264 | A typical Twarc harvest gets you a few days' worth of tweets, so the
265 | day-level display of this example probably isn't very interesting. We're
266 | not bound by the time format of the example, however. We can see it in
267 | line 63:
268 |
269 | parse = d3.time.format("%Y-%m-%d").parse,
270 |
271 | We can change that to parse at the minute interval: "%Y-%m-%d %H:%M",
272 | and generate our csv at the same interval with "-i 1M". With those
273 | changes we can zoom in until bars represent a minute's worth of tweets.
274 |
275 | This example doesn't work perfectly: I see some odd artifacts around the
276 | bottom of the chart, as if the baseline were slightly above the x axis
277 | and small values are presented as negative. And it doesn't render in
278 | Chrome at all (Firefox and Safari are fine). The example is from 2011
279 | and uses an older version of the D3 library, and with some tinkering it
280 | could probably be updated and made functional. It serves to demonstrate,
281 | though, that only small changes and no knowledge of the complexities of D3
282 | are needed to fit your data into an existing D3 example.
283 |
284 | ## Adding Scripts
285 |
286 | The heart of twarc-report is the `Profiler` class in `profiler.py`. The
287 | scripts pass json records from the twarc harvests to this class, and it
288 | tabulates some basic properties: number of tweets and authors, earliest
289 | and latest timestamp, etc. The scripts create their own profilers that
290 | inherit from this class and that process the extra fields etc. needed by
291 | the script. To add a new script, start by working out its profiler class
292 | to collect the data it needs from each tweet in the process() method,
293 | and to organize the output in the report() method.
294 |
295 | The various output formats are generated by functions in `d3output.py`.
296 |
297 | License
298 | -------
299 |
300 | * CC0
301 |
302 |
--------------------------------------------------------------------------------
/assets/d3.layout.cloud.js:
--------------------------------------------------------------------------------
1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/
2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf
3 |
4 | var previouswords;
5 |
6 | (function() {
7 | function cloud() {
8 | var size = [256, 256],
9 | text = cloudText,
10 | font = cloudFont,
11 | fontSize = cloudFontSize,
12 | fontStyle = cloudFontNormal,
13 | fontWeight = cloudFontNormal,
14 | rotate = cloudRotate,
15 | padding = cloudPadding,
16 | previousword = cloudPreviousword,
17 | spiral = archimedeanSpiral,
18 | words = [],
19 | timeInterval = Infinity,
20 | event = d3.dispatch("word", "end"),
21 | timer = null,
22 | cloud = {};
23 |
24 | cloud.start = function() {
25 | var board = zeroArray((size[0] >> 5) * size[1]),
26 | bounds = null,
27 | n = words.length,
28 | i = -1,
29 | tags = [],
30 | data = words.map(function(d, i) {
31 | d.text = text.call(this, d, i);
32 | d.font = font.call(this, d, i);
33 | d.style = fontStyle.call(this, d, i);
34 | d.weight = fontWeight.call(this, d, i);
35 | d.rotate = rotate.call(this, d, i);
36 | d.size = ~~fontSize.call(this, d, i);
37 | d.padding = padding.call(this, d, i);
38 | d.previousword = previousword.call(this, d, i);
39 | return d;
40 | })
41 | // sort update words to insert first
42 | .sort(function(a, b) {
43 | return
44 | (a.previousword && b.previousword) ? b.previousword.size - a.previousword.size :
45 | (a.previousword && !b.previousword) ? -1 :
46 | (!a.previousword && b.previousword) ? 1 :
47 | b.size - a.size;
48 | });
49 |
50 | if (timer) clearInterval(timer);
51 | timer = setInterval(step, 0);
52 | step();
53 | previouswords = words;
54 | return cloud;
55 |
56 | function step() {
57 | var start = +new Date,
58 | d;
59 | while (+new Date - start < timeInterval && ++i < n && timer) {
60 | d = data[i];
61 | // look for word in previouswords; if it's there, use d.x and d.y
62 | if (d.previousword) {
63 | d.x = (size[0] + d.previousword.x) >> 1;
64 | d.y = (size[1] + d.previousword.y) >> 1;
65 | }
66 | else {
67 | d.x = (size[0] * (Math.random() + .5)) >> 1;
68 | d.y = (size[1] * (Math.random() + .5)) >> 1;
69 | }
70 | cloudSprite(d, data, i);
71 | if (d.hasText && place(board, d, bounds)) {
72 | tags.push(d);
73 | event.word(d);
74 | if (bounds) cloudBounds(bounds, d);
75 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}];
76 | // Temporary hack
77 | d.x -= size[0] >> 1;
78 | d.y -= size[1] >> 1;
79 | }
80 | }
81 | if (i >= n) {
82 | cloud.stop();
83 | event.end(tags, bounds);
84 | }
85 | }
86 | }
87 |
88 | cloud.stop = function() {
89 | if (timer) {
90 | clearInterval(timer);
91 | timer = null;
92 | }
93 | return cloud;
94 | };
95 |
96 | cloud.timeInterval = function(x) {
97 | if (!arguments.length) return timeInterval;
98 | timeInterval = x == null ? Infinity : x;
99 | return cloud;
100 | };
101 |
102 | function place(board, tag, bounds) {
103 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}],
104 | startX = tag.x,
105 | startY = tag.y,
106 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]),
107 | s = spiral(size),
108 | dt = Math.random() < .5 ? 1 : -1,
109 | t = -dt,
110 | dxdy,
111 | dx,
112 | dy;
113 |
114 | while (dxdy = s(t += dt)) {
115 | dx = ~~dxdy[0];
116 | dy = ~~dxdy[1];
117 |
118 | if (Math.min(dx, dy) > maxDelta) break;
119 |
120 | tag.x = startX + dx;
121 | tag.y = startY + dy;
122 |
123 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 ||
124 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue;
125 | // TODO only check for collisions within current bounds.
126 | if (!bounds || !cloudCollide(tag, board, size[0])) {
127 | if (!bounds || collideRects(tag, bounds)) {
128 | var sprite = tag.sprite,
129 | w = tag.width >> 5,
130 | sw = size[0] >> 5,
131 | lx = tag.x - (w << 4),
132 | sx = lx & 0x7f,
133 | msx = 32 - sx,
134 | h = tag.y1 - tag.y0,
135 | x = (tag.y + tag.y0) * sw + (lx >> 5),
136 | last;
137 | for (var j = 0; j < h; j++) {
138 | last = 0;
139 | for (var i = 0; i <= w; i++) {
140 | board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0);
141 | }
142 | x += sw;
143 | }
144 | delete tag.sprite;
145 | return true;
146 | }
147 | }
148 | }
149 | return false;
150 | }
151 |
152 | cloud.words = function(x) {
153 | if (!arguments.length) return words;
154 | words = x;
155 | return cloud;
156 | };
157 |
158 | cloud.size = function(x) {
159 | if (!arguments.length) return size;
160 | size = [+x[0], +x[1]];
161 | return cloud;
162 | };
163 |
164 | cloud.font = function(x) {
165 | if (!arguments.length) return font;
166 | font = d3.functor(x);
167 | return cloud;
168 | };
169 |
170 | cloud.fontStyle = function(x) {
171 | if (!arguments.length) return fontStyle;
172 | fontStyle = d3.functor(x);
173 | return cloud;
174 | };
175 |
176 | cloud.fontWeight = function(x) {
177 | if (!arguments.length) return fontWeight;
178 | fontWeight = d3.functor(x);
179 | return cloud;
180 | };
181 |
182 | cloud.rotate = function(x) {
183 | if (!arguments.length) return rotate;
184 | rotate = d3.functor(x);
185 | return cloud;
186 | };
187 |
188 | cloud.text = function(x) {
189 | if (!arguments.length) return text;
190 | text = d3.functor(x);
191 | return cloud;
192 | };
193 |
194 | cloud.spiral = function(x) {
195 | if (!arguments.length) return spiral;
196 | spiral = spirals[x + ""] || x;
197 | return cloud;
198 | };
199 |
200 | cloud.fontSize = function(x) {
201 | if (!arguments.length) return fontSize;
202 | fontSize = d3.functor(x);
203 | return cloud;
204 | };
205 |
206 | cloud.padding = function(x) {
207 | if (!arguments.length) return padding;
208 | padding = d3.functor(x);
209 | return cloud;
210 | };
211 |
212 | cloud.previousword = function(x) {
213 | if (!arguments.length) return previousword;
214 | previousword = d3.functor(x);
215 | return cloud;
216 | }
217 |
218 | return d3.rebind(cloud, event, "on");
219 | }
220 |
221 | function cloudText(d) {
222 | return d.text;
223 | }
224 |
225 | function cloudFont() {
226 | return "serif";
227 | }
228 |
229 | function cloudFontNormal() {
230 | return "normal";
231 | }
232 |
233 | function cloudFontSize(d) {
234 | return Math.sqrt(d.value);
235 | }
236 |
237 | function cloudRotate() {
238 | return (~~(Math.random() * 6) - 3) * 30;
239 | }
240 |
241 | function cloudPadding() {
242 | return 1;
243 | }
244 |
245 | function cloudPreviousword() {
246 | // look up previousword in previouswords
247 | if (previouswords) {
248 | var thisword = arguments[0]["text"];
249 | var result = previouswords.filter(function(o){return o.text == thisword;} );
250 | }
251 | return result? result[0] : null; // or undefined
252 | }
253 |
254 | // Fetches a monochrome sprite bitmap for the specified text.
255 | // Load in batches for speed.
256 | function cloudSprite(d, data, di) {
257 | if (d.sprite) return;
258 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio);
259 | var x = 0,
260 | y = 0,
261 | maxh = 0,
262 | n = data.length;
263 | --di;
264 | while (++di < n) {
265 | d = data[di];
266 | c.save();
267 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font;
268 | var w = c.measureText(d.text + "m").width * ratio,
269 | h = d.size << 1;
270 | if (d.rotate) {
271 | var sr = Math.sin(d.rotate * cloudRadians),
272 | cr = Math.cos(d.rotate * cloudRadians),
273 | wcr = w * cr,
274 | wsr = w * sr,
275 | hcr = h * cr,
276 | hsr = h * sr;
277 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5;
278 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr));
279 | } else {
280 | w = (w + 0x1f) >> 5 << 5;
281 | }
282 | if (h > maxh) maxh = h;
283 | if (x + w >= (cw << 5)) {
284 | x = 0;
285 | y += maxh;
286 | maxh = 0;
287 | }
288 | if (y + h >= ch) break;
289 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio);
290 | if (d.rotate) c.rotate(d.rotate * cloudRadians);
291 | c.fillText(d.text, 0, 0);
292 | if (d.padding) c.lineWidth = 2 * d.padding, c.strokeText(d.text, 0, 0);
293 | c.restore();
294 | d.width = w;
295 | d.height = h;
296 | d.xoff = x;
297 | d.yoff = y;
298 | d.x1 = w >> 1;
299 | d.y1 = h >> 1;
300 | d.x0 = -d.x1;
301 | d.y0 = -d.y1;
302 | d.hasText = true;
303 | x += w;
304 | }
305 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data,
306 | sprite = [];
307 | while (--di >= 0) {
308 | d = data[di];
309 | if (!d.hasText) continue;
310 | var w = d.width,
311 | w32 = w >> 5,
312 | h = d.y1 - d.y0;
313 | // Zero the buffer
314 | for (var i = 0; i < h * w32; i++) sprite[i] = 0;
315 | x = d.xoff;
316 | if (x == null) return;
317 | y = d.yoff;
318 | var seen = 0,
319 | seenRow = -1;
320 | for (var j = 0; j < h; j++) {
321 | for (var i = 0; i < w; i++) {
322 | var k = w32 * j + (i >> 5),
323 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0;
324 | sprite[k] |= m;
325 | seen |= m;
326 | }
327 | if (seen) seenRow = j;
328 | else {
329 | d.y0++;
330 | h--;
331 | j--;
332 | y++;
333 | }
334 | }
335 | d.y1 = d.y0 + seenRow;
336 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32);
337 | }
338 | }
339 |
340 | // Use mask-based collision detection.
341 | function cloudCollide(tag, board, sw) {
342 | sw >>= 5;
343 | var sprite = tag.sprite,
344 | w = tag.width >> 5,
345 | lx = tag.x - (w << 4),
346 | sx = lx & 0x7f,
347 | msx = 32 - sx,
348 | h = tag.y1 - tag.y0,
349 | x = (tag.y + tag.y0) * sw + (lx >> 5),
350 | last;
351 | for (var j = 0; j < h; j++) {
352 | last = 0;
353 | for (var i = 0; i <= w; i++) {
354 | if (((last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0))
355 | & board[x + i]) return true;
356 | }
357 | x += sw;
358 | }
359 | return false;
360 | }
361 |
362 | function cloudBounds(bounds, d) {
363 | var b0 = bounds[0],
364 | b1 = bounds[1];
365 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0;
366 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0;
367 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1;
368 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1;
369 | }
370 |
371 | function collideRects(a, b) {
372 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y;
373 | }
374 |
375 | function archimedeanSpiral(size) {
376 | var e = size[0] / size[1];
377 | return function(t) {
378 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)];
379 | };
380 | }
381 |
382 | function rectangularSpiral(size) {
383 | var dy = 4,
384 | dx = dy * size[0] / size[1],
385 | x = 0,
386 | y = 0;
387 | return function(t) {
388 | var sign = t < 0 ? -1 : 1;
389 | // See triangular numbers: T_n = n * (n + 1) / 2.
390 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) {
391 | case 0: x += dx; break;
392 | case 1: y += dy; break;
393 | case 2: x -= dx; break;
394 | default: y -= dy; break;
395 | }
396 | return [x, y];
397 | };
398 | }
399 |
400 | // TODO reuse arrays?
401 | function zeroArray(n) {
402 | var a = [],
403 | i = -1;
404 | while (++i < n) a[i] = 0;
405 | return a;
406 | }
407 |
408 | var cloudRadians = Math.PI / 180,
409 | cw = 1 << 11 >> 5,
410 | ch = 1 << 11,
411 | canvas,
412 | ratio = 1;
413 |
414 | if (typeof document !== "undefined") {
415 | canvas = document.createElement("canvas");
416 | canvas.width = 1;
417 | canvas.height = 1;
418 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2);
419 | canvas.width = (cw << 5) / ratio;
420 | canvas.height = ch / ratio;
421 | } else {
422 | // Attempt to use node-canvas.
423 | canvas = new Canvas(cw << 5, ch);
424 | }
425 |
426 | var c = canvas.getContext("2d"),
427 | spirals = {
428 | archimedean: archimedeanSpiral,
429 | rectangular: rectangularSpiral
430 | };
431 | c.fillStyle = c.strokeStyle = "red";
432 | c.textAlign = "center";
433 |
434 | if (typeof module === "object" && module.exports) module.exports = cloud;
435 | else (d3.layout || (d3.layout = {})).cloud = cloud;
436 | })();
437 |
--------------------------------------------------------------------------------
/profiler.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from dateutil import parser
3 | import datetime
4 | import pytz # $ pip install pytz
5 | from collections import Counter
6 | import operator
7 | import re
8 | import d3output
9 | import fileinput
10 | import json
11 | import os
12 | import glob
13 | import ast
14 |
15 | class Profiler:
16 | def __init__(self, opts):
17 | for k, v in opts.items():
18 | setattr(self, k, v)
19 |
20 | # set defaults
21 | if not("labelFormat" in opts):
22 | self.labelFormat = "%Y-%m-%d %H:%M:%S %Z"
23 | if not("tz" in opts):
24 | self.tz = pytz.UTC
25 | if not("extended" in opts):
26 | self.extended = False
27 | if not("blocks" in opts):
28 | self.blocks = ["all"]
29 | if "all" in self.blocks:
30 | self.blocks.extend(["topusers", "tophashtags", "topurls", "topimageurls", "urls",
31 | "imageurls"])
32 |
33 | # initialize
34 | self.count = 0
35 | self.typecounts = {"original": 0, "retweet": 0, "quote": 0, "reply": 0}
36 | self.originalcount = 0
37 | self.retweetcount = 0
38 | self.quotecount = 0
39 | self.quoteandretweetcount = 0
40 | self.replycount = 0
41 | self.geocount = 0
42 | self.earliest = ""
43 | self.latest = ""
44 | self.users = Counter()
45 | if self.extended:
46 | if "tophashtags" in self.blocks:
47 | self.hashtags = Counter()
48 | self.hashtagcount = 0
49 | if "urls" in self.blocks or "topurls" in self.blocks:
50 | self.urls = Counter()
51 | self.urlcount = 0
52 | if "imageurls" in self.blocks or "topimageurls" in self.blocks:
53 | self.imageurls = Counter()
54 | self.imageurlcount = 0
55 |
56 |
57 | def adduser(self, user, tweet):
58 | self.users[user] += 1
59 |
60 | def addurl(self, url):
61 | self.urls[url] += 1
62 |
63 | def addhashtag(self, hashtag):
64 | self.hashtags[hashtag] += 1
65 |
66 | def addimageurl(self, imageurl):
67 | self.imageurls[imageurl] += 1
68 |
69 | def process(self, tweet):
70 | self.count += 1
71 | tweettype = ""
72 | if "retweeted_status" in tweet:
73 | tweettype = "retweet"
74 | elif tweet["is_quote_status"]:
75 | tweettype = "quote"
76 | elif tweet["in_reply_to_status_id"] != None:
77 | tweettype = "reply"
78 | else: tweettype = "original"
79 | if tweet.get("geo") != None:
80 | self.geocount += 1
81 | self.typecounts[tweettype] += 1
82 |
83 | self.created_at = parser.parse(tweet["created_at"])
84 | if self.earliest == "" or self.earliest > self.created_at:
85 | self.earliest = self.created_at
86 | if self.latest == "" or self.latest < self.created_at:
87 | self.latest = self.created_at
88 | user = tweet["user"]["screen_name"]
89 | self.adduser(user, tweet)
90 | if self.extended:
91 | # handle urls
92 | if "urls" in self.blocks or "topurls" in self.blocks:
93 | if len(tweet["entities"]["urls"]) > 0:
94 | for url in tweet["entities"]["urls"]:
95 | self.addurl(url["expanded_url"])
96 | self.urlcount += 1
97 |
98 | # handle hashtags
99 | if "hashtags" in self.blocks or "tophashtags" in self.blocks:
100 | if len(tweet["entities"]["hashtags"]) > 0:
101 | for tag in tweet["entities"]["hashtags"]:
102 | # hashtags are not case sensitive, so lower() to dedupe
103 | # or just leave it and accept dupes?
104 | self.addhashtag(tag["text"].lower())
105 | self.hashtagcount += 1
106 |
107 | # handle imageurls
108 | if "imageurls" in self.blocks or "topimageurls" in self.blocks:
109 | if "media" in tweet["entities"]:
110 | hasimageurl = False
111 | for media in tweet["entities"]["media"]:
112 | if media["type"] == "photo":
113 | self.addimageurl(media["media_url"])
114 | hasimageurl = True
115 | if hasimageurl:
116 | self.imageurlcount += 1
117 |
118 | def gettweets(self, opts, args):
119 | # prepare to serialize opts and args as json
120 | # converting opts to str produces string with single quotes,
121 | # but json requires double quotes
122 | self.optsdict = ast.literal_eval(str(opts))
123 | self.argsdict = ast.literal_eval(str(args))
124 |
125 | # if args has one value, check whether it's a directory
126 | if len(args) == 1 and os.path.isdir(args[0]):
127 | # add path to metadata file and tweets
128 | self.metadatafile = os.path.join(args[0] , "metadata.json")
129 | args = glob.glob(os.path.join(args[0], "data/tweets/tweets-*.json"))
130 | else:
131 | # args must be files, so calculate path to metadata file based on
132 | # dir of first input file
133 | self.metadatafile = os.path.join(os.path.dirname(args[0]), "metadata.json")
134 | for line in fileinput.input(args):
135 | try:
136 | tweet = json.loads(line)
137 | self.process(tweet)
138 | except ValueError as e:
139 | sys.stderr.write("uhoh: %s\n" % e)
140 |
141 | def tops(self, list, title):
142 | # given a list of name-value pairs, return the top 10 pairs by value,
143 | # and a list of integers representing the percent of total value
144 | # held by each of 10 slices
145 |
146 | totalcount = len(list)
147 | totalvalue = int(sum(list.values()))
148 | sorted = list.most_common()
149 |
150 | top = sorted[:10]
151 | top_result = []
152 | for name, value in top:
153 | top_result.append({"name": name, "value": value})
154 |
155 | step = float(totalcount) / 10
156 | percentiles = []
157 | for i in range(0, 10):
158 | start = int(i * step)
159 | end = int((i + 1) * step)
160 | slicecount = end - start
161 | if slicecount > 0:
162 | # weight the slice value as if the slice were an even 10th of the list
163 | weight = 10 / (float(slicecount) / totalcount)
164 | slicevalue = sum(v for k,v in sorted[start:end])
165 | percentile = int(round(float(slicevalue) / totalvalue * weight))
166 | else:
167 | percentile = 0
168 | percentiles.append(percentile)
169 | return {"top" + title: top_result, title+"percentiles": percentiles}
170 |
171 | def report(self):
172 | local_earliest = self.tz.normalize(self.earliest.astimezone(self.tz)).strftime(self.labelFormat)
173 | local_latest = self.tz.normalize(self.latest.astimezone(self.tz)).strftime(self.labelFormat)
174 | result = {"count": self.count,
175 | "originalcount": self.typecounts["original"],
176 | "retweetcount": self.typecounts["retweet"],
177 | "quotecount": self.typecounts["quote"],
178 | "replycount": self.typecounts["reply"],
179 | "geocount": self.geocount,
180 | "earliest": local_earliest,
181 | "latest": local_latest,
182 | "usercount": len(self.users),
183 | "opts": self.optsdict,
184 | "args": self.argsdict,
185 | "metadatafile": self.metadatafile}
186 | if self.extended:
187 | if "topusers" in self.blocks:
188 | result.update(self.tops(self.users, "users"))
189 | if "tophashtags" in self.blocks:
190 | result.update(self.tops(self.hashtags, "hashtags"))
191 | if "topurls" in self.blocks:
192 | result.update(self.tops(self.urls, "urls"))
193 | if "urls" in self.blocks:
194 | result.update({"urlcount": self.urlcount, "urls": len(self.urls),
195 | "imageurlcount": self.imageurlcount, "imageurls": len(self.imageurls),
196 | "hashtagcount": self.hashtagcount, "hashtags": len(self.hashtags)})
197 | if "topimageurls" in self.blocks:
198 | result.update(self.tops(self.imageurls, "imageurls"))
199 | if "imageurls" in self.blocks:
200 | result.update({"imageurlslist": self.imageurls})
201 | return result
202 |
203 | class LinkNodesProfiler(Profiler):
204 | def __init__(self, opts):
205 | Profiler.__init__(self, opts)
206 | self.nodes = {}
207 | self.nodeid = 0
208 |
209 | # nodes will end up as
210 | # {"userA":
211 | # {"id": 27,
212 | # "source": 0,
213 | # "target": 1,
214 | # "links": {
215 | # "userB": 3,
216 | # "userC": 1
217 | # }
218 | #
219 | # Meaning that userA mentions userB 3 times, and userB mentions userA once.
220 | # We gather the nodes in a dictionary so that we can look up terms to update
221 | # counts, but at the end we convert the dictionary into a list sorted by id
222 | # so that the positions in the list correspond to the ids, as D3 requires.
223 |
224 | def addlink(self, source, target):
225 | if not source in self.nodes:
226 | self.nodes[source] = {"name": source, "id": self.nodeid, "tweetcount": 0,
227 | "source": 1, "target": 0, "links": {}}
228 | self.nodeid += 1
229 | else:
230 | self.nodes[source]["source"] += 1
231 |
232 | if not target in self.nodes:
233 | targetid = self.nodeid
234 | self.nodes[target] = {"name": target, "id": self.nodeid, "tweetcount": 0,
235 | "source": 0, "target": 1, "links": {}}
236 | self.nodeid += 1
237 | else:
238 | self.nodes[target]["target"] += 1
239 | targetid = self.nodes[target]["id"]
240 |
241 | linklist = self.nodes[source]["links"]
242 | if not target in linklist:
243 | linklist[target] = {"count": 1, "id": targetid}
244 | else:
245 | linklist[target]["count"] += 1
246 |
247 | def addsingle(self, name):
248 | if not name in self.nodes:
249 | self.nodes[name] = {"name": name, "id": self.nodeid, "tweetcount": 1,
250 | "source": 0, "target": 0, "links": {}}
251 | self.nodeid += 1
252 |
253 | def report(self):
254 | if hasattr(self, "graph"):
255 | self.optsdict["graph"] = self.graph
256 | if hasattr(self, "field"):
257 | self.optsdict["field"] = self.field
258 | profile = Profiler.report(self)
259 | # convert nodes dictionary to a list, sorted by id
260 | nodelistkeys = sorted(self.nodes, key=lambda w: self.nodes[w]["id"])
261 | nodelist = []
262 | for key in nodelistkeys:
263 | nodelist.append(self.nodes[key])
264 | return {"profile": profile, "nodes": nodelist}
265 |
266 | class TimeProfiler(Profiler):
267 | # interval, in milliseconds
268 | intervalFormats = {
269 | "S": {"name": "second", "format": "%Y-%m-%d %H:%M:%S", "interval": 1000},
270 | "M": {"name": "minute", "format": "%Y-%m-%d %H:%M", "interval": 1000 * 60},
271 | "H": {"name": "hour", "format": "%Y-%m-%d %H", "interval": 1000 * 60 * 60},
272 | "d": {"name": "day", "format": "%Y-%m-%d", "interval": 1000 * 60 * 60 * 24},
273 | "m": {"name": "month", "format": "%Y-%m", "interval": 1000 * 60 * 60 * 24 * 28},
274 | "Y": {"name": "year", "format": "%Y-%m", "interval": 1000 * 60 * 60 * 24 * 365}
275 | }
276 | def __init__(self, opts):
277 | Profiler.__init__(self, opts)
278 | try:
279 | self.intervalParts = re.search("([0-9]*)([^0-9]*)", self.intervalStr)
280 | if self.intervalParts.group(1) == "":
281 | self.intervalCount = 1
282 | else:
283 | self.intervalCount = int(self.intervalParts.group(1))
284 | self.intervalUnit = self.intervalParts.group(2)
285 | self.interval = self.intervalCount * self.intervalFormats[self.intervalUnit]["interval"]
286 | self.format = self.intervalFormats[self.intervalUnit]["format"]
287 | self.intervalLabel = str(self.intervalCount) + " " + self.intervalFormats[self.intervalUnit]["name"]
288 | if self.intervalCount > 1:
289 | self.intervalLabel += "s"
290 |
291 | except ValueError as e:
292 | sys.stderr.write("uhoh: %s\n" % e)
293 |
294 | # gather in a dict with count if aggregating, otherwise in a list
295 | if self.aggregate:
296 | self.items = {}
297 | else:
298 | self.items = []
299 |
300 | def process(self, tweet):
301 | Profiler.process(self, tweet)
302 | created_at = parser.parse(tweet["created_at"])
303 | local_dt = self.tz.normalize(created_at.astimezone(self.tz))
304 | if self.intervalStr != "":
305 | if self.intervalUnit == "S":
306 | local_dt = local_dt - datetime.timedelta(seconds=local_dt.second % int(self.intervalCount))
307 | elif self.intervalUnit == "M":
308 | local_dt = local_dt - datetime.timedelta(minutes=local_dt.minute % int(self.intervalCount))
309 | elif self.intervalUnit == "H":
310 | local_dt = local_dt - datetime.timedelta(hours=local_dt.hour % int(self.intervalCount))
311 | # otherwise use format to aggregate values - though this treats intervalCount as 1
312 | result = local_dt.strftime(self.format)
313 | if self.aggregate:
314 | self.items[result] = self.items.get(result, 0) + 1
315 | else:
316 | self.items.append(result)
317 | # return the time slice label
318 | return result
319 |
320 | def report(self):
321 | self.optsdict["interval"] = self.interval
322 | self.optsdict["format"] = self.format
323 | self.optsdict["intervalLabel"] = self.intervalLabel
324 | profile = Profiler.report(self)
325 | if self.output == "csv":
326 | if self.aggregate:
327 | values = d3output.namevaluecsv(self.items)
328 | else:
329 | values = d3output.valuecsv(self.items)
330 | return values
331 | else:
332 | if self.aggregate:
333 | values = d3output.namevaluejson(self.items)
334 | else:
335 | values = d3output.valuejson(self.items)
336 | return {"profile": profile, "values": values}
337 |
--------------------------------------------------------------------------------
/templates/graph.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | $TITLE$
4 |
5 |
126 |
127 |
128 |
129 |