├── .gitignore
├── README.md
├── analysis.py
├── arxiv
    ├── __init__.py
    ├── db_utils.py
    ├── lda.py
    ├── parse.py
    ├── scrape.py
    ├── stops.txt
    ├── text_utils.py
    └── vocab.py
└── twitter
    ├── .gitignore
    └── twitter.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.xml
 2 | .DS_Store
 3 | *.pyc
 4 | records
 5 | venv
 6 | settings.sh
 7 | lambda*.txt
 8 | models
 9 | vocab.txt
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ArXiv analysis
 2 | 
 3 | Run [online variational LDA](http://arxiv.org/abs/1206.7051v1) on all the
 4 | abstracts from the arXiv. The implementation is based on [Matt Hoffman's
 5 | GPL licensed code](http://www.cs.princeton.edu/~mdhoffma/).
 6 | 
 7 | ## Usage
 8 | 
 9 | You'll need a [`mongod`](http://www.mongodb.org/) instance running on
10 | the port given by the environment variable `MONGO_PORT` and a
11 | [`redis-server`](http://redis.io/) instance running on the port given by
12 | the `REDIS_PORT` environment variable.
13 | 
14 | The code depends on the Python packages: `numpy`, `scipy`, `requests`,
15 | `pymongo` and `redis`.
16 | 
17 | * `mkdir abstracts`
18 | * `./analysis.py scrape abstracts` — scrapes all the metadata from the arXiv
19 |   [OAI interface](http://arxiv.org/help/oa/index) and saves the raw XML
20 |   responses as `abstracts/raw-*.xml`. This takes a _long time_ because of
21 |   the arXiv's flow control policies. It took me approximately 6 hours.
22 | * `./analysis.py parse abstracts/raw-*.xml` — parses the raw responses and
23 |   saves the abstracts to a MongoDB database called `arxiv` in the collection
24 |   called `abstracts`.
25 | * `./analysis.py build-vocab` — counts all the words in the corpus, removing
26 |   anything with fewer than 3 characters and any stop words.
27 | * `./analysis.py get-vocab 100 5000 > vocab.txt` — lists the vocabulary
28 |   skipping the first 100 most popular words and keeping 5000 words total.
29 | * `./analysis.py run vocab.txt` — runs online variational LDA by randomly
30 |   selecting articles from the database. The topic distributions are stored
31 |   in the `lambda-*.txt` files. This will run forever so just kill it whenever
32 |   you feel like it.
33 | * `./analysis.py results vocab.txt lambda-100.txt` — lists the topics and their
34 |   most common words at step 100.
35 | 


--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from __future__ import print_function
 4 | 
 5 | import os
 6 | import sys
 7 | import numpy as np
 8 | 
 9 | try:
10 |     import arxiv
11 |     arxiv = arxiv
12 | except ImportError:
13 |     sys.path.append(os.path.join(os.path.abspath(__file__), u".."))
14 |     import arxiv
15 |     arxiv = arxiv
16 | 
17 | from arxiv.db_utils import db
18 | 
19 | 
if __name__ == "__main__":
    # Command-line driver: ``analysis.py <command> [arguments...]``.
    if len(sys.argv) < 2:
        sys.exit("usage: analysis.py "
                 "{scrape,parse,build-vocab,get-vocab,run,results} [args...]")
    cmd = sys.argv[1]

    if cmd == u"scrape":
        print(u"Scraping all the meta-data from the arxiv...")
        # Optional argument: the directory that receives the raw XML files.
        if len(sys.argv) >= 3:
            arxiv.get(basepath=sys.argv[2])
        else:
            arxiv.get()

    if cmd == u"parse":
        print(u"Parsing the XML...")
        # ``parse`` requires the list of files; pass every remaining argument.
        arxiv.parse(sys.argv[2:])

    if cmd == u"build-vocab":
        print(u"Building the vocabulary list...")
        arxiv.build_vocab()

    if cmd == u"get-vocab":
        initial, N = 1000, 5000
        # Both arguments are optional and independent of each other, so they
        # are tested separately (an ``elif`` here could never be reached).
        if len(sys.argv) >= 3:
            initial = int(sys.argv[2])
        if len(sys.argv) >= 4:
            N = int(sys.argv[3])

        arxiv.get_vocab(initial=initial, N=N)

    if cmd in [u"run", u"results"]:
        # Both commands take the vocabulary file (one word per line) first.
        fn = sys.argv[2]
        with open(fn) as f:
            vocab = [l.strip() for l in f]

    if cmd == u"run":
        print(u"Running online LDA...")
        coll = db.abstracts
        coll.ensure_index(u"random")

        batch_size = 128
        ndocs = coll.count()
        ntopics = 100

        lda = arxiv.LDA(vocab, ntopics, ndocs, 1.0 / ntopics, 1.0 / ntopics,
                        1025.0, 0.8)

        # Runs until the process is killed; lambda is saved every 10 steps.
        iteration = 0
        while 1:
            docs = []
            # Pick a random offset into the collection ordered by the
            # ``random`` field and read one batch from there.
            ind = np.random.randint(ndocs - batch_size)
            cursor = coll.find({}, {u"abstract": 1, u"title": 1}) \
                         .sort(u"random").skip(ind)
            for i, d in enumerate(cursor):
                if i >= batch_size:
                    break
                if u"title" in d:
                    docs.append(d[u"title"] + u" " + d[u"abstract"])
            gamma, lam, bound = lda.update(docs)
            print(iteration, ind, np.exp(-bound))

            if iteration % 10 == 0:
                np.savetxt(u"lambda-{0}.txt".format(iteration), lam)

            iteration += 1

    if cmd == u"results":
        print(u"Displaying results...")
        # Second argument: a ``lambda-*.txt`` file written by ``run``.
        fn = sys.argv[3]
        lam = np.loadtxt(fn)

        # Rows of ``lam.T`` are topics; normalize each one and list its ten
        # heaviest words with their weight in percent.
        for i, l in enumerate(lam.T):
            l /= np.sum(l)
            tmp = sorted(zip(l, range(len(l))), key=lambda x: x[0],
                         reverse=True)
            print(u"Topic {0}: ".format(i) +
                  u", ".join([u"{0} ({1:.1f})".format(vocab[t[1]], 100 * t[0])
                              for t in tmp[:10]]))
91 | 


--------------------------------------------------------------------------------
/arxiv/__init__.py:
--------------------------------------------------------------------------------
1 | from scrape import *
2 | from parse import *
3 | from vocab import *
4 | from lda import *
5 | 


--------------------------------------------------------------------------------
/arxiv/db_utils.py:
--------------------------------------------------------------------------------
 1 | __all__ = [u"db", u"rdb"]
 2 | 
 3 | import os
 4 | import pymongo
 5 | import redis
 6 | 
 7 | 
# Connection settings are read from the environment, falling back to the
# literal defaults given here.
server = os.environ.get(u"MONGO_SERVER", u"localhost")
port = int(os.environ.get(u"MONGO_PORT", 27017))
redis_server = os.environ.get(u"REDIS_SERVER", u"localhost")
redis_port = int(os.environ.get(u"REDIS_PORT", 27019))

# Shared handles created at import time: ``db`` is the ``arxiv`` database of
# the MongoDB connection and ``rdb`` is the Redis client.
# NOTE(review): ``pymongo.Connection`` looks like the legacy client class;
# confirm that the installed PyMongo version still provides it.
db = pymongo.Connection(server, port).arxiv
rdb = redis.Redis(host=redis_server, port=redis_port)
15 | 


--------------------------------------------------------------------------------
/arxiv/lda.py:
--------------------------------------------------------------------------------
  1 | __all__ = [u"LDA"]
  2 | 
  3 | import numpy as np
  4 | from scipy.special import gammaln, psi
  5 | 
  6 | from .text_utils import tokenize_document
  7 | 
  8 | 
  9 | def dir_expect(alpha):
 10 |     if len(alpha.shape) == 1:
 11 |         return psi(alpha) - psi(np.sum(alpha))
 12 |     return psi(alpha) - psi(np.sum(alpha, axis=-1))[:, None]
 13 | 
 14 | 
 15 | class LDA:
 16 | 
 17 |     def __init__(self, vocab, ntopics, ndocs, alpha, eta, delay, rate):
 18 |         self.vocab = list(vocab)
 19 |         self.K = ntopics
 20 |         self.W = len(self.vocab)
 21 |         self.D = ndocs
 22 |         self.alpha = alpha
 23 |         self.eta = eta
 24 |         self.tau = delay
 25 |         self.kappa = rate
 26 | 
 27 |         self.lam = np.random.gamma(100.0, 0.01, (self.W, self.K))
 28 |         self.Elogbeta = dir_expect(self.lam)
 29 |         self.expElogbeta = np.exp(self.Elogbeta)
 30 | 
 31 |         self.tstep = 0
 32 | 
 33 |     def _expectation(self, docs, maxiter=100, tol=0.0001, eps=1e-100):
 34 |         docs = [tokenize_document(d, vocab=self.vocab) for d in docs]
 35 |         size = len(docs)
 36 |         gamma = np.random.gamma(100.0, 0.01, (size, self.K))
 37 |         expElogth = np.exp(dir_expect(gamma))
 38 |         stats = np.zeros_like(self.lam)
 39 | 
 40 |         for i, doc in enumerate(docs):
 41 |             try:
 42 |                 word_ids, word_counts = zip(*[w for w in doc.iteritems()])
 43 |             except ValueError:
 44 |                 continue
 45 |             word_ids = np.array(word_ids, dtype=int)
 46 |             word_counts = np.array(word_counts, dtype=int)
 47 |             gamma_d = gamma[i]
 48 |             expElogth_d = expElogth[i]
 49 |             expElogbeta_d = self.expElogbeta[word_ids]
 50 |             norm = np.dot(expElogth_d, expElogbeta_d.T) + eps
 51 |             for j in range(maxiter):
 52 |                 gamma0 = gamma
 53 |                 gamma_d = self.alpha + expElogth_d * np.dot(
 54 |                                 word_counts / norm, expElogbeta_d)
 55 |                 expElogth_d = np.exp(dir_expect(gamma_d))
 56 |                 norm = np.dot(expElogbeta_d, expElogth_d) + eps
 57 | 
 58 |                 delta = np.mean(np.abs(gamma_d - gamma0))
 59 |                 if delta < tol:
 60 |                     break
 61 |             gamma[i] = gamma_d
 62 |             stats[word_ids] += np.outer(word_counts / norm, expElogth_d)
 63 | 
 64 |         stats *= self.expElogbeta
 65 | 
 66 |         return gamma, stats, docs
 67 | 
 68 |     def update(self, docs, **kwargs):
 69 |         rho = (self.tau + self.tstep) ** -self.kappa
 70 |         gamma, stats, docs = self._expectation(docs, **kwargs)
 71 |         bound = self.approx_bound(docs, gamma, preprocess=False)
 72 | 
 73 |         # Update lambda.
 74 |         self.lam = self.lam * (1.0 - rho) + rho * (self.eta
 75 |                                             + self.D * stats / len(docs))
 76 |         self.Elogbeta = dir_expect(self.lam)
 77 |         self.expElogbeta = np.exp(self.Elogbeta)
 78 |         self.tstep += 1
 79 | 
 80 |         return gamma, self.lam, bound
 81 | 
 82 |     def approx_bound(self, docs, gamma, preprocess=True):
 83 |         if preprocess:
 84 |             docs = [tokenize_document(d, vocab=self.vocab) for d in docs]
 85 | 
 86 |         Elogth = dir_expect(gamma)
 87 | 
 88 |         score = 0.0
 89 |         fullnorm = 0.0
 90 | 
 91 |         for i, doc in enumerate(docs):
 92 |             try:
 93 |                 word_ids, word_counts = zip(*[w for w in doc.iteritems()])
 94 |             except ValueError:
 95 |                 continue
 96 |             word_ids = np.array(word_ids, dtype=int)
 97 |             word_counts = np.array(word_counts, dtype=int)
 98 |             norm = np.zeros(len(word_ids))
 99 |             for j in range(len(word_ids)):
100 |                 tmp = Elogth[i] + self.Elogbeta[word_ids[j]]
101 |                 tmax = np.max(tmp)
102 |                 norm[j] = np.log(sum(np.exp(tmp - tmax))) + tmax
103 |             score += np.sum(word_counts * norm)
104 |             fullnorm += np.sum(word_counts)
105 | 
106 |         score += np.sum((self.alpha - gamma) * Elogth)
107 |         score += np.sum(gammaln(gamma) - gammaln(self.alpha))
108 |         score += np.sum(gammaln(self.alpha * self.K) -
109 |                         gammaln(np.sum(gamma, axis=1)))
110 | 
111 |         score *= self.D / len(docs)
112 | 
113 |         score += np.sum((self.eta - self.lam) * self.Elogbeta)
114 |         score += np.sum(gammaln(self.lam) - gammaln(self.eta))
115 |         score += np.sum(gammaln(self.eta * self.W) -
116 |                         gammaln(np.sum(self.lam, axis=1)))
117 | 
118 |         return score * len(docs) / fullnorm / self.D
119 | 


--------------------------------------------------------------------------------
/arxiv/parse.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | __all__ = [u"parse"]
 4 | 
 5 | import os
 6 | import re
 7 | import random
 8 | from datetime import datetime
 9 | import xml.etree.cElementTree as ET
10 | from multiprocessing import Pool
11 | 
12 | from .db_utils import db
13 | 
14 | 
# ElementTree path that matches every <record> element in the OAI 2.0
# namespace.
record_tag = u".//{http://www.openarchives.org/OAI/2.0/}record"
# Captures the bare name from a namespaced tag of the form "{namespace}name".
ns_re = re.compile(r"\{(?:.*?)\}(.*)")
# ``strptime`` format applied to the text of ``date`` elements.
date_fmt = u"%a, %d %b %Y %H:%M:%S %Z"

# Separators between list entries: ", and ", " and " or a plain comma.
comma_and = r"(?:,* and )|(?:,\s*)"
ca_re = re.compile(comma_and)
# One list entry up to the next separator or the end of the string.  Both
# halves are raw strings so that ``\s`` is never read as a string escape.
au_re = re.compile(r"(.+?)(?:" + comma_and + r"|(?:\s*$))")

# Splits an entry into the text before an optional trailing "(...)" group and
# the contents of that group.
affil_re = re.compile(r"(.*?)(?:\((.*)\)|$)")
# Finds "(<number>) <text>" pairs in an affiliation legend.
affils_re = re.compile(r"\(([0-9]+)\) (.*?)(?=(?:,*\s*\()|\))")


# Not referenced by the rest of this module (the database handle comes from
# ``db_utils``); kept so the module-level names stay available.
server = os.environ.get(u"MONGO_SERVER", "localhost")
port = int(os.environ.get(u"MONGO_PORT", 27017))
29 | 
30 | 
def parse_one(f):
    """
    Parse one XML file and insert a document into ``db.abstracts`` for every
    element matched by ``record_tag``.

    :param f: the XML source handed to ``ET.parse``.

    """
    print(u"Starting: {0}".format(f))
    coll = db.abstracts

    tree = ET.parse(f)
    root = tree.getroot()
    for i, r in enumerate(root.findall(record_tag)):
        doc = {}
        # Walk the record element and all of its descendants.
        for el in r.iter():
            txt = el.text
            if txt is None:
                # Elements without text: store their attributes under the
                # lower-cased attribute names.
                for k, v in el.attrib.iteritems():
                    doc[unicode(k.lower())] = unicode(v)
            elif txt.strip() != u"":
                # Elements with non-blank text: the key is the lower-cased tag
                # name with the namespace removed.  Elements whose text is
                # only whitespace are skipped entirely.
                k = unicode(ns_re.search(el.tag).groups()[0].lower())
                txt = unicode(txt.strip())

                if k == u"date":
                    txt = datetime.strptime(txt, date_fmt)
                elif k == u"categories":
                    txt = [c.strip() for c in txt.split()]
                elif k == u"authors":
                    # Everything after the first "((" is treated as the
                    # affiliation legend and turned into a number -> text
                    # mapping by ``affils_re``.
                    spl = txt.replace(u"\n", u"").split(u"((")
                    if len(spl) > 1:
                        if len(spl) > 2:
                            spl = [spl[0], u"(".join(spl[1:])]
                        authors, affils = spl
                        affils = dict(affils_re.findall(u"(" + affils))
                    else:
                        authors, affils = txt, {}
                    # One (name, parenthesized text) pair per author.
                    authors = [affil_re.findall(a.strip())[0]
                                            for a in au_re.findall(authors)]
                    doc[u"authors"] = []
                    for a in authors:
                        if len(a[1]) > 0 and a[1][0] in u"1234567890":
                            # The parenthesized text starts with a digit:
                            # look each entry up in the legend, keeping the
                            # entry itself when the legend has no such key.
                            doc[u"authors"].append({u"name": a[0],
                                    u"affil": ", ".join([affils.get(af.strip(),
                                                                    af.strip())
                                        for af in ca_re.split(a[1])])})
                        else:
                            doc[u"authors"].append({u"name": a[0],
                                u"affil": a[1].strip()})

                    # The unparsed string is stored below under this key, so
                    # the structured list in ``doc["authors"]`` is kept.
                    k = u"authors_raw"

                doc[k] = txt

        # Add a random number for selecting random documents later.
        doc[u"random"] = random.random()
        coll.insert(doc)

    print(u"Finished {0}".format(f))
83 | 
84 | 
def parse(fns):
    """
    Run ``parse_one`` on every file in ``fns`` using a pool of worker
    processes.

    :param fns: iterable of XML file names.

    """
    p = Pool()
    try:
        p.map(parse_one, list(fns))
    finally:
        # Shut the workers down even when one of them raised.
        p.close()
        p.join()
88 | 
89 | 
if __name__ == "__main__":
    # Script entry point: every command-line argument is a file to parse.
    from sys import argv
    parse(argv[1:])
93 | 


--------------------------------------------------------------------------------
/arxiv/scrape.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from __future__ import print_function
 4 | 
 5 | __all__ = [u"get"]
 6 | 
 7 | import os
 8 | import re
 9 | import time
10 | 
11 | import requests
12 | 
13 | 
# Captures the contents of a <resumptionToken> element in a response body.
resume_re = re.compile(r".*<resumptionToken.*?>(.*?)</resumptionToken>.*")
# Endpoint that ``get`` posts its requests to.
url = "http://export.arxiv.org/oai2"
16 | 
17 | 
def get(basepath=u".", max_tries=10):
    """
    Get all the listings from the ArXiv.

    Posts ``ListRecords`` requests to ``url`` and follows the resumption
    tokens until none is returned, saving every successful response body as
    ``raw-<counter>.xml`` in ``basepath``.

    :param basepath: directory that receives the XML files (it is not
        created here).
    :param max_tries: number of consecutive 503 responses after which the
        download is abandoned.

    """
    req = {u"verb": u"ListRecords",
           u"metadataPrefix": u"arXivRaw"}

    failures = 0
    count = 0
    while True:
        # Send the request.
        r = requests.post(url, data=req)

        # Handle the response.
        code = r.status_code

        if code == 503:
            # Asked to retry
            to = int(r.headers["retry-after"])
            print(u"Got 503. Retrying after {0:d} seconds.".format(to))

            time.sleep(to)
            failures += 1
            if failures >= max_tries:
                print(u"Failed too many times...")
                break

        elif code == 200:
            failures = 0

            # Write to file.  The undecoded bytes are stored so the file keeps
            # the encoding the server sent, and so non-ASCII characters cannot
            # fail while being re-encoded on the way to disk.
            count += 1
            fn = os.path.join(basepath, u"raw-{0:08d}.xml".format(count))
            print(u"Writing to: {0}".format(fn))
            with open(fn, "wb") as f:
                f.write(r.content)

            # Look for a resumption token.
            token = resume_re.search(r.text)
            if token is None:
                break
            token = token.groups()[0]

            # If there isn't one, we're all done.
            if token == "":
                print(u"All done.")
                break

            print(u"Resumption token: {0}.".format(token))

            # If there is a resumption token, rebuild the request.
            req = {u"verb": u"ListRecords",
                   u"resumptionToken": token}

            # Pause so as not to get banned.
            to = 20
            print(u"Sleeping for {0:d} seconds so as not to get banned."
                  .format(to))
            time.sleep(to)

        else:
            # Wha happen'?
            r.raise_for_status()
            # ``raise_for_status`` only raises for error codes; stop on any
            # other unexpected status instead of re-posting without a pause.
            raise RuntimeError(
                "Unexpected status code {0}".format(code))
83 | 
84 | 
if __name__ == "__main__":
    import sys

    # The only (optional) argument is the directory to write into.
    bp = u"." if len(sys.argv) == 1 else sys.argv[1]
    get(basepath=bp)
94 | 


--------------------------------------------------------------------------------
/arxiv/stops.txt:
--------------------------------------------------------------------------------
  1 | i
  2 | me
  3 | my
  4 | myself
  5 | we
  6 | our
  7 | ours
  8 | ourselves
  9 | you
 10 | your
 11 | yours
 12 | yourself
 13 | yourselves
 14 | he
 15 | him
 16 | his
 17 | himself
 18 | she
 19 | her
 20 | hers
 21 | herself
 22 | it
 23 | its
 24 | itself
 25 | they
 26 | them
 27 | their
 28 | theirs
 29 | themselves
 30 | what
 31 | which
 32 | who
 33 | whom
 34 | this
 35 | that
 36 | these
 37 | those
 38 | am
 39 | is
 40 | are
 41 | was
 42 | were
 43 | be
 44 | been
 45 | being
 46 | have
 47 | has
 48 | had
 49 | having
 50 | do
 51 | does
 52 | did
 53 | doing
 54 | a
 55 | an
 56 | the
 57 | and
 58 | but
 59 | if
 60 | or
 61 | because
 62 | as
 63 | until
 64 | while
 65 | of
 66 | at
 67 | by
 68 | for
 69 | with
 70 | about
 71 | against
 72 | between
 73 | into
 74 | through
 75 | during
 76 | before
 77 | after
 78 | above
 79 | below
 80 | to
 81 | from
 82 | up
 83 | down
 84 | in
 85 | out
 86 | on
 87 | off
 88 | over
 89 | under
 90 | again
 91 | further
 92 | then
 93 | once
 94 | here
 95 | there
 96 | when
 97 | where
 98 | why
 99 | how
100 | all
101 | any
102 | both
103 | each
104 | few
105 | more
106 | most
107 | other
108 | some
109 | such
110 | no
111 | nor
112 | not
113 | only
114 | own
115 | same
116 | so
117 | than
118 | too
119 | very
120 | s
121 | t
122 | can
123 | will
124 | just
125 | don
126 | should
127 | now
128 | .
129 | ,
130 | ?
131 | !
132 | [
133 | ]
134 | {
135 | }
136 | (
137 | )
138 | ;
139 | :
140 | "
141 | '
142 | =
143 | $
144 | <
145 | >
146 | \
147 | /
148 | |
149 | %
150 | ^
151 | &
152 | #
153 | 


--------------------------------------------------------------------------------
/arxiv/text_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from collections import defaultdict
 3 | 
 4 | 
 5 | punct = u".,?![]{}();:\"'=$<>\\/|%^&#`"
 6 | 
 7 | # Load in the list of stop words.
 8 | stopfn = os.path.join(os.path.dirname(os.path.abspath(__file__)), u"stops.txt")
 9 | stops = [line.strip() for line in open(stopfn)]
10 | 
11 | 
def tokenize_document(txt, vocab=None):
    """
    Split ``txt`` on whitespace into lower-cased tokens with the characters
    in ``punct`` stripped from both ends.

    :param txt: the text to tokenize.
    :param vocab: optional sequence of vocabulary words.

    :returns: without ``vocab``, the list of tokens that are longer than two
        characters and not in ``stops``. With ``vocab``, a ``defaultdict``
        that maps the position of every in-vocabulary token (as a string) to
        the number of times it occurs.

    """
    tokens = [t.lower().strip(punct) for t in txt.split()]
    if vocab is None:
        return [t for t in tokens if t not in stops and len(t) > 2]

    # Build the word -> position table once instead of scanning the list twice
    # for every token.  ``setdefault`` keeps the first position of a repeated
    # word, which is what ``list.index`` returns.
    positions = {}
    for i, word in enumerate(vocab):
        positions.setdefault(word, i)

    d = defaultdict(int)
    for t in tokens:
        i = positions.get(t)
        if i is not None:
            # Keys stay text, formatted in a way that works on Python 2 and 3.
            d[u"{0}".format(i)] += 1
    return d
22 | 


--------------------------------------------------------------------------------
/arxiv/vocab.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | __all__ = [u"build_vocab", u"get_vocab"]
 4 | 
 5 | import sys
 6 | from multiprocessing import Pool
 7 | 
 8 | from .db_utils import db, rdb
 9 | from .text_utils import tokenize_document
10 | 
11 | 
def process_one(doc):
    """
    Count the words of one document into the ``vocab`` sorted set in Redis.

    Documents without a title are ignored. A dot is written to stdout for
    documents whose ``random`` value exceeds 0.99.

    """
    title = doc.get(u"title", None)
    if title is None:
        return

    show_progress = doc.get(u"random", 0.0) > 0.99
    if show_progress:
        sys.stdout.write(u".")
        sys.stdout.flush()

    words = tokenize_document(title + u" " + doc[u"abstract"])

    # Queue one increment per token and send them all in a single round trip.
    batch = rdb.pipeline()
    for word in words:
        batch.zincrby(u"vocab", word, 1)
    batch.execute()
26 | 
27 | 
def build_vocab():
    """
    Rebuild the word counts in Redis from every document in ``db.abstracts``.

    NOTE: this starts with ``rdb.flushall()``, which clears the Redis server
    before counting.

    """
    rdb.flushall()
    coll = db.abstracts

    print(u"Fetching a list of documents from mongo...")
    docs = list(coll.find({}, {u"title": 1, u"abstract": 1, u"random": 1}))

    print(u"Processing. This will take a while. Watch the grass grow...")
    pool = Pool()
    try:
        pool.map(process_one, docs)
    finally:
        # Shut the workers down even when one of them raised.
        pool.close()
        pool.join()
38 | 
39 | 
def get_vocab(initial=100, N=5000):
    """
    Print ``N`` words from the ``vocab`` sorted set, most frequent first,
    after skipping the ``initial`` most frequent ones.

    :param initial: number of top-ranked words to skip.
    :param N: number of words to print.

    """
    if N <= 0:
        # Also avoids a stop index of -1, which would mean "to the end".
        return
    # Both range bounds are inclusive, so the last index is one less than
    # ``initial + N``.
    for w in rdb.zrevrange(u"vocab", initial, initial + N - 1):
        print(w)
43 | 


--------------------------------------------------------------------------------
/twitter/.gitignore:
--------------------------------------------------------------------------------
1 | tweets
2 | 


--------------------------------------------------------------------------------
/twitter/twitter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | from __future__ import print_function
 5 | 
 6 | import os
 7 | import time
 8 | import json
 9 | import requests
10 | from requests.auth import OAuth1
11 | 
12 | 
# Endpoint that ``monitor`` posts the filter request to.
url = u"https://stream.twitter.com/1/statuses/filter.json"

# The OAuth credentials are read from the environment at import time; a
# missing variable raises ``KeyError``.
e = os.environ
client_key = e[u"TW_CLIENT_KEY"]
client_secret = e[u"TW_CLIENT_SECRET"]
user_key = e[u"TW_USER_KEY"]
user_secret = e[u"TW_USER_SECRET"]
20 | 
21 | 
def monitor(kw):
    """
    Follow the streaming endpoint for the keywords ``kw`` and save every
    received status as ``tweets/<id_str>.json``.

    Reconnects in a loop, waiting between attempts according to the last
    failure, until interrupted from the keyboard.

    :param kw: value sent as the ``track`` parameter.

    """
    wait = 0
    auth = OAuth1(client_key, client_secret, user_key, user_secret)
    while 1:
        try:
            try:
                r = requests.post(url, data={"track": kw}, auth=auth,
                                  prefetch=False, timeout=90)
            except requests.exceptions.ConnectionError:
                print("request failed.")
                wait = min(wait + 0.25, 16)
            else:
                code = r.status_code
                print("{0} returned: {1}".format(url, code))
                if code == 200:
                    wait = 0
                    try:
                        for line in r.iter_lines():
                            if not line:
                                continue
                            tweet = json.loads(line)
                            # A message without an ``id_str`` cannot be named
                            # on disk; skip it instead of letting the lookup
                            # error end the whole stream.
                            if not isinstance(tweet, dict) \
                                    or "id_str" not in tweet:
                                print("skipping message without id_str.")
                                continue
                            fn = "tweets/{0}.json".format(tweet["id_str"])
                            with open(fn, "w") as f:
                                f.write(line)
                    except requests.exceptions.Timeout:
                        print("request timed out.")
                    except Exception as err:
                        print("failed with {0}".format(err))
                elif code == 420:
                    # Start at a minute and double on every repeat.
                    if wait == 0:
                        wait = 60
                    else:
                        wait *= 2
                elif code in [401, 403, 404, 500]:
                    # Start at five seconds and double up to 320 seconds.
                    if wait == 0:
                        wait = 5
                    else:
                        wait = min(wait * 2, 320)
                else:
                    r.raise_for_status()

            # The pause is inside the guard so that an interrupt while
            # waiting also exits cleanly.
            time.sleep(wait)
        except KeyboardInterrupt:
            print("Exiting.")
            break
66 | 
67 | 
if __name__ == "__main__":
    import sys
    # Track the comma-joined command-line arguments, or "arxiv" when there
    # are none.
    keywords = sys.argv[1:]
    kw = u",".join(keywords) if keywords else u"arxiv"
    monitor(kw)
75 | 


--------------------------------------------------------------------------------