├── .gitignore ├── README.md ├── analysis.py ├── arxiv ├── __init__.py ├── db_utils.py ├── lda.py ├── parse.py ├── scrape.py ├── stops.txt ├── text_utils.py └── vocab.py └── twitter ├── .gitignore └── twitter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.xml 2 | .DS_Store 3 | *.pyc 4 | records 5 | venv 6 | settings.sh 7 | lambda*.txt 8 | models 9 | vocab.txt 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ArXiv analysis 2 | 3 | Run [online variational LDA](http://arxiv.org/abs/1206.7051v1) on all the 4 | abstracts from the arXiv. The implementation is based on [Matt Hoffman's 5 | GPL licensed code](http://www.cs.princeton.edu/~mdhoffma/). 6 | 7 | ## Usage 8 | 9 | You'll need a [`mongod`](http://www.mongodb.org/) instance running on 10 | the port given by the environment variable `MONGO_PORT` and a 11 | [`redis-server`](http://redis.io/) instance running on the port given by 12 | the `REDIS_PORT` environment variable. 13 | 14 | The code depends on the Python packages: `numpy`, `scipy`, `requests`, 15 | `pymongo` and `redis`. 16 | 17 | * `mkdir abstracts` 18 | * `./analysis.py scrape abstracts` — scrapes all the metadata from the arXiv 19 | [OAI interface](http://arxiv.org/help/oa/index) and saves the raw XML 20 | responses as `abstracts/raw-*.xml`. This takes a _long time_ because of 21 | the arXiv's flow control policies. It took me approximately 6 hours. 22 | * `./analysis.py parse abstracts/raw-*.xml` — parses the raw responses and 23 | saves the abstracts to a MongoDB database called `arxiv` in the collection 24 | called `abstracts`. 25 | * `./analysis.py build-vocab` — counts all the words in the corpus removing 26 | anything with less than 3 characters and removing any stop words. 27 | * `./analysis.py get-vocab 100 5000 > vocab.txt` — lists the vocabulary 28 | skipping the first 100 most popular words and keeping 5000 words total. 29 | * `./analysis.py run vocab.txt` — runs online variational LDA by randomly 30 | selecting articles from the database. The topic distributions are stored 31 | in the `lambda-*.txt` files. This will run forever so just kill it whenever 32 | you feel like it. 33 | * `./analysis.py vocab.txt lambda-100.txt` — list the topics and their most 34 | common words at step 100. 35 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import numpy as np 8 | 9 | try: 10 | import arxiv 11 | arxiv = arxiv 12 | except ImportError: 13 | sys.path.append(os.path.join(os.path.abspath(__file__), u"..")) 14 | import arxiv 15 | arxiv = arxiv 16 | 17 | from arxiv.db_utils import db 18 | 19 | 20 | if __name__ == "__main__": 21 | cmd = sys.argv[1] 22 | 23 | if cmd == u"scrape": 24 | print(u"Scraping all the meta-data from the arxiv...") 25 | arxiv.get() 26 | 27 | if cmd == u"parse": 28 | print(u"Parsing the XML...") 29 | arxiv.parse() 30 | 31 | if cmd == u"build-vocab": 32 | print(u"Building the vocabulary list...") 33 | arxiv.build_vocab() 34 | 35 | if cmd == u"get-vocab": 36 | initial, N = 1000, 5000 37 | if len(sys.argv) >= 3: 38 | initial = int(sys.argv[2]) 39 | elif len(sys.argv) >= 4: 40 | N = int(sys.argv[3]) 41 | 42 | arxiv.get_vocab(initial=initial, N=N) 43 | 44 | if cmd in [u"run", u"results"]: 45 | fn = sys.argv[2] 46 | vocab = [l.strip() for l in open(fn)] 47 | 48 | if cmd == u"run": 49 | print(u"Running online LDA...") 50 | coll = db.abstracts 51 | coll.ensure_index(u"random") 52 | 53 | batch_size = 128 54 | ndocs = coll.count() 55 | ntopics = 100 56 | 57 | lda = arxiv.LDA(vocab, ntopics, ndocs, 1.0 / ntopics, 1.0 / ntopics, 58 | 1025.0, 0.8) 59 | 60 | iteration = 0 61 | while 1: 62 | docs = [] 63 | ind = np.random.randint(ndocs - batch_size) 64 | cursor = coll.find({}, {u"abstract": 1, u"title": 1}) \ 65 | .sort(u"random").skip(ind) 66 | for i, d in enumerate(cursor): 67 | if i >= batch_size: 68 | break 69 | if u"title" in d: 70 | docs.append(d[u"title"] + u" " + d[u"abstract"]) 71 | gamma, lam, bound = lda.update(docs) 72 | print(iteration, ind, np.exp(-bound)) 73 | 74 | if iteration % 10 == 0: 75 | np.savetxt(u"lambda-{0}.txt".format(iteration), lam) 76 | 77 | iteration += 1 78 | 79 | if cmd == u"results": 80 | print(u"Displaying results...") 81 | fn = sys.argv[3] 82 | lam = np.loadtxt(fn) 83 | 84 | for i, l in enumerate(lam.T): 85 | l /= np.sum(l) 86 | tmp = sorted(zip(l, range(len(l))), key=lambda x: x[0], 87 | reverse=True) 88 | print(u"Topic {0}: ".format(i) + 89 | u", ".join([u"{0} ({1:.1f})".format(vocab[t[1]], 100 * t[0]) 90 | for t in tmp[:10]])) 91 | -------------------------------------------------------------------------------- /arxiv/__init__.py: -------------------------------------------------------------------------------- 1 | from scrape import * 2 | from parse import * 3 | from vocab import * 4 | from lda import * 5 | -------------------------------------------------------------------------------- /arxiv/db_utils.py: -------------------------------------------------------------------------------- 1 | __all__ = [u"db", u"rdb"] 2 | 3 | import os 4 | import pymongo 5 | import redis 6 | 7 | 8 | server = os.environ.get(u"MONGO_SERVER", u"localhost") 9 | port = int(os.environ.get(u"MONGO_PORT", 27017)) 10 | redis_server = os.environ.get(u"REDIS_SERVER", u"localhost") 11 | redis_port = int(os.environ.get(u"REDIS_PORT", 27019)) 12 | 13 | db = pymongo.Connection(server, port).arxiv 14 | rdb = redis.Redis(host=redis_server, port=redis_port) 15 | -------------------------------------------------------------------------------- /arxiv/lda.py: -------------------------------------------------------------------------------- 1 | __all__ = [u"LDA"] 2 | 3 | import numpy as np 4 | from scipy.special import gammaln, psi 5 | 6 | from .text_utils import tokenize_document 7 | 8 | 9 | def dir_expect(alpha): 10 | if len(alpha.shape) == 1: 11 | return psi(alpha) - psi(np.sum(alpha)) 12 | return psi(alpha) - psi(np.sum(alpha, axis=-1))[:, None] 13 | 14 | 15 | class LDA: 16 | 17 | def __init__(self, vocab, ntopics, ndocs, alpha, eta, delay, rate): 18 | self.vocab = list(vocab) 19 | self.K = ntopics 20 | self.W = len(self.vocab) 21 | self.D = ndocs 22 | self.alpha = alpha 23 | self.eta = eta 24 | self.tau = delay 25 | self.kappa = rate 26 | 27 | self.lam = np.random.gamma(100.0, 0.01, (self.W, self.K)) 28 | self.Elogbeta = dir_expect(self.lam) 29 | self.expElogbeta = np.exp(self.Elogbeta) 30 | 31 | self.tstep = 0 32 | 33 | def _expectation(self, docs, maxiter=100, tol=0.0001, eps=1e-100): 34 | docs = [tokenize_document(d, vocab=self.vocab) for d in docs] 35 | size = len(docs) 36 | gamma = np.random.gamma(100.0, 0.01, (size, self.K)) 37 | expElogth = np.exp(dir_expect(gamma)) 38 | stats = np.zeros_like(self.lam) 39 | 40 | for i, doc in enumerate(docs): 41 | try: 42 | word_ids, word_counts = zip(*[w for w in doc.iteritems()]) 43 | except ValueError: 44 | continue 45 | word_ids = np.array(word_ids, dtype=int) 46 | word_counts = np.array(word_counts, dtype=int) 47 | gamma_d = gamma[i] 48 | expElogth_d = expElogth[i] 49 | expElogbeta_d = self.expElogbeta[word_ids] 50 | norm = np.dot(expElogth_d, expElogbeta_d.T) + eps 51 | for j in range(maxiter): 52 | gamma0 = gamma 53 | gamma_d = self.alpha + expElogth_d * np.dot( 54 | word_counts / norm, expElogbeta_d) 55 | expElogth_d = np.exp(dir_expect(gamma_d)) 56 | norm = np.dot(expElogbeta_d, expElogth_d) + eps 57 | 58 | delta = np.mean(np.abs(gamma_d - gamma0)) 59 | if delta < tol: 60 | break 61 | gamma[i] = gamma_d 62 | stats[word_ids] += np.outer(word_counts / norm, expElogth_d) 63 | 64 | stats *= self.expElogbeta 65 | 66 | return gamma, stats, docs 67 | 68 | def update(self, docs, **kwargs): 69 | rho = (self.tau + self.tstep) ** -self.kappa 70 | gamma, stats, docs = self._expectation(docs, **kwargs) 71 | bound = self.approx_bound(docs, gamma, preprocess=False) 72 | 73 | # Update lambda. 74 | self.lam = self.lam * (1.0 - rho) + rho * (self.eta 75 | + self.D * stats / len(docs)) 76 | self.Elogbeta = dir_expect(self.lam) 77 | self.expElogbeta = np.exp(self.Elogbeta) 78 | self.tstep += 1 79 | 80 | return gamma, self.lam, bound 81 | 82 | def approx_bound(self, docs, gamma, preprocess=True): 83 | if preprocess: 84 | docs = [tokenize_document(d, vocab=self.vocab) for d in docs] 85 | 86 | Elogth = dir_expect(gamma) 87 | 88 | score = 0.0 89 | fullnorm = 0.0 90 | 91 | for i, doc in enumerate(docs): 92 | try: 93 | word_ids, word_counts = zip(*[w for w in doc.iteritems()]) 94 | except ValueError: 95 | continue 96 | word_ids = np.array(word_ids, dtype=int) 97 | word_counts = np.array(word_counts, dtype=int) 98 | norm = np.zeros(len(word_ids)) 99 | for j in range(len(word_ids)): 100 | tmp = Elogth[i] + self.Elogbeta[word_ids[j]] 101 | tmax = np.max(tmp) 102 | norm[j] = np.log(sum(np.exp(tmp - tmax))) + tmax 103 | score += np.sum(word_counts * norm) 104 | fullnorm += np.sum(word_counts) 105 | 106 | score += np.sum((self.alpha - gamma) * Elogth) 107 | score += np.sum(gammaln(gamma) - gammaln(self.alpha)) 108 | score += np.sum(gammaln(self.alpha * self.K) - 109 | gammaln(np.sum(gamma, axis=1))) 110 | 111 | score *= self.D / len(docs) 112 | 113 | score += np.sum((self.eta - self.lam) * self.Elogbeta) 114 | score += np.sum(gammaln(self.lam) - gammaln(self.eta)) 115 | score += np.sum(gammaln(self.eta * self.W) - 116 | gammaln(np.sum(self.lam, axis=1))) 117 | 118 | return score * len(docs) / fullnorm / self.D 119 | -------------------------------------------------------------------------------- /arxiv/parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | __all__ = [u"parse"] 4 | 5 | import os 6 | import re 7 | import random 8 | from datetime import datetime 9 | import xml.etree.cElementTree as ET 10 | from multiprocessing import Pool 11 | 12 | from .db_utils import db 13 | 14 | 15 | record_tag = u".//{http://www.openarchives.org/OAI/2.0/}record" 16 | ns_re = re.compile(r"\{(?:.*?)\}(.*)") 17 | date_fmt = u"%a, %d %b %Y %H:%M:%S %Z" 18 | 19 | comma_and = r"(?:,* and )|(?:,\s*)" 20 | ca_re = re.compile(comma_and) 21 | au_re = re.compile(r"(.+?)(?:" + comma_and + "|(?:\s*$))") 22 | 23 | affil_re = re.compile(r"(.*?)(?:\((.*)\)|$)") 24 | affils_re = re.compile(r"\(([0-9]+)\) (.*?)(?=(?:,*\s*\()|\))") 25 | 26 | 27 | server = os.environ.get(u"MONGO_SERVER", "localhost") 28 | port = int(os.environ.get(u"MONGO_PORT", 27017)) 29 | 30 | 31 | def parse_one(f): 32 | print(u"Starting: {0}".format(f)) 33 | coll = db.abstracts 34 | 35 | tree = ET.parse(f) 36 | root = tree.getroot() 37 | for i, r in enumerate(root.findall(record_tag)): 38 | doc = {} 39 | for el in r.iter(): 40 | txt = el.text 41 | if txt is None: 42 | for k, v in el.attrib.iteritems(): 43 | doc[unicode(k.lower())] = unicode(v) 44 | elif txt.strip() != u"": 45 | k = unicode(ns_re.search(el.tag).groups()[0].lower()) 46 | txt = unicode(txt.strip()) 47 | 48 | if k == u"date": 49 | txt = datetime.strptime(txt, date_fmt) 50 | elif k == u"categories": 51 | txt = [c.strip() for c in txt.split()] 52 | elif k == u"authors": 53 | spl = txt.replace(u"\n", u"").split(u"((") 54 | if len(spl) > 1: 55 | if len(spl) > 2: 56 | spl = [spl[0], u"(".join(spl[1:])] 57 | authors, affils = spl 58 | affils = dict(affils_re.findall(u"(" + affils)) 59 | else: 60 | authors, affils = txt, {} 61 | authors = [affil_re.findall(a.strip())[0] 62 | for a in au_re.findall(authors)] 63 | doc[u"authors"] = [] 64 | for a in authors: 65 | if len(a[1]) > 0 and a[1][0] in u"1234567890": 66 | doc[u"authors"].append({u"name": a[0], 67 | u"affil": ", ".join([affils.get(af.strip(), 68 | af.strip()) 69 | for af in ca_re.split(a[1])])}) 70 | else: 71 | doc[u"authors"].append({u"name": a[0], 72 | u"affil": a[1].strip()}) 73 | 74 | k = u"authors_raw" 75 | 76 | doc[k] = txt 77 | 78 | # Add a random number for selecting random documents later. 79 | doc[u"random"] = random.random() 80 | coll.insert(doc) 81 | 82 | print(u"Finished {0}".format(f)) 83 | 84 | 85 | def parse(fns): 86 | p = Pool() 87 | p.map(parse_one, list(fns)) 88 | 89 | 90 | if __name__ == "__main__": 91 | import sys 92 | parse(sys.argv[1:]) 93 | -------------------------------------------------------------------------------- /arxiv/scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | __all__ = [u"get"] 6 | 7 | import os 8 | import re 9 | import time 10 | 11 | import requests 12 | 13 | 14 | resume_re = re.compile(r".*(.*?).*") 15 | url = "http://export.arxiv.org/oai2" 16 | 17 | 18 | def get(basepath=u".", max_tries=10): 19 | """ 20 | Get all the listings from the ArXiv. 21 | 22 | """ 23 | req = {u"verb": "ListRecords", 24 | u"metadataPrefix": u"arXivRaw"} 25 | 26 | failures = 0 27 | count = 0 28 | while True: 29 | # Send the request. 30 | r = requests.post(url, data=req) 31 | 32 | # Handle the response. 33 | code = r.status_code 34 | 35 | if code == 503: 36 | # Asked to retry 37 | to = int(r.headers["retry-after"]) 38 | print(u"Got 503. Retrying after {0:d} seconds.".format(to)) 39 | 40 | time.sleep(to) 41 | failures += 1 42 | if failures >= max_tries: 43 | print(u"Failed too many times...") 44 | break 45 | 46 | elif code == 200: 47 | failures = 0 48 | 49 | # Write to file. 50 | content = r.text 51 | count += 1 52 | fn = os.path.join(basepath, u"raw-{0:08d}.xml".format(count)) 53 | print(u"Writing to: {0}".format(fn)) 54 | with open(fn, u"w") as f: 55 | f.write(content) 56 | 57 | # Look for a resumption token. 58 | token = resume_re.search(content) 59 | if token is None: 60 | break 61 | token = token.groups()[0] 62 | 63 | # If there isn't one, we're all done. 64 | if token == "": 65 | print(u"All done.") 66 | break 67 | 68 | print(u"Resumption token: {0}.".format(token)) 69 | 70 | # If there is a resumption token, rebuild the request. 71 | req = {u"verb": u"ListRecords", 72 | u"resumptionToken": token} 73 | 74 | # Pause so as not to get banned. 75 | to = 20 76 | print(u"Sleeping for {0:d} seconds so as not to get banned." 77 | .format(to)) 78 | time.sleep(to) 79 | 80 | else: 81 | # Wha happen'? 82 | r.raise_for_status() 83 | 84 | 85 | if __name__ == "__main__": 86 | import sys 87 | 88 | if len(sys.argv) == 1: 89 | bp = u"." 90 | else: 91 | bp = sys.argv[1] 92 | 93 | get(basepath=bp) 94 | -------------------------------------------------------------------------------- /arxiv/stops.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 128 | . 129 | , 130 | ? 131 | ! 132 | [ 133 | ] 134 | { 135 | } 136 | ( 137 | ) 138 | ; 139 | : 140 | " 141 | ' 142 | = 143 | $ 144 | < 145 | > 146 | \ 147 | / 148 | | 149 | % 150 | ^ 151 | & 152 | # 153 | -------------------------------------------------------------------------------- /arxiv/text_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | 4 | 5 | punct = u".,?![]{}();:\"'=$<>\\/|%^&#`" 6 | 7 | # Load in the list of stop words. 8 | stopfn = os.path.join(os.path.dirname(os.path.abspath(__file__)), u"stops.txt") 9 | stops = [line.strip() for line in open(stopfn)] 10 | 11 | 12 | def tokenize_document(txt, vocab=None): 13 | tokens = [t.lower().strip(punct) for t in txt.split()] 14 | if vocab is None: 15 | return [t for t in tokens if t not in stops and len(t) > 2] 16 | 17 | d = defaultdict(int) 18 | for t in tokens: 19 | if t in vocab: 20 | d[unicode(vocab.index(t))] += 1 21 | return d 22 | -------------------------------------------------------------------------------- /arxiv/vocab.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | __all__ = [u"build_vocab", u"get_vocab"] 4 | 5 | import sys 6 | from multiprocessing import Pool 7 | 8 | from .db_utils import db, rdb 9 | from .text_utils import tokenize_document 10 | 11 | 12 | def process_one(doc): 13 | if doc.get(u"title", None) is None: 14 | return 15 | 16 | if doc.get(u"random", 0.0) > 0.99: 17 | sys.stdout.write(u".") 18 | sys.stdout.flush() 19 | 20 | tokens = tokenize_document(doc[u"title"] + u" " + doc[u"abstract"]) 21 | 22 | pipe = rdb.pipeline() 23 | for t in tokens: 24 | pipe.zincrby(u"vocab", t, 1) 25 | pipe.execute() 26 | 27 | 28 | def build_vocab(): 29 | rdb.flushall() 30 | coll = db.abstracts 31 | 32 | print(u"Fetching a list of documents from mongo...") 33 | docs = list(coll.find({}, {u"title": 1, u"abstract": 1, u"random": 1})) 34 | 35 | print(u"Processing. This will take a while. Watch the grass grow...") 36 | pool = Pool() 37 | pool.map(process_one, docs) 38 | 39 | 40 | def get_vocab(initial=100, N=5000): 41 | for w in rdb.zrevrange(u"vocab", initial, initial + N): 42 | print(w) 43 | -------------------------------------------------------------------------------- /twitter/.gitignore: -------------------------------------------------------------------------------- 1 | tweets 2 | -------------------------------------------------------------------------------- /twitter/twitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import time 8 | import json 9 | import requests 10 | from requests.auth import OAuth1 11 | 12 | 13 | url = u"https://stream.twitter.com/1/statuses/filter.json" 14 | 15 | e = os.environ 16 | client_key = e[u"TW_CLIENT_KEY"] 17 | client_secret = e[u"TW_CLIENT_SECRET"] 18 | user_key = e[u"TW_USER_KEY"] 19 | user_secret = e[u"TW_USER_SECRET"] 20 | 21 | 22 | def monitor(kw): 23 | wait = 0 24 | auth = OAuth1(client_key, client_secret, user_key, user_secret) 25 | while 1: 26 | try: 27 | try: 28 | r = requests.post(url, data={"track": kw}, auth=auth, 29 | prefetch=False, timeout=90) 30 | except requests.exceptions.ConnectionError: 31 | print("request failed.") 32 | wait = min(wait + 0.25, 16) 33 | else: 34 | code = r.status_code 35 | print("{0} returned: {1}".format(url, code)) 36 | if code == 200: 37 | wait = 0 38 | try: 39 | for line in r.iter_lines(): 40 | if line: 41 | tweet = json.loads(line) 42 | fn = "tweets/{0}.json".format(tweet["id_str"]) 43 | with open(fn, "w") as f: 44 | f.write(line) 45 | except requests.exceptions.Timeout: 46 | print("request timed out.") 47 | except Exception as e: 48 | print("failed with {0}".format(e)) 49 | elif code == 420: 50 | if wait == 0: 51 | wait = 60 52 | else: 53 | wait *= 2 54 | elif code in [401, 403, 404, 500]: 55 | if wait == 0: 56 | wait = 5 57 | else: 58 | wait = min(wait * 2, 320) 59 | else: 60 | r.raise_for_status() 61 | except KeyboardInterrupt: 62 | print("Exiting.") 63 | break 64 | 65 | time.sleep(wait) 66 | 67 | 68 | if __name__ == "__main__": 69 | import sys 70 | if len(sys.argv) > 1: 71 | kw = u",".join(sys.argv[1:]) 72 | else: 73 | kw = u"arxiv" 74 | monitor(kw) 75 | --------------------------------------------------------------------------------