├── README.md
└── hacker_news_getallstories.py

/README.md:
--------------------------------------------------------------------------------
hacker-news-download-all-stories
================================

Download *ALL* the submissions from Hacker News using the Algolia HN Search API.
Writes the results to `hacker_news_stories.csv`.

Requires Python 2.7, pandas, and pytz.
--------------------------------------------------------------------------------
/hacker_news_getallstories.py:
--------------------------------------------------------------------------------
import urllib2
import json
import datetime
import time
import pytz
import pandas as pd
from pandas import DataFrame

# Start at the current time and page backward through all stories.
ts = str(int(time.time()))
df = DataFrame()
hitsPerPage = 1000
requested_keys = ["title", "url", "points", "num_comments", "author", "created_at_i", "objectID"]

i = 0

while True:
    try:
        # Fetch the newest stories created strictly before the cutoff `ts`.
        url = 'https://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=%s&numericFilters=created_at_i<%s' % (hitsPerPage, ts)
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        data = json.loads(response.read())

        # nbHits counts every story older than the cutoff, so once it drops
        # below a full page this batch is the last one.
        last = data["nbHits"] < hitsPerPage

        data = DataFrame(data["hits"])[requested_keys]
        df = df.append(data, ignore_index=True)

        # Move the cutoff back to the oldest story seen in this batch.
        ts = data.created_at_i.min()

        print i
        if last:
            break
        time.sleep(3.6)  # throttle between requests
        i += 1

    except Exception as e:
        print e
        time.sleep(3.6)  # back off briefly before retrying

# Strip curly quotes, dashes, ellipses, and primes from titles, then drop commas.
df["title"] = df["title"].map(lambda x: x.translate(dict.fromkeys([0x201c, 0x201d, 0x2011, 0x2013, 0x2014, 0x2018, 0x2019, 0x2026, 0x2032])).encode('utf-8').replace(',', ''))

# Convert the Unix timestamp into a human-readable US/Eastern datetime string.
df["created_at"] = df["created_at_i"].map(lambda x: datetime.datetime.fromtimestamp(int(x), tz=pytz.timezone('America/New_York')).strftime('%Y-%m-%d %H:%M:%S'))

ordered_df = df[["title", "url", "points", "num_comments", "author", "created_at", "objectID"]]

ordered_df.to_csv("hacker_news_stories.csv", encoding='utf-8', index=False)
--------------------------------------------------------------------------------
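
The script above is written for Python 2.7 (`urllib2`, `print` statements), which has reached end of life. For reference, below is a minimal Python 3 sketch of the same backward-pagination pattern against the Algolia `search_by_date` endpoint, using only the standard library. It is an illustration, not part of the repository: the `fetch_all_stories` function name, the `hacker_news_stories_py3.csv` output name, and the use of `csv.DictWriter` in place of pandas are assumptions made for this sketch.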
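
```python
# Hypothetical Python 3 port of the pagination loop above; standard library only.
import csv
import json
import time
import urllib.request

HITS_PER_PAGE = 1000
FIELDS = ["title", "url", "points", "num_comments", "author", "created_at_i", "objectID"]

def fetch_all_stories(out_path="hacker_news_stories_py3.csv"):
    """Walk backward through HN stories, writing each page straight to CSV."""
    ts = int(time.time())
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS, extrasaction="ignore")
        writer.writeheader()
        while True:
            url = ("https://hn.algolia.com/api/v1/search_by_date"
                   "?tags=story&hitsPerPage=%d&numericFilters=created_at_i<%d"
                   % (HITS_PER_PAGE, ts))
            with urllib.request.urlopen(url) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            hits = data["hits"]
            if not hits:
                break
            # extrasaction="ignore" drops the extra fields Algolia returns.
            writer.writerows(hits)
            # The next request asks only for stories older than the oldest seen so far.
            ts = min(h["created_at_i"] for h in hits)
            if data["nbHits"] < HITS_PER_PAGE:
                break  # fewer than a full page remained, so this was the last one
            time.sleep(3.6)  # same throttle as the original script

if __name__ == "__main__":
    fetch_all_stories()
```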
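
Note the design choice both versions share: rather than stepping through Algolia's `page` parameter, each request asks for the first page of stories older than the oldest one already seen. Because the `created_at_i` cutoff keeps moving backward, no single request ever paginates deeply, so the crawl can walk back through the entire story history.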