├── .gitignore
└── miner.py

/.gitignore:
--------------------------------------------------------------------------------

*.html

*.json

--------------------------------------------------------------------------------
/miner.py:
--------------------------------------------------------------------------------
import urllib2, cookielib
from bs4 import BeautifulSoup
from hn import HN
from datetime import datetime
import json
import os
import time


def download_story(story_id, url, date_code, sleep_time=1):
    """Fetch a story's page and cache it as ./stories/<date_code>/<story_id>.html."""
    stories_folder_path = './stories/' + date_code
    try:
        os.makedirs(stories_folder_path)
    except OSError:
        pass  # directory already exists
    story_file_path = stories_folder_path + '/' + str(story_id) + '.html'
    if not os.path.exists(story_file_path):
        print "Downloading story " + str(story_id) + " from url " + url
        try:
            # Use a cookie-aware opener with a browser-like User-Agent so that
            # sites which reject the default urllib2 agent still respond.
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25')]
            story_response = opener.open(url, timeout=5)
            story_text = story_response.read()
            story_response.close()
            story_file = open(story_file_path, 'w')
            story_file.write(story_text)
            story_file.close()
        except Exception, err:
            # Back off exponentially and retry; give up once the delay reaches 4 seconds.
            print err
            print "Sleeping for " + str(sleep_time) + " seconds."
            time.sleep(sleep_time)
            if sleep_time >= 4:
                return
            download_story(story_id, url, date_code, sleep_time=sleep_time * 2)


def update_via_hn(sleep_time=1):
    """Log metadata for the current front-page and newest stories, then cache each story's page."""
    current_time = datetime.now()
    date_code = current_time.strftime('%Y%m%d')
    data_file_path = './story_metadata/story_log_' + date_code + '.json'
    data_file = open(data_file_path, 'a')
    hackernews = HN()

    try:
        print "Getting front page stories..."
        front_page_stories = hackernews.get_stories()
        print "Getting new stories..."
        new_stories = hackernews.get_stories(story_type='newest')
    except Exception, err:
        # Retry with exponential backoff; give up once the delay exceeds 8 seconds.
        data_file.close()
        if sleep_time <= 8:
            print err
            time.sleep(sleep_time)
            update_via_hn(sleep_time=sleep_time * 2)
        return

    print "Logging story statuses"
    for story in front_page_stories:
        story['is_front_page'] = True
        story['timestamp'] = str(current_time)
        data_file.write(json.dumps(story) + '\n')
        download_story(story['story_id'], story['link'], date_code)
    for story in new_stories:
        story['is_front_page'] = False
        story['timestamp'] = str(current_time)
        data_file.write(json.dumps(story) + '\n')
        download_story(story['story_id'], story['link'], date_code)
    data_file.close()


def begin_regular_updates(minutes_interval=1):
    """Poll Hacker News every minutes_interval minutes, indefinitely."""
    try:
        os.makedirs('./stories')
    except OSError:
        print "Stories directory already exists"

    try:
        os.makedirs('./story_metadata')
    except OSError:
        print "Story metadata directory already exists"

    while True:
        print "Refreshing HN data..."
        update_via_hn()
        print "Sleeping for " + str(minutes_interval) + " minutes."
        time.sleep(minutes_interval * 60)


begin_regular_updates()
--------------------------------------------------------------------------------
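Usage note: each call to update_via_hn() appends one JSON object per line to ./story_metadata/story_log_YYYYMMDD.json, so a day's log can be read back as JSON Lines. Below is a minimal sketch of loading one such file; the date code in the filename is only an example, and the summary at the end is illustrative rather than part of this repository.

import json

# Read one day's metadata log (JSON Lines: one story dict per line).
# The date code below is an example; substitute the day you collected.
records = []
with open('./story_metadata/story_log_20240101.json') as log_file:
    for line in log_file:
        line = line.strip()
        if line:
            records.append(json.loads(line))

front_page = [r for r in records if r.get('is_front_page')]
print('%d records, %d from the front page' % (len(records), len(front_page)))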