├── requirements.txt
├── templates
│   ├── sub_date.html
│   ├── subs_page.html
│   ├── sub_dates_list.html
│   └── post_template.html
├── README.md
├── static
│   └── style.css
├── Flask-Host.py
└── Post-Archiver.py
/requirements.txt:
--------------------------------------------------------------------------------
Flask==0.12.2
natsort==5.1.0
requests==2.18.4
--------------------------------------------------------------------------------
/templates/sub_date.html:
--------------------------------------------------------------------------------
<html>
<head>
<link rel="stylesheet" type="text/css" href="/static/style.css">
<title>/r/{{sub}}: {{date}}</title>
</head>
<body>
{{posts|safe}}
</body>
</html>
--------------------------------------------------------------------------------
/templates/subs_page.html:
--------------------------------------------------------------------------------
<html>
<head>
<link rel="stylesheet" type="text/css" href="/static/style.css">
<title>Archived Subreddits</title>
</head>
<body>
<h2>Archived Subreddits</h2>
{{subs|safe}}
</body>
</html>
--------------------------------------------------------------------------------
/templates/sub_dates_list.html:
--------------------------------------------------------------------------------
<html>
<head>
<link rel="stylesheet" type="text/css" href="/static/style.css">
<title>/r/{{sub}} Archives</title>
</head>
<body>
<h2>/r/{{sub}} Archives</h2>
{{dates|safe}}
</body>
</html>
--------------------------------------------------------------------------------
/templates/post_template.html:
--------------------------------------------------------------------------------
<html>
<head>
<meta charset="utf-8">
<link rel="stylesheet" type="text/css" href="/static/style.css">
<title>{{title}}</title>
</head>
<body>

<h2 class="ptitle">{{post_title}}</h2>

<div class="mainpost">
{{post_body|safe}}
{{image|safe}}
</div>

<br>

<div>
{{comments|safe}}
</div>

</body>
</html>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Reddit-Archive-Host

A set of tools for downloading and locally hosting text-based reddit content.

## Usage

#### Downloading posts:

`python3 Post-Archiver.py [Subreddit] [Time] [Number of posts]`

`Subreddit` is the name of the subreddit you want to archive.

`Time` is the time range to archive: `day`, `week`, `month`, `year`, or `all`.

`Number of posts` is how many posts you want to archive. The maximum (due to reddit's JSON API limitations) is 1000, and anything above 100 must be a multiple of 100 (200, 300, and so on).

#### Hosting downloaded posts:

`python3 Flask-Host.py`

This starts a local Flask server so you can browse your downloaded subreddits in your browser. Just navigate to the URL that Flask prints on the command line.
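#### Example

The subreddit name here is only an illustration; substitute whichever subreddit you want to archive. This grabs that subreddit's top 300 posts of the past week, then serves the archive locally:

`python3 Post-Archiver.py AskReddit week 300`

`python3 Flask-Host.py`

With Flask's default settings the archive will be reachable at `http://127.0.0.1:5000/`.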
## Other

[What browsing looks like](https://giant.gfycat.com/AmazingWellmadeAiredale.webm)

Required modules are: `requests`, `natsort`, and `flask`.

You can install them with `pip3 install -U -r requirements.txt`.
--------------------------------------------------------------------------------
/static/style.css:
--------------------------------------------------------------------------------
.acomment{
    color: #e2e2e2;
    background-color: #333;
    padding-left: 10px;
    width: 1000px;
    padding-right: 10px;
    min-height: 10em;
    display: table-cell;
    vertical-align: middle;
    box-shadow: 5px 5px 10px #000;
}
/*
.bcomment{
    color: #e2e2e2;
    background-color: #333;
    padding-left: 10px;
    padding-top: 5px;
    padding-bottom: 5px;
    margin-left: 50px;
    width: 1000px;
    padding-right: 10px;
    box-shadow: 5px 5px 10px #000;
}

.bcommentinfo{
    margin-left: 50px;
}
*/
.postinfo{
    color: #e2e2e2;
    margin: auto;
    background-color: #333;
    padding-left: 10px;
    width: 1000px;
    padding-right: 10px;
    box-shadow: 3px 3px 20px #000;
    padding-bottom: 10px;
    padding-top: 5px;
    text-align: center;
}

.next-button {
    position: absolute;
    right: 50px;
    color: #e2e2e2;
    background-color: #333;
    width: 10%;
    padding-left: 5px;
    padding-right: 5px;
    padding-top: 3px;
    padding-bottom: 5px;
    box-shadow: 3px 3px 20px #000;
    text-align: center;
    font-size: 130%;
}

.ptitle {
    text-align: center;
}

.mainpost{
    color: #e2e2e2;
    background-color: #333;
    box-shadow: 5px 5px 10px #000;
    padding-left: 10px;
    padding-right: 10px;
    padding-bottom: 10px;
    padding-top: 5px;
    display: block;
    margin: auto;
    width: 95%;
}

body{
    background-color: #212121;
    color: #e2e2e2;
}

a:link {
    color: #add8e6;
    font-weight: bold;
    text-decoration: none;
}

a:visited {
    color: #adbce6;
    font-weight: bold;
}

a:hover {
    color: #fff;
}

h2 {
    text-align: center;
}
--------------------------------------------------------------------------------
/Flask-Host.py:
--------------------------------------------------------------------------------
from flask import Flask, render_template, url_for
from datetime import datetime
from natsort import natsorted
import json, html, os

app = Flask(__name__)
cwd = os.getcwd()

@app.route('/')
def index():
    # List every archived subreddit (each one is a directory under r/).
    subs = []
    for a in os.listdir('r/'):
        if os.path.isdir('r/{}'.format(a)):
            subs.append('<a href="/r/{}">{}</a><br>\n'.format(a, a))
    return render_template('subs_page.html', subs=''.join(subs))

@app.route('/r/<sub>')
def subreddit(sub):
    # List every archived date for this subreddit and how many pages each archive has.
    dates = []
    for a in natsorted(os.listdir('r/{}'.format(sub))):
        pages = 0
        if os.path.isfile('r/{}/{}'.format(sub, a)) and 'r/{}/{}'.format(sub, a).endswith('_1.json'):
            for b in natsorted(os.listdir('r/{}'.format(sub))):
                if b.startswith(a.replace('_1.json', '')):
                    pages += 1
            date = os.path.splitext('r/{}/{}'.format(sub, a))[0].split('/')[-1].replace('_1', '')
            dates.append('\n<a href="{}/1">{}</a> ({} pages)<br>\n'.format('/r/{}/date/{}'.format(sub, date), date, str(pages)))
    return render_template('sub_dates_list.html', sub=sub, dates=''.join(dates))

@app.route('/r/<sub>/date/<date>/<page>')
def sub_date(sub, date, page):
    pages = 0
    for a in natsorted(os.listdir('r/{}/'.format(sub))):
        if a.endswith('.json') and a.startswith(date):
            pages += 1
    remaining_pages = pages - int(page)
    print(remaining_pages)
    json_file = open('r/{}/{}_{}.json'.format(sub, date, page), 'r').read()
    loaded_json = json.loads(json_file)
    posts = []

    for a in loaded_json['data']['children']:
        title = a['data']['title']
        author = a['data']['author']
        score = a['data']['score']
        post_date = datetime.fromtimestamp(a['data']['created_utc'])
        link = '/r/{}/post/{}'.format(sub, a['data']['id'])
        posts.append('''<div class="postinfo">Author: {} | Score: {} | {}<br>
<a href="{}">{}</a></div><br>\n'''.format(author, score, post_date, link, title))
    if remaining_pages > 0:
        posts.append('''<a class="next-button" href="{}">Next page</a>'''.format(str(int(page) + 1)))
    else:
        posts.append('''<div class="next-button">No more pages</div>''')
    return render_template('sub_date.html', sub=sub, date=date, posts=''.join(posts))

@app.route('/r/<sub>/post/<id>')
def thread(sub, id):
    json_file = open('r/{}/posts/{}.json'.format(sub, id), 'r').read()
    loaded_json = json.loads(json_file)

    title = loaded_json[0]['data']['children'][0]['data']['title']
    post_body = loaded_json[0]['data']['children'][0]['data']['selftext_html']
    is_self = loaded_json[0]['data']['children'][0]['data']['is_self']
    if post_body is None:
        post_body = title

    if not is_self:
        post_body = ''

    # Collect any media that Post-Archiver saved for this post under
    # static/images/<sub>/; albums live in a sub-directory named after the post id.
    images = []
    image_dir = 'static/images/{}'.format(sub)
    if os.path.isdir(image_dir):
        for a in os.listdir(image_dir):
            if os.path.splitext(a)[0] == id:
                if os.path.isdir('static/images/{}/{}'.format(sub, a)):
                    for b in natsorted(os.listdir('static/images/{}/{}'.format(sub, a))):
                        if b.endswith(tuple(['.jpg', '.png', '.gif', '.jpeg'])):
                            images.append('<img src="/static/images/{}/{}/{}">'.format(sub, a, b))
                        elif b.endswith(tuple(['.mp4', '.webm'])):
                            images.append('<video controls loop><source src="/static/images/{}/{}/{}"></video>'.format(sub, a, b))
                else:
                    if a.endswith(tuple(['.jpg', '.png', '.gif', '.jpeg'])):
                        images.append('<img src="/static/images/{}/{}">'.format(sub, a))
                    elif a.endswith(tuple(['.mp4', '.webm'])):
                        images.append('<video controls loop><source src="/static/images/{}/{}"></video>'.format(sub, a))

    comments = loaded_json[1]['data']['children']
    comments_list = []
    for a in comments:
        try:
            comment_html = html.unescape(a['data']['body_html'])
            comment_author = a['data']['author']
            comment_score = a['data']['score']
            comments_list.append('|author: {}| |Score: {}|<br><div class="acomment">'.format(comment_author, comment_score) + comment_html + '</div><br><br>')
        except KeyError:
            # "load more comments" stubs have no body_html, so skip them.
            print('KeyError')
            pass

    if not is_self:
        return render_template('post_template.html', title=title, post_title=title, post_body=html.unescape(post_body), image='\n'.join(images), comments='\n'.join(comments_list))
    else:
        return render_template('post_template.html', title=title, post_title=title, post_body=html.unescape(post_body), comments='\n'.join(comments_list))

if __name__ == "__main__":
    app.run()
--------------------------------------------------------------------------------
/Post-Archiver.py:
--------------------------------------------------------------------------------
from datetime import datetime
import requests, json, sys, os, threading, time, re

sub = sys.argv[1]
t = sys.argv[2]
limit = int(sys.argv[3])

# Name the archive after the requested time range; this becomes the filename
# prefix of the saved listing pages (e.g. r/<sub>/2018-01-01_1.json).
if t.lower() == 'week':
    date = 'Week_of_' + datetime.now().strftime('%Y-%m-%d')
elif t.lower() == 'month':
    date = datetime.now().strftime('%Y-%m')
elif t.lower() == 'year':
    date = datetime.now().strftime('%Y')
elif t.lower() == 'all':
    date = 'All_Time'
else:
    date = datetime.now().strftime('%Y-%m-%d')

headers = {'User-Agent': 'Post Archiver'}
threads = []
extensions = ['.jpg', '.png', '.jpeg', '.gif', '.mp4', '.webm', '.gifv']
links = []
finished_links = []

os.makedirs('r/{}/posts'.format(sub), exist_ok=True)

if limit > 100 and limit in [200, 300, 400, 500, 600, 700, 800, 900, 1000]:
    pages = int(limit / 100)
    url1 = 'https://www.reddit.com/r/{}/top.json?sort=top&t={}&limit=100'.format(sub, t)
    sys.stdout.write('\rFetching pages: [1/{}]'.format(str(pages)))
    sys.stdout.flush()
    big_json = [requests.get(url1, headers=headers).text]
    for a in range(1, pages):
        sys.stdout.write('\rFetching pages: [{}/{}]'.format(str(a + 1), str(pages)))
        sys.stdout.flush()
        json1 = requests.get(url1 + '&after={}'.format(json.loads(big_json[-1])['data']['after']), headers=headers).text
        big_json.append(json1)
    sys.stdout.write('\n')
    for a in range(1, len(big_json) + 1):
        with open('r/{}/{}_{}.json'.format(sub, date, str(a)), 'w') as f:
            f.write(big_json[a - 1])

elif limit > 100:
    print('''If you choose to archive more than 100 posts, you must do it in 100-post increments (i.e. 200, 300..., not 250, 375...)
You also can't get more than 1000 posts due to reddit's API limitations''')
    sys.exit()

else:
    url = 'https://www.reddit.com/r/{}/top.json?sort=top&t={}&limit={}'.format(sub, t, str(limit))
    list_json = requests.get(url, headers=headers).text
    loaded_list_json = json.loads(list_json)
    with open('r/{}/{}_1.json'.format(sub, date), 'w') as f:
        f.write(list_json)

def download(url, id):
    # Save the full comment thread for a post as r/<sub>/posts/<id>.json.
    json_file = requests.get(url, headers=headers).text
    with open('r/{}/posts/{}.json'.format(sub, id), 'w') as f:
        f.write(json_file)

def download_image(url, file_name):
    with open('static/images/{}/{}'.format(sub, file_name), 'wb') as file:
        response = requests.get(url, headers=headers)
        file.write(response.content)

current_post = 0
try:
    for a in big_json:
        for b in json.loads(a)['data']['children']:
            current_post += 1
            sys.stdout.write('\rFetching posts: [{}/{}]'.format(str(current_post), str(limit)))
            sys.stdout.flush()
            json_url = 'https://reddit.com' + b['data']['permalink'] + '.json'
            thread_id = b['data']['id']

            t = threading.Thread(target=download, args=(json_url, thread_id,))
            t.start()
            threads.append(t)

            link = b['data']['url'], b['data']['id']
            if 'gfycat' in link[0] or 'imgur' in link[0] or 'i.redd.it' in link[0] or link[0].endswith(tuple(extensions)):
                os.makedirs('static/images/{}'.format(sub), exist_ok=True)
                links.append(link)
            time.sleep(0.02)
    sys.stdout.write('\n')
except NameError:
    # big_json only exists when more than 100 posts were requested;
    # otherwise fall back to the single listing page.
    for a in loaded_list_json['data']['children']:
        current_post += 1
        sys.stdout.write('\rFetching posts: [{}/{}]'.format(str(current_post), str(limit)))
        sys.stdout.flush()
        json_url = 'https://reddit.com' + a['data']['permalink'] + '.json'
        thread_id = a['data']['id']

        t = threading.Thread(target=download, args=(json_url, thread_id,))
        t.start()
        threads.append(t)

        link = a['data']['url'], a['data']['id']
        if 'gfycat' in link[0] or 'imgur' in link[0] or 'i.redd.it' in link[0] or link[0].endswith(tuple(extensions)):
            os.makedirs('static/images/{}'.format(sub), exist_ok=True)
            links.append(link)
        time.sleep(0.02)

current_image_link = 0
for c in links:
    current_image_link += 1
    sys.stdout.write('\rParsing image links: [{}/{}]'.format(str(current_image_link), str(len(links))))
    sys.stdout.flush()
    if "imgur.com" in c[0]:
        if '/a/' in c[0] or '/gallery/' in c[0]:
            finished_links.append(c)

        elif c[0].endswith(tuple(extensions)):
            if c[0].endswith('.gifv'):
                newurl = c[0].replace(".gifv", ".mp4")
                finished_links.append(tuple([newurl, c[1]]))
            else:
                finished_links.append(c)

        else:
            # Bare imgur page: scrape it for the direct i.imgur.com link.
            html_page = requests.get(c[0])
            if html_page.status_code == 404:
                pass
                # print('404: skipping')
            else:
                imgur_id = c[0].split('/')[-1]
                try:
                    link = re.findall(r'(?:href|src)="(?:https?:)?(\/\/i\.imgur\.com\/{}\.\S+?)"'.format(imgur_id), html_page.text)[0]
                    link = 'https:' + link
                    finished_links.append(tuple([link, c[1]]))
                except IndexError:
                    # print('IndexError on link {}'.format(c[0]))
                    fixedlink = c[0].split('?')[0]
                    # print(fixedlink)
                    pass

    elif "i.redd.it" in c[0] or "i.reddituploads.com" in c[0]:
        finished_links.append(c)

    elif "gfycat.com" in c[0] and not c[0].endswith('.webm'):
        gfycat_id = c[0].split('/')[-1]
        link = 'http://giant.gfycat.com/{}.webm'.format(gfycat_id)
        finished_links.append(tuple([link, c[1]]))

    elif c[0].endswith(tuple(extensions)):
        finished_links.append(c)
sys.stdout.write('\n')

current_image = 0
try:
    for d in finished_links:
        current_image += 1
        sys.stdout.write('\rDownloading images: [{}/{}]'.format(str(current_image), str(len(finished_links))))
        sys.stdout.flush()
        a_imgnumber = 0
        a_threads = []
        donelinks = []
        if '/a/' in d[0] or '/gallery/' in d[0]:
            # imgur album/gallery: pull every image hash out of the blog-layout page.
            os.makedirs('static/images/{}/{}'.format(sub, d[1]), exist_ok=True)
            html_page = requests.get(d[0] + '/layout/blog')
            if html_page.status_code == 404:
                pass
                # print('404: skipping')
            else:
                imglinks = re.findall(r'\.*?{"hash":"([a-zA-Z0-9]+)".*?"ext":"(\.[a-zA-Z0-9]+)".*?', html_page.text)
                for i in imglinks:
                    try:
                        if i[0] + i[1] not in donelinks:
                            a_imgnumber += 1
                            if i[1] == '.gif':
                                ext = '.mp4'
                            else:
                                ext = i[1]
                            g = threading.Thread(target=download_image, args=('https://i.imgur.com/' + i[0] + ext, '{}/{}'.format(d[1], str(a_imgnumber) + ext)))
                            a_threads.append(g)
                            g.start()
                            donelinks.append(i[0] + i[1])
                    except KeyboardInterrupt:
                        print('\nCtrl-C Pressed; Finishing current threads then stopping...')
                        for f in a_threads:
                            f.join()
                        sys.exit()
                for f in a_threads:
                    f.join()
        else:
            ext = os.path.splitext(d[0])[1]
            t = threading.Thread(target=download_image, args=(d[0], d[1] + ext))
            t.start()
            threads.append(t)

    for e in threads:
        e.join()
    sys.stdout.write('\n')

except KeyboardInterrupt:
    print('\nCtrl-C Pressed; Finishing current threads then stopping...')
    for e in threads:
        e.join()
    sys.exit()


for b in threads:
    b.join()

print('All done!')
--------------------------------------------------------------------------------