├── requirements.txt
├── .github
│   └── FUNDING.yml
├── logindata.py.example
├── .gitignore
├── docker-compose.yml
├── html
│   ├── comment-div.html
│   ├── post.html
│   ├── post-div.html
│   ├── upvoted.html
│   ├── saved.html
│   ├── username.html
│   ├── main.js
│   └── style.css
├── Dockerfile
├── README.md
├── save.py
└── utilities.py

/requirements.txt:
--------------------------------------------------------------------------------
praw
requests
youtube_dl
yt-dlp
tqdm
redvid
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: [samirelanduk]
--------------------------------------------------------------------------------
/logindata.py.example:
--------------------------------------------------------------------------------
REDDIT_USERNAME = "username"
REDDIT_PASSWORD = "password"
REDDIT_CLIENT_ID = "id"
REDDIT_SECRET = "secret"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
__pycache__
.DS_Store
samirelanduk
secrets.py
secrets1.py
secrets2.py
/logindata.py
logindata.py
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.2"
services:
  reddit-save:
    build: .
    image: reddit-save:latest
    environment:
      REDDIT_USERNAME:
      REDDIT_PASSWORD:
      REDDIT_CLIENT_ID:
      REDDIT_SECRET:
    volumes:
      - "./archive:/opt/app/archive"
--------------------------------------------------------------------------------
/html/comment-div.html:
--------------------------------------------------------------------------------
[The markup of this template was destroyed when the repository was flattened; only the text fragment "(+)" survives, apparently the parentheses around the score placeholder. Per get_comment_html in utilities.py, this fragment defines a <div class="comment"> with an id attribute, an .info row (time, author, score, permalink), the comment body, and a .children container for replies, via the placeholders <!--id-->, <!--author-->, <!--score-->, <!--link-->, <!--timestamp-->, <!--date-->, <!--body--> and <!--children--> (names reconstructed).]
--------------------------------------------------------------------------------
/html/post.html:
--------------------------------------------------------------------------------
[Markup lost in flattening; the surviving remnants are the literal <!--title--> placeholder inside the <title> element and the heading "Comments". Per create_post_page_html in utilities.py, this standalone post page carries the placeholders <!--title-->, <!--style-->, <!--script-->, <!--post--> and <!--comments--> (all names except <!--title--> reconstructed).]
--------------------------------------------------------------------------------
/html/post-div.html:
--------------------------------------------------------------------------------
[Markup lost in flattening. Per get_post_html and add_media_preview_to_html in utilities.py, this fragment defines a <div class="post"> with an id attribute, the post title, an .info row (subreddit, user, date), a .links row, the self-text body and a media preview slot, via the placeholders <!--id-->, <!--title-->, <!--subreddit-->, <!--user-->, <!--content-link-->, <!--link-->, <!--url-->, <!--timestamp-->, <!--date-->, <!--body--> and <!--preview--> (names reconstructed).]
--------------------------------------------------------------------------------
/html/upvoted.html:
--------------------------------------------------------------------------------
[Markup lost in flattening; the surviving remnants are the title and heading "Upvoted Posts". Per save_html in utilities.py and main.js, this page wraps a posts section and carries <!--style-->, <!--script--> and <!--posts--> placeholders (names reconstructed), plus "Previous"/"Next" pagination links whose hrefs end in .p.html and .n.html.]
--------------------------------------------------------------------------------
/html/saved.html:
--------------------------------------------------------------------------------
[Markup lost in flattening; the surviving remnants are the title "Saved Posts" and the headings "Saved Posts" and "Saved Comments". Per save_html in utilities.py and main.js, this page provides the .posts-section and .comments-section containers that main.js toggles, the <!--style-->, <!--script-->, <!--posts--> and <!--comments--> placeholders (names reconstructed), and "Previous"/"Next" pagination links (.p.html/.n.html).]
--------------------------------------------------------------------------------
/html/username.html:
--------------------------------------------------------------------------------
[Markup lost in flattening; the surviving remnants are a leftover "Saved Posts" <title> and the headings "u/[username]'s Posts" and "u/[username]'s Comments". The structure mirrors saved.html, with save_html substituting the literal token [username] before filling the usual <!--style-->, <!--script-->, <!--posts--> and <!--comments--> placeholders.]
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM jrottenberg/ffmpeg:4.0-alpine

ENV PYTHONUNBUFFERED=1
ENV DOCKER=1

RUN apk add build-base && apk add python3-dev

RUN echo "**** install Python ****" && \
    apk add --no-cache python3 && \
    if [ ! -e /usr/bin/python ]; then ln -sf python3 /usr/bin/python ; fi && \
    \
    echo "**** install pip ****" && \
    python3 -m ensurepip && \
    rm -r /usr/lib/python*/ensurepip && \
    pip3 install --no-cache --upgrade pip setuptools wheel && \
    if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi


COPY requirements.txt /opt/app/requirements.txt
WORKDIR /opt/app
RUN pip install -r requirements.txt
COPY . .

ENTRYPOINT ["python", "save.py"]
CMD []
--------------------------------------------------------------------------------
/html/main.js:
--------------------------------------------------------------------------------
const toggleView = () => {
    const postsSection = document.querySelector(".posts-section");
    const commentsSection = document.querySelector(".comments-section");
    if (commentsSection.style.display === "none") {
        commentsSection.style.display = "block";
        postsSection.style.display = "none";
    } else {
        postsSection.style.display = "block";
        commentsSection.style.display = "none";
    }
}

const toggleMedia = e => {
    const img = e.target;
    const preview = img.parentNode;
    preview.classList.toggle("full");
}

window.addEventListener("load", function() {
    const postsSection = document.querySelector(".posts-section");
    const commentsSection = document.querySelector(".comments-section");
    if (commentsSection) {
        commentsSection.style.display = "none";
        const toggleButton = document.createElement("button");
        toggleButton.innerText = "toggle";
        toggleButton.addEventListener("click", toggleView);
        document.body.insertBefore(toggleButton, postsSection);
    }

    for (let preview of document.querySelectorAll(".preview")) {
        const media = preview.querySelector("img") || preview.querySelector("video");
        if (media) {
            media.addEventListener("click", toggleMedia);
        }
    }
})
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# reddit-save

A Python utility for backing up the things you've upvoted and saved on reddit.

Browsing through the things you've liked or saved on reddit is really enjoyable and, depending on the reason you saved something, can be a great way to revisit things you once thought important. It is a personalised feed of posts and comments curated by the one person guaranteed to know what you like - past you.

However, over time more and more of the older posts will be deleted or go missing, and the historical record atrophies. Use this tool to back those posts and comments up to your computer, where you can browse them offline, and where they are safe forever.

reddit-save will back up saved posts, saved comments, and upvoted posts. It can't back up upvoted comments, because the reddit API doesn't expose them.
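For a sense of what you end up with, an archive folder after a couple of `saved` runs looks something like this (the file names here are illustrative):

```
archive/
├── saved.html       # everything archived so far, regenerated on each run
├── saved.0.html     # per-page files, only present when --page-size is used (see below)
├── saved.1.html
├── posts/
│   └── abc123.html  # a standalone page per post, with its comments
└── media/
    └── my_post_abc123.jpg
```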
Crucially, when it is run again on the same location, it will skip any posts/comments previously archived - once something is saved, it stays saved, even if the original later disappears from reddit.

## Installation

```bash
$ git clone https://github.com/samirelanduk/reddit-save
$ cd reddit-save
$ pip install -r requirements.txt
```

If you get permission errors, try `sudo` or install inside a virtual environment.

You will also need [ffmpeg](https://ffmpeg.org/) installed - the video downloads rely on it.

Rename the file `logindata.py.example` to `logindata.py`. You will need to add four things to this file: your reddit username and password, and a reddit client ID and secret. The latter two are obtained using [the instructions here](https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps). The file should look something like this:

```python
REDDIT_USERNAME = "spez"
REDDIT_PASSWORD = "myredditpassword123"
REDDIT_CLIENT_ID = "sadsU7-zfX"
REDDIT_SECRET = "687DDJSS&999d-hdkjK8h"
```

(If you have 2FA enabled, append the current 2FA code to the password, separated by a colon - for example `"myredditpassword123:123456"`.)

## Use

Create a folder that will contain your archive. Then run:

```bash
$ ./save.py saved folder_name
$ ./save.py upvoted folder_name
```

The first command will back up your saved posts/comments to a file called `folder_name/saved.html`. The second will back up your upvoted posts to a file called `folder_name/upvoted.html`.

Each post will have its top-level comments saved, as well as each of their immediate child comments (but no further).

Linked media files (images, videos etc.) will be saved locally where possible, though imgur is currently not well supported in all cases.

If you also want to break the resultant HTML into multiple files (browsers struggle to display enormous HTML files), add the `--page-size 100` argument, replacing 100 with however many posts you want per page.

## Use with Docker

Rather than installing dependencies locally, you can use Docker to create a local image and use that instead. First build the image:

```bash
$ docker build -t redditsave .
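# (Note: the bundled docker-compose.yml builds this same image under the name
# "reddit-save:latest" - the tag is just a local alias, so either name works as
# long as you use it consistently.)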
```

Then run reddit-save within a container created from this image:

```bash
$ docker run \
    -e REDDIT_USERNAME=spez \
    -e REDDIT_PASSWORD="myredditpassword123" \
    -e REDDIT_CLIENT_ID="sadsU7-zfX" \
    -e REDDIT_SECRET="687DDJSS&999d-hdkjK8h" \
    -v /Local/location/to/save/in:/opt/app/archive \
    redditsave saved
```

## Backing up a specific username

Rather than backing up your own saved/upvoted posts and comments, you can back up the submitted posts and comments of another user:

```bash
$ ./save.py user:samirelanduk folder_name
```
--------------------------------------------------------------------------------
/save.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import argparse
import os
import re
from tqdm import tqdm
from utilities import *

# Get arguments
def validate_mode(mode):
    if mode not in ["saved", "upvoted"] and not mode.startswith("user:"):
        raise argparse.ArgumentTypeError(f"Invalid mode: {mode}")
    return mode
parser = argparse.ArgumentParser(description="Save reddit posts to file.")
parser.add_argument("mode", type=validate_mode, nargs=1, help="What to save: saved, upvoted or user:<username>.")
if os.getenv("DOCKER", "0") != "1":
    parser.add_argument("location", type=str, nargs=1, help="The path to save to.")
# Optional page size argument
parser.add_argument("--page-size", type=int, nargs=1, default=[0], help="The number of posts to save per page.")
args = parser.parse_args()
mode = args.mode[0]
page_size = args.page_size[0]
location = "./archive/" if os.getenv("DOCKER", "0") == "1" else args.location[0]

# Is the location specified a directory?
if not os.path.isdir(location):
    raise SystemExit(f"{location} is not a directory")

# Make a client object
client = make_client()

# Saved, upvoted, or a specific user?
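# Each branch below picks the output file name plus a pair of fetchers with a
# common signature (client -> list), so the archiving loop further down doesn't
# need to know which mode is running. username stays None except in user: mode,
# where it later routes save_html to the username.html template.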
username = None
if mode == "saved":
    html_file = "saved.html"
    get_posts = get_saved_posts
    get_comments = get_saved_comments
elif mode == "upvoted":
    html_file = "upvoted.html"
    get_posts = get_upvoted_posts
    get_comments = lambda client: []
elif mode.startswith("user:"):
    username = mode.split(":")[-1]
    html_file = f"{username}.html"
    get_posts = lambda client: get_user_posts(client, username)
    get_comments = lambda client: get_user_comments(client, username)

# Make directory for media and posts
if not os.path.exists(os.path.join(location, "media")):
    os.mkdir(os.path.join(location, "media"))
if not os.path.exists(os.path.join(location, "posts")):
    os.mkdir(os.path.join(location, "posts"))

# Get files to search through
print("Getting previously saved posts and comments...")
existing_ids, existing_posts_html, existing_comments_html = get_previous(location, html_file)
print(len(existing_posts_html), "previous posts.")
print(len(existing_comments_html), "previous comments.")

# Get posts HTML
posts_html = []
posts = [p for p in get_posts(client) if p.id not in existing_ids]
if not posts:
    print("No new posts")
else:
    for post in tqdm(posts):
        post_html = get_post_html(post)
        media = save_media(post, location)
        if media:
            post_html = add_media_preview_to_html(post_html, media)
        posts_html.append(post_html)
        page_html = create_post_page_html(post, post_html)
        with open(os.path.join(location, "posts", f"{post.id}.html"), "w", encoding="utf-8") as f:
            f.write(page_html)
posts_html += existing_posts_html

# Get comments HTML
comments_html = []
comments = [c for c in get_comments(client) if c.id not in existing_ids]
if not comments:
    print("No new comments")
else:
    for comment in tqdm(comments):
        comment_html = get_comment_html(comment)
        comments_html.append(comment_html)
comments_html += existing_comments_html

# Save overall HTML
print("Saving HTML...")
if page_size:
    length = max(len(posts_html), len(comments_html))
    page_count = (length + page_size - 1) // page_size  # ceiling division, no empty trailing page
    for i in range(page_count):
        posts_on_page = posts_html[i*page_size:(i+1)*page_size]
        comments_on_page = comments_html[i*page_size:(i+1)*page_size]
        has_next = i < page_count - 1
        save_html(posts_on_page, comments_on_page, location, html_file, i, has_next, username=username)
save_html(posts_html, comments_html, location, html_file, None, False, username=username)
--------------------------------------------------------------------------------
/html/style.css:
--------------------------------------------------------------------------------
* {
    margin: 0;
    padding: 0;
    border: 0;
    outline: none;
    font-size: 100%;
    font: inherit;
    vertical-align: baseline;
    list-style: none;
    background-color: inherit;
    box-sizing: border-box;
    -webkit-appearance: none;
    -moz-appearance: none;
}

body {
    font-family: 'Open Sans', Verdana;
}

img, video {
    max-width: 200px;
    max-height: 200px;
    cursor: pointer;
}

a {
    color: #0079d3;
    text-decoration: none;
}

p, li {
    font-size: 14px;
    margin-bottom: 8px;
}

ul li {
    list-style: disc;
}

ol li {
    list-style-type: decimal;
}

li {
    margin-left: 24px;
}

em {
    font-style: italic;
}

strong {
    font-weight: bold;
}

code {
    font-family: monospace;
    white-space: pre;
}

blockquote {
    font-style: italic;
    padding-left: 8px;
    position: relative;
}

blockquote:before {
    position: absolute;
    left: 0;
    width: 4px;
    content: "";
    height: 100%;
    background-color: #16a085;
}

a:hover {
    text-decoration: underline;
}

h1 {
    font-weight: bold;
    font-size: 24px;
    padding: 8px 16px;
}

.links {
    padding: 12px 16px 0px;
    font-size: 12px;
    display: flex;
    gap: 8px;
}

.post, .comment {
    border-top: 1px solid #f0f0f0;
    padding: 12px 16px;
}

.post h2 {
    font-weight: bold;
    font-size: 18px;
}

.post h1, .comment h1 {
    padding: 0;
    margin-bottom: 8px;
}

.post .info {
    font-size: 12px;
    font-weight: 300;
    margin-bottom: 16px;
}

.post .info time {
    font-weight: 500;
}

.post .info time, .post .info span {
    padding-right: 4px;
}

.post .links a {
    padding-right: 4px;
    font-weight: 400;
}

.post .links a:after {
    content: "|";
    position: relative;
    right: -4px;
}

.post .links a:last-child:after {
    content: "";
}

.comment .info {
    display: flex;
    align-items: baseline;
    margin-bottom: 8px;
}

.comment .info time {
    padding-right: 12px;
}

.comment .info div {
    font-weight: 300;
    font-size: 12px;
    padding-right: 8px;
    position: relative;
    top: -1px;
}

.comments h2 {
    font-weight: bold;
    font-size: 20px;
    padding: 8px 16px;
}

.op {
    background: #0055df;
    color: white;
    font-weight: 700;
    padding: 0px 4px;
    border-radius: 4px;
}

button {
    border: 1px solid #1abc9c60;
    color: #1abc9c;
    border-radius: 5px;
    padding: 4px 8px;
    font-size: 12px;
    cursor: pointer;
    position: absolute;
    right: 16px;
    font-weight: bold;
    top: 10px;
    background-color: #1abc9c20;
}

button:hover {
    background-color: #1abc9c40;
}

.preview.full {
    width: 100vw;
    height: 100vh;
    position: fixed;
    left: 0;
    top: 0;
    background-color: #00000080;
    display: flex;
    justify-content: center;
    z-index: 100;
    align-items: center;
}

.preview.full img, .preview.full video {
    max-width: 100vw;
    max-height: 100vh;
    width: 100%;
    height: 100%;
    object-fit: contain;
}
--------------------------------------------------------------------------------
/utilities.py:
--------------------------------------------------------------------------------
import time
import os
import praw
import requests
from redvid import Downloader
import yt_dlp
import re
from datetime import datetime

try:
    from logindata import REDDIT_USERNAME, REDDIT_PASSWORD
    from logindata import REDDIT_CLIENT_ID, REDDIT_SECRET
except ImportError:
    REDDIT_USERNAME = os.getenv("REDDIT_USERNAME")
    REDDIT_PASSWORD = os.getenv("REDDIT_PASSWORD")
    REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
    REDDIT_SECRET = os.getenv("REDDIT_SECRET")

IMAGE_EXTENSIONS = ["gif", "gifv", "jpg", "jpeg", "png"]
VIDEO_EXTENSIONS = ["mp4"]
PLATFORMS = ["redgifs.com", "gfycat.com", "imgur.com", "youtube.com"]


def make_client():
    """Creates a PRAW client with the details in the logindata.py file (or the
    equivalent environment variables when running in Docker)."""

    print(f"Logging in as {REDDIT_USERNAME}")

    return praw.Reddit(
        username=REDDIT_USERNAME,
        password=REDDIT_PASSWORD,
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_SECRET,
        user_agent="reddit-save",
    )


def get_previous(location, html_file):
    """Scans the location for pages produced by previous runs and returns the
    IDs, post HTML and comment HTML already archived."""

    html_files = [f for f in os.listdir(location) if f.endswith(".html")]
    pattern = html_file.replace(".html", r"\.(\d+)?\.html")
    matches = [re.match(pattern, f) for f in html_files]
    matches = [m[0] for m in matches if m]
    matches.sort(key=lambda x: int(x.split(".")[1]))
    existing_ids = []
    existing_posts_html = []
    existing_comments_html = []
    if html_file in html_files: matches.append(html_file)
    for match in matches:
        with open(os.path.join(location, match), encoding="utf-8") as f:
            current_html = f.read()
        for id in re.findall(r'id="(.+?)"', current_html):
            if id not in existing_ids:
                existing_ids.append(id)
        # Both patterns below were mangled when the repository was flattened;
        # these reconstructions grab each archived post/comment div.
        posts = re.findall(
            r'(<div class="post" id="[\s\S]+?<\/div>)',
            current_html
        )
        comments = re.findall(
            r'(<div class="comment" id="[\s\S]+?<\/div>)',
            current_html
        )
        for post in posts:
            if post not in existing_posts_html:
                existing_posts_html.append(post)
        for comment in comments:
            if comment not in existing_comments_html:
                existing_comments_html.append(comment)
    return existing_ids, existing_posts_html, existing_comments_html


def get_saved_posts(client):
    """Gets a list of posts that the user has saved."""

    return [
        saved for saved in client.user.me().saved(limit=None)
        if saved.__class__.__name__ == "Submission"
    ]


def get_upvoted_posts(client):
    """Gets a list of posts that the user has upvoted."""

    return [
        upvoted for upvoted in client.user.me().upvoted(limit=None)
        if upvoted.__class__.__name__ == "Submission"
    ]


def get_saved_comments(client):
    """Gets a list of comments that the user has saved."""

    return [
        saved for saved in client.user.me().saved(limit=None)
        if saved.__class__.__name__ != "Submission"
    ]


def get_user_posts(client, username):
    """Gets a list of posts that the user has made."""

    return [
        post for post in client.redditor(username).submissions.new(limit=None)
    ]


def get_user_comments(client, username):
    """Gets a list of comments that the user has made."""

    return [
        comment for comment in client.redditor(username).comments.new(limit=None)
    ]


def get_post_html(post):
    """Takes a post object and creates a HTML for it - but not including the
    preview HTML."""

    with open(os.path.join("html", "post-div.html"), encoding="utf-8") as f:
        html = f.read()
    dt = datetime.utcfromtimestamp(post.created_utc)
    # The <!--...--> placeholder names throughout this file were lost when the
    # repository was flattened; the names used here are reconstructions,
    # matching the notes left in the html/ templates above.
    html = html.replace("<!--title-->", post.title)
    html = html.replace("<!--subreddit-->", f"/r/{str(post.subreddit)}")
    html = html.replace("<!--user-->", f"/u/{post.author.name}" if post.author else "[deleted]")
    html = html.replace("<!--content-link-->", f"posts/{post.id}.html")
    html = html.replace("<!--link-->", f"https://reddit.com{post.permalink}")
    html = html.replace("<!--url-->", post.url)
    html = html.replace("<!--id-->", post.id)
    html = html.replace("<!--body-->", (post.selftext_html or "").replace(
        '<div class="md">', "<div>"
    ))
    html = html.replace("<!--timestamp-->", str(dt))
    html = html.replace("<!--date-->", dt.strftime("%d %B, %Y"))
    return html


def save_media(post, location):
    """Takes a post object and tries to download any image/video it might be
    associated with. If it can, it will return the filename."""

    url = post.url
    stripped_url = url.split("?")[0]
    if url.endswith(post.permalink): return None

    # What is the key information?
    extension = stripped_url.split(".")[-1].lower()
    domain = ".".join(post.url.split("/")[2].split(".")[-2:])
    readable_name = list(filter(bool, post.permalink.split("/")))[-1]

    # If it's an imgur gallery, forget it
    if domain == "imgur.com" and "gallery" in url: return None

    # Can the media be obtained directly?
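    # (A direct file link is fetched with plain requests; the Content-Type
    # check below guards against an HTML error page being saved to disk under
    # a media extension.)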
    if extension in IMAGE_EXTENSIONS + VIDEO_EXTENSIONS:
        filename = f"{readable_name}_{post.id}.{extension}"
        try:
            response = requests.get(post.url)
        except Exception:
            return None
        media_type = response.headers.get("Content-Type", "")
        if media_type.startswith("image") or media_type.startswith("video"):
            with open(os.path.join(location, "media", filename), "wb") as f:
                f.write(response.content)
            return filename

    # Is this a reddit gallery?
    if domain == "reddit.com" and "gallery" in url:
        json_url = url + ".json"
        resp = requests.get(json_url)

        sleep = 1
        while resp.status_code == 429:
            print(f"Rate limited, sleeping for {sleep} seconds")
            time.sleep(sleep)
            resp = requests.get(json_url)
            sleep *= 2

        data = resp.json()
        post_data = data[0]["data"]["children"][0]["data"]
        media = post_data.get("media_metadata")
        if not media: return None
        filenames = []
        for idx, item in enumerate(list(media.values()), 1):
            if "m" not in item: continue
            ext = item["m"].split("/")[-1]
            if "u" in item["s"]:
                base_url = item["s"]["u"].replace("&amp;", "&")  # unescape URL
            else:
                continue
            response = requests.get(base_url)
            if response.status_code == 200:
                filename = f"{readable_name}_{post.id}_{idx}.{ext}"
                with open(os.path.join(location, "media", filename), "wb") as f:
                    f.write(response.content)
                filenames.append(filename)
        return filenames[0] if filenames else None

    # Is this a v.redd.it link?
    if domain == "redd.it":
        downloader = Downloader(max_q=True, log=False)
        downloader.url = url
        current = os.getcwd()
        try:
            name = downloader.download()
            extension = name.split(".")[-1]
            filename = f"{readable_name}_{post.id}.{extension}"
            os.rename(name, os.path.join(location, "media", filename))
            return filename
        except Exception:
            # redvid downloads into the working directory, so restore it on failure
            os.chdir(current)
            return None

    # Is it a gfycat link that redirects? Update the URL if possible
    if domain == "gfycat.com":
        html = requests.get(post.url).content
        if len(html) < 50000:
            match = re.search(r"http([\dA-Za-z\+\:\/\.]+)\.mp4", html.decode())
            if match:
                url = match.group()
            else:
                return None

    # Is this an imgur image?
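    # (imgur page URLs don't reveal the file type, so the loop below probes
    # direct-file URLs with each known image extension until one returns 200.)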
    if domain == "imgur.com" and extension != "gifv":
        for extension in IMAGE_EXTENSIONS:
            direct_url = f'https://i.{url[url.find("//") + 2:]}.{extension}'
            direct_url = direct_url.replace("i.imgur.com", "imgur.com")
            direct_url = direct_url.replace("m.imgur.com", "imgur.com")
            try:
                response = requests.get(direct_url)
            except Exception:
                continue
            if response.status_code == 200:
                filename = f"{readable_name}_{post.id}.{extension}"
                with open(os.path.join(location, "media", filename), "wb") as f:
                    f.write(response.content)
                return filename

    # Try yt-dlp if it's one of the recognised platforms
    if domain in PLATFORMS:
        options = {
            "nocheckcertificate": True, "quiet": True, "no_warnings": True,
            "ignoreerrors": True, "noprogress": True,
            "outtmpl": os.path.join(
                location, "media", f"{readable_name}_{post.id}" + ".%(ext)s"
            )
        }
        with yt_dlp.YoutubeDL(options) as ydl:
            try:
                ydl.download([url])
            except Exception:
                # (The original called os.chdir(current) here, but current is
                # never set on this path - yt-dlp doesn't change directory.)
                return None
        for f in os.listdir(os.path.join(location, "media")):
            if f.startswith(f"{readable_name}_{post.id}"):
                return f


def add_media_preview_to_html(post_html, media):
    """Takes post HTML and returns a modified version with the preview
    inserted."""

    extension = media.split(".")[-1]
    location = "/".join(["media", media])
    # The markup strings here were lost in flattening; the .preview wrapper is
    # reconstructed from the classes that style.css and main.js expect.
    if extension in IMAGE_EXTENSIONS:
        return post_html.replace(
            "<!--preview-->",
            f'<div class="preview"><img src="{location}"></div>'
        )
    if extension in VIDEO_EXTENSIONS:
        return post_html.replace(
            "<!--preview-->",
            f'<div class="preview"><video controls src="{location}"></video></div>'
        )
    return post_html


def create_post_page_html(post, post_html):
    """Creates the HTML for a post's own page."""

    with open(os.path.join("html", "post.html"), encoding="utf-8") as f:
        html = f.read()
    html = html.replace("<!--title-->", post.title)
    # The second replace's arguments were lost in flattening; the
    # reconstruction makes media paths resolve from inside posts/.
    html = html.replace("<!--post-->", post_html.replace("h2>", "h1>").replace(
        'src="media/', 'src="../media/'
    ))
    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
        html = html.replace("<!--style-->", f"<style>{f.read()}</style>")
    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
        html = html.replace("<!--script-->", f"<script>{f.read()}</script>")
    comments_html = []
    post.comments.replace_more(limit=0)
    for comment in post.comments:
        comments_html.append(get_comment_html(
            comment, op=post.author.name if post.author else None
        ))
    html = html.replace("<!--comments-->", "\n".join(comments_html))
    return html


def get_comment_html(comment, children=True, op=None):
    """Takes a comment object and creates the HTML for it - it will get its
    children too unless you specify otherwise."""

    with open(os.path.join("html", "comment-div.html"), encoding="utf-8") as f:
        html = f.read()
    dt = datetime.utcfromtimestamp(comment.created_utc)
    author = "[deleted]"
    if comment.author:
        if comment.author == op:
            # (Markup lost in flattening; the .op badge class styled in
            # style.css is the reconstructed way of marking the original poster.)
            author = f'<span class="op">/u/{comment.author.name}</span>'
        else:
            author = f"/u/{comment.author.name}"
    html = html.replace("<!--author-->", author)
    html = html.replace("<!--body-->", (comment.body_html or "").replace(
        '<div class="md">', "<div>"
    ))
    html = html.replace("<!--score-->", str(comment.score))
    html = html.replace("<!--link-->", f"https://reddit.com{comment.permalink}")
    html = html.replace("<!--timestamp-->", str(dt))
    html = html.replace("<!--id-->", comment.id)
    html = html.replace("<!--date-->", dt.strftime("%H:%M - %d %B, %Y"))
    if children:
        children_html = []
        for child in comment.replies:
            children_html.append(get_comment_html(child, children=False, op=op))
        html = html.replace("<!--children-->", "\n".join(children_html))
    return html


def save_html(posts, comments, location, html_file, page, has_next, username=None):
    """Stitches the posts and comments into the relevant page template and
    writes the result to the archive location, wiring up pagination links."""

    if username:
        with open(os.path.join("html", "username.html"), encoding="utf-8") as f:
            html = f.read().replace("[username]", username)
    else:
        with open(os.path.join("html", html_file), encoding="utf-8") as f:
            html = f.read()
    with open(os.path.join("html", "style.css"), encoding="utf-8") as f:
        html = html.replace("<!--style-->", f"<style>{f.read()}</style>")
    with open(os.path.join("html", "main.js"), encoding="utf-8") as f:
        html = html.replace("<!--script-->", f"<script>{f.read()}</script>")
    # The link-stripping strings below were lost in flattening; the
    # reconstruction assumes the templates link "Previous"/"Next" via hrefs
    # ending in .p.html and .n.html respectively.
    if page == 0 or page is None:
        html = re.sub(r'<a href="[^"]*\.p\.html">Previous</a>', "", html)
    else:
        html = html.replace(".p.html", f".{page-1}.html")
    if not has_next or page is None:
        html = re.sub(r'<a href="[^"]*\.n\.html">Next</a>', "", html)
    else:
        html = html.replace(".n.html", f".{page+1}.html")
    html = html.replace("<!--posts-->", "\n".join(posts))
    html = html.replace("<!--comments-->", "\n".join(comments))
    file_name = html_file if page is None else html_file.replace(".html", f".{page}.html")
    with open(os.path.join(location, file_name), "w", encoding="utf-8") as f:
        f.write(html)
--------------------------------------------------------------------------------