├── .gitignore ├── Pipfile ├── .github └── workflows │ └── publish-docker.yml ├── LICENSE ├── Dockerfile ├── remarkable-substack.nomad.hcl ├── remarkable.py ├── README.md ├── main.py ├── sstack.py └── Pipfile.lock /.gitignore: -------------------------------------------------------------------------------- 1 | .substack-cookie 2 | db_file.json 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | rmapy = "*" 8 | requests = "*" 9 | pypdf = "*" 10 | pytest-playwright = "*" 11 | playwright-stealth = "*" 12 | playwright = "*" 13 | 14 | [dev-packages] 15 | 16 | [scripts] 17 | "sstack.py" = "python3 sstack.py" 18 | "main.py" = "python3 main.py" 19 | -------------------------------------------------------------------------------- /.github/workflows/publish-docker.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image 2 | on: 3 | push: 4 | tags: 5 | - 'v*' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | push_to_registry: 10 | name: Push Docker image to GitHub Packages 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Check out the repo 14 | uses: actions/checkout@v2 15 | 16 | - name: Push to GitHub Packages 17 | uses: docker/build-push-action@v1 18 | with: 19 | username: ${{ github.actor }} 20 | password: ${{ secrets.GITHUB_TOKEN }} 21 | registry: docker.pkg.github.com 22 | repository: jwoglom/remarkable-substack/remarkable-substack 23 | tag_with_ref: true 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 James Woglom 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.14-bookworm as base 2 | 3 | # The following is adapted from: 4 | # https://sourcery.ai/blog/python-docker/ 5 | 6 | # Setup env 7 | ENV LANG C.UTF-8 8 | ENV LC_ALL C.UTF-8 9 | ENV PYTHONDONTWRITEBYTECODE 1 10 | ENV PYTHONFAULTHANDLER 1 11 | 12 | FROM base AS python-deps 13 | 14 | # Install pipenv and compilation dependencies 15 | RUN apt-get update && apt-get install -y --no-install-recommends gcc wget 16 | RUN pip install pipenv 17 | RUN wget https://github.com/ddvk/rmapi/releases/download/v0.0.32/rmapi-linux-amd64.tar.gz 18 | 19 | RUN tar xvzf rmapi-linux-amd64.tar.gz 20 | 21 | RUN mkdir -p /base 22 | WORKDIR /base 23 | 24 | # Install python dependencies in /.venv 25 | COPY Pipfile . 26 | COPY Pipfile.lock . 27 | RUN PIPENV_VENV_IN_PROJECT=1 pipenv install --deploy 28 | FROM base AS runtime 29 | 30 | # Copy virtualenv from python-deps stage 31 | COPY --from=python-deps /base/.venv /base/.venv 32 | COPY --from=python-deps /rmapi /base/.venv/bin/rmapi 33 | ENV PATH="/base/.venv/bin:$PATH" 34 | 35 | RUN playwright install-deps 36 | RUN playwright install 37 | 38 | # Create and switch to a new user 39 | RUN useradd --create-home appuser 40 | RUN mkdir -p /home/appuser/.cache/ 41 | RUN cp -r /root/.cache/ms-playwright /home/appuser/.cache/ 42 | RUN chown -R appuser /home/appuser/.cache 43 | WORKDIR /home/appuser 44 | USER appuser 45 | 46 | # Install application into container 47 | COPY . . 48 | 49 | # Run the application 50 | ENTRYPOINT ["python3", "-u", "main.py"] -------------------------------------------------------------------------------- /remarkable-substack.nomad.hcl: -------------------------------------------------------------------------------- 1 | job "remarkable-substack" { 2 | datacenters = ["dc1"] 3 | 4 | type = "batch" 5 | 6 | reschedule { 7 | attempts = 5 8 | interval = "6h" 9 | delay = "30s" 10 | delay_function = "exponential" 11 | max_delay = "30m" 12 | unlimited = false 13 | } 14 | 15 | periodic { 16 | cron = "0 */3 * * *" 17 | prohibit_overlap = true 18 | } 19 | 20 | group "default" { 21 | count = 1 22 | 23 | volume "remarkable_substack_config" { 24 | type = "host" 25 | source = "remarkable_substack_config" 26 | } 27 | 28 | volume "remarkable_rmapi_config" { 29 | type = "host" 30 | source = "remarkable_rmapi_config" 31 | } 32 | 33 | task "default" { 34 | driver = "docker" 35 | 36 | config { 37 | image = "ghcr.io/jwoglom/remarkable-substack/remarkable-substack" 38 | privileged = true 39 | 40 | entrypoint = ["bash"] 41 | args = [ 42 | "-c", 43 | <= 0 and unread_hrs >= args.delete_unread_after_hours: 99 | print(f"Article not opened after {unread_hrs} hrs, will delete if needed: {file}") 100 | delete_if_needed[id] = f'{args.folder}/{file}' 101 | 102 | print(f'{existing_ids=}') 103 | print(f'{delete_if_needed.keys()=}') 104 | if args.delete_already_read: 105 | print(f'{files_to_delete=}') 106 | 107 | if len(files_to_delete) > 0: 108 | print('Deleting old files') 109 | for path in files_to_delete: 110 | print(f'Deleting {path}') 111 | assert path.startswith(f'{args.folder}/') 112 | assert '../' not in path 113 | assert '/..' not in path 114 | assert len(path) > 2 + len(args.folder) 115 | rm.rm(path) 116 | 117 | id = parse_filename(path) 118 | if id and id in article_data: 119 | article_data[id]['deleted'] = now_ts 120 | files_to_delete = [] 121 | 122 | 123 | with Stealth().use_sync(sync_playwright()) as p: 124 | chromium = p.chromium 125 | browser = chromium.launch(headless=not args.non_headless, slow_mo=args.slow_mo) 126 | context = browser.new_context( 127 | viewport={'width': 1920, 'height': 1080}, 128 | user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36', 129 | locale='en-US', 130 | timezone_id='America/New_York', 131 | ) 132 | 133 | cookie_file = os.path.join(args.config_folder, '.substack-cookie') 134 | try: 135 | ss = Substack(context, cookie_file=cookie_file, login_url=args.substack_login_url) 136 | subs = ss.get_subscriptions() 137 | except Exception as e: 138 | if args.relogin_command: 139 | subprocess.run(['/bin/bash', '-c', args.relogin_command]) 140 | raise e 141 | publications = {} 142 | for pub in subs['publications']: 143 | publications[pub['id']] = pub['name'] 144 | 145 | def to_filename(post): 146 | pub_name = publications[post['publication_id']] 147 | title = post['title'] 148 | return f"{pub_name} - {title} [{id}].pdf" 149 | 150 | 151 | new_ids = set() 152 | fetched_ids = set() 153 | fetched_old_ids = set() 154 | all_posts = [] 155 | after = None 156 | while len(fetched_ids) < args.max_fetch_count: 157 | print(f'get_posts(after={after})') 158 | posts = ss.get_posts(limit=20, after=after) 159 | 160 | for post in posts['posts']: 161 | id = str(post['id']) 162 | fetched_ids.add(id) 163 | if id not in existing_ids: 164 | if id not in already_downloaded_ids: 165 | if len(new_ids) + len(existing_ids) < args.max_save_count: 166 | print(f'Found new article: {id}: {to_filename(post)}') 167 | new_ids.add(id) 168 | elif len(delete_if_needed) > 0 and args.delete_unread_after_hours >= 0: 169 | delete_id = list(sorted(list(delete_if_needed.keys())))[0] 170 | print(f'Article in delete_if_needed dropped: {delete_id} {delete_if_needed[delete_id]}') 171 | files_to_delete.add(delete_if_needed[delete_id]) 172 | del delete_if_needed[delete_id] 173 | 174 | print(f'Found new article: {id}: {to_filename(post)}') 175 | new_ids.add(id) 176 | else: 177 | print(f'Found but not downloading new article (no space): {id}: {to_filename(post)}') 178 | else: 179 | print(f'Article already read: {id}: {to_filename(post)}') 180 | 181 | else: 182 | fetched_old_ids.add(id) 183 | print(f'Article already on remarkable: {id}: {to_filename(post)}') 184 | after = post['post_date'] 185 | all_posts.append(post) 186 | 187 | if not posts['more']: 188 | print('No more posts to return -- stopping') 189 | break 190 | 191 | time.sleep(5) 192 | 193 | print(f'{fetched_ids=}') 194 | print(f'{fetched_old_ids=}') 195 | print(f'{new_ids=}') 196 | 197 | dir = tempfile.gettempdir() 198 | if args.tmp_folder: 199 | dir = args.tmp_folder 200 | to_upload = [] 201 | for post in all_posts: 202 | id = str(post['id']) 203 | if id in new_ids: 204 | output_file = os.path.join(dir, to_filename(post)) 205 | print(f"Downloading {post['canonical_url']} to pdf {output_file}") 206 | ss.download_pdf(post['canonical_url'], output_file) 207 | if not os.path.exists(output_file): 208 | print(f"Unable to download {post['canonical_url']} to {output_file}. Skipping") 209 | time.sleep(5) 210 | continue 211 | num_pages = get_num_pages(output_file) 212 | article_data[id] = { 213 | 'id': id, 214 | 'num_pages': num_pages, 215 | 'canonical_url': post['canonical_url'], 216 | 'filename': to_filename(post), 217 | 'added': now_ts 218 | } 219 | to_upload.append(output_file) 220 | print(f"Download complete: {article_data[id]}") 221 | time.sleep(5) 222 | 223 | 224 | print(f'Uploading: {to_upload}') 225 | for f in to_upload: 226 | print(f'Uploading {f} to {args.folder}') 227 | rm.put(f, args.folder) 228 | 229 | print('Upload complete') 230 | 231 | 232 | if args.delete_already_read and len(files_to_delete) > 0: 233 | print('Deleting old files') 234 | for path in files_to_delete: 235 | print(f'Deleting {path}') 236 | assert path.startswith(f'{args.folder}/') 237 | assert '../' not in path 238 | assert '/..' not in path 239 | assert len(path) > 2 + len(args.folder) 240 | rm.rm(path) 241 | 242 | id = parse_filename(path) 243 | if id and id in article_data: 244 | article_data[id]['deleted'] = now_ts 245 | 246 | with open(db_file, 'w') as f: 247 | f.write(json.dumps(article_data)) 248 | 249 | def get_num_pages(path): 250 | with open(path, 'rb') as f: 251 | r = pypdf.PdfReader(f) 252 | return len(r.pages) 253 | 254 | if __name__ == '__main__': 255 | args = parse_args() 256 | main(args) -------------------------------------------------------------------------------- /sstack.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pickle 3 | import os 4 | import urllib.parse 5 | import json 6 | import time 7 | import subprocess 8 | 9 | from playwright.sync_api import sync_playwright 10 | from playwright_stealth import Stealth 11 | 12 | login_failures = 0 13 | login_successes = 0 14 | class Substack: 15 | def __init__(self, context, cookie_file=None, login_url=None): 16 | self.context = context 17 | self.page = None 18 | 19 | self.s = requests.Session() 20 | self.cookies = None 21 | self.cookie_file = cookie_file 22 | if login_url: 23 | print('Using Substack login_url') 24 | try: 25 | self.login(login_url) 26 | except Exception as e: 27 | print('login failed, trying to read existing cookies', e) 28 | self.read_cookies() 29 | else: 30 | print(f'Using existing substack cookie file {cookie_file=}') 31 | self.read_cookies() 32 | self.launch_homepage_and_save_cookies() 33 | 34 | def _new_page(self): 35 | p = self.context.new_page() 36 | def _refresh_if_429(response): 37 | if response.status == 429 and not ('api/v1' in response.url): 38 | print('429, waiting', response.url) 39 | time.sleep(5) 40 | p.reload() 41 | p.wait_for_load_state() 42 | p.on('response', _refresh_if_429) 43 | return p 44 | 45 | def login(self, login_url, headless=True): 46 | #r = self.s.get(login_url, allow_redirects=True) 47 | print('[login] Opening playwright:', login_url) 48 | if not self.page: 49 | self.page = self._new_page() 50 | page = self.page 51 | page.goto(login_url) 52 | page.wait_for_load_state() 53 | page.wait_for_timeout(5000) 54 | try: 55 | page.evaluate('location.reload()') 56 | except: 57 | print('location.reload() failed') 58 | page.goto(login_url) 59 | page.wait_for_load_state() 60 | try: 61 | page.evaluate('location.reload()') 62 | except: 63 | print('location.reload() failed') 64 | page.goto('https://substack.com/home') 65 | page.wait_for_load_state() 66 | c = self.context.cookies() 67 | print('[login] got cookies: %s' % c) 68 | self.write_cookies(c) 69 | 70 | def launch_homepage_and_save_cookies(self): 71 | print('[launch] Opening playwright: https://substack.com/home') 72 | if self.cookies: 73 | print(f'adding {len(self.cookies)} cookies') 74 | self.context.add_cookies(self.cookies) 75 | if not self.page: 76 | self.page = self._new_page() 77 | page = self.page 78 | page.goto('https://substack.com/home') 79 | page.wait_for_load_state() 80 | try: 81 | page.evaluate('location.reload()') 82 | except: 83 | print('location.reload() failed') 84 | page.goto('https://substack.com/home') 85 | page.wait_for_load_state() 86 | c = self.context.cookies() 87 | print('[launch] got cookies: %s' % c) 88 | self.write_cookies(c) 89 | 90 | def write_cookies(self, playwright_cookies): 91 | def _to_json(c): 92 | return { 93 | 'name': c.get('name'), 94 | 'value': c.get('value'), 95 | 'domain': c.get('domain'), 96 | 'path': c.get('path'), 97 | 'expires': c.get('expires'), 98 | 'httpOnly': c.get('httpOnly'), 99 | 'secure': c.get('secure'), 100 | 'sameSite': c.get('sameSite'), 101 | } 102 | 103 | if not self.cookie_file: 104 | return 105 | with open(self.cookie_file, 'w') as f: 106 | j = [_to_json(c) for c in playwright_cookies] 107 | f.write(json.dumps(j, indent=4)) 108 | self.cookies = j 109 | 110 | def read_cookies(self): 111 | if not self.cookie_file: 112 | return 113 | if not os.path.exists(self.cookie_file): 114 | return 115 | with open(self.cookie_file, 'r') as f: 116 | self.cookies = json.load(f) 117 | for c in self.cookies: 118 | self.s.cookies.set(c['name'], c['value'], domain=c['domain']) 119 | 120 | 121 | 122 | 123 | def get_posts(self, inbox_type='inbox', limit=12, after=None): # max limit enforced by substack: 20 124 | url = f'https://substack.com/api/v1/reader/posts?inboxType={inbox_type}&limit={limit}' 125 | if after: 126 | url += f'&after={after}' 127 | r = self.s.get(url) 128 | if r.status_code//100 != 2: 129 | raise RuntimeError(f'{r.status_code}: {r.text}') 130 | return r.json() 131 | 132 | def get_archive(self, domain, limit=12, offset=None): # max limit enforced by substack: 20 133 | url = f'https://{domain}/api/v1/archive?sort=new&search=&limit={limit}' 134 | if offset: 135 | url += f'&offset={offset}' 136 | r = self.s.get(url) 137 | if r.status_code == 429: 138 | print('429, waiting') 139 | time.sleep(5) 140 | if r.status_code//100 != 2: 141 | raise RuntimeError(f'{r.status_code}: {r.text}') 142 | return r.json() 143 | 144 | def get_full_archive(self, domain): 145 | out = [] 146 | offset = None 147 | c = 0 148 | while True: 149 | try: 150 | ret = self.get_archive(domain, limit=12, offset=offset) 151 | except RuntimeError as e: 152 | if '429:' in str(e): 153 | time.sleep(15) 154 | continue 155 | else: 156 | raise e 157 | print(f'get_archive({offset=})') 158 | if not ret: 159 | print(f'get_full_archive done {len(out)}') 160 | return out 161 | c += 1 162 | out += ret 163 | if offset: 164 | offset += 12 165 | else: 166 | offset = 12 167 | if c % 5 == 0: 168 | time.sleep(5) 169 | time.sleep(1) 170 | 171 | def get_subscriptions(self): 172 | r = self.s.get(f'https://substack.com/api/v1/subscriptions') 173 | if r.status_code//100 != 2: 174 | raise RuntimeError(f'{r.status_code}: {r.text}') 175 | return r.json() 176 | 177 | # def playwright_cookies(self): 178 | # return [{'name': k.name, 'value': k.value, 'port': k.port, 'domain': k.domain, 'path': k.path, 'secure': k.secure, 'expires': k.expires} for k in self.s.cookies] 179 | 180 | 181 | relogin_command_run = False 182 | def download_pdf(self, *args, **kwargs): 183 | global login_successes 184 | global login_failures 185 | for i in range(3): 186 | try: 187 | ret = self._download_pdf(*args, retry=i, **kwargs) 188 | if ret: 189 | login_successes += 1 190 | print(f'STATUS {login_failures=} {login_successes=}') 191 | return ret 192 | except Exception as e: 193 | print('download_pdf call', i+1, 'swallowed exception', e) 194 | print('Retrying download_pdf()') 195 | ret = self._download_pdf(*args, retry=3, **kwargs) 196 | if not ret: 197 | login_failures += 1 198 | if kwargs.get('relogin_command') and not self.relogin_command_run and login_successes == 0: 199 | print(f'STATUS {login_failures=} {login_successes=}') 200 | subprocess.run(['/bin/bash', '-c', kwargs.get('relogin_command')]) 201 | self.relogin_command_run = True 202 | else: 203 | login_successes += 1 204 | print(f'STATUS {login_failures=} {login_successes=}') 205 | return ret 206 | 207 | def _download_pdf(self, url, output_file, headless=True, slow_mo=0, relogin_command=None, retry=0): 208 | print('Opening playwright:', url) 209 | 210 | _logged_in_locator = 'button:has-text("New post"), [placeholder*="What\'s on your mind"]' 211 | _logged_out_locator = 'button:has-text("Sign in")' 212 | 213 | if self.cookies: 214 | print(f'adding {len(self.cookies)} cookies') 215 | self.context.add_cookies(self.cookies) 216 | if not self.page: 217 | self.page = self._new_page() 218 | page = self.page 219 | 220 | print('Opening https://substack.com/home') 221 | page.goto('https://substack.com/home') 222 | page.wait_for_load_state() 223 | page.wait_for_timeout(5000) 224 | print('Opened https://substack.com/home') 225 | 226 | # Check for logged-in state: either find logged-in element OR confirm sign-in button is absent 227 | logged_in = False 228 | try: 229 | page.locator(_logged_in_locator).first.wait_for(timeout=2000) 230 | logged_in = True 231 | except Exception: 232 | # Fallback: check if sign-in button is NOT visible (indicates logged in) 233 | try: 234 | sign_in_visible = page.locator(_logged_out_locator).first.is_visible() 235 | logged_in = not sign_in_visible 236 | except Exception: 237 | pass 238 | 239 | if not logged_in: 240 | print('Unable to ensure logged-in on substack homepage, you need to relogin') 241 | return None 242 | print('Found logged-in session on substack.com') 243 | 244 | page.goto(url) 245 | try: 246 | page.wait_for_load_state(timeout=5000) 247 | except: 248 | print('load state ignored') 249 | print('Ensuring logged-in session carries to article details') 250 | 251 | # For article pages, check for sign-in link/button (can be or