├── LICENSE
├── README.md
├── precommit.sh
└── pwr

/LICENSE:
--------------------------------------------------------------------------------
MIT-0 License

Copyright (c) Muxup contributors.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pwr

## Summary

pwr (paced web reader) is a script and terminal-centric workflow I use for
keeping up to date with various sources online, shared on the off chance it's
useful to you too.

See its [page on Muxup.com](https://muxup.com/pwr) for more information about
why it exists and how to use it.

## Implementation details

* Ships with an example list of sources and "fetchers" (functions that return
  extracted URLs and titles); see the sketch below. This includes examples of
  using BeautifulSoup to extract from HTML in cases where RSS feeds are not
  present or are insufficient.
* Data is stored as JSON in `$XDG_DATA_HOME/pwr` (by default
  `~/.local/share/pwr`). It assumes Python 3.6+ dictionary semantics, i.e.
  entries are maintained in insertion order.
* Fetching data from multiple sources in parallel would be an obvious
  extension. So far it simply hasn't been slow enough to make it worth
  prioritising.
* Caching of RSS feeds and similar isn't implemented. This is fine if you're
  running pwr once every day or so as intended, but if you're looking to run
  it more regularly you should add improvements in that area.
* pwr is a quick script that scratches an itch. It's definitely not the
  world's most robust or clean Python code.
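
Adding a source boils down to writing a fetcher function and registering it in
`get_sources()` in the `pwr` script. As a rough sketch (the function name, CSS
class, and URL below are made up for illustration; see the fetchers in `pwr`
itself for real, working examples):

```python
# Hypothetical fetcher: pull [url, title] pairs out of a simple HTML index page.
def example_blog_fetcher(url: str) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(url), "html.parser")
    extracted = []
    for link in soup.find_all("a", class_="post-link"):
        extracted.append([link.get("href"), link.text.strip()])
    return extracted


# ...then registered in get_sources() as:
#     "Example blog": (example_blog_fetcher, "https://blog.example.com/"),
```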
--------------------------------------------------------------------------------
/precommit.sh:
--------------------------------------------------------------------------------
#!/bin/sh

die () {
  printf "%s\n" "$*" >&2
  exit 1
}

mypy --ignore-missing-imports --strict pwr || die "mypy failed"
isort --profile black -c pwr || die "isort found issues"
black --check pwr || die "black reports pwr needs reformatting"
flake8 --max-line-length 88 --extend-ignore=E203,E266,E302,E501,W291 pwr || die "flake8 found issues"
--------------------------------------------------------------------------------
/pwr:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright Muxup contributors.
# Distributed under the terms of the MIT-0 license, see LICENSE for details.
# SPDX-License-Identifier: MIT-0

import datetime
import itertools
import json
import os
import subprocess
import sys
import textwrap
import time
from pathlib import Path
from typing import Callable, TypedDict

import requests
from bs4 import BeautifulSoup

### Type definitions ###

URLAndTitleList = list[list[str]]


class SourceData(TypedDict):
    seen: list[str]
    entries: URLAndTitleList


class PWRData(TypedDict):
    last_read: str
    last_fetch: str
    last_filter: str
    sources: dict[str, SourceData]


### Config ###

# Wrapped in a function to work around Python's lack of support for forward
# declarations of functions (such as the referenced fetcher functions)
# fmt: off
def get_sources() -> dict[str, tuple[Callable[..., URLAndTitleList], *tuple[object, ...]]]:
    return {
        "Rust Internals": (discourse_fetcher, "https://internals.rust-lang.org/"),
        "Swift Evolution": (discourse_fetcher, "https://forums.swift.org/c/evolution/18"),
        "HN": (feed_fetcher, "https://news.ycombinator.com/rss", True),
        "lobste.rs": (feed_fetcher, "https://lobste.rs/rss", True),
        "/r/programminglanguages": (feed_fetcher, "http://www.reddit.com/r/programminglanguages/.rss"),
        "/r/rust": (feed_fetcher, "http://www.reddit.com/r/rust/top.rss?t=week"),
        "Muxup": (feed_fetcher, "https://muxup.com/feed.xml"),
        "Igalia": (feed_fetcher, "https://www.igalia.com/feed.xml"),
        "cs.PL": (arxiv_fetcher, "cs.PL"),
        "cs.AR": (arxiv_fetcher, "cs.AR"),
        "Hylo discussions": (ghdiscussions_fetcher, "orgs/hylo-lang"),
        "RISC-V announcements": (groupsio_fetcher, "https://lists.riscv.org/g/tech-announce/topics"),
        "Nim forum": (nimforum_fetcher,),
    }
# fmt: on


# URLs that will be opened unconditionally when performing the 'read' action.
extra_urls_for_read = ["https://guardian.co.uk"]
# Data lives in $XDG_DATA_HOME/pwr, defaulting to ~/.local/share/pwr.
data_dir = (
    Path(os.environ.get("XDG_DATA_HOME", Path.home() / ".local" / "share")) / "pwr"
)
data_file = data_dir / "data.json"
preferred_browser = os.environ.get("BROWSER", "firefox")
saved_seen_url_limit = 250
fetch_timeout = 10
read_url_batch_size = 10


### Fetchers ###

# A fetcher is passed whatever arguments were present in the sources
# dictionary, and is responsible for returning an array of [url, title]
# arrays. The caller will take care of removing any [url, title] entries where
# the url has already been seen. A fetcher might append `##someval` to a URL
# to force it to appear fresh (where someval might be a timestamp, number of
# replies, or some other data).


def feed_fetcher(url: str, comments_as_link: bool = False) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(url), features="xml")
    entries = soup.find_all(["item", "entry"])
    extracted = []
    for entry in entries:
        title = entry.find("title").text
        link = entry.find("comments" if comments_as_link else "link")
        url = link.get("href") or link.text
        extracted.append([url, title])
    return extracted


# Custom fetcher rather than just using the RSS feed because the arXiv RSS
# feeds only include papers posted in the last day, so it's possible to miss
# papers if you don't fetch regularly enough.
def arxiv_fetcher(category: str) -> URLAndTitleList:
    url = f"https://arxiv.org/list/{category}/recent"
    soup = BeautifulSoup(fetch_from_url(url), "html.parser")
    extracted = []

    for dt in soup.find_all("dt"):
        title_tag = dt.find_next("div", class_="list-title")
        title = title_tag.text.replace("Title:", "").strip()
        abstract = dt.find_next("a", title="Abstract")
        url = "https://arxiv.org" + abstract["href"]
        extracted.append([url, title])
    return extracted


# The Discourse RSS feeds don't provide the same listing as when viewing a
# category sorted by most recently replied to, so extract it ourselves.
def discourse_fetcher(url: str) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(url), "html.parser")
    extracted = []

    topics = soup.find_all("tr", class_="topic-list-item")
    for topic in topics:
        title_tag = topic.find("a", class_="title")
        title = title_tag.text.strip()
        url = title_tag.get("href")
        replies = topic.find("span", class_="posts").text.strip()
        extracted.append([f"{url}##{replies}", append_replies(title, int(replies))])
    return extracted


def ghdiscussions_fetcher(ghpath: str) -> URLAndTitleList:
    soup = BeautifulSoup(
        fetch_from_url(f"https://github.com/{ghpath}/discussions"), "html.parser"
    )
    extracted = []

    for topic in soup.find_all("a", attrs={"data-hovercard-type": "discussion"}):
        title = topic.text.strip()
        url = f"https://github.com{topic.get('href')}"
        replies_element = topic.find_next(
            "a", attrs={"aria-label": lambda x: x and "comment" in x}
        )
        replies = replies_element.text.strip()
        extracted.append([f"{url}##{replies}", append_replies(title, int(replies))])
    return extracted


def groupsio_fetcher(groupurl: str) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(groupurl), "html.parser")
    extracted = []
    for topic_span in soup.find_all("span", class_="subject"):
        link = topic_span.find("a")
        title = link.text.strip()
        url = link.get("href")
        reply_count_span = topic_span.find("span", class_="hashtag-position")
        reply_count = reply_count_span.text.strip() if reply_count_span else "0"
        extracted.append(
            [f"{url}##{reply_count}", append_replies(title, int(reply_count))]
        )
    return extracted


def nimforum_fetcher() -> URLAndTitleList:
    thread_data = json.loads(fetch_from_url("https://forum.nim-lang.org/threads.json"))
    extracted = []
    for thread in thread_data["threads"]:
        extracted.append(
            [
                f"https://forum.nim-lang.org/t/{thread['id']}##{thread['replies']}",
                append_replies(thread["topic"], thread["replies"]),
            ]
        )
    return extracted


### Helper functions ###


def append_replies(title: str, count: int) -> str:
    if count == 1:
        return f"{title} (1 reply)"
    return f"{title} ({count} replies)"


def load_data() -> PWRData:
    if data_file.exists():
        return json.loads(data_file.read_text())  # type: ignore
    return {
        "last_read": "Never",
        "last_fetch": "Never",
        "last_filter": "Never",
        "sources": {},
    }


def save_data(data: PWRData) -> None:
    data_file.write_text(json.dumps(data, indent=2))

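# For illustration, the data.json read and written by load_data/save_data
# above ends up with roughly the following shape (the timestamps, URLs and
# titles here are made-up examples):
#
#   {
#     "last_read": "2024-01-01 10:00:00 UTC",
#     "last_fetch": "2024-01-01 10:05:00 UTC",
#     "last_filter": "Never",
#     "sources": {
#       "Muxup": {
#         "seen": ["https://example.com/a", "https://example.com/b"],
#         "entries": [["https://example.com/b", "An example title"]]
#       }
#     }
#   }
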
def fetch_from_url(url: str, max_retries: int = 5, delay: int = 1) -> str:
    headers = {"User-Agent": "pwr - paced web reader"}
    for attempt in range(max_retries):
        try:
            print(f"Fetching {url} ...", end="", flush=True)
            response = requests.get(url, headers=headers, timeout=fetch_timeout)
            response.raise_for_status()
            print("DONE")
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"FAILED {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 4
    print("Max retries reached. Giving up.")
    sys.exit(1)


def get_time() -> str:
    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")


def print_help() -> None:
    print("Usage: pwr [action]")
    print("\nIf no action is given, cycles through read/fetch/filter in sequence.\n")
    print("Available actions:")
    print("  read    - Read previously selected/enqueued URLs")
    print("  fetch   - Retrieve new article titles for review")
    print("  filter  - Review article titles and decide which to read")
    print("  status  - Print information about current status")


def speedbump(action_name: str, data: PWRData) -> None:
    print(f"About to start pwr {action_name}, last run at: {data['last_'+action_name]}")  # type: ignore
    input(f"Press Enter to continue with pwr {action_name}")


def get_last_action(data: PWRData) -> tuple[str, str]:
    def date_str_for_op(key: str) -> str:
        val = data[key]  # type: ignore
        if val == "Never":
            return "1970-01-01 00:00:00 UTC"
        return val  # type: ignore

    recent_ops = [
        ("read", date_str_for_op("last_read")),
        ("fetch", date_str_for_op("last_fetch")),
        ("filter", date_str_for_op("last_filter")),
    ]
    last_action, last_action_datetime = max(recent_ops, key=lambda x: x[1])
    return (last_action, last_action_datetime)


def count_urls(data: PWRData) -> int:
    return sum(len(source_data["entries"]) for source_data in data["sources"].values())


### Action implementations ###


def do_read() -> None:
    data = load_data()
    last_action, _ = get_last_action(data)
    speedbump("read", data)
    if last_action != "filter":
        print(
            "WARNING: filter is not the most recent action. Did you forget to run it?"
        )
        input("Press Enter to continue anyway, or Ctrl-C to abort")
    urls = extra_urls_for_read.copy()

    for source_data in data["sources"].values():
        for url, _ in source_data["entries"]:
            url = url.split("##")[0]
            if not url.startswith(("http://", "https://")):
                print(f"Skipping url '{url}' as it doesn't have a recognised protocol")
                continue
            urls.append(url)
        source_data["entries"] = []

    print(
        f"Launching browser (in batches of {read_url_batch_size}) for {len(urls)} URLs."
    )

    for url_batch in itertools.batched(urls, read_url_batch_size):
        print(f"Opening batch of URLs with browser {preferred_browser}")
        subprocess.Popen([preferred_browser] + list(url_batch))
        if len(url_batch) == read_url_batch_size:
            input("Press Enter to continue to next batch")

    print("All URLs read, saving changes")
    data["last_read"] = get_time()
    save_data(data)
    print(f"pwr read ended successfully at {data['last_read']}")

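# Note: itertools.batched (used in do_read above) is only available on Python
# 3.12 and newer. Purely as an illustrative sketch for older interpreters (this
# helper is not used anywhere in the script), an equivalent could look like:
def _batched_compat(items: list[str], n: int) -> list[list[str]]:
    return [items[i : i + n] for i in range(0, len(items), n)]
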
def do_fetch() -> None:
    data = load_data()
    speedbump("fetch", data)
    total_filtered_extracted = 0

    for source_name, source_info in get_sources().items():
        if source_name not in data["sources"]:
            data["sources"][source_name] = {"seen": [], "entries": []}
        else:
            # Ensure serialised order of data from fetchers reflects any
            # changes made to the sources dict order.
            value = data["sources"].pop(source_name)
            data["sources"][source_name] = value

        print(f"Processing source {source_name}")
        func = source_info[0]
        extracted = func(*source_info[1:])

        saved_source_data = data["sources"][source_name]
        saved_source_data["seen"] = saved_source_data["seen"][-saved_seen_url_limit:]
        seen_set = dict.fromkeys(saved_source_data["seen"])
        filtered_extracted = []
        for url, title in extracted:
            if url in seen_set:
                # Ensure entry in seen_set is refreshed (i.e. affect ordering)
                seen_set.pop(url)
            else:
                filtered_extracted.append([url, title])
            seen_set[url] = None
        saved_source_data["seen"] = list(seen_set.keys())
        saved_source_data["entries"].extend(filtered_extracted)
        total_filtered_extracted += len(filtered_extracted)
        print(
            f"Retrieved {len(extracted)} items, {len(filtered_extracted)} remain after removing seen items"
        )

    # Delete data for any sources no longer in the sources list in this
    # script.
    sources = get_sources()
    for source_name in list(data["sources"].keys()):
        if source_name not in sources:
            del data["sources"][source_name]

    data["last_fetch"] = get_time()
    save_data(data)
    print(
        f"\nA total of {total_filtered_extracted} items were queued up for filtering."
    )
    print(f"pwr fetch ended successfully at {data['last_fetch']}")

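# The "seen" bookkeeping in do_fetch above uses a dict as an insertion-ordered
# set: popping and re-inserting a URL that shows up again moves it to the end,
# so the URLs trimmed away by saved_seen_url_limit are the least recently seen
# ones. A small worked example (with made-up keys):
#
#   seen_set = dict.fromkeys(["a", "b", "c"])  # order: a, b, c
#   seen_set.pop("b")
#   seen_set["b"] = None                       # order: a, c, b
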
def do_filter() -> None:
    data = load_data()
    speedbump("filter", data)
    num_urls_before_filtering = count_urls(data)
    wrapper = textwrap.TextWrapper(
        width=98, initial_indent="d ", subsequent_indent="  "
    )
    filter_file = data_dir / "filter.pwr"

    with filter_file.open("w") as file:
        file.write("------------------------------------------------------------\n")
        file.write(f"Filter file generated at {get_time()}\n")
        file.write("DO NOT DELETE OR MOVE ANY LINES\n")
        file.write("To mark an item for reading, replace the 'd' prefix with 'r'\n")
        file.write("Exit editor with non-zero return code (:cq in vim) to abort\n")
        file.write("------------------------------------------------------------\n\n")
        for source_name, source_data in data["sources"].items():
            if not source_data["entries"]:
                continue
            file.write(f"# {source_name}\n")
            for _, title in source_data["entries"]:
                file.write(wrapper.fill(title))
                file.write("\n")
            file.write("\n")

    result = subprocess.run([os.environ.get("EDITOR", "vim"), filter_file])
    if result.returncode != 0:
        print("Exiting early as editor returned non-zero exit code")
        print("Filtering not applied")
        sys.exit(1)

    with filter_file.open("r") as file:
        filtered_entries: URLAndTitleList = []
        cur_source_name = None
        index = 0

        for line in file:
            if line.startswith("# "):
                new_source_name = line[2:].strip()
                if new_source_name not in data["sources"]:
                    raise ValueError(
                        f"Source {new_source_name} not found in saved json"
                    )
                if cur_source_name:
                    data["sources"][cur_source_name]["entries"] = filtered_entries
                    filtered_entries = []
                    index = 0
                cur_source_name = new_source_name
            elif line.startswith("d "):
                index += 1
            elif line.startswith("r "):
                if not cur_source_name:
                    raise ValueError(
                        "Invalid input. 'r ' encountered with no preceding heading."
                    )

                filtered_entries.append(
                    data["sources"][cur_source_name]["entries"][index]
                )
                index += 1

        if cur_source_name:
            data["sources"][cur_source_name]["entries"] = filtered_entries

    num_urls_after_filtering = count_urls(data)
    print(
        f"Filtered {num_urls_before_filtering} entries down to {num_urls_after_filtering} ({num_urls_before_filtering - num_urls_after_filtering} removed)."
    )
    data["last_filter"] = get_time()
    save_data(data)
    print(f"pwr filter ended successfully at {data['last_filter']}")

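# For illustration, the filter.pwr file generated by do_filter above looks
# roughly like this after the header block (the titles are made-up examples).
# Changing a leading "d" to "r" marks that item for reading:
#
#   # HN
#   d An example article title
#   r Another example article title
#
#   # Nim forum
#   d An example forum thread (3 replies)
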
def do_status() -> None:
    data = load_data()
    last_action, last_action_datetime = get_last_action(data)
    print(f"Last operation was '{last_action}' at {last_action_datetime}")
    print(f"{count_urls(data)} items in entries database")


def do_test_fetcher(name: str, args: list[str]) -> None:
    fetcher_fn = globals().get(f"{name}_fetcher")
    if not fetcher_fn or not callable(fetcher_fn):
        print(f"Error: fetcher function '{name}' not found.")
        sys.exit(1)
    extracted = fetcher_fn(*args)
    print(f"{len(extracted)} URL+title pairs extracted:\n")
    for entry in extracted:
        print(f"URL: {entry[0]}")
        print(f"Title: {entry[1]}")
        print()


### Main ###

if __name__ == "__main__":
    data_dir.mkdir(parents=True, exist_ok=True)

    if len(sys.argv) < 2:
        data = load_data()
        last_action, _ = get_last_action(data)
        if last_action != "filter":
            print(
                f"Error: argument-less pwr flow only valid when 'filter' was last action (last action: '{last_action}')"
            )
            sys.exit(1)
        do_read()
        do_fetch()
        do_filter()
        sys.exit(0)

    action = sys.argv[1]
    if action == "read":
        do_read()
    elif action == "fetch":
        do_fetch()
    elif action == "filter":
        do_filter()
    elif action == "status":
        do_status()
    elif action == "test-fetcher":
        if len(sys.argv) < 3:
            print("Missing required fetcher name argument.")
            sys.exit(1)
        do_test_fetcher(sys.argv[2], sys.argv[3:])
    else:
        print(f"Error: Invalid action '{action}'")
        print_help()
        sys.exit(1)
--------------------------------------------------------------------------------