├── LICENSE
├── README.md
├── precommit.sh
└── pwr

/LICENSE:
--------------------------------------------------------------------------------
MIT-0 License

Copyright (c) Muxup contributors.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pwr

## Summary

pwr (paced web reader) is a script and terminal-centric workflow I use for
keeping up to date with various sources online, shared on the off chance it's
useful to you too.

See its [page on Muxup.com](https://muxup.com/pwr) for more information about
why it exists and how to use it.

## Implementation details

* Ships with an example list of sources and "fetchers" (functions that return
  extracted URLs and titles); see the sketch below. This includes examples of
  using BeautifulSoup to extract from HTML in cases where RSS feeds are not
  present or are insufficient.
* Data is stored as JSON in `$XDG_DATA_HOME/pwr` (by default
  `~/.local/share/pwr`). It assumes Python 3.6+ dictionary semantics, i.e.
  entries are maintained in insertion order.
* Fetching data from multiple sources in parallel would be an obvious
  extension. So far it simply hasn't been slow enough to make it worth
  prioritising.
* Caching of RSS feeds and similar isn't implemented. This is fine if you're
  running pwr once every day or so as intended, but if you're looking to run
  it more regularly you should add improvements in that area.
* pwr is a quick script that scratches an itch. It's definitely not the
  world's most robust or clean Python code.
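
Adding a source boils down to writing a fetcher function and registering it in
`get_sources()` in the `pwr` script. As a rough sketch (the function name, CSS
class, and URL below are made up for illustration; see the fetchers in `pwr`
itself for real, working examples):

```python
# Hypothetical fetcher: pull [url, title] pairs out of a simple HTML index page.
def example_blog_fetcher(url: str) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(url), "html.parser")
    extracted = []
    for link in soup.find_all("a", class_="post-link"):
        extracted.append([link.get("href"), link.text.strip()])
    return extracted


# ...then registered in get_sources() as:
#     "Example blog": (example_blog_fetcher, "https://blog.example.com/"),
```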
--------------------------------------------------------------------------------
/precommit.sh:
--------------------------------------------------------------------------------
#!/bin/sh

die () {
  printf "%s\n" "$*" >&2
  exit 1
}

mypy --ignore-missing-imports --strict pwr || die "mypy failed"
isort --profile black -c pwr || die "isort found issues"
black --check pwr || die "black reports pwr needs reformatting"
flake8 --max-line-length 88 --extend-ignore=E203,E266,E302,E501,W291 pwr || die "flake8 found issues"
--------------------------------------------------------------------------------
/pwr:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright Muxup contributors.
# Distributed under the terms of the MIT-0 license, see LICENSE for details.
# SPDX-License-Identifier: MIT-0

import datetime
import itertools
import json
import os
import subprocess
import sys
import textwrap
import time
from pathlib import Path
from typing import Callable, TypedDict

import requests
from bs4 import BeautifulSoup

### Type definitions ###

URLAndTitleList = list[list[str]]


class SourceData(TypedDict):
    seen: list[str]
    entries: URLAndTitleList


class PWRData(TypedDict):
    last_read: str
    last_fetch: str
    last_filter: str
    sources: dict[str, SourceData]


### Config ###

# Wrapped in a function to work around Python's lack of support for forward
# declarations of functions (such as the referenced fetcher functions)
# fmt: off
def get_sources() -> dict[str, tuple[Callable[..., URLAndTitleList], *tuple[object, ...]]]:
    return {
        "Rust Internals": (discourse_fetcher, "https://internals.rust-lang.org/"),
        "Swift Evolution": (discourse_fetcher, "https://forums.swift.org/c/evolution/18"),
        "HN": (feed_fetcher, "https://news.ycombinator.com/rss", True),
        "lobste.rs": (feed_fetcher, "https://lobste.rs/rss", True),
        "/r/programminglanguages": (feed_fetcher, "http://www.reddit.com/r/programminglanguages/.rss"),
        "/r/rust": (feed_fetcher, "http://www.reddit.com/r/rust/top.rss?t=week"),
        "Muxup": (feed_fetcher, "https://muxup.com/feed.xml"),
        "Igalia": (feed_fetcher, "https://www.igalia.com/feed.xml"),
        "cs.PL": (arxiv_fetcher, "cs.PL"),
        "cs.AR": (arxiv_fetcher, "cs.AR"),
        "Hylo discussions": (ghdiscussions_fetcher, "orgs/hylo-lang"),
        "RISC-V announcements": (groupsio_fetcher, "https://lists.riscv.org/g/tech-announce/topics"),
        "Nim forum": (nimforum_fetcher,),
    }
# fmt: on


# URLs that will be opened unconditionally when performing the 'read' action.
extra_urls_for_read = ["https://guardian.co.uk"]
# Data lives in $XDG_DATA_HOME/pwr, defaulting to ~/.local/share/pwr.
data_dir = (
    Path(os.environ.get("XDG_DATA_HOME", Path.home() / ".local" / "share")) / "pwr"
)
data_file = data_dir / "data.json"
preferred_browser = os.environ.get("BROWSER", "firefox")
saved_seen_url_limit = 250
fetch_timeout = 10
read_url_batch_size = 10


### Fetchers ###

# A fetcher is passed whatever arguments were present in the sources
# dictionary, and is responsible for returning an array of [url, title]
# arrays. The caller will take care of removing any [url, title] entries where
# the url has already been seen. A fetcher might append `##someval` to a URL
# to force it to appear fresh (where someval might be a timestamp, number of
# replies, or some other data).


def feed_fetcher(url: str, comments_as_link: bool = False) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(url), features="xml")
    entries = soup.find_all(["item", "entry"])
    extracted = []
    for entry in entries:
        title = entry.find("title").text
        link = entry.find("comments" if comments_as_link else "link")
        url = link.get("href") or link.text
        extracted.append([url, title])
    return extracted


# Custom fetcher rather than just using the RSS feed because the arXiv RSS
# feeds only include papers posted in the last day, so it's possible to miss
# papers if you don't fetch regularly enough.
def arxiv_fetcher(category: str) -> URLAndTitleList:
    url = f"https://arxiv.org/list/{category}/recent"
    soup = BeautifulSoup(fetch_from_url(url), "html.parser")
    extracted = []

    for dt in soup.find_all("dt"):
        title_tag = dt.find_next("div", class_="list-title")
        title = title_tag.text.replace("Title:", "").strip()
        abstract = dt.find_next("a", title="Abstract")
        url = "https://arxiv.org" + abstract["href"]
        extracted.append([url, title])
    return extracted


# The Discourse RSS feeds don't provide the same listing as when viewing a
# category sorted by most recently replied to, so extract it ourselves.
def discourse_fetcher(url: str) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(url), "html.parser")
    extracted = []

    topics = soup.find_all("tr", class_="topic-list-item")
    for topic in topics:
        title_tag = topic.find("a", class_="title")
        title = title_tag.text.strip()
        url = title_tag.get("href")
        replies = topic.find("span", class_="posts").text.strip()
        extracted.append([f"{url}##{replies}", append_replies(title, int(replies))])
    return extracted


def ghdiscussions_fetcher(ghpath: str) -> URLAndTitleList:
    soup = BeautifulSoup(
        fetch_from_url(f"https://github.com/{ghpath}/discussions"), "html.parser"
    )
    extracted = []

    for topic in soup.find_all("a", attrs={"data-hovercard-type": "discussion"}):
        title = topic.text.strip()
        url = f"https://github.com{topic.get('href')}"
        replies_element = topic.find_next(
            "a", attrs={"aria-label": lambda x: x and "comment" in x}
        )
        replies = replies_element.text.strip()
        extracted.append([f"{url}##{replies}", append_replies(title, int(replies))])
    return extracted


def groupsio_fetcher(groupurl: str) -> URLAndTitleList:
    soup = BeautifulSoup(fetch_from_url(groupurl), "html.parser")
    extracted = []
    for topic_span in soup.find_all("span", class_="subject"):
        link = topic_span.find("a")
        title = link.text.strip()
        url = link.get("href")
        reply_count_span = topic_span.find("span", class_="hashtag-position")
        reply_count = reply_count_span.text.strip() if reply_count_span else "0"
        extracted.append(
            [f"{url}##{reply_count}", append_replies(title, int(reply_count))]
        )
    return extracted


def nimforum_fetcher() -> URLAndTitleList:
    thread_data = json.loads(fetch_from_url("https://forum.nim-lang.org/threads.json"))
    extracted = []
    for thread in thread_data["threads"]:
        extracted.append(
            [
                f"https://forum.nim-lang.org/t/{thread['id']}##{thread['replies']}",
                append_replies(thread["topic"], thread["replies"]),
            ]
        )
    return extracted


### Helper functions ###


def append_replies(title: str, count: int) -> str:
    if count == 1:
        return f"{title} (1 reply)"
    return f"{title} ({count} replies)"


def load_data() -> PWRData:
    if data_file.exists():
        return json.loads(data_file.read_text())  # type: ignore
    return {
        "last_read": "Never",
        "last_fetch": "Never",
        "last_filter": "Never",
        "sources": {},
    }


def save_data(data: PWRData) -> None:
    data_file.write_text(json.dumps(data, indent=2))

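# For illustration, the data.json read and written by load_data/save_data
# above ends up with roughly the following shape (the timestamps, URLs and
# titles here are made-up examples):
#
#   {
#     "last_read": "2024-01-01 10:00:00 UTC",
#     "last_fetch": "2024-01-01 10:05:00 UTC",
#     "last_filter": "Never",
#     "sources": {
#       "Muxup": {
#         "seen": ["https://example.com/a", "https://example.com/b"],
#         "entries": [["https://example.com/b", "An example title"]]
#       }
#     }
#   }
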
def fetch_from_url(url: str, max_retries: int = 5, delay: int = 1) -> str:
    headers = {"User-Agent": "pwr - paced web reader"}
    for attempt in range(max_retries):
        try:
            print(f"Fetching {url} ...", end="", flush=True)
            response = requests.get(url, headers=headers, timeout=fetch_timeout)
            response.raise_for_status()
            print("DONE")
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"FAILED {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 4
    print("Max retries reached. Giving up.")
    sys.exit(1)


def get_time() -> str:
    return datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")


def print_help() -> None:
    print("Usage: pwr [action]")
    print("\nIf no action is given, cycles through read/fetch/filter in sequence.\n")
    print("Available actions:")
    print("  read    - Read previously selected/enqueued URLs")
    print("  fetch   - Retrieve new article titles for review")
    print("  filter  - Review article titles and decide which to read")
    print("  status  - Print information about current status")


def speedbump(action_name: str, data: PWRData) -> None:
    print(f"About to start pwr {action_name}, last run at: {data['last_'+action_name]}")  # type: ignore
    input(f"Press Enter to continue with pwr {action_name}")


def get_last_action(data: PWRData) -> tuple[str, str]:
    def date_str_for_op(key: str) -> str:
        val = data[key]  # type: ignore
        if val == "Never":
            return "1970-01-01 00:00:00 UTC"
        return val  # type: ignore

    recent_ops = [
        ("read", date_str_for_op("last_read")),
        ("fetch", date_str_for_op("last_fetch")),
        ("filter", date_str_for_op("last_filter")),
    ]
    last_action, last_action_datetime = max(recent_ops, key=lambda x: x[1])
    return (last_action, last_action_datetime)


def count_urls(data: PWRData) -> int:
    return sum(len(source_data["entries"]) for source_data in data["sources"].values())


### Action implementations ###


def do_read() -> None:
    data = load_data()
    last_action, _ = get_last_action(data)
    speedbump("read", data)
    if last_action != "filter":
        print(
            "WARNING: filter is not the most recent action. Did you forget to run it?"
        )
        input("Press Enter to continue anyway, or Ctrl-C to abort")
    urls = extra_urls_for_read.copy()

    for source_data in data["sources"].values():
        for url, _ in source_data["entries"]:
            url = url.split("##")[0]
            if not url.startswith(("http://", "https://")):
                print(f"Skipping url '{url}' as it doesn't have a recognised protocol")
                continue
            urls.append(url)
        source_data["entries"] = []

    print(
        f"Launching browser (in batches of {read_url_batch_size}) for {len(urls)} URLs."
    )

    for url_batch in itertools.batched(urls, read_url_batch_size):
        print(f"Opening batch of URLs with browser {preferred_browser}")
        subprocess.Popen([preferred_browser] + list(url_batch))
        if len(url_batch) == read_url_batch_size:
            input("Press Enter to continue to next batch")

    print("All URLs read, saving changes")
    data["last_read"] = get_time()
    save_data(data)
    print(f"pwr read ended successfully at {data['last_read']}")

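# Note: itertools.batched (used in do_read above) is only available on Python
# 3.12 and newer. Purely as an illustrative sketch for older interpreters (this
# helper is not used anywhere in the script), an equivalent could look like:
def _batched_compat(items: list[str], n: int) -> list[list[str]]:
    return [items[i : i + n] for i in range(0, len(items), n)]
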
def do_fetch() -> None:
    data = load_data()
    speedbump("fetch", data)
    total_filtered_extracted = 0

    for source_name, source_info in get_sources().items():
        if source_name not in data["sources"]:
            data["sources"][source_name] = {"seen": [], "entries": []}
        else:
            # Ensure serialised order of data from fetchers reflects any
            # changes made to the sources dict order.
            value = data["sources"].pop(source_name)
            data["sources"][source_name] = value

        print(f"Processing source {source_name}")
        func = source_info[0]
        extracted = func(*source_info[1:])

        saved_source_data = data["sources"][source_name]
        saved_source_data["seen"] = saved_source_data["seen"][-saved_seen_url_limit:]
        seen_set = dict.fromkeys(saved_source_data["seen"])
        filtered_extracted = []
        for url, title in extracted:
            if url in seen_set:
                # Ensure entry in seen_set is refreshed (i.e. affect ordering)
                seen_set.pop(url)
            else:
                filtered_extracted.append([url, title])
            seen_set[url] = None
        saved_source_data["seen"] = list(seen_set.keys())
        saved_source_data["entries"].extend(filtered_extracted)
        total_filtered_extracted += len(filtered_extracted)
        print(
            f"Retrieved {len(extracted)} items, {len(filtered_extracted)} remain after removing seen items"
        )

    # Delete data for any sources no longer in the sources list in this
    # script.
    sources = get_sources()
    for source_name in list(data["sources"].keys()):
        if source_name not in sources:
            del data["sources"][source_name]

    data["last_fetch"] = get_time()
    save_data(data)
    print(
        f"\nA total of {total_filtered_extracted} items were queued up for filtering."
    )
    print(f"pwr fetch ended successfully at {data['last_fetch']}")

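# The "seen" bookkeeping in do_fetch above uses a dict as an insertion-ordered
# set: popping and re-inserting a URL that shows up again moves it to the end,
# so the URLs trimmed away by saved_seen_url_limit are the least recently seen
# ones. A small worked example (with made-up keys):
#
#   seen_set = dict.fromkeys(["a", "b", "c"])  # order: a, b, c
#   seen_set.pop("b")
#   seen_set["b"] = None                       # order: a, c, b
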
def do_filter() -> None:
    data = load_data()
    speedbump("filter", data)
    num_urls_before_filtering = count_urls(data)
    wrapper = textwrap.TextWrapper(
        width=98, initial_indent="d ", subsequent_indent="  "
    )
    filter_file = data_dir / "filter.pwr"

    with filter_file.open("w") as file:
        file.write("------------------------------------------------------------\n")
        file.write(f"Filter file generated at {get_time()}\n")
        file.write("DO NOT DELETE OR MOVE ANY LINES\n")
        file.write("To mark an item for reading, replace the 'd' prefix with 'r'\n")
        file.write("Exit editor with non-zero return code (:cq in vim) to abort\n")
        file.write("------------------------------------------------------------\n\n")
        for source_name, source_data in data["sources"].items():
            if not source_data["entries"]:
                continue
            file.write(f"# {source_name}\n")
            for _, title in source_data["entries"]:
                file.write(wrapper.fill(title))
                file.write("\n")
            file.write("\n")

    result = subprocess.run([os.environ.get("EDITOR", "vim"), filter_file])
    if result.returncode != 0:
        print("Exiting early as editor returned non-zero exit code")
        print("Filtering not applied")
        sys.exit(1)

    with filter_file.open("r") as file:
        filtered_entries: URLAndTitleList = []
        cur_source_name = None
        index = 0

        for line in file:
            if line.startswith("# "):
                new_source_name = line[2:].strip()
                if new_source_name not in data["sources"]:
                    raise ValueError(
                        f"Source {new_source_name} not found in saved json"
                    )
                if cur_source_name:
                    data["sources"][cur_source_name]["entries"] = filtered_entries
                    filtered_entries = []
                    index = 0
                cur_source_name = new_source_name
            elif line.startswith("d "):
                index += 1
            elif line.startswith("r "):
                if not cur_source_name:
                    raise ValueError(
                        "Invalid input. 'r ' encountered with no preceding heading."
                    )

                filtered_entries.append(
                    data["sources"][cur_source_name]["entries"][index]
                )
                index += 1

        if cur_source_name:
            data["sources"][cur_source_name]["entries"] = filtered_entries

    num_urls_after_filtering = count_urls(data)
    print(
        f"Filtered {num_urls_before_filtering} entries down to {num_urls_after_filtering} ({num_urls_before_filtering - num_urls_after_filtering} removed)."
    )
    data["last_filter"] = get_time()
    save_data(data)
    print(f"pwr filter ended successfully at {data['last_filter']}")

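# For illustration, the filter.pwr file generated by do_filter above looks
# roughly like this after the header block (the titles are made-up examples).
# Changing a leading "d" to "r" marks that item for reading:
#
#   # HN
#   d An example article title
#   r Another example article title
#
#   # Nim forum
#   d An example forum thread (3 replies)
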
def do_status() -> None:
    data = load_data()
    last_action, last_action_datetime = get_last_action(data)
    print(f"Last operation was '{last_action}' at {last_action_datetime}")
    print(f"{count_urls(data)} items in entries database")


def do_test_fetcher(name: str, args: list[str]) -> None:
    fetcher_fn = globals().get(f"{name}_fetcher")
    if not fetcher_fn or not callable(fetcher_fn):
        print(f"Error: fetcher function '{name}' not found.")
        sys.exit(1)
    extracted = fetcher_fn(*args)
    print(f"{len(extracted)} URL+title pairs extracted:\n")
    for entry in extracted:
        print(f"URL: {entry[0]}")
        print(f"Title: {entry[1]}")
        print()


### Main ###

if __name__ == "__main__":
    data_dir.mkdir(parents=True, exist_ok=True)

    if len(sys.argv) < 2:
        data = load_data()
        last_action, _ = get_last_action(data)
        if last_action != "filter":
            print(
                f"Error: argument-less pwr flow only valid when 'filter' was last action (last action: '{last_action}')"
            )
            sys.exit(1)
        do_read()
        do_fetch()
        do_filter()
        sys.exit(0)

    action = sys.argv[1]
    if action == "read":
        do_read()
    elif action == "fetch":
        do_fetch()
    elif action == "filter":
        do_filter()
    elif action == "status":
        do_status()
    elif action == "test-fetcher":
        if len(sys.argv) < 3:
            print("Missing required fetcher name argument.")
            sys.exit(1)
        do_test_fetcher(sys.argv[2], sys.argv[3:])
    else:
        print(f"Error: Invalid action '{action}'")
        print_help()
        sys.exit(1)
--------------------------------------------------------------------------------