.... wtf
61 | img_h1_captions = soup.select(".imgboxa h1")
62 | for h1 in img_h1_captions:
63 | h1.name = "p"
64 |
65 | return str(soup)
66 |
--------------------------------------------------------------------------------
/tests/tests_recipe_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from calendar import monthrange
3 | from datetime import timedelta
4 |
5 | from _recipe_utils import (
6 | get_local_now,
7 | onlyon_weekdays,
8 | onlyon_days,
9 | onlyat_hours,
10 | every_x_days,
11 | every_x_hours,
12 | last_n_days_of_month,
13 | first_n_days_of_month,
14 | )
15 |
16 |
class RecipeUtilsTests(unittest.TestCase):
    # Each test derives its inputs from the current local time, so the
    # expectations hold regardless of when the suite is run.

    def test_onlyon_weekdays(self):
        """The full week always matches; the week minus today never does."""
        today = get_local_now().weekday()
        every_weekday = list(range(7))
        self.assertTrue(onlyon_weekdays(every_weekday))

        all_but_today = [d for d in every_weekday if d != today]
        self.assertFalse(onlyon_weekdays(all_but_today))

    def test_onlyon_days(self):
        """Days 1..31 always match; the same list minus today never does."""
        today = get_local_now().day
        every_day = list(range(1, 32))
        self.assertTrue(onlyon_days(every_day))

        all_but_today = [d for d in every_day if d != today]
        self.assertFalse(onlyon_days(all_but_today))

    def test_onlyat_hours(self):
        """Hours 0..23 always match; the same list minus this hour never does."""
        this_hour = get_local_now().hour
        every_hour = list(range(24))
        self.assertTrue(onlyat_hours(every_hour))

        all_but_now = [h for h in every_hour if h != this_hour]
        self.assertFalse(onlyat_hours(all_but_now))

    def test_every_x_days(self):
        """A daily schedule is due after 1 day, not after half a day,
        and after 0.75 day when a quarter-day drift is passed."""
        a_day_ago = get_local_now() - timedelta(days=1)
        self.assertTrue(every_x_days(a_day_ago.timestamp(), 1))

        half_a_day_ago = get_local_now() - timedelta(days=0.5)
        self.assertFalse(every_x_days(half_a_day_ago.timestamp(), 1))

        three_quarters_ago = get_local_now() - timedelta(days=0.75)
        quarter_day_drift = 0.25 * 24 * 60
        self.assertTrue(
            every_x_days(three_quarters_ago.timestamp(), 1, drift=quarter_day_drift)
        )

    def test_every_x_hours(self):
        """An hourly schedule is due after 1 hour, not after half an hour,
        and after 0.75 hour when a quarter-hour drift is passed."""
        an_hour_ago = get_local_now() - timedelta(hours=1)
        self.assertTrue(every_x_hours(an_hour_ago.timestamp(), 1))

        half_an_hour_ago = get_local_now() - timedelta(hours=0.5)
        self.assertFalse(every_x_hours(half_an_hour_ago.timestamp(), 1))

        three_quarters_ago = get_local_now() - timedelta(hours=0.75)
        quarter_hour_drift = 0.25 * 60
        self.assertTrue(
            every_x_hours(three_quarters_ago.timestamp(), 1, drift=quarter_hour_drift)
        )

    def test_last_n_days_of_month(self):
        """Today is within the last (days_left + 1) days but not the last days_left."""
        now = get_local_now()
        month_end = monthrange(now.year, now.month)[1]
        days_left = month_end - now.day
        self.assertTrue(last_n_days_of_month(days_left + 1))
        self.assertFalse(last_n_days_of_month(days_left))

    def test_first_n_days_of_month(self):
        """Today is within the first `day` days but not the first `day - 1`."""
        day_of_month = get_local_now().day
        self.assertTrue(first_n_days_of_month(day_of_month))
        self.assertFalse(first_n_days_of_month(day_of_month - 1))
72 |
--------------------------------------------------------------------------------
/static/opds.scss:
--------------------------------------------------------------------------------
1 | @import 'colours';
2 |
// Page shell: a single column capped at 600px and centred by the auto
// horizontal margins, coloured from the base palette variables.
body {
  max-width: 600px;
  margin: 1rem auto;
  padding: 0 1rem;
  font-family: system-ui, sans-serif;
  background-color: $base-bg-color;
  color: $base-color;
}
11 |
// Links: no underline by default, underlined on hover.
// `:visited` is declared BEFORE `:hover` (LVHA order). Both selectors have
// the same specificity, so whichever comes last wins; with `:visited` last,
// a visited link would never show the hover colour.
a {
  color: $link-color;
  text-decoration: none;

  &:visited {
    color: $link-visited-color;
  }

  &:hover {
    color: $link-hover-color;
    text-decoration: underline;
  }
}
25 |
// Notice banner: inverted colours (link colour as background, page
// background as text colour) with a lighter accent bar on the left.
.notice {
  padding: 1rem;
  color: $base-bg-color;
  background-color: $link-color;
  border-left: 4px solid lighten($link-color, 20%);

  a {
    color: $base-bg-color;

    // Prefix links with U+2190 (leftwards arrow). The pseudo-element is an
    // inline-block with its own `text-decoration: none`.
    &:before {
      content: "\2190";
      display: inline-block;
      margin-right: 0.2rem;
      text-decoration: none;
    }
  }
}
43 |
// Entry list: indented list whose direct items are spaced apart.
ul.entries {
  padding-left: 1.5rem;
}

ul.entries > li {
  margin-bottom: 1.5rem;
}
51 |
// Entry heading text: bold and larger than body copy.
.item-header {
  font-weight: bold;
  font-size: 1.3rem;
}
56 |
// Small meta line for an entry.
.item-updated {
  margin: 0.4rem 0;
  font-size: 0.8rem;
}

// `.cat` elements inside the meta line render as outlined pills.
.item-updated .cat {
  margin-right: 0.4rem;
  padding: 0.1rem 0.6rem;
  border: 1px solid $base-color;
  border-radius: 1rem;
  color: $base-color;
}
69 |
// Download links: each `a.book` is a bordered, centred box with a minimum width.
.downloads {
  margin-top: 0.6rem;

  a.book {
    display: inline-block;
    min-width: 6rem;
    margin-right: 0.6rem;
    padding: 0.2rem 0.4rem;
    text-align: center;
    background-color: $book-bg-color;
    border: 1px solid $base-disabled-color;
    border-radius: 0.2rem;
  }
}
84 |
// Dark theme overrides. These apply to descendants of any element carrying
// data-theme="dark" and only swap colours; layout comes from the base rules.
[data-theme="dark"] {
  body {
    color: $dark-base-color;
    background-color: $dark-base-bg-color;
  }

  a {
    color: $dark-link-color;

    // `:visited` before `:hover` (LVHA order): the two selectors have equal
    // specificity, so the later one wins. With `:visited` last, a visited
    // link would never show the hover colour.
    &:visited {
      color: $dark-link-visited-color;
    }

    &:hover {
      color: $dark-link-hover-color;
    }
  }

  .notice {
    background-color: $dark-link-color;
    border-left-color: darken($dark-link-color, 20%);
    color: $dark-base-bg-color;

    a {
      color: $dark-base-bg-color;
    }
  }

  .item-updated {

    .cat {
      color: $dark-base-color;
      border-color: $dark-base-color;
    }
  }

  .downloads {

    a.book {
      border-color: $dark-base-disabled-color;
      background-color: $dark-book-bg-color;
    }
  }
}
129 |
130 | @import 'opds_custom';
131 |
--------------------------------------------------------------------------------
/recipes/nytimes-books.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | import os
7 | import sys
8 |
9 | # custom include to share code between recipes
10 | sys.path.append(os.environ["recipes_includes"])
11 | from recipes_shared import BasicNewsrackRecipe, format_title
12 | from nyt import NYTRecipe
13 |
14 | from calibre.web.feeds.news import BasicNewsRecipe
15 |
16 | _name = "New York Times Books"
17 |
18 |
class NYTimesBooks(NYTRecipe, BasicNewsrackRecipe, BasicNewsRecipe):
    # Recipe for the New York Times "Books" RSS feed.
    # Article markup is trimmed to the element with id="story", and
    # mastheads, sponsor wrappers, toolbars and scripts are removed.
    title = _name
    language = "en"
    description = (
        "The latest book reviews, best sellers, news and features from "
        "The NY Times critics and reporters. https://www.nytimes.com/section/books"
    )
    __author__ = "ping"
    publication_type = "newspaper"
    oldest_article = 7  # days
    max_articles_per_feed = 25

    remove_attributes = ["style", "font"]
    remove_tags_before = [dict(id="story")]
    remove_tags_after = [dict(id="story")]
    remove_tags = [
        dict(
            id=["in-story-masthead", "sponsor-wrapper", "top-wrapper", "bottom-wrapper"]
        ),
        dict(
            class_=[
                "NYTAppHideMasthead",
                "css-170u9t6",  # book affiliate links
            ]
        ),
        dict(role=["toolbar", "navigation"]),
        dict(name=["script", "noscript", "style"]),
    ]

    extra_css = """
    time > span { margin-right: 0.5rem; }
    [data-testid="photoviewer-children"] span {
        font-size: 0.8rem;
    }

    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .sub-headline { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    .article-meta { margin-bottom: 1rem; }
    .author { font-weight: bold; color: #444; display: inline-block; }
    .published-dt { margin-left: 0.5rem; }
    .article-img { margin-bottom: 0.8rem; max-width: 100%; }
    .article-img img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box; }
    .article-img .caption { font-size: 0.8rem; }
    div.summary { font-size: 1.2rem; margin: 1rem 0; }
    """

    feeds = [
        ("NYTimes Books", "https://rss.nytimes.com/services/xml/rss/nyt/Books.xml"),
    ]

    def populate_article_metadata(self, article, __, _):
        """Track the newest article time and retitle the publication from it.

        When ``self.pub_date`` is unset, or ``article.utctime`` is later than
        it, ``self.pub_date`` is updated and ``self.title`` is rebuilt with
        ``format_title``. The second and third arguments are unused.
        """
        if (not self.pub_date) or article.utctime > self.pub_date:
            self.pub_date = article.utctime
            self.title = format_title(_name, article.utctime)

    def parse_feeds(self):
        """Return the feeds as produced by ``self.group_feeds_by_date()``.

        NOTE(review): ``group_feeds_by_date`` is not defined in this class;
        presumably it is inherited from one of the base classes - confirm.
        """
        return self.group_feeds_by_date()
78 |
--------------------------------------------------------------------------------
/static/theme.js:
--------------------------------------------------------------------------------
1 | /*!
2 | * Color mode toggler for Bootstrap's docs (https://getbootstrap.com/)
3 | * Copyright 2011-2023 The Bootstrap Authors
4 | * Licensed under the Creative Commons Attribution 3.0 Unported License.
5 | */
6 |
(() => {
  'use strict'

  // Key codes allowed to toggle the theme from the keyboard.
  // Only Enter (13) is enabled; Space would be 32 but is not listed.
  const supportedKeyCodes = [13];

  // The user's explicit choice is persisted in localStorage under 'theme'.
  const getStoredTheme = () => localStorage.getItem('theme');
  const setStoredTheme = theme => localStorage.setItem('theme', theme);

  // Stored choice wins; otherwise follow the OS colour-scheme preference.
  const getPreferredTheme = () => {
    const storedTheme = getStoredTheme();
    if (storedTheme) {
      return storedTheme;
    }
    return window.matchMedia('(prefers-color-scheme: dark)').matches ? 'dark' : 'light';
  };

  // The active theme is the `data-theme` attribute on the root element.
  const getCurrTheme = () => {
    return document.documentElement.getAttribute('data-theme');
  };

  // Apply a theme to the root element; 'auto' resolves to 'dark' when the
  // OS prefers dark, any other value is written as-is.
  const setTheme = theme => {
    if (theme === 'auto' && window.matchMedia('(prefers-color-scheme: dark)').matches) {
      document.documentElement.setAttribute('data-theme', 'dark');
    } else {
      document.documentElement.setAttribute('data-theme', theme);
    }
  };

  // Applied immediately (before DOMContentLoaded) so the attribute is set
  // as early as this script runs.
  setTheme(getPreferredTheme());

  // Point the toggle icon at the sprite for the theme the user would switch
  // TO: 'light' icon while dark, 'dark' icon while light, 'auto' otherwise.
  // No-op when the icon is not present on the page.
  const showActiveTheme = (theme) => {
    const themeIcon = document.querySelector('#toggle-theme-icon use');
    if (!themeIcon) {
      return
    }
    let icon = 'auto';
    if (theme === 'dark') { icon = 'light'; }
    if (theme === 'light') { icon = 'dark'; }
    themeIcon.setAttribute('href', `reader_sprites.svg#icon-theme-${icon}`);
  }

  // Follow OS preference changes only while the user has not stored an
  // explicit light/dark choice.
  window.matchMedia('(prefers-color-scheme: dark)').addEventListener('change', () => {
    const storedTheme = getStoredTheme();
    if (storedTheme !== 'light' && storedTheme !== 'dark') {
      setTheme(getPreferredTheme());
    }
  });

  // Click / keyup handler: flip between dark and light, persist the choice,
  // then spin the icon and swap its sprite once the animation ends.
  const toggleTheme = (e) => {
    if (e.type === "keyup" && supportedKeyCodes.indexOf(e.keyCode || e.which) < 0) { // not enter key
      return;
    }
    const newTheme = getCurrTheme() === 'dark' ? 'light' : 'dark';
    setStoredTheme(newTheme);
    setTheme(newTheme);

    const themeSvg = document.getElementById('toggle-theme-icon');
    if (!themeSvg) {
      // The theme itself is already applied; there is just no icon to animate.
      return;
    }
    // `once: true` removes the listener after it fires, so repeated toggles
    // do not accumulate stale handlers on the icon.
    themeSvg.addEventListener("animationend", (evt) => {
      evt.target.classList.remove("spin-it");
      showActiveTheme(newTheme);
    }, { once: true });
    themeSvg.classList.add("spin-it");
  };

  // Wire up the toggle control once the DOM is available.
  window.addEventListener('DOMContentLoaded', () => {
    showActiveTheme(getPreferredTheme());

    const themeToggler = document.getElementById("toggle-theme");
    if (themeToggler) {
      themeToggler.addEventListener("click", toggleTheme);
      themeToggler.addEventListener("keyup", toggleTheme);
    }
  });
})();
--------------------------------------------------------------------------------
/static/index.html:
--------------------------------------------------------------------------------
1 |
7 |
8 |
9 |
10 | "
12 |
13 | import os
14 | import sys
15 |
16 | # custom include to share code between recipes
17 | sys.path.append(os.environ["recipes_includes"])
18 | from recipes_shared import BasicNewsrackRecipe, format_title
19 |
20 | from calibre.web.feeds.news import BasicNewsRecipe
21 |
22 | _name = "Asian Review of Books"
23 |
24 |
class AsianReviewOfBooks(BasicNewsrackRecipe, BasicNewsRecipe):
    # Recipe for the Asian Review of Books article feed. Only the <main>
    # element of each page is kept; sharing/related-post widgets are removed.
    title = _name
    __author__ = "Darko Miletic"
    description = "In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication. https://asianreviewofbooks.com/"  # noqa
    publisher = "The Asian Review of Books"
    category = "literature, books, reviews, Asia"
    language = "en"
    publication_type = "magazine"
    masthead_url = "https://i2.wp.com/asianreviewofbooks.com/content/wp-content/uploads/2016/09/ARBwidelogo.png"

    oldest_article = 30
    max_articles_per_feed = 30

    conversion_options = {
        "comment": description,
        "tags": category,
        "publisher": publisher,
        "language": language,
    }

    remove_attributes = ["width", "height"]
    keep_only_tags = [
        dict(name="main"),
    ]
    remove_tags = [
        dict(class_=["entry-meta", "sharedaddy", "jp-relatedposts", "entry-footer"])
    ]

    extra_css = """
    blockquote { font-size: 1.2rem; margin-left: 0; font-style: italic; }
    .wp-caption-text, .entry-featured__caption { display: block; font-size: 0.8rem; margin-top: 0.2rem; }
    """

    feeds = [("Articles", "http://asianreviewofbooks.com/content/feed/")]

    def populate_article_metadata(self, article, soup, _):
        """Keep ``self.pub_date`` at the newest article time and retitle.

        ``self.title`` is rebuilt from ``self.pub_date`` on every call,
        whether or not the date changed. ``soup`` and the last argument are
        unused.
        """
        if not self.pub_date or self.pub_date < article.utctime:
            self.pub_date = article.utctime
        self.title = format_title(_name, self.pub_date)

    def preprocess_html(self, soup):
        """Clean up article markup in place and return the same ``soup``.

        - removes paragraphs that are truly empty (no text and no image)
        - renames ``<h5>`` to ``<blockquote>`` and ``<h6>`` to ``<div>``
        """
        for p in soup.find_all("p"):
            # A paragraph holding only an <img> has no text but is not empty;
            # dropping it would silently remove the article's images.
            if not p.text.strip() and not p.find("img"):
                p.decompose()

        # NOTE(review): the variable names in the original suggest the site
        # uses <h5> for pull quotes and <h6> for bios - confirm against the site.
        for q in soup.find_all("h5"):
            q.name = "blockquote"

        for b in soup.find_all("h6"):
            b.name = "div"

        return soup
81 |
--------------------------------------------------------------------------------
/recipes/mollywhite-newsletter.recipe.py:
--------------------------------------------------------------------------------
1 | """
2 | newsletter.mollywhite.net
3 | """
4 | import os
5 | import sys
6 | from datetime import timezone, timedelta
7 |
8 | # custom include to share code between recipes
9 | sys.path.append(os.environ["recipes_includes"])
10 | from recipes_shared import BasicNewsrackRecipe, format_title, get_date_format
11 |
12 | from calibre.web.feeds.news import BasicNewsRecipe
13 |
14 | _name = "Molly White"
15 |
16 |
17 | class MollyWhiteNewsletter(BasicNewsrackRecipe, BasicNewsRecipe):
18 | title = _name
19 | description = "Keep up with the happenings in the tech world without all the boosterism. Cryptocurrency critic, technology researcher, and software engineer Molly White publishes a weekly explainer of the latest news and developments in the cryptocurrency industry, with summaries of the latest disasters featured on her well-known project Web3 is Going Just Great. https://newsletter.mollywhite.net/"
20 | language = "en"
21 | __author__ = "ping"
22 | publication_type = "blog"
23 | use_embedded_content = True
24 | auto_cleanup = False
25 |
26 | oldest_article = 30 # days
27 | max_articles_per_feed = 30
28 |
29 | keep_only_tags = [dict(name="article")]
30 | remove_tags = [dict(class_=["subscription-widget-wrap", "image-link-expand"])]
31 | remove_attributes = ["width"]
32 |
33 | extra_css = """
34 | .article-meta { margin-top: 1rem; margin-bottom: 1rem; }
35 | .article-meta .author { font-weight: bold; color: #444; margin-right: 0.5rem; }
36 | .captioned-image-container img {
37 | display: block;
38 | max-width: 100%;
39 | height: auto;
40 | box-sizing: border-box;
41 | }
42 | .captioned-image-container .image-caption { font-size: 0.8rem; margin-top: 0.2rem; }
43 | blockquote { font-size: 1.25rem; margin-left: 0; text-align: center; }
44 | blockquote p { margin: 0.4rem 0; }
45 |
46 | .footnote { color: dimgray; }
47 | .footnote .footnote-content p { margin-top: 0; }
48 | """
49 |
50 | feeds = [
51 | (_name, "https://newsletter.mollywhite.net/feed"),
52 | ]
53 |
def populate_article_metadata(self, article, __, _):
    """Advance ``self.pub_date`` to the newest article and retitle from it.

    Does nothing when ``self.pub_date`` is already set and is not older
    than ``article.utctime``. The second and third arguments are unused.
    """
    is_newer = (not self.pub_date) or article.utctime > self.pub_date
    if not is_newer:
        return
    self.pub_date = article.utctime
    self.title = format_title(_name, article.utctime)
58 |
59 | def parse_feeds(self):
60 | timezone_offset_hours = -6
61 | feeds = self.group_feeds_by_date(timezone_offset_hours=timezone_offset_hours)
62 | for feed in feeds:
63 | for article in feed.articles:
64 | # inject title and pub date
65 | date_published = article.utctime.replace(tzinfo=timezone.utc)
66 | date_published_loc = date_published.astimezone(
67 | timezone(offset=timedelta(hours=timezone_offset_hours))
68 | )
69 | article_soup = self.soup(
70 | f'{article.title}
'
71 | f'{article.author}'
72 | f'{date_published_loc:{get_date_format()}}'
73 | f"
{article.content}
"
74 | )
75 | article.content = str(article_soup)
76 | return feeds
77 |
--------------------------------------------------------------------------------
/recipes/channelnewsasia.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | channelnewsasia.com
8 | """
9 | import os
10 | import sys
11 |
12 | # custom include to share code between recipes
13 | sys.path.append(os.environ["recipes_includes"])
14 | from recipes_shared import BasicNewsrackRecipe, format_title
15 |
16 | from calibre.web.feeds.news import BasicNewsRecipe
17 |
18 | _name = "ChannelNewsAsia"
19 |
20 |
class ChannelNewsAsia(BasicNewsrackRecipe, BasicNewsRecipe):
    # Recipe for the ChannelNewsAsia RSS feeds (Singapore, Asia, Business).

    # --- publication metadata ---
    title = _name
    __author__ = "ping"
    description = "CNA: Breaking News, Singapore News, World and Asia https://www.channelnewsasia.com/"
    publisher = "Mediacorp"
    category = "news, Singapore"
    publication_type = "newspaper"
    language = "en"
    masthead_url = "https://www.channelnewsasia.com/sites/default/themes/mc_cna_theme/images/logo.png"

    # --- feed limits ---
    oldest_article = 1
    max_articles_per_feed = 25

    # --- markup clean-up ---
    remove_tags_before = [{"class_": ["h1--page-title"]}]
    remove_tags_after = [{"class_": ["content"]}]
    remove_attributes = ["style"]
    remove_tags = [
        {
            "class_": [
                "js-popup-content",
                "referenced-card",
                "block--related-topics",
                "block-ad-entity",
                "block-block-content",
                "from-library",
                "block-field-blocknodearticlefield-author",  # author bio
                "mobile_author_card",  # author bio
                "block-field-blocknodearticlefield-text-to-speech",  # article AI audio
            ]
        },
        {"name": "div", "attrs": {"data-ad-entity": True}},
        {"name": "div", "attrs": {"data-js-options": True}},
        {"name": ["script", "noscript", "style", "svg"]},
    ]

    extra_css = """
    .figure__caption { font-size: 0.8rem; }
    .figure__caption p { margin-top: 0.2rem; margin-bottom: 1rem; }
    """

    # Disabled sections are kept as comments so they can be switched back on.
    feeds = [
        # (
        #     "Latest News",
        #     "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml",
        # ),
        (
            "Singapore",
            "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=10416",
        ),
        (
            "Asia",
            "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6511",
        ),
        (
            "Business",
            "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6936",
        ),
        # (
        #     "Sport",
        #     "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=10296",
        # ),
        # (
        #     "World",
        #     "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6311",
        # ),
    ]

    def populate_article_metadata(self, article, __, _):
        """Advance ``self.pub_date`` to the newest article and retitle from it.

        Nothing changes when ``self.pub_date`` is already set and is not
        older than ``article.utctime``. The last two arguments are unused.
        """
        is_newer = (not self.pub_date) or article.utctime > self.pub_date
        if not is_newer:
            return
        self.pub_date = article.utctime
        self.title = format_title(_name, article.utctime)
92 |
--------------------------------------------------------------------------------
/recipes/korea-herald.recipe.py:
--------------------------------------------------------------------------------
1 | """
2 | koreaherald.com
3 | """
4 | __license__ = "GPL v3"
5 | __copyright__ = "2011, Seongkyoun Yoo "
6 |
7 | import os
8 | import re
9 | import sys
10 |
11 | # custom include to share code between recipes
12 | sys.path.append(os.environ["recipes_includes"])
13 | from recipes_shared import BasicNewsrackRecipe, format_title
14 |
15 | from calibre.web.feeds.news import BasicNewsRecipe
16 |
17 | _name = "Korea Herald"
18 |
19 |
20 | class KoreaHerald(BasicNewsrackRecipe, BasicNewsRecipe):
21 | title = _name
22 | language = "en"
23 | description = "Korea Herald News articles https://koreaherald.com/"
24 | __author__ = "Seongkyoun Yoo"
25 | publication_type = "newspaper"
26 | masthead_url = "https://res.heraldm.com/new_201209/images/common/logo.gif"
27 |
28 | oldest_article = 1
29 | max_articles_per_feed = 25
30 |
31 | keep_only_tags = [dict(class_="news_content")]
32 | remove_attributes = ["style", "align"]
33 | remove_tags = [
34 | dict(name=["script", "style"]),
35 | dict(class_=["news_btn_wrap", "news_journalist_area"]),
36 | ]
37 |
38 | extra_css = """
39 | h1.news_title { font-size: 1.8rem; margin-bottom: 0.4rem; }
40 | h2.news_title { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.8rem; }
41 | p.news_date { margin-top: 0.2rem; }
42 | .img_caption { font-size: 0.8rem; margin-top: 0.2rem; display: block; }
43 | """
44 |
45 | feeds = [
46 | ("National", "http://www.koreaherald.com/common/rss_xml.php?ct=102"),
47 | ("Business", "http://www.koreaherald.com/common/rss_xml.php?ct=103"),
48 | ("Finance", "http://www.koreaherald.com/common/rss_xml.php?ct=305"),
49 | ("Life & Style", "http://www.koreaherald.com/common/rss_xml.php?ct=104"),
50 | ("Entertainment", "http://www.koreaherald.com/common/rss_xml.php?ct=105"),
51 | # ("Sports", "http://www.koreaherald.com/common/rss_xml.php?ct=106"),
52 | ("World", "http://www.koreaherald.com/common/rss_xml.php?ct=107"),
53 | ("Opinion", "http://www.koreaherald.com/common/rss_xml.php?ct=108"),
54 | ]
55 |
def populate_article_metadata(self, article, __, _):
    """Record the newest article time seen and rebuild the title from it.

    Returns early when ``self.pub_date`` is already set and
    ``article.utctime`` is not later than it. The second and third
    arguments are unused.
    """
    has_date = bool(self.pub_date)
    if has_date and not article.utctime > self.pub_date:
        return
    self.pub_date = article.utctime
    self.title = format_title(_name, article.utctime)
60 |
61 | def preprocess_html(self, soup):
62 | byline_date = soup.find(attrs={"class": "view_tit_byline_r"})
63 | if byline_date:
64 | # format the published/updated date properly
65 | date_elements = []
66 | # Published : Apr 18, 2022 - 16:41 Updated : Apr 18, 2022 - 16:41
67 | date_re = r"(Published|Updated).+?\:.+?(?P[a-z]{3}\s\d+),.+?(?P