"
12 |
13 | import os
14 | import sys
15 |
16 | # custom include to share code between recipes
17 | sys.path.append(os.environ["recipes_includes"])
18 | from recipes_shared import BasicNewsrackRecipe, format_title
19 |
20 | from calibre.web.feeds.news import BasicNewsRecipe
21 |
22 | _name = "Asian Review of Books"
23 |
24 |
class AsianReviewOfBooks(BasicNewsrackRecipe, BasicNewsRecipe):
    """Fetch the Asian Review of Books RSS feed and normalise article markup."""

    title = _name
    __author__ = "Darko Miletic"
    description = "In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication. https://asianreviewofbooks.com/"  # noqa
    publisher = "The Asian Review of Books"
    category = "literature, books, reviews, Asia"
    language = "en"
    publication_type = "magazine"
    masthead_url = "https://i2.wp.com/asianreviewofbooks.com/content/wp-content/uploads/2016/09/ARBwidelogo.png"

    oldest_article = 30
    max_articles_per_feed = 30

    conversion_options = {
        "comment": description,
        "tags": category,
        "publisher": publisher,
        "language": language,
    }

    remove_attributes = ["width", "height"]
    keep_only_tags = [
        dict(name="main"),
    ]
    remove_tags = [
        dict(class_=["entry-meta", "sharedaddy", "jp-relatedposts", "entry-footer"])
    ]

    extra_css = """
    blockquote { font-size: 1.2rem; margin-left: 0; font-style: italic; }
    .wp-caption-text, .entry-featured__caption { display: block; font-size: 0.8rem; margin-top: 0.2rem; }
    """

    feeds = [("Articles", "http://asianreviewofbooks.com/content/feed/")]

    def populate_article_metadata(self, article, soup, _):
        # Track the newest article timestamp seen so far and mirror it
        # into the ebook title.
        if not self.pub_date or self.pub_date < article.utctime:
            self.pub_date = article.utctime
            self.title = format_title(_name, self.pub_date)

    def preprocess_html(self, soup):
        # Remove paragraphs with no visible text.
        for para in soup.find_all("p"):
            if not para.text.strip():
                para.decompose()
        # The site marks pull quotes up as <h5> and author bios as <h6>;
        # remap them to the tags styled by extra_css.
        for old_name, new_name in (("h5", "blockquote"), ("h6", "div")):
            for ele in soup.find_all(old_name):
                ele.name = new_name
        return soup
81 |
--------------------------------------------------------------------------------
/recipes/bookforum-magazine.recipe.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from urllib.parse import urljoin
4 |
5 | # custom include to share code between recipes
6 | sys.path.append(os.environ["recipes_includes"])
7 | from recipes_shared import BasicNewsrackRecipe
8 |
9 | from mechanize import Request
10 | from calibre.web.feeds.news import BasicNewsRecipe
11 |
12 | _name = "Bookforum"
13 | _issue_url = ""
14 |
15 |
class BookforumMagazine(BasicNewsrackRecipe, BasicNewsRecipe):
    """Build an issue of Bookforum from the print table-of-contents page.

    parse_index() scrapes https://www.bookforum.com/print (or a pinned
    issue via the module-level ``_issue_url``) instead of using a feed.
    """

    title = _name
    description = (
        "Bookforum is an American book review magazine devoted to books and "
        "the discussion of literature. https://www.bookforum.com/print"
    )
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    compress_news_images_auto_size = 8

    keep_only_tags = [dict(class_="blog-article")]
    remove_tags = [dict(name=["af-share-toggle", "af-related-articles"])]

    extra_css = """
    .blog-article__header { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .blog-article__subtitle { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    .blog-article__writer { font-size: 1rem; font-weight: bold; color: #444; }
    .blog-article__book-info { margin: 1rem 0; }
    .article-image-container img, .blog-article__publication-media img {
        display: block; max-width: 100%; height: auto;
    }
    .blog-article__caption { font-size: 0.8rem; display: block; margin-top: 0.2rem; }
    """

    def preprocess_html(self, soup):
        # strip away links that's not needed
        for ele in soup.select(".blog-article__header a"):
            ele.unwrap()
        return soup

    def parse_index(self):
        """Scrape the issue TOC page into {section: [article dicts]}."""
        soup = self.index_to_soup(
            _issue_url if _issue_url else "https://www.bookforum.com/print"
        )
        # Use the issue's og:title (e.g. the issue month) in the ebook title.
        meta_ele = soup.find("meta", property="og:title")
        if meta_ele:
            self.title = f'{_name}: {meta_ele["content"]}'

        cover_ele = soup.find("img", class_="toc-issue__cover")
        if cover_ele:
            self.cover_url = urljoin(
                "https://www.bookforum.com",
                soup.find("img", class_="toc-issue__cover")["src"],
            )
            # use cover image to get a published date
            # (HEAD request only; the Last-Modified header stands in for
            # the issue's publication date since the TOC page has none)
            br = self.get_browser()
            cover_res = br.open_novisit(
                Request(self.cover_url, timeout=self.timeout, method="HEAD")
            )
            # NOTE(review): assumes the response object supports header
            # lookup via .get(name, default=...) — verify against the
            # browser implementation returned by get_browser().
            cover_res_lastupdated = cover_res.get("last-modified", default=None)
            if cover_res_lastupdated:
                self.pub_date = self.parse_date(cover_res_lastupdated)

        articles = {}
        for sect_ele in soup.find_all("div", class_="toc-articles__section"):
            section_name = self.tag_to_string(
                sect_ele.find("a", class_="toc__anchor-links__link")
            )
            for article_ele in sect_ele.find_all("article"):
                title_ele = article_ele.find("h1")
                sub_title_ele = article_ele.find(class_="toc-article__subtitle")
                articles.setdefault(section_name, []).append(
                    {
                        "title": self.tag_to_string(title_ele),
                        "url": article_ele.find("a", class_="toc-article__link")[
                            "href"
                        ],
                        "description": self.tag_to_string(sub_title_ele)
                        if sub_title_ele
                        else "",
                    }
                )
        return articles.items()
90 |
--------------------------------------------------------------------------------
/recipes/channelnewsasia.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | channelnewsasia.com
8 | """
9 | import os
10 | import sys
11 |
12 | # custom include to share code between recipes
13 | sys.path.append(os.environ["recipes_includes"])
14 | from recipes_shared import BasicNewsrackRecipe, format_title
15 |
16 | from calibre.web.feeds.news import BasicNewsRecipe
17 |
18 | _name = "ChannelNewsAsia"
19 |
20 |
class ChannelNewsAsia(BasicNewsrackRecipe, BasicNewsRecipe):
    """CNA recipe: standard RSS feeds with CNA-specific tag cleanup."""

    title = _name
    __author__ = "ping"
    description = "CNA: Breaking News, Singapore News, World and Asia https://www.channelnewsasia.com/"
    publisher = "Mediacorp"
    category = "news, Singapore"
    publication_type = "newspaper"
    language = "en"
    masthead_url = "https://www.channelnewsasia.com/sites/default/themes/mc_cna_theme/images/logo.png"

    oldest_article = 1
    max_articles_per_feed = 25

    remove_tags_before = [dict(class_=["h1--page-title"])]
    remove_tags_after = [dict(class_=["content"])]
    remove_attributes = ["style"]
    remove_tags = [
        dict(
            class_=[
                "js-popup-content",
                "referenced-card",
                "block--related-topics",
                "block-ad-entity",
                "block-block-content",
                "from-library",
                "block-field-blocknodearticlefield-author",  # author bio
                "mobile_author_card",  # author bio
                "block-field-blocknodearticlefield-text-to-speech",  # article AI audio
            ]
        ),
        dict(name="div", attrs={"data-ad-entity": True}),
        dict(name="div", attrs={"data-js-options": True}),
        dict(name=["script", "noscript", "style", "svg"]),
    ]

    extra_css = """
    .figure__caption { font-size: 0.8rem; }
    .figure__caption p { margin-top: 0.2rem; margin-bottom: 1rem; }
    """

    feeds = [
        # (
        #     "Latest News",
        #     "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml",
        # ),
        (
            "Singapore",
            "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=10416",
        ),
        (
            "Asia",
            "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6511",
        ),
        (
            "Business",
            "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6936",
        ),
        # (
        #     "Sport",
        #     "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=10296",
        # ),
        # (
        #     "World",
        #     "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6311",
        # ),
    ]

    def populate_article_metadata(self, article, __, _):
        # Keep pub_date pegged to the most recent article and reflect
        # that timestamp in the ebook title.
        published = article.utctime
        if self.pub_date is None or self.pub_date < published:
            self.pub_date = published
            self.title = format_title(_name, published)
92 |
--------------------------------------------------------------------------------
/recipes/fivebooks.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | fivebooks.com
8 | """
9 | import os
10 | import re
11 | import sys
12 | from datetime import datetime
13 |
14 | # custom include to share code between recipes
15 | sys.path.append(os.environ["recipes_includes"])
16 | from recipes_shared import BasicNewsrackRecipe, format_title
17 |
18 | from calibre.web.feeds.news import BasicNewsRecipe
19 |
20 | _name = "Five Books"
21 |
22 |
class FiveBooks(BasicNewsrackRecipe, BasicNewsRecipe):
    """Scrape expert book-recommendation interviews from fivebooks.com.

    There is no RSS feed: parse_index() scrapes the interview listing
    pages directly. preprocess_raw_html() copies the page's schema.org
    ld+json metadata onto the main content node so that
    populate_article_metadata() can later recover the modified date and
    description from the cleaned HTML.
    """

    title = _name
    __author__ = "ping"
    description = "Expert book recommendations https://fivebooks.com/"
    language = "en"
    category = "books"
    publication_type = "blog"
    max_articles_per_feed = 15
    masthead_url = "https://fivebooks.com/app/themes/five-books/assets/images/logo.png"
    scale_news_images = (400, 400)

    remove_attributes = ["style", "font"]
    remove_tags = [
        dict(id=["interview-related", "buyfive"]),
        dict(
            class_=[
                "listen-button",
                "buy-button",
                "book-ad",
                "-newsletter",
                "read-later-and-social",
                "further-reading",
                "show-for-medium-up",
                "hide-for-small",
                "book-list-mobile",
                "-donate",
                "update",
                "social-buttons",
                "ebook-button",
                "book-links",
                "bio-component",
            ]
        ),
        dict(name=["script", "noscript", "style"]),
    ]
    remove_tags_before = [dict(class_=["main-content"])]
    remove_tags_after = [dict(class_=["main-content"])]

    extra_css = """
    p.book-number { font-weight: bold; font-size: 1.2rem; }
    ul.book-covers { list-style: none; list-style-type: none; padding-left: 0; }
    ul.book-covers li { display: block; margin-bottom: 1rem; }
    ul.book-covers li .cover-wrap { display: inline-block; vertical-align: top; }
    ul.book-covers li p.book-number { display: none; }
    ul.book-covers li h2 { display: inline-block; font-size: 0.8rem; margin-left: 1rem; }
    p.pullquote { margin-left: 3pt; font-size: 0.85rem; color: #333333; font-style: italic; }
    """
    feeds = [
        ("Newest", "https://fivebooks.com/interviews/?order=newest"),
        ("Popular", "https://fivebooks.com/interviews/?order=popular"),
    ]

    def populate_article_metadata(self, article, soup, first):
        # Prefer the visible .date element; otherwise fall back to the
        # data-post-modified-date attribute stamped on by preprocess_raw_html().
        post_date = None
        dt = soup.find(class_="date")
        if not dt:
            dated_tag = soup.find(attrs={"data-post-modified-date": True})
            if dated_tag:
                post_date = datetime.fromisoformat(dated_tag["data-post-modified-date"])
        else:
            # "%B %d, %Y"
            post_date = self.parse_date(dt.text)
        if post_date:
            if not self.pub_date or post_date > self.pub_date:
                self.pub_date = post_date
                self.title = format_title(_name, post_date)
            article.utctime = post_date

        description_tag = soup.find(attrs={"data-post-description": True})
        if description_tag:
            article.text_summary = description_tag["data-post-description"]

    def preprocess_raw_html(self, raw_html, url):
        """Stamp ld+json dateModified/description onto the main content node."""
        soup = self.soup(raw_html)
        content = soup.find(class_="main-content")
        # NOTE(review): the predicate returns d's "@graph" list (truthy
        # test) and the matching dict is then re-queried below; verify this
        # matches get_ld_json()'s contract in recipes_shared.
        data = self.get_ld_json(soup, lambda d: d.get("@graph", []))
        if not data:
            return raw_html
        graph = data.get("@graph", [])
        if not graph:
            return raw_html
        for g in graph:
            if g.get("@type") != "WebPage":
                continue
            # NOTE(review): assumes a "main-content" element exists whenever
            # the ld+json does — `content` would be None otherwise; confirm.
            content["data-post-modified-date"] = (
                g.get("dateModified") or g["datePublished"]
            )
            content["data-post-description"] = g.get("description", "")
            break
        return str(soup)

    def parse_index(self):
        """Scrape each listing page into {feed_name: [article dicts]}."""
        br = self.get_browser()
        articles = {}
        for feed_name, feed_url in self.feeds:
            articles[feed_name] = []
            raw_html = (
                br.open_novisit(feed_url, timeout=self.timeout).read().decode("utf-8")
            )
            soup = self.soup(raw_html)
            interviews = soup.find_all(class_="library-page")
            if self.max_articles_per_feed < len(interviews):
                interviews = interviews[: self.max_articles_per_feed]
            for interview in interviews:
                heading = interview.find("h2")
                # collapse runs of whitespace inside the heading text
                title = re.sub(r"\s{2,}", " ", heading.text)
                link = heading.find("a")
                articles[feed_name].append(
                    {
                        "title": title,
                        "url": link["href"],
                        "date": "",
                        "description": "",
                    }
                )
        return articles.items()
139 |
--------------------------------------------------------------------------------
/recipes/fivethirtyeight.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | fivethirtyeight.com is no more
8 | """
9 | import json
10 | import os
11 | import sys
12 | from datetime import timezone
13 | from html import unescape
14 |
15 | # custom include to share code between recipes
16 | sys.path.append(os.environ["recipes_includes"])
17 | from recipes_shared import WordPressNewsrackRecipe, format_title, get_date_format
18 |
19 | from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
20 | from calibre.web.feeds.news import BasicNewsRecipe
21 |
22 | _name = "FiveThirtyEight"
23 |
24 |
class FiveThirtyEight(WordPressNewsrackRecipe, BasicNewsRecipe):
    """FiveThirtyEight via the WordPress REST API.

    parse_index() pulls posts from the WP API, saves each post's JSON to a
    temp file, and points each article "url" at that file; the JSON is then
    rendered to HTML by preprocess_raw_html().
    """

    title = _name
    description = "FiveThirtyEight uses statistical analysis — hard numbers — to tell compelling stories about politics, sports, science, economics and culture. https://fivethirtyeight.com/"
    language = "en"
    __author__ = "ping"

    oldest_article = 14
    max_articles_per_feed = 10
    masthead_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/1/13/FiveThirtyEight_Logo.svg/1024px-FiveThirtyEight_Logo.svg.png"

    reverse_article_order = False
    remove_attributes = ["style", "width", "height"]
    remove_tags = [dict(class_=["video-title", "videoplayer", "video-footer"])]

    extra_css = """
    h1.article-title { font-size: 1.8rem; margin-bottom: 0.4rem; }
    h2.article-subtitle { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; font-weight: normal; }
    .single-header-metadata-wrap { margin-bottom: 1rem; }
    .single-header-metadata-wrap .vcard {
        font-weight: bold; color: #444; margin-right: 0.5rem;
        margin-top: 0; margin-bottom: 0;
    }
    .single-topic { margin-top: 0; margin-bottom: 0; }
    .single-featured-image img, p img, .wp-block-image img { margin-bottom: 0.8rem; max-width: 100%; }
    .single-featured-image .caption { display: block; font-size: 0.8rem; margin-top: 0.2rem; }
    """

    feeds = [
        (_name, "https://fivethirtyeight.com/"),
    ]

    def preprocess_raw_html(self, raw_html, url):
        # formulate the api response into html
        #
        # NOTE(review): the HTML tags of this template appear to have been
        # stripped (likely by the tool that produced this copy) — only the
        # interpolations survived. Restore the <html>/<head>/<body> wrapper
        # from the upstream recipe before relying on this output.
        post = json.loads(raw_html)

        return f"""
{post["title"]["rendered"]}

{post["content"]["rendered"]}
"""

    def parse_index(self):
        """Fetch recent posts from the WP API and stage them as local JSON files."""
        br = self.get_browser()
        articles = {}
        self.temp_dir = PersistentTemporaryDirectory()

        for feed_name, feed_url in self.feeds:
            custom_params = {
                "rest_route": "/wp/v2/fte_features",
                "espn_verticals_exclude": 67,  # Sports
                "tags_exclude": 329557888,  # Podcasts
            }
            posts = self.get_posts(feed_url, self.oldest_article, custom_params, br)

            latest_post_date = None
            for p in posts:
                # pub_date (ebook metadata) tracks the newest modified time
                post_update_dt = self.parse_date(
                    p["modified_gmt"], tz_info=timezone.utc
                )
                if not self.pub_date or post_update_dt > self.pub_date:
                    self.pub_date = post_update_dt
                # the ebook title tracks the newest publish date (local)
                post_date = self.parse_date(p["date"], tz_info=None, as_utc=False)
                if not latest_post_date or post_date > latest_post_date:
                    latest_post_date = post_date
                    self.title = format_title(_name, post_date)

                # group articles into one section per publish date
                section_name = f"{post_date:{get_date_format()}}"
                if len(self.get_feeds()) > 1:
                    section_name = f"{feed_name}: {post_date:{get_date_format()}}"
                if section_name not in articles:
                    articles[section_name] = []

                # stage the post JSON on disk; preprocess_raw_html renders it
                with PersistentTemporaryFile(suffix=".json", dir=self.temp_dir) as f:
                    f.write(json.dumps(p).encode("utf-8"))

                # resolve vertical (topic) names from the embedded taxonomy terms
                verticals = []
                if p.get("espn_verticals"):
                    try:
                        for terms in p.get("_embedded", {}).get("wp:term", []):
                            verticals.extend(
                                [
                                    t["name"]
                                    for t in terms
                                    if t["taxonomy"] == "espn_verticals"
                                ]
                            )

                    except (KeyError, TypeError):
                        pass

                articles[section_name].append(
                    {
                        "title": unescape(p["title"]["rendered"]) or "Untitled",
                        "url": "file://" + f.name,
                        "date": f"{post_date:{get_date_format()}}",
                        "description": unescape(" / ".join(verticals)),
                    }
                )
        return articles.items()
126 |
--------------------------------------------------------------------------------
/recipes/forbes-editors-picks.recipe.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | from datetime import datetime, timezone, timedelta
5 | from urllib.parse import urlencode
6 |
7 | # custom include to share code between recipes
8 | sys.path.append(os.environ["recipes_includes"])
9 | from recipes_shared import BasicNewsrackRecipe, format_title
10 |
11 | from calibre.web.feeds.news import BasicNewsRecipe
12 |
13 | _name = "Forbes - Editor's Picks"
14 |
15 |
class ForbesEditorsPicks(BasicNewsrackRecipe, BasicNewsRecipe):
    """Fetch Forbes Editors' Picks via the chansec stream JSON endpoint.

    parse_index() pages through the JSON stream API (cursor = last item's
    ``date``/``id``) until it passes ``oldest_article`` or collects
    ``max_articles_per_feed`` items.
    """

    title = _name
    __author__ = "ping"
    description = "Forbe's Editors' Picks https://www.forbes.com/editors-picks/"
    language = "en"

    oldest_article = 7
    max_articles_per_feed = 10

    scale_news_images = (800, 1200)
    timeout = 10
    simultaneous_downloads = 1

    keep_only_tags = [dict(name="article")]
    remove_attributes = ["style", "height", "width"]

    remove_tags = [
        dict(
            class_=[
                "story-package__nav-wrapper",
                "container__subnav--outer",
                "edit-story-container",
                "article-sharing",
                "vert-pipe",
                "short-bio",
                "bottom-contrib-block",
                "article-footer",
                "sigfile",
                "hidden",
                "link-embed",
                "subhead3-embed",
                "recirc-module",
                "seo",
                "top-ad-container",
                "speakr-wrapper",
            ]
        ),
        dict(name=["fbs-cordial", "fbs-ad", "svg"]),
    ]

    extra_css = """
    .top-label-wrapper a { margin-right: 0.5rem; color: #444; }
    .issue { font-weight: bold; margin-bottom: 0.2rem; }
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    h2.subhead-embed { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; }
    h2.subhead-embed strong { font-weight: normal; }
    .top-contrib-block { margin-top: 0.5rem; font-weight: bold; color: #444; }
    .content-data { margin-bottom: 1rem; font-weight: normal; color: unset; }
    .image-embed p { font-size: 0.8rem; margin-top: 0.2rem; margin-bottom: 0.5rem; }
    .image-embed img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box;
    }
    blockquote { font-size: 1.25rem; margin-left: 0; text-align: center; }
    blockquote .text-align { font-size: 1rem; }
    """

    def preprocess_raw_html(self, raw_html, url):
        """Stamp the article's modified date onto the markup and fix images.

        The date is read from the page's NewsArticle ld+json and stored in a
        data attribute so populate_article_metadata() can recover it after
        cleanup. <progressive-image> custom elements are renamed to plain
        <img> so calibre downloads them.
        """
        soup = self.soup(raw_html)
        article = soup.find("article")
        meta = self.get_ld_json(soup, lambda d: d.get("@type", "") == "NewsArticle")
        modified_date = meta.get("dateModified") or meta.get("datePublished")
        article["data-og-modified-date"] = modified_date
        for img in soup.find_all("progressive-image"):
            img.name = "img"
        return str(soup)

    def populate_article_metadata(self, article, soup, first):
        """Set article/recipe timestamps from the stamped modified date."""
        article_date = soup.find(attrs={"data-og-modified-date": True})
        if article_date:
            modified_date = datetime.fromisoformat(
                article_date["data-og-modified-date"]
            ).replace(tzinfo=timezone.utc)
            if (not self.pub_date) or modified_date > self.pub_date:
                self.pub_date = modified_date
                self.title = format_title(_name, self.pub_date)
            article.utctime = modified_date
            article.localtime = modified_date

    def parse_index(self):
        """Page through the stream API, newest first, within the cutoff window."""
        br = self.get_browser()
        # datetime.now(timezone.utc) is the supported equivalent of the
        # deprecated datetime.utcnow().replace(tzinfo=timezone.utc)
        cutoff_date = datetime.now(timezone.utc) - timedelta(
            days=self.oldest_article
        )
        articles = []

        date_param = 0
        content_ids = None
        end_feed = False
        while not end_feed:
            query = {
                "limit": 25,
                "sourceValue": "editors-pick",
                "streamSourceType": "badge",
            }
            # cursor params from the last item of the previous page
            if content_ids:
                query["ids"] = content_ids
            if date_param:
                query["date"] = date_param

            endpoint = (
                f"https://www.forbes.com/simple-data/chansec/stream/?{urlencode(query)}"
            )

            res = br.open_novisit(endpoint, timeout=self.timeout)
            res_obj = json.loads(res.read().decode("utf-8"))
            items = res_obj.get("blocks", {}).get("items", [])
            if not items:
                break

            for item in items:
                # item["date"] is an epoch timestamp in milliseconds;
                # fromtimestamp(..., tz=...) replaces deprecated utcfromtimestamp()
                item_date = datetime.fromtimestamp(
                    item["date"] / 1000.0, tz=timezone.utc
                )
                if item_date < cutoff_date:
                    end_feed = True
                    break

                if (not self.pub_date) or item_date > self.pub_date:
                    self.pub_date = item_date
                    self.title = format_title(_name, self.pub_date)

                articles.append(
                    {
                        "title": item["title"],
                        "url": item["url"],
                        "description": item["description"],
                        "date": item_date,
                    }
                )
                date_param = item["date"]
                content_ids = item["id"]
                if len(articles) >= self.max_articles_per_feed:
                    end_feed = True
                    break

        return [(_name, articles)]
153 |
--------------------------------------------------------------------------------
/recipes/foreign-policy.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 | import json
6 | import os
7 | import sys
8 |
9 | # custom include to share code between recipes
10 | sys.path.append(os.environ["recipes_includes"])
11 | from recipes_shared import WordPressNewsrackRecipe, get_datetime_format
12 |
13 | from calibre.web.feeds.news import BasicNewsRecipe
14 |
15 | _name = "Foreign Policy"
16 | _issue_url = ""
17 |
18 |
class ForeignPolicy(WordPressNewsrackRecipe, BasicNewsRecipe):
    """Foreign Policy via the WordPress REST API.

    parse_index() stages each post's JSON (via the shared get_articles
    helper); preprocess_raw_html() renders that JSON into an article page,
    prepending the first wp:attachment as the lede image.
    """

    title = _name
    __author__ = "ping"
    description = (
        "Foreign Policy is an American news publication, founded in 1970 and "
        "focused on global affairs, current events, and domestic and international "
        "policy. It produces content daily on its website and app, and in four "
        "print issues annually. https://foreignpolicy.com/"
    )
    language = "en"
    publication_type = "blog"
    oldest_article = 7  # days
    masthead_url = "https://foreignpolicy.com/wp-content/themes/foreign-policy-2017/assets/src/images/logos/favicon-256.png"
    reverse_article_order = False
    compress_news_images_auto_size = 12

    remove_tags = [
        dict(
            class_=[
                "Apple-converted-space",
                "graphic-chatter",
                "fp_choose_placement_related_posts",
                "sidebar-box_right",
                "newsletter-unit-signup",
                "newsletter-unit-signup--shortcode-fallback",
            ]
        ),
        dict(style="height:0;opacity:0;"),
        dict(name=["noscript"]),
    ]

    extra_css = """
    .headline { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .article-meta { margin-top: 1rem; margin-bottom: 1rem; }
    .article-meta .author { font-weight: bold; color: #444; margin-right: 0.5rem; }
    .article-section { display: block; font-weight: bold; color: #444; }
    .article-img img, img.attachment-full { display: block; max-width: 100%; height: auto; }
    .article-img p, .wp-caption-text {
        font-size: 0.8rem; display: block; margin-top: 0.2rem;
    }
    .pull-quote-sidebar {
        display: block; text-align: center;
        margin-left: 0; margin-bottom: 0.4rem; font-size: 1.25rem;
    }
    """

    feeds = [
        (_name, "https://www.foreignpolicy.com/"),
    ]

    def preprocess_raw_html(self, raw_html, url):
        # formulate the api response into html
        post = json.loads(raw_html)
        if not post:
            self.abort_article()
        date_published_loc = self.parse_date(post["date"], tz_info=None, as_utc=False)
        post_authors = self.extract_authors(post)
        categories = self.extract_categories(post)

        # NOTE(review): the HTML tags of this template appear to have been
        # stripped (likely by the tool that produced this copy) — only the
        # interpolations survived. The code below relies on the template
        # providing <body><article> (see soup.body.article.append calls);
        # restore the markup from the upstream recipe.
        soup = self.soup(
            f"""
            {post["title"]["rendered"]}

            {f'{" / ".join(categories)}' if categories else ''}
            {post["title"]["rendered"]}
            {f'{", ".join(post_authors)}' if post_authors else ''}
            {date_published_loc:{get_datetime_format()}}
            """
        )

        content = self.soup(post["content"]["rendered"])
        # FP doesn't use featuremedia, the first attachment is the lede image
        attachment_endpoint = (
            post.get("_links", {}).get("wp:attachment", [{}])[0].get("href")
        )
        if attachment_endpoint:
            attachment = next(
                iter(json.loads(self.index_to_soup(attachment_endpoint, raw=True))), {}
            )
            if attachment:
                lede = soup.new_tag("div", attrs={"class": "image-attachment"})
                img = soup.new_tag("img", attrs={"src": attachment["source_url"]})
                lede.append(img)
                if attachment.get("caption", {}).get("rendered"):
                    caption = soup.new_tag("div", attrs={"class": "wp-caption-text"})
                    caption.append(self.soup(attachment["caption"]["rendered"]))
                    lede.append(caption)
                soup.body.article.append(lede)

        soup.body.article.append(content)

        # swap lazy-loaded image sources into src
        for img in soup.find_all("img", attrs={"data-lazy-src": True}):
            img["src"] = img["data-lazy-src"]
            # also cleanup a little
            for attribute in (
                "data-lazy-src",
                "data-lazy-srcset",
                "data-lazy-sizes",
                "data-src",
                "loading",
            ):
                if img.get(attribute):
                    del img[attribute]

        return str(soup)

    def parse_index(self):
        """Delegate to the shared WordPress article fetcher for each feed."""
        articles = {}
        br = self.get_browser()
        for feed_name, feed_url in self.feeds:
            articles = self.get_articles(
                articles, feed_name, feed_url, self.oldest_article, {}, br
            )
        return articles.items()
139 |
--------------------------------------------------------------------------------
/recipes/harvard-intl-review.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | hir.harvard.edu
8 | """
9 | import os
10 | import sys
11 | from datetime import timezone
12 |
13 | # custom include to share code between recipes
14 | sys.path.append(os.environ["recipes_includes"])
15 | from recipes_shared import (
16 | BasicNewsrackRecipe,
17 | format_title,
18 | get_date_format,
19 | get_datetime_format,
20 | )
21 |
22 | from calibre.web.feeds import Feed
23 | from calibre.web.feeds.news import BasicNewsRecipe
24 |
25 | _name = "Harvard International Review"
26 |
27 |
class HarvardInternationalReview(BasicNewsrackRecipe, BasicNewsRecipe):
    """Harvard International Review via its RSS feed.

    The single feed is re-sectioned by publication date in parse_feeds(),
    with an author/date meta block injected into each article's content.
    """

    title = _name
    description = "The Harvard International Review is a quarterly magazine offering insight on international affairs from the perspectives of scholars, leaders, and policymakers. https://hir.harvard.edu/"
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    oldest_article = 30  # days
    max_articles_per_feed = 30
    use_embedded_content = True
    masthead_url = (
        "https://hir.harvard.edu/content/images/2020/12/HIRlogo_crimson-4.png"
    )
    compress_news_images_auto_size = 7
    auto_cleanup = True
    timeout = 60

    extra_css = """
    .article-meta { margin-bottom: 1rem; }
    .article-meta .author { font-weight: bold; color: #444; }
    .article-meta .published-dt { margin-left: 0.5rem; }
    """

    feeds = [
        (_name, "https://hir.harvard.edu/rss/"),
    ]

    def populate_article_metadata(self, article, __, _):
        # Track the newest article time and mirror it into the ebook title.
        if (not self.pub_date) or article.utctime > self.pub_date:
            self.pub_date = article.utctime
            self.title = format_title(_name, article.utctime)

    @staticmethod
    def _new_section_feed(parsed_feed, section_title):
        """Create an empty Feed titled *section_title*, copying display
        metadata from *parsed_feed* (previously duplicated inline 3x)."""
        feed = Feed(log=parsed_feed.logger)
        feed.title = section_title
        feed.description = parsed_feed.description
        feed.image_url = parsed_feed.image_url
        feed.image_height = parsed_feed.image_height
        feed.image_alt = parsed_feed.image_alt
        feed.oldest_article = parsed_feed.oldest_article
        feed.articles = []
        return feed

    def parse_feeds(self):
        # convert single parsed feed into date-sectioned feed
        # use this only if there is just 1 feed
        parsed_feeds = super().parse_feeds()
        if len(parsed_feeds or []) != 1:
            return parsed_feeds

        articles = []
        for feed in parsed_feeds:
            articles.extend(feed.articles)
        articles = sorted(articles, key=lambda a: a.utctime, reverse=True)

        new_feeds = []
        curr_feed = None
        parsed_feed = parsed_feeds[0]
        for a in articles:
            date_published = a.utctime.replace(tzinfo=timezone.utc)
            article_index = f"{date_published:{get_date_format()}}"

            # add author and pub date to the article content
            soup = self.soup(a.content)
            header = None
            if soup.body.contents[0].name in ["h1", "h2", "h3"]:
                header = soup.body.contents[0]
            meta = soup.new_tag("div", attrs={"class": "article-meta"})
            if a.author:
                author_ele = soup.new_tag("span", attrs={"class": "author"})
                author_ele.append(a.author)
                meta.append(author_ele)
            pub_ele = soup.new_tag("span", attrs={"class": "published-dt"})
            pub_ele.append(f"{date_published:{get_datetime_format()}}")
            meta.append(pub_ele)
            if header:
                header.insert_after(meta)
            else:
                soup.body.insert(0, meta)
            a.content = soup.body.decode_contents()

            # articles are sorted by date desc, so a new date string
            # starts a new section feed
            if curr_feed is None or curr_feed.title != article_index:
                if curr_feed is not None:
                    new_feeds.append(curr_feed)
                curr_feed = self._new_section_feed(parsed_feed, article_index)
            curr_feed.articles.append(a)
        if curr_feed is not None:
            # flush the last section
            new_feeds.append(curr_feed)

        return new_feeds
123 |
--------------------------------------------------------------------------------
/recipes/japan-times.recipe.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Original at https://github.com/kovidgoyal/calibre/blob/4a01a799f19c4d0711d826ec7c79821b4ea690b6/recipes/japan_times.recipe
5 | #
6 | # [!] Ad-blocked, requires login
7 | #
8 | """
9 | japantimes.co.jp
10 | """
11 |
12 | __license__ = "GPL v3"
13 | __copyright__ = (
14 | "2008-2013, Darko Miletic . "
15 | "2022, Albert Aparicio Isarn "
16 | )
17 |
18 | import os
19 | import sys
20 | from datetime import datetime
21 |
22 | # custom include to share code between recipes
23 | sys.path.append(os.environ["recipes_includes"])
24 | from recipes_shared import BasicNewsrackRecipe, format_title, get_datetime_format
25 |
26 | from calibre.web.feeds.news import BasicNewsRecipe
27 |
28 | _name = "Japan Times"
29 |
30 |
class JapanTimes(BasicNewsrackRecipe, BasicNewsRecipe):
    """Japan Times recipe: multiple RSS feeds, ad-blocked, login required."""

    title = _name
    __author__ = "Albert Aparicio Isarn (original recipe by Darko Miletic)"
    description = "The latest news from Japan Times, Japan's leading English-language daily newspaper"
    language = "en_JP"
    category = "news, politics, japan"
    publisher = "The Japan Times"
    oldest_article = 1
    max_articles_per_feed = 60
    publication_type = "newspaper"
    masthead_url = "https://cdn-japantimes.com/wp-content/themes/jt_theme/library/img/japantimes-logo-tagline.png"

    auto_cleanup = False

    conversion_options = {
        "comment": description,
        "tags": category,
        "publisher": publisher,
        "language": language,
    }

    remove_attributes = ["style"]
    remove_tags_before = [dict(name="main")]
    remove_tags_after = [dict(name="main")]

    remove_tags = [
        dict(name=["script", "style"]),
        dict(
            id=[
                "tpModal",
                "site_header",
                "nav_anchor_container",
                "nav",
                "no_js_blocker",
                "menu",
                "taboola-below-article-thumbnails",
                "disqus_thread",
                "piano-recommend",
            ]
        ),
        dict(
            class_=[
                "clearfix",
                "nav_search",
                "sub_menu_container",
                "sidebar",
                "ad",
                "site_footer",
                "post-attachments",
                "post-keywords",
                "newsletter-signup",
                "DisplayAd",
                "jt-subscribe-box",
                "single-sns-area",
                "single-upper-meta",
                "article_footer_ad",
                "note-to-commenters",
                "note-to-non-commenters",
                "pagetop-wrap",
                "jt-related-stories",
            ]
        ),
    ]

    extra_css = """
    .article-meta { margin-top: 1rem; margin-bottom: 1rem; }
    .article-meta .author { font-weight: bold; color: #444; margin-right: 0.5rem; }
    ul.slides { list-style: none; }
    .slide_image img { max-width: 100%; height: auto; }
    .slide_image div, .inline_image div { font-size: 0.8rem; margin-top: 0.2rem; }
    """

    feeds = [
        ("Top Stories", "https://www.japantimes.co.jp/feed/topstories/"),
        ("News", "https://www.japantimes.co.jp/news/feed/"),
        ("Opinion", "https://www.japantimes.co.jp/opinion/feed/"),
        ("Life", "https://www.japantimes.co.jp/life/feed/"),
        ("Community", "https://www.japantimes.co.jp/community/feed/"),
        ("Culture", "https://www.japantimes.co.jp/culture/feed/"),
        # ("Sports", "https://www.japantimes.co.jp/sports/feed/"),
    ]

    def preprocess_html(self, soup):
        """Clean up article markup and inject an author/date meta block."""
        # "unbullet" the images
        slides = soup.find(name="ul", attrs={"class": "slides"})
        if slides:
            for img_div in slides.find_all(attrs={"class": "slide_image"}):
                slides.insert_after(img_div.extract())
            slides.decompose()

        # embed the lazy loaded images
        lazy_loaded_images = soup.find_all(name="img", attrs={"data-src": True})
        for img in lazy_loaded_images:
            img["src"] = img["data-src"]

        # reformat the article meta
        meta = soup.new_tag("div", attrs={"class": "article-meta"})
        credit = soup.find(name="meta", attrs={"name": "cXenseParse:jat-credit"})
        if credit:
            # author names are packed into a single content attribute,
            # split by a page-provided separator (default ",")
            sep = credit.get("data-separator", ",")
            authors = credit["content"].split(sep)
            author_ele = soup.new_tag("span", attrs={"class": "author"})
            author_ele.append(",".join(authors))
            meta.append(author_ele)
        pub_date = soup.find(name="meta", attrs={"property": "article:published_time"})
        if pub_date:
            pub_date = datetime.fromisoformat(pub_date["content"])
            pub_date_ele = soup.new_tag("span", attrs={"class": "published-date"})
            pub_date_ele.append(f"{pub_date:{get_datetime_format()}}")
            meta.append(pub_date_ele)
            # track the newest article date for the ebook title/pub date
            if (not self.pub_date) or pub_date > self.pub_date:
                self.pub_date = pub_date
                self.title = format_title(_name, pub_date)
        # fix: guard against pages without an <h1> — the original
        # soup.body.h1.insert_after(meta) raised AttributeError there
        h1 = soup.body.find("h1") if soup.body else None
        if h1:
            h1.insert_after(meta)
        elif soup.body:
            soup.body.insert(0, meta)
        return soup

    def parse_feeds(self):
        """Sort each feed's articles newest-first (feeds are not date-sorted)."""
        parsed_feeds = super().parse_feeds()
        for feed in parsed_feeds:
            feed.articles = sorted(
                feed.articles, key=lambda a: a.utctime, reverse=True
            )
        return parsed_feeds
155 |
--------------------------------------------------------------------------------
/recipes/joongangdaily.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | koreajoongangdaily.joins.com
8 | """
9 | import os
10 | import sys
11 |
12 | # custom include to share code between recipes
13 | sys.path.append(os.environ["recipes_includes"])
14 | from recipes_shared import BasicNewsrackRecipe, format_title
15 |
16 | from calibre.web.feeds.news import BasicNewsRecipe
17 |
18 | _name = "JoongAng Daily"
19 |
20 |
class KoreaJoongAngDaily(BasicNewsrackRecipe, BasicNewsRecipe):
    """Korea JoongAng Daily recipe: one embedded-content RSS feed, auto-cleaned."""

    title = _name
    description = "The Korea JoongAng Daily is an English-language daily published by the JoongAng Group, Korea’s leading media group, in association with The New York Times. https://koreajoongangdaily.joins.com/"
    language = "en"
    __author__ = "ping"
    publication_type = "newspaper"
    masthead_url = "https://koreajoongangdaily.joins.com/resources/images/common/logo.png"
    use_embedded_content = True
    auto_cleanup = True
    compress_news_images_auto_size = 10

    oldest_article = 1  # days
    max_articles_per_feed = 60

    extra_css = """
    .caption { font-size: 0.8rem; margin: 0.5rem 0; }
    """

    feeds = [
        ("Korea JoongAng Daily", "https://koreajoongangdaily.joins.com/xmls/joins"),
    ]

    def populate_article_metadata(self, article, __, _):
        # Keep the recipe's pub date/title pinned to the newest article seen.
        seen = article.utctime
        if self.pub_date and seen <= self.pub_date:
            return
        self.pub_date = seen
        self.title = format_title(_name, seen)

    def parse_feeds(self):
        # Section the single feed by publication date in Seoul local time.
        return self.group_feeds_by_date(timezone_offset_hours=9)
52 |
--------------------------------------------------------------------------------
/recipes/kirkus.recipe.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from urllib.parse import urljoin
4 |
5 | # custom include to share code between recipes
6 | sys.path.append(os.environ["recipes_includes"])
7 | from recipes_shared import BasicNewsrackRecipe
8 |
9 | from calibre.web.feeds.news import BasicNewsRecipe
10 |
11 | _name = "Kirkus"
12 |
13 |
class Kirkus(BasicNewsrackRecipe, BasicNewsRecipe):
    """Kirkus Reviews recipe: scrapes the current magazine issue page."""

    title = _name
    description = "Kirkus Reviews is an American book review magazine founded in 1933 by Virginia Kirkus. The magazine is headquartered in New York City. https://www.kirkusreviews.com/magazine/current/"
    language = "en"
    __author__ = "ping"
    publication_type = "magazine"
    masthead_url = (
        "https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg"
    )
    max_articles_per_feed = 99
    compress_news_images_auto_size = 6
    keep_only_tags = [
        dict(
            class_=[
                "article-author",
                "article-author-img-start",
                "article-author-description-start",
                "single-review",
            ]
        )
    ]
    remove_tags = [
        dict(
            class_=[
                "sidebar-content",
                "article-social-share-desktop-first",
                "article-social-share-desktop-pagination",
                "article-social-share-mobile",
                "share-review-text",
                "like-dislike-article",
                "rate-this-book-text",
                "input-group",
                "user-comments",
                "show-all-response-text",
                "button-row",
                "hide-on-mobile",
                "related-article",
                "breadcrumb-row",
                "shop-now-dropdown",
            ]
        )
    ]
    remove_tags_after = [dict(class_="single-review")]

    extra_css = """
    .image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; }
    .photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; }
    .book-review-img .image-container { text-align: center; }
    .book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; }
    """

    def preprocess_html(self, soup):
        """Flatten the book-cover list markup and move the title above it."""
        h1 = soup.find(class_="article-title")
        book_cover = soup.find("ul", class_="book-review-img")
        if book_cover:
            # convert the <ul>/<li> cover markup into plain <div>s
            for li in book_cover.find_all("li"):
                li.name = "div"
            book_cover.name = "div"
            if h1:
                book_cover.insert_before(h1.extract())

        return soup

    def parse_index(self):
        """Build the {section: [articles]} index from the current-issue page."""
        issue_url = "https://www.kirkusreviews.com/magazine/current/"
        soup = self.index_to_soup(issue_url)
        issue = soup.find(name="article", class_="issue-container")
        cover_img = issue.select(".issue-header .cover-image img")
        if cover_img:
            self.cover_url = cover_img[0]["src"]

        h1 = issue.find("h1")
        if h1:
            edition = self.tag_to_string(h1)
            self.title = f"{_name}: {edition}"
            # Example: April 1, 2023 "%B %d, %Y"
            self.pub_date = self.parse_date(edition)

        articles = {}
        # featured books
        for book_ele in soup.find_all(name="div", class_="issue-featured-book"):
            link = book_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(book_ele.find("h3")).upper()
            articles.setdefault(section, []).append(
                {"url": urljoin(issue_url, link["href"]), "title": link["title"]}
            )

        # additional issue posts
        for post_ele in issue.select("div.issue-more-posts ul li div.lead-text"):
            link = post_ele.find("a")
            if not link:
                continue
            section = self.tag_to_string(post_ele.find(class_="lead-text-type")).upper()
            articles.setdefault(section, []).append(
                {
                    "url": urljoin(issue_url, link["href"]),
                    "title": self.tag_to_string(link),
                }
            )

        # starred reviews, grouped by review section
        for section_ele in issue.select("section.reviews-section"):
            section_articles = []
            for review in section_ele.select("ul li.starred"):
                link = review.select("h4 a")
                if not link:
                    continue
                description = review.find("p")
                section_articles.append(
                    {
                        "url": urljoin(issue_url, link[0]["href"]),
                        "title": self.tag_to_string(link[0]),
                        "description": ""
                        if not description
                        else self.tag_to_string(description),
                    }
                )
            if not section_articles:
                continue
            section = self.tag_to_string(section_ele.find("h3")).upper()
            # fix: dropped the redundant `if section not in articles` init —
            # setdefault already handles the missing-key case
            articles.setdefault(section, []).extend(section_articles)

        return articles.items()
138 |
--------------------------------------------------------------------------------
/recipes/knowable-magazine.recipe.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 https://github.com/ping/
2 | #
3 | # This software is released under the GNU General Public License v3.0
4 | # https://opensource.org/licenses/GPL-3.0
5 |
6 | """
7 | knowablemagazine.org
8 | """
9 | import os
10 | import sys
11 |
12 | # custom include to share code between recipes
13 | sys.path.append(os.environ["recipes_includes"])
14 | from recipes_shared import BasicNewsrackRecipe, format_title
15 |
16 | from calibre.web.feeds.news import BasicNewsRecipe
17 |
18 | _name = "Knowable Magazine"
19 |
20 |
class KnowableMagazine(BasicNewsrackRecipe, BasicNewsRecipe):
    """Knowable Magazine recipe: one RSS feed, grouped into date sections."""

    title = _name
    __author__ = "ping"
    description = (
        "Knowable Magazine explores the real-world significance of scholarly work "
        "through a journalistic lens. We report on the current state of play across "
        "a wide variety of fields — from agriculture to high-energy physics; "
        "biochemistry to water security; the origins of the universe to psychology. "
        "https://knowablemagazine.org/"
    )
    masthead_url = "https://knowablemagazine.org/pb-assets/knowable-assets/images/logo-1586554394067.svg"
    language = "en"
    publication_type = "magazine"
    timeout = 60

    oldest_article = 45  # days
    max_articles_per_feed = 15
    scale_news_images = (800, 1200)

    keep_only_tags = [
        dict(class_=["article-container"]),
    ]
    remove_attributes = ["style"]
    remove_tags = [
        dict(name=["script", "style", "svg"]),
        dict(attrs={"data-widget-def": True}),
        dict(id=["newsletter-promo-item"]),
        dict(
            class_=[
                "promo",
                "ember-view",
                "promo-article-dark",
                "share-icons-box",
                "article-tags",
                "article-republish",
            ]
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .article-subhead { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; margin-top: 0; }
    .article-byline { margin-top: 0.5rem; margin-bottom: 1rem; }
    .article-byline .author-byline { font-weight: bold; color: #444; display: inline-block; }
    .article-byline .pub-date { display: inline-block; margin-left: 0.5rem; }
    .article-image img {
    display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
    box-sizing: border-box;
    }
    .article-image .caption { font-size: 0.8rem; }
    .pull-quote { font-size: 1.25rem; margin-left: 0; text-align: center; }
    """

    feeds = [
        (_name, "https://knowablemagazine.org/rss"),
    ]

    def populate_article_metadata(self, article, __, _):
        # Keep the recipe's pub date/title pinned to the newest article seen.
        seen = article.utctime
        if self.pub_date and seen <= self.pub_date:
            return
        self.pub_date = seen
        self.title = format_title(_name, seen)

    def parse_feeds(self):
        # Section the single feed by publication date in US Pacific time.
        return self.group_feeds_by_date(timezone_offset_hours=-7)
85 |
--------------------------------------------------------------------------------
/recipes/korea-herald.recipe.py:
--------------------------------------------------------------------------------
1 | """
2 | koreaherald.com
3 | """
4 | __license__ = "GPL v3"
5 | __copyright__ = "2011, Seongkyoun Yoo "
6 |
7 | import os
8 | import re
9 | import sys
10 |
11 | # custom include to share code between recipes
12 | sys.path.append(os.environ["recipes_includes"])
13 | from recipes_shared import BasicNewsrackRecipe, format_title
14 |
15 | from calibre.web.feeds.news import BasicNewsRecipe
16 |
17 | _name = "Korea Herald"
18 |
19 |
20 | class KoreaHerald(BasicNewsrackRecipe, BasicNewsRecipe):
21 | title = _name
22 | language = "en"
23 | description = "Korea Herald News articles https://koreaherald.com/"
24 | __author__ = "Seongkyoun Yoo"
25 | publication_type = "newspaper"
26 | masthead_url = "https://res.heraldm.com/new_201209/images/common/logo.gif"
27 |
28 | oldest_article = 1
29 | max_articles_per_feed = 25
30 |
31 | keep_only_tags = [dict(class_="news_content")]
32 | remove_attributes = ["style", "align"]
33 | remove_tags = [
34 | dict(name=["script", "style"]),
35 | dict(class_=["news_btn_wrap", "news_journalist_area"]),
36 | ]
37 |
38 | extra_css = """
39 | h1.news_title { font-size: 1.8rem; margin-bottom: 0.4rem; }
40 | h2.news_title { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.8rem; }
41 | p.news_date { margin-top: 0.2rem; }
42 | .img_caption { font-size: 0.8rem; margin-top: 0.2rem; display: block; }
43 | """
44 |
45 | feeds = [
46 | ("National", "http://www.koreaherald.com/common/rss_xml.php?ct=102"),
47 | ("Business", "http://www.koreaherald.com/common/rss_xml.php?ct=103"),
48 | ("Finance", "http://www.koreaherald.com/common/rss_xml.php?ct=305"),
49 | ("Life & Style", "http://www.koreaherald.com/common/rss_xml.php?ct=104"),
50 | ("Entertainment", "http://www.koreaherald.com/common/rss_xml.php?ct=105"),
51 | # ("Sports", "http://www.koreaherald.com/common/rss_xml.php?ct=106"),
52 | ("World", "http://www.koreaherald.com/common/rss_xml.php?ct=107"),
53 | ("Opinion", "http://www.koreaherald.com/common/rss_xml.php?ct=108"),
54 | ]
55 |
56 | def populate_article_metadata(self, article, __, _):
57 | if (not self.pub_date) or article.utctime > self.pub_date:
58 | self.pub_date = article.utctime
59 | self.title = format_title(_name, article.utctime)
60 |
61 | def preprocess_html(self, soup):
62 | byline_date = soup.find(attrs={"class": "view_tit_byline_r"})
63 | if byline_date:
64 | # format the published/updated date properly
65 | date_elements = []
66 | # Published : Apr 18, 2022 - 16:41 Updated : Apr 18, 2022 - 16:41
67 | date_re = r"(Published|Updated).+?\:.+?(?P[a-z]{3}\s\d+),.+?(?P