├── .flake8 ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── questions-others-template.md └── workflows │ ├── build.yml │ └── install_calibre.sh ├── .gitignore ├── .mypy.ini ├── .pylintrc ├── LICENSE ├── README.md ├── _generate.py ├── _opds.py ├── _recipe_utils.py ├── _recipes.py ├── _recipes_custom.py ├── _utils.py ├── babel.config.json ├── build-index.js ├── build.sh ├── debug.sh ├── package.json ├── recipes ├── README.txt ├── aeon.recipe.py ├── asahi-shimbun.recipe.py ├── asian-review.recipe.py ├── atlantic-magazine.recipe.py ├── atlantic.recipe.py ├── bloomberg-businessweek.recipe.py ├── bloomberg-news.recipe.py ├── bookforum-magazine.recipe.py ├── channelnewsasia.recipe.py ├── economist.recipe.py ├── eighteen-fortythree.recipe.py ├── fivebooks.recipe.py ├── fivethirtyeight.recipe.py ├── forbes-editors-picks.recipe.py ├── foreign-affairs.recipe.py ├── foreign-policy-magazine.recipe.py ├── foreign-policy.recipe.py ├── ft-paper.recipe.py ├── ft.recipe.py ├── fulcrum-sg.recipe.py ├── guardian.recipe.py ├── harpers-magazine.recipe.py ├── harvard-intl-review.recipe.py ├── hbr.recipe.py ├── includes │ ├── nyt.py │ └── recipes_shared.py ├── japan-times.recipe.py ├── joongangdaily.recipe.py ├── kirkus.recipe.py ├── knowable-magazine.recipe.py ├── korea-herald.recipe.py ├── lithub.recipe.py ├── logos │ ├── knowable.png │ └── thirdpole.png ├── london-review.recipe.py ├── longreads-features.recipe.py ├── mit-press-reader.recipe.py ├── mit-tech-review-magazine.recipe.py ├── mit-tech-review.recipe.py ├── mollywhite-newsletter.recipe.py ├── natesilver.recipe.py ├── nature.recipe.py ├── nautilus.recipe.py ├── new-republic-magazine.recipe.py ├── newyorker.recipe.py ├── nine-dashline.recipe.py ├── noema-magazine.recipe.py ├── nytimes-books.recipe.py ├── nytimes-global.recipe.py ├── nytimes-magazine.recipe.py ├── nytimes-paper.recipe.py ├── paris-review-blog.recipe.py ├── poetry.recipe.py ├── politico-magazine.recipe.py ├── 
propublica.recipe.py ├── prospect-magazine.recipe.py ├── quanta-magazine.recipe.py ├── restofworld.recipe.py ├── scientific-american.recipe.py ├── scmp.recipe.py ├── smithsonian-magazine.recipe.py ├── spectator-magazine.recipe.py ├── sydney-morning-herald.recipe.py ├── taipei-times.recipe.py ├── thediplomat.recipe.py ├── thirdpole.recipe.py ├── time-magazine.recipe.py ├── vox.recipe.py ├── wapo-paper.recipe.py ├── wapo.recipe.py ├── wired.recipe.py ├── world-today.recipe.py └── wsj-paper.recipe.py ├── recipes_custom └── README.txt ├── requirements.txt ├── static ├── OpenSans-Bold.ttf ├── OpenSans-Regular.ttf ├── OpenSans-Semibold.ttf ├── colours.scss ├── colours_custom.scss ├── favicon.svg ├── index.html ├── opds.scss ├── opds.xsl ├── opds_custom.scss ├── reader.html ├── reader.js ├── reader.scss ├── reader_custom.scss ├── reader_sprites.svg ├── site.js ├── site.scss ├── site_custom.scss ├── theme.js ├── viewer-theme-dark.scss ├── viewer-theme-light.scss ├── viewer-theme.scss └── viewer-theme_custom.scss ├── tests ├── __init__.py └── tests_recipe_utils.py └── tox.ini /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | #max-line-length = 120 3 | extend-ignore = 4 | # let black determine line length 5 | E501, 6 | # ref https://github.com/psf/black/issues/1437 7 | E203 8 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: ['https://buymeacoffee.com/ping/'] 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a problem 4 | title: "[BUG] " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please fill in the required information below: 11 | 12 | **Describe the Bug/Error:** 
13 | 14 | *Please make sure the description is worded well enough to be understood with as much context and examples as possible.* 15 | 16 | **Newsrack repository URL:** 17 | 18 | **Error log (if relevant):** 19 | 20 | GitHub Action run url: 21 | 22 | ```text 23 | # paste error log here, do not use screenshots 24 | ``` 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest a feature 4 | title: "[FEATURE] " 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please fill in the required information below: 11 | 12 | **Describe the Feature Request:** 13 | 14 | *Please make sure the description is worded well enough to be understood with as much context and examples as possible.* 15 | 16 | **Newsrack repository URL:** 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/questions-others-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Questions/Others 3 | about: Questions or other issues (not bug/feature request) 4 | title: "[OTHERS] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please fill in the required information below: 11 | 12 | **Describe the Issue:** 13 | 14 | *Please make sure the description is worded well enough to be understood with as much context and examples as possible.* 15 | 16 | **Newsrack repository URL:** 17 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: "Build" 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | schedule: 8 | # Customise: Cron interval 9 | - cron: "0 3,7,11,15,19,23 * * *" 10 | workflow_dispatch: 11 | inputs: 12 | regenerate: 13 | 
description: Enter csv of recipe slugs to specifically regenerate 14 | required: false 15 | type: string 16 | skip: 17 | description: Enter csv of recipe slugs to specifically skip 18 | required: false 19 | type: string 20 | verbose: 21 | description: Run recipe in verbose mode 22 | required: false 23 | type: boolean 24 | 25 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 26 | permissions: 27 | contents: read 28 | pages: write 29 | id-token: write 30 | 31 | # Allow one concurrent deployment 32 | concurrency: 33 | group: "pages" 34 | cancel-in-progress: true 35 | 36 | jobs: 37 | deploy: 38 | environment: 39 | name: github-pages 40 | url: ${{ steps.deployment.outputs.page_url }} 41 | runs-on: ubuntu-latest 42 | if: github.event_name == 'schedule' || github.ref_name == github.event.repository.default_branch 43 | # Customise: Total job run time limit 44 | timeout-minutes: 60 45 | steps: 46 | - uses: actions/checkout@v4 47 | timeout-minutes: 1 48 | with: 49 | submodules: true 50 | fetch-depth: 1 51 | 52 | - name: Setup node 53 | uses: actions/setup-node@v3 54 | timeout-minutes: 1 55 | with: 56 | node-version: 'lts/*' 57 | 58 | - name: Install npm dependencies 59 | timeout-minutes: 1 60 | run: | 61 | npm install --no-fund 62 | 63 | - uses: actions/setup-python@v4 64 | timeout-minutes: 1 65 | with: 66 | python-version: '3.10' 67 | 68 | - name: Install python requirements 69 | timeout-minutes: 1 70 | run: python -m pip install -r requirements.txt --upgrade 71 | 72 | - name: Install calibre's and other dependencies 73 | timeout-minutes: 1 74 | run: sudo apt-fast update -y && sudo apt-fast install --no-install-recommends -y libegl1 libopengl0 imagemagick 75 | 76 | - name: Get latest calibre version 77 | id: calibrelatest 78 | run: | 79 | tag="$(curl -L --retry 3 --silent --fail 'https://api.github.com/repos/kovidgoyal/calibre/releases/latest' | jq -r .tag_name)" && \ 80 | echo "calibre_ver=${tag#*v}" && \ 81 | echo "calibre_ver=${tag#*v}" >> 
$GITHUB_ENV 82 | 83 | - name: Get calibre binaries cache 84 | id: cache-bin 85 | uses: actions/cache@v3 86 | timeout-minutes: 1 87 | with: 88 | path: cache/calibre 89 | key: cache-calibre-x86_64-${{ env.calibre_ver }} 90 | 91 | - name: Install calibre 92 | timeout-minutes: 1 93 | run: sh .github/workflows/install_calibre.sh 94 | 95 | - name: Download meta artifacts 96 | id: download-meta-artifact 97 | uses: dawidd6/action-download-artifact@v2 98 | timeout-minutes: 1 99 | with: 100 | name: meta-artifacts 101 | path: meta 102 | search_artifacts: true 103 | if_no_artifact_found: warn 104 | 105 | - name: Setup Pages 106 | id: setup_pages 107 | uses: actions/configure-pages@v3 108 | timeout-minutes: 1 109 | 110 | - name: Generate periodicals 111 | env: 112 | CI_PAGES_URL: ${{ steps.setup_pages.outputs.base_url }} 113 | CALIBRE_CONFIG_DIRECTORY: ${{ github.workspace }}/calibre_config/ 114 | regenerate: ${{ github.event.inputs.regenerate }} 115 | skip: ${{ github.event.inputs.skip }} 116 | verbose: ${{ github.event.inputs.verbose }} 117 | accounts: ${{ secrets.accounts }} 118 | run: | 119 | sh build.sh 120 | if [[ -f 'job_summary.md' ]]; then cat 'job_summary.md' >> $GITHUB_STEP_SUMMARY; fi 121 | echo -e "\n<"'!'"-- Commit ${GITHUB_SHA:0:7}, $(ebook-convert --version | head -n1) -->" >> public/index.html 122 | rm -rf "$CALIBRE_CONFIG_DIRECTORY" 123 | 124 | # Ref: https://github.com/actions/starter-workflows/blob/main/pages/static.yml 125 | - name: Upload artifact 126 | uses: actions/upload-pages-artifact@v2 127 | with: 128 | path: ./public 129 | 130 | - name: Deploy to GitHub Pages 131 | id: deployment 132 | uses: actions/deploy-pages@v2 133 | timeout-minutes: 2 134 | 135 | - uses: actions/upload-artifact@v3 136 | timeout-minutes: 1 137 | with: 138 | name: meta-artifacts 139 | path: meta 140 | if-no-files-found: warn 141 | -------------------------------------------------------------------------------- /.github/workflows/install_calibre.sh: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------- 2 | # Install script for calibre 3 | # ------------------------------------------------------- 4 | 5 | bin_folder="$GITHUB_WORKSPACE/cache/calibre" 6 | mkdir -p "$bin_folder" 7 | platform='x86_64' 8 | bin_file="calibre-${platform}.txz" 9 | sig_file="calibre-${platform}.txz.sha512" 10 | 11 | if [ -f "${bin_folder}/${bin_file}" ]; then 12 | echo "Cached $bin_file exists." 13 | else 14 | echo "Cached $bin_file does not exist." 15 | rm -rf "${bin_folder}/calibre-*" 16 | tag="$(curl -L --retry 3 --show-error --silent --fail 'https://api.github.com/repos/kovidgoyal/calibre/releases/latest' | jq -r .tag_name)" && \ 17 | latest_version="${tag#*v}" && \ 18 | echo "Latest version: ${latest_version}" && \ 19 | dl_url="https://github.com/kovidgoyal/calibre/releases/download/${tag}/calibre-${latest_version}-${platform}.txz" && \ 20 | sig_url="https://calibre-ebook.com/signatures/calibre-${latest_version}-${platform}.txz.sha512" && \ 21 | sig2_url="https://code.calibre-ebook.com/signatures/calibre-${latest_version}-${platform}.txz.sha512" && \ 22 | { echo "Downloading sig $sig_url ..." && curl -L --retry 3 --show-error --silent --fail --output "${bin_folder}/${sig_file}" "$sig_url" || \ 23 | echo "Downloading sig $sig2_url ..." && curl -L --retry 3 --show-error --insecure --fail --silent --output "${bin_folder}/${sig_file}" "$sig2_url"; } && \ 24 | echo "Downloading bin $dl_url ..." 25 | curl -L --retry 3 --show-error --silent --fail --output "${bin_folder}/${bin_file}.part" "$dl_url" && \ 26 | echo "$(cat "${bin_folder}/${sig_file}") ${bin_folder}/${bin_file}.part" | sha512sum --check --status && \ 27 | mv "${bin_folder}/${bin_file}.part" "${bin_folder}/${bin_file}" 28 | fi 29 | 30 | if [ -f "${bin_folder}/${bin_file}" ]; then 31 | echo "Install from local..." 
32 | mkdir -p "$HOME/calibre-bin/calibre" && \ 33 | tar xf "${bin_folder}/${bin_file}" -C "$HOME/calibre-bin/calibre" && \ 34 | "$HOME/calibre-bin/calibre/calibre_postinstall" && \ 35 | export PATH=$PATH:$HOME/calibre-bin/calibre && \ 36 | calibre --version && \ 37 | echo "$HOME/calibre-bin/calibre" >> $GITHUB_PATH 38 | fi 39 | 40 | calibre --version || { 41 | echo "Install latest from calibre servers..." 42 | mkdir -p ~/calibre-bin 43 | wget --tries=3 --timeout=30 -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sh /dev/stdin install_dir=~/calibre-bin isolated=y 44 | export PATH=$PATH:$HOME/calibre-bin/calibre 45 | calibre --version 46 | echo "$HOME/calibre-bin/calibre" >> $GITHUB_PATH 47 | } 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | static/*.compiled.js 2 | 3 | .env 4 | meta/ 5 | *.recipe 6 | *.epub 7 | *.mobi 8 | public/ 9 | debug/ 10 | static/*.css 11 | job_summary.md 12 | 13 | # Node # 14 | # Dependency directories 15 | node_modules/ 16 | jspm_packages/ 17 | 18 | venv/ 19 | venv3/ 20 | .vscode/ 21 | 22 | coverage.sh 23 | cov_html/ 24 | 25 | *.iml 26 | .idea/ 27 | 28 | # OS generated files # 29 | .DS_Store 30 | .DS_Store? 31 | ._* 32 | .Spotlight-V100 33 | .Trashes 34 | ehthumbs.db 35 | Thumbs.db 36 | 37 | # Byte-compiled / optimized / DLL files 38 | __pycache__/ 39 | *.py[cod] 40 | *$py.class 41 | 42 | # C extensions 43 | *.so 44 | 45 | # Distribution / packaging 46 | .Python 47 | env/ 48 | build/ 49 | develop-eggs/ 50 | dist/ 51 | downloads/ 52 | eggs/ 53 | .eggs/ 54 | lib/ 55 | lib64/ 56 | parts/ 57 | sdist/ 58 | var/ 59 | *.egg-info/ 60 | .installed.cfg 61 | *.egg 62 | 63 | # PyInstaller 64 | # Usually these files are written by a python script from a template 65 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
66 | *.manifest 67 | *.spec 68 | 69 | # Installer logs 70 | pip-log.txt 71 | pip-delete-this-directory.txt 72 | 73 | # Unit test / coverage reports 74 | htmlcov/ 75 | .tox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *,cover 82 | .hypothesis/ 83 | 84 | # Translations 85 | *.mo 86 | *.pot 87 | 88 | # Django stuff: 89 | *.log 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | target/ 96 | 97 | #Ipython Notebook 98 | .ipynb_checkpoints 99 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # newsrack 2 | 3 | Generate an online "newsrack" of periodicals for your ereader. 4 | 5 | Features: 6 | - Download anywhere using your device browser 7 | - Subscribe via OPDS feeds 8 | 9 | Uses [calibre](https://calibre-ebook.com/) + [recipes](https://manual.calibre-ebook.com/news_recipe.html), [GitHub Actions](.github/workflows/build.yml), and hosted 10 | on [GitHub Pages](https://pages.github.com/). 11 | 12 | ![eInk Kindle Screenshot](https://github.com/ping/newsrack/assets/104607/475daa53-f2d5-4469-b88e-7d5463399d73) 13 | ![Mobile Screenshot](https://github.com/ping/newsrack/assets/104607/76ec3514-8d89-43bc-a68c-909df42971cb) 14 | 15 | [![Buy me a coffee](https://img.buymeacoffee.com/button-api/?text=Buy%20me%20a%20coffee&emoji=&slug=ping&button_colour=FFDD00&font_colour=000000&font_family=Bree&outline_colour=000000&coffee_colour=ffffff)](https://www.buymeacoffee.com/ping) 16 | 17 | ## Running Your Own Instance 18 | 19 | ### General Steps 20 | 21 | 1. Fork this repository. 22 | 2. Create a new branch, for example `custom`. 
Using a new branch makes a few things, like contributing fixes for example, easier. 23 | 3. Add your own recipes to the [`recipes_custom/`](recipes_custom) folder and customise [_recipes_custom.py](_recipes_custom.py). Optional. 24 | 4. Customise the cron schedule and job run time in [.github/workflows/build.yml](.github/workflows/build.yml). Optional. 25 | 5. Set the new branch `custom` as default 26 | - from Settings > Branches > Default branch 27 | 6. Enable Pages in repository settings to deploy from `GitHub Actions` 28 | - from Settings > Pages > Build and deployment > Source 29 | 7. If needed, manually trigger the `Build` workflow from Actions to start your first build. 30 | 31 | ### What Can Be Customised 32 | 33 | `newsrack` supports extensive customisation such as: 34 | - add/remove recipes 35 | - the formats generated 36 | - when recipes are executed 37 | - cover colours and fonts 38 | 39 | Review the [wiki](https://github.com/ping/newsrack/wiki#customisation) page to understand what can be customised and how to do so. 40 | 41 | You can also refer to the [example fork repo](https://github.com/ping/newsrack-fork-test/) and see the [actual customisations](https://github.com/ping/newsrack-fork-test/compare/main...custom#files_bucket) in action. 42 | 43 | 44 | ## Available Recipes 45 | 46 | `newsrack` has its own set of customised recipes. The full list of available recipes can be viewed on [here](https://github.com/ping/newsrack/wiki/Available-Recipes). 
47 | -------------------------------------------------------------------------------- /_opds.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 https://github.com/ping/ 2 | # 3 | # This software is released under the GNU General Public License v3.0 4 | # https://opensource.org/licenses/GPL-3.0 5 | 6 | # Helpers to generate opds xml - extremely minimal 7 | from datetime import datetime 8 | from typing import Dict, Optional 9 | from xml.dom import minidom 10 | 11 | extension_contenttype_map = { 12 | ".epub": "application/epub+zip", 13 | ".mobi": "application/x-mobipocket-ebook", 14 | ".azw": "application/x-mobipocket-ebook", 15 | ".azw3": "application/x-mobi8-ebook", 16 | ".pdf": "application/pdf", 17 | } 18 | 19 | 20 | def simple_tag( 21 | doc_root: minidom.Document, 22 | tag: str, 23 | value: Optional[str] = None, 24 | attributes: Optional[Dict] = None, 25 | ) -> minidom.Element: 26 | new_tag = doc_root.createElement(tag) 27 | if value: 28 | new_tag.appendChild(doc_root.createTextNode(value)) 29 | if attributes: 30 | for k, v in attributes.items(): 31 | new_tag.setAttribute(k, v) 32 | return new_tag 33 | 34 | 35 | def init_feed( 36 | doc: minidom.Document, publish_site: str, feed_id: str, title: str 37 | ) -> minidom.Element: 38 | feed = simple_tag( 39 | doc, 40 | "feed", 41 | attributes={ 42 | "xmlns": "http://www.w3.org/2005/Atom", 43 | "xmlns:dc": "http://purl.org/dc/terms/", 44 | "xmlns:opds": "http://opds-spec.org/2010/catalog", 45 | }, 46 | ) 47 | doc.appendChild(feed) 48 | feed.appendChild(simple_tag(doc, "id", feed_id)) 49 | feed.appendChild(simple_tag(doc, "title", title)) 50 | feed.appendChild(simple_tag(doc, "updated", f"{datetime.now():%Y-%m-%dT%H:%M:%SZ}")) 51 | feed_author = doc.createElement("author") 52 | feed_author.appendChild(simple_tag(doc, "name", publish_site)) 53 | feed_author.appendChild(simple_tag(doc, "uri", publish_site)) 54 | feed.appendChild(feed_author) 55 | return feed 56 | 
-------------------------------------------------------------------------------- /_recipes_custom.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from _recipe_utils import Recipe 4 | 5 | # Define the categories display order, optional 6 | categories_sort: List[str] = [] 7 | 8 | # Define your custom recipes list here 9 | # Example: https://github.com/ping/newsrack-fork-test/blob/custom/_recipes_custom.py 10 | 11 | recipes: List[Recipe] = [ 12 | # Recipe( 13 | # recipe="example", 14 | # slug="example", 15 | # src_ext="epub", 16 | # category="example", 17 | # ), 18 | ] 19 | -------------------------------------------------------------------------------- /babel.config.json: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "@babel/preset-env", 4 | "minify" 5 | ], 6 | "comments": false 7 | } -------------------------------------------------------------------------------- /build-index.js: -------------------------------------------------------------------------------- 1 | // ref https://lunrjs.com/guides/index_prebuilding.html 2 | var lunr = require('lunr'), 3 | stdin = process.stdin, 4 | stdout = process.stdout, 5 | buffer = [] 6 | 7 | stdin.resume() 8 | stdin.setEncoding('utf8') 9 | 10 | stdin.on('data', function (data) { 11 | buffer.push(data) 12 | }) 13 | 14 | // Ref https://github.com/olivernn/lunr.js/blob/aa5a878f62a6bba1e8e5b95714899e17e8150b38/lib/stop_word_filter.js#L43 15 | customStopWordFilter = lunr.generateStopWordFilter(['li']) // to exclude
  • 16 | lunr.Pipeline.registerFunction(customStopWordFilter, 'customStopWordFilter') 17 | 18 | stdin.on('end', function () { 19 | // modified to exclude "/" "<" ">" 20 | lunr.tokenizer.separator = /[\s\-\/<>’]+/ 21 | var documents = JSON.parse(buffer.join('')) 22 | var idx = lunr(function () { 23 | this.ref('id') 24 | this.field('title') 25 | this.field('articles') 26 | this.field('tags') 27 | this.field('category') 28 | this.metadataWhitelist = ['position'] 29 | this.pipeline.before(lunr.stopWordFilter, customStopWordFilter) 30 | 31 | documents.forEach(function (doc) { 32 | this.add(doc) 33 | }, this) 34 | }) 35 | stdout.write(JSON.stringify(idx)) 36 | }) 37 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | for recipe_folder in 'recipes' 'recipes_custom' 2 | do 3 | # copy recipe_folder/*.recipe.py files to *.recipe 4 | if [ -n "$(ls -A "${recipe_folder}"/*.recipe.py 2>/dev/null)" ] 5 | then 6 | for f in "$recipe_folder"/*.recipe.py; do 7 | b="$(basename -- $f)" 8 | cp -p "$f" "${b%.py}" 9 | done 10 | fi 11 | # also support *.recipe files as is in calibre 12 | # copy recipe_folder/*.recipe files to *.recipe 13 | if [ -n "$(ls -A "${recipe_folder}"/*.recipe 2>/dev/null)" ] 14 | then 15 | for f in "$recipe_folder"/*.recipe; do 16 | b="$(basename -- $f)" 17 | cp -p "$f" "$b" 18 | done 19 | fi 20 | done 21 | 22 | mkdir -p public meta \ 23 | && cp -p static/*.svg public/ \ 24 | && cp -p static/opds.xsl public/ \ 25 | && npx babel static/site.js --out-file static/site.compiled.js \ 26 | && npx babel static/reader.js --out-file static/reader.compiled.js \ 27 | && npx babel static/theme.js --out-file static/theme.compiled.js \ 28 | && cp -p static/theme.compiled.js public/theme.min.js \ 29 | && npx sass -s compressed --no-source-map static/site.scss:static/site.css static/reader.scss:static/reader.css 
static/viewer-theme-light.scss:public/viewer-theme-light.css static/viewer-theme-dark.scss:public/viewer-theme-dark.css static/opds.scss:public/opds.css \ 30 | && python3 _generate.py "$CI_PAGES_URL" "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/" "$GITHUB_SHA" "https://github.com/${GITHUB_REPOSITORY}/commit/${GITHUB_SHA}" "${GITHUB_RUN_ID}" "https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" \ 31 | && node build-index.js < public/lunr_docs.json > public/lunr.json \ 32 | && npx html-minifier-terser --input-dir public/ --output-dir public/ --collapse-whitespace --file-ext html \ 33 | && rm -f *.recipe static/*.compiled.js public/lunr_docs.json 34 | -------------------------------------------------------------------------------- /debug.sh: -------------------------------------------------------------------------------- 1 | # helper script for debuging/developing new recipes 2 | if [ -z "$1" ]; 3 | then 4 | echo "No recipe specified." 5 | echo "Usage: sh debug.sh example" 6 | exit 9 7 | fi 8 | 9 | get_abs_dirname() { 10 | # $1 : relative filename 11 | echo "$(cd "$(dirname "$1")" && pwd)/" 12 | } 13 | # use to get shared code 14 | export recipes_includes=$(get_abs_dirname "recipes/includes/recipes_shared.py") 15 | 16 | for recipe_folder in 'recipes' 'recipes_custom' 17 | do 18 | if [ -f "$recipe_folder/$1.recipe.py" ]; then 19 | cp -p "$recipe_folder/$1.recipe.py" "$1.recipe" 20 | fi 21 | if [ -f "$recipe_folder/$1.recipe" ]; then 22 | cp -p "$recipe_folder/$1.recipe" "$1.recipe" 23 | fi 24 | done 25 | 26 | rm -rf debug 27 | ebook-convert "$1.recipe" .epub --test --debug-pipeline debug -vv && \ 28 | open debug/input/index.html 29 | 30 | if [ -f "$1.recipe" ]; then 31 | rm -f "$1.recipe" 32 | fi 33 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "devDependencies": { 3 | "@babel/cli": "^7.21.0", 4 | "@babel/core": 
"^7.21.3", 5 | "@babel/preset-env": "^7.20.2", 6 | "babel-preset-minify": "^0.5.2", 7 | "html-minifier-terser": "^7.1.0", 8 | "lunr": "^2.3.9", 9 | "sass": "^1.60.0", 10 | "terser": "^5.16.6" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /recipes/README.txt: -------------------------------------------------------------------------------- 1 | This folder holds the default recipes distributed with newsrack. 2 | 3 | Do not add your custom recipes to this folder. 4 | Put them in recipes_custom/ instead. 5 | -------------------------------------------------------------------------------- /recipes/aeon.recipe.py: -------------------------------------------------------------------------------- 1 | # No longer working becauses css classes are dynamically generated 2 | import os 3 | import sys 4 | 5 | # custom include to share code between recipes 6 | sys.path.append(os.environ["recipes_includes"]) 7 | from recipes_shared import BasicNewsrackRecipe, format_title, get_date_format 8 | 9 | from calibre.web.feeds.news import BasicNewsRecipe 10 | 11 | _name = "Aeon" 12 | 13 | 14 | class Aeon(BasicNewsrackRecipe, BasicNewsRecipe): 15 | title = _name 16 | __author__ = "ping" 17 | language = "en" 18 | description = ( 19 | "A unique digital magazine, publishing some of the most profound and " 20 | "provocative thinking on the web. We ask the big questions and find " 21 | "the freshest, most original answers, provided by leading thinkers on " 22 | "science, philosophy, society and the arts. 
https://aeon.co/" 23 | ) 24 | encoding = "utf-8" 25 | publication_type = "blog" 26 | masthead_url = "https://aeon.co/logo.png" 27 | oldest_article = 30 28 | max_articles_per_feed = 30 29 | compress_news_images_auto_size = 10 30 | 31 | remove_tags = [ 32 | dict( 33 | class_=[ 34 | "sc-8c8cfef8-0", 35 | "sc-114c07c9-0", 36 | "sc-50e6fb3a-1", 37 | "sc-c3e98e6e-0", 38 | "sc-fd74dcf9-14", 39 | "sc-50e6fb3a-1", 40 | "sc-fd74dcf9-24", 41 | "sc-a70232b9-5", 42 | ] 43 | ), 44 | dict(attrs={"data-test": "footer"}), 45 | ] 46 | remove_attributes = ["align", "style", "width", "height"] 47 | 48 | extra_css = """ 49 | p .sc-2e8621ab-1 { margin-left: 0.5rem; } 50 | .sc-fd74dcf9-18 { margin-right: 0.6rem; } 51 | img.ld-image-block, img.lede-img, .sc-358cfb18-0 img { display: block; max-width: 100%; height: auto; } 52 | .ld-image-caption { font-size: 0.8rem; } 53 | """ 54 | feeds = [(_name, "https://aeon.co/feed.rss")] 55 | 56 | def _find_article(self, data): 57 | if isinstance(data, dict): 58 | return data.get("@type", "") == "Article" 59 | return False 60 | 61 | def preprocess_raw_html_(self, raw_html, url): 62 | soup = self.soup(raw_html) 63 | article = self.get_ld_json(soup, filter_fn=self._find_article) 64 | if not (article and article.get("articleBody")): 65 | err_msg = f"Unable to find article: {url}" 66 | self.log.warning(err_msg) 67 | self.abort_article(err_msg) 68 | 69 | # "%Y-%m-%d" 70 | published_date = self.parse_date(article["datePublished"]) 71 | if (not self.pub_date) or published_date > self.pub_date: 72 | self.pub_date = published_date 73 | self.title = format_title(_name, published_date) 74 | 75 | # display article date 76 | header = soup.find("h1") or soup.find("h2") 77 | if header: 78 | date_ele = soup.new_tag("div", attrs={"class": "custom-date-published"}) 79 | date_ele.append(f"{published_date:{get_date_format()}}") 80 | header.insert_after(date_ele) 81 | 82 | # re-position header image 83 | essay_header = soup.find("div", class_="sc-fd74dcf9-26") 84 | if 
essay_header: 85 | header_img = essay_header.find("img") 86 | attribution = essay_header.find("div", class_="sc-b78f3ea9-3") 87 | if header_img and attribution: 88 | header_img["class"] = "lede-img" 89 | attribution.insert_before(header_img.extract()) 90 | clean_up_ele = essay_header.find(class_="sc-358cfb18-6") 91 | if clean_up_ele: 92 | clean_up_ele.decompose() 93 | 94 | byline = soup.find("div", class_="rah-static") 95 | if byline: 96 | for br in byline.find_all("br"): # extraneous br 97 | br.decompose() 98 | 99 | for link_class in ( 100 | "a.sc-2e8621ab-1", # author link 101 | "a.sc-fd74dcf9-18", # article cat 102 | ): 103 | for a in soup.select(link_class): # tags 104 | a.name = "span" 105 | return str(soup) 106 | 107 | def parse_feeds(self): 108 | return self.group_feeds_by_date( 109 | filter_article=lambda a: "/videos/" not in a.url 110 | ) 111 | -------------------------------------------------------------------------------- /recipes/asian-review.recipe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python -*- 3 | # -*- coding: utf-8 -*- 4 | """ 5 | asianreviewofbooks.com 6 | """ 7 | 8 | # Original from https://github.com/kovidgoyal/calibre/blob/29cd8d64ea71595da8afdaec9b44e7100bff829a/recipes/asianreviewofbooks.recipe 9 | 10 | __license__ = "GPL v3" 11 | __copyright__ = "2012-2017, Darko Miletic " 12 | 13 | import os 14 | import sys 15 | 16 | # custom include to share code between recipes 17 | sys.path.append(os.environ["recipes_includes"]) 18 | from recipes_shared import BasicNewsrackRecipe, format_title 19 | 20 | from calibre.web.feeds.news import BasicNewsRecipe 21 | 22 | _name = "Asian Review of Books" 23 | 24 | 25 | class AsianReviewOfBooks(BasicNewsrackRecipe, BasicNewsRecipe): 26 | title = _name 27 | __author__ = "Darko Miletic" 28 | description = "In addition to reviewing books about or of relevance to Asia, the Asian Review of Books also features long-format essays by 
leading Asian writers and thinkers, to providing an unparalleled forum for discussion of key contemporary issues by Asians for Asia and a vehicle of intellectual depth and breadth where leading thinkers can write on the books, arts and ideas of the day. Widely quoted and referenced, with an archive of more than one thousand book reviews, it is the only web resource dedicated to Asian books. And now, with the addition of the new premium content, the Asian Review of Books, is a must-read publication. https://asianreviewofbooks.com/" # noqa 29 | publisher = "The Asian Review of Books" 30 | category = "literature, books, reviews, Asia" 31 | language = "en" 32 | publication_type = "magazine" 33 | masthead_url = "https://i2.wp.com/asianreviewofbooks.com/content/wp-content/uploads/2016/09/ARBwidelogo.png" 34 | 35 | oldest_article = 30 36 | max_articles_per_feed = 30 37 | 38 | conversion_options = { 39 | "comment": description, 40 | "tags": category, 41 | "publisher": publisher, 42 | "language": language, 43 | } 44 | 45 | remove_attributes = ["width", "height"] 46 | keep_only_tags = [ 47 | dict(name="main"), 48 | ] 49 | remove_tags = [ 50 | dict(class_=["entry-meta", "sharedaddy", "jp-relatedposts", "entry-footer"]) 51 | ] 52 | 53 | extra_css = """ 54 | blockquote { font-size: 1.2rem; margin-left: 0; font-style: italic; } 55 | .wp-caption-text, .entry-featured__caption { display: block; font-size: 0.8rem; margin-top: 0.2rem; } 56 | """ 57 | 58 | feeds = [("Articles", "http://asianreviewofbooks.com/content/feed/")] 59 | 60 | def populate_article_metadata(self, article, soup, _): 61 | if not self.pub_date or self.pub_date < article.utctime: 62 | self.pub_date = article.utctime 63 | self.title = format_title(_name, self.pub_date) 64 | 65 | def preprocess_html(self, soup): 66 | # find empty

    67 | paras = soup.find_all("p") 68 | for p in paras: 69 | if not p.text.strip(): 70 | p.decompose() 71 | 72 | quotes = soup.find_all("h5") 73 | for q in quotes: 74 | q.name = "blockquote" 75 | 76 | bio = soup.find_all("h6") 77 | for b in bio: 78 | b.name = "div" 79 | 80 | return soup 81 | -------------------------------------------------------------------------------- /recipes/bookforum-magazine.recipe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from urllib.parse import urljoin 4 | 5 | # custom include to share code between recipes 6 | sys.path.append(os.environ["recipes_includes"]) 7 | from recipes_shared import BasicNewsrackRecipe 8 | 9 | from mechanize import Request 10 | from calibre.web.feeds.news import BasicNewsRecipe 11 | 12 | _name = "Bookforum" 13 | _issue_url = "" 14 | 15 | 16 | class BookforumMagazine(BasicNewsrackRecipe, BasicNewsRecipe): 17 | title = _name 18 | description = ( 19 | "Bookforum is an American book review magazine devoted to books and " 20 | "the discussion of literature. 
def parse_index(self):
    """Build the issue table of contents from the Bookforum print page.

    Returns a dict_items of (section_name, [article dicts]) as calibre expects.
    """
    soup = self.index_to_soup(
        _issue_url if _issue_url else "https://www.bookforum.com/print"
    )
    meta_ele = soup.find("meta", property="og:title")
    if meta_ele:
        self.title = f'{_name}: {meta_ele["content"]}'

    cover_ele = soup.find("img", class_="toc-issue__cover")
    if cover_ele:
        # Reuse the element we already found instead of re-querying the soup.
        self.cover_url = urljoin("https://www.bookforum.com", cover_ele["src"])
        # The TOC page carries no date, so approximate the published date
        # from the cover image's Last-Modified header (HEAD request only).
        br = self.get_browser()
        cover_res = br.open_novisit(
            Request(self.cover_url, timeout=self.timeout, method="HEAD")
        )
        cover_res_lastupdated = cover_res.get("last-modified", default=None)
        if cover_res_lastupdated:
            self.pub_date = self.parse_date(cover_res_lastupdated)

    articles = {}
    for sect_ele in soup.find_all("div", class_="toc-articles__section"):
        section_name = self.tag_to_string(
            sect_ele.find("a", class_="toc__anchor-links__link")
        )
        for article_ele in sect_ele.find_all("article"):
            title_ele = article_ele.find("h1")
            sub_title_ele = article_ele.find(class_="toc-article__subtitle")
            articles.setdefault(section_name, []).append(
                {
                    "title": self.tag_to_string(title_ele),
                    "url": article_ele.find("a", class_="toc-article__link")["href"],
                    "description": self.tag_to_string(sub_title_ele)
                    if sub_title_ele
                    else "",
                }
            )
    return articles.items()
"js-popup-content", 41 | "referenced-card", 42 | "block--related-topics", 43 | "block-ad-entity", 44 | "block-block-content", 45 | "from-library", 46 | "block-field-blocknodearticlefield-author", # author bio 47 | "mobile_author_card", # author bio 48 | "block-field-blocknodearticlefield-text-to-speech", # article AI audio 49 | ] 50 | ), 51 | dict(name="div", attrs={"data-ad-entity": True}), 52 | dict(name="div", attrs={"data-js-options": True}), 53 | dict(name=["script", "noscript", "style", "svg"]), 54 | ] 55 | 56 | extra_css = """ 57 | .figure__caption { font-size: 0.8rem; } 58 | .figure__caption p { margin-top: 0.2rem; margin-bottom: 1rem; } 59 | """ 60 | 61 | feeds = [ 62 | # ( 63 | # "Latest News", 64 | # "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml", 65 | # ), 66 | ( 67 | "Singapore", 68 | "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=10416", 69 | ), 70 | ( 71 | "Asia", 72 | "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6511", 73 | ), 74 | ( 75 | "Business", 76 | "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6936", 77 | ), 78 | # ( 79 | # "Sport", 80 | # "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=10296", 81 | # ), 82 | # ( 83 | # "World", 84 | # "https://www.channelnewsasia.com/api/v1/rss-outbound-feed?_format=xml&category=6311", 85 | # ), 86 | ] 87 | 88 | def populate_article_metadata(self, article, __, _): 89 | if (not self.pub_date) or article.utctime > self.pub_date: 90 | self.pub_date = article.utctime 91 | self.title = format_title(_name, article.utctime) 92 | -------------------------------------------------------------------------------- /recipes/fivebooks.recipe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 https://github.com/ping/ 2 | # 3 | # This software is released under the GNU General Public License v3.0 4 | # 
https://opensource.org/licenses/GPL-3.0 5 | 6 | """ 7 | fivebooks.com 8 | """ 9 | import os 10 | import re 11 | import sys 12 | from datetime import datetime 13 | 14 | # custom include to share code between recipes 15 | sys.path.append(os.environ["recipes_includes"]) 16 | from recipes_shared import BasicNewsrackRecipe, format_title 17 | 18 | from calibre.web.feeds.news import BasicNewsRecipe 19 | 20 | _name = "Five Books" 21 | 22 | 23 | class FiveBooks(BasicNewsrackRecipe, BasicNewsRecipe): 24 | title = _name 25 | __author__ = "ping" 26 | description = "Expert book recommendations https://fivebooks.com/" 27 | language = "en" 28 | category = "books" 29 | publication_type = "blog" 30 | max_articles_per_feed = 15 31 | masthead_url = "https://fivebooks.com/app/themes/five-books/assets/images/logo.png" 32 | scale_news_images = (400, 400) 33 | 34 | remove_attributes = ["style", "font"] 35 | remove_tags = [ 36 | dict(id=["interview-related", "buyfive"]), 37 | dict( 38 | class_=[ 39 | "listen-button", 40 | "buy-button", 41 | "book-ad", 42 | "-newsletter", 43 | "read-later-and-social", 44 | "further-reading", 45 | "show-for-medium-up", 46 | "hide-for-small", 47 | "book-list-mobile", 48 | "-donate", 49 | "update", 50 | "social-buttons", 51 | "ebook-button", 52 | "book-links", 53 | "bio-component", 54 | ] 55 | ), 56 | dict(name=["script", "noscript", "style"]), 57 | ] 58 | remove_tags_before = [dict(class_=["main-content"])] 59 | remove_tags_after = [dict(class_=["main-content"])] 60 | 61 | extra_css = """ 62 | p.book-number { font-weight: bold; font-size: 1.2rem; } 63 | ul.book-covers { list-style: none; list-style-type: none; padding-left: 0; } 64 | ul.book-covers li { display: block; margin-bottom: 1rem; } 65 | ul.book-covers li .cover-wrap { display: inline-block; vertical-align: top; } 66 | ul.book-covers li p.book-number { display: none; } 67 | ul.book-covers li h2 { display: inline-block; font-size: 0.8rem; margin-left: 1rem; } 68 | p.pullquote { margin-left: 3pt; 
def preprocess_raw_html(self, raw_html, url):
    """Annotate the main content node with the post's modified date and
    description taken from the page's JSON-LD @graph, so that
    populate_article_metadata() can read them back as data- attributes.

    Returns the (possibly annotated) HTML as a string.
    """
    soup = self.soup(raw_html)
    content = soup.find(class_="main-content")
    if content is None:
        # Fix: no main-content node to annotate — previously this fell
        # through and crashed on `content[...] = ...` (None is unsubscriptable).
        return raw_html
    data = self.get_ld_json(soup, lambda d: d.get("@graph", []))
    if not data:
        return raw_html
    graph = data.get("@graph", [])
    if not graph:
        return raw_html
    for g in graph:
        if g.get("@type") != "WebPage":
            continue
        # Fix: use .get() for datePublished too — some pages omit both
        # keys, and the old g["datePublished"] raised KeyError.
        modified = g.get("dateModified") or g.get("datePublished")
        if modified:
            content["data-post-modified-date"] = modified
        content["data-post-description"] = g.get("description", "")
        break
    return str(soup)
def parse_index(self):
    """Scrape the interview listing pages into (feed_name, articles) pairs."""
    br = self.get_browser()
    feeds_articles = {}
    for feed_name, feed_url in self.feeds:
        raw_html = (
            br.open_novisit(feed_url, timeout=self.timeout).read().decode("utf-8")
        )
        soup = self.soup(raw_html)
        # Cap the listing at max_articles_per_feed entries (slicing past the
        # end is a no-op, so no length check is needed).
        interviews = soup.find_all(class_="library-page")[
            : self.max_articles_per_feed
        ]
        entries = []
        for interview in interviews:
            heading = interview.find("h2")
            entries.append(
                {
                    # Collapse runs of whitespace in the heading text.
                    "title": re.sub(r"\s{2,}", " ", heading.text),
                    "url": heading.find("a")["href"],
                    "date": "",
                    "description": "",
                }
            )
        feeds_articles[feed_name] = entries
    return feeds_articles.items()
https://fivethirtyeight.com/" 28 | language = "en" 29 | __author__ = "ping" 30 | 31 | oldest_article = 14 32 | max_articles_per_feed = 10 33 | masthead_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/1/13/FiveThirtyEight_Logo.svg/1024px-FiveThirtyEight_Logo.svg.png" 34 | 35 | reverse_article_order = False 36 | remove_attributes = ["style", "width", "height"] 37 | remove_tags = [dict(class_=["video-title", "videoplayer", "video-footer"])] 38 | 39 | extra_css = """ 40 | h1.article-title { font-size: 1.8rem; margin-bottom: 0.4rem; } 41 | h2.article-subtitle { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; font-weight: normal; } 42 | .single-header-metadata-wrap { margin-bottom: 1rem; } 43 | .single-header-metadata-wrap .vcard { 44 | font-weight: bold; color: #444; margin-right: 0.5rem; 45 | margin-top: 0; margin-bottom: 0; 46 | } 47 | .single-topic { margin-top: 0; margin-bottom: 0; } 48 | .single-featured-image img, p img, .wp-block-image img { margin-bottom: 0.8rem; max-width: 100%; } 49 | .single-featured-image .caption { display: block; font-size: 0.8rem; margin-top: 0.2rem; } 50 | """ 51 | 52 | feeds = [ 53 | (_name, "https://fivethirtyeight.com/"), 54 | ] 55 | 56 | def preprocess_raw_html(self, raw_html, url): 57 | # formulate the api response into html 58 | post = json.loads(raw_html) 59 | 60 | return f""" 61 | {post["title"]["rendered"]} 62 | 63 |

    64 | {post["content"]["rendered"]} 65 |
    66 | """ 67 | 68 | def parse_index(self): 69 | br = self.get_browser() 70 | articles = {} 71 | self.temp_dir = PersistentTemporaryDirectory() 72 | 73 | for feed_name, feed_url in self.feeds: 74 | custom_params = { 75 | "rest_route": "/wp/v2/fte_features", 76 | "espn_verticals_exclude": 67, # Sports 77 | "tags_exclude": 329557888, # Podcasts 78 | } 79 | posts = self.get_posts(feed_url, self.oldest_article, custom_params, br) 80 | 81 | latest_post_date = None 82 | for p in posts: 83 | post_update_dt = self.parse_date( 84 | p["modified_gmt"], tz_info=timezone.utc 85 | ) 86 | if not self.pub_date or post_update_dt > self.pub_date: 87 | self.pub_date = post_update_dt 88 | post_date = self.parse_date(p["date"], tz_info=None, as_utc=False) 89 | if not latest_post_date or post_date > latest_post_date: 90 | latest_post_date = post_date 91 | self.title = format_title(_name, post_date) 92 | 93 | section_name = f"{post_date:{get_date_format()}}" 94 | if len(self.get_feeds()) > 1: 95 | section_name = f"{feed_name}: {post_date:{get_date_format()}}" 96 | if section_name not in articles: 97 | articles[section_name] = [] 98 | 99 | with PersistentTemporaryFile(suffix=".json", dir=self.temp_dir) as f: 100 | f.write(json.dumps(p).encode("utf-8")) 101 | 102 | verticals = [] 103 | if p.get("espn_verticals"): 104 | try: 105 | for terms in p.get("_embedded", {}).get("wp:term", []): 106 | verticals.extend( 107 | [ 108 | t["name"] 109 | for t in terms 110 | if t["taxonomy"] == "espn_verticals" 111 | ] 112 | ) 113 | 114 | except (KeyError, TypeError): 115 | pass 116 | 117 | articles[section_name].append( 118 | { 119 | "title": unescape(p["title"]["rendered"]) or "Untitled", 120 | "url": "file://" + f.name, 121 | "date": f"{post_date:{get_date_format()}}", 122 | "description": unescape(" / ".join(verticals)), 123 | } 124 | ) 125 | return articles.items() 126 | -------------------------------------------------------------------------------- /recipes/forbes-editors-picks.recipe.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | from datetime import datetime, timezone, timedelta 5 | from urllib.parse import urlencode 6 | 7 | # custom include to share code between recipes 8 | sys.path.append(os.environ["recipes_includes"]) 9 | from recipes_shared import BasicNewsrackRecipe, format_title 10 | 11 | from calibre.web.feeds.news import BasicNewsRecipe 12 | 13 | _name = "Forbes - Editor's Picks" 14 | 15 | 16 | class ForbesEditorsPicks(BasicNewsrackRecipe, BasicNewsRecipe): 17 | title = _name 18 | __author__ = "ping" 19 | description = "Forbe's Editors' Picks https://www.forbes.com/editors-picks/" 20 | language = "en" 21 | 22 | oldest_article = 7 23 | max_articles_per_feed = 10 24 | 25 | scale_news_images = (800, 1200) 26 | timeout = 10 27 | simultaneous_downloads = 1 28 | 29 | keep_only_tags = [dict(name="article")] 30 | remove_attributes = ["style", "height", "width"] 31 | 32 | remove_tags = [ 33 | dict( 34 | class_=[ 35 | "story-package__nav-wrapper", 36 | "container__subnav--outer", 37 | "edit-story-container", 38 | "article-sharing", 39 | "vert-pipe", 40 | "short-bio", 41 | "bottom-contrib-block", 42 | "article-footer", 43 | "sigfile", 44 | "hidden", 45 | "link-embed", 46 | "subhead3-embed", 47 | "recirc-module", 48 | "seo", 49 | "top-ad-container", 50 | "speakr-wrapper", 51 | ] 52 | ), 53 | dict(name=["fbs-cordial", "fbs-ad", "svg"]), 54 | ] 55 | 56 | extra_css = """ 57 | .top-label-wrapper a { margin-right: 0.5rem; color: #444; } 58 | .issue { font-weight: bold; margin-bottom: 0.2rem; } 59 | h1 { font-size: 1.8rem; margin-bottom: 0.4rem; } 60 | h2.subhead-embed { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; } 61 | h2.subhead-embed strong { font-weight: normal; } 62 | .top-contrib-block { margin-top: 0.5rem; font-weight: bold; color: #444; } 63 | .content-data { margin-bottom: 1rem; font-weight: normal; color: unset; } 64 | 
def parse_index(self):
    """Page through Forbes' editors-pick stream API until we hit either the
    oldest_article cutoff or max_articles_per_feed.

    Returns a single-feed list: [(_name, [article dicts])].
    """
    br = self.get_browser()
    # datetime.utcnow() is deprecated (3.12); use an aware UTC now directly.
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=self.oldest_article)
    articles = []

    date_param = 0
    content_ids = None
    end_feed = False
    while not end_feed:
        query = {
            "limit": 25,
            "sourceValue": "editors-pick",
            "streamSourceType": "badge",
        }
        if content_ids:
            query["ids"] = content_ids
        if date_param:
            query["date"] = date_param

        endpoint = (
            f"https://www.forbes.com/simple-data/chansec/stream/?{urlencode(query)}"
        )

        res = br.open_novisit(endpoint, timeout=self.timeout)
        res_obj = json.loads(res.read().decode("utf-8"))
        items = res_obj.get("blocks", {}).get("items", [])
        if not items:
            break

        for item in items:
            # API timestamps are epoch milliseconds; utcfromtimestamp() is
            # deprecated, so build the aware datetime in one call.
            item_date = datetime.fromtimestamp(
                item["date"] / 1000.0, tz=timezone.utc
            )
            if item_date < cutoff_date:
                end_feed = True
                break

            if (not self.pub_date) or item_date > self.pub_date:
                self.pub_date = item_date
                self.title = format_title(_name, self.pub_date)

            articles.append(
                {
                    "title": item["title"],
                    "url": item["url"],
                    "description": item["description"],
                    "date": item_date,
                }
            )
            # Pagination cursor: the last item's date/id seed the next page.
            date_param = item["date"]
            content_ids = item["id"]
            if len(articles) >= self.max_articles_per_feed:
                end_feed = True
                break

    return [(_name, articles)]
https://foreignpolicy.com/" 27 | ) 28 | language = "en" 29 | publication_type = "blog" 30 | oldest_article = 7 # days 31 | masthead_url = "https://foreignpolicy.com/wp-content/themes/foreign-policy-2017/assets/src/images/logos/favicon-256.png" 32 | reverse_article_order = False 33 | compress_news_images_auto_size = 12 34 | 35 | remove_tags = [ 36 | dict( 37 | class_=[ 38 | "Apple-converted-space", 39 | "graphic-chatter", 40 | "fp_choose_placement_related_posts", 41 | "sidebar-box_right", 42 | "newsletter-unit-signup", 43 | "newsletter-unit-signup--shortcode-fallback", 44 | ] 45 | ), 46 | dict(style="height:0;opacity:0;"), 47 | dict(name=["noscript"]), 48 | ] 49 | 50 | extra_css = """ 51 | .headline { font-size: 1.8rem; margin-bottom: 0.4rem; } 52 | .article-meta { margin-top: 1rem; margin-bottom: 1rem; } 53 | .article-meta .author { font-weight: bold; color: #444; margin-right: 0.5rem; } 54 | .article-section { display: block; font-weight: bold; color: #444; } 55 | .article-img img, img.attachment-full { display: block; max-width: 100%; height: auto; } 56 | .article-img p, .wp-caption-text { 57 | font-size: 0.8rem; display: block; margin-top: 0.2rem; 58 | } 59 | .pull-quote-sidebar { 60 | display: block; text-align: center; 61 | margin-left: 0; margin-bottom: 0.4rem; font-size: 1.25rem; 62 | } 63 | """ 64 | 65 | feeds = [ 66 | (_name, "https://www.foreignpolicy.com/"), 67 | ] 68 | 69 | def preprocess_raw_html(self, raw_html, url): 70 | # formulate the api response into html 71 | post = json.loads(raw_html) 72 | if not post: 73 | self.abort_article() 74 | date_published_loc = self.parse_date(post["date"], tz_info=None, as_utc=False) 75 | post_authors = self.extract_authors(post) 76 | categories = self.extract_categories(post) 77 | 78 | soup = self.soup( 79 | f""" 80 | {post["title"]["rendered"]} 81 | 82 |
    83 | {f'{" / ".join(categories)}' if categories else ''} 84 |

    {post["title"]["rendered"]}

    85 | 91 |
    92 | """ 93 | ) 94 | 95 | content = self.soup(post["content"]["rendered"]) 96 | # FP doesn't use featuremedia, the first attachment is the lede image 97 | attachment_endpoint = ( 98 | post.get("_links", {}).get("wp:attachment", [{}])[0].get("href") 99 | ) 100 | if attachment_endpoint: 101 | attachment = next( 102 | iter(json.loads(self.index_to_soup(attachment_endpoint, raw=True))), {} 103 | ) 104 | if attachment: 105 | lede = soup.new_tag("div", attrs={"class": "image-attachment"}) 106 | img = soup.new_tag("img", attrs={"src": attachment["source_url"]}) 107 | lede.append(img) 108 | if attachment.get("caption", {}).get("rendered"): 109 | caption = soup.new_tag("div", attrs={"class": "wp-caption-text"}) 110 | caption.append(self.soup(attachment["caption"]["rendered"])) 111 | lede.append(caption) 112 | soup.body.article.append(lede) 113 | 114 | soup.body.article.append(content) 115 | 116 | for img in soup.find_all("img", attrs={"data-lazy-src": True}): 117 | img["src"] = img["data-lazy-src"] 118 | # also cleanup a little 119 | for attribute in ( 120 | "data-lazy-src", 121 | "data-lazy-srcset", 122 | "data-lazy-sizes", 123 | "data-src", 124 | "loading", 125 | ): 126 | if img.get(attribute): 127 | del img[attribute] 128 | 129 | return str(soup) 130 | 131 | def parse_index(self): 132 | articles = {} 133 | br = self.get_browser() 134 | for feed_name, feed_url in self.feeds: 135 | articles = self.get_articles( 136 | articles, feed_name, feed_url, self.oldest_article, {}, br 137 | ) 138 | return articles.items() 139 | -------------------------------------------------------------------------------- /recipes/harvard-intl-review.recipe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 https://github.com/ping/ 2 | # 3 | # This software is released under the GNU General Public License v3.0 4 | # https://opensource.org/licenses/GPL-3.0 5 | 6 | """ 7 | hir.harvard.edu 8 | """ 9 | import os 10 | import sys 11 | from 
def parse_feeds(self):
    """Re-section a single parsed feed into one feed per publication date,
    prepending an author/published-date meta block to every article.

    Feeds lists with anything other than exactly one feed are returned
    unchanged.
    """
    parsed_feeds = super().parse_feeds()
    if len(parsed_feeds or []) != 1:
        return parsed_feeds

    parsed_feed = parsed_feeds[0]
    articles = sorted(
        (a for feed in parsed_feeds for a in feed.articles),
        key=lambda a: a.utctime,
        reverse=True,
    )

    new_feeds = []
    curr_feed = None
    for a in articles:
        date_published = a.utctime.replace(tzinfo=timezone.utc)
        section_title = f"{date_published:{get_date_format()}}"
        self._inject_article_meta(a, date_published)
        # Articles are date-sorted, so a new section starts whenever the
        # date label changes.
        if curr_feed is None or curr_feed.title != section_title:
            if curr_feed is not None:
                new_feeds.append(curr_feed)
            curr_feed = self._new_section_feed(parsed_feed, section_title)
        curr_feed.articles.append(a)
    # Flush the final section. Fix: the previous version `continue`d on the
    # first article and so skipped this flush when the feed contained
    # exactly one article, returning an empty feed list.
    if curr_feed is not None:
        new_feeds.append(curr_feed)

    return new_feeds

def _new_section_feed(self, template_feed, title):
    """Create an empty Feed titled `title` inheriting template_feed's metadata."""
    feed = Feed(log=template_feed.logger)
    feed.title = title
    feed.description = template_feed.description
    feed.image_url = template_feed.image_url
    feed.image_height = template_feed.image_height
    feed.image_alt = template_feed.image_alt
    feed.oldest_article = template_feed.oldest_article
    feed.articles = []
    return feed

def _inject_article_meta(self, a, date_published):
    """Insert an author/published-date block after the article's heading
    (or at the top of the body when there is no leading h1/h2/h3)."""
    soup = self.soup(a.content)
    header = None
    if soup.body.contents[0].name in ["h1", "h2", "h3"]:
        header = soup.body.contents[0]
    meta = soup.new_tag("div", attrs={"class": "article-meta"})
    if a.author:
        author_ele = soup.new_tag("span", attrs={"class": "author"})
        author_ele.append(a.author)
        meta.append(author_ele)
    pub_ele = soup.new_tag("span", attrs={"class": "published-dt"})
    pub_ele.append(f"{date_published:{get_datetime_format()}}")
    meta.append(pub_ele)
    if header:
        header.insert_after(meta)
    else:
        soup.body.insert(0, meta)
    a.content = soup.body.decode_contents()
"site_header", 62 | "nav_anchor_container", 63 | "nav", 64 | "no_js_blocker", 65 | "menu", 66 | "taboola-below-article-thumbnails", 67 | "disqus_thread", 68 | "piano-recommend", 69 | ] 70 | ), 71 | dict( 72 | class_=[ 73 | "clearfix", 74 | "nav_search", 75 | "sub_menu_container", 76 | "sidebar", 77 | "ad", 78 | "site_footer", 79 | "post-attachments", 80 | "post-keywords", 81 | "newsletter-signup", 82 | "DisplayAd", 83 | "jt-subscribe-box", 84 | "single-sns-area", 85 | "single-upper-meta", 86 | "article_footer_ad", 87 | "note-to-commenters", 88 | "note-to-non-commenters", 89 | "pagetop-wrap", 90 | "jt-related-stories", 91 | ] 92 | ), 93 | ] 94 | 95 | extra_css = """ 96 | .article-meta { margin-top: 1rem; margin-bottom: 1rem; } 97 | .article-meta .author { font-weight: bold; color: #444; margin-right: 0.5rem; } 98 | ul.slides { list-style: none; } 99 | .slide_image img { max-width: 100%; height: auto; } 100 | .slide_image div, .inline_image div { font-size: 0.8rem; margin-top: 0.2rem; } 101 | """ 102 | 103 | feeds = [ 104 | ("Top Stories", "https://www.japantimes.co.jp/feed/topstories/"), 105 | ("News", "https://www.japantimes.co.jp/news/feed/"), 106 | ("Opinion", "https://www.japantimes.co.jp/opinion/feed/"), 107 | ("Life", "https://www.japantimes.co.jp/life/feed/"), 108 | ("Community", "https://www.japantimes.co.jp/community/feed/"), 109 | ("Culture", "https://www.japantimes.co.jp/culture/feed/"), 110 | # ("Sports", "https://www.japantimes.co.jp/sports/feed/"), 111 | ] 112 | 113 | def preprocess_html(self, soup): 114 | # "unbullet" the images 115 | slides = soup.find(name="ul", attrs={"class": "slides"}) 116 | if slides: 117 | for img_div in slides.find_all(attrs={"class": "slide_image"}): 118 | slides.insert_after(img_div.extract()) 119 | slides.decompose() 120 | 121 | # embed the lazy loaded images 122 | lazy_loaded_images = soup.find_all(name="img", attrs={"data-src": True}) 123 | for img in lazy_loaded_images: 124 | img["src"] = img["data-src"] 125 | 126 | # 
reformat the article meta 127 | meta = soup.new_tag("div", attrs={"class": "article-meta"}) 128 | credit = soup.find(name="meta", attrs={"name": "cXenseParse:jat-credit"}) 129 | if credit: 130 | sep = credit.get("data-separator", ",") 131 | authors = credit["content"].split(sep) 132 | author_ele = soup.new_tag("span", attrs={"class": "author"}) 133 | author_ele.append(",".join(authors)) 134 | meta.append(author_ele) 135 | pub_date = soup.find(name="meta", attrs={"property": "article:published_time"}) 136 | if pub_date: 137 | pub_date = datetime.fromisoformat(pub_date["content"]) 138 | pub_date_ele = soup.new_tag("span", attrs={"class": "published-date"}) 139 | pub_date_ele.append(f"{pub_date:{get_datetime_format()}}") 140 | meta.append(pub_date_ele) 141 | if (not self.pub_date) or pub_date > self.pub_date: 142 | self.pub_date = pub_date 143 | self.title = format_title(_name, pub_date) 144 | soup.body.h1.insert_after(meta) 145 | return soup 146 | 147 | def parse_feeds(self): 148 | # because feed is not sorted by date 149 | parsed_feeds = super().parse_feeds() 150 | for feed in parsed_feeds: 151 | articles = feed.articles 152 | articles = sorted(articles, key=lambda a: a.utctime, reverse=True) 153 | feed.articles = articles 154 | return parsed_feeds 155 | -------------------------------------------------------------------------------- /recipes/joongangdaily.recipe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 https://github.com/ping/ 2 | # 3 | # This software is released under the GNU General Public License v3.0 4 | # https://opensource.org/licenses/GPL-3.0 5 | 6 | """ 7 | koreajoongangdaily.joins.com 8 | """ 9 | import os 10 | import sys 11 | 12 | # custom include to share code between recipes 13 | sys.path.append(os.environ["recipes_includes"]) 14 | from recipes_shared import BasicNewsrackRecipe, format_title 15 | 16 | from calibre.web.feeds.news import BasicNewsRecipe 17 | 18 | _name = "JoongAng 
Daily" 19 | 20 | 21 | class KoreaJoongAngDaily(BasicNewsrackRecipe, BasicNewsRecipe): 22 | title = _name 23 | description = "The Korea JoongAng Daily is an English-language daily published by the JoongAng Group, Korea’s leading media group, in association with The New York Times. https://koreajoongangdaily.joins.com/" 24 | language = "en" 25 | __author__ = "ping" 26 | publication_type = "newspaper" 27 | masthead_url = ( 28 | "https://koreajoongangdaily.joins.com/resources/images/common/logo.png" 29 | ) 30 | use_embedded_content = True 31 | auto_cleanup = True 32 | compress_news_images_auto_size = 10 33 | 34 | oldest_article = 1 # days 35 | max_articles_per_feed = 60 36 | 37 | extra_css = """ 38 | .caption { font-size: 0.8rem; margin: 0.5rem 0; } 39 | """ 40 | 41 | feeds = [ 42 | ("Korea JoongAng Daily", "https://koreajoongangdaily.joins.com/xmls/joins"), 43 | ] 44 | 45 | def populate_article_metadata(self, article, __, _): 46 | if (not self.pub_date) or article.utctime > self.pub_date: 47 | self.pub_date = article.utctime 48 | self.title = format_title(_name, article.utctime) 49 | 50 | def parse_feeds(self): 51 | return self.group_feeds_by_date(timezone_offset_hours=9) # Seoul time 52 | -------------------------------------------------------------------------------- /recipes/kirkus.recipe.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from urllib.parse import urljoin 4 | 5 | # custom include to share code between recipes 6 | sys.path.append(os.environ["recipes_includes"]) 7 | from recipes_shared import BasicNewsrackRecipe 8 | 9 | from calibre.web.feeds.news import BasicNewsRecipe 10 | 11 | _name = "Kirkus" 12 | 13 | 14 | class Kirkus(BasicNewsrackRecipe, BasicNewsRecipe): 15 | title = _name 16 | description = "Kirkus Reviews is an American book review magazine founded in 1933 by Virginia Kirkus. The magazine is headquartered in New York City. 
https://www.kirkusreviews.com/magazine/current/" 17 | language = "en" 18 | __author__ = "ping" 19 | publication_type = "magazine" 20 | masthead_url = ( 21 | "https://d1fd687oe6a92y.cloudfront.net/img/kir_images/logo/kirkus-nav-logo.svg" 22 | ) 23 | max_articles_per_feed = 99 24 | compress_news_images_auto_size = 6 25 | keep_only_tags = [ 26 | dict( 27 | class_=[ 28 | "article-author", 29 | "article-author-img-start", 30 | "article-author-description-start", 31 | "single-review", 32 | ] 33 | ) 34 | ] 35 | remove_tags = [ 36 | dict( 37 | class_=[ 38 | "sidebar-content", 39 | "article-social-share-desktop-first", 40 | "article-social-share-desktop-pagination", 41 | "article-social-share-mobile", 42 | "share-review-text", 43 | "like-dislike-article", 44 | "rate-this-book-text", 45 | "input-group", 46 | "user-comments", 47 | "show-all-response-text", 48 | "button-row", 49 | "hide-on-mobile", 50 | "related-article", 51 | "breadcrumb-row", 52 | "shop-now-dropdown", 53 | ] 54 | ) 55 | ] 56 | remove_tags_after = [dict(class_="single-review")] 57 | 58 | extra_css = """ 59 | .image-container img { max-width: 100%; height: auto; margin-bottom: 0.2rem; } 60 | .photo-caption { font-size: 0.8rem; margin-bottom: 0.5rem; display: block; } 61 | .book-review-img .image-container { text-align: center; } 62 | .book-rating-module .description-title { font-size: 1.25rem; margin-left: 0; text-align: center; } 63 | """ 64 | 65 | def preprocess_html(self, soup): 66 | h1 = soup.find(class_="article-title") 67 | book_cover = soup.find("ul", class_="book-review-img") 68 | if book_cover: 69 | for li in book_cover.find_all("li"): 70 | li.name = "div" 71 | book_cover.name = "div" 72 | if h1: 73 | book_cover.insert_before(h1.extract()) 74 | 75 | return soup 76 | 77 | def parse_index(self): 78 | issue_url = "https://www.kirkusreviews.com/magazine/current/" 79 | soup = self.index_to_soup(issue_url) 80 | issue = soup.find(name="article", class_="issue-container") 81 | cover_img = 
issue.select(".issue-header .cover-image img") 82 | if cover_img: 83 | self.cover_url = cover_img[0]["src"] 84 | 85 | h1 = issue.find("h1") 86 | if h1: 87 | edition = self.tag_to_string(h1) 88 | self.title = f"{_name}: {edition}" 89 | # Example: April 1, 2023 "%B %d, %Y" 90 | self.pub_date = self.parse_date(edition) 91 | 92 | articles = {} 93 | for book_ele in soup.find_all(name="div", class_="issue-featured-book"): 94 | link = book_ele.find("a") 95 | if not link: 96 | continue 97 | section = self.tag_to_string(book_ele.find("h3")).upper() 98 | articles.setdefault(section, []).append( 99 | {"url": urljoin(issue_url, link["href"]), "title": link["title"]} 100 | ) 101 | 102 | for post_ele in issue.select("div.issue-more-posts ul li div.lead-text"): 103 | link = post_ele.find("a") 104 | if not link: 105 | continue 106 | section = self.tag_to_string(post_ele.find(class_="lead-text-type")).upper() 107 | articles.setdefault(section, []).append( 108 | { 109 | "url": urljoin(issue_url, link["href"]), 110 | "title": self.tag_to_string(link), 111 | } 112 | ) 113 | 114 | for section_ele in issue.select("section.reviews-section"): 115 | section_articles = [] 116 | for review in section_ele.select("ul li.starred"): 117 | link = review.select("h4 a") 118 | if not link: 119 | continue 120 | description = review.find("p") 121 | section_articles.append( 122 | { 123 | "url": urljoin(issue_url, link[0]["href"]), 124 | "title": self.tag_to_string(link[0]), 125 | "description": "" 126 | if not description 127 | else self.tag_to_string(description), 128 | } 129 | ) 130 | if not section_articles: 131 | continue 132 | section = self.tag_to_string(section_ele.find("h3")).upper() 133 | if section not in articles: 134 | articles[section] = [] 135 | articles.setdefault(section, []).extend(section_articles) 136 | 137 | return articles.items() 138 | -------------------------------------------------------------------------------- /recipes/knowable-magazine.recipe.py: 
class KnowableMagazine(BasicNewsrackRecipe, BasicNewsRecipe):
    """Calibre recipe for Knowable Magazine's RSS feed."""

    title = _name
    __author__ = "ping"
    description = (
        "Knowable Magazine explores the real-world significance of scholarly work "
        "through a journalistic lens. We report on the current state of play across "
        "a wide variety of fields — from agriculture to high-energy physics; "
        "biochemistry to water security; the origins of the universe to psychology. "
        "https://knowablemagazine.org/"
    )
    masthead_url = "https://knowablemagazine.org/pb-assets/knowable-assets/images/logo-1586554394067.svg"
    language = "en"
    publication_type = "magazine"
    timeout = 60

    oldest_article = 45  # days
    max_articles_per_feed = 15
    scale_news_images = (800, 1200)

    keep_only_tags = [
        dict(class_=["article-container"]),
    ]
    remove_attributes = ["style"]
    remove_tags = [
        # scripts, inline styles and svg decorations
        dict(name=["script", "style", "svg"]),
        # embedded widgets carry a data-widget-def attribute
        dict(attrs={"data-widget-def": True}),
        dict(id=["newsletter-promo-item"]),
        # promo boxes, share buttons, tags and republish blurbs
        dict(
            class_=[
                "promo",
                "ember-view",
                "promo-article-dark",
                "share-icons-box",
                "article-tags",
                "article-republish",
            ]
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.4rem; }
    .article-subhead { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.5rem; margin-top: 0; }
    .article-byline { margin-top: 0.5rem; margin-bottom: 1rem; }
    .article-byline .author-byline { font-weight: bold; color: #444; display: inline-block; }
    .article-byline .pub-date { display: inline-block; margin-left: 0.5rem; }
    .article-image img {
        display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto;
        box-sizing: border-box;
    }
    .article-image .caption { font-size: 0.8rem; }
    .pull-quote { font-size: 1.25rem; margin-left: 0; text-align: center; }
    """

    feeds = [
        (_name, "https://knowablemagazine.org/rss"),
    ]

    def populate_article_metadata(self, article, __, _):
        """Track the newest article time and stamp it into the ebook title."""
        article_time = article.utctime
        if self.pub_date and article_time <= self.pub_date:
            return
        self.pub_date = article_time
        self.title = format_title(_name, article_time)

    def parse_feeds(self):
        """Regroup the feed into per-day sections (US Pacific time, UTC-7)."""
        return self.group_feeds_by_date(timezone_offset_hours=-7)
/recipes/korea-herald.recipe.py: -------------------------------------------------------------------------------- 1 | """ 2 | koreaherald.com 3 | """ 4 | __license__ = "GPL v3" 5 | __copyright__ = "2011, Seongkyoun Yoo " 6 | 7 | import os 8 | import re 9 | import sys 10 | 11 | # custom include to share code between recipes 12 | sys.path.append(os.environ["recipes_includes"]) 13 | from recipes_shared import BasicNewsrackRecipe, format_title 14 | 15 | from calibre.web.feeds.news import BasicNewsRecipe 16 | 17 | _name = "Korea Herald" 18 | 19 | 20 | class KoreaHerald(BasicNewsrackRecipe, BasicNewsRecipe): 21 | title = _name 22 | language = "en" 23 | description = "Korea Herald News articles https://koreaherald.com/" 24 | __author__ = "Seongkyoun Yoo" 25 | publication_type = "newspaper" 26 | masthead_url = "https://res.heraldm.com/new_201209/images/common/logo.gif" 27 | 28 | oldest_article = 1 29 | max_articles_per_feed = 25 30 | 31 | keep_only_tags = [dict(class_="news_content")] 32 | remove_attributes = ["style", "align"] 33 | remove_tags = [ 34 | dict(name=["script", "style"]), 35 | dict(class_=["news_btn_wrap", "news_journalist_area"]), 36 | ] 37 | 38 | extra_css = """ 39 | h1.news_title { font-size: 1.8rem; margin-bottom: 0.4rem; } 40 | h2.news_title { font-size: 1.2rem; font-style: italic; font-weight: normal; margin-bottom: 0.8rem; } 41 | p.news_date { margin-top: 0.2rem; } 42 | .img_caption { font-size: 0.8rem; margin-top: 0.2rem; display: block; } 43 | """ 44 | 45 | feeds = [ 46 | ("National", "http://www.koreaherald.com/common/rss_xml.php?ct=102"), 47 | ("Business", "http://www.koreaherald.com/common/rss_xml.php?ct=103"), 48 | ("Finance", "http://www.koreaherald.com/common/rss_xml.php?ct=305"), 49 | ("Life & Style", "http://www.koreaherald.com/common/rss_xml.php?ct=104"), 50 | ("Entertainment", "http://www.koreaherald.com/common/rss_xml.php?ct=105"), 51 | # ("Sports", "http://www.koreaherald.com/common/rss_xml.php?ct=106"), 52 | ("World", 
"http://www.koreaherald.com/common/rss_xml.php?ct=107"), 53 | ("Opinion", "http://www.koreaherald.com/common/rss_xml.php?ct=108"), 54 | ] 55 | 56 | def populate_article_metadata(self, article, __, _): 57 | if (not self.pub_date) or article.utctime > self.pub_date: 58 | self.pub_date = article.utctime 59 | self.title = format_title(_name, article.utctime) 60 | 61 | def preprocess_html(self, soup): 62 | byline_date = soup.find(attrs={"class": "view_tit_byline_r"}) 63 | if byline_date: 64 | # format the published/updated date properly 65 | date_elements = [] 66 | # Published : Apr 18, 2022 - 16:41 Updated : Apr 18, 2022 - 16:41 67 | date_re = r"(Published|Updated).+?\:.+?(?P[a-z]{3}\s\d+),.+?(?P