├── .gitignore ├── package.json ├── README.md └── index.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "url-to-markdown", 3 | "version": "0.0.1", 4 | "description": "", 5 | "type": "module", 6 | "main": "index.js", 7 | "scripts": { 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "author": "Tomas Vik (viktomas)", 11 | "license": "MIT", 12 | "devDependencies": { 13 | "jest": "^28.1.1" 14 | }, 15 | "dependencies": { 16 | "@mozilla/readability": "^0.4.2", 17 | "jsdom": "^20.0.0", 18 | "node-fetch": "^3.2.6", 19 | "turndown": "^7.1.1", 20 | "yargs": "^17.5.1" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # url-to-markdown: Parse webpages to Markdown from your shell 2 | 3 | `url-to-markdown` is a script that combines [mozilla/readability](https://github.com/mozilla/readability) with [mixmark-io/turndown](https://github.com/mixmark-io/turndown) to let you parse an article on the web into markdown with one command. 4 | 5 | 1. Install [Node](https://nodejs.org/en/) > 12 6 | 1. Install [Git](https://git-scm.com/) 7 | 1. Clone this repo `git clone https://github.com/viktomas/url-to-markdown.git` 8 | 1. Enter the folder `cd url-to-markdown` 9 | 1. Install dependencies `npm i` 10 | 1. Run the script: 11 | 12 | ```sh 13 | ❯❯❯ node index.js https://example.com 14 | ``` 15 | ```md 16 | This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. 
/**
 * url-to-markdown: download a web page, extract its main article with
 * Mozilla Readability and print that article as Markdown on stdout.
 *
 * Usage: node index.js <url>
 */
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import TurndownService from 'turndown';
import fetch from 'node-fetch';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';

const argv = yargs(hideBin(process.argv)).argv;

if (argv._.length !== 1) {
  console.error('You need to pass URL as the only argument');
  process.exit(1);
}
// Read the URL from the same positional list that was just validated.
// `process.argv[2]` is simply the first token on the command line, which is
// an option (not the URL) whenever an option is passed before the URL.
const url = String(argv._[0]);

/**
 * Fetches a page and converts its readable article content to Markdown.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Markdown rendering of the extracted article.
 * @throws {Error} When the server answers with a non-2xx status, or when
 *   Readability cannot extract an article from the page.
 */
const makeMarkdownFromUrl = async (url) => {
  const response = await fetch(url);
  if (!response.ok) {
    // Without this check an HTTP error page would be converted and printed
    // as if it were the requested article.
    throw new Error(
      `Request failed with status ${response.status} ${response.statusText}`,
    );
  }
  const downloadedPage = await response.text();

  // Use the final URL (after redirects) as the document URL so relative
  // links in the page resolve against the location it was served from.
  const doc = new JSDOM(downloadedPage, {
    url: response.url || url,
  });

  const reader = new Readability(doc.window.document);
  const article = reader.parse();
  if (article == null) {
    // parse() yields null when no article could be extracted; fail with a
    // clear message instead of a TypeError on `article.content`.
    throw new Error('Could not extract readable content from the page');
  }

  const turndownService = new TurndownService({
    headingStyle: 'atx',
  });
  return turndownService.turndown(article.content);
};

makeMarkdownFromUrl(url)
  .then((markdown) => {
    console.log(markdown);
  })
  .catch((err) => {
    // Report the failure on stderr and signal it through the exit code
    // rather than leaving an unhandled promise rejection.
    console.error(`Failed to convert ${url}: ${err.message}`);
    process.exitCode = 1;
  });