├── .gitignore
├── LICENSE
├── README.md
├── mercury.py
├── reader.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Zachary Yocum
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # reader
  2 | Extract clean(er), readable text from web pages via [Mercury Web Parser](https://github.com/postlight/mercury-parser).
  3 | 
  4 | ## A note on the Mercury Web Parser
  5 | The creators of the Mercury Web Parser initially offered it as a free service via a RESTful API, but have since open-sourced it.  The API was shut down on April 15, 2019.  To continue using the parser, install its command-line driver using the [`yarn`](https://github.com/yarnpkg/yarn) or [`npm`](https://github.com/npm/cli) package manager:
  6 | 
  7 | ```
  8 | # Install Mercury globally
  9 | yarn global add @postlight/mercury-parser
 10 | #   or
 11 | npm -g install @postlight/mercury-parser
 12 | ```
 13 | 
 14 | ## Install
 15 | 
 16 | Clone this repository, create a virtual environment, and install the Python requirements:
 17 | 
 18 | ```
 19 | $ python3 -m venv .
 20 | ...
 21 | $ source bin/activate
 22 | (reader) $ pip install -r requirements.txt
 23 | ...
 24 | ```
 25 | 
 26 | ## Usage
 27 | 
 28 | ```
 29 | (reader) $ ./reader.py -h
 30 | usage: reader.py [-h] [-f {json,md,txt}] [-w BODY_WIDTH] filename
 31 | 
 32 | Get a cleaner version of a web page for reading purposes. This script reads
 33 | JSON input from the Mercury Web Parser (https://github.com/postlight/mercury-
 34 | parser) and performs conversion of HTML to markdown and plain-text via
 35 | html2text.
 36 | 
 37 | positional arguments:
 38 |   filename              load Mercury Web Parser JSON result from file (use "-"
 39 |                         to read from stdin)
 40 | 
 41 | optional arguments:
 42 |   -h, --help            show this help message and exit
 43 |   -f {json,md,txt}, --format {json,md,txt}
 44 |                         output format (default: json)
 45 |   -w BODY_WIDTH, --body-width BODY_WIDTH
 46 |                         character offset at which to wrap lines for plain-text
 47 |                         (default: None)
 48 | ```
 49 | 
 50 | Alternatively, there is a `mercury.py` script that acts just like `reader.py`, except that it wraps the `mercury-parser` command line on your behalf.  Instead of loading the JSON from stdin or a file, it runs the Node.js JavaScript internally, so all it requires is a URL:
 51 | 
 52 | ```
 53 | (reader) $ ./mercury.py -h
 54 | usage: mercury.py [-h] [-f {json,md,txt}] [-w BODY_WIDTH] [-p MERCURY_PATH]
 55 |                   url
 56 | 
 57 | Python wrapper of the Mercury Parser command line This requires you've
 58 | installed Node.js (https://nodejs.org/en/) and the mercury-parser
 59 | (https://github.com/postlight/mercury-parser): # Install Mercury globally $
 60 | yarn global add @postlight/mercury-parser # or $ npm -g install
 61 | @postlight/mercury-parser
 62 | 
 63 | positional arguments:
 64 |   url                   URL to parse
 65 | 
 66 | optional arguments:
 67 |   -h, --help            show this help message and exit
 68 |   -f {json,md,txt}, --format {json,md,txt}
 69 |                         output format (default: json)
 70 |   -w BODY_WIDTH, --body-width BODY_WIDTH
 71 |                         character offset at which to wrap lines for plain-text
 72 |                         (default: None)
 73 |   -p MERCURY_PATH, --mercury-path MERCURY_PATH
 74 |                         path to mercury-parser command line driver (default:
 75 |                         /usr/local/bin/mercury-parser)
 76 | ```
 77 | 
 78 | If you installed `mercury-parser` somewhere other than the default path, just supply the path with the `-p/--mercury-path` option.
 79 | 
 80 | ## Examples
 81 | 
 82 | ### Mercury Web Parser JSON
 83 | 
 84 | The Mercury Web Parser's raw JSON results are useful on their own:
 85 | 
 86 | ```
 87 | (reader) $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source | jq .
 88 | {
 89 |   "title": "Mercury Goes Open Source! — Postlight — Digital product studio",
 90 |   "author": "Adam Pash",
 91 |   "date_published": "2019-02-06T14:36:45.000Z",
 92 |   "dek": null,
 93 |   "lead_image_url": "https://postlight.com/wp-content/uploads/2019/02/mercury-open-source-social-card-e1550670446269.png",
 94 |   "content": "<div class=\"body__content\"> <p>It&#x2019;s my pleasure to announce that today, Postlight is open-sourcing the <a href=\"https://mercury.postlight.com/web-parser/\">Mercury Web Parser</a>.</p>\n<p>Written in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, <a href=\"https://mercury.postlight.com/amp-converter/\">Mercury AMP Converter</a>, <a href=\"https://mercury.postlight.com/reader/\">Mercury Reader</a>, and <a href=\"https://postlight.com/trackchanges/the-secret-engines-of-the-internet\">even more third-party software and services.</a></p>\n<p>Mercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.</p>\n<p>Get <a href=\"https://github.com/postlight/mercury-parser\">Mercury Parser</a> for use in your projects on GitHub:</p>\n<blockquote class=\"embedly-card\"> <p>&#x1F4DC; Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.</p>\n</blockquote> <h3>Try Mercury Parser</h3>\n<p>Wanna see Mercury Parser in action in your own command line? First install it:</p>\n<pre>$ yarn global add @postlight/mercury-parser</pre>\n<p>Then parse an article and check out the results:</p>\n<pre>$ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source</pre>\n<p>Now, as an open-source project &#x2014; and with your help &#x2014; we hope to make the Mercury Parser even better. 
Say, for example, Mercury&#x2019;s done a less-than-perfect job parsing an article from your favorite web site. You can <a href=\"https://github.com/postlight/mercury-parser/blob/master/src/extractors/custom/README.md\">write and submit a custom site parser</a> guaranteed to get it right quickly, every time. We&#x2019;re excited about <a href=\"https://github.com/postlight/mercury-parser/blob/master/CONTRIBUTING.md\">all sorts of ways</a> the Mercury community will contribute to this project.</p>\n<h3>What about the API?</h3>\n<p>Over time, we will deprecate the Mercury Parser API. We&#x2019;ll do it slowly, with lots of warning and advance email notifications, and <a href=\"https://github.com/postlight/mercury-parser-api\">drop-in replacement code</a>. We&#x2019;ve committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together&#x2014;not behind a private, hosted API.</p>\n<p>Indeed, one of the main drivers for this choice was API users asking us to open source Mercury&#x2014;and asking how they could help improve it.</p>\n<p>Today we&#x2019;ve done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you&#x2019;d like to chat about the Mercury Parser or need some help getting started, join the community in the <a href=\"https://gitter.im/postlight/mercury\">Mercury Gitter channel</a>.</p>\n<p><em><a href=\"https://postlight.com/trackchanges/authors/adam-pash\">Adam Pash</a> is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: <a href=\"https://postlight.com/cdn-cgi/l/email-protection#6d05080101022d1d021e1901040a0519430e0200\"><span class=\"__cf_email__\">[email&#xA0;protected]</span></a>.</em></p> </div>",
 95 |   "next_page_url": null,
 96 |   "url": "https://postlight.com/trackchanges/mercury-goes-open-source",
 97 |   "domain": "postlight.com",
 98 |   "excerpt": "It’s my pleasure to announce that today, Postlight is open-sourcing the Mercury Web Parser. Written in JavaScript and running on both Node and in the ...",
 99 |   "word_count": 436,
100 |   "direction": "ltr",
101 |   "total_pages": 1,
102 |   "rendered_pages": 1
103 | }
104 | ```
105 | 
106 | ### Full JSON
107 | 
108 | `reader.py` augments the Mercury Web Parser's results with additional Markdown (`.content.markdown`) and plain-text (`.content.text`) conversions of the original HTML content:
109 | 
110 | ```
111 | (reader) $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source | ./reader.py - | jq .
112 | {
113 |   "title": "Mercury Goes Open Source! — Postlight — Digital product studio",
114 |   "author": "Adam Pash",
115 |   "date_published": "2019-02-06T14:36:45.000Z",
116 |   "dek": null,
117 |   "lead_image_url": "https://postlight.com/wp-content/uploads/2019/02/mercury-open-source-social-card-e1550670446269.png",
118 |   "content": {
119 |     "html": "<div class=\"body__content\"> <p>It&#x2019;s my pleasure to announce that today, Postlight is open-sourcing the <a href=\"https://mercury.postlight.com/web-parser/\">Mercury Web Parser</a>.</p>\n<p>Written in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, <a href=\"https://mercury.postlight.com/amp-converter/\">Mercury AMP Converter</a>, <a href=\"https://mercury.postlight.com/reader/\">Mercury Reader</a>, and <a href=\"https://postlight.com/trackchanges/the-secret-engines-of-the-internet\">even more third-party software and services.</a></p>\n<p>Mercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.</p>\n<p>Get <a href=\"https://github.com/postlight/mercury-parser\">Mercury Parser</a> for use in your projects on GitHub:</p>\n<blockquote class=\"embedly-card\"> <p>&#x1F4DC; Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.</p>\n</blockquote> <h3>Try Mercury Parser</h3>\n<p>Wanna see Mercury Parser in action in your own command line? First install it:</p>\n<pre>$ yarn global add @postlight/mercury-parser</pre>\n<p>Then parse an article and check out the results:</p>\n<pre>$ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source</pre>\n<p>Now, as an open-source project &#x2014; and with your help &#x2014; we hope to make the Mercury Parser even better. 
Say, for example, Mercury&#x2019;s done a less-than-perfect job parsing an article from your favorite web site. You can <a href=\"https://github.com/postlight/mercury-parser/blob/master/src/extractors/custom/README.md\">write and submit a custom site parser</a> guaranteed to get it right quickly, every time. We&#x2019;re excited about <a href=\"https://github.com/postlight/mercury-parser/blob/master/CONTRIBUTING.md\">all sorts of ways</a> the Mercury community will contribute to this project.</p>\n<h3>What about the API?</h3>\n<p>Over time, we will deprecate the Mercury Parser API. We&#x2019;ll do it slowly, with lots of warning and advance email notifications, and <a href=\"https://github.com/postlight/mercury-parser-api\">drop-in replacement code</a>. We&#x2019;ve committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together&#x2014;not behind a private, hosted API.</p>\n<p>Indeed, one of the main drivers for this choice was API users asking us to open source Mercury&#x2014;and asking how they could help improve it.</p>\n<p>Today we&#x2019;ve done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you&#x2019;d like to chat about the Mercury Parser or need some help getting started, join the community in the <a href=\"https://gitter.im/postlight/mercury\">Mercury Gitter channel</a>.</p>\n<p><em><a href=\"https://postlight.com/trackchanges/authors/adam-pash\">Adam Pash</a> is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: <a href=\"https://postlight.com/cdn-cgi/l/email-protection#1a727f7676755a6a75696e76737d726e34797577\"><span class=\"__cf_email__\">[email&#xA0;protected]</span></a>.</em></p> </div>",
120 |     "markdown": "It's my pleasure to announce that today, Postlight is open-sourcing the [Mercury Web Parser](https://mercury.postlight.com/web-parser/).\n\nWritten in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, [Mercury AMP Converter](https://mercury.postlight.com/amp-converter/), [Mercury Reader](https://mercury.postlight.com/reader/), and [even more third-party software and services.](https://postlight.com/trackchanges/the-secret-engines-of-the-internet)\n\nMercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.\n\nGet [Mercury Parser](https://github.com/postlight/mercury-parser) for use in your projects on GitHub:\n\n> 📜 Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.\n\n### Try Mercury Parser\n\nWanna see Mercury Parser in action in your own command line? First install it:\n    \n    \n    $ yarn global add @postlight/mercury-parser\n\nThen parse an article and check out the results:\n    \n    \n    $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source\n\nNow, as an open-source project -- and with your help -- we hope to make the Mercury Parser even better. Say, for example, Mercury's done a less-than-perfect job parsing an article from your favorite web site. You can [write and submit a custom site parser](https://github.com/postlight/mercury-parser/blob/master/src/extractors/custom/README.md) guaranteed to get it right quickly, every time. 
We're excited about [all sorts of ways](https://github.com/postlight/mercury-parser/blob/master/CONTRIBUTING.md) the Mercury community will contribute to this project.\n\n### What about the API?\n\nOver time, we will deprecate the Mercury Parser API. We'll do it slowly, with lots of warning and advance email notifications, and [drop-in replacement code](https://github.com/postlight/mercury-parser-api). We've committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together--not behind a private, hosted API.\n\nIndeed, one of the main drivers for this choice was API users asking us to open source Mercury--and asking how they could help improve it.\n\nToday we've done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you'd like to chat about the Mercury Parser or need some help getting started, join the community in the [Mercury Gitter channel](https://gitter.im/postlight/mercury).\n\n_[Adam Pash](https://postlight.com/trackchanges/authors/adam-pash) is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: [ [email protected]](https://postlight.com/cdn-cgi/l/email-protection#1a727f7676755a6a75696e76737d726e34797577)._\n",
121 |     "text": "It's my pleasure to announce that today, Postlight is open-sourcing the Mercury Web Parser.\n\nWritten in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, Mercury AMP Converter, Mercury Reader, and even more third-party software and services.\n\nMercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.\n\nGet Mercury Parser for use in your projects on GitHub:\n\n> 📜 Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.\n\n### Try Mercury Parser\n\nWanna see Mercury Parser in action in your own command line? First install it:\n    \n    \n    $ yarn global add @postlight/mercury-parser\n\nThen parse an article and check out the results:\n    \n    \n    $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source\n\nNow, as an open-source project -- and with your help -- we hope to make the Mercury Parser even better. Say, for example, Mercury's done a less-than-perfect job parsing an article from your favorite web site. You can write and submit a custom site parser guaranteed to get it right quickly, every time. We're excited about all sorts of ways the Mercury community will contribute to this project.\n\n### What about the API?\n\nOver time, we will deprecate the Mercury Parser API. We'll do it slowly, with lots of warning and advance email notifications, and drop-in replacement code. 
We've committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together--not behind a private, hosted API.\n\nIndeed, one of the main drivers for this choice was API users asking us to open source Mercury--and asking how they could help improve it.\n\nToday we've done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you'd like to chat about the Mercury Parser or need some help getting started, join the community in the Mercury Gitter channel.\n\nAdam Pash is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: [email protected].\n"
122 |   },
123 |   "next_page_url": null,
124 |   "url": "https://postlight.com/trackchanges/mercury-goes-open-source",
125 |   "domain": "postlight.com",
126 |   "excerpt": "It’s my pleasure to announce that today, Postlight is open-sourcing the Mercury Web Parser. Written in JavaScript and running on both Node and in the ...",
127 |   "word_count": 436,
128 |   "direction": "ltr",
129 |   "total_pages": 1,
130 |   "rendered_pages": 1
131 | }
132 | ```
133 | 
134 | ### HTML
135 | The original extracted HTML content from the Mercury Web Parser is accessible from `.content.html`:
136 | 
137 | ```
138 | (reader) $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source | ./reader.py - | jq -r .content.html
139 | <div class="body__content"> <p>It&#x2019;s my pleasure to announce that today, Postlight is open-sourcing the <a href="https://mercury.postlight.com/web-parser/">Mercury Web Parser</a>.</p>
140 | <p>Written in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, <a href="https://mercury.postlight.com/amp-converter/">Mercury AMP Converter</a>, <a href="https://mercury.postlight.com/reader/">Mercury Reader</a>, and <a href="https://postlight.com/trackchanges/the-secret-engines-of-the-internet">even more third-party software and services.</a></p>
141 | <p>Mercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.</p>
142 | <p>Get <a href="https://github.com/postlight/mercury-parser">Mercury Parser</a> for use in your projects on GitHub:</p>
143 | <blockquote class="embedly-card"> <p>&#x1F4DC; Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.</p>
144 | </blockquote> <h3>Try Mercury Parser</h3>
145 | <p>Wanna see Mercury Parser in action in your own command line? First install it:</p>
146 | <pre>$ yarn global add @postlight/mercury-parser</pre>
147 | <p>Then parse an article and check out the results:</p>
148 | <pre>$ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source</pre>
149 | <p>Now, as an open-source project &#x2014; and with your help &#x2014; we hope to make the Mercury Parser even better. Say, for example, Mercury&#x2019;s done a less-than-perfect job parsing an article from your favorite web site. You can <a href="https://github.com/postlight/mercury-parser/blob/master/src/extractors/custom/README.md">write and submit a custom site parser</a> guaranteed to get it right quickly, every time. We&#x2019;re excited about <a href="https://github.com/postlight/mercury-parser/blob/master/CONTRIBUTING.md">all sorts of ways</a> the Mercury community will contribute to this project.</p>
150 | <h3>What about the API?</h3>
151 | <p>Over time, we will deprecate the Mercury Parser API. We&#x2019;ll do it slowly, with lots of warning and advance email notifications, and <a href="https://github.com/postlight/mercury-parser-api">drop-in replacement code</a>. We&#x2019;ve committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together&#x2014;not behind a private, hosted API.</p>
152 | <p>Indeed, one of the main drivers for this choice was API users asking us to open source Mercury&#x2014;and asking how they could help improve it.</p>
153 | <p>Today we&#x2019;ve done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you&#x2019;d like to chat about the Mercury Parser or need some help getting started, join the community in the <a href="https://gitter.im/postlight/mercury">Mercury Gitter channel</a>.</p>
154 | <p><em><a href="https://postlight.com/trackchanges/authors/adam-pash">Adam Pash</a> is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: <a href="https://postlight.com/cdn-cgi/l/email-protection#4d25282121220d3d223e3921242a2539632e2220"><span class="__cf_email__">[email&#xA0;protected]</span></a>.</em></p> </div>
155 | ```
156 | 
157 | ### Markdown
158 | A Markdown conversion of the HTML is added in `.content.markdown`, which can be extracted just like the HTML via `jq` in the previous example.  However, as a convenience, `reader.py` can output the document as Markdown (as opposed to JSON), including some of the human-relevant metadata, using the `-f/--format` option:
159 | 
160 | ```
161 | (reader) $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source | ./reader.py - --format=md
162 | 
163 | date: 2019-02-06 14:36:45  
164 | author(s): Adam Pash  
165 | 
166 | # [Mercury Goes Open Source! — Postlight — Digital product studio](https://postlight.com/trackchanges/mercury-goes-open-source)
167 | 
168 | It's my pleasure to announce that today, Postlight is open-sourcing the [Mercury Web Parser](https://mercury.postlight.com/web-parser/).
169 | 
170 | Written in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, [Mercury AMP Converter](https://mercury.postlight.com/amp-converter/), [Mercury Reader](https://mercury.postlight.com/reader/), and [even more third-party software and services.](https://postlight.com/trackchanges/the-secret-engines-of-the-internet)
171 | 
172 | Mercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.
173 | 
174 | Get [Mercury Parser](https://github.com/postlight/mercury-parser) for use in your projects on GitHub:
175 | 
176 | > 📜 Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.
177 | 
178 | ### Try Mercury Parser
179 | 
180 | Wanna see Mercury Parser in action in your own command line? First install it:
181 |     
182 |     
183 |     $ yarn global add @postlight/mercury-parser
184 | 
185 | Then parse an article and check out the results:
186 |     
187 |     
188 |     $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source
189 | 
190 | Now, as an open-source project -- and with your help -- we hope to make the Mercury Parser even better. Say, for example, Mercury's done a less-than-perfect job parsing an article from your favorite web site. You can [write and submit a custom site parser](https://github.com/postlight/mercury-parser/blob/master/src/extractors/custom/README.md) guaranteed to get it right quickly, every time. We're excited about [all sorts of ways](https://github.com/postlight/mercury-parser/blob/master/CONTRIBUTING.md) the Mercury community will contribute to this project.
191 | 
192 | ### What about the API?
193 | 
194 | Over time, we will deprecate the Mercury Parser API. We'll do it slowly, with lots of warning and advance email notifications, and [drop-in replacement code](https://github.com/postlight/mercury-parser-api). We've committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together--not behind a private, hosted API.
195 | 
196 | Indeed, one of the main drivers for this choice was API users asking us to open source Mercury--and asking how they could help improve it.
197 | 
198 | Today we've done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you'd like to chat about the Mercury Parser or need some help getting started, join the community in the [Mercury Gitter channel](https://gitter.im/postlight/mercury).
199 | 
200 | _[Adam Pash](https://postlight.com/trackchanges/authors/adam-pash) is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: [ [email protected]](https://postlight.com/cdn-cgi/l/email-protection#86eee3eaeae9c6f6e9f5f2eaefe1eef2a8e5e9eb)._
201 | 
202 | ```
203 | ### Plain-text
204 | Similarly to the previous example, `reader.py` can also format the whole document, along with a subset of the metadata, as plain-text:
205 | 
206 | ```
207 | (reader) $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source | ./reader.py - --format=txt
208 | 
209 | url: https://postlight.com/trackchanges/mercury-goes-open-source
210 | date: 2019-02-06 14:36:45
211 | author(s): Adam Pash
212 | 
213 | Mercury Goes Open Source! — Postlight — Digital product studio
214 | 
215 | It's my pleasure to announce that today, Postlight is open-sourcing the Mercury Web Parser.
216 | 
217 | Written in JavaScript and running on both Node and in the browser, Mercury Parser is the engine that powers the Mercury Parser API, Mercury AMP Converter, Mercury Reader, and even more third-party software and services.
218 | 
219 | Mercury Parser allows for better reading experiences, easier content migration, and endless opportunities for remixing the web, by making semantic sense out of any article. Mercury Parser sees web pages the same way you do: It sees titles, content, authors, and lead images, and makes all of that extracted data easily available to your software, which, unfortunately, sees only a sea of HTML markup, where page navigation, advertising, and the like are indistinguishable from content.
220 | 
221 | Get Mercury Parser for use in your projects on GitHub:
222 | 
223 | > 📜 Extracting content from the chaos of the web. Contribute to postlight/mercury-parser development by creating an account on GitHub.
224 | 
225 | ### Try Mercury Parser
226 | 
227 | Wanna see Mercury Parser in action in your own command line? First install it:
228 |     
229 |     
230 |     $ yarn global add @postlight/mercury-parser
231 | 
232 | Then parse an article and check out the results:
233 |     
234 |     
235 |     $ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source
236 | 
237 | Now, as an open-source project -- and with your help -- we hope to make the Mercury Parser even better. Say, for example, Mercury's done a less-than-perfect job parsing an article from your favorite web site. You can write and submit a custom site parser guaranteed to get it right quickly, every time. We're excited about all sorts of ways the Mercury community will contribute to this project.
238 | 
239 | ### What about the API?
240 | 
241 | Over time, we will deprecate the Mercury Parser API. We'll do it slowly, with lots of warning and advance email notifications, and drop-in replacement code. We've committed to creating an easy path for people who want to use Mercury in any way they see fit, using open source, well-documented code that can be easily rolled into any other service or API. We want to put our energy there, making a more tractable web together--not behind a private, hosted API.
242 | 
243 | Indeed, one of the main drivers for this choice was API users asking us to open source Mercury--and asking how they could help improve it.
244 | 
245 | Today we've done exactly that. You can use Mercury Parser directly in any JavaScript project, whether on Node or in your browser, starting today, with no API required. If you'd like to chat about the Mercury Parser or need some help getting started, join the community in the Mercury Gitter channel.
246 | 
247 | Adam Pash is a Director of Engineering at Postlight. Want help making sense of big messy data? Get in touch: [email protected].
248 | 
249 | ```
250 | 
251 | ### Read Web Content in Your Terminal
252 | One use case for this script is to convert content from the web to a format that is suitable for reading in your terminal.  Here's a short shell pipeline to extract the content and feed the converted plain-text to your `$PAGER` of choice for easy reading:
253 | 
254 | ```
255 | #!/bin/bash
256 | url=$1
257 | reader=path/to/reader.py
258 | mercury-parser "$url" | "$reader" - -w 80 -f txt | "$PAGER"
259 | ```
260 | 


--------------------------------------------------------------------------------
/mercury.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """Python wrapper of the Mercury Parser command line
 4 | 
 5 | This requires you've installed Node.js 
 6 | (https://nodejs.org/en/) 
 7 | and the mercury-parser 
 8 | (https://github.com/postlight/mercury-parser):
 9 | 
10 | # Install Mercury globally
11 | $ yarn global add @postlight/mercury-parser
12 | # or
13 | $ npm -g install @postlight/mercury-parser
14 | 
15 | """
16 | 
17 | import json
18 | import sys
19 | 
20 | from reader import HTML2Text, Format, unescape, main
21 | 
22 | from Naked.toolshed.shell import muterun_js
23 | 
def mercury(url, mercury_cli_path):
    """Wrap the Mercury Parser command line driver

    url: URL string to parse
    mercury_cli_path: path to mercury-parser command line driver

    Returns the driver's JSON output decoded into a Python object.

    Every failure is reported on stderr and ends the process through
    sys.exit(): with the driver's own exit code when the driver fails, and
    with 1 when its output cannot be decoded or describes a parse error.
    """
    # NOTE(review): muterun_js comes from the third-party Naked package; it
    # looks like it joins its arguments into a single command string, so URLs
    # containing shell metacharacters (e.g. '&') may need quoting -- confirm
    # against the Naked documentation before changing the call.
    response = muterun_js(
        mercury_cli_path,
        url
    )
    if response.exitcode != 0:
        print('[ERROR] URL: {}'.format(url), file=sys.stderr)
        print('[ERROR]', response.stderr.decode('utf-8'), file=sys.stderr)
        sys.exit(response.exitcode)

    try:
        result = json.loads(response.stdout.decode('utf-8'))
    except ValueError as err:
        # Both json.JSONDecodeError and UnicodeDecodeError derive from
        # ValueError; report them like every other failure of this function
        # instead of surfacing a traceback.
        print('[ERROR] URL: {}'.format(url), file=sys.stderr)
        print(
            '[ERROR] could not decode mercury-parser output: {}'.format(err),
            file=sys.stderr
        )
        sys.exit(1)

    if 'error' in result:
        print('[ERROR] URL: {}'.format(url), file=sys.stderr)
        # Prefer the 'messages' entry, but fall back to other entries so a
        # differently shaped error payload is reported rather than raising
        # KeyError while handling the error.
        details = result.get(
            'messages',
            result.get('message', result['error'])
        )
        print('[ERROR]', details, file=sys.stderr)
        sys.exit(1)
    return result
45 | 
if __name__ == '__main__':
    import argparse

    # Command line interface: one positional URL plus optional output,
    # wrapping and driver-location settings.
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('url', help='URL to parse')
    cli.add_argument(
        '-f', '--format',
        default='json',
        choices=list(Format.formatter),
        help='output format',
    )
    cli.add_argument(
        '-w', '--body-width',
        default=None,
        type=int,
        help='character offset at which to wrap lines for plain-text',
    )
    cli.add_argument(
        '-p', '--mercury-path',
        default='/opt/homebrew/bin/mercury-parser',
        help='path to mercury-parser command line driver',
    )
    options = cli.parse_args()

    # Fetch and parse the page, convert its content, then render it with the
    # formatter selected on the command line.
    parsed = mercury(options.url, options.mercury_path)
    document = main(parsed, options.body_width)
    render = Format.formatter[options.format]
    print(render(document))
79 | 


--------------------------------------------------------------------------------
/reader.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """Get a cleaner version of a web page for reading purposes.
  4 | 
  5 | This script reads JSON input from the Mercury Web Parser 
  6 | (https://github.com/postlight/mercury-parser) and performs conversion of HTML 
  7 | to markdown and plain-text via html2text.
  8 | """
  9 | 
 10 | import sys
 11 | import json
 12 | import textwrap
 13 | 
 14 | from datetime import datetime
 15 | from html import unescape
 16 | from html2text import HTML2Text
 17 | 
 18 | class Format():
 19 |     """This is a decorator class for registering document format methods.
 20 |     
 21 |     You can register additional document formatter functions by decorating
 22 |     them with @Format.
 23 |     
 24 |     A formatter should be a function that takes as input a response object
 25 |     from the Mercury API.  It's output can be any string derived from that
 26 |     input.
 27 |     
 28 |     By convention formatters should have a '_format' suffix in their function
 29 |     name.  By this convention, if you have a formatter named 'json_format',
 30 |     then you can call this with Format.formatter['json']().
 31 |     """
 32 |     formatter = {}
 33 |     def __init__(self, f):
 34 |         key, _ = f.__name__.rsplit('_', 1)
 35 |         self.formatter.update({key: f})
 36 |         self.format = f
 37 |     
 38 |     def __call__(self):
 39 |         self.format()
 40 | 
 41 | def format_date(obj):
 42 |     date = obj.get('date_published')
 43 |     if date is not None:        
 44 |         obj['date_published'] = datetime.strptime(
 45 |             obj['date_published'],
 46 |             "%Y-%m-%dT%H:%M:%S.%fZ"
 47 |         )
 48 | 
 49 | @Format
 50 | def json_format(obj):
 51 |     """Formatter that formats as JSON"""
 52 |     return json.dumps(obj, ensure_ascii=False)
 53 | 
 54 | @Format
 55 | def md_format(obj):
 56 |     """Formatter that formats as markdown"""
 57 |     format_date(obj)
 58 |     content = '''
 59 |     date: {date_published}  
 60 |     author(s): {author}  
 61 |     
 62 |     # [{title}]({url})
 63 |     '''
 64 |     return '\n'.join((
 65 |         textwrap.dedent(content.format(**obj)),
 66 |         obj['content'].get('markdown', '')
 67 |     ))
 68 | 
 69 | @Format
 70 | def txt_format(obj):
 71 |     """Formatter that formats as plain-text"""
 72 |     format_date(obj)
 73 |     content = '''
 74 |     url: {url}
 75 |     date: {date_published}
 76 |     author(s): {author}
 77 |     
 78 |     {title}
 79 |     '''
 80 |     return '\n'.join((
 81 |         textwrap.dedent(content.format(**obj)),
 82 |         obj['content'].get('text', '')
 83 |     ))
 84 | 
 85 | def load(filename):
 86 |     """Load Mercury Web Parser JSON results from file as a Python dict"""
 87 |     try:
 88 |         if filename in {"-", None}:
 89 |             return json.loads(sys.stdin.read())
 90 |         with open(filename, mode='r') as f:
 91 |             return json.load(f)
 92 |     except json.JSONDecodeError:
 93 |         print(f'failed to load JSON from file: {filename}', file=sys.stderr)
 94 |         sys.exit(1)
 95 | 
 96 | def main(result, body_width):
 97 |     """Convert Mercury parse result dict to Markdown and plain-text
 98 |     
 99 |     result: a mercury-parser result (as a Python dict)
100 |     """
101 |     text = HTML2Text()
102 |     text.body_width = body_width
103 |     text.ignore_emphasis = True
104 |     text.ignore_images = True
105 |     text.ignore_links = True
106 |     text.convert_charrefs = True
107 |     markdown = HTML2Text()
108 |     markdown.body_width = body_width
109 |     markdown.convert_charrefs = True
110 |     result['content'] = {
111 |         'html': result['content'],
112 |         'markdown': unescape(markdown.handle(result['content'])),
113 |         'text': unescape(text.handle(result['content']))
114 |     }
115 |     return result
116 | 
if __name__ == '__main__':
    import argparse

    # Command line interface: a JSON source plus optional output format and
    # wrapping width.
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    filename_help = (
        'load Mercury Web Parser JSON result from file (use "-" '
        'to read from stdin)'
    )
    cli.add_argument('filename', help=filename_help)
    cli.add_argument(
        '-f', '--format',
        default='json',
        choices=list(Format.formatter),
        help='output format',
    )
    cli.add_argument(
        '-w', '--body-width',
        default=None,
        type=int,
        help='character offset at which to wrap lines for plain-text',
    )
    options = cli.parse_args()

    # Load the parser result, convert its content, then render it with the
    # formatter selected on the command line.
    loaded = load(options.filename)
    document = main(loaded, options.body_width)
    render = Format.formatter[options.format]
    print(render(document))
148 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi==2021.10.8
2 | charset-normalizer==2.0.12
3 | html2text==2020.1.16
4 | idna==3.3
5 | requests==2.27.1
6 | urllib3==1.26.9
7 | 


--------------------------------------------------------------------------------