├── .node-version ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── something-else.md │ ├── question.md │ ├── feature_request.md │ └── bug_report.md └── workflows │ ├── lint.yml │ └── test.yml ├── .eslintignore ├── .gitignore ├── packages ├── html-to-md │ ├── CHANGELOG.md │ ├── test │ │ ├── snapshots │ │ │ ├── tags.js.snap │ │ │ ├── html-to-md.js.snap │ │ │ ├── html-to-md.js.md │ │ │ └── tags.js.md │ │ ├── html-to-md.js │ │ └── tags.js │ ├── rollup.config.js │ ├── README.md │ ├── package.json │ └── src │ │ ├── html-to-md.js │ │ ├── table-printer.js │ │ └── md-formatters.js ├── html-to-text │ ├── test │ │ ├── test-address.txt │ │ ├── test-orderby-occurrence.txt │ │ ├── test-orderby-selectors.txt │ │ ├── test-multiple-elements.txt │ │ ├── test.txt │ │ └── test.html │ ├── rollup.config.js │ ├── package.json │ ├── src │ │ ├── table-printer.js │ │ ├── html-to-text.js │ │ └── text-formatters.js │ └── CHANGELOG.md ├── base │ ├── package.json │ └── src │ │ ├── stack-item.js │ │ ├── generic-formatters.js │ │ ├── util.js │ │ ├── inline-text-builder.js │ │ ├── index.js │ │ ├── whitespace-processor.js │ │ ├── typedefs.js │ │ └── block-text-builder.js └── html-to-text-cli │ ├── rollup.config.js │ ├── CHANGELOG.md │ ├── package.json │ ├── src │ └── cli.js │ └── README.md ├── SECURITY.md ├── example ├── html-to-md.js ├── html-to-text.js └── test.html ├── LICENSE ├── package.json ├── README.md └── .eslintrc.cjs /.node-version: -------------------------------------------------------------------------------- 1 | 14.21 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | ko_fi: killymxi 2 | 
-------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | **/*{.,-}min.js 2 | node_modules 3 | coverage 4 | lib 5 | bin 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .nyc_output 3 | .vscode 4 | coverage 5 | node_modules 6 | lib 7 | bin 8 | __*.* 9 | packages/*/LICENSE 10 | -------------------------------------------------------------------------------- /packages/html-to-md/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Version 0.5.0 4 | 5 | Initial release. 6 | 7 | Targeting Node.js version >= 14. 8 | -------------------------------------------------------------------------------- /packages/html-to-md/test/snapshots/tags.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/html-to-text/node-html-to-text/HEAD/packages/html-to-md/test/snapshots/tags.js.snap -------------------------------------------------------------------------------- /packages/html-to-md/test/snapshots/html-to-md.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/html-to-text/node-html-to-text/HEAD/packages/html-to-md/test/snapshots/html-to-md.js.snap -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/something-else.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Something else 3 | about: Blank issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /packages/html-to-text/test/test-address.txt: 
-------------------------------------------------------------------------------- 1 | INVOICE ADDRESS SHIPMENT ADDRESS 2 | Mr. Mr. 3 | John Doe John Doe 4 | Featherstone Street 49 Featherstone Street 49 5 | 28199 Bremen 28199 Bremen -------------------------------------------------------------------------------- /packages/html-to-text/test/test-orderby-occurrence.txt: -------------------------------------------------------------------------------- 1 | INVOICE ADDRESS SHIPMENT ADDRESS 2 | Mr. Mr. 3 | John Doe John Doe 4 | Featherstone Street 49 Featherstone Street 49 5 | 28199 Bremen 28199 Bremen 6 | 7 | Some Company 8 | Some Street 42 9 | Somewhere 10 | E-Mail: Click here [test@example.com] -------------------------------------------------------------------------------- /packages/html-to-text/test/test-orderby-selectors.txt: -------------------------------------------------------------------------------- 1 | Some Company 2 | Some Street 42 3 | Somewhere 4 | E-Mail: Click here [test@example.com] 5 | 6 | INVOICE ADDRESS SHIPMENT ADDRESS 7 | Mr. Mr. 
8 | John Doe John Doe 9 | Featherstone Street 49 Featherstone Street 49 10 | 28199 Bremen 28199 Bremen -------------------------------------------------------------------------------- /packages/html-to-md/rollup.config.js: -------------------------------------------------------------------------------- 1 | 2 | const { nodeResolve } = require('@rollup/plugin-node-resolve'); 3 | 4 | 5 | /** 6 | * @type {import('rollup').RollupOptions} 7 | */ 8 | module.exports = { 9 | input: 'src/html-to-md.js', 10 | output: [ 11 | { file: 'lib/html-to-md.mjs', format: 'es' }, 12 | { file: 'lib/html-to-md.cjs', format: 'cjs' } 13 | ], 14 | plugins: [ 15 | nodeResolve({ resolveOnly: ['@html-to-text/base'] }) 16 | ], 17 | }; 18 | -------------------------------------------------------------------------------- /packages/html-to-text/rollup.config.js: -------------------------------------------------------------------------------- 1 | 2 | const { nodeResolve } = require('@rollup/plugin-node-resolve'); 3 | 4 | 5 | /** 6 | * @type {import('rollup').RollupOptions} 7 | */ 8 | module.exports = { 9 | input: 'src/html-to-text.js', 10 | output: [ 11 | { file: 'lib/html-to-text.mjs', format: 'es' }, 12 | { file: 'lib/html-to-text.cjs', format: 'cjs' }, 13 | ], 14 | plugins: [ 15 | nodeResolve({ resolveOnly: ['@html-to-text/base'] }) 16 | ], 17 | }; 18 | -------------------------------------------------------------------------------- /packages/base/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@html-to-text/base", 3 | "version": "0.5.0", 4 | "private": true, 5 | "description": "Base package for html-to-x converters", 6 | "license": "MIT", 7 | "engines": { 8 | "node": ">=14" 9 | }, 10 | "type": "module", 11 | "main": "src/index.js", 12 | "dependencies": { 13 | "@selderee/plugin-htmlparser2": "^0.11.0", 14 | "deepmerge": "^4.3.1", 15 | "dom-serializer": "^2.0.0", 16 | "htmlparser2": "^8.0.2", 17 | "selderee": "^0.11.0" 18 | } 19 | } 
20 | -------------------------------------------------------------------------------- /packages/html-to-text-cli/rollup.config.js: -------------------------------------------------------------------------------- 1 | 2 | const json = require('@rollup/plugin-json'); 3 | const { nodeResolve } = require('@rollup/plugin-node-resolve'); 4 | 5 | /** 6 | * @type {import('rollup').RollupOptions} 7 | */ 8 | module.exports = { 9 | input: 'src/cli.js', 10 | output: [ 11 | { 12 | banner: '#!/usr/bin/env node\n', 13 | file: 'bin/cli.js', 14 | format: 'es', 15 | } 16 | ], 17 | plugins: [ 18 | json(), 19 | nodeResolve({ resolveOnly: ['html-to-text'] }) 20 | ], 21 | }; 22 | -------------------------------------------------------------------------------- /packages/html-to-md/README.md: -------------------------------------------------------------------------------- 1 | # `@html-to/md` 2 | 3 | TODO: badges 4 | 5 | Advanced converter that parses HTML and returns MarkDown. 6 | 7 | Customizable like [html-to-text](https://github.com/html-to-text/node-html-to-text) but with output format reusable in other systems. 8 | 9 | ## Changelog 10 | 11 | Available here: [CHANGELOG.md](https://github.com/html-to-text/node-html-to-text/blob/master/packages/html-to-md/CHANGELOG.md) 12 | 13 | ## Installation 14 | 15 | ## Usage 16 | 17 | ## Options 18 | 19 | TODO: copy or reorganize shared docs? 
20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: When you don't know what you're doing 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **The goal** 11 | *What are you trying to achieve?* 12 | 13 | **Best attempt** 14 | *What have you tried so far to achieve your goal?* 15 | 16 | **The question** 17 | *What can't you figure out yourself?* 18 | 19 | **Prior research** 20 | *Where did you look and what have you tried so far in order to figure out the answer before writing here? Is there anything else you can try?* 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: To help html-to-text become more capable 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Problem to solve** 11 | *What kind of problem were you trying to solve when you realized a feature was missing?* 12 | 13 | **What works** 14 | *How close to your goal can you currently get?* 15 | 16 | **What is missing** 17 | *What prevents you from achieving your goal?* 18 | 19 | **How the missing feature should be implemented** 20 | *Explain any technical details here.* 21 | 22 | **Bigger picture** 23 | *Do you think this feature has different uses? Is there a better way to do it? 
What other options you've considered?* 24 | -------------------------------------------------------------------------------- /packages/html-to-text-cli/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Version 0.5.4 4 | 5 | - Based on `html-to-text` version 9.0.5 ([changelog](https://github.com/html-to-text/node-html-to-text/blob/master/packages/html-to-text/CHANGELOG.md)) 6 | 7 | ## Version 0.5.3 8 | 9 | - Based on `html-to-text` version 9.0.4 ([changelog](https://github.com/html-to-text/node-html-to-text/blob/master/packages/html-to-text/CHANGELOG.md)) 10 | 11 | ## Version 0.5.2 12 | 13 | - Based on `html-to-text` version 9.0.2 ([changelog](https://github.com/html-to-text/node-html-to-text/blob/master/packages/html-to-text/CHANGELOG.md)) 14 | 15 | ## Version 0.5.1 16 | 17 | - Fix missing dependencies 18 | 19 | ## Version 0.5.0 20 | 21 | - Initial release 22 | - Requires Node.js version >= 14.13.1 23 | - Based on `html-to-text` version 9.0.0 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: When html-to-text output is wrong 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Minimal HTML example** 11 | 12 | ```html 13 | 14 | ``` 15 | 16 | **Options** 17 | 18 | ```javascript 19 | {} // Replace this line with the options you use, if it's not default 20 | ``` 21 | 22 | **Observed output** 23 | 24 | ``` 25 | Replace this line with what you get 26 | ``` 27 | 28 | **Expected output** 29 | 30 | ``` 31 | Replace this line with what you expected to see 32 | ``` 33 | 34 | **Version information** 35 | 36 | - html-to-text: 37 | - node: 38 | 39 | ---- 40 | 41 | 42 | -------------------------------------------------------------------------------- /SECURITY.md: 
-------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | | Major Version | Minor Version | Supported | 6 | | ------------- | ------------- | ------------------ | 7 | | latest | latest | :white_check_mark: | 8 | | latest | latest - 1 | :white_check_mark: | 9 | | latest - 1 | latest | :white_check_mark: | 10 | | * | * | :x: | 11 | 12 | ## Reporting a Vulnerability 13 | 14 | You can write to `killy@mxii.eu.org`. 15 | 16 | If you receive no reply within 3 days for whatever reason, you can [open an issue](https://github.com/html-to-text/node-html-to-text/issues/new?assignees=KillyMXI&labels=security&template=something-else.md) to confirm the email receipt. 17 | 18 | In case of a bus factor - try to contact another member: . 19 | -------------------------------------------------------------------------------- /example/html-to-md.js: -------------------------------------------------------------------------------- 1 | import { readFileSync } from 'fs'; 2 | 3 | import { htmlToMarkdown } from '../packages/html-to-md/src/html-to-md'; 4 | // const { htmlToMarkdown } = require('../packages/html-to-md/lib/html-to-md'); // build it first 5 | 6 | 7 | console.log('From string:'); 8 | const text = htmlToMarkdown( 9 | '

Hello World

', 10 | {} 11 | ); 12 | console.log(text); 13 | console.log(); 14 | 15 | console.log('From file:'); 16 | const filePath = new URL('test.html', import.meta.url); 17 | /** @type { Options } */ 18 | const options = { 19 | selectors: [ 20 | { selector: 'table', format: 'block' }, 21 | { selector: 'table#invoice', format: 'dataTable' }, 22 | { selector: 'table.address', format: 'dataTable' }, 23 | ] 24 | }; 25 | const text2 = htmlToMarkdown(readFileSync(filePath, 'utf8'), options); 26 | console.log(text2); 27 | -------------------------------------------------------------------------------- /example/html-to-text.js: -------------------------------------------------------------------------------- 1 | import { readFileSync } from 'fs'; 2 | 3 | import { htmlToText } from '../packages/html-to-text/src/html-to-text'; 4 | // const { htmlToText } = require('../packages/html-to-text/lib/html-to-text'); // build it first 5 | 6 | 7 | console.log('From string:'); 8 | const text = htmlToText( 9 | '

Hello World

', 10 | { wordwrap: 130 } 11 | ); 12 | console.log(text); 13 | console.log(); 14 | 15 | console.log('From file:'); 16 | const filePath = new URL('test.html', import.meta.url); 17 | /** @type { Options } */ 18 | const options = { 19 | selectors: [ 20 | { selector: 'table', format: 'block' }, 21 | { selector: 'table#invoice', format: 'dataTable' }, 22 | { selector: 'table.address', format: 'dataTable' }, 23 | ] 24 | }; 25 | const text2 = htmlToText(readFileSync(filePath, 'utf8'), options); 26 | console.log(text2); 27 | -------------------------------------------------------------------------------- /packages/html-to-text/test/test-multiple-elements.txt: -------------------------------------------------------------------------------- 1 | At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 2 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum 3 | dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor 4 | invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos 5 | et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea 6 | takimata sanctus est Lorem ipsum dolor sit amet. Github [www.github.com] 7 | 8 | At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 9 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum 10 | dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor 11 | invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos 12 | et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea 13 | takimata sanctus est Lorem ipsum dolor sit amet. 
14 | 15 | Some Company 16 | Some Street 42 17 | Somewhere 18 | E-Mail: Click here [test@example.com] -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | lint: 7 | name: Run linters 8 | 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - name: Setup Node.js 16.x 15 | uses: actions/setup-node@v3 16 | with: 17 | node-version: 16.x 18 | 19 | - name: Setup NPM v7 20 | run: npm i -g npm@7 21 | 22 | - name: Get npm cache directory 23 | id: npm-cache-dir 24 | run: echo "dir=$(npm config get cache)" >> $GITHUB_OUTPUT 25 | 26 | - uses: actions/cache@v3 27 | id: npm-cache 28 | with: 29 | path: ${{ steps.npm-cache-dir.outputs.dir }} 30 | key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} 31 | restore-keys: | 32 | ${{ runner.os }}-node- 33 | 34 | - name: Touch cli file to prevent npm ci from freaking out 35 | uses: DamianReeves/write-file-action@v1.2 36 | with: 37 | path: packages/html-to-text-cli/bin/cli.js 38 | contents: '// placeholder' 39 | 40 | - run: npm ci 41 | 42 | - run: npm run lint 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Portions Copyright (c) 2012-2019 werk85 4 | Portions Copyright (c) 2020-2022 KillyMXI 5 | 6 | Permission is hereby granted, free of charge, to any person 7 | obtaining a copy of this software and associated documentation 8 | files (the "Software"), to deal in the Software without 9 | restriction, including without limitation the rights to use, 10 | copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following 13 
| conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /packages/html-to-md/test/html-to-md.js: -------------------------------------------------------------------------------- 1 | 2 | import test from 'ava'; 3 | 4 | import { htmlToMarkdown } from '../src/html-to-md'; 5 | 6 | 7 | const snapshotMacro = test.macro({ 8 | exec: function (t, html, options = undefined) { 9 | t.snapshot(htmlToMarkdown(html, options), '```html\n' + html + '\n```'); 10 | } 11 | }); 12 | 13 | test( 14 | 'should encode characters that can be confused as a part of markdown', 15 | snapshotMacro, 16 | '

!#[]()*+-.\\_`{}

' 17 | ); 18 | 19 | test( 20 | 'should not encode characters inside urls', 21 | snapshotMacro, 22 | '' 23 | ); 24 | 25 | test( 26 | 'should encode characters inside alt text and title', 27 | snapshotMacro, 28 | '**alt text**' 29 | ); 30 | 31 | test( 32 | 'should allow to disable encoding of some characters encoded by default', 33 | snapshotMacro, 34 | '

!#[]()*+-.\\_`{}

', 35 | { encodeCharacters: { '(': '(', ')': false } } 36 | ); 37 | 38 | test( 39 | 'should allow to encode additional symbols (single code point)', 40 | snapshotMacro, 41 | '

!#[]()*+-.\\_`{}

👁️ - eye

👁️‍🗨️ - eye in a speech bubble

😀 - smiley

', 42 | { encodeCharacters: { '👁️': ':eye:', '😀': ':smiley:' } } 43 | ); 44 | -------------------------------------------------------------------------------- /packages/html-to-text-cli/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@html-to/text-cli", 3 | "version": "0.5.4", 4 | "description": "CLI html to plain text converter", 5 | "keywords": [ 6 | "html", 7 | "node", 8 | "text", 9 | "converter", 10 | "html-to-text", 11 | "cli-wrapper" 12 | ], 13 | "license": "MIT", 14 | "author": "KillyMXI ", 15 | "homepage": "https://github.com/html-to-text/node-html-to-text", 16 | "repository": { 17 | "type": "git", 18 | "url": "git://github.com/html-to-text/node-html-to-text.git" 19 | }, 20 | "bugs": { 21 | "url": "https://github.com/html-to-text/node-html-to-text/issues" 22 | }, 23 | "type": "module", 24 | "bin": { 25 | "html-to-text": "./bin/cli.js" 26 | }, 27 | "files": [ 28 | "bin", 29 | "README.md", 30 | "CHANGELOG.md", 31 | "LICENSE" 32 | ], 33 | "engines": { 34 | "node": ">=14.13.1" 35 | }, 36 | "scripts": { 37 | "build:rollup": "rollup -c", 38 | "build": "npm run clean && npm run build:rollup && npm run copy:license", 39 | "clean": "rimraf bin", 40 | "copy:license": "copyfiles -f ../../LICENSE ." 
41 | }, 42 | "dependencies": { 43 | "@selderee/plugin-htmlparser2": "^0.11.0", 44 | "aspargvs": "^0.6.0", 45 | "deepmerge": "^4.3.1", 46 | "htmlparser2": "^8.0.2", 47 | "selderee": "^0.11.0" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: Run tests 8 | 9 | strategy: 10 | matrix: 11 | node-version: [14.x, 16.x] 12 | os: [ubuntu-latest, windows-latest] 13 | include: 14 | - os: windows-latest 15 | get-cache-path: '"dir=$(npm config get cache)" >> $env:GITHUB_OUTPUT' 16 | - os: ubuntu-latest 17 | get-cache-path: 'echo "dir=$(npm config get cache)" >> $GITHUB_OUTPUT' 18 | 19 | runs-on: ${{ matrix.os }} 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | 24 | - name: Setup Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | 29 | - name: Setup NPM v7 30 | run: npm i -g npm@7 31 | 32 | - name: Get npm cache directory 33 | id: npm-cache-dir 34 | run: ${{ matrix.get-cache-path }} 35 | 36 | - uses: actions/cache@v3 37 | id: npm-cache 38 | with: 39 | path: ${{ steps.npm-cache-dir.outputs.dir }} 40 | key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }} 41 | restore-keys: | 42 | ${{ runner.os }}-node- 43 | 44 | - name: Touch cli file to prevent npm ci from freaking out 45 | uses: DamianReeves/write-file-action@v1.2 46 | with: 47 | path: packages/html-to-text-cli/bin/cli.js 48 | contents: '// placeholder' 49 | 50 | - run: npm ci 51 | 52 | - run: npm test 53 | -------------------------------------------------------------------------------- /packages/html-to-md/test/snapshots/html-to-md.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/html-to-md.js` 2 | 3 | The actual snapshot 
is saved in `html-to-md.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## should encode characters that can be confused as a part of markdown 8 | 9 | > ```html 10 | >

!#[]()*+-.\_`{}

11 | > ``` 12 | 13 | '!#[]()*+-.\_`{}' 14 | 15 | ## should not encode characters inside urls 16 | 17 | > ```html 18 | > 19 | > ``` 20 | 21 | '[](/page_(1).html?foo=[1]&bar=baz+qux#qu-ux!)' 22 | 23 | ## should encode characters inside alt text and title 24 | 25 | > ```html 26 | > **alt text** 27 | > ``` 28 | 29 | '![**alt text**](test.png "*title*")' 30 | 31 | ## should allow to disable encoding of some characters encoded by default 32 | 33 | > ```html 34 | >

!#[]()*+-.\_`{}

35 | > ``` 36 | 37 | '!#[]()*+-.\_`{}' 38 | 39 | ## should allow to encode additional symbols (single code point) 40 | 41 | > ```html 42 | >

!#[]()*+-.\_`{}

👁️ - eye

👁️‍🗨️ - eye in a speech bubble

😀 - smiley

43 | > ``` 44 | 45 | `!#[]()*+-.\_`{}␊ 46 | ␊ 47 | :eye:️ - eye␊ 48 | ␊ 49 | :eye:️‍🗨️ - eye in a speech bubble␊ 50 | ␊ 51 | :smiley: - smiley` 52 | -------------------------------------------------------------------------------- /packages/html-to-md/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@html-to/md", 3 | "version": "0.5.0-next1", 4 | "description": "Advanced html to markdown converter", 5 | "keywords": [ 6 | "html", 7 | "markdown", 8 | "converter", 9 | "html-to-md", 10 | "html-to-markdown", 11 | "html2md" 12 | ], 13 | "license": "MIT", 14 | "author": "KillyMXI ", 15 | "homepage": "https://github.com/html-to-text/node-html-to-text", 16 | "repository": { 17 | "type": "git", 18 | "url": "git://github.com/html-to-text/node-html-to-text.git" 19 | }, 20 | "bugs": { 21 | "url": "https://github.com/html-to-text/node-html-to-text/issues" 22 | }, 23 | "type": "module", 24 | "main": "./lib/html-to-md.cjs", 25 | "module": "./lib/html-to-md.mjs", 26 | "exports": { 27 | "import": "./lib/html-to-md.mjs", 28 | "require": "./lib/html-to-md.cjs" 29 | }, 30 | "files": [ 31 | "lib", 32 | "README.md", 33 | "CHANGELOG.md", 34 | "LICENSE" 35 | ], 36 | "engines": { 37 | "node": ">=14" 38 | }, 39 | "scripts": { 40 | "build:rollup": "rollup -c", 41 | "build": "npm run clean && npm run build:rollup && npm run copy:license", 42 | "clean": "rimraf lib", 43 | "copy:license": "copyfiles -f ../../LICENSE .", 44 | "cover": "c8 --reporter=lcov --reporter=text-summary ava -t 20000", 45 | "test": "ava" 46 | }, 47 | "dependencies": { 48 | "@selderee/plugin-htmlparser2": "^0.11.0", 49 | "deepmerge": "^4.3.1", 50 | "dom-serializer": "^2.0.0", 51 | "domutils": "^3.0.1", 52 | "htmlparser2": "^8.0.2", 53 | "selderee": "^0.11.0" 54 | }, 55 | "ava": { 56 | "files": ["test/**/*.js"], 57 | "nodeArguments": [ 58 | "--experimental-specifier-resolution=node" 59 | ] 60 | } 61 | } 62 | 
-------------------------------------------------------------------------------- /packages/html-to-text/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-to-text", 3 | "version": "9.0.5", 4 | "description": "Advanced html to plain text converter", 5 | "keywords": [ 6 | "html", 7 | "node", 8 | "text", 9 | "mail", 10 | "plain", 11 | "converter" 12 | ], 13 | "license": "MIT", 14 | "author": "Malte Legenhausen ", 15 | "contributors": [ 16 | "KillyMXI " 17 | ], 18 | "homepage": "https://github.com/html-to-text/node-html-to-text", 19 | "repository": { 20 | "type": "git", 21 | "url": "git://github.com/html-to-text/node-html-to-text.git" 22 | }, 23 | "bugs": { 24 | "url": "https://github.com/html-to-text/node-html-to-text/issues" 25 | }, 26 | "type": "module", 27 | "main": "./lib/html-to-text.cjs", 28 | "module": "./lib/html-to-text.mjs", 29 | "exports": { 30 | "import": "./lib/html-to-text.mjs", 31 | "require": "./lib/html-to-text.cjs" 32 | }, 33 | "files": [ 34 | "lib", 35 | "README.md", 36 | "CHANGELOG.md", 37 | "LICENSE" 38 | ], 39 | "engines": { 40 | "node": ">=14" 41 | }, 42 | "scripts": { 43 | "build:rollup": "rollup -c", 44 | "build": "npm run clean && npm run build:rollup && npm run copy:license", 45 | "clean": "rimraf lib", 46 | "copy:license": "copyfiles -f ../../LICENSE .", 47 | "cover": "c8 --reporter=lcov --reporter=text-summary mocha -t 20000", 48 | "test": "mocha" 49 | }, 50 | "dependencies": { 51 | "@selderee/plugin-htmlparser2": "^0.11.0", 52 | "deepmerge": "^4.3.1", 53 | "dom-serializer": "^2.0.0", 54 | "htmlparser2": "^8.0.2", 55 | "selderee": "^0.11.0" 56 | }, 57 | "mocha": { 58 | "node-option": [ 59 | "experimental-specifier-resolution=node" 60 | ] 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@html-to-text/monorepo", 
3 | "license": "MIT", 4 | "engines": { 5 | "node": ">=14" 6 | }, 7 | "workspaces": [ 8 | "packages/base", 9 | "packages/html-to-text", 10 | "packages/html-to-text-cli", 11 | "packages/html-to-md" 12 | ], 13 | "type": "module", 14 | "scripts": { 15 | "build:html-to-text": "npm run build -w ./packages/html-to-text", 16 | "build:html-to-text-cli": "npm run build -w ./packages/html-to-text-cli", 17 | "build:html-to-md": "npm run build -w ./packages/html-to-md", 18 | "build": "npm run build:html-to-text && npm run build:html-to-text-cli && npm run build:html-to-md", 19 | "cover:html-to-text": "npm run cover -w ./packages/html-to-text", 20 | "cover:html-to-md": "npm run cover -w ./packages/html-to-md", 21 | "cover": "concurrently npm:cover:*", 22 | "example:md": "node --experimental-specifier-resolution=node ./example/html-to-md.js", 23 | "example:text": "node --experimental-specifier-resolution=node ./example/html-to-text.js", 24 | "lint": "eslint .", 25 | "test:html-to-text": "npm run test -w ./packages/html-to-text", 26 | "test:html-to-md": "npm run test -w ./packages/html-to-md", 27 | "test": "concurrently npm:test:*" 28 | }, 29 | "devDependencies": { 30 | "@rollup/plugin-json": "^6.0.0", 31 | "@rollup/plugin-node-resolve": "^15.0.2", 32 | "@types/node": "14.18.42", 33 | "ava": "^5.2.0", 34 | "c8": "^7.13.0", 35 | "chai": "^4.3.7", 36 | "concurrently": "^8.0.1", 37 | "copyfiles": "^2.4.1", 38 | "eslint": "^8.39.0", 39 | "eslint-plugin-filenames": "^1.3.2", 40 | "eslint-plugin-import": "^2.27.5", 41 | "eslint-plugin-jsdoc": "^43.0.7", 42 | "mocha": "^10.2.0", 43 | "rimraf": "^5.0.0", 44 | "rollup": "^2.79.1" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTML to X converters 2 | 3 | [![lint 
status](https://github.com/html-to-text/node-html-to-text/workflows/lint/badge.svg)](https://github.com/html-to-text/node-html-to-text/actions/workflows/lint.yml) 4 | [![test status](https://github.com/html-to-text/node-html-to-text/workflows/test/badge.svg)](https://github.com/html-to-text/node-html-to-text/actions/workflows/test.yml) 5 | [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/html-to-text/node-html-to-text/blob/master/LICENSE) 6 | 7 | This is a monorepo. 8 | 9 | ## Packages 10 | 11 | - **html-to-text** 12 | 13 | [![npm](https://img.shields.io/npm/v/html-to-text?logo=npm)](https://www.npmjs.com/package/html-to-text) 14 | 15 | Advanced html to plain text converter. 16 | 17 | Folder: [/packages/html-to-text](/packages/html-to-text) 18 | 19 | Docs: [README.md](/packages/html-to-text/README.md), [CHANGELOG.md](/packages/html-to-text/CHANGELOG.md) 20 | 21 | - **@html-to/text-cli** 22 | 23 | [![npm](https://img.shields.io/npm/v/@html-to/text-cli?logo=npm)](https://www.npmjs.com/package/@html-to/text-cli) 24 | 25 | CLI for the html to text converter. 26 | 27 | Folder: [/packages/html-to-text-cli](/packages/html-to-text-cli) 28 | 29 | Docs: [README.md](/packages/html-to-text-cli/README.md), [CHANGELOG.md](/packages/html-to-text-cli/CHANGELOG.md) 30 | 31 | - **@html-to/md** 32 | 33 | Advanced html to markdown converter (WIP). 34 | 35 | Folder: [/packages/html-to-md](/packages/html-to-md) 36 | 37 | 38 | 39 | - **base** 40 | 41 | Shared code. It only exists in the monorepo and is bundled into published packages. 42 | 43 | Folder: [/packages/base](/packages/base) 44 | 45 | ## Development 46 | 47 | Targeting Node.js version >=14. 48 | 49 | Monorepo uses NPM v7 workspaces (make sure v7 is installed when using Node.js v14). 
50 | -------------------------------------------------------------------------------- /packages/html-to-text-cli/src/cli.js: -------------------------------------------------------------------------------- 1 | import process from 'node:process'; 2 | 3 | import { handleArgv } from 'aspargvs'; 4 | import deepmerge from 'deepmerge'; 5 | import { htmlToText } from 'html-to-text'; 6 | 7 | import { version as httVersion } from '../../html-to-text/package.json'; 8 | import { version as cliVersion } from '../package.json'; 9 | 10 | 11 | const kebabToCamelCase = (str) => str 12 | .replace(/-./g, x => x[1].toUpperCase()); 13 | 14 | const camelToKebabCase = (str) => str 15 | .replace(/\B([A-Z])(?=[a-z])/g, '-$1') 16 | .replace(/\B([a-z0-9])([A-Z])/g, '$1-$2') 17 | .toLowerCase(); 18 | 19 | const versionText = 20 | `${cliVersion} - cli version 21 | ${httVersion} - converter version`; 22 | 23 | const helpHeader = 24 | `Advanced html to plain text converter 25 | ${versionText.replace(/^/gm, ' ')} 26 | 27 | Usage: 28 | - send input HTML document to stdin; 29 | - get plain text from stdout; 30 | - use arguments to specify commands and options; 31 | - refer to html-to-text package docs for all available options; 32 | - all options except functions can be expressed in CLI args; 33 | - below is the short summary of the args syntax; 34 | - refer to @html-to/text-cli docs for further details. 
35 | 36 | `; 37 | 38 | handleArgv({ 39 | handlers: { 40 | help: (text) => helpHeader + text, 41 | unparse: true, 42 | inspect: { depth: 5, }, 43 | json: businessLogic, 44 | merge: (acc, next) => deepmerge(acc, next), 45 | key: kebabToCamelCase, 46 | unkey: camelToKebabCase, 47 | bin: () => 'html-to-text', 48 | version: () => versionText, 49 | }, 50 | presets: { 51 | 'human': { 52 | description: 'Some options more suitable for human reading', 53 | json: { 54 | wordwrap: 80, 55 | longWordSplit: { forceWrapOnLimit: true }, 56 | selectors: [ 57 | { selector: 'table', format: 'dataTable' } 58 | ] 59 | } 60 | }, 61 | 'machine': { 62 | description: 'Some options more suitable for machine processing', 63 | json: { 64 | wordwrap: false, 65 | longWordSplit: { forceWrapOnLimit: false }, 66 | selectors: [ 67 | { selector: 'table', format: 'block' }, 68 | { selector: 'tr', format: 'block' }, 69 | { selector: 'th', format: 'block' }, 70 | { selector: 'td', format: 'block' }, 71 | ] 72 | } 73 | } 74 | } 75 | }); 76 | 77 | function businessLogic (optionsObject) { 78 | let text = ''; 79 | 80 | process.title = 'html-to-text'; 81 | 82 | process.stdin.resume(); 83 | process.stdin.setEncoding('utf8'); 84 | process.stdin.on('data', data => { text += data; }); 85 | process.stdin.on('end', () => { 86 | text = htmlToText(text, optionsObject); 87 | process.stdout.write(text + '\n', 'utf-8'); 88 | }); 89 | } 90 | -------------------------------------------------------------------------------- /packages/html-to-text/test/test.txt: -------------------------------------------------------------------------------- 1 | PARAGRAPHS 2 | 3 | At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 4 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum 5 | dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor 6 | invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos 7 | et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea 8 | takimata sanctus est Lorem ipsum dolor sit amet. Github [www.github.com] 9 | 10 | At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 11 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum 12 | dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor 13 | invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos 14 | et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea 15 | takimata sanctus est Lorem ipsum dolor sit amet. 16 | 17 | -------------------------------------------------------------------------------- 18 | 19 | 20 | PRETTY PRINTED TABLE 21 | 22 | ARTICLE PRICE TAXES AMOUNT TOTAL 23 | Product 1 6,99€ 7% 1 6,99€ 24 | Contains: 1x Product 1 25 | Shipment costs 3,25€ 7% 1 3,25€ 26 |     to pay: 10,24€ 27 | Taxes 7%: 0,72€ 28 | 29 | -------------------------------------------------------------------------------- 30 | 31 | 32 | LISTS 33 | 34 | * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 35 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 36 | * At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 37 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 38 | 39 | 1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 40 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 41 | 2. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd 42 | gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 43 | 44 | -------------------------------------------------------------------------------- 45 | 46 | 47 | COLUMN LAYOUT WITH TABLES 48 | 49 | INVOICE ADDRESS SHIPMENT ADDRESS 50 | Mr. Mr. 
51 | John Doe John Doe 52 | Featherstone Street 49 Featherstone Street 49 53 | 28199 Bremen 28199 Bremen 54 | 55 | -------------------------------------------------------------------------------- 56 | 57 | 58 | MAILTO FORMATING 59 | 60 | Some Company 61 | Some Street 42 62 | Somewhere 63 | E-Mail: Click here [test@example.com] 64 | 65 | We appreciate your business. And we hope you'll check out our new products 66 | [http://example.com/]! -------------------------------------------------------------------------------- /packages/base/src/stack-item.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable max-classes-per-file */ 2 | 3 | import { InlineTextBuilder } from './inline-text-builder'; 4 | 5 | 6 | class StackItem { 7 | constructor (next = null) { this.next = next; } 8 | 9 | getRoot () { return (this.next) ? this.next : this; } 10 | } 11 | 12 | class BlockStackItem extends StackItem { 13 | constructor (options, next = null, leadingLineBreaks = 1, maxLineLength = undefined) { 14 | super(next); 15 | this.leadingLineBreaks = leadingLineBreaks; 16 | this.inlineTextBuilder = new InlineTextBuilder(options, maxLineLength); 17 | this.rawText = ''; 18 | this.stashedLineBreaks = 0; 19 | this.isPre = next && next.isPre; 20 | this.isNoWrap = next && next.isNoWrap; 21 | } 22 | } 23 | 24 | class ListStackItem extends BlockStackItem { 25 | constructor ( 26 | options, 27 | next = null, 28 | { 29 | interRowLineBreaks = 1, 30 | leadingLineBreaks = 2, 31 | maxLineLength = undefined, 32 | maxPrefixLength = 0, 33 | prefixAlign = 'left', 34 | } = {} 35 | ) { 36 | super(options, next, leadingLineBreaks, maxLineLength); 37 | this.maxPrefixLength = maxPrefixLength; 38 | this.prefixAlign = prefixAlign; 39 | this.interRowLineBreaks = interRowLineBreaks; 40 | } 41 | } 42 | 43 | class ListItemStackItem extends BlockStackItem { 44 | constructor ( 45 | options, 46 | next = null, 47 | { 48 | leadingLineBreaks = 1, 49 | maxLineLength = 
undefined, 50 | prefix = '', 51 | } = {} 52 | ) { 53 | super(options, next, leadingLineBreaks, maxLineLength); 54 | this.prefix = prefix; 55 | } 56 | } 57 | 58 | class TableStackItem extends StackItem { 59 | constructor (next = null) { 60 | super(next); 61 | this.rows = []; 62 | this.isPre = next && next.isPre; 63 | this.isNoWrap = next && next.isNoWrap; 64 | } 65 | } 66 | 67 | class TableRowStackItem extends StackItem { 68 | constructor (next = null) { 69 | super(next); 70 | this.cells = []; 71 | this.isPre = next && next.isPre; 72 | this.isNoWrap = next && next.isNoWrap; 73 | } 74 | } 75 | 76 | class TableCellStackItem extends StackItem { 77 | constructor (options, next = null, maxColumnWidth = undefined) { 78 | super(next); 79 | this.inlineTextBuilder = new InlineTextBuilder(options, maxColumnWidth); 80 | this.rawText = ''; 81 | this.stashedLineBreaks = 0; 82 | this.isPre = next && next.isPre; 83 | this.isNoWrap = next && next.isNoWrap; 84 | } 85 | } 86 | 87 | class TransformerStackItem extends StackItem { 88 | constructor (next = null, transform) { 89 | super(next); 90 | this.transform = transform; 91 | } 92 | } 93 | 94 | export { 95 | BlockStackItem, 96 | ListItemStackItem, 97 | ListStackItem, 98 | StackItem, 99 | TableCellStackItem, 100 | TableRowStackItem, 101 | TableStackItem, 102 | TransformerStackItem, 103 | }; 104 | -------------------------------------------------------------------------------- /packages/html-to-text/src/table-printer.js: -------------------------------------------------------------------------------- 1 | 2 | // eslint-disable-next-line import/no-unassigned-import 3 | import '@html-to-text/base/src/typedefs'; 4 | 5 | 6 | function getRow (matrix, j) { 7 | if (!matrix[j]) { matrix[j] = []; } 8 | return matrix[j]; 9 | } 10 | 11 | function findFirstVacantIndex (row, x = 0) { 12 | while (row[x]) { x++; } 13 | return x; 14 | } 15 | 16 | function transposeInPlace (matrix, maxSize) { 17 | for (let i = 0; i < maxSize; i++) { 18 | const rowI = 
getRow(matrix, i); 19 | for (let j = 0; j < i; j++) { 20 | const rowJ = getRow(matrix, j); 21 | if (rowI[j] || rowJ[i]) { 22 | const temp = rowI[j]; 23 | rowI[j] = rowJ[i]; 24 | rowJ[i] = temp; 25 | } 26 | } 27 | } 28 | } 29 | 30 | function putCellIntoLayout (cell, layout, baseRow, baseCol) { 31 | for (let r = 0; r < cell.rowspan; r++) { 32 | const layoutRow = getRow(layout, baseRow + r); 33 | for (let c = 0; c < cell.colspan; c++) { 34 | layoutRow[baseCol + c] = cell; 35 | } 36 | } 37 | } 38 | 39 | function getOrInitOffset (offsets, index) { 40 | if (offsets[index] === undefined) { 41 | offsets[index] = (index === 0) ? 0 : 1 + getOrInitOffset(offsets, index - 1); 42 | } 43 | return offsets[index]; 44 | } 45 | 46 | function updateOffset (offsets, base, span, value) { 47 | offsets[base + span] = Math.max( 48 | getOrInitOffset(offsets, base + span), 49 | getOrInitOffset(offsets, base) + value 50 | ); 51 | } 52 | 53 | /** 54 | * Render a table into a string. 55 | * Cells can contain multiline text and span across multiple rows and columns. 56 | * 57 | * Modifies cells: adds a `lines` array and sets a `rendered` flag on cells as they are printed. 58 | * 59 | * @param { TablePrinterCell[][] } tableRows Table to render. 60 | * @param { number } rowSpacing Number of empty lines between rows (added to each cell's height when computing row offsets). 61 | * @param { number } colSpacing Number of spaces between columns (added to each cell's width when computing column offsets). 62 | * @returns { string } Output lines joined with `\n`. 63 | */ 64 | function tableToString (tableRows, rowSpacing, colSpacing) { 65 | const layout = []; 66 | let colNumber = 0; 67 | const rowNumber = tableRows.length; 68 | const rowOffsets = [0]; 69 | // Fill the layout table and row offsets row-by-row. 
70 | for (let j = 0; j < rowNumber; j++) { 71 | const layoutRow = getRow(layout, j); 72 | const cells = tableRows[j]; 73 | let x = 0; 74 | for (let i = 0; i < cells.length; i++) { 75 | const cell = cells[i]; 76 | x = findFirstVacantIndex(layoutRow, x); 77 | putCellIntoLayout(cell, layout, j, x); 78 | x += cell.colspan; 79 | cell.lines = cell.text.split('\n'); 80 | const cellHeight = cell.lines.length; 81 | updateOffset(rowOffsets, j, cell.rowspan, cellHeight + rowSpacing); 82 | } 83 | colNumber = (layoutRow.length > colNumber) ? layoutRow.length : colNumber; 84 | } 85 | 86 | transposeInPlace(layout, (rowNumber > colNumber) ? rowNumber : colNumber); 87 | 88 | const outputLines = []; 89 | const colOffsets = [0]; 90 | // Fill column offsets and output lines column-by-column. 91 | for (let x = 0; x < colNumber; x++) { 92 | let y = 0; 93 | let cell; 94 | const rowsInThisColumn = Math.min(rowNumber, layout[x].length); 95 | while (y < rowsInThisColumn) { 96 | cell = layout[x][y]; 97 | if (cell) { 98 | if (!cell.rendered) { 99 | let cellWidth = 0; 100 | for (let j = 0; j < cell.lines.length; j++) { 101 | const line = cell.lines[j]; 102 | const lineOffset = rowOffsets[y] + j; 103 | outputLines[lineOffset] = (outputLines[lineOffset] || '').padEnd(colOffsets[x]) + line; 104 | cellWidth = (line.length > cellWidth) ? 
line.length : cellWidth; 105 | } 106 | updateOffset(colOffsets, x, cell.colspan, cellWidth + colSpacing); 107 | cell.rendered = true; 108 | } 109 | y += cell.rowspan; 110 | } else { 111 | const lineOffset = rowOffsets[y]; 112 | outputLines[lineOffset] = (outputLines[lineOffset] || ''); 113 | y++; 114 | } 115 | } 116 | } 117 | 118 | return outputLines.join('\n'); 119 | } 120 | 121 | export { tableToString }; 122 | -------------------------------------------------------------------------------- /packages/html-to-text-cli/README.md: -------------------------------------------------------------------------------- 1 | # @html-to/text-cli 2 | 3 | [![npm](https://img.shields.io/npm/v/@html-to/text-cli?logo=npm)](https://www.npmjs.com/package/@html-to/text-cli) 4 | 5 | Command line interface for [html-to-text](https://www.npmjs.com/package/html-to-text) Node.js package. 6 | 7 | 8 | ## Features 9 | 10 | - almost all `html-to-text` options can be specified via command line arguments or json config (the only exception is functions such as custom formatters); 11 | - a couple of presets for common use cases (human reading in terminal and machine indexing/search). 12 | 13 | 14 | ## Changelog 15 | 16 | Available here: [CHANGELOG.md](https://github.com/html-to-text/node-html-to-text/blob/master/packages/html-to-text-cli/CHANGELOG.md) 17 | 18 | 19 | ## Installation 20 | 21 | ``` 22 | npm i -g @html-to/text-cli 23 | ``` 24 | 25 | ### Name collisions 26 | 27 | - old versions of `html-to-text` package expose a command with the same name. Make sure that package is not installed globally anymore. 28 | - there is an old abandoned CLI package that exposes a command with the same name and actually has nothing to do with `html-to-text` package. Make sure to only use namespaced package `@html-to/text-cli`. 
29 | 30 | 31 | ## Usage 32 | 33 | - Use `html-to-text` command (`html-to-text.cmd` in PowerShell); 34 | - Pipe HTML to `stdin`; 35 | - Get plain text from `stdout`; 36 | - Pass converter options as command arguments. 37 | 38 | ### Command line arguments 39 | 40 | ```shell 41 | > cat ./input.html | html-to-text [commands...] [keys and values...] > ./output.txt 42 | ``` 43 | 44 | In PowerShell: 45 | 46 | ```shell 47 | PS> Get-Content .\input.html | html-to-text.cmd [commands...] [keys and values...] > .\output.txt 48 | ``` 49 | 50 | `.ps1` wrapper installed by npm might not work with `stdin`, so use `.cmd` instead. 51 | 52 | ### Available commands 53 | 54 | | Command | Alias | Argument | Description 55 | | --------- | ----- | -------------- | ----------- 56 | | `json` | `-j` | \ | Merge given json file contents with the parsed options object. This way you can provide all or some options from a file rather than explicitly from CLI. 57 | | `preset` | `-p` | \ | Merge given preset into the parsed options object. Available presets are listed below. 58 | | `inspect` | `-i` | | Pretty print the parsed options object and exit. Useful as a dry run to check how options are parsed. 59 | | `unparse` | `-u` | | Print the parsed options object back as args string and exit. Can be used to check what arguments produce the result equivalent to a given json file. 60 | | `help` | `-h` | | Print help message and exit. 61 | | `version` | `-v` | | Print version number and exit. 62 | 63 | Note: short aliases cannot be merged. 64 | 65 | ### Available presets 66 | 67 | | Preset | Description 68 | | --------- | ----------- 69 | | `human` | Some options more suitable for human reading in terminal (ensure line length of 80 characters, format tables visually) 70 | | `machine` | Some options more suitable for machine processing (no line length limit, format tables and cells as blocks) 71 | 72 | ### Options syntax 73 | 74 | Refer to `html-to-text help` output for brief syntax information. 
75 | 76 | Refer to [aspargvs](https://github.com/mxxii/aspargvs) readme for more detailed information. 77 | 78 | Note: PowerShell requires escaping quotes and curly braces. 79 | 80 | ### Option examples 81 | 82 | All options that are representable in JSON format (that is, all except functions) can be specified via CLI arguments. Below are some examples. 83 | 84 | | JSON | CLI 85 | | --------------------- | --- 86 | | `{ preserveNewlines: true }` | `--preserveNewlines` 87 | | `{ wordwrap: 100 }` | `--wordwrap=100` 88 | | `{ wordwrap: false }` | `--!wordwrap` 89 | | `{ baseElements: { orderBy: 'occurrence' } }` | `--baseElements.orderBy=occurrence` 90 | | `{ selectors: [`
`{ selector: 'img', format: 'skip' }`
`] }` | `--selectors[] {} :selector=img :format=skip` 91 | | `{ selectors: [`
`{ selector: 'h1', options: { uppercase: false } },`
`{ selector: 'h2', options: { uppercase: false } }`
`] }`| `--selectors[] {} :selector=h1 :!options.uppercase {} :selector=h2 :!options.uppercase` 92 | | `{ selectors: [`
`{ selector: 'table', format: 'dataTable', options: { uppercaseHeaderCells: false } }`
`] }` | `--selectors[] {} :selector=table :format=dataTable :options.uppercase-header-cells=false` 93 | | `{ selectors: [`
`{ selector: 'a', options: { linkBrackets: ['<', '>'] } }`
`] }` | `--selectors[] {} :selector=a :options.linkBrackets=['<','>']` 94 | 95 | 96 | ## License 97 | 98 | [MIT License](https://github.com/html-to-text/node-html-to-text/blob/master/LICENSE) 99 | -------------------------------------------------------------------------------- /packages/html-to-text/test/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 11 | 14 | 15 | 16 | 23 | 24 | 25 | 26 | 73 | 74 | 75 | 76 | 88 | 89 | 90 | 118 | 119 | 120 | 121 | 133 | 134 |
17 |

Paragraphs

18 |

At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Github 19 |

20 |

At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 21 |

22 |
27 |
28 |

Pretty printed table

29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 |
ArticlePriceTaxesAmountTotal
42 |

43 | Product 1
44 | Contains: 1x Product 1 45 |

46 |
6,99€7%16,99€
Shipment costs3,25€7%13,25€
  to pay: 10,24€
Taxes 7%: 0,72€
71 | 72 |
77 |
78 |

Lists

79 |
    80 |
  • At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  • 81 |
  • At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  • 82 |
83 |
    84 |
  1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  2. 85 |
  3. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  4. 86 |
87 |
91 |
92 |

Column Layout with tables

93 | 94 | 95 | 96 | 97 | 98 | 99 | 107 | 115 | 116 |
Invoice AddressShipment Address
100 |

101 | Mr.
102 | John Doe
103 | Featherstone Street 49
104 | 28199 Bremen
105 |

106 |
108 |

109 | Mr.
110 | John Doe
111 | Featherstone Street 49
112 | 28199 Bremen
113 |

114 |
117 |
122 | 123 |
124 |

Mailto formating

125 |

126 | Some Company
127 | Some Street 42
128 | Somewhere
129 | E-Mail: Click here 130 |

131 |

We appreciate your business. And we hope you'll check out our new products!

132 |
135 | 136 | 137 | -------------------------------------------------------------------------------- /packages/base/src/generic-formatters.js: -------------------------------------------------------------------------------- 1 | 2 | import { render } from 'dom-serializer'; 3 | 4 | // eslint-disable-next-line import/no-unassigned-import 5 | import './typedefs'; 6 | 7 | 8 | /** 9 | * Dummy formatter that discards the input and does nothing. 10 | * 11 | * @type { FormatCallback } 12 | */ 13 | function formatSkip (elem, walk, builder, formatOptions) { 14 | /* do nothing */ 15 | } 16 | 17 | /** 18 | * Insert the given string literal inline instead of a tag. 19 | * 20 | * @type { FormatCallback } 21 | */ 22 | function formatInlineString (elem, walk, builder, formatOptions) { 23 | builder.addLiteral(formatOptions.string || ''); 24 | } 25 | 26 | /** 27 | * Insert a block with the given string literal instead of a tag. 28 | * 29 | * @type { FormatCallback } 30 | */ 31 | function formatBlockString (elem, walk, builder, formatOptions) { 32 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 33 | builder.addLiteral(formatOptions.string || ''); 34 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 35 | } 36 | 37 | /** 38 | * Process an inline-level element. 39 | * 40 | * @type { FormatCallback } 41 | */ 42 | function formatInline (elem, walk, builder, formatOptions) { 43 | walk(elem.children, builder); 44 | } 45 | 46 | /** 47 | * Process a block-level container. 48 | * 49 | * @type { FormatCallback } 50 | */ 51 | function formatBlock (elem, walk, builder, formatOptions) { 52 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 53 | walk(elem.children, builder); 54 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 55 | } 56 | 57 | function renderOpenTag (elem) { 58 | const attrs = (elem.attribs && elem.attribs.length) 59 | ? 
' ' + Object.entries(elem.attribs) 60 | .map(([k, v]) => ((v === '') ? k : `${k}=${v.replace(/"/g, '"')}`)) 61 | .join(' ') 62 | : ''; 63 | return `<${elem.name}${attrs}>`; 64 | } /* NOTE(review): renderOpenTag only renders attributes when `elem.attribs.length` is truthy, yet `attribs` is read with Object.entries as a key/value object; a plain object has no `length`, so attributes would presumably never be rendered - confirm against the parser's element type. */ 65 | 66 | function renderCloseTag (elem) { 67 | return ``; 68 | } 69 | 70 | /** 71 | * Render an element as an inline HTML tag, walking through its children. 72 | * The opening and closing tags are added as literals inside no-wrap sections. 73 | * @type { FormatCallback } 74 | */ 75 | function formatInlineTag (elem, walk, builder, formatOptions) { 76 | builder.startNoWrap(); 77 | builder.addLiteral(renderOpenTag(elem)); 78 | builder.stopNoWrap(); 79 | walk(elem.children, builder); 80 | builder.startNoWrap(); 81 | builder.addLiteral(renderCloseTag(elem)); 82 | builder.stopNoWrap(); 83 | } 84 | 85 | /** 86 | * Render an element as an HTML block tag, walking through its children. 87 | * Line breaks around the block default to 2 when the corresponding option is falsy. 88 | * @type { FormatCallback } 89 | */ 90 | function formatBlockTag (elem, walk, builder, formatOptions) { 91 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 92 | builder.startNoWrap(); 93 | builder.addLiteral(renderOpenTag(elem)); 94 | builder.stopNoWrap(); 95 | walk(elem.children, builder); 96 | builder.startNoWrap(); 97 | builder.addLiteral(renderCloseTag(elem)); 98 | builder.stopNoWrap(); 99 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 100 | } 101 | 102 | /** 103 | * Render an element with all its children as inline HTML. 104 | * 105 | * @type { FormatCallback } 106 | */ 107 | function formatInlineHtml (elem, walk, builder, formatOptions) { 108 | builder.startNoWrap(); 109 | builder.addLiteral( 110 | render(elem, { decodeEntities: builder.options.decodeEntities }) 111 | ); 112 | builder.stopNoWrap(); 113 | } 114 | 115 | /** 116 | * Render an element with all its children as HTML block. 
117 | * 118 | * @type { FormatCallback } 119 | */ 120 | function formatBlockHtml (elem, walk, builder, formatOptions) { 121 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 122 | builder.startNoWrap(); 123 | builder.addLiteral( 124 | render(elem, { decodeEntities: builder.options.decodeEntities }) 125 | ); 126 | builder.stopNoWrap(); 127 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 128 | } 129 | 130 | /** 131 | * Render inline element wrapped with given strings. 132 | * 133 | * @type { FormatCallback } 134 | */ 135 | function formatInlineSurround (elem, walk, builder, formatOptions) { 136 | builder.addLiteral(formatOptions.prefix || ''); 137 | walk(elem.children, builder); 138 | builder.addLiteral(formatOptions.suffix || ''); 139 | } 140 | 141 | 142 | export { 143 | formatBlock as block, 144 | formatBlockHtml as blockHtml, 145 | formatBlockString as blockString, 146 | formatBlockTag as blockTag, 147 | formatInline as inline, 148 | formatInlineHtml as inlineHtml, 149 | formatInlineString as inlineString, 150 | formatInlineSurround as inlineSurround, 151 | formatInlineTag as inlineTag, 152 | formatSkip as skip, 153 | }; 154 | -------------------------------------------------------------------------------- /example/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 11 | 14 | 15 | 16 | 23 | 24 | 25 | 26 | 73 | 74 | 75 | 76 | 88 | 89 | 90 | 118 | 119 | 120 | 121 | 132 | 133 |
17 |

Paragraphs

18 |

At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Github 19 |

20 |

At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 21 |

22 |
27 |
28 |

Pretty printed table

29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 |
ArticlePriceTaxesAmountTotal
42 |

43 | Product 1
44 | Contains: 1x Product 1 45 |

46 |
6,99€7%16,99€
Shipment costs3,25€7%13,25€
  to pay: 10,24€
Taxes 7%: 0,72€
71 | 72 |
77 |
78 |

Lists

79 |
    80 |
  • At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  • 81 |
  • At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  • 82 |
83 |
    84 |
  1. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  2. 85 |
  3. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.
  4. 86 |
87 |
91 |
92 |

Column Layout with tables

93 | 94 | 95 | 96 | 97 | 98 | 99 | 107 | 115 | 116 |
Invoice AddressShipment Address
100 |

101 | Mr.
102 | John Doe
103 | Featherstone Street 49
104 | 28199 Bremen
105 |

106 |
108 |

109 | Mr.
110 | John Doe
111 | Featherstone Street 49
112 | 28199 Bremen
113 |

114 |
117 |
122 | 123 |
124 |

Mailto formating

125 |

126 | Some Company
127 | Some Street 42
128 | Somewhere
129 | E-Mail: Click here! 130 |

131 |
134 | 135 |
136 |

Pretty printed Source Code

137 |
138 | htmlToText.fromFile(new URL('test.html', import.meta.url), {
139 | 	tables: ['#invoice', '.address']
140 | }, function(err, text) {
141 | 	if (err) return console.error(err);
142 | 	console.log(text);
143 | });
144 | 		
145 | 146 | -------------------------------------------------------------------------------- /packages/base/src/util.js: -------------------------------------------------------------------------------- 1 | 2 | import merge from 'deepmerge'; // default 3 | 4 | /** 5 | * Make a recursive function that will only run to a given depth 6 | * and switches to an alternative function at that depth. \ 7 | * No limitation if `n` is `undefined` (Just wraps `f` in that case). 8 | * 9 | * @param { number | undefined } n Allowed depth of recursion. `undefined` for no limitation. 10 | * @param { Function } f Function that accepts recursive callback as the first argument. 11 | * @param { Function } [g] Function to run instead, when maximum depth was reached. Do nothing by default. 12 | * @returns { Function } 13 | */ 14 | function limitedDepthRecursive (n, f, g = () => undefined) { 15 | if (n === undefined) { 16 | const f1 = function (...args) { return f(f1, ...args); }; 17 | return f1; 18 | } 19 | if (n >= 0) { 20 | return function (...args) { return f(limitedDepthRecursive(n - 1, f, g), ...args); }; 21 | } 22 | return g; 23 | } 24 | 25 | /** 26 | * Return the same string or a substring with 27 | * the given character occurrences removed from each side. 28 | * 29 | * @param { string } str A string to trim. 30 | * @param { string } char A character to be trimmed. 31 | * @returns { string } 32 | */ 33 | function trimCharacter (str, char) { 34 | let start = 0; 35 | let end = str.length; 36 | while (start < end && str[start] === char) { ++start; } 37 | while (end > start && str[end - 1] === char) { --end; } 38 | return (start > 0 || end < str.length) 39 | ? str.substring(start, end) 40 | : str; 41 | } 42 | 43 | /** 44 | * Return the same string or a substring with 45 | * the given character occurrences removed from the end only. 46 | * 47 | * @param { string } str A string to trim. 48 | * @param { string } char A character to be trimmed. 
49 | * @returns { string } 50 | */ 51 | function trimCharacterEnd (str, char) { 52 | let end = str.length; 53 | while (end > 0 && str[end - 1] === char) { --end; } 54 | return (end < str.length) 55 | ? str.substring(0, end) 56 | : str; 57 | } 58 | 59 | /** 60 | * Return a new string with all characters replaced with unicode escape sequences. 61 | * This extreme kind of escaping can be used to safely compose regular expressions. 62 | * 63 | * @param { string } str A string to escape. 64 | * @returns { string } A string of unicode escape sequences. 65 | */ 66 | function unicodeEscape (str) { 67 | return str.replace(/[\s\S]/g, c => '\\u' + c.charCodeAt().toString(16).padStart(4, '0')); 68 | } 69 | 70 | /** 71 | * Deduplicate an array by a given key callback. 72 | * Item properties are merged recursively and with the preference for last defined values. 73 | * Of items with the same key, merged item takes the place of the last item, 74 | * others are omitted. 75 | * 76 | * @param { any[] } items An array to deduplicate. 77 | * @param { (x: any) => string } getKey Callback to get a value that distinguishes unique items. 78 | * @returns { any[] } 79 | */ 80 | function mergeDuplicatesPreferLast (items, getKey) { 81 | const map = new Map(); 82 | for (let i = items.length; i-- > 0;) { 83 | const item = items[i]; 84 | const key = getKey(item); 85 | map.set( 86 | key, 87 | (map.has(key)) 88 | ? merge(item, map.get(key), { arrayMerge: overwriteMerge }) 89 | : item 90 | ); 91 | } 92 | return [...map.values()].reverse(); 93 | } 94 | 95 | const overwriteMerge = (acc, src, options) => [...src]; 96 | 97 | /** 98 | * Get a nested property from an object. 99 | * 100 | * @param { object } obj The object to query for the value. 101 | * @param { string[] } path The path to the property. 
102 | * @returns { any } 103 | */ 104 | function get (obj, path) { 105 | for (const key of path) { 106 | if (!obj) { return undefined; } 107 | obj = obj[key]; 108 | } 109 | return obj; 110 | } 111 | 112 | /** 113 | * Convert a number into alphabetic sequence representation (Sequence without zeroes). 114 | * 115 | * For example: `a, ..., z, aa, ..., zz, aaa, ...`. 116 | * 117 | * @param { number } num Number to convert. Must be >= 1. 118 | * @param { string } [baseChar = 'a'] Character for 1 in the sequence. 119 | * @param { number } [base = 26] Number of characters in the sequence. 120 | * @returns { string } 121 | */ 122 | function numberToLetterSequence (num, baseChar = 'a', base = 26) { 123 | const digits = []; 124 | do { 125 | num -= 1; 126 | digits.push(num % base); 127 | num = (num / base) >> 0; // quick `floor` 128 | } while (num > 0); 129 | const baseCode = baseChar.charCodeAt(0); 130 | return digits 131 | .reverse() 132 | .map(n => String.fromCharCode(baseCode + n)) 133 | .join(''); 134 | } 135 | 136 | const I = ['I', 'X', 'C', 'M']; 137 | const V = ['V', 'L', 'D']; 138 | 139 | /** 140 | * Convert a number to its Roman representation. No large numbers extension. 141 | * 142 | * @param { number } num Number to convert. `0 < num <= 3999`. 143 | * @returns { string } 144 | */ 145 | function numberToRoman (num) { 146 | return [...(num) + ''] 147 | .map(n => +n) 148 | .reverse() 149 | .map((v, i) => ((v % 5 < 4) 150 | ? (v < 5 ? '' : V[i]) + I[i].repeat(v % 5) 151 | : I[i] + (v < 5 ? 
V[i] : I[i + 1]))) 152 | .reverse() 153 | .join(''); 154 | } 155 | 156 | export { 157 | get, 158 | limitedDepthRecursive, 159 | mergeDuplicatesPreferLast, 160 | numberToLetterSequence, 161 | numberToRoman, 162 | trimCharacter, 163 | trimCharacterEnd, 164 | unicodeEscape 165 | }; 166 | -------------------------------------------------------------------------------- /packages/html-to-md/src/html-to-md.js: -------------------------------------------------------------------------------- 1 | 2 | import { compile as compile_ } from '@html-to-text/base'; 3 | import * as genericFormatters from '@html-to-text/base/src/generic-formatters'; 4 | import { mergeDuplicatesPreferLast } from '@html-to-text/base/src/util'; 5 | import merge from 'deepmerge'; // default 6 | 7 | import * as markdownFormatters from './md-formatters'; 8 | 9 | // eslint-disable-next-line import/no-unassigned-import 10 | import '@html-to-text/base/src/typedefs'; 11 | 12 | 13 | /** 14 | * Default options. 15 | * 16 | * @constant 17 | * @type { Options } 18 | * @default 19 | * @private 20 | */ 21 | const DEFAULT_OPTIONS = { 22 | baseElements: { 23 | selectors: [ 'body' ], 24 | orderBy: 'selectors', // 'selectors' | 'occurrence' 25 | returnDomByDefault: true 26 | }, 27 | decodeEntities: false, 28 | encodeCharacters: { 29 | '!': '!', 30 | '#': '#', 31 | '(': '(', 32 | ')': ')', 33 | '*': '*', 34 | '+': '+', 35 | '-': '-', // hyphen-minus 36 | '.': '.', 37 | '[': '[', 38 | '\\': '\', 39 | ']': ']', 40 | '_': '_', 41 | '`': '`', 42 | '{': '{', 43 | '}': '}', 44 | }, 45 | formatters: {}, 46 | limits: { 47 | ellipsis: '...', 48 | maxBaseElements: undefined, 49 | maxChildNodes: undefined, 50 | maxDepth: undefined, 51 | maxInputLength: (1 << 24) // 16_777_216 52 | }, 53 | selectors: [ 54 | { selector: '*', format: 'inline' }, 55 | { selector: 'a', format: 'anchor', options: { baseUrl: null, noAnchorUrl: true } }, 56 | { selector: 'article', format: 'block' }, 57 | { selector: 'aside', format: 'block' }, 58 | { 
selector: 'b', format: 'inlineSurround', options: { prefix: '**', suffix: '**' } }, 59 | { selector: 'blockquote', format: 'blockquote', options: { trimEmptyLines: true } }, 60 | { selector: 'br', format: 'inlineString', options: { string: '
' } }, 61 | { selector: 'code', format: 'inlineSurround', options: { prefix: '`', suffix: '`' } }, 62 | { selector: 'del', format: 'inlineSurround', options: { prefix: '~~', suffix: '~~' } }, 63 | { selector: 'div', format: 'block' }, 64 | { selector: 'dl', format: 'definitionList' }, 65 | { selector: 'em', format: 'inlineSurround', options: { prefix: '*', suffix: '*' } }, 66 | { selector: 'figure', format: 'block' }, 67 | { selector: 'figcaption', format: 'block' }, 68 | { selector: 'footer', format: 'block' }, 69 | { selector: 'form', format: 'block' }, 70 | { selector: 'h1', format: 'heading', options: { level: 1 } }, 71 | { selector: 'h2', format: 'heading', options: { level: 2 } }, 72 | { selector: 'h3', format: 'heading', options: { level: 3 } }, 73 | { selector: 'h4', format: 'heading', options: { level: 4 } }, 74 | { selector: 'h5', format: 'heading', options: { level: 5 } }, 75 | { selector: 'h6', format: 'heading', options: { level: 6 } }, 76 | { selector: 'header', format: 'block' }, 77 | { selector: 'hr', format: 'blockString', options: { string: '----' } }, 78 | { selector: 'i', format: 'inlineSurround', options: { prefix: '*', suffix: '*' } }, 79 | { selector: 'img', format: 'image', options: { baseUrl: null } }, 80 | { selector: 'kbd', format: 'inlineTag' }, 81 | { selector: 'main', format: 'block' }, 82 | { selector: 'nav', format: 'block' }, 83 | { selector: 'ol', format: 'orderedList', options: { interRowLineBreaks: 1 } }, 84 | { selector: 'p', format: 'block' }, 85 | { selector: 'picture', format: 'inline' }, 86 | { selector: 'pre', format: 'pre' }, 87 | { selector: 's', format: 'inlineSurround', options: { prefix: '~~', suffix: '~~' } }, 88 | { selector: 'section', format: 'block' }, 89 | { selector: 'source', format: 'skip' }, 90 | { selector: 'strong', format: 'inlineSurround', options: { prefix: '**', suffix: '**' } }, 91 | { selector: 'sub', format: 'inlineTag' }, 92 | { selector: 'sup', format: 'inlineTag' }, 93 | { selector: 'table', 
format: 'dataTable' }, 94 | { selector: 'ul', format: 'unorderedList', options: { marker: '-', interRowLineBreaks: 1 } }, 95 | { selector: 'wbr', format: 'wbr' }, 96 | ], 97 | whitespaceCharacters: ' \t\r\n\f\u200b', 98 | wordwrap: 80 99 | }; 100 | 101 | 102 | const concatMerge = (acc, src, options) => [...acc, ...src]; 103 | const overwriteMerge = (acc, src, options) => [...src]; 104 | const selectorsMerge = (acc, src, options) => ( 105 | (acc.some(s => typeof s === 'object')) 106 | ? concatMerge(acc, src, options) // selectors 107 | : overwriteMerge(acc, src, options) // baseElements.selectors 108 | ); 109 | 110 | /** 111 | * Preprocess options, compile selectors into a decision tree, 112 | * return a function intended for batch processing. 113 | * 114 | * @param { Options } [options = {}] HtmlToText options. 115 | * @returns { (html: string, metadata?: any) => string } Pre-configured converter function. 116 | * @static 117 | */ 118 | function compile (options = {}) { 119 | options = merge( 120 | DEFAULT_OPTIONS, 121 | options, 122 | { 123 | arrayMerge: overwriteMerge, 124 | customMerge: (key) => ((key === 'selectors') ? selectorsMerge : undefined) 125 | } 126 | ); 127 | options.formatters = Object.assign({}, genericFormatters, markdownFormatters, options.formatters); 128 | options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector)); 129 | 130 | return compile_(options); 131 | } 132 | 133 | /** 134 | * Convert given HTML content to a markdown string. 135 | * 136 | * @param { string } html HTML content to convert. 137 | * @param { Options } [options = {}] HtmlToText options. 138 | * @param { any } [metadata] Optional metadata for HTML document, for use in formatters. 139 | * @returns { string } Plain text string. 140 | * @static 141 | * 142 | * @example 143 | * const { convert } = require('html-to-text'); 144 | * const text = convert('

Hello World

', {}); 145 | * console.log(text); // # Hello World 146 | */ 147 | function convert (html, options = {}, metadata = undefined) { 148 | return compile(options)(html, metadata); 149 | } 150 | 151 | export { 152 | compile, 153 | convert, 154 | convert as htmlToMarkdown 155 | }; 156 | -------------------------------------------------------------------------------- /packages/base/src/inline-text-builder.js: -------------------------------------------------------------------------------- 1 | 2 | import { get } from './util'; 3 | 4 | // eslint-disable-next-line import/no-unassigned-import 5 | import './typedefs'; 6 | 7 | /** 8 | * Helps to build text from words. 9 | */ 10 | class InlineTextBuilder { 11 | /** 12 | * Creates an instance of InlineTextBuilder. 13 | * 14 | * If `maxLineLength` is not provided then it is either `options.wordwrap` or unlimited. 15 | * 16 | * @param { Options } options HtmlToText options. 17 | * @param { number } [ maxLineLength ] This builder will try to wrap text to fit this line length. 18 | */ 19 | constructor (options, maxLineLength = undefined) { 20 | /** @type { string[][] } */ 21 | this.lines = []; 22 | /** @type { string[] } */ 23 | this.nextLineWords = []; 24 | this.maxLineLength = maxLineLength || options.wordwrap || Number.MAX_VALUE; 25 | this.nextLineAvailableChars = this.maxLineLength; 26 | this.wrapCharacters = get(options, ['longWordSplit', 'wrapCharacters']) || []; 27 | this.forceWrapOnLimit = get(options, ['longWordSplit', 'forceWrapOnLimit']) || false; 28 | 29 | this.stashedSpace = false; 30 | this.wordBreakOpportunity = false; 31 | } 32 | 33 | /** 34 | * Add a new word. 35 | * 36 | * @param { string } word A word to add. 37 | * @param { boolean } [noWrap] Don't wrap text even if the line is too long. 
38 | */ 39 | pushWord (word, noWrap = false) { 40 | if (this.nextLineAvailableChars <= 0 && !noWrap) { 41 | this.startNewLine(); 42 | } 43 | const isLineStart = this.nextLineWords.length === 0; 44 | const cost = word.length + (isLineStart ? 0 : 1); 45 | if ((cost <= this.nextLineAvailableChars) || noWrap) { // Fits into available budget 46 | 47 | this.nextLineWords.push(word); 48 | this.nextLineAvailableChars -= cost; 49 | 50 | } else { // Does not fit - try to split the word 51 | 52 | // The word is moved to a new line - prefer to wrap between words. 53 | const [first, ...rest] = this.splitLongWord(word); 54 | if (!isLineStart) { this.startNewLine(); } 55 | this.nextLineWords.push(first); 56 | this.nextLineAvailableChars -= first.length; 57 | for (const part of rest) { 58 | this.startNewLine(); 59 | this.nextLineWords.push(part); 60 | this.nextLineAvailableChars -= part.length; 61 | } 62 | 63 | } 64 | } 65 | 66 | /** 67 | * Pop a word from the currently built line. 68 | * This doesn't affect completed lines. 69 | * 70 | * @returns { string } 71 | */ 72 | popWord () { 73 | const lastWord = this.nextLineWords.pop(); 74 | if (lastWord !== undefined) { 75 | const isLineStart = this.nextLineWords.length === 0; 76 | const cost = lastWord.length + (isLineStart ? 0 : 1); 77 | this.nextLineAvailableChars += cost; 78 | } 79 | return lastWord; 80 | } 81 | 82 | /** 83 | * Concat a word to the last word already in the builder. 84 | * Adds a new word in case there are no words yet in the last line. 85 | * 86 | * @param { string } word A word to be concatenated. 87 | * @param { boolean } [noWrap] Don't wrap text even if the line is too long. 88 | */ 89 | concatWord (word, noWrap = false) { 90 | if (this.wordBreakOpportunity && word.length > this.nextLineAvailableChars) { 91 | this.pushWord(word, noWrap); 92 | this.wordBreakOpportunity = false; 93 | } else { 94 | const lastWord = this.popWord(); 95 | this.pushWord((lastWord) ? 
lastWord.concat(word) : word, noWrap); 96 | } 97 | } 98 | 99 | /** 100 | * Add current line (and more empty lines if provided argument > 1) to the list of complete lines and start a new one. 101 | * 102 | * @param { number } n Number of line breaks that will be added to the resulting string. 103 | */ 104 | startNewLine (n = 1) { 105 | this.lines.push(this.nextLineWords); 106 | if (n > 1) { 107 | this.lines.push(...Array.from({ length: n - 1 }, () => [])); 108 | } 109 | this.nextLineWords = []; 110 | this.nextLineAvailableChars = this.maxLineLength; 111 | } 112 | 113 | /** 114 | * No words in this builder. 115 | * 116 | * @returns { boolean } 117 | */ 118 | isEmpty () { 119 | return this.lines.length === 0 120 | && this.nextLineWords.length === 0; 121 | } 122 | 123 | clear () { 124 | this.lines.length = 0; 125 | this.nextLineWords.length = 0; 126 | this.nextLineAvailableChars = this.maxLineLength; 127 | } 128 | 129 | /** 130 | * Join all lines of words inside the InlineTextBuilder into a complete string. 131 | * 132 | * @returns { string } 133 | */ 134 | toString () { 135 | return [...this.lines, this.nextLineWords] 136 | .map(words => words.join(' ')) 137 | .join('\n'); 138 | } 139 | 140 | /** 141 | * Split a long word up to fit within the word wrap limit. 142 | * Use either a character to split looking back from the word wrap limit, 143 | * or truncate to the word wrap limit. 144 | * 145 | * @param { string } word Input word. 146 | * @returns { string[] } Parts of the word. 
147 | */ 148 | splitLongWord (word) { 149 | const parts = []; 150 | let idx = 0; 151 | while (word.length > this.maxLineLength) { 152 | 153 | const firstLine = word.substring(0, this.maxLineLength); 154 | const remainingChars = word.substring(this.maxLineLength); 155 | 156 | const splitIndex = firstLine.lastIndexOf(this.wrapCharacters[idx]); 157 | 158 | if (splitIndex > -1) { // Found a character to split on 159 | 160 | word = firstLine.substring(splitIndex + 1) + remainingChars; 161 | parts.push(firstLine.substring(0, splitIndex + 1)); 162 | 163 | } else { // Not found a character to split on 164 | 165 | idx++; 166 | if (idx < this.wrapCharacters.length) { // There is next character to try 167 | 168 | word = firstLine + remainingChars; 169 | 170 | } else { // No more characters to try 171 | 172 | if (this.forceWrapOnLimit) { 173 | parts.push(firstLine); 174 | word = remainingChars; 175 | if (word.length > this.maxLineLength) { 176 | continue; 177 | } 178 | } else { 179 | word = firstLine + remainingChars; 180 | } 181 | break; 182 | 183 | } 184 | 185 | } 186 | 187 | } 188 | parts.push(word); // Add remaining part to array 189 | return parts; 190 | } 191 | } 192 | 193 | export { InlineTextBuilder }; 194 | -------------------------------------------------------------------------------- /packages/base/src/index.js: -------------------------------------------------------------------------------- 1 | 2 | import { hp2Builder } from '@selderee/plugin-htmlparser2'; 3 | import { parseDocument } from 'htmlparser2'; 4 | import { DecisionTree } from 'selderee'; 5 | 6 | import { BlockTextBuilder } from './block-text-builder'; 7 | import { limitedDepthRecursive, unicodeEscape } from './util'; 8 | 9 | 10 | /** 11 | * Compile selectors into a decision tree, 12 | * return a function intended for batch processing. 13 | * 14 | * @param { Options } [options = {}] HtmlToText options (defaults, formatters, user options merged, deduplicated). 
/**
 * Compile selectors into a decision tree,
 * return a function intended for batch processing.
 *
 * Note: `options.encodeCharacters` is replaced in place with a replacer function
 * when it is not a function already.
 *
 * @param { Options } [options = {}] HtmlToText options (defaults, formatters, user options merged, deduplicated).
 * @returns { (html: string, metadata?: any) => string } Pre-configured converter function.
 * @throws { Error } When some selectors have no `format` specified.
 * @static
 */
function compile (options = {}) {
  const selectorsWithoutFormat = options.selectors.filter(s => !s.format);
  if (selectorsWithoutFormat.length) {
    throw new Error(
      'Following selectors have no specified format: ' +
      selectorsWithoutFormat.map(s => `\`${s.selector}\``).join(', ')
    );
  }
  const picker = new DecisionTree(
    options.selectors.map(s => [s.selector, s])
  ).build(hp2Builder);

  if (typeof options.encodeCharacters !== 'function') {
    options.encodeCharacters = makeReplacerFromDict(options.encodeCharacters);
  }

  // Base selectors are tagged with 1-based indices so that `0`/falsy can mean "no match".
  const baseSelectorsPicker = new DecisionTree(
    options.baseElements.selectors.map((s, i) => [s, i + 1])
  ).build(hp2Builder);
  function findBaseElements (dom) {
    return findBases(dom, options, baseSelectorsPicker);
  }

  const limitedWalk = limitedDepthRecursive(
    options.limits.maxDepth,
    recursiveWalk,
    function (dom, builder) {
      builder.addInline(options.limits.ellipsis || '');
    }
  );

  return function (html, metadata = undefined) {
    return process(html, metadata, options, picker, findBaseElements, limitedWalk);
  };
}


/**
 * Convert given HTML according to preprocessed options.
 *
 * @param { string } html HTML content to convert.
 * @param { any } metadata Optional metadata for HTML document, for use in formatters.
 * @param { Options } options HtmlToText options (preprocessed).
 * @param { import('selderee').Picker } picker
 * Tag definition picker for DOM nodes processing.
 * @param { (dom: DomNode[]) => DomNode[] } findBaseElements
 * Function to extract elements from HTML DOM
 * that will only be present in the output text.
 * @param { RecursiveCallback } walk Recursive callback.
 * @returns { string }
 */
function process (html, metadata, options, picker, findBaseElements, walk) {
  const maxInputLength = options.limits.maxInputLength;
  if (maxInputLength && html && html.length > maxInputLength) {
    console.warn(
      `Input length ${html.length} is above allowed limit of ${maxInputLength}. Truncating without ellipsis.`
    );
    html = html.substring(0, maxInputLength);
  }

  const document = parseDocument(html, { decodeEntities: options.decodeEntities });
  const bases = findBaseElements(document.children);
  const builder = new BlockTextBuilder(options, picker, metadata);
  walk(bases, builder);
  return builder.toString();
}


/**
 * Find the elements matching base selectors.
 *
 * @param { DomNode[] } dom Top level nodes of the document.
 * @param { Options } options HtmlToText options (preprocessed).
 * @param { import('selderee').Picker } baseSelectorsPicker
 * Picker that returns a 1-based index of the matched base selector.
 * @returns { DomNode[] }
 * Matched elements, or the whole `dom` when nothing is matched
 * and `options.baseElements.returnDomByDefault` is set.
 */
function findBases (dom, options, baseSelectorsPicker) {
  const results = [];

  function recursiveWalk (walk, /** @type { DomNode[] } */ dom) {
    dom = dom.slice(0, options.limits.maxChildNodes);
    for (const elem of dom) {
      if (elem.type !== 'tag') {
        continue;
      }
      const pickedSelectorIndex = baseSelectorsPicker.pick1(elem);
      if (pickedSelectorIndex > 0) {
        // A matched element is not searched any deeper.
        results.push({ selectorIndex: pickedSelectorIndex, element: elem });
      } else if (elem.children) {
        walk(elem.children);
      }
      // Never true when `maxBaseElements` is undefined.
      if (results.length >= options.limits.maxBaseElements) {
        return;
      }
    }
  }

  const limitedWalk = limitedDepthRecursive(
    options.limits.maxDepth,
    recursiveWalk
  );
  limitedWalk(dom);

  if (options.baseElements.orderBy !== 'occurrence') { // 'selectors'
    results.sort((a, b) => a.selectorIndex - b.selectorIndex);
  }
  return (options.baseElements.returnDomByDefault && results.length === 0)
    ? dom
    : results.map(x => x.element);
}

/**
 * Function to walk through DOM nodes and accumulate their string representations.
 *
 * @param { RecursiveCallback } walk Recursive callback.
 * @param { DomNode[] } [dom] Nodes array to process.
 * @param { BlockTextBuilder } builder Passed around to accumulate output text.
 * @private
 */
function recursiveWalk (walk, dom, builder) {
  if (!dom) { return; }

  const options = builder.options;

  // Always false when `maxChildNodes` is undefined.
  const tooManyChildNodes = dom.length > options.limits.maxChildNodes;
  if (tooManyChildNodes) {
    // `slice` makes a copy, so the ellipsis node is not added to the original DOM.
    dom = dom.slice(0, options.limits.maxChildNodes);
    dom.push({
      data: options.limits.ellipsis,
      type: 'text'
    });
  }

  for (const elem of dom) {
    switch (elem.type) {
      case 'text': {
        builder.addInline(elem.data);
        break;
      }
      case 'tag': {
        const tagDefinition = builder.picker.pick1(elem);
        const format = options.formatters[tagDefinition.format];
        format(elem, walk, builder, tagDefinition.options || {});
        break;
      }
      default:
        /* do nothing */
        break;
    }
  }
}

/**
 * @param { {[key: string]: string | false} } dict
 * A dictionary where keys are characters to replace
 * and values are replacement strings.
 *
 * First code point from dict keys is used.
 * Compound emojis with ZWJ are not supported (not until Node 16).
 */
171 | * 172 | * @returns { ((str: string) => string) | undefined } 173 | */ 174 | function makeReplacerFromDict (dict) { 175 | if (!dict || Object.keys(dict).length === 0) { 176 | return undefined; 177 | } 178 | /** @type { [string, string][] } */ 179 | const entries = Object.entries(dict).filter(([, v]) => v !== false); 180 | const regex = new RegExp( 181 | entries 182 | .map(([c]) => `(${unicodeEscape([...c][0])})`) 183 | .join('|'), 184 | 'g' 185 | ); 186 | const values = entries.map(([, v]) => v); 187 | const replacer = (m, ...cgs) => values[cgs.findIndex(cg => cg)]; 188 | return (str) => str.replace(regex, replacer); 189 | } 190 | 191 | 192 | export { compile }; 193 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: [ 'filenames', 'jsdoc' ], 3 | extends: [ 4 | 'eslint:recommended', 5 | 'plugin:import/errors', 6 | 'plugin:import/warnings', 7 | 'plugin:jsdoc/recommended' 8 | ], 9 | ignorePatterns: [ 10 | '.vscode', 11 | '**/lib', 12 | '**/node_modules', 13 | '**/__*.*' 14 | ], 15 | parserOptions: { 16 | ecmaVersion: 2020, 17 | sourceType: 'module' 18 | }, 19 | env: { 20 | es6: true, 21 | node: true, 22 | }, 23 | globals: {}, 24 | settings: { 'jsdoc': { mode: 'typescript' } }, // allow compact callback types in particular 25 | rules: { 26 | // Best practices 27 | 'block-scoped-var': 'error', 28 | 'class-methods-use-this': 'error', 29 | 'complexity': ['error', 14], 30 | 'consistent-return': 'error', 31 | 'curly': 'error', 32 | 'default-case': 'error', 33 | 'eqeqeq': ['error'], 34 | 'guard-for-in': 'error', 35 | 'max-classes-per-file': ['error', 1], 36 | 'no-alert': 'error', 37 | 'no-caller': 'error', 38 | 'no-case-declarations': 'error', 39 | 'no-div-regex': 'error', 40 | 'no-empty-function': 'error', 41 | 'no-eq-null': 'error', 42 | 'no-eval': 'error', 43 | 'no-extend-native': 'error', 44 | 
'no-extra-bind': 'error', 45 | 'no-implied-eval': 'error', 46 | 'no-invalid-this': 'error', 47 | 'no-iterator': 'error', 48 | 'no-lone-blocks': 'error', 49 | 'no-loop-func': 'error', 50 | 'no-multi-spaces': 'error', 51 | 'no-multi-str': 'error', 52 | 'no-new': 'error', 53 | 'no-new-func': 'error', 54 | 'no-new-wrappers': 'error', 55 | 'no-octal-escape': 'error', 56 | 'no-proto': 'error', 57 | 'no-return-assign': 'error', 58 | 'no-script-url': 'error', 59 | 'no-self-compare': 'error', 60 | 'no-throw-literal': 'error', 61 | 'no-unused-expressions': 'error', 62 | 'no-useless-call': 'error', 63 | 'no-useless-concat': 'error', 64 | 'no-void': 'error', 65 | 'no-warning-comments': 'warn', 66 | 'radix': 'error', 67 | 'wrap-iife': ['error', 'any'], 68 | 69 | // Variables 70 | 'no-undef-init': 'error', 71 | 'no-unused-vars': ['error', { 'args': 'none' }], // function shapes have to be clear 72 | 73 | // Stylistic Issues 74 | 'block-spacing': 'error', 75 | 'brace-style': ['error', '1tbs', { 'allowSingleLine': true }], 76 | 'camelcase': 'error', 77 | 'comma-spacing': 'error', 78 | 'comma-style': 'error', 79 | 'eol-last': 'error', 80 | 'func-call-spacing': 'error', 81 | 'func-name-matching': ['error', { 'includeCommonJSModuleExports': true }], 82 | 'function-call-argument-newline': ['error', 'consistent'], 83 | 'function-paren-newline': ['error', 'consistent'], 84 | 'indent': ['error', 2, { 'SwitchCase': 1, 'flatTernaryExpressions': true }], 85 | 'key-spacing': ['error'], 86 | 'keyword-spacing': 'error', 87 | 'lines-between-class-members': 'error', 88 | 'max-depth': ['error', 5], 89 | 'max-len': ['error', 130], 90 | 'max-lines-per-function': ['error', 80], 91 | 'max-nested-callbacks': ['error', 4], 92 | 'max-params': ['error', 6], 93 | 'max-statements': ['error', 41], 94 | 'max-statements-per-line': ['error', { 'max': 2 }], 95 | 'multiline-ternary': ['error', 'always-multiline'], 96 | 'newline-per-chained-call': ['error', { 'ignoreChainWithDepth': 3 }], 97 | 'no-lonely-if': 
'warn', 98 | 'no-mixed-operators': 'error', 99 | 'no-multiple-empty-lines': 'error', 100 | 'no-trailing-spaces': 'error', 101 | 'no-whitespace-before-property': 'error', 102 | 'object-curly-newline': ['error', { 'multiline': true }], 103 | 'object-curly-spacing': ['error', 'always'], 104 | 'object-property-newline': ['error', { 'allowAllPropertiesOnSameLine': true }], 105 | 'one-var': ['warn', 'never'], 106 | 'quote-props': ['error', 'consistent'], 107 | 'quotes': ['error', 'single', { 'avoidEscape': true, 'allowTemplateLiterals': true }], 108 | 'semi': 'error', 109 | 'semi-spacing': 'error', 110 | 'semi-style': 'error', 111 | 'sort-keys': ['error', 'asc', { minKeys: 4 }], 112 | 'space-before-blocks': 'error', 113 | 'space-before-function-paren': ['error'], 114 | 'space-in-parens': 'error', 115 | 'space-infix-ops': 'error', 116 | 'spaced-comment': ['error', 'always', { 'block': { 'exceptions': ['html'], 'balanced': true } }], 117 | 'template-tag-spacing': ['error', 'never'], 118 | 119 | // ECMAScript 6 120 | 'arrow-body-style': ['error', 'as-needed'], 121 | 'arrow-spacing': 'error', 122 | 'no-confusing-arrow': 'error', 123 | 'no-var': 'error', 124 | 'object-shorthand': ['error', 'never'], 125 | 'prefer-const': 'error', 126 | 'prefer-destructuring': ['error', { AssignmentExpression: { array: false } }], 127 | 'prefer-rest-params': 'error', 128 | 'prefer-spread': 'error', 129 | 130 | // JSDoc 131 | 'jsdoc/check-examples': 'off', 132 | 'jsdoc/check-indentation': 'error', 133 | 'jsdoc/check-syntax': 'error', 134 | 'jsdoc/empty-tags': 'error', 135 | 'jsdoc/no-undefined-types': 'off', // doesn't work with typedefs in a different file 136 | 'jsdoc/require-description-complete-sentence': ['error', { tags: ['typedef'] }], 137 | 'jsdoc/require-hyphen-before-param-description': ['error', 'never'], 138 | 'jsdoc/require-jsdoc': ['error', { 'publicOnly': true }], 139 | 'jsdoc/require-returns-description': 'off', // description might tell this better, avoid repetition 140 | 
'jsdoc/tag-lines': 'off', 141 | 142 | // Import 143 | 'import/no-deprecated': 'error', 144 | 'import/no-extraneous-dependencies': 'off', 145 | 'import/no-mutable-exports': 'error', 146 | 'import/no-amd': 'error', 147 | 'import/no-nodejs-modules': 'error', 148 | 'import/first': 'error', 149 | 'import/no-namespace': 'off', 150 | 'import/namespace': 'error', 151 | 'import/extensions': 'error', 152 | 'import/order': ['error', { 'newlines-between': 'always', 'alphabetize': { 'order': 'asc', 'caseInsensitive': true } }], 153 | 'import/newline-after-import': ['error', { 'count': 2 }], 154 | 'import/no-unassigned-import': 'error', 155 | 'import/no-named-default': 'error', 156 | 'import/no-named-as-default': 'error', 157 | 'import/group-exports': 'error', 158 | 'import/no-unresolved': 'off', 159 | 160 | // Filenames 161 | 'filenames/match-regex': ['error', '^[a-z][a-z0-9\\.\\-]+$'], 162 | 'filenames/match-exported': 'error' 163 | }, 164 | overrides: [ 165 | { 166 | 'files': ['packages/html-to-text-cli/**/*.js'], 167 | 'rules': { 168 | 'sort-keys': 'off', 169 | 'import/extensions': [ 'error', 'never', { 'json': 'always' } ], 170 | 'import/no-nodejs-modules': [ 'error', { 'allow': ['node:process'] }] 171 | } 172 | }, 173 | { 174 | 'files': ['example/*.js'], 175 | 'rules': { 'import/no-nodejs-modules': 'off' } 176 | }, 177 | { 178 | 'files': ['**/test/*.js'], 179 | 'env': { mocha: true }, 180 | 'rules': { 181 | 'import/no-nodejs-modules': 'off', 182 | 'max-len': 'off', 183 | 'max-lines-per-function': 'off' 184 | } 185 | }, 186 | { 187 | 'files': ['.eslintrc.cjs'], 188 | 'rules': { 189 | 'sort-keys': 'off', 190 | 'filenames/match-regex': 'off', 191 | } 192 | } 193 | ] 194 | }; 195 | -------------------------------------------------------------------------------- /packages/html-to-md/src/table-printer.js: -------------------------------------------------------------------------------- 1 | 2 | // eslint-disable-next-line import/no-unassigned-import 3 | import 
'@html-to-text/base/src/typedefs'; // module specifier completing the side-effect `import` of shared typedefs


/**
 * Get a row of the matrix, creating an empty one when it is missing.
 *
 * @param { any[][] } matrix Matrix to take the row from. Can be modified.
 * @param { number } j Row index.
 * @returns { any[] }
 */
function getRow (matrix, j) {
  if (!matrix[j]) { matrix[j] = []; }
  return matrix[j];
}

/**
 * Find the first index at or after `x` that holds no (truthy) value.
 *
 * @param { any[] } row Row to search in.
 * @param { number } [x = 0] Index to start from.
 * @returns { number }
 */
function findFirstVacantIndex (row, x = 0) {
  while (row[x]) { x++; }
  return x;
}

/**
 * Transpose a (possibly jagged) matrix in place, treating it as a `maxSize` x `maxSize` square.
 * Missing rows are created.
 *
 * @param { any[][] } matrix Matrix to transpose.
 * @param { number } maxSize The larger of the matrix dimensions.
 */
function transposeInPlace (matrix, maxSize) {
  for (let i = 0; i < maxSize; i++) {
    const rowI = getRow(matrix, i);
    for (let j = 0; j < i; j++) {
      const rowJ = getRow(matrix, j);
      const temp = rowI[j];
      rowI[j] = rowJ[i];
      rowJ[i] = temp;
    }
  }
}

/**
 * Put the same cell object into every layout position it spans.
 *
 * @param { TablePrinterCell } cell Cell with `rowspan` and `colspan`.
 * @param { TablePrinterCell[][] } layout Layout to fill (`layout[row][col]`).
 * @param { number } baseRow Row of the top left corner.
 * @param { number } baseCol Column of the top left corner.
 */
function putCellIntoLayout (cell, layout, baseRow, baseCol) {
  for (let r = 0; r < cell.rowspan; r++) {
    const layoutRow = getRow(layout, baseRow + r);
    for (let c = 0; c < cell.colspan; c++) {
      layoutRow[baseCol + c] = cell;
    }
  }
}

/**
 * Squeeze multiline text into a single line that fits a markdown table cell.
 *
 * @param { string } text Text that may contain line breaks.
 * @returns { string }
 */
function linearizeText (text) {
  return text
    // N consecutive line breaks (N >= 2) become N-1 `<br>` tags...
    .replace(/(?: *\n){2,} */g, (m) => '<br>'.repeat((m.match(/\n/g) || []).length - 1))
    // ...and remaining single line breaks become spaces.
    .replace(/ *\n */g, ' ');
}

/**
 * Arrange table cells on a grid according to their spans.
 * Sets the `line` property on every cell.
 *
 * @param { TablePrinterCell[][] } tableRows Table rows.
 * @returns { { layout: TablePrinterCell[][], rowNumber: number, colNumber: number } }
 */
function createLayout (tableRows) {
  const layout = [];
  let colNumber = 0;
  const rowNumber = tableRows.length;
  for (let j = 0; j < rowNumber; j++) {
    const layoutRow = getRow(layout, j);
    const cells = tableRows[j];
    let x = 0;
    for (let i = 0; i < cells.length; i++) {
      const cell = cells[i];
      // Skip positions already taken by cells spanning from the rows above.
      x = findFirstVacantIndex(layoutRow, x);
      putCellIntoLayout(cell, layout, j, x);
      x += cell.colspan;
      cell.line = linearizeText(cell.text);
    }
    colNumber = (layoutRow.length > colNumber) ? layoutRow.length : colNumber;
  }
  return {
    layout: layout,
    rowNumber: rowNumber,
    colNumber: colNumber
  };
}

/**
 * Append a cell with text to an output line.
 *
 * @param { string[] } lines Output lines. Modified in place.
 * @param { number } y Line index.
 * @param { string } text Cell text.
 * @param { string } [separator = '|'] What to put in front of the cell text.
 */
function addCellText (lines, y, text, separator = '|') {
  if (lines[y]) {
    lines[y] += ` ${separator} ${text}`;
  } else {
    lines[y] = `${separator} ${text}`;
  }
}

/**
 * Append an empty cell to an output line.
 *
 * @param { string[] } lines Output lines. Modified in place.
 * @param { number } y Line index.
 * @param { string } [separator = '|'] Cell separator.
 */
function addEmptyCell (lines, y, separator = '|') {
  if (lines[y]) {
    lines[y] += ' ' + separator;
  } else {
    lines[y] = separator;
  }
}

/**
 * Make an opening `<td>` tag that carries the cell spans.
 * Attributes equal to 1 are omitted.
 *
 * @param { TablePrinterCell } cell Cell with `colspan` and `rowspan`.
 * @returns { string }
 */
function cellOpenTag (/** @type {TablePrinterCell} */cell) {
  const colspan = (cell.colspan === 1) ? '' : ` colspan="${cell.colspan}"`;
  const rowspan = (cell.rowspan === 1) ? '' : ` rowspan="${cell.rowspan}"`;
  return `<td${colspan}${rowspan}>`;
}

/**
 * Render lines in `repeat` mode: spanned cell text is repeated in every position it covers.
 *
 * @param { TablePrinterCell[][] } layout Transposed layout (`layout[col][row]`).
 * @param { number } colNumber Number of columns.
 * @param { number } rowNumber Number of rows.
 * @returns { string[] } One string per table row.
 */
function renderRowsRepeat (layout, colNumber, rowNumber) {
  const outputLines = [];
  for (let x = 0; x < colNumber; x++) {
    for (let y = 0; y < rowNumber; y++) {
      const cell = layout[x][y];
      if (cell) {
        addCellText(outputLines, y, cell.line);
      } else {
        addEmptyCell(outputLines, y);
      }
    }
  }
  return outputLines;
}

/**
 * Render lines in `first` mode: spanned cell text is only put into the first position
 * met while going column by column, the rest is left empty.
 * Marks cells with the `rendered` flag.
 *
 * @param { TablePrinterCell[][] } layout Transposed layout (`layout[col][row]`).
 * @param { number } colNumber Number of columns.
 * @param { number } rowNumber Number of rows.
 * @returns { string[] } One string per table row.
 */
function renderRowsFirst (layout, colNumber, rowNumber) {
  const outputLines = [];
  for (let x = 0; x < colNumber; x++) {
    for (let y = 0; y < rowNumber; y++) {
      const cell = layout[x][y];
      if (!cell || cell.rendered) {
        addEmptyCell(outputLines, y);
      } else {
        addCellText(outputLines, y, cell.line);
        cell.rendered = true;
      }
    }
  }
  return outputLines;
}

/**
 * Render lines in `firstCol` mode: spanned cell text is repeated in every row
 * but only within the first column it covers.
 * Stores that column in the `renderedCol` property of a cell.
 *
 * @param { TablePrinterCell[][] } layout Transposed layout (`layout[col][row]`).
 * @param { number } colNumber Number of columns.
 * @param { number } rowNumber Number of rows.
 * @returns { string[] } One string per table row.
 */
function renderRowsFirstCol (layout, colNumber, rowNumber) {
  const outputLines = [];
  for (let x = 0; x < colNumber; x++) {
    for (let y = 0; y < rowNumber; y++) {
      const cell = layout[x][y];
      if (!cell || (cell.renderedCol !== undefined && cell.renderedCol !== x)) {
        addEmptyCell(outputLines, y);
      } else {
        addCellText(outputLines, y, cell.line);
        cell.renderedCol = x;
      }
    }
  }
  return outputLines;
}

/**
 * Render lines in `firstRow` mode: spanned cell text is repeated in every column
 * but only within the first row it covers.
 * Stores that row in the `renderedRow` property of a cell.
 *
 * @param { TablePrinterCell[][] } layout Transposed layout (`layout[col][row]`).
 * @param { number } colNumber Number of columns.
 * @param { number } rowNumber Number of rows.
 * @returns { string[] } One string per table row.
 */
function renderRowsFirstRow (layout, colNumber, rowNumber) {
  const outputLines = [];
  for (let x = 0; x < colNumber; x++) {
    for (let y = 0; y < rowNumber; y++) {
      const cell = layout[x][y];
      if (!cell || (cell.renderedRow !== undefined && cell.renderedRow !== y)) {
        addEmptyCell(outputLines, y);
      } else {
        addCellText(outputLines, y, cell.line);
        cell.renderedRow = y;
      }
    }
  }
  return outputLines;
}

/**
 * Render lines in `tag` mode: a spanned cell is written once, introduced by
 * a `<td>` tag with span attributes instead of the `|` separator;
 * other positions it covers produce no output.
 * Marks cells with the `rendered` flag.
 *
 * @param { TablePrinterCell[][] } layout Transposed layout (`layout[col][row]`).
 * @param { number } colNumber Number of columns.
 * @param { number } rowNumber Number of rows.
 * @returns { string[] } One string per table row.
 */
function renderRowsTag (layout, colNumber, rowNumber) {
  const outputLines = [];
  for (let x = 0; x < colNumber; x++) {
    for (let y = 0; y < rowNumber; y++) {
      const cell = layout[x][y];
      if (cell && !cell.rendered) {
        const separator = (cell.colspan === 1 && cell.rowspan === 1)
          ? '|'
          : cellOpenTag(cell);
        addCellText(outputLines, y, cell.line, separator);
        cell.rendered = true;
      }
    }
  }
  return outputLines;
}
x++) { 155 | for (let y = 0; y < rowNumber; y++) { 156 | const cell = layout[x][y]; 157 | if (cell && !cell.rendered) { 158 | const separator = (cell.colspan === 1 && cell.rowspan === 1) 159 | ? '|' 160 | : cellOpenTag(cell); 161 | addCellText(outputLines, y, cell.line, separator); 162 | cell.rendered = true; 163 | } 164 | } 165 | } 166 | return outputLines; 167 | } 168 | 169 | /** 170 | * Render a table into a string. 171 | * Cells can contain multiline text and span across multiple rows and columns. 172 | * 173 | * Can modify cells. 174 | * 175 | * Returns `null` if the table can't be rendered with chosen mode. 176 | * 177 | * @param { TablePrinterCell[][] } tableRows Table to render. 178 | * @param { boolean } firstRowIsHeader If false then the header row will contain empty comments. 179 | * @param { 'first' | 'firstCol' | 'firstRow' | 'repeat' | 'tag' } spanMode How to render cells with colspan/rowspan. 180 | * @returns { string | null } 181 | */ 182 | function tableToString (tableRows, firstRowIsHeader, spanMode) { 183 | if ( 184 | spanMode === 'tag' && 185 | tableRows.some(r => r[0] && (r[0].colspan > 1 || r[0].rowspan > 1)) 186 | ) { 187 | // `tag` mode has a limitation - first cell in any row must not be spanned. 188 | return null; 189 | } 190 | 191 | const { rowNumber, layout, colNumber } = createLayout(tableRows); 192 | 193 | transposeInPlace(layout, (rowNumber > colNumber) ? 
rowNumber : colNumber); 194 | 195 | let outputLines = []; 196 | switch (spanMode) { 197 | case 'repeat': { 198 | outputLines = renderRowsRepeat(layout, colNumber, rowNumber); 199 | break; 200 | } 201 | case 'first': { 202 | outputLines = renderRowsFirst(layout, colNumber, rowNumber); 203 | break; 204 | } 205 | case 'firstCol': { 206 | outputLines = renderRowsFirstCol(layout, colNumber, rowNumber); 207 | break; 208 | } 209 | case 'firstRow': { 210 | outputLines = renderRowsFirstRow(layout, colNumber, rowNumber); 211 | break; 212 | } 213 | case 'tag': { 214 | outputLines = renderRowsTag(layout, colNumber, rowNumber); 215 | break; 216 | } 217 | default: 218 | throw new Error(`Unhandled span mode: ${spanMode}`); 219 | } 220 | 221 | if (firstRowIsHeader) { 222 | outputLines.splice(1, 0, Array(colNumber).fill('| ---').join(' ')); 223 | } else { 224 | outputLines.unshift( 225 | Array(colNumber).fill('| ').join(' '), 226 | Array(colNumber).fill('| --------').join(' ') 227 | ); 228 | } 229 | 230 | return outputLines.join('\n'); 231 | } 232 | 233 | export { tableToString }; 234 | -------------------------------------------------------------------------------- /packages/base/src/whitespace-processor.js: -------------------------------------------------------------------------------- 1 | 2 | // eslint-disable-next-line no-unused-vars 3 | import { InlineTextBuilder } from './inline-text-builder'; 4 | 5 | // eslint-disable-next-line import/no-unassigned-import 6 | import './typedefs'; 7 | 8 | 9 | function charactersToCodes (str) { 10 | return [...str] 11 | .map(c => '\\u' + c.charCodeAt(0).toString(16).padStart(4, '0')) 12 | .join(''); 13 | } 14 | 15 | /** 16 | * Helps to handle HTML whitespaces. 17 | * 18 | * @class WhitespaceProcessor 19 | */ 20 | class WhitespaceProcessor { 21 | 22 | /** 23 | * Creates an instance of WhitespaceProcessor. 24 | * 25 | * @param { Options } options HtmlToText options. 
26 | * @memberof WhitespaceProcessor 27 | */ 28 | constructor (options) { 29 | this.whitespaceChars = (options.preserveNewlines) 30 | ? options.whitespaceCharacters.replace(/\n/g, '') 31 | : options.whitespaceCharacters; 32 | const whitespaceCodes = charactersToCodes(this.whitespaceChars); 33 | this.leadingWhitespaceRe = new RegExp(`^[${whitespaceCodes}]`); 34 | this.trailingWhitespaceRe = new RegExp(`[${whitespaceCodes}]$`); 35 | this.allWhitespaceOrEmptyRe = new RegExp(`^[${whitespaceCodes}]*$`); 36 | this.newlineOrNonWhitespaceRe = new RegExp(`(\\n|[^\\n${whitespaceCodes}])`, 'g'); 37 | this.newlineOrNonNewlineStringRe = new RegExp(`(\\n|[^\\n]+)`, 'g'); 38 | 39 | if (options.preserveNewlines) { 40 | 41 | const wordOrNewlineRe = new RegExp(`\\n|[^\\n${whitespaceCodes}]+`, 'gm'); 42 | 43 | /** 44 | * Shrink whitespaces and wrap text, add to the builder. 45 | * 46 | * @param { string } text Input text. 47 | * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text. 48 | * @param { (str: string) => string } [ transform ] A transform to be applied to words. 49 | * @param { boolean } [noWrap] Don't wrap text even if the line is too long. 
50 | */ 51 | this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) { 52 | if (!text) { return; } 53 | const previouslyStashedSpace = inlineTextBuilder.stashedSpace; 54 | let anyMatch = false; 55 | let m = wordOrNewlineRe.exec(text); 56 | if (m) { 57 | anyMatch = true; 58 | if (m[0] === '\n') { 59 | inlineTextBuilder.startNewLine(); 60 | } else if (previouslyStashedSpace || this.testLeadingWhitespace(text)) { 61 | inlineTextBuilder.pushWord(transform(m[0]), noWrap); 62 | } else { 63 | inlineTextBuilder.concatWord(transform(m[0]), noWrap); 64 | } 65 | while ((m = wordOrNewlineRe.exec(text)) !== null) { 66 | if (m[0] === '\n') { 67 | inlineTextBuilder.startNewLine(); 68 | } else { 69 | inlineTextBuilder.pushWord(transform(m[0]), noWrap); 70 | } 71 | } 72 | } 73 | inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch) || (this.testTrailingWhitespace(text)); 74 | // No need to stash a space in case last added item was a new line, 75 | // but that won't affect anything later anyway. 76 | }; 77 | 78 | } else { 79 | 80 | const wordRe = new RegExp(`[^${whitespaceCodes}]+`, 'g'); 81 | 82 | this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) { 83 | if (!text) { return; } 84 | const previouslyStashedSpace = inlineTextBuilder.stashedSpace; 85 | let anyMatch = false; 86 | let m = wordRe.exec(text); 87 | if (m) { 88 | anyMatch = true; 89 | if (previouslyStashedSpace || this.testLeadingWhitespace(text)) { 90 | inlineTextBuilder.pushWord(transform(m[0]), noWrap); 91 | } else { 92 | inlineTextBuilder.concatWord(transform(m[0]), noWrap); 93 | } 94 | while ((m = wordRe.exec(text)) !== null) { 95 | inlineTextBuilder.pushWord(transform(m[0]), noWrap); 96 | } 97 | } 98 | inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch) || this.testTrailingWhitespace(text); 99 | }; 100 | 101 | } 102 | } 103 | 104 | /** 105 | * Add text with only minimal processing. 
106 | * Everything between newlines considered a single word. 107 | * No whitespace is trimmed. 108 | * Not affected by preserveNewlines option - `\n` always starts a new line. 109 | * 110 | * `noWrap` argument is `true` by default - this won't start a new line 111 | * even if there is not enough space left in the current line. 112 | * 113 | * @param { string } text Input text. 114 | * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text. 115 | * @param { boolean } [noWrap] Don't wrap text even if the line is too long. 116 | */ 117 | addLiteral (text, inlineTextBuilder, noWrap = true) { 118 | if (!text) { return; } 119 | const previouslyStashedSpace = inlineTextBuilder.stashedSpace; 120 | let anyMatch = false; 121 | let m = this.newlineOrNonNewlineStringRe.exec(text); 122 | if (m) { 123 | anyMatch = true; 124 | if (m[0] === '\n') { 125 | inlineTextBuilder.startNewLine(); 126 | } else if (previouslyStashedSpace) { 127 | inlineTextBuilder.pushWord(m[0], noWrap); 128 | } else { 129 | inlineTextBuilder.concatWord(m[0], noWrap); 130 | } 131 | while ((m = this.newlineOrNonNewlineStringRe.exec(text)) !== null) { 132 | if (m[0] === '\n') { 133 | inlineTextBuilder.startNewLine(); 134 | } else { 135 | inlineTextBuilder.pushWord(m[0], noWrap); 136 | } 137 | } 138 | } 139 | inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch); 140 | } 141 | 142 | /** 143 | * Test whether the given text starts with HTML whitespace character. 144 | * 145 | * @param { string } text The string to test. 146 | * @returns { boolean } 147 | */ 148 | testLeadingWhitespace (text) { 149 | return this.leadingWhitespaceRe.test(text); 150 | } 151 | 152 | /** 153 | * Test whether the given text ends with HTML whitespace character. 154 | * 155 | * @param { string } text The string to test. 
156 | * @returns { boolean } 157 | */ 158 | testTrailingWhitespace (text) { 159 | return this.trailingWhitespaceRe.test(text); 160 | } 161 | 162 | /** 163 | * Test whether the given text contains any non-whitespace characters. 164 | * 165 | * @param { string } text The string to test. 166 | * @returns { boolean } 167 | */ 168 | testContainsWords (text) { 169 | return !this.allWhitespaceOrEmptyRe.test(text); 170 | } 171 | 172 | /** 173 | * Return the number of newlines if there are no words. 174 | * 175 | * If any word is found then return zero regardless of the actual number of newlines. 176 | * 177 | * @param { string } text Input string. 178 | * @returns { number } 179 | */ 180 | countNewlinesNoWords (text) { 181 | this.newlineOrNonWhitespaceRe.lastIndex = 0; 182 | let counter = 0; 183 | let match; 184 | while ((match = this.newlineOrNonWhitespaceRe.exec(text)) !== null) { 185 | if (match[0] === '\n') { 186 | counter++; 187 | } else { 188 | return 0; 189 | } 190 | } 191 | return counter; 192 | } 193 | 194 | } 195 | 196 | export { WhitespaceProcessor }; 197 | -------------------------------------------------------------------------------- /packages/html-to-text/src/html-to-text.js: -------------------------------------------------------------------------------- 1 | 2 | import { compile as compile_ } from '@html-to-text/base'; 3 | import * as genericFormatters from '@html-to-text/base/src/generic-formatters'; 4 | import { get, mergeDuplicatesPreferLast } from '@html-to-text/base/src/util'; 5 | import merge from 'deepmerge'; // default 6 | 7 | import * as textFormatters from './text-formatters'; 8 | 9 | 10 | // eslint-disable-next-line import/no-unassigned-import 11 | import '@html-to-text/base/src/typedefs'; 12 | 13 | 14 | /** 15 | * Default options. 
16 | * 17 | * @constant 18 | * @type { Options } 19 | * @default 20 | * @private 21 | */ 22 | const DEFAULT_OPTIONS = { 23 | baseElements: { 24 | selectors: [ 'body' ], 25 | orderBy: 'selectors', // 'selectors' | 'occurrence' 26 | returnDomByDefault: true 27 | }, 28 | decodeEntities: true, 29 | encodeCharacters: {}, 30 | formatters: {}, 31 | limits: { 32 | ellipsis: '...', 33 | maxBaseElements: undefined, 34 | maxChildNodes: undefined, 35 | maxDepth: undefined, 36 | maxInputLength: (1 << 24) // 16_777_216 37 | }, 38 | longWordSplit: { 39 | forceWrapOnLimit: false, 40 | wrapCharacters: [] 41 | }, 42 | preserveNewlines: false, 43 | selectors: [ 44 | { selector: '*', format: 'inline' }, 45 | { 46 | selector: 'a', 47 | format: 'anchor', 48 | options: { 49 | baseUrl: null, 50 | hideLinkHrefIfSameAsText: false, 51 | ignoreHref: false, 52 | linkBrackets: ['[', ']'], 53 | noAnchorUrl: true 54 | } 55 | }, 56 | { selector: 'article', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 57 | { selector: 'aside', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 58 | { 59 | selector: 'blockquote', 60 | format: 'blockquote', 61 | options: { leadingLineBreaks: 2, trailingLineBreaks: 2, trimEmptyLines: true } 62 | }, 63 | { selector: 'br', format: 'lineBreak' }, 64 | { selector: 'div', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 65 | { selector: 'footer', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 66 | { selector: 'form', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 67 | { selector: 'h1', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } }, 68 | { selector: 'h2', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } }, 69 | { selector: 'h3', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } }, 70 | { selector: 
'h4', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } }, 71 | { selector: 'h5', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } }, 72 | { selector: 'h6', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } }, 73 | { selector: 'header', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 74 | { 75 | selector: 'hr', 76 | format: 'horizontalLine', 77 | options: { leadingLineBreaks: 2, length: undefined, trailingLineBreaks: 2 } 78 | }, 79 | { 80 | selector: 'img', 81 | format: 'image', 82 | options: { baseUrl: null, linkBrackets: ['[', ']'] } 83 | }, 84 | { selector: 'main', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 85 | { selector: 'nav', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 86 | { 87 | selector: 'ol', 88 | format: 'orderedList', 89 | options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } 90 | }, 91 | { selector: 'p', format: 'paragraph', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } }, 92 | { selector: 'pre', format: 'pre', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } }, 93 | { selector: 'section', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } }, 94 | { 95 | selector: 'table', 96 | format: 'table', 97 | options: { 98 | colSpacing: 3, 99 | leadingLineBreaks: 2, 100 | maxColumnWidth: 60, 101 | rowSpacing: 0, 102 | trailingLineBreaks: 2, 103 | uppercaseHeaderCells: true 104 | } 105 | }, 106 | { 107 | selector: 'ul', 108 | format: 'unorderedList', 109 | options: { itemPrefix: ' * ', leadingLineBreaks: 2, trailingLineBreaks: 2 } 110 | }, 111 | { selector: 'wbr', format: 'wbr' }, 112 | ], 113 | tables: [], // deprecated 114 | whitespaceCharacters: ' \t\r\n\f\u200b', 115 | wordwrap: 80 116 | }; 117 | 118 | const concatMerge = (acc, src, options) => [...acc, ...src]; 119 | const overwriteMerge = 
(acc, src, options) => [...src]; 120 | const selectorsMerge = (acc, src, options) => ( 121 | (acc.some(s => typeof s === 'object')) 122 | ? concatMerge(acc, src, options) // selectors 123 | : overwriteMerge(acc, src, options) // baseElements.selectors 124 | ); 125 | 126 | /** 127 | * Preprocess options, compile selectors into a decision tree, 128 | * return a function intended for batch processing. 129 | * 130 | * @param { Options } [options = {}] HtmlToText options. 131 | * @returns { (html: string, metadata?: any) => string } Pre-configured converter function. 132 | * @static 133 | */ 134 | function compile (options = {}) { 135 | options = merge( 136 | DEFAULT_OPTIONS, 137 | options, 138 | { 139 | arrayMerge: overwriteMerge, 140 | customMerge: (key) => ((key === 'selectors') ? selectorsMerge : undefined) 141 | } 142 | ); 143 | options.formatters = Object.assign({}, genericFormatters, textFormatters, options.formatters); 144 | options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector)); 145 | 146 | handleDeprecatedOptions(options); 147 | 148 | return compile_(options); 149 | } 150 | 151 | /** 152 | * Convert given HTML content to plain text string. 153 | * 154 | * @param { string } html HTML content to convert. 155 | * @param { Options } [options = {}] HtmlToText options. 156 | * @param { any } [metadata] Optional metadata for HTML document, for use in formatters. 157 | * @returns { string } Plain text string. 158 | * @static 159 | * 160 | * @example 161 | * const { convert } = require('html-to-text'); 162 | * const text = convert('

Hello World

', { 163 | * wordwrap: 130 164 | * }); 165 | * console.log(text); // HELLO WORLD 166 | */ 167 | function convert (html, options = {}, metadata = undefined) { 168 | return compile(options)(html, metadata); 169 | } 170 | 171 | /** 172 | * Map previously existing and now deprecated options to the new options layout. 173 | * This is a subject for cleanup in major releases. 174 | * 175 | * @param { Options } options HtmlToText options. 176 | */ 177 | function handleDeprecatedOptions (options) { 178 | if (options.tags) { 179 | const tagDefinitions = Object.entries(options.tags).map( 180 | ([selector, definition]) => ({ ...definition, selector: selector || '*' }) 181 | ); 182 | options.selectors.push(...tagDefinitions); 183 | options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector)); 184 | } 185 | 186 | function set (obj, path, value) { 187 | const valueKey = path.pop(); 188 | for (const key of path) { 189 | let nested = obj[key]; 190 | if (!nested) { 191 | nested = {}; 192 | obj[key] = nested; 193 | } 194 | obj = nested; 195 | } 196 | obj[valueKey] = value; 197 | } 198 | 199 | if (options['baseElement']) { 200 | const baseElement = options['baseElement']; 201 | set( 202 | options, 203 | ['baseElements', 'selectors'], 204 | (Array.isArray(baseElement) ? 
baseElement : [baseElement]) 205 | ); 206 | } 207 | if (options['returnDomByDefault'] !== undefined) { 208 | set(options, ['baseElements', 'returnDomByDefault'], options['returnDomByDefault']); 209 | } 210 | 211 | for (const definition of options.selectors) { 212 | if (definition.format === 'anchor' && get(definition, ['options', 'noLinkBrackets'])) { 213 | set(definition, ['options', 'linkBrackets'], false); 214 | } 215 | } 216 | } 217 | 218 | export { 219 | compile, 220 | convert, 221 | convert as htmlToText 222 | }; 223 | -------------------------------------------------------------------------------- /packages/html-to-md/test/tags.js: -------------------------------------------------------------------------------- 1 | 2 | import test from 'ava'; 3 | 4 | import { htmlToMarkdown } from '../src/html-to-md'; 5 | 6 | 7 | const snapshotMacro = test.macro({ 8 | exec: function (t, html, options = undefined, metadata = undefined) { 9 | t.snapshot(htmlToMarkdown(html, options, metadata), '```html\n' + html + '\n```'); 10 | } 11 | }); 12 | 13 | function tagsSequence (tagNames) { 14 | return tagNames.map(s => `<${s}>${s}`).join(' '); 15 | } 16 | 17 | test( 18 | 'common block-level elements', 19 | snapshotMacro, 20 | tagsSequence([ 21 | 'article', 'aside', 'div', 'figure', 'figcaption', 22 | 'footer', 'form', 'header', 'main', 'nav', 'p', 'section' 23 | ]) 24 | ); 25 | 26 | test( 27 | 'block with custom spacing', 28 | snapshotMacro, 29 | tagsSequence([ 30 | 'div', 'div', 'p', 'p', 'div', 'div' 31 | ]), 32 | { 33 | selectors: [ 34 | { selector: 'p', options: { leadingLineBreaks: 4, trailingLineBreaks: 3 } } 35 | ] 36 | } 37 | ); 38 | 39 | test( 40 | 'default formatter is inline', 41 | snapshotMacro, 42 | 'Lorem ipsum dolor met' 43 | ); 44 | 45 | test( 46 | 'headings', 47 | snapshotMacro, 48 | tagsSequence([ 49 | 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p' 50 | ]) 51 | ); 52 | 53 | test( 54 | 'line breaks (HTML tags by default)', 55 | snapshotMacro, 56 | `a
b

c


d` 57 | ); 58 | 59 | test( 60 | 'line breaks (two spaces)', 61 | snapshotMacro, 62 | `a
b

c


d`, 63 | { 64 | selectors: [ 65 | { selector: 'br', options: { string: ' \n' } } 66 | ] 67 | } 68 | ); 69 | 70 | test( 71 | 'line breaks (backslash)', 72 | snapshotMacro, 73 | `a
b

c


d`, 74 | { 75 | selectors: [ 76 | { selector: 'br', options: { string: '\\\n' } } 77 | ] 78 | } 79 | ); 80 | 81 | test( 82 | 'horizontal lines (default)', 83 | snapshotMacro, 84 | `a
b\n
\nc` 85 | ); 86 | 87 | test( 88 | 'horizontal lines (custom)', 89 | snapshotMacro, 90 | `a
b\n
\nc`, 91 | { 92 | selectors: [ 93 | { selector: 'hr', options: { string: '* * *' } } 94 | ] 95 | } 96 | ); 97 | 98 | test( 99 | 'pre', 100 | snapshotMacro, 101 | '

Code fragment:

  body {\n    color: red;\n  }
' 102 | ); 103 | 104 | test( 105 | 'blockquote', 106 | snapshotMacro, 107 | 'foo
quote
bar' 108 | ); 109 | 110 | test( 111 | 'img', 112 | snapshotMacro, 113 | 'alt text' 114 | ); 115 | 116 | test( 117 | 'img with rewritten path', 118 | snapshotMacro, 119 | '', 120 | { 121 | selectors: [ 122 | { 123 | selector: 'img', 124 | options: { pathRewrite: (path, meta) => path.replace('pictures/', meta.assetsPath) } 125 | } 126 | ] 127 | }, 128 | { assetsPath: 'assets/' } // metadata 129 | ); 130 | 131 | test( 132 | 'img with source encoded as data url', 133 | snapshotMacro, 134 | 'Red dot' 135 | ); 136 | 137 | test( 138 | 'link', 139 | snapshotMacro, 140 | 'test' 141 | ); 142 | 143 | test( 144 | 'email link', 145 | snapshotMacro, 146 | 'mail me' 147 | ); 148 | 149 | test( 150 | 'anchor link', 151 | snapshotMacro, 152 | 'test' 153 | ); 154 | 155 | test( 156 | 'link with title', 157 | snapshotMacro, 158 | 'test' 159 | ); 160 | 161 | test( 162 | 'link with rewritten path and baseUrl', 163 | snapshotMacro, 164 | 'test', 165 | { 166 | selectors: [ 167 | { 168 | selector: 'a', 169 | options: { 170 | baseUrl: 'https://example.com/', 171 | pathRewrite: (path, meta) => meta.path + path 172 | } 173 | } 174 | ] 175 | }, 176 | { path: '/foo/bar' } // metadata 177 | ); 178 | 179 | test( 180 | 'named anchor', 181 | snapshotMacro, 182 | '' 183 | ); 184 | 185 | test( 186 | 'bold, strong', 187 | snapshotMacro, 188 | 'bold, strong' 189 | ); 190 | 191 | test( 192 | 'italic, emphasis', 193 | snapshotMacro, 194 | 'italic, emphasis' 195 | ); 196 | 197 | test( 198 | 'strikethrough, del', 199 | snapshotMacro, 200 | 'strikethrough, deleted' 201 | ); 202 | 203 | test( 204 | 'inline code', 205 | snapshotMacro, 206 | 'Lorem ipsum code dolor sit' 207 | ); 208 | 209 | test( 210 | 'sub, sup', 211 | snapshotMacro, 212 | 'x2, x2' 213 | ); 214 | 215 | test( 216 | 'kbd', 217 | snapshotMacro, 218 | 'Ctrl + C' 219 | ); 220 | 221 | test( 222 | 'figure', 223 | snapshotMacro, 224 | /*html*/` 225 |
226 | Alt test 228 |
Caption
229 |
` 230 | ); 231 | 232 | test( 233 | 'picture - ignore sources', 234 | snapshotMacro, 235 | /*html*/` 236 | 237 | 239 | Alt text 240 | ` 241 | ); 242 | 243 | test( 244 | 'definition lists', 245 | snapshotMacro, 246 | /*html*/` 247 |
248 |
Title 1
249 |
Definition 1
250 |
Title 2a
251 |
Title 2b
252 |
Definition 2a
253 |
Definition 2b
254 |
` 255 | ); 256 | 257 | test( 258 | 'definition list with divs', 259 | snapshotMacro, 260 | /*html*/` 261 |
262 |
263 |
Title 1
264 |
Definition 1
265 |
266 |
267 |
Title 2a
268 |
Title 2b
269 |
Definition 2a
270 |
Definition 2b
271 |
272 |
` 273 | ); 274 | 275 | test( 276 | 'definition lists (compatible syntax)', 277 | snapshotMacro, 278 | /*html*/` 279 |
280 |
Title 1
281 |
Definition 1
282 |
Title 2a
283 |
Title 2b
284 |
Definition 2a
285 |
Definition 2b
286 |
`, 287 | { 288 | selectors: [ 289 | { selector: 'dl', format: 'definitionListCompatible' } 290 | ] 291 | } 292 | ); 293 | 294 | test( 295 | 'unordered list', 296 | snapshotMacro, 297 | /*html*/` 298 |
    299 |
  • Item 1
  • 300 |
  • Item 2
  • 301 |
  • Item 3
  • 302 |
` 303 | ); 304 | 305 | test( 306 | 'ordered list', 307 | snapshotMacro, 308 | /*html*/` 309 |
    310 |
  1. Item 1
  2. 311 |
  3. Item 2
  4. 312 |
  5. Item 3
  6. 313 |
` 314 | ); 315 | 316 | test( 317 | 'ordered list with start number (numbering type is ignored)', 318 | snapshotMacro, 319 | /*html*/` 320 |
    321 |
  1. Item 1
  2. 322 |
  3. Item 2
  4. 323 |
  5. Item 3
  6. 324 |
` 325 | ); 326 | 327 | test( 328 | 'ordered list with overridden start number', 329 | snapshotMacro, 330 | /*html*/` 331 |
    332 |
  1. Item 1
  2. 333 |
  3. Item 2
  4. 334 |
  5. Item 3
  6. 335 |
`, 336 | { 337 | selectors: [ 338 | { selector: 'ol', options: { start: 22 } } 339 | ] 340 | } 341 | ); 342 | 343 | test( 344 | 'table with header cells in the first row', 345 | snapshotMacro, 346 | /*html*/` 347 | 348 | 349 | 350 | 351 | 352 |
abc
def
g
g
h

h
i


i

j

k

k

l
` 353 | ); 354 | 355 | test( 356 | 'table with thead, tbody, tfoot', 357 | snapshotMacro, 358 | /*html*/` 359 | 360 | 361 | 362 | 363 |
abc
def
ghi
` 364 | ); 365 | 366 | test( 367 | 'table without a header', 368 | snapshotMacro, 369 | /*html*/` 370 | 371 | 372 | 373 | 374 |
abc
def
ghi
` 375 | ); 376 | 377 | const tableWithSpannedCells = /*html*/` 378 | 379 | 380 | 381 | 382 | 383 | 384 |
ac
de
g
kl
m
`; 385 | 386 | test( 387 | 'table with colspans and rowspans (repeat value by default)', 388 | snapshotMacro, 389 | tableWithSpannedCells 390 | ); 391 | 392 | test( 393 | 'table with colspans and rowspans (value in first cell only)', 394 | snapshotMacro, 395 | tableWithSpannedCells, 396 | { 397 | selectors: [ 398 | { selector: 'table', options: { spanMode: 'first' } } 399 | ] 400 | } 401 | ); 402 | 403 | test( 404 | 'table with colspans and rowspans (value repeated in cells of the first row only)', 405 | snapshotMacro, 406 | tableWithSpannedCells, 407 | { 408 | selectors: [ 409 | { selector: 'table', options: { spanMode: 'firstRow' } } 410 | ] 411 | } 412 | ); 413 | 414 | test( 415 | 'table with colspans and rowspans (value repeated in cells of the first column only)', 416 | snapshotMacro, 417 | tableWithSpannedCells, 418 | { 419 | selectors: [ 420 | { selector: 'table', options: { spanMode: 'firstCol' } } 421 | ] 422 | } 423 | ); 424 | 425 | test( 426 | 'table with colspans and rowspans (use HTML tag for spanned cells)', 427 | snapshotMacro, 428 | /*html*/` 429 | 430 | 431 | 432 | 433 | 434 |
abc
de
g
j
`, 435 | { 436 | selectors: [ 437 | { selector: 'table', options: { spanMode: 'tag' } } 438 | ] 439 | } 440 | ); 441 | 442 | test( 443 | 'table with colspans and rowspans (fallback to HTML from "tag" mode)', 444 | snapshotMacro, 445 | tableWithSpannedCells, 446 | { 447 | selectors: [ 448 | { selector: 'table', options: { spanMode: 'tag' } } 449 | ] 450 | } 451 | ); 452 | -------------------------------------------------------------------------------- /packages/base/src/typedefs.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @typedef { object } Options 4 | * HtmlToText options. 5 | * 6 | * @property { BaseElementsOptions } [baseElements] 7 | * Options for narrowing down to informative parts of HTML document. 8 | * 9 | * @property { boolean } [decodeEntities] 10 | * Specify whether HTML entities should be decoded in the text output. 11 | * 12 | * @property { {[key: string]: string | false} | ((str: string) => string) | undefined } [encodeCharacters] 13 | * A dictionary mapping from input text characters to escape sequences 14 | * (you can set values to false to disable escaping characters that are enabled by default) 15 | * or a function that does the replacement. 16 | * 17 | * @property { {[key: string]: FormatCallback} } [formatters = {}] 18 | * A dictionary with custom formatting functions for specific kinds of elements. 19 | * 20 | * Keys are custom string identifiers, values are callbacks. 21 | * 22 | * @property { LimitsOptions } [limits] 23 | * Options for handling complex documents and limiting the output size. 24 | * 25 | * @property { LongWordSplitOptions } [longWordSplit] 26 | * Describes how to wrap long words. 27 | * 28 | * @property { boolean } [preserveNewlines = false] 29 | * By default, any newlines `\n` from the input HTML are collapsed into space as any other HTML whitespace characters. 30 | * If `true`, these newlines will be preserved in the output. 
31 | * This is only useful when input HTML carries some plain text formatting instead of proper tags. 32 | * 33 | * @property { SelectorDefinition[] } [selectors = []] 34 | * Instructions for how to render HTML elements based on matched selectors. 35 | * 36 | * Use this to (re)define options for new or already supported tags. 37 | * 38 | * @property { string[] | boolean } [tables = []] 39 | * Deprecated. Use selectors with `format: 'dataTable'` instead. 40 | * 41 | * @property { string } [whitespaceCharacters = ' \t\r\n\f\u200b'] 42 | * All characters that are considered whitespace. 43 | * Default is according to HTML specifications. 44 | * 45 | * @property { number | boolean | null } [wordwrap = 80] 46 | * After how many chars a line break should follow in blocks. 47 | * 48 | * Set to `null` or `false` to disable word-wrapping. 49 | */ 50 | 51 | /** 52 | * @typedef { object } BaseElementsOptions 53 | * Options for narrowing down to informative parts of HTML document. 54 | * 55 | * @property { string[] } [selectors = ['body']] 56 | * The resulting text output will be composed from the text content of elements 57 | * matched with these selectors. 58 | * 59 | * @property { 'selectors' | 'occurrence' } [orderBy = 'selectors'] 60 | * When multiple selectors are set, this option specifies 61 | * whether the selectors order has to be reflected in the output text. 62 | * 63 | * `'selectors'` (default) - matches for the first selector will appear first, etc; 64 | * 65 | * `'occurrence'` - all bases will appear in the same order as in input HTML. 66 | * 67 | * @property { boolean } [returnDomByDefault = true] 68 | * Use the entire document if none of provided selectors matched. 69 | */ 70 | 71 | /** 72 | * @typedef { object } DecodeOptions 73 | * Text decoding options given to `he.decode`. 74 | * 75 | * For more information see the [he](https://github.com/mathiasbynens/he) module. 
76 | * 77 | * @property { boolean } [isAttributeValue = false] 78 | * TL;DR: If set to `true` - leave attribute values raw, don't parse them as text content. 79 | * 80 | * @property { boolean } [strict = false] 81 | * TL;DR: If set to `true` - throw an error on invalid HTML input. 82 | */ 83 | 84 | /** 85 | * @typedef { object } LimitsOptions 86 | * Options for handling complex documents and limiting the output size. 87 | * 88 | * @property { string } [ellipsis = '...'] 89 | * A string to put in place of skipped content. 90 | * 91 | * @property { number | undefined } [maxBaseElements = undefined] 92 | * Stop looking for new base elements after this number of matches. 93 | * 94 | * No ellipsis is used when this condition is met. 95 | * 96 | * No limit if undefined. 97 | * 98 | * @property { number | undefined } [maxChildNodes = undefined] 99 | * Process only this many child nodes of any element. 100 | * 101 | * Remaining nodes, if any, will be replaced with ellipsis. 102 | * 103 | * Text nodes are counted along with tags. 104 | * 105 | * No limit if undefined. 106 | * 107 | * @property { number | undefined } [maxDepth = undefined] 108 | * Only go to a certain depth starting from `Options.baseElements`. 109 | * 110 | * Replace deeper nodes with ellipsis. 111 | * 112 | * No depth limit if undefined. 113 | * 114 | * @property { number } [maxInputLength = 16_777_216] 115 | * If the input string is longer than this value - it will be truncated 116 | * and a message will be sent to `stderr`. 117 | * 118 | * Ellipsis is not used in this case. 119 | */ 120 | 121 | /** 122 | * @typedef { object } LongWordSplitOptions 123 | * Describes how to wrap long words. 124 | * 125 | * @property { boolean } [forceWrapOnLimit = false] 126 | * Break long words on the `Options.wordwrap` limit when there are no characters to wrap on. 127 | * 128 | * @property { string[] } [wrapCharacters = []] 129 | * An array containing the characters that may be wrapped on. 
130 | */ 131 | 132 | /** 133 | * @typedef { object } SelectorDefinition 134 | * Describes how to handle tags matched by a selector. 135 | * 136 | * @property { string } selector 137 | * CSS selector. Refer to README for notes on supported selectors etc. 138 | * 139 | * @property { string } format 140 | * Identifier of a {@link FormatCallback}, built-in or provided in `Options.formatters` dictionary. 141 | * 142 | * @property { FormatOptions } options 143 | * Options to customize the formatter for this element. 144 | */ 145 | 146 | /** 147 | * @typedef { object } FormatOptions 148 | * Options specific to different formatters ({@link FormatCallback}). 149 | * This is an umbrella type definition. Each formatter supports its own subset of options. 150 | * 151 | * @property { number } [leadingLineBreaks] 152 | * Number of line breaks to separate previous block from this one. 153 | * 154 | * Note that N+1 line breaks are needed to make N empty lines. 155 | * 156 | * @property { number } [trailingLineBreaks] 157 | * Number of line breaks to separate this block from the next one. 158 | * 159 | * Note that N+1 line breaks are needed to make N empty lines. 160 | * 161 | * @property { string | null } [baseUrl = null] 162 | * (Only for: `anchor` and `image` formatters.) Server host for link `href` attributes and image `src` attributes 163 | * relative to the root (the ones that start with `/`). 164 | * 165 | * For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>` 166 | * the link in the text will be `http://asdf.com/dir/subdir`. 167 | * 168 | * Keep in mind that `baseUrl` should not end with a `/`. 169 | * 170 | * @property { boolean } [hideLinkHrefIfSameAsText = false] 171 | * (Only for: `anchor` formatter.) By default links are translated in the following way: 172 | * 173 | * `<a href='link'>text</a>` => becomes => `text [link]`. 174 | * 175 | * If this option is set to `true` and `link` and `text` are the same, 176 | * `[link]` will be omitted and only `text` will be present. 
177 | * 178 | * @property { boolean } [ignoreHref = false] 179 | * (Only for: `anchor` formatter.) Ignore all links. Only process internal text of anchor tags. 180 | * 181 | * @property { [string, string] | false } [linkBrackets] 182 | * (Only for: `anchor` and `image` formatters.) Surround links with these brackets. Default: `['[', ']']`. 183 | * 184 | * Set to `false` or `['', '']` to disable. 185 | * 186 | * @property { boolean } [noAnchorUrl = true] 187 | * (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`). 188 | * 189 | * @property { string } [itemPrefix = ' * '] 190 | * (Only for: `unorderedList` formatter.) String prefix for each list item. 191 | * 192 | * @property { boolean } [uppercase = true] 193 | * (Only for: `heading` formatter.) By default, headings (`<h1>`, `<h2>`, etc) are uppercased. 194 | * 195 | * Set this to `false` to leave headings as they are. 196 | * 197 | * @property { number | undefined } [length = undefined] 198 | * (Only for: `horizontalLine` formatter.) Length of the `<hr>` line. 199 | * 200 | * If numeric value is provided - it is used. 201 | * Otherwise, if global `wordwrap` number is provided - it is used. 202 | * If neither is true, then the fallback value of 40 is used. 203 | * 204 | * @property { boolean } [trimEmptyLines = true] 205 | * (Only for: `blockquote` formatter.) Trim empty lines from blockquote. 206 | * 207 | * @property { boolean } [uppercaseHeaderCells = true] 208 | * (Only for: `table`, `dataTable` formatter.) By default, heading cells (`<th>`) are uppercased. 209 | * 210 | * Set this to `false` to leave heading cells as they are. 211 | * 212 | * @property { number } [maxColumnWidth = 60] 213 | * (Only for: `table`, `dataTable` formatter.) Data table cell content will be wrapped to fit this width 214 | * instead of global `wordwrap` limit. 215 | * 216 | * Set this to `undefined` in order to fall back to `wordwrap` limit. 217 | * 218 | * @property { number } [colSpacing = 3] 219 | * (Only for: `table`, `dataTable` formatter.) Number of spaces between data table columns. 220 | * 221 | * @property { number } [rowSpacing = 0] 222 | * (Only for: `table`, `dataTable` formatter.) Number of empty lines between data table rows. 223 | * 224 | */ 225 | 226 | /** 227 | * @typedef { object } DomNode 228 | * Simplified definition of [htmlparser2](https://github.com/fb55/htmlparser2) Node type. 229 | * 230 | * Makes no distinction between elements (tags) and data nodes (good enough for now). 231 | * 232 | * @property { string } type Type of node - "text", "tag", "comment", "script", etc. 233 | * @property { string } [data] Content of a data node. 234 | * @property { string } [name] Tag name. 235 | * @property { {[key: string]: string} } [attribs] Tag attributes dictionary. 236 | * @property { DomNode[] } [children] Child nodes. 237 | * @property { DomNode } [parent] Parent node. 238 | */ 239 | 240 | /** 241 | * A function to stringify a DOM node.
242 | * 243 | * @callback FormatCallback 244 | * 245 | * @param { DomNode } elem A DOM node as returned by [htmlparser2](https://github.com/fb55/htmlparser2). 246 | * @param { RecursiveCallback } walk Recursive callback to process child nodes. 247 | * @param { BlockTextBuilder } builder Passed around to accumulate output text. Contains options object. 248 | * @param { FormatOptions } formatOptions Options specific to this callback. 249 | */ 250 | 251 | /** 252 | * A function to process child nodes. 253 | * Passed into a {@link FormatCallback} as an argument. 254 | * 255 | * @callback RecursiveCallback 256 | * 257 | * @param { DomNode[] } [nodes] DOM nodes array. 258 | * @param { BlockTextBuilder } builder Passed around to accumulate output text. Contains options object. 259 | */ 260 | 261 | /** 262 | * @typedef { object } TablePrinterCell 263 | * Cell definition for a table printer. 264 | * 265 | * @property { number } colspan Number of columns this cell occupies. 266 | * @property { number } rowspan Number of rows this cell occupies. 267 | * @property { string } text Cell contents (pre-wrapped). 268 | */ 269 | 270 | /** 271 | * Render a table into a string. 272 | * Cells can contain multiline text and span across multiple rows and columns. 273 | * 274 | * Can modify cells. 275 | * 276 | * @callback TablePrinter 277 | * 278 | * @param { TablePrinterCell[][] } tableRows Table to render. 
279 | * @returns { string } 280 | */ 281 | -------------------------------------------------------------------------------- /packages/html-to-text/src/text-formatters.js: -------------------------------------------------------------------------------- 1 | 2 | import { get, numberToLetterSequence, numberToRoman, trimCharacter, trimCharacterEnd } from '@html-to-text/base/src/util'; 3 | 4 | import { tableToString } from './table-printer'; 5 | 6 | // eslint-disable-next-line import/no-unassigned-import 7 | import '@html-to-text/base/src/typedefs'; 8 | 9 | 10 | /** 11 | * Process a line-break: adds a single line break to the output through the builder. 12 | * 13 | * @type { FormatCallback } 14 | */ 15 | function formatLineBreak (elem, walk, builder, formatOptions) { 16 | builder.addLineBreak(); 17 | } 18 | 19 | /** 20 | * Process a `wbr` tag (word break opportunity): registers a word break opportunity with the builder. 21 | * 22 | * @type { FormatCallback } 23 | */ 24 | function formatWbr (elem, walk, builder, formatOptions) { 25 | builder.addWordBreakOpportunity(); 26 | } 27 | 28 | /** 29 | * Process a horizontal line: a block holding a run of `-` characters whose length is `formatOptions.length`, else the global `wordwrap`, else 40. Line break counts default to 2; because `||` is used, an explicit `0` also yields 2. 30 | * 31 | * @type { FormatCallback } 32 | */ 33 | function formatHorizontalLine (elem, walk, builder, formatOptions) { 34 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 35 | builder.addInline('-'.repeat(formatOptions.length || builder.options.wordwrap || 40)); 36 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 37 | } 38 | 39 | /** 40 | * Process a paragraph: walks the child nodes inside a block whose leading/trailing line break counts default to 2. 41 | * 42 | * @type { FormatCallback } 43 | */ 44 | function formatParagraph (elem, walk, builder, formatOptions) { 45 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 46 | walk(elem.children, builder); 47 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 48 | } 49 | 50 | /** 51 | * Process a preformatted content.
52 | * 53 | * @type { FormatCallback } 54 | */ 55 | function formatPre (elem, walk, builder, formatOptions) { 56 | builder.openBlock({ 57 | isPre: true, 58 | leadingLineBreaks: formatOptions.leadingLineBreaks || 2 59 | }); 60 | walk(elem.children, builder); 61 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 62 | } 63 | 64 | /** 65 | * Process a heading: a block whose words are uppercased through a word transform unless `formatOptions.uppercase` is `false`. 66 | * 67 | * @type { FormatCallback } 68 | */ 69 | function formatHeading (elem, walk, builder, formatOptions) { 70 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 71 | if (formatOptions.uppercase !== false) { 72 | builder.pushWordTransform(str => str.toUpperCase()); 73 | walk(elem.children, builder); 74 | builder.popWordTransform(); 75 | } else { 76 | walk(elem.children, builder); 77 | } 78 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 79 | } 80 | 81 | /** 82 | * Process a blockquote: every output line is prefixed with `> `; leading and trailing newlines are trimmed first unless `formatOptions.trimEmptyLines` is `false`. 83 | * 84 | * @type { FormatCallback } 85 | */ 86 | function formatBlockquote (elem, walk, builder, formatOptions) { 87 | builder.openBlock({ 88 | leadingLineBreaks: formatOptions.leadingLineBreaks || 2, 89 | /* presumably leaves room for the two-character `> ` prefix added on close - confirm against BlockTextBuilder */ reservedLineLength: 2 90 | }); 91 | walk(elem.children, builder); 92 | builder.closeBlock({ 93 | trailingLineBreaks: formatOptions.trailingLineBreaks || 2, 94 | blockTransform: str => ((formatOptions.trimEmptyLines !== false) ? trimCharacter(str, '\n') : str) 95 | .split('\n') 96 | .map(line => '> ' + line) 97 | .join('\n') 98 | }); 99 | } 100 | 101 | /** Surround `str` with `brackets[0]` and `brackets[1]` (a non-string entry falls back to `[` or `]`); a falsy `brackets` returns `str` unchanged. */ function withBrackets (str, brackets) { 102 | if (!brackets) { return str; } 103 | 104 | const lbr = (typeof brackets[0] === 'string') 105 | ? brackets[0] 106 | : '['; 107 | const rbr = (typeof brackets[1] === 'string') 108 | ? brackets[1] 109 | : ']'; 110 | return lbr + str + rbr; 111 | } 112 | 113 | /** Apply the optional `rewriter(path, metadata, elem)` callback, then prepend `baseUrl` (with a trailing `/` trimmed) when the resulting path starts with `/` and `baseUrl` is set. */ function pathRewrite (path, rewriter, baseUrl, metadata, elem) { 114 | const modifiedPath = (typeof rewriter === 'function') 115 | ? 
rewriter(path, metadata, elem) 116 | : path; 117 | return (modifiedPath[0] === '/' && baseUrl) 118 | ? trimCharacterEnd(baseUrl, '/') + modifiedPath 119 | : modifiedPath; 120 | } 121 | 122 | /** 123 | * Process an image: emits the `alt` text, the (rewritten, bracketed) `src`, or both separated by a space, bypassing word transforms. 124 | * 125 | * @type { FormatCallback } 126 | */ 127 | function formatImage (elem, walk, builder, formatOptions) { 128 | const attribs = elem.attribs || {}; 129 | const alt = (attribs.alt) 130 | ? attribs.alt 131 | : ''; 132 | const src = (!attribs.src) 133 | ? '' 134 | : pathRewrite(attribs.src, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem); 135 | const text = (!src) 136 | ? alt 137 | : (!alt) 138 | ? withBrackets(src, formatOptions.linkBrackets) 139 | : alt + ' ' + withBrackets(src, formatOptions.linkBrackets); 140 | 141 | builder.addInline(text, { noWordTransform: true }); 142 | } 143 | 144 | /** 145 | * Process an anchor: outputs the anchor text followed by its href (bracketed via `linkBrackets`), or the text alone when there is no usable href. 146 | * 147 | * @type { FormatCallback } 148 | */ 149 | function formatAnchor (elem, walk, builder, formatOptions) { 150 | /* Resolve the href to print: '' when `ignoreHref` is set, the attribute is missing, or it is a `#...` link and `noAnchorUrl` is set; a leading `mailto:` is stripped. */ function getHref () { 151 | if (formatOptions.ignoreHref) { return ''; } 152 | if (!elem.attribs || !elem.attribs.href) { return ''; } 153 | let href = elem.attribs.href.replace(/^mailto:/, ''); 154 | if (formatOptions.noAnchorUrl && href[0] === '#') { return ''; } 155 | href = pathRewrite(href, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem); 156 | return href; 157 | } 158 | const href = getHref(); 159 | if (!href) { 160 | walk(elem.children, builder); 161 | } else { 162 | let text = ''; 163 | /* identity word transform: leaves the words untouched and only records them, so the anchor text can be compared with the href below */ builder.pushWordTransform( 164 | str => { 165 | if (str) { text += str; } 166 | return str; 167 | } 168 | ); 169 | walk(elem.children, builder); 170 | builder.popWordTransform(); 171 | 172 | const hideSameLink = formatOptions.hideLinkHrefIfSameAsText && href === text; 173 | if (!hideSameLink) { 174 | builder.addInline( 175 | (!text) 176 | ? 
href 177 | : ' ' + withBrackets(href, formatOptions.linkBrackets), 178 | { noWordTransform: true } 179 | ); 180 | } 181 | } 182 | } 183 | 184 | /** Shared list renderer: skips whitespace-only text children, assigns a prefix to every `li` child and emits all remaining children as items of one builder list. 185 | * @param { DomNode } elem List element whose children are the list items. 186 | * @param { RecursiveCallback } walk Recursive callback to process child nodes. 187 | * @param { BlockTextBuilder } builder Passed around to accumulate output text. 188 | * @param { FormatOptions } formatOptions Options specific to a formatter. 189 | * @param { () => string } nextPrefixCallback Function that returns the prefix for the next `li` item each time it is called. 190 | */ 191 | function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) { 192 | const isNestedList = get(elem, ['parent', 'name']) === 'li'; 193 | 194 | // With Roman numbers, index length is not as straightforward as with Arabic numbers or letters, 195 | // so the dumb length comparison is the most robust way to get the correct value. 196 | let maxPrefixLength = 0; 197 | const listItems = (elem.children || []) 198 | // it might be more accurate to check only for html spaces here, but no significant benefit 199 | .filter(child => child.type !== 'text' || !/^\s*$/.test(child.data)) 200 | .map(function (child) { 201 | /* non-`li` children are kept as items but get no prefix and do not consume an index */ if (child.name !== 'li') { 202 | return { node: child, prefix: '' }; 203 | } 204 | const prefix = (isNestedList) 205 | ? nextPrefixCallback().trimStart() 206 | : nextPrefixCallback(); 207 | if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; } 208 | return { node: child, prefix: prefix }; 209 | }); 210 | if (!listItems.length) { return; } 211 | 212 | builder.openList({ 213 | interRowLineBreaks: 1, 214 | leadingLineBreaks: isNestedList ? 
1 : (formatOptions.leadingLineBreaks || 2), 215 | maxPrefixLength: maxPrefixLength, 216 | prefixAlign: 'left' 217 | }); 218 | 219 | for (const { node, prefix } of listItems) { 220 | builder.openListItem({ prefix: prefix }); 221 | walk([node], builder); 222 | builder.closeListItem(); 223 | } 224 | 225 | builder.closeList({ trailingLineBreaks: isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2) }); 226 | } 227 | 228 | /** 229 | * Process an unordered list. 230 | * 231 | * @type { FormatCallback } 232 | */ 233 | function formatUnorderedList (elem, walk, builder, formatOptions) { 234 | const prefix = formatOptions.itemPrefix || ' * '; 235 | return formatList(elem, walk, builder, formatOptions, () => prefix); 236 | } 237 | 238 | /** 239 | * Process an ordered list. 240 | * 241 | * @type { FormatCallback } 242 | */ 243 | function formatOrderedList (elem, walk, builder, formatOptions) { 244 | let nextIndex = Number(elem.attribs.start || '1'); 245 | const indexFunction = getOrderedListIndexFunction(elem.attribs.type); 246 | const nextPrefixCallback = () => ' ' + indexFunction(nextIndex++) + '. '; 247 | return formatList(elem, walk, builder, formatOptions, nextPrefixCallback); 248 | } 249 | 250 | /** 251 | * Return a function that can be used to generate index markers of a specified format. 252 | * 253 | * @param { string } [olType='1'] Marker type. 254 | * @returns { (i: number) => string } 255 | */ 256 | function getOrderedListIndexFunction (olType = '1') { 257 | switch (olType) { 258 | case 'a': return (i) => numberToLetterSequence(i, 'a'); 259 | case 'A': return (i) => numberToLetterSequence(i, 'A'); 260 | case 'i': return (i) => numberToRoman(i).toLowerCase(); 261 | case 'I': return (i) => numberToRoman(i); 262 | case '1': 263 | default: return (i) => (i).toString(); 264 | } 265 | } 266 | 267 | /** 268 | * Given a list of class and ID selectors (prefixed with '.' and '#'), 269 | * return them as separate lists of names without prefixes. 
270 | * Selectors that start with neither '.' nor '#' are ignored. 271 | * @param { string[] } selectors Class and ID selectors (`[".class", "#id"]` etc). 272 | * @returns { { classes: string[], ids: string[] } } 273 | */ 274 | function splitClassesAndIds (selectors) { 275 | const classes = []; 276 | const ids = []; 277 | for (const selector of selectors) { 278 | if (selector.startsWith('.')) { 279 | classes.push(selector.substring(1)); 280 | } else if (selector.startsWith('#')) { 281 | ids.push(selector.substring(1)); 282 | } 283 | } 284 | return { classes: classes, ids: ids }; 285 | } 286 | 287 | /** Decide whether a table is a data table: always when `tables === true`, never when the element has no attributes, otherwise when one of its space-separated class names or ids matches a selector in `tables`. NOTE(review): any other non-iterable `tables` value would throw in `splitClassesAndIds` - assumes the options layer always supplies `true` or an array. */ function isDataTable (attr, tables) { 288 | if (tables === true) { return true; } 289 | if (!attr) { return false; } 290 | 291 | const { classes, ids } = splitClassesAndIds(tables); 292 | const attrClasses = (attr['class'] || '').split(' '); 293 | const attrIds = (attr['id'] || '').split(' '); 294 | 295 | return attrClasses.some(x => classes.includes(x)) || attrIds.some(x => ids.includes(x)); 296 | } 297 | 298 | /** 299 | * Process a table (either as a container or as a data table, depending on options). 300 | * 301 | * @type { FormatCallback } 302 | */ 303 | function formatTable (elem, walk, builder, formatOptions) { 304 | return isDataTable(elem.attribs, builder.options.tables) 305 | ? formatDataTable(elem, walk, builder, formatOptions) 306 | : formatBlock(elem, walk, builder, formatOptions); 307 | } 308 | 309 | /** Plain container block: walks the children inside a block, passing the line break options through as given (no fallback to 2, unlike the other block formatters in this file). */ function formatBlock (elem, walk, builder, formatOptions) { 310 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks }); 311 | walk(elem.children, builder); 312 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks }); 313 | } 314 | 315 | /** 316 | * Process a data table: collects rows and cells through the builder and renders them with `tableToString` (`rowSpacing` defaults to 0, `colSpacing` to 3). 317 | * 318 | * @type { FormatCallback } 319 | */ 320 | function formatDataTable (elem, walk, builder, formatOptions) { 321 | builder.openTable(); 322 | elem.children.forEach(walkTable); 323 | builder.closeTable({ 324 | tableToString: (rows) => tableToString(rows, formatOptions.rowSpacing ?? 
0, formatOptions.colSpacing ?? 3), 325 | leadingLineBreaks: formatOptions.leadingLineBreaks, 326 | trailingLineBreaks: formatOptions.trailingLineBreaks 327 | }); 328 | 329 | /** Emit one table cell; the `colspan`/`rowspan` attributes are coerced to numbers and fall back to 1 when missing, zero or not numeric. */ function formatCell (cellNode) { 330 | const colspan = +get(cellNode, ['attribs', 'colspan']) || 1; 331 | const rowspan = +get(cellNode, ['attribs', 'rowspan']) || 1; 332 | builder.openTableCell({ maxColumnWidth: formatOptions.maxColumnWidth }); 333 | walk(cellNode.children, builder); 334 | builder.closeTableCell({ colspan: colspan, rowspan: rowspan }); 335 | } 336 | 337 | /** Walk the table structure: recurse through `thead`/`tbody`/`tfoot`/`center`, emit one row per `tr` (using only its `th`/`td` tag children) and ignore every other node. */ function walkTable (elem) { 338 | if (elem.type !== 'tag') { return; } 339 | 340 | /* header cells are uppercased through a word transform unless `uppercaseHeaderCells` is `false` */ const formatHeaderCell = (formatOptions.uppercaseHeaderCells !== false) 341 | ? (cellNode) => { 342 | builder.pushWordTransform(str => str.toUpperCase()); 343 | formatCell(cellNode); 344 | builder.popWordTransform(); 345 | } 346 | : formatCell; 347 | 348 | switch (elem.name) { 349 | case 'thead': 350 | case 'tbody': 351 | case 'tfoot': 352 | case 'center': 353 | elem.children.forEach(walkTable); 354 | return; 355 | 356 | case 'tr': { 357 | builder.openTableRow(); 358 | for (const childOfTr of elem.children) { 359 | if (childOfTr.type !== 'tag') { continue; } 360 | switch (childOfTr.name) { 361 | case 'th': { 362 | formatHeaderCell(childOfTr); 363 | break; 364 | } 365 | case 'td': { 366 | formatCell(childOfTr); 367 | break; 368 | } 369 | default: 370 | // do nothing 371 | } 372 | } 373 | builder.closeTableRow(); 374 | break; 375 | } 376 | 377 | default: 378 | // do nothing 379 | } 380 | } 381 | } 382 | 383 | 384 | export { 385 | formatAnchor as anchor, 386 | formatBlockquote as blockquote, 387 | formatDataTable as dataTable, 388 | formatHeading as heading, 389 | formatHorizontalLine as horizontalLine, 390 | formatImage as image, 391 | formatLineBreak as lineBreak, 392 | formatOrderedList as orderedList, 393 | formatParagraph as paragraph, 394 | formatPre as pre, 395 | formatTable as table, 396 | formatUnorderedList as unorderedList, 397 | 
formatWbr as wbr, 398 | }; 399 | -------------------------------------------------------------------------------- /packages/html-to-md/test/snapshots/tags.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/tags.js` 2 | 3 | The actual snapshot is saved in `tags.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## common block-level elements 8 | 9 | > ```html 10 | >
article
div
figure
figcaption
footer
form
header
main

p

section
11 | > ``` 12 | 13 | `article␊ 14 | ␊ 15 | aside␊ 16 | ␊ 17 | div␊ 18 | ␊ 19 | figure␊ 20 | ␊ 21 | figcaption␊ 22 | ␊ 23 | footer␊ 24 | ␊ 25 | form␊ 26 | ␊ 27 | header␊ 28 | ␊ 29 | main␊ 30 | ␊ 31 | nav␊ 32 | ␊ 33 | p␊ 34 | ␊ 35 | section` 36 | 37 | ## block with custom spacing 38 | 39 | > ```html 40 | >
div
div

p

p

div
div
41 | > ``` 42 | 43 | `div␊ 44 | ␊ 45 | div␊ 46 | ␊ 47 | ␊ 48 | ␊ 49 | p␊ 50 | ␊ 51 | ␊ 52 | ␊ 53 | p␊ 54 | ␊ 55 | ␊ 56 | div␊ 57 | ␊ 58 | div` 59 | 60 | ## default formatter is inline 61 | 62 | > ```html 63 | > Lorem ipsum dolor met 64 | > ``` 65 | 66 | 'Lorem ipsum dolor met' 67 | 68 | ## headings 69 | 70 | > ```html 71 | >

p

h1

h2

h3

h4

h5
h6

p

72 | > ``` 73 | 74 | `p␊ 75 | ␊ 76 | # h1␊ 77 | ␊ 78 | ## h2␊ 79 | ␊ 80 | ### h3␊ 81 | ␊ 82 | #### h4␊ 83 | ␊ 84 | ##### h5␊ 85 | ␊ 86 | ###### h6␊ 87 | ␊ 88 | p` 89 | 90 | ## line breaks (HTML tags by default) 91 | 92 | > ```html 93 | > a
b

c


d 94 | > ``` 95 | 96 | 'a
b

c


d' 97 | 98 | ## line breaks (two spaces) 99 | 100 | > ```html 101 | > a
b

c


d 102 | > ``` 103 | 104 | `a ␊ 105 | b ␊ 106 | ␊ 107 | c ␊ 108 | ␊ 109 | ␊ 110 | d` 111 | 112 | ## line breaks (backslash) 113 | 114 | > ```html 115 | > a
b

c


d 116 | > ``` 117 | 118 | `a\\␊ 119 | b\\␊ 120 | \\␊ 121 | c\\␊ 122 | \\␊ 123 | \\␊ 124 | d` 125 | 126 | ## horizontal lines (default) 127 | 128 | > ```html 129 | > a
b 130 | >
131 | > c 132 | > ``` 133 | 134 | `a␊ 135 | ␊ 136 | ----␊ 137 | ␊ 138 | b␊ 139 | ␊ 140 | ----␊ 141 | ␊ 142 | c` 143 | 144 | ## horizontal lines (custom) 145 | 146 | > ```html 147 | > a
b 148 | >
149 | > c 150 | > ``` 151 | 152 | `a␊ 153 | ␊ 154 | * * *␊ 155 | ␊ 156 | b␊ 157 | ␊ 158 | * * *␊ 159 | ␊ 160 | c` 161 | 162 | ## pre 163 | 164 | > ```html 165 | >

Code fragment:

  body {
166 | >     color: red;
167 | >   }
168 | > ``` 169 | 170 | `Code fragment:␊ 171 | ␊ 172 | body {␊ 173 | color: red;␊ 174 | }` 175 | 176 | ## blockquote 177 | 178 | > ```html 179 | > foo
quote
bar 180 | > ``` 181 | 182 | `foo␊ 183 | ␊ 184 | > quote␊ 185 | ␊ 186 | bar` 187 | 188 | ## img 189 | 190 | > ```html 191 | > alt text 192 | > ``` 193 | 194 | '![alt text](test.png "title")' 195 | 196 | ## img with rewritten path 197 | 198 | > ```html 199 | > 200 | > ``` 201 | 202 | '![](assets/test.png)' 203 | 204 | ## img with source encoded as data url 205 | 206 | > ```html 207 | > Red dot 208 | > ``` 209 | 210 | 'Red dot' 211 | 212 | ## link 213 | 214 | > ```html 215 | > test 216 | > ``` 217 | 218 | '[test](/test.html)' 219 | 220 | ## email link 221 | 222 | > ```html 223 | > mail me 224 | > ``` 225 | 226 | '[mail me](mailto:foo@example.com)' 227 | 228 | ## anchor link 229 | 230 | > ```html 231 | > test 232 | > ``` 233 | 234 | '[test](#anchor)' 235 | 236 | ## link with title 237 | 238 | > ```html 239 | > test 240 | > ``` 241 | 242 | '[test](/test.html "Click me")' 243 | 244 | ## link with rewritten path and baseUrl 245 | 246 | > ```html 247 | > test 248 | > ``` 249 | 250 | '[test](https://example.com/foo/bar/test.html)' 251 | 252 | ## named anchor 253 | 254 | > ```html 255 | > 256 | > ``` 257 | 258 | '' 259 | 260 | ## bold, strong 261 | 262 | > ```html 263 | > bold, strong 264 | > ``` 265 | 266 | '**bold**, **strong**' 267 | 268 | ## italic, emphasis 269 | 270 | > ```html 271 | > italic, emphasis 272 | > ``` 273 | 274 | '*italic*, *emphasis*' 275 | 276 | ## strikethrough, del 277 | 278 | > ```html 279 | > strikethrough, deleted 280 | > ``` 281 | 282 | '~~strikethrough~~, ~~deleted~~' 283 | 284 | ## inline code 285 | 286 | > ```html 287 | > Lorem ipsum code dolor sit 288 | > ``` 289 | 290 | 'Lorem ipsum `code` dolor sit' 291 | 292 | ## sub, sup 293 | 294 | > ```html 295 | > x2, x2 296 | > ``` 297 | 298 | 'x2, x2' 299 | 300 | ## kbd 301 | 302 | > ```html 303 | > Ctrl + C 304 | > ``` 305 | 306 | 'Ctrl + C' 307 | 308 | ## figure 309 | 310 | > ```html 311 | > 312 | >
313 | > alt="Alt test"> 315 | >
Caption
316 | >
317 | > ``` 318 | 319 | `![Alt test](/media/image.jpg)␊ 320 | ␊ 321 | Caption` 322 | 323 | ## picture - ignore sources 324 | 325 | > ```html 326 | > 327 | > 328 | > media="(min-width: 800px)"> 330 | > Alt text 331 | > 332 | > ``` 333 | 334 | '![Alt text](/media/cc0-images/painted-hand-298-332.jpg)' 335 | 336 | ## definition lists 337 | 338 | > ```html 339 | > 340 | >
341 | >
Title 1
342 | >
Definition 1
343 | >
Title 2a
344 | >
Title 2b
345 | >
Definition 2a
346 | >
Definition 2b
347 | >
348 | > ``` 349 | 350 | `Title 1␊ 351 | : Definition 1␊ 352 | ␊ 353 | Title 2a␊ 354 | Title 2b␊ 355 | : Definition 2a␊ 356 | : Definition 2b` 357 | 358 | ## definition list with divs 359 | 360 | > ```html 361 | > 362 | >
363 | >
364 | >
Title 1
365 | >
Definition 1
366 | >
367 | >
368 | >
Title 2a
369 | >
Title 2b
370 | >
Definition 2a
371 | >
Definition 2b
372 | >
373 | >
374 | > ``` 375 | 376 | `Title 1␊ 377 | : Definition 1␊ 378 | ␊ 379 | Title 2a␊ 380 | Title 2b␊ 381 | : Definition 2a␊ 382 | : Definition 2b` 383 | 384 | ## definition lists (compatible syntax) 385 | 386 | > ```html 387 | > 388 | >
389 | >
Title 1
390 | >
Definition 1
391 | >
Title 2a
392 | >
Title 2b
393 | >
Definition 2a
394 | >
Definition 2b
395 | >
396 | > ``` 397 | 398 | `**Title 1**␊ 399 | ␊ 400 | - Definition 1␊ 401 | ␊ 402 | **Title 2a**␊ 403 | ␊ 404 | **Title 2b**␊ 405 | ␊ 406 | - Definition 2a␊ 407 | - Definition 2b` 408 | 409 | ## unordered list 410 | 411 | > ```html 412 | > 413 | >
    414 | >
  • Item 1
  • 415 | >
  • Item 2
  • 416 | >
  • Item 3
  • 417 | >
418 | > ``` 419 | 420 | `- Item 1␊ 421 | - Item 2␊ 422 | - Item 3` 423 | 424 | ## ordered list 425 | 426 | > ```html 427 | > 428 | >
    429 | >
  1. Item 1
  2. 430 | >
  3. Item 2
  4. 431 | >
  5. Item 3
  6. 432 | >
433 | > ``` 434 | 435 | `1. Item 1␊ 436 | 2. Item 2␊ 437 | 3. Item 3` 438 | 439 | ## ordered list with start number (numbering type is ignored) 440 | 441 | > ```html 442 | > 443 | >
    444 | >
  1. Item 1
  2. 445 | >
  3. Item 2
  4. 446 | >
  5. Item 3
  6. 447 | >
448 | > ``` 449 | 450 | `11. Item 1␊ 451 | 12. Item 2␊ 452 | 13. Item 3` 453 | 454 | ## ordered list with overridden start number 455 | 456 | > ```html 457 | > 458 | >
    459 | >
  1. Item 1
  2. 460 | >
  3. Item 2
  4. 461 | >
  5. Item 3
  6. 462 | >
463 | > ``` 464 | 465 | `22. Item 1␊ 466 | 23. Item 2␊ 467 | 24. Item 3` 468 | 469 | ## table with header cells in the first row 470 | 471 | > ```html 472 | > 473 | > 474 | > 475 | > 476 | > 477 | > 478 | >
abc
def
g
g
h

h
i


i

j

k

k

l
479 | > ``` 480 | 481 | `| a | b | c␊ 482 | | --- | --- | ---␊ 483 | | d | e | f␊ 484 | | g
g | h

h | i


i␊ 485 | | j | k
k | l` 486 | 487 | ## table with thead, tbody, tfoot 488 | 489 | > ```html 490 | > 491 | > 492 | > 493 | > 494 | > 495 | >
abc
def
ghi
496 | > ``` 497 | 498 | `| a | b | c␊ 499 | | --- | --- | ---␊ 500 | | d | e | f␊ 501 | | g | h | i` 502 | 503 | ## table without a header 504 | 505 | > ```html 506 | > 507 | > 508 | > 509 | > 510 | > 511 | >
abc
def
ghi
512 | > ``` 513 | 514 | `| | | ␊ 515 | | -------- | -------- | --------␊ 516 | | a | b | c␊ 517 | | d | e | f␊ 518 | | g | h | i` 519 | 520 | ## table with colspans and rowspans (repeat value by default) 521 | 522 | > ```html 523 | > 524 | > 525 | > 526 | > 527 | > 528 | > 529 | > 530 | >
ac
de
g
kl
m
531 | > ``` 532 | 533 | `| a | a | c␊ 534 | | --- | --- | ---␊ 535 | | d | e | e␊ 536 | | g | e | e␊ 537 | | g | k | l␊ 538 | | m | | l` 539 | 540 | ## table with colspans and rowspans (value in first cell only) 541 | 542 | > ```html 543 | > 544 | > 545 | > 546 | > 547 | > 548 | > 549 | > 550 | >
ac
de
g
kl
m
551 | > ``` 552 | 553 | `| a | | c␊ 554 | | --- | --- | ---␊ 555 | | d | e |␊ 556 | | g | |␊ 557 | | | k | l␊ 558 | | m | |` 559 | 560 | ## table with colspans and rowspans (value repeated in cells of the first row only) 561 | 562 | > ```html 563 | > 564 | > 565 | > 566 | > 567 | > 568 | > 569 | > 570 | >
ac
de
g
kl
m
571 | > ``` 572 | 573 | `| a | a | c␊ 574 | | --- | --- | ---␊ 575 | | d | e | e␊ 576 | | g | |␊ 577 | | | k | l␊ 578 | | m | |` 579 | 580 | ## table with colspans and rowspans (value repeated in cells of the first column only) 581 | 582 | > ```html 583 | > 584 | > 585 | > 586 | > 587 | > 588 | > 589 | > 590 | >
ac
de
g
kl
m
591 | > ``` 592 | 593 | `| a | | c␊ 594 | | --- | --- | ---␊ 595 | | d | e |␊ 596 | | g | e |␊ 597 | | g | k | l␊ 598 | | m | | l` 599 | 600 | ## table with colspans and rowspans (use HTML tag for spanned cells) 601 | 602 | > ```html 603 | > 604 | > 605 | > 606 | > 607 | > 608 | > 609 | >
abc
de
g
j
610 | > ``` 611 | 612 | `| | | | ␊ 613 | | -------- | -------- | -------- | --------␊ 614 | | a b c␊ 615 | | d e␊ 616 | | g␊ 617 | | j` 618 | 619 | ## table with colspans and rowspans (fallback to HTML from "tag" mode) 620 | 621 | > ```html 622 | > 623 | > 624 | > 625 | > 626 | > 627 | > 628 | > 629 | >
ac
de
g
kl
m
630 | > ``` 631 | 632 | `␊ 633 | ␊ 634 | ␊ 635 | ␊ 636 | ␊ 637 | ␊ 638 |
ac
de
g
kl
m
` 639 | -------------------------------------------------------------------------------- /packages/html-to-md/src/md-formatters.js: -------------------------------------------------------------------------------- 1 | 2 | import { get, trimCharacter, trimCharacterEnd } from '@html-to-text/base/src/util'; 3 | import { render } from 'dom-serializer'; 4 | import { existsOne, innerText } from 'domutils'; 5 | 6 | import { tableToString } from './table-printer'; 7 | 8 | // eslint-disable-next-line import/no-unassigned-import 9 | import '@html-to-text/base/src/typedefs'; 10 | 11 | 12 | /** 13 | * Process a `wbr` tag (word break opportunity): registers a word break opportunity with the builder. 14 | * 15 | * @type { FormatCallback } 16 | */ 17 | function formatWbr (elem, walk, builder, formatOptions) { 18 | builder.addWordBreakOpportunity(); 19 | } 20 | 21 | /** 22 | * Process preformatted content: a `isPre` block in which every output line is prefixed with an indent string when the block is closed. 23 | * 24 | * @type { FormatCallback } 25 | */ 26 | function formatPre (elem, walk, builder, formatOptions) { 27 | builder.openBlock({ 28 | isPre: true, 29 | leadingLineBreaks: formatOptions.leadingLineBreaks || 2, 30 | reservedLineLength: 2 31 | }); 32 | walk(elem.children, builder); 33 | builder.closeBlock({ 34 | trailingLineBreaks: formatOptions.trailingLineBreaks || 2, 35 | blockTransform: str => str 36 | .split('\n') 37 | .map(line => ' ' + line) 38 | .join('\n') 39 | }); 40 | } 41 | 42 | /** 43 | * Process a heading: a block that starts with `formatOptions.level` (default 1) `#` characters and a space, followed by the children. 44 | * 45 | * @type { FormatCallback } 46 | */ 47 | function formatHeading (elem, walk, builder, formatOptions) { 48 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 49 | builder.addLiteral('#'.repeat(formatOptions.level || 1) + ' '); 50 | walk(elem.children, builder); 51 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 52 | } 53 | 54 | /** 55 | * Process a blockquote.
56 | * Every output line is prefixed with `> `; leading and trailing newlines are trimmed first unless `formatOptions.trimEmptyLines` is `false`. 57 | * @type { FormatCallback } 58 | */ 59 | function formatBlockquote (elem, walk, builder, formatOptions) { 60 | builder.openBlock({ 61 | leadingLineBreaks: formatOptions.leadingLineBreaks || 2, 62 | reservedLineLength: 2 63 | }); 64 | walk(elem.children, builder); 65 | builder.closeBlock({ 66 | trailingLineBreaks: formatOptions.trailingLineBreaks || 2, 67 | blockTransform: str => ((formatOptions.trimEmptyLines !== false) ? trimCharacter(str, '\n') : str) 68 | .split('\n') 69 | .map(line => '> ' + line) 70 | .join('\n') 71 | }); 72 | } 73 | 74 | /** 75 | * Render code block: the children wrapped in a triple-backtick fence, with `formatOptions.language` (if any) appended to the opening fence. 76 | * 77 | * @type { FormatCallback } 78 | */ 79 | function formatCodeBlock (elem, walk, builder, formatOptions) { 80 | builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 }); 81 | builder.addLiteral('```' + (formatOptions.language || '') + '\n'); 82 | walk(elem.children, builder); 83 | builder.addLiteral('\n```'); 84 | builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 }); 85 | } 86 | 87 | /** Apply the optional `rewriter(path, metadata, elem)` callback, then prepend `baseUrl` (with a trailing `/` trimmed) when the resulting path starts with `/` and `baseUrl` is set. */ function pathRewrite (path, rewriter, baseUrl, metadata, elem) { 88 | const modifiedPath = (typeof rewriter === 'function') 89 | ? rewriter(path, metadata, elem) 90 | : path; 91 | return (modifiedPath[0] === '/' && baseUrl) 92 | ? trimCharacterEnd(baseUrl, '/') + modifiedPath 93 | : modifiedPath; 94 | } 95 | 96 | /** 97 | * Process an image: `data:` sources are emitted as the serialized HTML element, everything else as a markdown image with optional title. 98 | * 99 | * @type { FormatCallback } 100 | */ 101 | function formatImage (elem, walk, builder, formatOptions) { 102 | const attribs = elem.attribs || {}; 103 | if (attribs.src && attribs.src.startsWith('data:')) { 104 | builder.startNoWrap(); 105 | builder.addInline( 106 | render(elem, { decodeEntities: builder.options.decodeEntities }), 107 | { noWordTransform: true } 108 | ); 109 | builder.stopNoWrap(); 110 | return; 111 | } 112 | const src = (!attribs.src) 113 | ? 
'' 114 | : pathRewrite(attribs.src, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem); 115 | builder.startNoWrap(); 116 | builder.addLiteral(`![`); 117 | builder.addInline(attribs.alt || ''); 118 | builder.addLiteral(`](`); 119 | builder.addInline(src, { noWordTransform: true }); 120 | if (attribs.title) { 121 | builder.addLiteral(` "`); 122 | builder.addInline(attribs.title); 123 | builder.addLiteral(`"`); 124 | } 125 | builder.addLiteral(`)`); 126 | builder.stopNoWrap(); 127 | } 128 | 129 | /** 130 | * Process a link/anchor. A named anchor without `href` is emitted as the serialized HTML element; a link whose text equals its href becomes `<href>`; anything else becomes a markdown link with optional title. 131 | * 132 | * @type { FormatCallback } 133 | */ 134 | function formatAnchor (elem, walk, builder, formatOptions) { 135 | const attribs = elem.attribs || {}; 136 | if (attribs.name && !attribs.href) { 137 | builder.startNoWrap(); 138 | builder.addInline( 139 | render(elem, { decodeEntities: builder.options.decodeEntities }), 140 | { noWordTransform: true } 141 | ); 142 | builder.stopNoWrap(); 143 | return; 144 | } 145 | const href = (!attribs.href) 146 | ? '' 147 | : pathRewrite(attribs.href, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem); 148 | const text = innerText(elem); 149 | builder.startNoWrap(); 150 | /* non-empty text identical to the href: use the angle-bracket form instead of repeating the URL */ if (href === text && text.length) { 151 | builder.addInline(`<${href}>`, { noWordTransform: true }); 152 | } else { 153 | builder.addLiteral(`[`); 154 | walk(elem.children, builder); 155 | builder.addLiteral(`](`); 156 | builder.addInline(href, { noWordTransform: true }); 157 | if (attribs.title) { 158 | builder.addLiteral(` "`); 159 | builder.addInline(attribs.title); 160 | builder.addLiteral(`"`); 161 | } 162 | builder.addLiteral(`)`); 163 | } 164 | builder.stopNoWrap(); 165 | } 166 | 167 | /** 168 | * @param { DomNode } elem List element whose children are the list items. 169 | * @param { RecursiveCallback } walk Recursive callback to process child nodes. 170 | * @param { BlockTextBuilder } builder Passed around to accumulate output text.
171 | * @param { FormatOptions } formatOptions Options specific to a formatter. 172 | * @param { () => string } nextPrefixCallback Function that returns the prefix for the next `li` item each time it is called. 173 | */ 174 | function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) { 175 | const isNestedList = get(elem, ['parent', 'name']) === 'li'; 176 | 177 | // Prefixes can differ in length (for example `9. ` and `10. `), 178 | // so the dumb length comparison is the most robust way to get the correct value. 179 | let maxPrefixLength = 0; 180 | const listItems = (elem.children || []) 181 | // it might be more accurate to check only for html spaces here, but no significant benefit 182 | .filter(child => child.type !== 'text' || !/^\s*$/.test(child.data)) 183 | .map(function (child) { 184 | /* non-`li` children are kept as items but get no prefix and do not consume an index */ if (child.name !== 'li') { 185 | return { node: child, prefix: '' }; 186 | } 187 | const prefix = (isNestedList) 188 | ? nextPrefixCallback().trimStart() 189 | : nextPrefixCallback(); 190 | if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; } 191 | return { node: child, prefix: prefix }; 192 | }); 193 | if (!listItems.length) { return; } 194 | 195 | builder.openList({ 196 | interRowLineBreaks: formatOptions.interRowLineBreaks || 1, 197 | leadingLineBreaks: isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2), 198 | maxPrefixLength: maxPrefixLength, 199 | prefixAlign: 'left' 200 | }); 201 | 202 | for (const { node, prefix } of listItems) { 203 | builder.openListItem({ prefix: prefix }); 204 | walk([node], builder); 205 | builder.closeListItem(); 206 | } 207 | 208 | builder.closeList({ trailingLineBreaks: isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2) }); 209 | } 210 | 211 | /** 212 | * Process an unordered list.
/**
 * Process an unordered list.
 *
 * Every item gets the same prefix: the `marker` option (default `-`)
 * followed by a space.
 *
 * @type { FormatCallback }
 */
function formatUnorderedList (elem, walk, builder, formatOptions) {
  const prefix = (formatOptions.marker || '-') + ' '; // can be any of [-*+]
  return formatList(elem, walk, builder, formatOptions, () => prefix);
}

/**
 * Process an ordered list.
 *
 * Numbering starts from the `start` formatter option, then the `start`
 * attribute of the element, then 1. A start value that is not a finite
 * number falls back to 1 instead of producing `NaN. ` prefixes.
 *
 * @type { FormatCallback }
 */
function formatOrderedList (elem, walk, builder, formatOptions) {
  // Same guard as in the anchor formatter: tolerate nodes without attributes.
  const attribs = elem.attribs || {};
  const requestedStart = Number(formatOptions.start || attribs.start || '1');
  let nextIndex = Number.isFinite(requestedStart) ? requestedStart : 1;
  const nextPrefixCallback = () => `${nextIndex++}. `;
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}

/**
 * Split the contents of a definition list into groups of titles and definitions.
 *
 * `dt`/`dd` elements are taken from the direct children of the list and from
 * direct children of any `div` child. Consecutive `dt`s share one group;
 * a `dt` that follows a `dd` starts a new group.
 * A `dd` that appears before any `dt` is kept in a group with no titles
 * (previously this case threw a TypeError).
 *
 * @param { DomNode } elem The definition list element.
 * @returns { { titleItems: DomNode[], definitions: DomNode[] }[] } Groups in document order.
 */
function collectDefinitionGroups (elem) {
  const defItems = [];
  const collectIfDefinitionPart = (el) => {
    if (el.name === 'dt' || el.name === 'dd') {
      defItems.push(el);
    }
  };
  for (const child of (elem.children || [])) {
    if (child.name === 'div') {
      // `div` is allowed as a wrapper around dt/dd groups - look one level inside.
      (child.children || []).forEach((el) => collectIfDefinitionPart(el));
    } else {
      collectIfDefinitionPart(child);
    }
  }

  const groups = [];
  let group = null;
  for (const item of defItems) {
    if (item.name === 'dt') {
      if (group && group.definitions.length === 0) {
        // Still collecting titles for the current group.
        group.titleItems.push(item);
      } else {
        group = { titleItems: [item], definitions: [] };
        groups.push(group);
      }
    } else { // dd
      if (!group) {
        // Definition without a preceding title (invalid, but seen in real markup).
        group = { titleItems: [], definitions: [] };
        groups.push(group);
      }
      group.definitions.push(item);
    }
  }
  return groups;
}
/**
 * Render a definition list in a form supported by some markdown systems
 * (each definition starts with ": ").
 *
 * Every title/definition group becomes its own list: titles are items
 * with an empty prefix, definitions are items prefixed with `: `.
 *
 * @type { FormatCallback }
 */
function formatDefinitionList (elem, walk, builder, formatOptions) {
  const writeItem = (node, prefix) => {
    builder.openListItem({ prefix: prefix });
    walk([node], builder);
    builder.closeListItem();
  };

  collectDefinitionGroups(elem).forEach(({ titleItems, definitions }) => {
    builder.openList({
      interRowLineBreaks: 1,
      leadingLineBreaks: formatOptions.leadingLineBreaks || 2,
      maxPrefixLength: 0,
      prefixAlign: 'left'
    });
    titleItems.forEach((title) => writeItem(title, ''));
    definitions.forEach((definition) => writeItem(definition, ': '));
    builder.closeList({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
  });
}

/**
 * Render a definition list in a compatible form
 * (substitute with bold titles and regular lists).
 *
 * Each group is a block containing one bold block per title,
 * followed by a list of the definitions using the `marker` option
 * (default `-`) as the bullet.
 *
 * @type { FormatCallback }
 */
function formatDefinitionListCompatible (elem, walk, builder, formatOptions) {
  const bullet = (formatOptions.marker || '-') + ' '; // can be any of [-*+]

  const writeTitle = (titleNode) => {
    builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
    builder.addLiteral('**');
    walk(titleNode.children, builder);
    builder.addLiteral('**');
    builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
  };

  const writeDefinitions = (definitionNodes) => {
    builder.openList({
      interRowLineBreaks: formatOptions.interRowLineBreaks || 1,
      leadingLineBreaks: formatOptions.leadingLineBreaks || 2,
      maxPrefixLength: bullet.length
    });
    definitionNodes.forEach((definitionNode) => {
      builder.openListItem({ prefix: bullet });
      walk([definitionNode], builder);
      builder.closeListItem();
    });
    builder.closeList({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
  };

  collectDefinitionGroups(elem).forEach((group) => {
    builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
    group.titleItems.forEach(writeTitle);
    writeDefinitions(group.definitions);
    builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
  });
}

/**
 * Process a data table.
 *
 * Walks `thead`/`tbody`/`tfoot`/`center` wrappers and `tr` rows, feeding
 * `th`/`td` cells (with their colspan/rowspan) into the builder.
 * When the table printer returns nothing for the collected rows,
 * the serialized HTML of the table is used instead.
 *
 * @type { FormatCallback }
 */
function formatDataTable (elem, walk, builder, formatOptions) {
  const wrapperTags = ['thead', 'tbody', 'tfoot', 'center'];

  const writeCell = (cellNode) => {
    // Missing or non-numeric span attributes count as 1.
    const colspan = +get(cellNode, ['attribs', 'colspan']) || 1;
    const rowspan = +get(cellNode, ['attribs', 'rowspan']) || 1;
    builder.openTableCell({ maxColumnWidth: formatOptions.maxColumnWidth });
    walk(cellNode.children, builder);
    builder.closeTableCell({ colspan: colspan, rowspan: rowspan });
  };

  const writeRow = (rowNode) => {
    builder.openTableRow();
    for (const cellNode of rowNode.children) {
      const isCell = cellNode.type === 'tag'
        && (cellNode.name === 'th' || cellNode.name === 'td');
      if (isCell) { writeCell(cellNode); }
    }
    builder.closeTableRow();
  };

  const visit = (node) => {
    if (node.type !== 'tag') { return; }
    if (wrapperTags.includes(node.name)) {
      for (const inner of node.children) { visit(inner); }
    } else if (node.name === 'tr') {
      writeRow(node);
    }
    // anything else inside a table is ignored
  };

  builder.openTable();
  for (const child of elem.children) { visit(child); }

  const hasHeader = existsOne(
    (el) => el.name === 'thead' || el.name === 'th',
    elem.children
  );
  const printTable = (rows) => (
    tableToString(rows, hasHeader, formatOptions.spanMode || 'repeat') || render(elem)
  );
  builder.closeTable({
    tableToString: printTable,
    leadingLineBreaks: formatOptions.leadingLineBreaks,
    trailingLineBreaks: formatOptions.trailingLineBreaks,
  });
}
| } 391 | } 392 | } 393 | 394 | 395 | export { 396 | formatAnchor as anchor, 397 | formatBlockquote as blockquote, 398 | formatCodeBlock as codeBlock, 399 | formatDataTable as dataTable, 400 | formatDefinitionList as definitionList, 401 | formatDefinitionListCompatible as definitionListCompatible, 402 | formatHeading as heading, 403 | formatImage as image, 404 | formatOrderedList as orderedList, 405 | formatPre as pre, 406 | formatUnorderedList as unorderedList, 407 | formatWbr as wbr, 408 | }; 409 | -------------------------------------------------------------------------------- /packages/base/src/block-text-builder.js: -------------------------------------------------------------------------------- 1 | 2 | import { 3 | // eslint-disable-next-line no-unused-vars 4 | StackItem, BlockStackItem, 5 | TableCellStackItem, TableRowStackItem, TableStackItem, 6 | TransformerStackItem, ListStackItem, ListItemStackItem 7 | } from './stack-item'; 8 | import { trimCharacter } from './util'; 9 | import { WhitespaceProcessor } from './whitespace-processor'; 10 | 11 | // eslint-disable-next-line import/no-unassigned-import 12 | import './typedefs'; 13 | 14 | 15 | /** 16 | * Helps to build text from inline and block elements. 17 | * 18 | * @class BlockTextBuilder 19 | */ 20 | class BlockTextBuilder { 21 | 22 | /** 23 | * Creates an instance of BlockTextBuilder. 24 | * 25 | * @param { Options } options HtmlToText options. 26 | * @param { import('selderee').Picker } picker Selectors decision tree picker. 27 | * @param { any} [metadata] Optional metadata for HTML document, for use in formatters. 
28 | */ 29 | constructor (options, picker, metadata = undefined) { 30 | this.options = options; 31 | this.picker = picker; 32 | this.metadata = metadata; 33 | this.whitespaceProcessor = new WhitespaceProcessor(options); 34 | /** @type { StackItem } */ 35 | this._stackItem = new BlockStackItem(options); 36 | /** @type { TransformerStackItem } */ 37 | this._wordTransformer = undefined; 38 | } 39 | 40 | /** 41 | * Put a word-by-word transform function onto the transformations stack. 42 | * 43 | * Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs. 44 | * 45 | * Word transformations applied before wrapping. 46 | * 47 | * @param { (str: string) => string } wordTransform Word transformation function. 48 | */ 49 | pushWordTransform (wordTransform) { 50 | this._wordTransformer = new TransformerStackItem(this._wordTransformer, wordTransform); 51 | } 52 | 53 | /** 54 | * Remove a function from the word transformations stack. 55 | * 56 | * @returns { (str: string) => string } A function that was removed. 57 | */ 58 | popWordTransform () { 59 | if (!this._wordTransformer) { return undefined; } 60 | const transform = this._wordTransformer.transform; 61 | this._wordTransformer = this._wordTransformer.next; 62 | return transform; 63 | } 64 | 65 | /** 66 | * Ignore wordwrap option in followup inline additions and disable automatic wrapping. 67 | */ 68 | startNoWrap () { 69 | this._stackItem.isNoWrap = true; 70 | } 71 | 72 | /** 73 | * Return automatic wrapping to behavior defined by options. 74 | */ 75 | stopNoWrap () { 76 | this._stackItem.isNoWrap = false; 77 | } 78 | 79 | /** @returns { (str: string) => string } */ 80 | _getCombinedWordTransformer () { 81 | const wt = (this._wordTransformer) 82 | ? ((str) => applyTransformer(str, this._wordTransformer)) 83 | : undefined; 84 | const ce = this.options.encodeCharacters; 85 | return (wt) 86 | ? ((ce) ? 
(str) => ce(wt(str)) : wt) 87 | : ce; 88 | } 89 | 90 | _popStackItem () { 91 | const item = this._stackItem; 92 | this._stackItem = item.next; 93 | return item; 94 | } 95 | 96 | /** 97 | * Add a line break into currently built block. 98 | */ 99 | addLineBreak () { 100 | if (!( 101 | this._stackItem instanceof BlockStackItem 102 | || this._stackItem instanceof ListItemStackItem 103 | || this._stackItem instanceof TableCellStackItem 104 | )) { return; } 105 | if (this._stackItem.isPre) { 106 | this._stackItem.rawText += '\n'; 107 | } else { 108 | this._stackItem.inlineTextBuilder.startNewLine(); 109 | } 110 | } 111 | 112 | /** 113 | * Allow to break line in case directly following text will not fit. 114 | */ 115 | addWordBreakOpportunity () { 116 | if ( 117 | this._stackItem instanceof BlockStackItem 118 | || this._stackItem instanceof ListItemStackItem 119 | || this._stackItem instanceof TableCellStackItem 120 | ) { 121 | this._stackItem.inlineTextBuilder.wordBreakOpportunity = true; 122 | } 123 | } 124 | 125 | /** 126 | * Add a node inline into the currently built block. 127 | * 128 | * @param { string } str 129 | * Text content of a node to add. 130 | * 131 | * @param { object } [param1] 132 | * Object holding the parameters of the operation. 133 | * 134 | * @param { boolean } [param1.noWordTransform] 135 | * Ignore word transformers if there are any. 136 | * Don't encode characters as well. 137 | * (Use this for things like URL addresses). 
138 | */ 139 | addInline (str, { noWordTransform = false } = {}) { 140 | if (!( 141 | this._stackItem instanceof BlockStackItem 142 | || this._stackItem instanceof ListItemStackItem 143 | || this._stackItem instanceof TableCellStackItem 144 | )) { return; } 145 | 146 | if (this._stackItem.isPre) { 147 | this._stackItem.rawText += str; 148 | return; 149 | } 150 | 151 | if ( 152 | str.length === 0 || // empty string 153 | ( 154 | this._stackItem.stashedLineBreaks && // stashed linebreaks make whitespace irrelevant 155 | !this.whitespaceProcessor.testContainsWords(str) // no words to add 156 | ) 157 | ) { return; } 158 | 159 | if (this.options.preserveNewlines) { 160 | const newlinesNumber = this.whitespaceProcessor.countNewlinesNoWords(str); 161 | if (newlinesNumber > 0) { 162 | this._stackItem.inlineTextBuilder.startNewLine(newlinesNumber); 163 | // keep stashedLineBreaks unchanged 164 | return; 165 | } 166 | } 167 | 168 | if (this._stackItem.stashedLineBreaks) { 169 | this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks); 170 | } 171 | this.whitespaceProcessor.shrinkWrapAdd( 172 | str, 173 | this._stackItem.inlineTextBuilder, 174 | (noWordTransform) ? undefined : this._getCombinedWordTransformer(), 175 | this._stackItem.isNoWrap 176 | ); 177 | this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks 178 | } 179 | 180 | /** 181 | * Add a string inline into the currently built block. 182 | * 183 | * Use this for markup elements that don't have to adhere 184 | * to text layout rules. 185 | * 186 | * @param { string } str Text to add. 
187 | */ 188 | addLiteral (str) { 189 | if (!( 190 | this._stackItem instanceof BlockStackItem 191 | || this._stackItem instanceof ListItemStackItem 192 | || this._stackItem instanceof TableCellStackItem 193 | )) { return; } 194 | 195 | if (str.length === 0) { return; } 196 | 197 | if (this._stackItem.isPre) { 198 | this._stackItem.rawText += str; 199 | return; 200 | } 201 | 202 | if (this._stackItem.stashedLineBreaks) { 203 | this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks); 204 | } 205 | this.whitespaceProcessor.addLiteral( 206 | str, 207 | this._stackItem.inlineTextBuilder, 208 | this._stackItem.isNoWrap 209 | ); 210 | this._stackItem.stashedLineBreaks = 0; 211 | } 212 | 213 | /** 214 | * Start building a new block. 215 | * 216 | * @param { object } [param0] 217 | * Object holding the parameters of the block. 218 | * 219 | * @param { number } [param0.leadingLineBreaks] 220 | * This block should have at least this number of line breaks to separate it from any preceding block. 221 | * 222 | * @param { number } [param0.reservedLineLength] 223 | * Reserve this number of characters on each line for block markup. 224 | * 225 | * @param { boolean } [param0.isPre] 226 | * Should HTML whitespace be preserved inside this block. 227 | */ 228 | openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) { 229 | const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength); 230 | this._stackItem = new BlockStackItem( 231 | this.options, 232 | this._stackItem, 233 | leadingLineBreaks, 234 | maxLineLength 235 | ); 236 | if (isPre) { this._stackItem.isPre = true; } 237 | } 238 | 239 | /** 240 | * Finalize currently built block, add it's content to the parent block. 241 | * 242 | * @param { object } [param0] 243 | * Object holding the parameters of the block. 
244 | * 245 | * @param { number } [param0.trailingLineBreaks] 246 | * This block should have at least this number of line breaks to separate it from any following block. 247 | * 248 | * @param { (str: string) => string } [param0.blockTransform] 249 | * A function to transform the block text before adding to the parent block. 250 | * This happens after word wrap and should be used in combination with reserved line length 251 | * in order to keep line lengths correct. 252 | * Used for whole block markup. 253 | */ 254 | closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) { 255 | const block = this._popStackItem(); 256 | const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block); 257 | addText(this._stackItem, blockText, block.leadingLineBreaks, Math.max(block.stashedLineBreaks, trailingLineBreaks)); 258 | } 259 | 260 | /** 261 | * Start building a new list. 262 | * 263 | * @param { object } [param0] 264 | * Object holding the parameters of the list. 265 | * 266 | * @param { number } [param0.maxPrefixLength] 267 | * Length of the longest list item prefix. 268 | * If not supplied or too small then list items won't be aligned properly. 269 | * 270 | * @param { 'left' | 'right' } [param0.prefixAlign] 271 | * Specify how prefixes of different lengths have to be aligned 272 | * within a column. 273 | * 274 | * @param { number } [param0.interRowLineBreaks] 275 | * Minimum number of line breaks between list items. 276 | * 277 | * @param { number } [param0.leadingLineBreaks] 278 | * This list should have at least this number of line breaks to separate it from any preceding block. 
279 | */ 280 | openList ({ maxPrefixLength = 0, prefixAlign = 'left', interRowLineBreaks = 1, leadingLineBreaks = 2 } = {}) { 281 | this._stackItem = new ListStackItem(this.options, this._stackItem, { 282 | interRowLineBreaks: interRowLineBreaks, 283 | leadingLineBreaks: leadingLineBreaks, 284 | maxLineLength: this._stackItem.inlineTextBuilder.maxLineLength, 285 | maxPrefixLength: maxPrefixLength, 286 | prefixAlign: prefixAlign 287 | }); 288 | } 289 | 290 | /** 291 | * Start building a new list item. 292 | * 293 | * @param {object} param0 294 | * Object holding the parameters of the list item. 295 | * 296 | * @param { string } [param0.prefix] 297 | * Prefix for this list item (item number, bullet point, etc). 298 | */ 299 | openListItem ({ prefix = '' } = {}) { 300 | if (!(this._stackItem instanceof ListStackItem)) { 301 | throw new Error('Can\'t add a list item to something that is not a list! Check the formatter.'); 302 | } 303 | const list = this._stackItem; 304 | const prefixLength = Math.max(prefix.length, list.maxPrefixLength); 305 | const maxLineLength = Math.max(20, list.inlineTextBuilder.maxLineLength - prefixLength); 306 | this._stackItem = new ListItemStackItem(this.options, list, { 307 | prefix: prefix, 308 | maxLineLength: maxLineLength, 309 | leadingLineBreaks: list.interRowLineBreaks 310 | }); 311 | } 312 | 313 | /** 314 | * Finalize currently built list item, add it's content to the parent list. 315 | */ 316 | closeListItem () { 317 | const listItem = this._popStackItem(); 318 | const list = listItem.next; 319 | 320 | const prefixLength = Math.max(listItem.prefix.length, list.maxPrefixLength); 321 | const spacing = '\n' + ' '.repeat(prefixLength); 322 | const prefix = (list.prefixAlign === 'right') 323 | ? 
listItem.prefix.padStart(prefixLength) 324 | : listItem.prefix.padEnd(prefixLength); 325 | const text = prefix + getText(listItem).replace(/\n/g, spacing); 326 | 327 | addText( 328 | list, 329 | text, 330 | listItem.leadingLineBreaks, 331 | Math.max(listItem.stashedLineBreaks, list.interRowLineBreaks) 332 | ); 333 | } 334 | 335 | /** 336 | * Finalize currently built list, add it's content to the parent block. 337 | * 338 | * @param { object } param0 339 | * Object holding the parameters of the list. 340 | * 341 | * @param { number } [param0.trailingLineBreaks] 342 | * This list should have at least this number of line breaks to separate it from any following block. 343 | */ 344 | closeList ({ trailingLineBreaks = 2 } = {}) { 345 | const list = this._popStackItem(); 346 | const text = getText(list); 347 | if (text) { 348 | addText(this._stackItem, text, list.leadingLineBreaks, trailingLineBreaks); 349 | } 350 | } 351 | 352 | /** 353 | * Start building a table. 354 | */ 355 | openTable () { 356 | this._stackItem = new TableStackItem(this._stackItem); 357 | } 358 | 359 | /** 360 | * Start building a table row. 361 | */ 362 | openTableRow () { 363 | if (!(this._stackItem instanceof TableStackItem)) { 364 | throw new Error('Can\'t add a table row to something that is not a table! Check the formatter.'); 365 | } 366 | this._stackItem = new TableRowStackItem(this._stackItem); 367 | } 368 | 369 | /** 370 | * Start building a table cell. 371 | * 372 | * @param { object } [param0] 373 | * Object holding the parameters of the cell. 374 | * 375 | * @param { number } [param0.maxColumnWidth] 376 | * Wrap cell content to this width. Fall back to global wordwrap value if undefined. 377 | */ 378 | openTableCell ({ maxColumnWidth = undefined } = {}) { 379 | if (!(this._stackItem instanceof TableRowStackItem)) { 380 | throw new Error('Can\'t add a table cell to something that is not a table row! 
Check the formatter.'); 381 | } 382 | this._stackItem = new TableCellStackItem(this.options, this._stackItem, maxColumnWidth); 383 | } 384 | 385 | /** 386 | * Finalize currently built table cell and add it to parent table row's cells. 387 | * 388 | * @param { object } [param0] 389 | * Object holding the parameters of the cell. 390 | * 391 | * @param { number } [param0.colspan] How many columns this cell should occupy. 392 | * @param { number } [param0.rowspan] How many rows this cell should occupy. 393 | */ 394 | closeTableCell ({ colspan = 1, rowspan = 1 } = {}) { 395 | const cell = this._popStackItem(); 396 | const text = trimCharacter(getText(cell), '\n'); 397 | cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text }); 398 | } 399 | 400 | /** 401 | * Finalize currently built table row and add it to parent table's rows. 402 | */ 403 | closeTableRow () { 404 | const row = this._popStackItem(); 405 | row.next.rows.push(row.cells); 406 | } 407 | 408 | /** 409 | * Finalize currently built table and add the rendered text to the parent block. 410 | * 411 | * @param { object } param0 412 | * Object holding the parameters of the table. 413 | * 414 | * @param { TablePrinter } param0.tableToString 415 | * A function to convert a table of stringified cells into a complete table. 416 | * 417 | * @param { number } [param0.leadingLineBreaks] 418 | * This table should have at least this number of line breaks to separate if from any preceding block. 419 | * 420 | * @param { number } [param0.trailingLineBreaks] 421 | * This table should have at least this number of line breaks to separate it from any following block. 
422 | */ 423 | closeTable ({ tableToString, leadingLineBreaks = 2, trailingLineBreaks = 2 }) { 424 | const table = this._popStackItem(); 425 | const output = tableToString(table.rows); 426 | if (output) { 427 | addText(this._stackItem, output, leadingLineBreaks, trailingLineBreaks); 428 | } 429 | } 430 | 431 | /** 432 | * Return the rendered text content of this builder. 433 | * 434 | * @returns { string } 435 | */ 436 | toString () { 437 | return getText(this._stackItem.getRoot()); 438 | // There should only be the root item if everything is closed properly. 439 | } 440 | 441 | } 442 | 443 | function getText (stackItem) { 444 | if (!( 445 | stackItem instanceof BlockStackItem 446 | || stackItem instanceof ListItemStackItem 447 | || stackItem instanceof TableCellStackItem 448 | )) { 449 | throw new Error('Only blocks, list items and table cells can be requested for text contents.'); 450 | } 451 | return (stackItem.inlineTextBuilder.isEmpty()) 452 | ? stackItem.rawText 453 | : stackItem.rawText + stackItem.inlineTextBuilder.toString(); 454 | } 455 | 456 | function addText (stackItem, text, leadingLineBreaks, trailingLineBreaks) { 457 | if (!( 458 | stackItem instanceof BlockStackItem 459 | || stackItem instanceof ListItemStackItem 460 | || stackItem instanceof TableCellStackItem 461 | )) { 462 | throw new Error('Only blocks, list items and table cells can contain text.'); 463 | } 464 | const parentText = getText(stackItem); 465 | const lineBreaks = Math.max(stackItem.stashedLineBreaks, leadingLineBreaks); 466 | stackItem.inlineTextBuilder.clear(); 467 | if (parentText) { 468 | stackItem.rawText = parentText + '\n'.repeat(lineBreaks) + text; 469 | } else { 470 | stackItem.rawText = text; 471 | stackItem.leadingLineBreaks = lineBreaks; 472 | } 473 | stackItem.stashedLineBreaks = trailingLineBreaks; 474 | } 475 | 476 | /** 477 | * @param { string } str A string to transform. 
478 | * @param { TransformerStackItem } transformer A transformer item (with possible continuation). 479 | * @returns { string } 480 | */ 481 | function applyTransformer (str, transformer) { 482 | return ((transformer) ? applyTransformer(transformer.transform(str), transformer.next) : str); 483 | } 484 | 485 | export { BlockTextBuilder }; 486 | -------------------------------------------------------------------------------- /packages/html-to-text/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Version 9.0.5 4 | 5 | * `selderee` updated from 0.10.0 to 0.11.0 ([changelog](https://github.com/mxxii/selderee/blob/main/packages/selderee/CHANGELOG.md)) and `parseley` - from 0.11.0 to 0.12.0 ([changelog](https://github.com/mxxii/parseley/blob/main/CHANGELOG.md)) - support escape sequences in selectors according to specification, fixes [#273](https://github.com/html-to-text/node-html-to-text/issues/273); 6 | * `htmlparser2` updated from 8.0.1 to 8.0.2 ([release notes](https://github.com/fb55/htmlparser2/releases)) - this fixes broken parsing in certain situations: [#285](https://github.com/html-to-text/node-html-to-text/issues/285); 7 | * `deepmerge` updated from 4.3.0 to 4.3.1 - no functional changes; 8 | * added a link to attribute selectors syntax to Readme. 9 | 10 | All commits: [9.0.4...9.0.5](https://github.com/html-to-text/node-html-to-text/compare/9.0.4...9.0.5) 11 | 12 | ## Version 9.0.4 13 | 14 | * fixed: `dataTable` formatter was missing some existing cells in incompletely defined tables: [#282](https://github.com/html-to-text/node-html-to-text/issues/282); 15 | * updated readme a bit to clarify the usage: [#281](https://github.com/html-to-text/node-html-to-text/issues/281). 
16 | 17 | All commits: [9.0.3...9.0.4](https://github.com/html-to-text/node-html-to-text/compare/9.0.3...9.0.4) 18 | 19 | ## Version 9.0.3 20 | 21 | * document the usage of metadata object; 22 | * explicitly mention `dom-serializer` dependency: [#269](https://github.com/html-to-text/node-html-to-text/issues/269). 23 | 24 | All commits: [9.0.2...9.0.3](https://github.com/html-to-text/node-html-to-text/compare/9.0.2...9.0.3) 25 | 26 | ## Version 9.0.2 27 | 28 | * support multi-character code points in `encodeCharacters` option: [#267](https://github.com/html-to-text/node-html-to-text/issues/267). 29 | 30 | All commits: [9.0.1...9.0.2](https://github.com/html-to-text/node-html-to-text/compare/9.0.1...9.0.2) 31 | 32 | ## Version 9.0.1 33 | 34 | * fixed a broken link in readme: [#262](https://github.com/html-to-text/node-html-to-text/pull/262); 35 | * tested and documented the usage of existing formatters from custom formatters in readme: [#263](https://github.com/html-to-text/node-html-to-text/issues/263); 36 | * fixed jsdoc comment for `BlockTextBuilder.closeTable`: [#264](https://github.com/html-to-text/node-html-to-text/issues/264); 37 | * added missing entry in the 9.0.0 changelog below regarding `BlockTextBuilder.closeTable`. 38 | 39 | All commits: [9.0.0...9.0.1](https://github.com/html-to-text/node-html-to-text/compare/9.0.0...9.0.1) 40 | 41 | ## Version 9.0.0 42 | 43 | All commits: [8.2.1...9.0.0](https://github.com/html-to-text/node-html-to-text/compare/8.2.1...9.0.0) 44 | 45 | Version 9 roadmap: [#240](https://github.com/html-to-text/node-html-to-text/issues/240) 46 | 47 | Request for comments: [#261 \[RFC\] Naming issue](https://github.com/html-to-text/node-html-to-text/discussions/261) - please take a look and share opinions while you're here 48 | 49 | ### Node version 50 | 51 | Required Node version is now >=14. 52 | 53 | ### CommonJS and ES Module 54 | 55 | Package now provides `cjs` and `mjs` exports.
56 | 57 | ### CLI is no longer built in 58 | 59 | If you use CLI then install [that package](https://github.com/html-to-text/node-html-to-text/tree/master/packages/html-to-text-cli/) instead. 60 | 61 | The new package uses new arg parser [aspargvs](https://github.com/mxxii/aspargvs) instead of minimist in order to deal with the vast options space of `html-to-text`. 62 | 63 | ### Dependency updates 64 | 65 | * `htmlparser2` updated from 6.1.0 to 8.0.1 ([Release notes](https://github.com/fb55/htmlparser2/releases)); 66 | * `he` dependency is removed. It was needed at the time it was introduced, apparently, but at this point `htmlparser2` seems to do a better job itself. 67 | 68 | ### Removed features 69 | 70 | * Options deprecated in version 6 are now removed; 71 | * `decodeOptions` section removed with `he` dependency; 72 | * `fromString` method removed; 73 | * deprecated positional arguments in `BlockTextBuilder` methods are now removed. 74 | 75 | Refer to README for [migration instructions](https://github.com/html-to-text/node-html-to-text/tree/master/packages/html-to-text#deprecated-or-removed-options). 76 | 77 | ### New options 78 | 79 | * `decodeEntities` - controls whether HTML entities found in the input HTML should be decoded or left as is in the output text; 80 | * `encodeCharacters` - a dictionary with characters that should be replaced in the output text and corresponding escape sequences. 81 | 82 | ### New built-in formatters 83 | 84 | New generic formatters `blockString`, `blockTag`, `blockHtml`, `inlineString`, `inlineSurround`, `inlineTag`, `inlineHtml` cover some common usage scenarios such as [#231](https://github.com/html-to-text/node-html-to-text/issues/231). 85 | 86 | ### Changes to existing built-in formatters 87 | 88 | * `anchor` and `image` got `pathRewrite` option; 89 | * `dataTable` formatter allows zero `colSpacing`. 
90 | 91 | ### Improvements for writing custom formatters 92 | 93 | * Some logic for making lists is moved to BlockTextBuilder and can be reused for custom lists (`openList`, `openListItem`, `closeListItem`, `closeList`). Addresses [#238](https://github.com/html-to-text/node-html-to-text/issues/238); 94 | * `startNoWrap`, `stopNoWrap` - allows to keep local inline content in a single line regardless of wrapping options; 95 | * `addLiteral` - it is like `addInline` but circumvents most of the text processing logic. This should be preferred when inserting markup elements; 96 | * It is now possible to provide a metadata object along with the HTML string to convert. Metadata object is available for custom formatters via `builder.metadata`. This allows to compile the converter once and still being able to supply per-document data. Metadata object is supplied as the last optional argument to `convert` function and the function returned by `compile` function; 97 | * Breaking change for those who dare to write their own table formatter (in case there is anyone) - `closeTable` function got a required property in the options object - `tableToString` function, and previously existed `colSpacing` and `rowSpacing` are removed (now a responsibility of the `tableToString` function). 98 | 99 | ### Other 100 | 101 | * Fix deprecated `tags` option support. Addresses [#253](https://github.com/html-to-text/node-html-to-text/issues/253). 102 | 103 | 104 | ---- 105 | 106 | ## Version 8.2.1 107 | 108 | No changes in published package. Bumped dev dependencies and regenerated `package-lock.json`. 109 | 110 | ## Version 8.2.0 111 | 112 | Fix for the issue [#249](https://github.com/html-to-text/node-html-to-text/issues/249) and possibly other obscure issues when some selector options are ignored. `options.selectors` array was not fully processed before. 113 | 114 | ## Version 8.1.1 115 | 116 | Bump `minimist` dependency, regenerate `package-lock.json`. 
117 | 118 | ## Version 8.1.0 119 | 120 | * Fix for too many newlines in certain cases when `preserveNewlines` option is used. Addresses [#232](https://github.com/html-to-text/node-html-to-text/issues/232); 121 | * Link and image formatters now have a `linkBrackets` option - it accepts an array of two strings (default: `['[', ']']`) or `false` to remove the brackets. Addresses [#236](https://github.com/html-to-text/node-html-to-text/issues/236); 122 | * `noLinkBrackets` formatters option is now deprecated. 123 | 124 | All commits: [8.0.0...8.1.0](https://github.com/html-to-text/node-html-to-text/compare/8.0.0...8.1.0) 125 | 126 | ## Version 8.0.0 127 | 128 | All commits: [7.1.1...8.0.0](https://github.com/html-to-text/node-html-to-text/compare/7.1.1...8.0.0) 129 | 130 | Version 8 roadmap issue: [#228](https://github.com/html-to-text/node-html-to-text/issues/228) 131 | 132 | ### Selectors 133 | 134 | The main focus of this version. Addresses the most demanded user requests ([#159](https://github.com/html-to-text/node-html-to-text/issues/159), [#179](https://github.com/html-to-text/node-html-to-text/issues/179), partially [#143](https://github.com/html-to-text/node-html-to-text/issues/143)). 135 | 136 | It is now possible to specify formatting options or assign custom formatters not only by tag names but by almost any selectors. 137 | 138 | See the README [Selectors](https://github.com/html-to-text/node-html-to-text#selectors) section for details. 139 | 140 | Note: The new `selectors` option is an array, in contrast to the `tags` option introduced in version 6 (and now deprecated). Selectors have to have a well defined order and object properties is not a right tool for that. 141 | 142 | Two new packages were created to enable this feature - [parseley](https://github.com/mxxii/parseley) and [selderee](https://github.com/mxxii/selderee). 
143 | 144 | ### Base elements 145 | 146 | The same selectors implementation is used now to narrow down the conversion to specific HTML DOM fragments. Addresses [#96](https://github.com/html-to-text/node-html-to-text/issues/96). (Previous implementation had more limited selectors format.) 147 | 148 | BREAKING CHANGE: All outermost elements matching provided selectors will be present in the output (previously it was only the first match for each selector). Addresses [#215](https://github.com/html-to-text/node-html-to-text/issues/215). 149 | 150 | `limits.maxBaseElements` can be used when you only need a fixed number of base elements and would like to avoid checking the rest of the source HTML document. 151 | 152 | Base elements can be arranged in output text in the order of matched selectors (default, to keep it closer to the old implementation) or in the order of appearance in source HTML document. 153 | 154 | BREAKING CHANGE: previous implementation was treating id selectors in the same way as class selectors (could match `<foo id="a b">` with `foo#a` selector). New implementation is closer to the spec and doesn't expect multiple ids on an element. You can achieve the old behavior with `foo[id~=a]` selector in case you rely on it for some poorly formatted documents (note that it has different specificity though). 155 | 156 | ### Batch processing 157 | 158 | Since options preprocessing is getting more involved with selectors compilation, it seemed reasonable to break the single `htmlToText()` function into compilation and conversion steps. It might provide some performance benefits in client code. 159 | 160 | * new function `compile(options)` returns a function of a single argument (html string); 161 | * `htmlToText(html, options)` is now an alias to `convert(html, options)` function and works as before. 162 | 163 | ### Deprecated options 164 | 165 | * `baseElement`; 166 | * `returnDomByDefault`; 167 | * `tables`; 168 | * `tags`.
169 | 170 | Refer to README for [migration instructions](https://github.com/html-to-text/node-html-to-text#deprecated-or-removed-options). 171 | 172 | No previously deprecated stuff is removed in this version. Significant cleanup is planned for version 9 instead. 173 | 174 | ---- 175 | 176 | ## Version ~~7.1.2~~ 7.1.3 177 | 178 | Bump `minimist` dependency and dev dependencies, regenerate `package-lock.json`. 179 | 180 | ## Version 7.1.1 181 | 182 | Regenerate `package-lock.json`. 183 | 184 | ## Version 7.1.0 185 | 186 | ### Dependency updates 187 | 188 | * `htmlparser2` updated from 6.0.0 to 6.1.0 ([Release notes](https://github.com/fb55/htmlparser2/releases)); 189 | * dev dependencies are bumped. 190 | 191 | ## Version 7.0.0 192 | 193 | ### Node version 194 | 195 | Required Node version is now >=10.23.2. 196 | 197 | ### Dependency updates 198 | 199 | * `lodash` dependency is removed; 200 | * `htmlparser2` updated from 4.1.0 to 6.0.0 ([Release notes](https://github.com/fb55/htmlparser2/releases), also [domhandler](https://github.com/fb55/domhandler/releases/tag/v4.0.0)). There is a slim chance you can run into some differences in case you're relying on it heavily in your custom formatters; 201 | * dev dependencies are bumped. 202 | 203 | ### Custom formatters API change 204 | 205 | [BlockTextBuilder](https://github.com/html-to-text/node-html-to-text/blob/master/lib/block-text-builder.js) methods now accept option objects for optional arguments. This improves client code readability and allows to introduce extra options with ease. It will see some use in future updates. 206 | 207 | Positional arguments introduced in version 6.0.0 are now deprecated. Formatters written for the version 6.0.0 should keep working for now but the compatibility layer is rather inconvenient and will be removed with the next major version. 208 | 209 | See the commit [f50f10f](https://github.com/html-to-text/node-html-to-text/commit/f50f10f54cf814efb2f7633d9d377ba7eadeaf1e). 
Changes in `lib/formatter.js` file are illustrative for how to migrate to the new API. 210 | 211 | ### And more 212 | 213 | * Bunch of documentation and test updates. 214 | 215 | All commits: [6.0.0...7.0.0](https://github.com/html-to-text/node-html-to-text/compare/6.0.0...7.0.0) 216 | 217 | Version 7 roadmap issue: [#222](https://github.com/html-to-text/node-html-to-text/issues/222) 218 | 219 | ---- 220 | 221 | ## Version 6.0.0 222 | 223 | This is a major update. No code left untouched. While the goal was to keep as much compatibility as possible, some client-facing changes were unavoidable. 224 | 225 | ### fromString() is deprecated in favor of htmlToText() 226 | 227 | Since the library has the only exported function, it is now self-titled. 228 | 229 | ### Inline and block-level tags, HTML whitespace 230 | 231 | Formatting code was rewritten almost entirely to make it aware of block-level tags and to handle HTML whitespace properly. One of popular requests was to support divs, and it is here now, after a lot of effort. 232 | 233 | ### Options reorganized 234 | 235 | Options are reorganized to make room for some extra format options while making everything more structured. Now tag-specific options live within that tag configuration. 236 | 237 | For the majority of changed options there is a compatibility layer that will remain until next major release. But you are encouraged to explore new options since they provide a bit more flexibility. 238 | 239 | ### Custom formatters are different now 240 | 241 | Because formatters are integral part of the formatting code (as the name suggests), it wasn't possible to provide a compatibility layer. 242 | 243 | Please refer to the Readme to see how things are wired now, in case you were using them for anything other than dealing with the lack of block-level tags support. 244 | 245 | ### Tables support was improved 246 | 247 | Cells can make use of extra space with colspan and rowspan attributes. 
Max column width is defined separately from global wordwrap limit. 248 | 249 | ### Limits 250 | 251 | Multiple options to cut content in large HTML documents. 252 | 253 | By default, any input longer than 16 million characters will be truncated. 254 | 255 | ### Node and dependencies 256 | 257 | Required Node version is now >=8.10.0. 258 | 259 | Dependency versions are bumped. 260 | 261 | ### Repository is moved to its own organization 262 | 263 | [https://github.com/html-to-text/node-html-to-text](https://github.com/html-to-text/node-html-to-text) is the new home. 264 | 265 | GitHub should handle all redirects from the old url, so it shouldn't break anything, even if you have a local fork pointing at the old origin. But it is still a good idea to [update](https://docs.github.com/en/free-pro-team@latest/github/using-git/changing-a-remotes-url) the url. 266 | 267 | ### And more 268 | 269 | Version 6 roadmap issue: [#200](https://github.com/html-to-text/node-html-to-text/issues/200) 270 | 271 | ---- 272 | 273 | ## Version 5.1.1 274 | 275 | * `preserveNewLines` whitespace issue fixed [#162](https://github.com/html-to-text/node-html-to-text/pull/162) 276 | 277 | ## Version 5.1.0 278 | 279 | * Hard-coded CLI options removed [#173](https://github.com/html-to-text/node-html-to-text/pull/173) 280 | 281 | ## Version 5.0.0 282 | 283 | ### BREAKING CHANGES 284 | 285 | #### fromFile removed 286 | 287 | The function `fromFile` is removed. It was the main reason `html-to-text` could not be used in the browser [#164](https://github.com/html-to-text/node-html-to-text/pull/164).
288 | 289 | You can get the `fromFile` functionality back by using the following code 290 | 291 | ```js 292 | const fs = require('fs'); 293 | const { fromString } = require('html-to-text'); 294 | 295 | // Callback version 296 | const fromFile = (file, options, callback) => { 297 | if (!callback) { 298 | callback = options; 299 | options = {}; 300 | } 301 | fs.readFile(file, 'utf8', (err, str) => { 302 | if (err) return callback(err); 303 | callback(null, fromString(str, options)); 304 | }); 305 | }; 306 | 307 | // Promise version 308 | const fromFile = (file, options) => fs.promises.readFile(file, 'utf8').then(html => fromString(html, options)); 309 | 310 | // Sync version 311 | const fromFileSync = (file, options) => fromString(fs.readFileSync(file, 'utf8'), options); 312 | ``` 313 | 314 | #### Supported NodeJS Versions 315 | 316 | Node versions < 6 are no longer supported. 317 | 318 | ---- 319 | 320 | ## Version 4.0.0 321 | 322 | * Support dropped for node version < 4. 323 | * New option `unorderedListItemPrefix` added. 324 | * HTML entities in links are not supported. 325 | 326 | ---- 327 | 328 | ## Version 3.3.0 329 | 330 | * Ability to pass custom formatting via the `format` option #128 331 | * Enhanced support for alpha ordered list types added #123 332 | 333 | ## Version 3.2.0 334 | 335 | * Basic support for alpha ordered list types added #122 336 | * This includes support for the `ol` type values `1`, `a` and `A` 337 | 338 | ## Version 3.1.0 339 | 340 | * Support for the ordered list start attribute added #117 341 | * Option to format paragraph with single new line #112 342 | * `noLinkBrackets` option added #119 343 | 344 | ## Version 3.0.0 345 | 346 | * Switched from `htmlparser` to `htmlparser2` #113 347 | * Treat non-numeric colspans as zero and handle them gracefully #105 348 | 349 | ---- 350 | 351 | ## Version 2.1.1 352 | 353 | * Extra space problem fixed. #88 354 | 355 | ## Version 2.1.0 356 | 357 | * New option to disable `uppercaseHeadings` added.
#86 358 | * Starting point of html to text conversion can now be defined in the options via the `baseElement` option. #83 359 | * Support for long words added. The behaviour can be configured via the `longWordSplit` option. #83 360 | 361 | ## Version 2.0.0 362 | 363 | * Unicode support added. #81 364 | * New option `decodeOptions` added. 365 | * Dependencies updated. 366 | 367 | Breaking Changes: 368 | 369 | * Minimum node version increased to >=0.10.0 370 | 371 | ---- 372 | 373 | ## Version 1.6.2 374 | 375 | * Fixed: correctly handle HTML entities for images #82 376 | 377 | ## Version 1.6.1 378 | 379 | * Fixed: using --tables=true doesn't produce the expected results. #80 380 | 381 | ## Version 1.6.0 382 | 383 | * Preserve newlines in text feature added #75 384 | 385 | ## Version 1.5.1 386 | 387 | * Support for h5 and h6 tags added #74 388 | 389 | ## Version 1.5.0 390 | 391 | * Entity regex is now less greedy #69 #70 392 | 393 | ## Version 1.4.0 394 | 395 | * Uppercase tag processing added. Table center support added. #56 396 | * Unused dependencies removed. 397 | 398 | ## Version 1.3.2 399 | 400 | * Support Node 4 engine #64 401 | --------------------------------------------------------------------------------