├── .editorconfig ├── .github └── workflows │ ├── lint.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── composer.json ├── convert.php ├── html2text.php ├── phpstan.neon.dist ├── phpunit.xml ├── src ├── Html2Text.php └── Html2TextException.php └── tests ├── Html2TextTest.php ├── failures └── .gitignore ├── html ├── anchors.html ├── basic.html ├── blockquotes.html ├── dom-processing.html ├── empty.html ├── full_email.html ├── huge-msoffice.html ├── images.html ├── invalid.html ├── lists.html ├── more-anchors.html ├── msoffice.html ├── nbsp.html ├── nested-divs.html ├── newlines.html ├── non-breaking-spaces.html ├── pre.html ├── table.html ├── test3.html ├── test4.html ├── utf8-example.html ├── windows-1252-example.html └── zero-width-non-joiners.html └── txt ├── anchors.no-links.txt ├── anchors.txt ├── basic.no-links.txt ├── basic.txt ├── blockquotes.txt ├── dom-processing.txt ├── empty.txt ├── full_email.txt ├── huge-msoffice.txt ├── images.txt ├── invalid.txt ├── lists.txt ├── more-anchors.txt ├── msoffice.txt ├── nbsp.txt ├── nested-divs.txt ├── newlines.txt ├── non-breaking-spaces.txt ├── pre.txt ├── table.txt ├── test3.txt ├── test4.txt ├── utf8-example.txt ├── windows-1252-example.txt └── zero-width-non-joiners.txt /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: http://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | end_of_line = lf 9 | charset = utf-8 10 | insert_final_newline = true 11 | trim_trailing_whitespace = true 12 | indent_style = tab 13 | indent_size = 4 14 | 15 | [*.md] 16 | indent_style = space 17 | indent_size = 2 18 | 19 | # don't add newlines to test files 20 | [tests/*] 21 | indent_style = tab 22 | trim_trailing_whitespace = false 23 | insert_final_newline = false 24 | 
-------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | on: 3 | - push 4 | jobs: 5 | lint: 6 | name: Lint 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v3 11 | - name: Setup PHP 12 | uses: shivammathur/setup-php@v2 13 | with: 14 | php-version: '7.4' 15 | tools: phplint 16 | - name: Check syntax 17 | run: phplint . 18 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | on: 3 | - push 4 | jobs: 5 | test: 6 | strategy: 7 | matrix: 8 | operating-system: 9 | - ubuntu-latest 10 | php-version: 11 | - '7.3' 12 | - '7.4' 13 | - '8.0' 14 | - '8.1' 15 | - '8.2' 16 | name: php ${{ matrix.php-version }} on ${{ matrix.operating-system }} 17 | runs-on: ${{ matrix.operating-system }} 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | - name: Setup PHP 22 | uses: shivammathur/setup-php@v2 23 | with: 24 | php-version: ${{ matrix.php-version }} 25 | extensions: mbstring 26 | coverage: none 27 | - name: Get composer cache directory 28 | id: composer-cache 29 | run: echo "::set-output name=dir::$(composer config cache-files-dir)" 30 | - name: Setup composer cache 31 | uses: actions/cache@v3 32 | with: 33 | path: ${{ steps.composer-cache.outputs.dir }} 34 | key: ${{ runner.os }}-composer-${{ hashFiles('**/composer.lock') }} 35 | restore-keys: ${{ runner.os }}-composer- 36 | - name: Install composer dependencies 37 | env: 38 | COMPOSER_AUTH: ${{ secrets.COMPOSER_AUTH }} 39 | run: composer install --no-ansi --no-interaction --no-scripts --no-progress --prefer-dist 40 | - name: Run tests 41 | run: vendor/bin/phpunit 42 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | tests/*.output 2 | *.sublime-project 3 | *.sublime-workspace 4 | vendor/ 5 | **/*.DS_Store 6 | .phpunit.result.cache 7 | composer.lock 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [1.1.0] - 2019-02-15 10 | ### Added 11 | - Zero-width non-joiners are now stripped to prevent output issues, similar to non-breaking whitespace 12 | 13 | ### Fixed 14 | - Fix namespace in composer [#67](https://github.com/soundasleep/html2text/pull/67) 15 | 16 | ## [1.0.0] - 2019-02-14 17 | ### Added 18 | - Added `drop_links` option to render links without the target href [#65](https://github.com/soundasleep/html2text/pull/65) 19 | 20 | ### Changed 21 | - **Important:** Changed namespace from `\Html2Text\Html2Text` to `\Soundasleep\Html2Text` [#45](https://github.com/soundasleep/html2text/issues/45) 22 | - Treat non-breaking spaces consistently: never include them in output text [#64](https://github.com/soundasleep/html2text/pull/64) 23 | - Second argument to `convert()` is now an array, rather than boolean [#65](https://github.com/soundasleep/html2text/pull/65) 24 | - Optimise/improve newline & whitespace handling [#47](https://github.com/soundasleep/html2text/pull/47) 25 | - Upgrade PHP support to PHP 7.3+ 26 | - Upgrade PHPUnit to 7.x 27 | - Re-release project under MIT license [#58](https://github.com/soundasleep/html2text/issues/58) 28 | 29 | ## [0.5.0] - 2017-04-20 30 | ### Added 31 | - Add ignore_error optional argument 
[#63](https://github.com/soundasleep/html2text/pull/63) 32 | - Blockquote support [#50](https://github.com/soundasleep/html2text/pull/50) 33 | 34 | [Unreleased]: https://github.com/soundasleep/html2text/compare/1.1.0...HEAD 35 | [1.1.0]: https://github.com/soundasleep/html2text/compare/1.0.0...1.1.0 36 | [1.0.0]: https://github.com/soundasleep/html2text/compare/0.5.0...1.0.0 37 | [0.5.0]: https://github.com/soundasleep/html2text/compare/0.3.4...0.5.0 38 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jevon Wright 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![example workflow](https://github.com/soundasleep/html2text/actions/workflows/test.yml/badge.svg) [![Total Downloads](https://poser.pugx.org/soundasleep/html2text/downloads.png)](https://packagist.org/packages/soundasleep/html2text) 2 | ========= 3 | 4 | html2text is a very simple script that uses DOM methods to convert HTML into a format similar to what would be 5 | rendered by a browser - perfect for places where you need a quick text representation. For example: 6 | 7 | ```html 8 | 9 | Ignored Title 10 | 11 |

Hello, World!

12 | 13 |

This is some e-mail content. 14 | Even though it has whitespace and newlines, the e-mail converter 15 | will handle it correctly. 16 | 17 |

Even mismatched tags.

18 | 19 |
A div
20 |
Another div
21 |
A div
within a div
22 | 23 | A link 24 | 25 | 26 | 27 | ``` 28 | 29 | Will be converted into: 30 | 31 | ```text 32 | Hello, World! 33 | 34 | This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly. 35 | 36 | Even mismatched tags. 37 | 38 | A div 39 | Another div 40 | A div 41 | within a div 42 | 43 | [A link](http://foo.com) 44 | ``` 45 | 46 | See the [original blog post](http://journals.jevon.org/users/jevon-phd/entry/19818) or the related [StackOverflow answer](http://stackoverflow.com/a/2564472/39531). 47 | 48 | ## Installing 49 | 50 | You can use [Composer](http://getcomposer.org/) to add the [package](https://packagist.org/packages/soundasleep/html2text) to your project: 51 | 52 | ```json 53 | { 54 | "require": { 55 | "soundasleep/html2text": "~1.1" 56 | } 57 | } 58 | ``` 59 | 60 | And then use it quite simply: 61 | 62 | ```php 63 | $text = \Soundasleep\Html2Text::convert($html); 64 | ``` 65 | 66 | You can also include the supplied `html2text.php` and use `$text = convert_html_to_text($html);` instead. 67 | 68 | ### Options 69 | 70 | | Option | Default | Description | 71 | |--------|---------|-------------| 72 | | **ignore_errors** | `false` | Set to `true` to ignore any XML parsing errors. | 73 | | **drop_links** | `false` | Set to `true` to not render links as `[My Link](http://foo.com)`, but rather just `My Link`. | 74 | | **char_set** | `'auto'` | Specify a specific character set. Pass multiple character sets (comma separated) to detect encoding, default is ASCII,UTF-8 | 75 | 76 | Pass along options as a second argument to `convert`, for example: 77 | 78 | ```php 79 | $options = array( 80 | 'ignore_errors' => true, 81 | // other options go here 82 | ); 83 | $text = \Soundasleep\Html2Text::convert($html, $options); 84 | ``` 85 | 86 | ## Tests 87 | 88 | Some very basic tests are provided in the `tests/` directory. Run them with `composer install && vendor/bin/phpunit`. 
89 | 90 | ## Troubleshooting 91 | 92 | ### Class 'DOMDocument' not found 93 | 94 | You need to [install the PHP XML extension](https://github.com/soundasleep/html2text/issues/55) for your PHP version. e.g. `apt-get install php7.4-xml` 95 | 96 | ## License 97 | 98 | `html2text` is [licensed under MIT](LICENSE.md), making it suitable for both Eclipse and GPL projects. 99 | 100 | ## Other versions 101 | 102 | Also see [html2text_ruby](https://github.com/soundasleep/html2text_ruby), a Ruby implementation. 103 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "soundasleep/html2text", 3 | "description": "A PHP script to convert HTML into a plain text format", 4 | "type": "library", 5 | "keywords": [ "php", "html", "text", "email" ], 6 | "homepage": "https://github.com/soundasleep/html2text", 7 | "license": "MIT", 8 | "authors": [ 9 | { 10 | "name": "Jevon Wright", 11 | "homepage": "https://jevon.org", 12 | "role": "Developer" 13 | } 14 | ], 15 | "autoload": { 16 | "psr-4": { 17 | "Soundasleep\\": "src" 18 | } 19 | }, 20 | "support": { 21 | "email": "support@jevon.org" 22 | }, 23 | "require": { 24 | "php": "^7.3|^8.0", 25 | "ext-dom": "*", 26 | "ext-libxml": "*" 27 | }, 28 | "require-dev": { 29 | "phpunit/phpunit": "^7.0|^8.0|^9.0", 30 | "phpstan/phpstan": "^1.9" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /convert.php: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | tests 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/Html2Text.php: -------------------------------------------------------------------------------- 1 | */ 8 | public static function defaultOptions(): array { 9 | return [ 10 | 'ignore_errors' => false, 11 | 'drop_links' => false, 12 | 'char_set' => 
'auto' 13 | ]; 14 | } 15 | 16 | /** 17 | * Tries to convert the given HTML into a plain text format - best suited for 18 | * e-mail display, etc. 19 | * 20 | *

In particular, it tries to maintain the following features: 21 | *

25 | * 26 | * @param string $html the input HTML 27 | * @param boolean|array $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto'] 28 | * @return string the HTML converted, as best as possible, to text 29 | * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument} 30 | */ 31 | public static function convert(string $html, $options = []): string { 32 | 33 | if ($options === false || $options === true) { 34 | // Using old style (< 1.0) of passing in options 35 | $options = ['ignore_errors' => $options]; 36 | } 37 | 38 | $options = array_merge(static::defaultOptions(), $options); 39 | 40 | // check all options are valid 41 | foreach ($options as $key => $value) { 42 | if (!in_array($key, array_keys(static::defaultOptions()))) { 43 | throw new \InvalidArgumentException("Unknown html2text option '$key'. Valid options are " . implode(',', static::defaultOptions())); 44 | } 45 | } 46 | 47 | $is_office_document = self::isOfficeDocument($html); 48 | 49 | if ($is_office_document) { 50 | // remove office namespace 51 | $html = str_replace(["", ""], "", $html); 52 | } 53 | 54 | $html = self::fixNewlines($html); 55 | 56 | // use mb_convert_encoding for legacy versions of php 57 | if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION < 81 && mb_detect_encoding($html, "UTF-8", true)) { 58 | $html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8"); 59 | } 60 | 61 | $doc = self::getDocument($html, $options); 62 | 63 | $output = self::iterateOverNode($doc, null, false, $is_office_document, $options); 64 | 65 | // process output for whitespace/newlines 66 | $output = self::processWhitespaceNewlines($output); 67 | 68 | return $output; 69 | } 70 | 71 | /** 72 | * Unify newlines; in particular, \r\n becomes \n, and 73 | * then \r becomes \n. This means that all newlines (Unix, Windows, Mac) 74 | * all become \ns. 
75 | * 76 | * @param string $text text with any number of \r, \r\n and \n combinations 77 | * @return string the fixed text 78 | */ 79 | public static function fixNewlines(string $text): string { 80 | // replace \r\n to \n 81 | $text = str_replace("\r\n", "\n", $text); 82 | // remove \rs 83 | $text = str_replace("\r", "\n", $text); 84 | 85 | return $text; 86 | } 87 | 88 | /** @return array */ 89 | public static function nbspCodes(): array { 90 | return [ 91 | "\xc2\xa0", 92 | "\u00a0", 93 | ]; 94 | } 95 | 96 | /** @return array */ 97 | public static function zwnjCodes(): array { 98 | return [ 99 | "\xe2\x80\x8c", 100 | "\u200c", 101 | ]; 102 | } 103 | 104 | /** 105 | * Remove leading or trailing spaces and excess empty lines from provided multiline text 106 | * 107 | * @param string $text multiline text any number of leading or trailing spaces or excess lines 108 | * @return string the fixed text 109 | */ 110 | public static function processWhitespaceNewlines(string $text): string { 111 | 112 | // remove excess spaces around tabs 113 | $text = preg_replace("/ *\t */im", "\t", $text); 114 | 115 | // remove leading whitespace 116 | $text = ltrim($text); 117 | 118 | // remove leading spaces on each line 119 | $text = preg_replace("/\n[ \t]*/im", "\n", $text); 120 | 121 | // convert non-breaking spaces to regular spaces to prevent output issues, 122 | // do it here so they do NOT get removed with other leading spaces, as they 123 | // are sometimes used for indentation 124 | $text = self::renderText($text); 125 | 126 | // remove trailing whitespace 127 | $text = rtrim($text); 128 | 129 | // remove trailing spaces on each line 130 | $text = preg_replace("/[ \t]*\n/im", "\n", $text); 131 | 132 | // unarmor pre blocks 133 | $text = self::fixNewLines($text); 134 | 135 | // remove unnecessary empty lines 136 | $text = preg_replace("/\n\n\n*/im", "\n\n", $text); 137 | 138 | return $text; 139 | } 140 | 141 | /** 142 | * Can we guess that this HTML is generated by Microsoft 
Office? 143 | */ 144 | public static function isOfficeDocument(string $html): bool { 145 | return strpos($html, "urn:schemas-microsoft-com:office") !== false; 146 | } 147 | 148 | public static function isWhitespace(string $text): bool { 149 | return strlen(trim(self::renderText($text), "\n\r\t ")) === 0; 150 | } 151 | 152 | /** 153 | * Parse HTML into a DOMDocument 154 | * 155 | * @param string $html the input HTML 156 | * @param array $options 157 | * @return \DOMDocument the parsed document tree 158 | */ 159 | private static function getDocument(string $html, array $options): \DOMDocument { 160 | 161 | $doc = new \DOMDocument(); 162 | 163 | $html = trim($html); 164 | 165 | if (!$html) { 166 | // DOMDocument doesn't support empty value and throws an error 167 | // Return empty document instead 168 | return $doc; 169 | } 170 | 171 | if ($html[0] !== '<') { 172 | // If HTML does not begin with a tag, we put a body tag around it. 173 | // If we do not do this, PHP will insert a paragraph tag around 174 | // the first block of text for some reason which can mess up 175 | // the newlines. See pre.html test for an example. 176 | $html = '' . $html . ''; 177 | } 178 | 179 | $header = ''; 180 | // use char sets for modern versions of php 181 | if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION >= 81) { 182 | // use specified char_set, or auto detect if not set 183 | $char_set = ! empty($options['char_set']) ? $options['char_set'] : 'auto'; 184 | if ('auto' === $char_set) { 185 | $char_set = mb_detect_encoding($html); 186 | } else if (strpos($char_set, ',')) { 187 | mb_detect_order($char_set); 188 | $char_set = mb_detect_encoding($html); 189 | } 190 | // turn off error detection for Windows-1252 legacy html 191 | if (strpos($char_set, '1252')) { 192 | $options['ignore_errors'] = true; 193 | } 194 | $header = ''; 195 | } 196 | 197 | if (! 
empty($options['ignore_errors'])) { 198 | $doc->strictErrorChecking = false; 199 | $doc->recover = true; 200 | $doc->xmlStandalone = true; 201 | $old_internal_errors = libxml_use_internal_errors(true); 202 | $load_result = $doc->loadHTML($header . $html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET | LIBXML_PARSEHUGE); 203 | libxml_use_internal_errors($old_internal_errors); 204 | } 205 | else { 206 | $load_result = $doc->loadHTML($header . $html); 207 | } 208 | 209 | if (!$load_result) { 210 | throw new Html2TextException("Could not load HTML - badly formed?", $html); 211 | } 212 | 213 | return $doc; 214 | } 215 | 216 | /** 217 | * Replace any special characters with simple text versions, to prevent output issues: 218 | * - Convert non-breaking spaces to regular spaces; and 219 | * - Convert zero-width non-joiners to '' (nothing). 220 | * 221 | * This is to match our goal of rendering documents as they would be rendered 222 | * by a browser. 223 | */ 224 | private static function renderText(string $text): string { 225 | $text = str_replace(self::nbspCodes(), " ", $text); 226 | $text = str_replace(self::zwnjCodes(), "", $text); 227 | return $text; 228 | } 229 | 230 | private static function nextChildName(?\DOMNode $node): ?string { 231 | // get the next child 232 | $nextNode = $node->nextSibling; 233 | while ($nextNode != null) { 234 | if ($nextNode instanceof \DOMText) { 235 | if (!self::isWhitespace($nextNode->wholeText)) { 236 | break; 237 | } 238 | } 239 | 240 | if ($nextNode instanceof \DOMElement) { 241 | break; 242 | } 243 | 244 | $nextNode = $nextNode->nextSibling; 245 | } 246 | 247 | $nextName = null; 248 | if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) { 249 | $nextName = strtolower($nextNode->nodeName); 250 | } 251 | 252 | return $nextName; 253 | } 254 | 255 | /** @param array $options */ 256 | private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool 
$is_office_document, array $options): string { 257 | if ($node instanceof \DOMText) { 258 | // Replace whitespace characters with a space (equivilant to \s) 259 | if ($in_pre) { 260 | $text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n"; 261 | 262 | // Remove trailing whitespace only 263 | $text = preg_replace("/[ \t]*\n/im", "\n", $text); 264 | 265 | // armor newlines with \r. 266 | return str_replace("\n", "\r", $text); 267 | 268 | } 269 | $text = self::renderText($node->wholeText); 270 | $text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text); 271 | 272 | if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) { 273 | return "\n" . $text; 274 | } 275 | return $text; 276 | } 277 | 278 | if ($node instanceof \DOMDocumentType || $node instanceof \DOMProcessingInstruction) { 279 | // ignore 280 | return ""; 281 | } 282 | 283 | $name = strtolower($node->nodeName); 284 | $nextName = self::nextChildName($node); 285 | 286 | // start whitespace 287 | switch ($name) { 288 | case "hr": 289 | $prefix = ''; 290 | if ($prevName != null) { 291 | $prefix = "\n"; 292 | } 293 | return $prefix . "---------------------------------------------------------------\n"; 294 | 295 | case "style": 296 | case "head": 297 | case "title": 298 | case "meta": 299 | case "script": 300 | // ignore these tags 301 | return ""; 302 | 303 | case "h1": 304 | case "h2": 305 | case "h3": 306 | case "h4": 307 | case "h5": 308 | case "h6": 309 | case "ol": 310 | case "ul": 311 | case "pre": 312 | // add two newlines 313 | $output = "\n\n"; 314 | break; 315 | 316 | case "td": 317 | case "th": 318 | // add tab char to separate table fields 319 | $output = "\t"; 320 | break; 321 | 322 | case "p": 323 | // Microsoft exchange emails often include HTML which, when passed through 324 | // html2text, results in lots of double line returns everywhere. 
325 | // 326 | // To fix this, for any p element with a className of `MsoNormal` (the standard 327 | // classname in any Microsoft export or outlook for a paragraph that behaves 328 | // like a line return) we skip the first line returns and set the name to br. 329 | // @phpstan-ignore-next-line 330 | if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') { 331 | $output = ""; 332 | $name = 'br'; 333 | break; 334 | } 335 | 336 | // add two lines 337 | $output = "\n\n"; 338 | break; 339 | 340 | case "tr": 341 | // add one line 342 | $output = "\n"; 343 | break; 344 | 345 | case "div": 346 | $output = ""; 347 | if ($prevName !== null) { 348 | // add one line 349 | $output .= "\n"; 350 | } 351 | break; 352 | 353 | case "li": 354 | $output = "- "; 355 | break; 356 | 357 | default: 358 | // print out contents of unknown tags 359 | $output = ""; 360 | break; 361 | } 362 | 363 | // debug 364 | //$output .= "[$name,$nextName]"; 365 | 366 | if (isset($node->childNodes)) { 367 | 368 | $n = $node->childNodes->item(0); 369 | $previousSiblingNames = []; 370 | $previousSiblingName = null; 371 | 372 | $parts = []; 373 | $trailing_whitespace = 0; 374 | 375 | while ($n != null) { 376 | 377 | $text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options); 378 | 379 | // Pass current node name to next child, as previousSibling does not appear to get populated 380 | if ($n instanceof \DOMDocumentType 381 | || $n instanceof \DOMProcessingInstruction 382 | || ($n instanceof \DOMText && self::isWhitespace($text))) { 383 | // Keep current previousSiblingName, these are invisible 384 | $trailing_whitespace++; 385 | } 386 | else { 387 | $previousSiblingName = strtolower($n->nodeName); 388 | $previousSiblingNames[] = $previousSiblingName; 389 | $trailing_whitespace = 0; 390 | } 391 | 392 | $node->removeChild($n); 393 | $n = $node->childNodes->item(0); 394 | 395 | $parts[] = $text; 396 | } 397 | 398 | // Remove trailing 
whitespace, important for the br check below 399 | while ($trailing_whitespace-- > 0) { 400 | array_pop($parts); 401 | } 402 | 403 | // suppress last br tag inside a node list if follows text 404 | $last_name = array_pop($previousSiblingNames); 405 | if ($last_name === 'br') { 406 | $last_name = array_pop($previousSiblingNames); 407 | if ($last_name === '#text') { 408 | array_pop($parts); 409 | } 410 | } 411 | 412 | $output .= implode('', $parts); 413 | } 414 | 415 | // end whitespace 416 | switch ($name) { 417 | case "h1": 418 | case "h2": 419 | case "h3": 420 | case "h4": 421 | case "h5": 422 | case "h6": 423 | case "pre": 424 | case "p": 425 | // add two lines 426 | $output .= "\n\n"; 427 | break; 428 | 429 | case "br": 430 | // add one line 431 | $output .= "\n"; 432 | break; 433 | 434 | case "div": 435 | break; 436 | 437 | case "a": 438 | // links are returned in [text](link) format 439 | // @phpstan-ignore-next-line 440 | $href = $node->getAttribute("href"); 441 | 442 | $output = trim($output); 443 | 444 | // remove double [[ ]] s from linking images 445 | if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") { 446 | $output = substr($output, 1, strlen($output) - 2); 447 | 448 | // for linking images, the title of the overrides the title of the 449 | // @phpstan-ignore-next-line 450 | if ($node->getAttribute("title")) { 451 | // @phpstan-ignore-next-line 452 | $output = $node->getAttribute("title"); 453 | } 454 | } 455 | 456 | // if there is no link text, but a title attr 457 | // @phpstan-ignore-next-line 458 | if (!$output && $node->getAttribute("title")) { 459 | // @phpstan-ignore-next-line 460 | $output = $node->getAttribute("title"); 461 | } 462 | 463 | if ($href == null) { 464 | // it doesn't link anywhere 465 | // @phpstan-ignore-next-line 466 | if ($node->getAttribute("name") != null) { 467 | if ($options['drop_links']) { 468 | $output = "$output"; 469 | } else { 470 | $output = "[$output]"; 471 | } 472 | } 473 | } else { 474 | if ($href == 
$output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") { 475 | // link to the same address: just use link 476 | $output = "$output"; 477 | } else { 478 | // replace it 479 | if ($output) { 480 | if ($options['drop_links']) { 481 | $output = "$output"; 482 | } else { 483 | $output = "[$output]($href)"; 484 | } 485 | } else { 486 | // empty string 487 | $output = "$href"; 488 | } 489 | } 490 | } 491 | 492 | // does the next node require additional whitespace? 493 | switch ($nextName) { 494 | case "h1": case "h2": case "h3": case "h4": case "h5": case "h6": 495 | $output .= "\n"; 496 | break; 497 | } 498 | break; 499 | 500 | case "img": 501 | // @phpstan-ignore-next-line 502 | if ($node->getAttribute("title")) { 503 | // @phpstan-ignore-next-line 504 | $output = "[" . $node->getAttribute("title") . "]"; 505 | // @phpstan-ignore-next-line 506 | } elseif ($node->getAttribute("alt")) { 507 | // @phpstan-ignore-next-line 508 | $output = "[" . $node->getAttribute("alt") . "]"; 509 | } else { 510 | $output = ""; 511 | } 512 | break; 513 | 514 | case "li": 515 | $output .= "\n"; 516 | break; 517 | 518 | case "blockquote": 519 | // process quoted text for whitespace/newlines 520 | $output = self::processWhitespaceNewlines($output); 521 | 522 | // add leading newline 523 | $output = "\n" . $output; 524 | 525 | // prepend '> ' at the beginning of all lines 526 | $output = preg_replace("/\n/im", "\n> ", $output); 527 | 528 | // replace leading '> >' with '>>' 529 | $output = preg_replace("/\n> >/im", "\n>>", $output); 530 | 531 | // add another leading newline and trailing newlines 532 | $output = "\n" . $output . 
"\n\n"; 533 | break; 534 | default: 535 | // do nothing 536 | } 537 | 538 | return $output; 539 | } 540 | } 541 | -------------------------------------------------------------------------------- /src/Html2TextException.php: -------------------------------------------------------------------------------- 1 | more_info = $more_info; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /tests/Html2TextTest.php: -------------------------------------------------------------------------------- 1 | getFileName()[0] != '.') { 11 | unlink($fileInfo->getPathname()); 12 | } 13 | } 14 | } 15 | 16 | /** 17 | * @dataProvider providerFiles 18 | */ 19 | public function testFile(string $test): void { 20 | $this->doTestWithResults($test, $test, []); 21 | } 22 | 23 | /** @param bool | array $options */ 24 | function doTestWithResults(string $test, string $result, $options = []): void { 25 | $html = __DIR__ . "/html/$test.html"; 26 | $txt = __DIR__ . "/txt/$result.txt"; 27 | $this->assertTrue(file_exists($html), "File '{$html}' does not exist"); 28 | $this->assertTrue(file_exists($txt), "File '{$txt}' does not exist"); 29 | $input = file_get_contents($html); 30 | $expected = \Soundasleep\Html2Text::fixNewlines(file_get_contents($txt)); 31 | 32 | $output = \Soundasleep\Html2Text::convert($input, $options); 33 | 34 | if ($output != $expected) { 35 | file_put_contents(__DIR__ . 
"/failures/$result.output", $output); 36 | } 37 | $this->assertEquals($expected, $output, "{$html} file failed to convert to {$txt}"); 38 | } 39 | 40 | /** @return array> */ 41 | public function providerFiles(): array { 42 | return [ 43 | ['basic'], 44 | ['anchors'], 45 | ['more-anchors'], 46 | ['test3'], 47 | ['test4'], 48 | ['table'], 49 | ['nbsp'], 50 | ['lists'], 51 | ['pre'], 52 | ['newlines'], 53 | ['nested-divs'], 54 | ['blockquotes'], 55 | ['full_email'], 56 | ['images'], 57 | ['non-breaking-spaces'], 58 | ['utf8-example'], 59 | ['msoffice'], 60 | ['dom-processing'], 61 | ['empty'], 62 | ['huge-msoffice'], 63 | ['zero-width-non-joiners'], 64 | ]; 65 | } 66 | 67 | public function testInvalidXML(): void { 68 | $this->expectWarning(); 69 | $this->doTestWithResults("invalid", "invalid", ['ignore_errors' => false]); 70 | } 71 | 72 | public function testInvalidXMLIgnore(): void { 73 | $this->doTestWithResults("invalid", "invalid", ['ignore_errors' => true]); 74 | } 75 | 76 | public function testInvalidXMLIgnoreOldSyntax(): void { 77 | // for BC, allow old #convert(text, bool) syntax 78 | $this->doTestWithResults("invalid", "invalid", true); 79 | } 80 | 81 | public function testInvalidOption(): void { 82 | $this->expectException(InvalidArgumentException::class); 83 | $this->doTestWithResults("basic", "basic", ['invalid_option' => true]); 84 | } 85 | 86 | public function testBasicDropLinks(): void { 87 | $this->doTestWithResults("basic", "basic.no-links", ['drop_links' => true]); 88 | } 89 | 90 | public function testAnchorsDropLinks(): void { 91 | $this->doTestWithResults("anchors", "anchors.no-links", ['drop_links' => true]); 92 | } 93 | 94 | public function testWindows1252(): void { 95 | $this->doTestWithResults("windows-1252-example", "windows-1252-example", ['char_set' => 'windows-1252']); 96 | } 97 | } -------------------------------------------------------------------------------- /tests/failures/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Ignore everything 2 | * 3 | 4 | # But not these files... 5 | !.gitignore 6 | 7 | # ...even if they are in subdirectories 8 | !*/ 9 | -------------------------------------------------------------------------------- /tests/html/anchors.html: -------------------------------------------------------------------------------- 1 | A document without any HTML open/closing tags. 2 | 3 |
4 | 5 | We try and use the representation given by common browsers of the 6 | HTML document, so that it looks similar when converted to plain text. 7 | 8 |
visit foo.com - or http://www.foo.com 9 | 10 | link 11 | 12 |

An anchor which will not appear

13 | -------------------------------------------------------------------------------- /tests/html/basic.html: -------------------------------------------------------------------------------- 1 | 2 | Ignored Title 3 | 4 |

Hello, World!

5 | 6 |

This is some e-mail content. 7 | Even though it has whitespace and newlines, the e-mail converter 8 | will handle it correctly. 9 | 10 |

Even mismatched tags.

11 | 12 |
A div
13 |
Another div
14 |
A div
within a div
15 | 16 |

Another line
Yet another line

17 | 18 | A link 19 | 20 | 21 | -------------------------------------------------------------------------------- /tests/html/blockquotes.html: -------------------------------------------------------------------------------- 1 | 2 | Hello 3 |
4 | Nest some block quotes with preformated text 5 |
6 | Here is the code 7 |
 8 | #include <stdlib.h>
 9 | #include <stdio.h>
10 | 
11 | int main(){
12 | 	return 0;
13 | };
14 | 
15 | 
16 | 17 | Put some tags 18 | at the end 19 |
20 | 21 | Some text and tags here 22 | 23 |
24 | First line 25 |

Header 1

26 | Some text 27 |
28 | Some more text 29 |

Paragraph tag!

30 |

Header 2

31 |
32 |

Header 3

33 | Some text 34 |

Header 4

35 |
36 | More quoted text! 37 |
38 |

Paragraph tag!

39 | Final line 40 |
41 |
42 | Some ending text 43 | just to make sure -------------------------------------------------------------------------------- /tests/html/dom-processing.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | Hello 7 | 8 | -------------------------------------------------------------------------------- /tests/html/empty.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soundasleep/html2text/83502b6f8f1aaef8e2e238897199d64f284b4af3/tests/html/empty.html -------------------------------------------------------------------------------- /tests/html/full_email.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 22 | 23 | 24 | 38 | 39 |
25 | 26 | 27 | 31 | 35 | 36 |
28 |
37 |
40 | 41 | 42 | 43 | 52 | 53 |
44 |

45 | Hi Susan 46 |

47 |

48 | Here is your cat report. 49 |

50 | 51 |
54 | 55 | 56 | 57 | 58 | 59 | 60 | 69 | 70 |
61 | 62 | 63 |
64 |
65 | Find more cats 66 |
67 |
68 |
71 | 72 | 73 | 74 | 122 | 123 |
75 |
76 |

Down the road

77 |

Across the hall

78 | 79 |

Your achievements

80 | 81 | 82 | 83 | 88 | 89 | 90 | 91 | 92 | 93 |
84 |
You're currently finding about
85 |
12 cats
86 |
per day
87 |
 
Number of cats found
94 |
95 | 96 | 97 |
98 |
99 | 100 |

Your last cat was found two days ago.

101 |

One type of cat is a kitten.

102 | 103 | 104 | 105 | 108 | 112 | 115 | 116 | 117 |
106 | 107 | 109 |

Special account A1 110 |

111 |
113 |

12.345

114 |
118 | 119 |
120 | 121 |
124 | 125 | 130 | 131 | 132 | 133 | 188 | 189 |
134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 163 | 168 | 173 | 174 | 175 | 176 | 179 | 182 | 185 | 186 |

How can you find more cats?

Look in trash cans

Start meowing

Eat cat food

Some cats like to hang out in trash cans. Some cats do not.Some cats are attracted to similar tones.So one day your tears may smell like cat food, attracting more cats.
159 | 160 | 161 | 162 | 164 | 165 | 166 | 167 | 169 | 170 | 171 | 172 |
187 |
190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /tests/html/images.html: -------------------------------------------------------------------------------- 1 | 2 |

3 | One: 4 |

5 | 6 |

7 | Two: two 8 |

9 | 10 |

11 | Three: 12 |

13 | 14 |

15 | Four: four alt 16 |

17 | 18 |

With links

19 | 20 |

21 | One: 22 |

23 | 24 |

25 | Two: two 26 |

27 | 28 |

29 | Three: 30 |

31 | 32 |

33 | Four: four alt 34 |

35 | 36 |

With links with titles

37 | 38 |

39 | One: 40 |

41 | 42 |

43 | Two: two 44 |

45 | 46 |

47 | Three: 48 |

49 | 50 |

51 | Four: four alt 52 |

53 | 54 | -------------------------------------------------------------------------------- /tests/html/invalid.html: -------------------------------------------------------------------------------- 1 | Hello &nbsnbsp; world 2 |
openiaml.org or http://openiaml.org. 5 |

6 | 7 |

8 | To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org. 9 |

10 | 11 |

12 | To mail, email support@openiaml.org or mailto:support@openiaml.org 13 | or support@openiaml.org or mailto:support@openiaml.org. 14 |

15 | -------------------------------------------------------------------------------- /tests/html/msoffice.html: -------------------------------------------------------------------------------- 1 |

Dear html2text,

 

This is an example email that can be used to test html2text conversion of outlook / exchange emails.

 

The addition of <o:p> tags is very annoying!

This is a single line return

 

This is bold

This is italic

This is underline

 

Andrew

-------------------------------------------------------------------------------- /tests/html/nbsp.html: -------------------------------------------------------------------------------- 1 | hello   world & people < > &NBSP; -------------------------------------------------------------------------------- /tests/html/nested-divs.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | Just two divs 5 |
6 |
7 | Hanging out 8 |
9 |
Nested divs and line breaks

10 |
Nested divs and line breaks
More text
11 |

12 |
Just text
13 |
Just text
14 |
Just text

15 | This is the end! 16 | 17 | 18 | -------------------------------------------------------------------------------- /tests/html/newlines.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | Hello 5 |
6 |
7 |
8 | How are you? 9 |
10 |
11 | 12 |

13 | How are you? 14 |
15 |

16 | 17 |

18 | How are you? 19 |
20 |

21 | 22 |
23 | Just two divs 24 |
25 |
26 | Hanging out 27 |
28 | 29 | This is not the end! 30 |
31 | How are you again? 32 |
33 |
34 | This is the end! 35 |
36 | Just kidding 37 |

Header 1

38 | Some text 39 |
40 | Some more text 41 |

Paragraph tag!

42 |

Header 2

43 |
44 |

Header 3

45 | Some text 46 |

Header 4

47 |

Paragraph tag!

48 | Final line 49 | 50 | -------------------------------------------------------------------------------- /tests/html/non-breaking-spaces.html: -------------------------------------------------------------------------------- 1 | these spaces are non-breaking -------------------------------------------------------------------------------- /tests/html/pre.html: -------------------------------------------------------------------------------- 1 | Here is the code 2 |
 3 | #include <stdlib.h>
 4 | #include <stdio.h>
 5 | 
 6 | int main(){
 7 | 	return 0;
 8 | };
 9 | 
10 | 
-------------------------------------------------------------------------------- /tests/html/table.html: -------------------------------------------------------------------------------- 1 | 2 | Ignored Title 3 | 4 |

Hello, World!

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 17 | 20 | 21 | 22 | 25 | 28 | 29 | 30 | 33 | 36 | 37 | 38 | 39 | 40 | 43 | 46 | 47 | 48 | 49 | 50 |
Col ACol B
15 | Data A1 16 | 18 | Data B1 19 |
23 | Data A2 24 | 26 | Data B2 27 |
31 | Data A3 32 | 34 | Data B4 35 |
41 | Total A 42 | 44 | Total B 45 |
51 | 52 | 53 | -------------------------------------------------------------------------------- /tests/html/test3.html: -------------------------------------------------------------------------------- 1 | test one
test two -------------------------------------------------------------------------------- /tests/html/test4.html: -------------------------------------------------------------------------------- 1 | 1
2
3
4
5 < 6 -------------------------------------------------------------------------------- /tests/html/utf8-example.html: -------------------------------------------------------------------------------- 1 |
    2 |
  • ÅÄÖ
  • 3 |
  • åäö
  • 4 |
5 | -------------------------------------------------------------------------------- /tests/html/windows-1252-example.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soundasleep/html2text/83502b6f8f1aaef8e2e238897199d64f284b4af3/tests/html/windows-1252-example.html -------------------------------------------------------------------------------- /tests/html/zero-width-non-joiners.html: -------------------------------------------------------------------------------- 1 |

foo‌bar

2 | -------------------------------------------------------------------------------- /tests/txt/anchors.no-links.txt: -------------------------------------------------------------------------------- 1 | A document without any HTML open/closing tags. 2 | --------------------------------------------------------------- 3 | We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. visit foo.com - or http://www.foo.com link 4 | 5 | An anchor which will not appear -------------------------------------------------------------------------------- /tests/txt/anchors.txt: -------------------------------------------------------------------------------- 1 | A document without any HTML open/closing tags. 2 | --------------------------------------------------------------- 3 | We try and use the representation given by common browsers of the HTML document, so that it looks similar when converted to plain text. [visit foo.com](http://foo.com) - or http://www.foo.com [link](http://foo.com) 4 | 5 | [An anchor which will not appear] -------------------------------------------------------------------------------- /tests/txt/basic.no-links.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 2 | 3 | This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly. 4 | 5 | Even mismatched tags. 6 | 7 | A div 8 | Another div 9 | A div 10 | within a div 11 | 12 | Another line 13 | Yet another line 14 | 15 | A link -------------------------------------------------------------------------------- /tests/txt/basic.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 2 | 3 | This is some e-mail content. Even though it has whitespace and newlines, the e-mail converter will handle it correctly. 4 | 5 | Even mismatched tags. 
6 | 7 | A div 8 | Another div 9 | A div 10 | within a div 11 | 12 | Another line 13 | Yet another line 14 | 15 | [A link](http://foo.com) -------------------------------------------------------------------------------- /tests/txt/blockquotes.txt: -------------------------------------------------------------------------------- 1 | Hello 2 | 3 | > Nest some block quotes with preformated text 4 | > 5 | >> Here is the code 6 | >> 7 | >> #include 8 | >> #include 9 | >> 10 | >> int main(){ 11 | >> return 0; 12 | >> }; 13 | >> 14 | >> Put some tags at the end 15 | > 16 | > Some text and tags here 17 | > 18 | >> First line 19 | >> 20 | >> Header 1 21 | >> 22 | >> Some text 23 | >> --------------------------------------------------------------- 24 | >> Some more text 25 | >> 26 | >> Paragraph tag! 27 | >> 28 | >> Header 2 29 | >> 30 | >> --------------------------------------------------------------- 31 | >> 32 | >> Header 3 33 | >> 34 | >> Some text 35 | >> 36 | >> Header 4 37 | >> 38 | >>> More quoted text! 39 | >> 40 | >> Paragraph tag! 41 | >> 42 | >> Final line 43 | 44 | Some ending text just to make sure -------------------------------------------------------------------------------- /tests/txt/dom-processing.txt: -------------------------------------------------------------------------------- 1 | Hello -------------------------------------------------------------------------------- /tests/txt/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soundasleep/html2text/83502b6f8f1aaef8e2e238897199d64f284b4af3/tests/txt/empty.txt -------------------------------------------------------------------------------- /tests/txt/full_email.txt: -------------------------------------------------------------------------------- 1 | http://localhost/home 16 December 2015 2 | Account 123 3 | 4 | Hi Susan 5 | 6 | Here is your cat report. 
7 | 8 | You have found 5 cats less than anyone else 9 | [Find more cats](http://localhost/cats) 10 | 11 | Down the road 12 | 13 | Across the hall 14 | 15 | Your achievements 16 | 17 | You're currently finding about 18 | 12 cats 19 | per day 20 | 21 | [Number of cats found] 22 | --------------------------------------------------------------- 23 | 24 | Your last cat was found two days ago. 25 | 26 | One type of cat is a kitten. 27 | 28 | Special account A1 29 | 30 | 12.345 31 | 32 | http://localhost/logout 33 | 34 | How can you find more cats? 35 | 36 | Look in trash cans 37 | 38 | Start meowing 39 | 40 | Eat cat food 41 | 42 | Some cats like to hang out in trash cans. Some cats do not. Some cats are attracted to similar tones. So one day your tears may smell like cat food, attracting more cats. 43 | https://localhost/about https://localhost/about https://localhost/about 44 | [Cats are great.](https://github.com/soundasleep/html2text_ruby) [Find more cats.](https://github.com/soundasleep/html2text_ruby) [Do more things.](https://github.com/soundasleep/html2text_ruby) 45 | 46 | [Contact us](http://localhost/contact) 47 | 48 | cats@cats.com 49 | Monday and Friday 50 | 51 | https://github.com/soundasleep/html2text https://github.com/soundasleep/html2text_ruby 52 | 53 | Having trouble seeing this email? [View it online](http://localhost/view_it_online). 
-------------------------------------------------------------------------------- /tests/txt/images.txt: -------------------------------------------------------------------------------- 1 | One: 2 | 3 | Two: [two] 4 | 5 | Three: [three] 6 | 7 | Four: [four] 8 | 9 | With links 10 | 11 | One: http://localhost 12 | 13 | Two: [two](http://localhost) 14 | 15 | Three: [three](http://localhost) 16 | 17 | Four: [four](http://localhost) 18 | 19 | With links with titles 20 | 21 | One: [one link](http://localhost) 22 | 23 | Two: [two link](http://localhost) 24 | 25 | Three: [three link](http://localhost) 26 | 27 | Four: [four link](http://localhost) -------------------------------------------------------------------------------- /tests/txt/invalid.txt: -------------------------------------------------------------------------------- 1 | Hello &nbsnbsp; world -------------------------------------------------------------------------------- /tests/txt/lists.txt: -------------------------------------------------------------------------------- 1 | List tests 2 | 3 | Add some lists. 4 | 5 | - one 6 | - two 7 | - three 8 | 9 | An unordered list 10 | 11 | - one 12 | - two 13 | - three 14 | 15 | - one 16 | - two 17 | - three -------------------------------------------------------------------------------- /tests/txt/more-anchors.txt: -------------------------------------------------------------------------------- 1 | Anchor tests 2 | 3 | Visit http://openiaml.org or openiaml.org or http://openiaml.org. 4 | 5 | To visit with SSL, visit https://openiaml.org or openiaml.org or https://openiaml.org. 6 | 7 | To mail, email support@openiaml.org or mailto:support@openiaml.org or support@openiaml.org or mailto:support@openiaml.org. 
-------------------------------------------------------------------------------- /tests/txt/msoffice.txt: -------------------------------------------------------------------------------- 1 | Dear html2text, 2 | 3 | This is an example email that can be used to test html2text conversion of outlook / exchange emails. 4 | 5 | The addition of tags is very annoying! 6 | This is a single line return 7 | 8 | This is bold 9 | This is italic 10 | This is underline 11 | 12 | Andrew -------------------------------------------------------------------------------- /tests/txt/nbsp.txt: -------------------------------------------------------------------------------- 1 | hello world & people < > &NBSP; -------------------------------------------------------------------------------- /tests/txt/nested-divs.txt: -------------------------------------------------------------------------------- 1 | Just two divs 2 | Hanging out 3 | Nested divs and line breaks 4 | 5 | Nested divs and line breaks 6 | More text 7 | 8 | Just text 9 | Just text 10 | Just text 11 | 12 | This is the end! -------------------------------------------------------------------------------- /tests/txt/newlines.txt: -------------------------------------------------------------------------------- 1 | Hello 2 | How are you? 3 | 4 | How are you? 5 | 6 | How are you? 7 | 8 | Just two divs 9 | Hanging out 10 | This is not the end! 11 | How are you again? 12 | This is the end! 13 | Just kidding 14 | 15 | Header 1 16 | 17 | Some text 18 | --------------------------------------------------------------- 19 | Some more text 20 | 21 | Paragraph tag! 22 | 23 | Header 2 24 | 25 | --------------------------------------------------------------- 26 | 27 | Header 3 28 | 29 | Some text 30 | 31 | Header 4 32 | 33 | Paragraph tag! 
34 | 35 | Final line -------------------------------------------------------------------------------- /tests/txt/non-breaking-spaces.txt: -------------------------------------------------------------------------------- 1 | these spaces are non-breaking -------------------------------------------------------------------------------- /tests/txt/pre.txt: -------------------------------------------------------------------------------- 1 | Here is the code 2 | 3 | #include 4 | #include 5 | 6 | int main(){ 7 | return 0; 8 | }; -------------------------------------------------------------------------------- /tests/txt/table.txt: -------------------------------------------------------------------------------- 1 | Hello, World! 2 | 3 | Col A Col B 4 | Data A1 Data B1 5 | Data A2 Data B2 6 | Data A3 Data B4 7 | Total A Total B -------------------------------------------------------------------------------- /tests/txt/test3.txt: -------------------------------------------------------------------------------- 1 | test one 2 | test two -------------------------------------------------------------------------------- /tests/txt/test4.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 < 6 -------------------------------------------------------------------------------- /tests/txt/utf8-example.txt: -------------------------------------------------------------------------------- 1 | - ÅÄÖ 2 | - åäö -------------------------------------------------------------------------------- /tests/txt/windows-1252-example.txt: -------------------------------------------------------------------------------- 1 | - ÅÄÖ 2 | - åäö -------------------------------------------------------------------------------- /tests/txt/zero-width-non-joiners.txt: -------------------------------------------------------------------------------- 1 | foobar --------------------------------------------------------------------------------