├── .gitignore
├── .github
    └── workflows
    │   └── test.yml
├── LICENSE
├── package.json
├── Readme.md
├── test
    └── index.js
└── index.js


/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 | .nyc_output
3 | coverage
4 | package-lock.json
5 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Testing
 2 | 
 3 | on: [ push, pull_request ]
 4 | 
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest # the ubuntu-16.04 image is no longer offered for GitHub-hosted runners
 8 |     strategy:
 9 |       matrix:
10 |         node: [ '12', '14' ]
11 |     name: Node ${{ matrix.node }} sample
12 |     steps:
13 |       - uses: actions/checkout@v2
14 |       - uses: actions/setup-node@v2
15 |         with:
16 |           node-version: ${{ matrix.node }}
17 |       - run: npm install
18 |       - uses: paambaati/codeclimate-action@v2.7.5
19 |         if: ${{ matrix.node == '14' }} # the tests run only within this step, i.e. only in the Node 14 job
20 |         env:
21 |           CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
22 |         with:
23 |           coverageCommand: npm run ci


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015-2021 Martin Heidegger
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "excerpt-html",
 3 |   "version": "1.2.2",
 4 |   "description": "Get the Excerpt from a markdown file (like in jekyll or *smith)",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "npm run lint && tap test/**",
 8 |     "ci": "npm run test -- --cov --coverage-report=lcov",
 9 |     "quick": "tap test/**",
10 |     "coverage": "tap --coverage --coverage-report=html test/**",
11 |     "lint": "standard"
12 |   },
13 |   "keywords": [
14 |     "markdown",
15 |     "blog",
16 |     "excerpt",
17 |     "text-processing"
18 |   ],
19 |   "author": "Martin Heidegger <martin.heidegger@gmail.com>",
20 |   "contributors": [
21 |     "Tim Carry"
22 |   ],
23 |   "license": "MIT",
24 |   "dependencies": {
25 |     "cheerio": "^0.22.0",
26 |     "he": "^1.1.0",
27 |     "lodash.truncate": "^4.4.2",
28 |     "striptags": "^3.1.1"
29 |   },
30 |   "devDependencies": {
31 |     "standard": "^16.0.3",
32 |     "tap": "^15.0.2"
33 |   },
34 |   "repository": {
35 |     "type": "git",
36 |     "url": "https://github.com/martinheidegger/excerpt-html.git"
37 |   },
38 |   "bugs": {
39 |     "url": "https://github.com/martinheidegger/excerpt-html/issues"
40 |   },
41 |   "homepage": "https://github.com/martinheidegger/excerpt-html"
42 | }
43 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | [![Testing](https://github.com/martinheidegger/excerpt-html/actions/workflows/test.yml/badge.svg)](https://github.com/martinheidegger/excerpt-html/actions/workflows/test.yml)
 2 | [![js-standard-style](https://img.shields.io/badge/code%20style-standard-brightgreen.svg)](http://standardjs.com/)
 3 | [![Maintainability](https://api.codeclimate.com/v1/badges/d1b611efce3f2c6eeb98/maintainability)](https://codeclimate.com/github/martinheidegger/excerpt-html/maintainability)
 4 | [![Test Coverage](https://api.codeclimate.com/v1/badges/d1b611efce3f2c6eeb98/test_coverage)](https://codeclimate.com/github/martinheidegger/excerpt-html/test_coverage)
 5 | 
 6 | # excerpt-html
 7 | 
 8 | Parses a given HTML text for a good excerpt.
 9 | 
10 | # Install
11 | 
12 | ```
13 | $ npm i excerpt-html --save
14 | ```
15 | 
16 | # API usage
17 | 
18 | ```JavaScript
19 | var htmlCode = '<p>Hello world</p>';
20 | var excerptHtml = require('excerpt-html');
21 | var excerpt = excerptHtml(htmlCode);
22 | ```
23 | 
24 | It will use everything in front of the following comment or, if the HTML does not contain it, the content of the first non-empty paragraph:
25 | 
26 | `<!-- more -->`
27 | 
28 | # Options
29 | 
30 | You can specify a few options that modify the way the excerpt is parsed:
31 | 
32 | ``` JavaScript
33 | excerptHtml(htmlCode, {
34 |     moreRegExp:  /\s*<!--\s*more\s*-->/i,  // RegExp that marks the end of the excerpt
35 |     stripTags:   true, // Set to false to get html code
36 |     pruneLength: 140, // Maximum amount of characters of the excerpt, values below 1 disable the pruning
37 |     pruneString: '…', // String that will be added to the pruned string
38 |     pruneSeparator: ' ', // Separator to be used to separate words
39 | })
40 | ```
41 | 
42 | Note: `pruneLength`, `pruneString` and `pruneSeparator` only work when `stripTags` is set to `true` (default).
43 | 
44 | # History
45 | 
46 | To make this project we detached the code of [metalsmith-better-excerpts](https://github.com/simbo/metalsmith-better-excerpts) from `metalsmith`.
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
  1 | 'use strict'
  2 | 
  3 | const test = require('tap').test
  4 | const excerptHtml = require('..')
  5 | 
  6 | test('documentation example', function (t) {
  7 |   t.equal(excerptHtml('<p>Hello world</p>'), 'Hello world')
  8 |   t.end()
  9 | })
 10 | 
 11 | test('non html example', function (t) {
 12 |   t.equal(excerptHtml('Hello world'), 'Hello world')
 13 |   t.end()
 14 | })
 15 | 
 16 | test('more section without html tags', function (t) {
 17 |   t.equal(excerptHtml('Fancy text it is more than I need <!-- more --> Is still here'), 'Fancy text it is more than I need')
 18 |   t.end()
 19 | })
 20 | 
 21 | test('more section without html tags doesnt work with or without spaces', function (t) {
 22 |   t.equal(excerptHtml('Fancy text it is more than I need<!-- more --> Is still here'), 'Fancy text it is more than I need')
 23 |   t.end()
 24 | })
 25 | 
 26 | test('cut off by word at default', function (t) {
 27 |   t.equal(excerptHtml('Hello you', {
 28 |     pruneLength: 8,
 29 |     pruneString: ''
 30 |   }), 'Hello')
 31 |   t.end()
 32 | })
 33 | 
 34 | test('cut off characters without prune separator', function (t) {
 35 |   t.equal(excerptHtml('Hello you', {
 36 |     pruneLength: 8,
 37 |     pruneString: '',
 38 |     pruneSeparator: ''
 39 |   }), 'Hello yo')
 40 |   t.end()
 41 | })
 42 | 
 43 | test('cut off characters without prune separator', function (t) {
 44 |   t.equal(excerptHtml('Hello you', {
 45 |     pruneLength: 8,
 46 |     pruneSeparator: ''
 47 |   }), 'Hello y…')
 48 |   t.end()
 49 | })
 50 | 
 51 | test('strip html tags', function (t) {
 52 |   t.equal(excerptHtml('<p>This is a <b>fancy world, I think</b> it might be weird to ask me.</p>', {
 53 |     stripTags: false
 54 |   }), 'This is a <b>fancy world, I think</b> it might be weird to ask me.')
 55 |   t.end()
 56 | })
 57 | 
 58 | test('cropping stripped html tags doesnt work', function (t) {
 59 |   t.equal(excerptHtml('<p>This is a <b>fancy world, I think</b> it might be weird to ask me.</p>', {
 60 |     stripTags: false,
 61 |     pruneLength: 16
 62 |   }), 'This is a <b>fancy world, I think</b> it might be weird to ask me.')
 63 |   t.end()
 64 | })
 65 | 
 66 | test('cropping stripped html tags doesnt work', function (t) {
 67 |   t.equal(excerptHtml('<p>Hello World</p><p>This is not taken</p>', {
 68 |     stripTags: false,
 69 |     pruneLength: 16
 70 |   }), 'Hello World')
 71 |   t.end()
 72 | })
 73 | 
 74 | test('cropping stripped html tags doesnt work', function (t) {
 75 |   t.equal(excerptHtml('<p>Hello World</p><p>This is not taken</p>', {
 76 |     stripTags: false,
 77 |     pruneLength: 50
 78 |   }), 'Hello World')
 79 |   t.end()
 80 | })
 81 | 
 82 | test('unescaping should work for all characters', function (t) {
 83 |   t.equal(excerptHtml('Hello &amp; World &ouml; &hearts;'), 'Hello & World ö ♥')
 84 |   t.end()
 85 | })
 86 | 
 87 | test('dont prune text if pruneLength is < 1', function (t) {
 88 |   const longString = 'This is text. This text is longer than 140 characters, the default value for' +
 89 |     'this method. If pruneLength is set to a number < 1 it will ignore the default' +
 90 |     'limit of 140. Let us make the text a little longer.'
 91 |   t.equal(excerptHtml(longString, {
 92 |     pruneLength: -1
 93 |   }), longString)
 94 |   t.end()
 95 | })
 96 | 
 97 | test('empty text', function (t) {
 98 |   t.equal(excerptHtml('', {
 99 |     pruneLength: -1
100 |   }), '')
101 |   t.end()
102 | })
103 | 
104 | test('make sure that empty tags are removed', function (t) {
105 |   t.equal(excerptHtml('<p><img></img> <img></img> test <img></img> </p>', {
106 |     stripTags: true
107 |   }), 'test')
108 |   t.end()
109 | })
110 | 
111 | test('using a custom regExp to cut the excertp', function (t) {
112 |   t.equals(excerptHtml('a $ b c', { moreRegExp: /\$/ }), 'a')
113 |   t.end()
114 | })
115 | 
116 | test('ignore empty first paragraphs', function (t) {
117 |   t.equals(
118 |     excerptHtml(
119 |       '<p><img src="cat.png" alt="hello" title="test" /></p><p><img src="dog.png" /></p><p>test</p>'
120 |       , {}
121 |     ),
122 |     'test'
123 |   )
124 |   t.end()
125 | })
126 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | 'use strict'
  2 | /**
  3 |  * Extracted from https://github.com/simbo/metalsmith-better-excerpts
  4 |  * (published under MIT license)
  5 |  */
  6 | 
  7 | const cheerio = require('cheerio')
  8 | const unescapeHTML = require('he').unescape
  9 | const stripTags = require('striptags')
 10 | const truncate = require('lodash.truncate')
 11 | 
 12 | /**
 13 |  * retrieve excerpt from file object by extracting contents until a 'more' tag
 14 |  * @param  {string} html   file object
 15 |  * @param  {RegExp} regExp 'more' tag regexp
 16 |  * @return {string}         excerpt string or undefined
 17 |  */
 18 | function getExcerptByMoreTag (html, regExp) {
 19 |   html = cheerio.load('<root>' + html + '</root>')('root').html()
 20 |   const match = html.search(regExp)
 21 |   if (match > -1) {
 22 |     const excerpt = html.slice(0, Buffer.byteLength(html.slice(0, match)))
 23 |     return unescapeHTML(excerpt)
 24 |   }
 25 | }
 26 | 
 27 | /**
 28 |  * retrieve excerpt from file object by extracting the first p's contents
 29 |  * @param  {string} html file object
 30 |  * @return {string}       excerpt string
 31 |  */
 32 | function getExcerptByFirstParagraph (html) {
 33 |   const $ = cheerio.load(html)
 34 |   const isEmpty = element => $(element).text().trim().length === 0
 35 |   const p = $('p').filter(
 36 |     (_index, element) => !isEmpty(element)
 37 |   ).first()
 38 | 
 39 |   const excerpt = p.length ? p.html().trim() : html
 40 |   return unescapeHTML(excerpt)
 41 | }
 42 | 
 43 | /**
 44 |  * @param {string} excerpt Already extracted excerpt
 45 |  * @param {Object} options stripping options
 46 |  * @param {number} [options.pruneLength]
 47 |  * @param {string} [options.pruneSeparator]
 48 |  * @param {string} [options.pruneString]
 49 |  * @return {string} The striped and pruned excerpt
 50 |  */
 51 | function stripTagsFromExcerpt (excerpt, options) {
 52 |   excerpt = stripTags(excerpt)
 53 |   excerpt = excerpt.replace(/^\s+|\s+$|\s+(?=\s)/g, '')
 54 |   const pruneLength = typeof options.pruneLength === 'number' ? options.pruneLength : 140
 55 |   if (pruneLength > 0) {
 56 |     excerpt = truncate(excerpt, {
 57 |       length: pruneLength,
 58 |       omission: typeof options.pruneString === 'string' ? options.pruneString : '…',
 59 |       separator: typeof options.pruneSeparator === 'string' ? options.pruneSeparator : ' '
 60 |     })
 61 |   }
 62 |   return excerpt
 63 | }
 64 | 
 65 | /**
 66 |  * Extracts the raw excerpt (without stripped tags) from the html
 67 |  *
 68 |  * @param {string} html Html string to look for the excerpt
 69 |  * @param {RegExp} [moreRegExp=/\s*<!--\s*more\s*-->/i] RegExp used to look for the end of the excerpt
 70 |  * @return If found, the excerpt from the more tag, else the excerpt contained in the first <p></p>
 71 |  */
 72 | function getRawExcerpt (html, moreRegExp) {
 73 |   if (!moreRegExp) {
 74 |     moreRegExp = /\s*<!--\s*more\s*-->/i
 75 |   }
 76 |   return getExcerptByMoreTag(html, moreRegExp) || getExcerptByFirstParagraph(html)
 77 | }
 78 | 
 79 | /**
 80 |  * Parses the excerpt for a given html string.
 81 |  *
 82 |  * @param {string}  html Html code to parse for the excerpt.
 83 |  * @param {Object}  [options] Options for parsing.
 84 |  * @param {RegExp}  [options.moreRegExp=/\s*<!--\s*more\s*-->/i] Regexp to look for the end of the excerpt. If this is not found
 85 |  * @param {boolean} [options.stripTags=true] Strip the tags from the html code when getting the excerpt.
 86 |  * @param {number}  [options.pruneLength=140] Maximum size of the excerpt (only functional if stripTags=true)
 87 |  * @param {string}  [options.pruneSeparator=' '] Character to look for when truncating a text
 88 |  * @param {string}  [options.pruneString='…'] String to be attached if pruning needs to happen
 89 |  * @returns {string} The excerpt found in the given html code.
 90 |  */
 91 | module.exports = function excerptHtml (html, options) {
 92 |   if (!options) {
 93 |     options = {}
 94 |   }
 95 |   const rawExcerpt = getRawExcerpt(html, options.moreRegExp)
 96 |   if (options.stripTags === false) {
 97 |     return rawExcerpt
 98 |   }
 99 |   return stripTagsFromExcerpt(rawExcerpt, options)
100 | }
101 | 


--------------------------------------------------------------------------------