├── .github └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── Readme.md ├── index.js ├── package.json └── test └── index.js /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | 3 | on: [ push, pull_request ] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-16.04 8 | strategy: 9 | matrix: 10 | node: [ '12', '14' ] 11 | name: Node ${{ matrix.node }} sample 12 | steps: 13 | - uses: actions/checkout@v2 14 | - uses: actions/setup-node@v2 15 | with: 16 | node-version: ${{ matrix.node }} 17 | - run: npm install 18 | - uses: paambaati/codeclimate-action@v2.7.5 19 | if: ${{ matrix.node == '14' }} 20 | env: 21 | CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} 22 | with: 23 | coverageCommand: npm run ci -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | .nyc_output 3 | coverage 4 | package-lock.json 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2021 Martin Heidegger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/martinheidegger/excerpt-html.svg?branch=master)](https://travis-ci.org/martinheidegger/excerpt-html) 2 | [![js-standard-style](https://img.shields.io/badge/code%20style-standard-brightgreen.svg)](http://standardjs.com/) 3 | [![Maintainability](https://api.codeclimate.com/v1/badges/d1b611efce3f2c6eeb98/maintainability)](https://codeclimate.com/github/martinheidegger/excerpt-html/maintainability) 4 | [![Test Coverage](https://api.codeclimate.com/v1/badges/d1b611efce3f2c6eeb98/test_coverage)](https://codeclimate.com/github/martinheidegger/excerpt-html/test_coverage) 5 | 6 | # excerpt-html 7 | 8 | parses a given html text for a good excerpt. 9 | 10 | # Install 11 | 12 | ``` 13 | $ npm i excerpt-html --save 14 | ``` 15 | 16 | # API usage 17 | 18 | ```JavaScript 19 | var htmlCode = '
<p>Hello world</p>
'; 20 | var excerptHtml = require('excerpt-html'); 21 | var excerpt = excerptHtml(htmlCode); 22 | ``` 23 | 24 | It will either use the first found paragraph or everything up to a 25 | 26 | `<!-- more -->` 27 | 28 | # Options 29 | 30 | You can specify a few options that modify the way the excerpt is parsed: 31 | 32 | ``` JavaScript 33 | excerptHtml(htmlCode, { 34 | moreRegExp: /\s*/i, // Search for the slug 35 | stripTags: true, // Set to false to get html code 36 | pruneLength: 140, // Amount of characters that the excerpt should contain 37 | pruneString: '…', // Character that will be added to the pruned string 38 | pruneSeparator: ' ', // Separator to be used to separate words 39 | }) 40 | ``` 41 | 42 | Note: `pruneLength` and `pruneString` only work when `stripTags` is set to `true` (default). 43 | 44 | # History 45 | 46 | To make this project we detached the code of [metalsmith-better-excerpts](https://github.com/simbo/metalsmith-better-excerpts) from `metalsmith`. 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | /** 3 | * Extracted from https://github.com/simbo/metalsmith-better-excerpts 4 | * (published under MIT license) 5 | */ 6 | 7 | const cheerio = require('cheerio') 8 | const unescapeHTML = require('he').unescape 9 | const stripTags = require('striptags') 10 | const truncate = require('lodash.truncate') 11 | 12 | /** 13 | * retrieve excerpt from file object by extracting contents until a 'more' tag 14 | * @param {string} html file object 15 | * @param {RegExp} regExp 'more' tag regexp 16 | * @return {string} excerpt string or undefined 17 | */ 18 | function getExcerptByMoreTag (html, regExp) { 19 | html = cheerio.load('' + html + '')('root').html() 20 | const match = html.search(regExp) 21 | if (match > -1) { 22 | const excerpt = html.slice(0, Buffer.byteLength(html.slice(0, match))) 23 | return 
unescapeHTML(excerpt) 24 | } 25 | } 26 | 27 | /** 28 | * retrieve excerpt from file object by extracting the first p's contents 29 | * @param {string} html file object 30 | * @return {string} excerpt string 31 | */ 32 | function getExcerptByFirstParagraph (html) { 33 | const $ = cheerio.load(html) 34 | const isEmpty = element => $(element).text().trim().length === 0 35 | const p = $('p').filter( 36 | (_index, element) => !isEmpty(element) 37 | ).first() 38 | 39 | const excerpt = p.length ? p.html().trim() : html 40 | return unescapeHTML(excerpt) 41 | } 42 | 43 | /** 44 | * @param {string} excerpt Already extracted excerpt 45 | * @param {Object} options stripping options 46 | * @param {number} [options.pruneLength] 47 | * @param {string} [options.pruneSeparator] 48 | * @param {string} [options.pruneString] 49 | * @return {string} The stripped and pruned excerpt 50 | */ 51 | function stripTagsFromExcerpt (excerpt, options) { 52 | excerpt = stripTags(excerpt) 53 | excerpt = excerpt.replace(/^\s+|\s+$|\s+(?=\s)/g, '') 54 | const pruneLength = typeof options.pruneLength === 'number' ? options.pruneLength : 140 55 | if (pruneLength > 0) { 56 | excerpt = truncate(excerpt, { 57 | length: pruneLength, 58 | omission: typeof options.pruneString === 'string' ? options.pruneString : '…', 59 | separator: typeof options.pruneSeparator === 'string' ? options.pruneSeparator : ' ' 60 | }) 61 | } 62 | return excerpt 63 | } 64 | 65 | /** 66 | * Extracts the raw excerpt (without stripped tags) from the html 67 | * 68 | * @param {string} html Html string to look for the excerpt 69 | * @param {RegExp} [moreRegExp=/\s*/i] RegExp used to look for the end of the excerpt 70 | * @return If found, the excerpt from the more tag, else the excerpt contained in the first
<p>
71 | */ 72 | function getRawExcerpt (html, moreRegExp) { 73 | if (!moreRegExp) { 74 | moreRegExp = /\s*/i 75 | } 76 | return getExcerptByMoreTag(html, moreRegExp) || getExcerptByFirstParagraph(html) 77 | } 78 | 79 | /** 80 | * Parses the excerpt for a given html string. 81 | * 82 | * @param {string} html Html code to parse for the excerpt. 83 | * @param {Object} [options] Options for parsing. 84 | * @param {RegExp} [options.moreRegExp=/\s*/i] Regexp to look for the end of the excerpt. If this is not found, the first non-empty paragraph (or the whole input when there is none) is used. 85 | * @param {boolean} [options.stripTags=true] Strip the tags from the html code when getting the excerpt. 86 | * @param {number} [options.pruneLength=140] Maximum size of the excerpt (only functional if stripTags=true) 87 | * @param {string} [options.pruneSeparator=' '] Character to look for when truncating a text 88 | * @param {string} [options.pruneString='…'] String to be attached if pruning needs to happen 89 | * @returns {string} The excerpt found in the given html code. 90 | */ 91 | module.exports = function excerptHtml (html, options) { 92 | if (!options) { 93 | options = {} 94 | } 95 | const rawExcerpt = getRawExcerpt(html, options.moreRegExp) 96 | if (options.stripTags === false) { 97 | return rawExcerpt 98 | } 99 | return stripTagsFromExcerpt(rawExcerpt, options) 100 | } 101 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "excerpt-html", 3 | "version": "1.2.2", 4 | "description": "Get the Excerpt from a markdown file (like in jekyll or *smith)", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "npm run lint && tap test/**", 8 | "ci": "npm run test -- --cov --coverage-report=lcov", 9 | "quick": "tap test/**", 10 | "coverage": "tap --coverage --coverage-report=html test/**", 11 | "lint": "standard" 12 | }, 13 | "keywords": [ 14 | "markdown", 15 | "blog", 16 | "excerpt", 17 | "text-processing" 18 | ], 19 
| "author": "Martin Heidegger ", 20 | "contributors": [ 21 | "Tim Carry" 22 | ], 23 | "license": "MIT", 24 | "dependencies": { 25 | "cheerio": "^0.22.0", 26 | "he": "^1.1.0", 27 | "lodash.truncate": "^4.4.2", 28 | "striptags": "^3.1.1" 29 | }, 30 | "devDependencies": { 31 | "standard": "^16.0.3", 32 | "tap": "^15.0.2" 33 | }, 34 | "repository": { 35 | "type": "git", 36 | "url": "https://github.com/martinheidegger/excerpt-html.git" 37 | }, 38 | "bugs": { 39 | "url": "https://github.com/martinheidegger/excerpt-html/issues" 40 | }, 41 | "homepage": "https://github.com/martinheidegger/excerpt-html" 42 | } 43 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('tap').test 4 | const excerptHtml = require('..') 5 | 6 | test('documentation example', function (t) { 7 | t.equal(excerptHtml('

Hello world

'), 'Hello world') 8 | t.end() 9 | }) 10 | 11 | test('non html example', function (t) { 12 | t.equal(excerptHtml('Hello world'), 'Hello world') 13 | t.end() 14 | }) 15 | 16 | test('more section without html tags', function (t) { 17 | t.equal(excerptHtml('Fancy text it is more than I need Is still here'), 'Fancy text it is more than I need') 18 | t.end() 19 | }) 20 | 21 | test('more section without html tags doesnt work with or without spaces', function (t) { 22 | t.equal(excerptHtml('Fancy text it is more than I need Is still here'), 'Fancy text it is more than I need') 23 | t.end() 24 | }) 25 | 26 | test('cut off by word at default', function (t) { 27 | t.equal(excerptHtml('Hello you', { 28 | pruneLength: 8, 29 | pruneString: '' 30 | }), 'Hello') 31 | t.end() 32 | }) 33 | 34 | test('cut off characters without prune separator', function (t) { 35 | t.equal(excerptHtml('Hello you', { 36 | pruneLength: 8, 37 | pruneString: '', 38 | pruneSeparator: '' 39 | }), 'Hello yo') 40 | t.end() 41 | }) 42 | 43 | test('cut off characters without prune separator', function (t) { 44 | t.equal(excerptHtml('Hello you', { 45 | pruneLength: 8, 46 | pruneSeparator: '' 47 | }), 'Hello y…') 48 | t.end() 49 | }) 50 | 51 | test('strip html tags', function (t) { 52 | t.equal(excerptHtml('

This is a fancy world, I think it might be weird to ask me.

', { 53 | stripTags: false 54 | }), 'This is a fancy world, I think it might be weird to ask me.') 55 | t.end() 56 | }) 57 | 58 | test('cropping stripped html tags doesnt work', function (t) { 59 | t.equal(excerptHtml('

This is a fancy world, I think it might be weird to ask me.

', { 60 | stripTags: false, 61 | pruneLength: 16 62 | }), 'This is a fancy world, I think it might be weird to ask me.') 63 | t.end() 64 | }) 65 | 66 | test('cropping stripped html tags doesnt work', function (t) { 67 | t.equal(excerptHtml('

Hello World

This is not taken

', { 68 | stripTags: false, 69 | pruneLength: 16 70 | }), 'Hello World') 71 | t.end() 72 | }) 73 | 74 | test('cropping stripped html tags doesnt work', function (t) { 75 | t.equal(excerptHtml('

Hello World

This is not taken

', { 76 | stripTags: false, 77 | pruneLength: 50 78 | }), 'Hello World') 79 | t.end() 80 | }) 81 | 82 | test('unescaping should work for all characters', function (t) { 83 | t.equal(excerptHtml('Hello & World ö ♥'), 'Hello & World ö ♥') 84 | t.end() 85 | }) 86 | 87 | test('dont prune text if pruneLength is < 1', function (t) { 88 | const longString = 'This is text. This text is longer than 140 characters, the default value for' + 89 | 'this method. If pruneLength is set to a number < 1 it will ignore the default' + 90 | 'limit of 140. Let us make the text a little longer.' 91 | t.equal(excerptHtml(longString, { 92 | pruneLength: -1 93 | }), longString) 94 | t.end() 95 | }) 96 | 97 | test('empty text', function (t) { 98 | t.equal(excerptHtml('', { 99 | pruneLength: -1 100 | }), '') 101 | t.end() 102 | }) 103 | 104 | test('make sure that empty tags are removed', function (t) { 105 | t.equal(excerptHtml('

test

', { 106 | stripTags: true 107 | }), 'test') 108 | t.end() 109 | }) 110 | 111 | test('using a custom regExp to cut the excertp', function (t) { 112 | t.equals(excerptHtml('a $ b c', { moreRegExp: /\$/ }), 'a') 113 | t.end() 114 | }) 115 | 116 | test('ignore empty first paragraphs', function (t) { 117 | t.equals( 118 | excerptHtml( 119 | '

hello

test

' 120 | , {} 121 | ), 122 | 'test' 123 | ) 124 | t.end() 125 | }) 126 | --------------------------------------------------------------------------------