├── logos ├── logo-box-builtby.png └── logo-box-madefor.png ├── .gitignore ├── package.json ├── LICENSE.md ├── tests └── test.js ├── README.md └── index.js /logos/logo-box-builtby.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apostrophecms/split-html/main/logos/logo-box-builtby.png -------------------------------------------------------------------------------- /logos/logo-box-madefor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apostrophecms/split-html/main/logos/logo-box-madefor.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /package-lock.json 2 | npm-debug.log 3 | *.DS_Store 4 | /node_modules 5 | # We do not commit CSS, only LESS 6 | /public/css/*.css 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "split-html", 3 | "version": "1.1.0", 4 | "description": "Split HTML into two valid fragments wherever a certain selector is matched. 
Works on the server side.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "mocha tests/test.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/apostrophecms/split-html" 12 | }, 13 | "keywords": [ 14 | "html", 15 | "split", 16 | "split html", 17 | "fragments", 18 | "html fragments", 19 | "dom" 20 | ], 21 | "author": "Apostrophe Technologies, Inc", 22 | "license": "MIT", 23 | "bugs": { 24 | "url": "https://github.com/apostrophecms/split-html/issues" 25 | }, 26 | "homepage": "https://github.com/apostrophecms/split-html", 27 | "dependencies": { 28 | "cheerio": "^0.22.0" 29 | }, 30 | "devDependencies": { 31 | "mocha": "^10.2.0" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 P'unk Avenue LLC 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /tests/test.js: -------------------------------------------------------------------------------- 1 | /* Mocha suite for splitHtml, loaded from ../index.js. Each case calls splitHtml(html, selector) or splitHtml(html, selector, testFn) and asserts on the length and contents of the returned array. NOTE(review): the HTML markup inside the string literals below appears to have been stripped when this listing was produced (for example the free text case expects three parts from a string that shows no img element), so every fixture and expected value here should be confirmed against the real tests/test.js before relying on it. */ var assert = require("assert"); 2 | var splitHtml = require("../index.js"); 3 | describe('splitHtml', function(){ 4 | it('Leaves img-free HTML alone when splitting on img', function() { 5 | var html = '

Hi there.

This is fun.

Loose text.'; 6 | var result = splitHtml(html, 'img'); 7 | assert(result.length === 1); 8 | assert(result[0] === html); 9 | }); 10 | it('Splits into the correct three components in the presence of img', function() { 11 | var html = '

First component.

Second component.

'; 12 | var result = splitHtml(html, 'img'); 13 | assert(result.length === 3); 14 | assert(result[0] === '

First component.

'); 15 | assert(result[1] === ''); 16 | assert(result[2] === '

Second component.

'); 17 | }); 18 | it('Correctly closes open tags in the first component and reopens them at the start of the second component', function() { 19 | var html = '

First component.

Second component.

'; 20 | var result = splitHtml(html, 'img'); 21 | assert(result.length === 3); 22 | assert(result[0] === '

First component.

'); 23 | assert(result[1] === ''); 24 | assert(result[2] === '

Second component.

'); 25 | }); 26 | it('Works with free text', function() { 27 | var html = 'Text oneText two'; 28 | var result = splitHtml(html, 'img'); 29 | assert(result.length === 3); 30 | assert(result[0] === 'Text one'); 31 | assert(result[1] === ''); 32 | assert(result[2] === 'Text two'); 33 | }); 34 | it('Works with multiple instances', function() { 35 | var html = '

First component.

Second component.

Third component.

'; 36 | var result = splitHtml(html, 'img'); 37 | assert(result.length === 5); 38 | assert(result[0] === '

First component.

'); 39 | assert(result[1] === ''); 40 | assert(result[2] === '

Second component.

'); 41 | assert(result[3] === ''); 42 | assert(result[4] === '

Third component.

'); 43 | }); 44 | it('Respects test function', function() { 45 | var html = '

First component.

Second component.

Link text.

More text in second component.

'; 46 | var result = splitHtml(html, 'a', function($el) { 47 | if ($el.find('img').length) { 48 | return true; 49 | } else { 50 | return false; 51 | } 52 | }); 53 | assert(result.length === 3); 54 | assert(result[0] === '

First component.

'); 55 | assert(result[1] === ''); 56 | assert(result[2] === '

Second component.

Link text.

More text in second component.

'); 57 | }); 58 | it('Respects a split at the end of a word', function() { 59 | var html = '

One

Two

Three

Four

'; 60 | var result = splitHtml(html, 'span[data-split-marker]'); 61 | assert(result.length === 3); 62 | assert(result[0] === '

One

Two

'); 63 | assert(result[1] === ''); 64 | assert(result[2] === '

Three

Four

'); 65 | }); 66 | it('Splits properly when parents are multiple levels deep', function() { 67 | var html = '

Hello!Goodbye.

'; 68 | var result = splitHtml(html, 'span[data-split-marker]'); 69 | assert(result.length === 3); 70 | assert(result[0] === '

Hello!

'); 71 | assert(result[1] === ''); 72 | assert(result[2] === '

Goodbye.

'); 73 | }); 74 | }); 75 | 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | split-html 2 | ========== 3 | 4 | 5 | 6 | Given a string containing an HTML fragment, split that string into two or more **correctly balanced** HTML fragments wherever the specified selector is found. Returns both the new fragments and the elements that matched the selector, in alternation. **Works on the server and in the browser.** Powered by Cheerio on the server side, jQuery in the browser. 7 | 8 | ```javascript 9 | var splitHtml = require('split-html'); 10 | var html = '
' + 11 | '

First component.

' + 12 | '' + 13 | '

Second component.

' + 14 | '
'; 15 | var fragments = splitHtml(html, 'img'); 16 | console.log(fragments); 17 | ``` 18 | 19 | This outputs: 20 | 21 | ```javascript 22 | [ 23 | '

First component.

', 24 | '', 25 | '

Second component.

' 26 | ] 27 | ``` 28 | 29 | Note that the `img` itself is returned. The first element in the array is always an HTML fragment, the second is always an element that matched the selector, and so on in alternation. 30 | 31 | **Any container tags already open when the `img` tag is encountered are automatically closed at the end of the first fragment and re-opened at the start of the next one with the same attributes.** 32 | 33 | ## Optional test function 34 | 35 | If a jQuery/CSS-style selector isn't specific enough, you can pass a function as the third argument. This function is called with a Cheerio or jQuery object representing the matching element. If you want to split around this element, return `true`. Otherwise, return `false`. 36 | 37 | This is useful because Cheerio does not currently support `:has`, and also because in some situations even `:has` might not be specific enough. 38 | 39 | ```javascript 40 | // Split on 'a', but only if it contains 'img' 41 | var result = splitHtml(html, 'a', function($el) { 42 | if ($el.find('img').length) { 43 | return true; 44 | } else { 45 | return false; 46 | } 47 | }); 48 | ``` 49 | 50 | ## Additional options 51 | 52 | The following options are passed to `splitHtml` as properties of an object given as the fourth argument. 53 | 54 | ### `cheerio` 55 | 56 | An object of Cheerio v0.x `.load()` options, [as documented here](https://www.npmjs.com/package/cheerio/v/0.22.0). These options are handed to Cheerio whenever split-html parses HTML on the server; they have no effect in the browser, where jQuery does the parsing. 57 | 58 | ## Why? 59 | 60 | We wanted to import WordPress blog posts into [ApostropheCMS](https://apostrophecms.com). WordPress uses HTML to embed images and videos, while Apostrophe represents blocks of text and widgets like slideshows as separate objects in an array. `split-html` allows us to neatly slice and dice existing HTML so we can transform it into Apostrophe widgets easily. 61 | 62 | ## What about errors? 
63 | 64 | If `split-html` encounters something it can't figure out, such as terrible markup, it will return the original string as the only element in the array. 65 | 66 | ## Using split-html in the browser 67 | 68 | `split-html` has been coded to work with either Cheerio or actual jQuery. In the browser it automatically uses the global `jQuery` in place of Cheerio, so jQuery must be loaded before `split-html` is called. We use this feature in production in [ApostropheCMS](https://apostrophecms.com)'s rich text editor. 69 | 70 | ## About P'unk Avenue and ApostropheCMS 71 | 72 | `split-html` was created at [P'unk Avenue](https://punkave.com) for use in many projects built with Apostrophe, an open-source content management system built on node.js. If you like `split-html` you should definitely [check out apostrophecms.com](https://apostrophecms.com). 73 | 74 | ## Support 75 | 76 | Feel free to open issues on [GitHub](https://github.com/apostrophecms/split-html/issues). 77 | 78 | 79 | 80 | ## Changelog 81 | 82 | ### 1.1.0 - 2020-09-09 83 | 84 | * Adds the Cheerio configuration option in a fourth options argument. 85 | 86 | ### CHANGES IN 1.0.3 87 | 88 | * Included an explicit LICENSE.md file (no change, still MIT licensed). No changes in functionality. 89 | 90 | ### CHANGES IN 1.0.2 91 | 92 | * Undeclared variable fixed. No functional changes. 93 | 94 | ### CHANGES IN 1.0.1 95 | 96 | * Clarified that this code is mature for browser use as well. No code changes. 97 | 98 | ### CHANGES IN 1.0.0 99 | 100 | * Updated documentation and released 1.0.0 stable. No code changes. 101 | 102 | ### CHANGES IN 0.1.1 103 | 104 | * Works correctly with actual jQuery, in addition to working correctly in node with Cheerio as before. This required changes to be more pedantic about closing parent tags in the first fragment, and a better simulation of Cheerio's document object. 105 | * Handles nested parent elements correctly. 106 | 107 | ### CHANGES IN 0.1.0 108 | 109 | Initial release. With shiny unit tests, of course. 
// index.js
//
// split-html: split an HTML string into balanced fragments around every
// element that matches a selector. Runs under node (parsing with Cheerio)
// and in the browser (parsing with jQuery).
(function() {
  var cheerio;
  if (typeof window === 'undefined') {
    // node: export through CommonJS and parse with the real Cheerio.
    module.exports = splitHtml;
    cheerio = require('cheerio');
  } else {
    // Browser: publish splitHtml as a global and use actual jQuery in
    // place of Cheerio by building a simulated cheerio object.
    window.splitHtml = splitHtml;
    cheerio = {
      // Imitates cheerio.load(): parks the parsed markup inside a detached
      // wrapper element and returns a query function that also has .html().
      // Only `html` is read; a second (options) argument is ignored.
      load: function(html) {
        // The data-cheerio-root attribute lets splitHtml recognise this
        // artificial wrapper while walking a match's ancestors.
        var $wrapper = jQuery('<div data-cheerio-root></div>');
        // NOTE(review): jQuery(string) only parses the string as markup
        // when it starts with "<"; otherwise it is treated as a selector.
        // Input that begins with loose text may therefore not survive this
        // call. Confirm against the jQuery version in use.
        var $el = jQuery(html);
        $wrapper.append($el);
        // Markup strings create new elements; anything else is a selector
        // resolved against the wrapped document.
        function c(s) {
          if (s[0] === '<') {
            return jQuery(s);
          }
          return $wrapper.find(s);
        }
        // Serialises the wrapped document (wrapper tag itself excluded).
        c.html = function() {
          return $wrapper.html();
        };
        return c;
      },
    };
  }

  /**
   * Split `html` wherever an element matching `splitOn` is found.
   *
   * @param {string} html - HTML fragment to split.
   * @param {string} splitOn - Selector identifying the elements to split on.
   * @param {function} [test] - Called with the Cheerio/jQuery object for each
   *   candidate element; return a truthy value to split on it. When omitted,
   *   every element matching the selector is split on.
   * @param {Object} [opts] - Options. `opts.cheerio` is passed as the second
   *   argument to `cheerio.load()` each time markup is parsed.
   * @returns {string[]} Fragments and matched elements in alternation:
   *   fragment, match, fragment, ... The final entry is always the remaining
   *   markup, so input with no accepted match yields `[html]`.
   */
  function splitHtml(html, splitOn, test, opts) {
    if (!test) {
      test = function($el) {
        return true;
      };
    }
    opts = opts || {};
    var cheerioOpts = opts.cheerio || null;
    var result = [];
    // Random attribute names: one marks the element currently being split
    // on, the other marks candidates that were rejected by test().
    var splitAttr = 'data-' + token();
    var ignoreAttr = 'data-' + token();
    var $;
    var $matches;
    var $match;
    var $wrapper;
    var $parents;
    var $original;
    var $parent;
    var i;
    var markup;
    var splitAt;
    var leftAt;
    var first;
    var second;
    var reopen;
    var parentMarkup;
    var endTagAt;
    var parentTag;
    var matchMarkup;

    // Each pass peels off one match: everything before it becomes a
    // fragment, the match itself is emitted, and the loop continues with
    // whatever follows.
    while (true) {
      $ = cheerio.load(html, cheerioOpts);
      $matches = $(splitOn);
      $match = null;
      // Take the first candidate, in document order, that test() accepts.
      // Rejected candidates are tagged so they can be told apart.
      for (i = 0; (i < $matches.length); i++) {
        $match = $matches.eq(i);
        if ((!$match.attr(ignoreAttr)) && test($match)) {
          break;
        }
        $match.attr(ignoreAttr, '1');
        $match = null;
      }
      if (!$match) {
        // Nothing left to split on: the remainder is the last fragment.
        result.push(html);
        break;
      }

      // Tag the match, serialise, and locate the "<" that opens its tag.
      $match.attr(splitAttr, '1');
      markup = $.html();
      splitAt = markup.indexOf(splitAttr);
      leftAt = (splitAt === -1) ? -1 : markup.lastIndexOf('<', splitAt);
      if (leftAt === -1) {
        // The marker could not be located in the serialised markup; stop
        // splitting and return the remainder untouched.
        result.push(html);
        break;
      }
      first = markup.slice(0, leftAt);

      // For the second segment we need to reopen the tags that were still
      // open in the first segment. Walk the ancestors, innermost first,
      // closing each one in `first` and prepending its opening tag
      // (attributes included) to `reopen`.
      reopen = '';
      $wrapper = cheerio.load('<div></div>')('div').eq(0);
      $parents = $match.parents();
      for (i = 0; (i < $parents.length); i++) {
        $original = $parents.eq(i);
        if ($original.is('[data-cheerio-root]')) {
          // Simulated cheerio used in browser has a wrapper element
          break;
        }
        // Serialise an emptied clone to obtain just the opening tag.
        $parent = $original.clone();
        $parent.empty();
        $wrapper.empty();
        $wrapper.append($parent);
        parentMarkup = $wrapper.html();
        endTagAt = parentMarkup.indexOf('>');
        parentTag = tagName($parent);
        // Cheerio tolerates missing closing tags, but real jQuery will
        // discard any text preceding them, so play nice
        first += '</' + parentTag + '>';
        reopen = parentMarkup.slice(0, endTagAt + 1) + reopen;
      }

      // We can't just split off the next fragment at ">" because the
      // matching tag may be a container. Move it to a wrapper to get its
      // full markup, then remove it from the original document. The
      // remainder of the original document now begins where the matching
      // tag used to.
      $wrapper = cheerio.load('<div></div>')('div').eq(0);
      $match.removeAttr(splitAttr);
      $wrapper.append($match);
      matchMarkup = $wrapper.html();
      $match.remove();
      markup = $.html();
      second = reopen + markup.slice(leftAt);

      // Re-parsing lets the parser balance the fragment and mops up the
      // attributes used to mark rejected candidates.
      first = cleanup(first);
      result.push(first);
      result.push(matchMarkup);
      html = cleanup(second);
    }
    return result;

    // Parse a fragment, strip the bookkeeping attribute added to rejected
    // candidates, and serialise it again.
    function cleanup(fragment) {
      var $fragment = cheerio.load(fragment, cheerioOpts);
      $fragment('[' + ignoreAttr + ']').removeAttr(ignoreAttr);
      return $fragment.html();
    }

    // Random decimal string used to build throwaway attribute names.
    function token() {
      return Math.floor(Math.random() * 1000000000).toString();
    }
  }

  // Tag name of the first element in the set. Different in DOM and
  // Cheerio: DOM nodes expose `tagName`, Cheerio nodes expose `name`.
  // Cheerio doesn't support prop() either.
  function tagName($el) {
    return $el[0].tagName || $el[0].name;
  }
})();