├── .eslintrc.js ├── .github └── workflows │ ├── lint.yml │ └── ci.yml ├── package.json ├── LICENSE ├── test └── test.js ├── README.md └── lib └── sbmh.js /.eslintrc.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = { 4 | extends: '@mscdex/eslint-config', 5 | }; 6 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [ master ] 7 | 8 | env: 9 | NODE_VERSION: 18.x 10 | 11 | jobs: 12 | lint-js: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | with: 17 | persist-credentials: false 18 | - name: Use Node.js ${{ env.NODE_VERSION }} 19 | uses: actions/setup-node@v3 20 | with: 21 | node-version: ${{ env.NODE_VERSION }} 22 | - name: Check Node.js version 23 | run: node -pe process.versions 24 | - name: Install ESLint + ESLint configs/plugins 25 | run: npm install --only=dev 26 | - name: Lint files 27 | run: npm run lint 28 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [ master ] 7 | 8 | jobs: 9 | tests-linux: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | node-version: [10.x, 12.x, 14.x, 16.x, 18.x, 20.x] 15 | steps: 16 | - uses: actions/checkout@v3 17 | with: 18 | persist-credentials: false 19 | - name: Use Node.js ${{ matrix.node-version }} 20 | uses: actions/setup-node@v3 21 | with: 22 | node-version: ${{ matrix.node-version }} 23 | - name: Check Node.js version 24 | run: node -pe process.versions 25 | - name: Install module 26 | run: npm install 27 | - name: Run tests 28 | run: npm test 29 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "streamsearch", 3 | "version": "1.1.0", 4 | "author": "Brian White ", 5 | "description": "Streaming Boyer-Moore-Horspool searching for node.js", 6 | "main": "./lib/sbmh.js", 7 | "engines": { 8 | "node": ">=10.0.0" 9 | }, 10 | "devDependencies": { 11 | "@mscdex/eslint-config": "^1.1.0", 12 | "eslint": "^7.32.0" 13 | }, 14 | "scripts": { 15 | "test": "node test/test.js", 16 | "lint": "eslint --cache --report-unused-disable-directives --ext=.js .eslintrc.js lib test", 17 | "lint:fix": "npm run lint -- --fix" 18 | }, 19 | "keywords": [ 20 | "stream", 21 | "horspool", 22 | "boyer-moore-horspool", 23 | "boyer-moore", 24 | "search" 25 | ], 26 | "licenses": [{ 27 | "type": "MIT", 28 | "url": "http://github.com/mscdex/streamsearch/raw/master/LICENSE" 29 | }], 30 | "repository": { 31 | "type": "git", 32 | "url": "http://github.com/mscdex/streamsearch.git" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Brian White. All rights reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to 5 | deal in the Software without restriction, including without limitation the 6 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | sell copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | IN THE SOFTWARE. -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const assert = require('assert'); 4 | 5 | const StreamSearch = require('../lib/sbmh.js'); 6 | 7 | [ 8 | { 9 | needle: '\r\n', 10 | chunks: [ 11 | 'foo', 12 | ' bar', 13 | '\r', 14 | '\n', 15 | 'baz, hello\r', 16 | '\n world.', 17 | '\r\n Node.JS rules!!\r\n\r\n', 18 | ], 19 | expect: [ 20 | [false, 'foo'], 21 | [false, ' bar'], 22 | [ true, null], 23 | [false, 'baz, hello'], 24 | [ true, null], 25 | [false, ' world.'], 26 | [ true, null], 27 | [ true, ' Node.JS rules!!'], 28 | [ true, ''], 29 | ], 30 | }, 31 | { 32 | needle: '---foobarbaz', 33 | chunks: [ 34 | '---foobarbaz', 35 | 'asdf', 36 | '\r\n', 37 | '---foobarba', 38 | '---foobar', 39 | 'ba', 40 | '\r\n---foobarbaz--\r\n', 41 | ], 42 | expect: [ 43 | [ true, null], 44 | [false, 'asdf'], 45 | [false, '\r\n'], 46 | [false, '---foobarba'], 47 | [false, '---foobarba'], 48 | [ true, '\r\n'], 49 | [false, '--\r\n'], 50 | ], 51 | }, 52 | ].forEach((test, i) => { 53 | console.log(`Running test #${i + 1}`); 54 | const { needle, chunks, expect } = test; 55 | 56 | const results = []; 57 | const ss = new StreamSearch(Buffer.from(needle), 58 | (isMatch, data, start, end) => { 59 | if (data) 60 | data = data.toString('latin1', start, end); 61 | else 62 | data = null; 63 | results.push([isMatch, data]); 64 | }); 65 | 66 | for (const chunk of chunks) 67 | ss.push(Buffer.from(chunk)); 68 | 69 | assert.deepStrictEqual(results, expect); 70 | }); 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Description 2 | =========== 3 | 4 | streamsearch is a module for [node.js](http://nodejs.org/) that allows searching a stream using the Boyer-Moore-Horspool algorithm. 5 | 6 | This module is based heavily on the Streaming Boyer-Moore-Horspool C++ implementation by Hongli Lai [here](https://github.com/FooBarWidget/boyer-moore-horspool). 7 | 8 | 9 | Requirements 10 | ============ 11 | 12 | * [node.js](http://nodejs.org/) -- v10.0.0 or newer 13 | 14 | 15 | Installation 16 | ============ 17 | 18 | npm install streamsearch 19 | 20 | Example 21 | ======= 22 | 23 | ```js 24 | const { inspect } = require('util'); 25 | 26 | const StreamSearch = require('streamsearch'); 27 | 28 | const needle = Buffer.from('\r\n'); 29 | const ss = new StreamSearch(needle, (isMatch, data, start, end) => { 30 | if (data) 31 | console.log('data: ' + inspect(data.toString('latin1', start, end))); 32 | if (isMatch) 33 | console.log('match!'); 34 | }); 35 | 36 | const chunks = [ 37 | 'foo', 38 | ' bar', 39 | '\r', 40 | '\n', 41 | 'baz, hello\r', 42 | '\n world.', 43 | '\r\n Node.JS rules!!\r\n\r\n', 44 | ]; 45 | for (const chunk of chunks) 46 | ss.push(Buffer.from(chunk)); 47 | 48 | // output: 49 | // 50 | // data: 'foo' 51 | // data: ' bar' 52 | // match! 53 | // data: 'baz, hello' 54 | // match! 55 | // data: ' world.' 56 | // match! 57 | // data: ' Node.JS rules!!' 58 | // match! 59 | // data: '' 60 | // match! 61 | ``` 62 | 63 | 64 | API 65 | === 66 | 67 | Properties 68 | ---------- 69 | 70 | * **maxMatches** - < _integer_ > - The maximum number of matches. Defaults to `Infinity`. 71 | 72 | * **matches** - < _integer_ > - The current match count. 73 | 74 | 75 | Functions 76 | --------- 77 | 78 | * **(constructor)**(< _mixed_ >needle, < _function_ >callback) - Creates and returns a new instance for searching for a _Buffer_ or _string_ `needle`. `callback` is called any time there is non-matching data and/or there is a needle match. `callback` will be called with the following arguments: 79 | 80 | 1. `isMatch` - _boolean_ - Indicates whether a match has been found 81 | 82 | 2. `data` - _mixed_ - If set, this contains data that did not match the needle. 83 | 84 | 3. `start` - _integer_ - The index in `data` where the non-matching data begins (inclusive). 85 | 86 | 4. `end` - _integer_ - The index in `data` where the non-matching data ends (exclusive). 87 | 88 | 5. `isSafeData` - _boolean_ - Indicates if it is safe to store a reference to `data` (e.g. as-is or via `data.slice()`) or not, as in some cases `data` may point to a Buffer whose contents change over time. 89 | 90 | * **destroy**() - _(void)_ - Emits any last remaining unmatched data that may still be buffered and then resets internal state. 91 | 92 | * **push**(< _Buffer_ >chunk) - _integer_ - Processes `chunk`, searching for a match. The return value is the last processed index in `chunk` + 1. 93 | 94 | * **reset**() - _(void)_ - Resets internal state. Useful for when you wish to start searching a new/different stream for example. 95 | 96 | -------------------------------------------------------------------------------- /lib/sbmh.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | /* 3 | Based heavily on the Streaming Boyer-Moore-Horspool C++ implementation 4 | by Hongli Lai at: https://github.com/FooBarWidget/boyer-moore-horspool 5 | */ 6 | function memcmp(buf1, pos1, buf2, pos2, num) { 7 | for (let i = 0; i < num; ++i) { 8 | if (buf1[pos1 + i] !== buf2[pos2 + i]) 9 | return false; 10 | } 11 | return true; 12 | } 13 | 14 | class SBMH { 15 | constructor(needle, cb) { 16 | if (typeof cb !== 'function') 17 | throw new Error('Missing match callback'); 18 | 19 | if (typeof needle === 'string') 20 | needle = Buffer.from(needle); 21 | else if (!Buffer.isBuffer(needle)) 22 | throw new Error(`Expected Buffer for needle, got ${typeof needle}`); 23 | 24 | const needleLen = needle.length; 25 | 26 | this.maxMatches = Infinity; 27 | this.matches = 0; 28 | 29 | this._cb = cb; 30 | this._lookbehindSize = 0; 31 | this._needle = needle; 32 | this._bufPos = 0; 33 | 34 | this._lookbehind = Buffer.allocUnsafe(needleLen); 35 | 36 | // Initialize occurrence table. 37 | this._occ = [ 38 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 39 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 40 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 41 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 42 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 43 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 44 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 45 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 46 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 47 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 48 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 49 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 50 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 51 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 52 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 53 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 54 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 55 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 56 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 57 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 58 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 59 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 60 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 61 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 62 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 63 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 64 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 65 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 66 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 67 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 68 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 69 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 70 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 71 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 72 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 73 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 74 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 75 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 76 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 77 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 78 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 79 | needleLen, needleLen, needleLen, needleLen, needleLen, needleLen, 80 | needleLen, needleLen, needleLen, needleLen 81 | ]; 82 | 83 | // Populate occurrence table with analysis of the needle, ignoring the last 84 | // letter. 85 | if (needleLen > 1) { 86 | for (let i = 0; i < needleLen - 1; ++i) 87 | this._occ[needle[i]] = needleLen - 1 - i; 88 | } 89 | } 90 | 91 | reset() { 92 | this.matches = 0; 93 | this._lookbehindSize = 0; 94 | this._bufPos = 0; 95 | } 96 | 97 | push(chunk, pos) { 98 | let result; 99 | if (!Buffer.isBuffer(chunk)) 100 | chunk = Buffer.from(chunk, 'latin1'); 101 | const chunkLen = chunk.length; 102 | this._bufPos = pos || 0; 103 | while (result !== chunkLen && this.matches < this.maxMatches) 104 | result = feed(this, chunk); 105 | return result; 106 | } 107 | 108 | destroy() { 109 | const lbSize = this._lookbehindSize; 110 | if (lbSize) 111 | this._cb(false, this._lookbehind, 0, lbSize, false); 112 | this.reset(); 113 | } 114 | } 115 | 116 | function feed(self, data) { 117 | const len = data.length; 118 | const needle = self._needle; 119 | const needleLen = needle.length; 120 | 121 | // Positive: points to a position in `data` 122 | // pos == 3 points to data[3] 123 | // Negative: points to a position in the lookbehind buffer 124 | // pos == -2 points to lookbehind[lookbehindSize - 2] 125 | let pos = -self._lookbehindSize; 126 | const lastNeedleCharPos = needleLen - 1; 127 | const lastNeedleChar = needle[lastNeedleCharPos]; 128 | const end = len - needleLen; 129 | const occ = self._occ; 130 | const lookbehind = self._lookbehind; 131 | 132 | if (pos < 0) { 133 | // Lookbehind buffer is not empty. Perform Boyer-Moore-Horspool 134 | // search with character lookup code that considers both the 135 | // lookbehind buffer and the current round's haystack data. 136 | // 137 | // Loop until 138 | // there is a match. 139 | // or until 140 | // we've moved past the position that requires the 141 | // lookbehind buffer. In this case we switch to the 142 | // optimized loop. 143 | // or until 144 | // the character to look at lies outside the haystack. 145 | while (pos < 0 && pos <= end) { 146 | const nextPos = pos + lastNeedleCharPos; 147 | const ch = (nextPos < 0 148 | ? lookbehind[self._lookbehindSize + nextPos] 149 | : data[nextPos]); 150 | 151 | if (ch === lastNeedleChar 152 | && matchNeedle(self, data, pos, lastNeedleCharPos)) { 153 | self._lookbehindSize = 0; 154 | ++self.matches; 155 | if (pos > -self._lookbehindSize) 156 | self._cb(true, lookbehind, 0, self._lookbehindSize + pos, false); 157 | else 158 | self._cb(true, undefined, 0, 0, true); 159 | 160 | return (self._bufPos = pos + needleLen); 161 | } 162 | 163 | pos += occ[ch]; 164 | } 165 | 166 | // No match. 167 | 168 | // There's too few data for Boyer-Moore-Horspool to run, 169 | // so let's use a different algorithm to skip as much as 170 | // we can. 171 | // Forward pos until 172 | // the trailing part of lookbehind + data 173 | // looks like the beginning of the needle 174 | // or until 175 | // pos == 0 176 | while (pos < 0 && !matchNeedle(self, data, pos, len - pos)) 177 | ++pos; 178 | 179 | if (pos < 0) { 180 | // Cut off part of the lookbehind buffer that has 181 | // been processed and append the entire haystack 182 | // into it. 183 | const bytesToCutOff = self._lookbehindSize + pos; 184 | 185 | if (bytesToCutOff > 0) { 186 | // The cut off data is guaranteed not to contain the needle. 187 | self._cb(false, lookbehind, 0, bytesToCutOff, false); 188 | } 189 | 190 | self._lookbehindSize -= bytesToCutOff; 191 | lookbehind.copy(lookbehind, 0, bytesToCutOff, self._lookbehindSize); 192 | lookbehind.set(data, self._lookbehindSize); 193 | self._lookbehindSize += len; 194 | 195 | self._bufPos = len; 196 | return len; 197 | } 198 | 199 | // Discard lookbehind buffer. 200 | self._cb(false, lookbehind, 0, self._lookbehindSize, false); 201 | self._lookbehindSize = 0; 202 | } 203 | 204 | pos += self._bufPos; 205 | 206 | const firstNeedleChar = needle[0]; 207 | 208 | // Lookbehind buffer is now empty. Perform Boyer-Moore-Horspool 209 | // search with optimized character lookup code that only considers 210 | // the current round's haystack data. 211 | while (pos <= end) { 212 | const ch = data[pos + lastNeedleCharPos]; 213 | 214 | if (ch === lastNeedleChar 215 | && data[pos] === firstNeedleChar 216 | && memcmp(needle, 0, data, pos, lastNeedleCharPos)) { 217 | ++self.matches; 218 | if (pos > 0) 219 | self._cb(true, data, self._bufPos, pos, true); 220 | else 221 | self._cb(true, undefined, 0, 0, true); 222 | 223 | return (self._bufPos = pos + needleLen); 224 | } 225 | 226 | pos += occ[ch]; 227 | } 228 | 229 | // There was no match. If there's trailing haystack data that we cannot 230 | // match yet using the Boyer-Moore-Horspool algorithm (because the trailing 231 | // data is less than the needle size) then match using a modified 232 | // algorithm that starts matching from the beginning instead of the end. 233 | // Whatever trailing data is left after running this algorithm is added to 234 | // the lookbehind buffer. 235 | while (pos < len) { 236 | if (data[pos] !== firstNeedleChar 237 | || !memcmp(data, pos, needle, 0, len - pos)) { 238 | ++pos; 239 | continue; 240 | } 241 | data.copy(lookbehind, 0, pos, len); 242 | self._lookbehindSize = len - pos; 243 | break; 244 | } 245 | 246 | // Everything until `pos` is guaranteed not to contain needle data. 247 | if (pos > 0) 248 | self._cb(false, data, self._bufPos, pos < len ? pos : len, true); 249 | 250 | self._bufPos = len; 251 | return len; 252 | } 253 | 254 | function matchNeedle(self, data, pos, len) { 255 | const lb = self._lookbehind; 256 | const lbSize = self._lookbehindSize; 257 | const needle = self._needle; 258 | 259 | for (let i = 0; i < len; ++i, ++pos) { 260 | const ch = (pos < 0 ? lb[lbSize + pos] : data[pos]); 261 | if (ch !== needle[i]) 262 | return false; 263 | } 264 | return true; 265 | } 266 | 267 | module.exports = SBMH; 268 | --------------------------------------------------------------------------------