├── .gitignore ├── .travis.yml ├── test.json ├── bench.js ├── package.json ├── LICENSE ├── readme.md ├── index.js └── test.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "stable" 4 | -------------------------------------------------------------------------------- /test.json: -------------------------------------------------------------------------------- 1 | {"hello": "world"} 2 | {"foo": "bar"} 3 | {"fizz": "buzz"} 4 | -------------------------------------------------------------------------------- /bench.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const { Transform } = require('stream') 4 | const split = require('./') 5 | 6 | let str = '' 7 | for (let i = 0; i < 1000000; i++) { 8 | str += 'Hello beautiful world\n' 9 | } 10 | 11 | const stream = new Transform() 12 | .pipe(split()) 13 | .on('data', function () {}) 14 | .on('end', function () { 15 | console.timeEnd('split') 16 | }) 17 | 18 | console.time('split') 19 | 20 | stream.write(str) 21 | stream.end() 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "binary-split", 3 | "version": "1.0.5", 4 | "description": "a fast newline (or any delimiter) splitter stream - like require('split') but faster", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "standard --verbose && node test.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git://github.com/maxogden/binary-split.git" 12 | }, 13 | "author": "max ogden", 14 | "license": "BSD-2-Clause", 15 | "bugs": { 16 | "url": "https://github.com/maxogden/binary-split/issues" 17 | }, 18 | "devDependencies": { 19 | "standard": "^17.1.0", 20 | "tape": "~5.7.2" 21 | }, 22 | "files": [ 23 | "index.js" 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Max Ogden 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 13 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 14 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 15 | ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 16 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 17 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 18 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 19 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 20 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 21 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # binary-split 2 | 3 | Split streams of binary data. Similar to [split](http://npmjs.org/split) but for Buffers. 4 | Whereas split is String specific, this library never converts binary data into non-binary data. 5 | 6 | [![travis][travis-image]][travis-url] 7 | 8 | [travis-image]: https://img.shields.io/travis/maxogden/binary-split.svg?style=flat 9 | [travis-url]: https://travis-ci.org/maxogden/binary-split 10 | 11 | ## How fast is it? 12 | 13 | On a SSD w/ a Haswell i5 1.3ghz CPU and 4GB RAM reading a 2.6GB, 5.2 million entry line delimited JSON file takes 15 seconds. Using `split` for the same benchmark takes 1m23s. 14 | 15 | ## Example usage 16 | 17 | ```js 18 | const split = require('binary-split') 19 | 20 | fs.createReadStream('log.txt') 21 | .pipe(split()) 22 | .on('data', line => console.log(line)) 23 | ``` 24 | 25 | ## API 26 | 27 | #### split([splitOn]) 28 | 29 | Returns a stream. 30 | You can `.pipe` other streams to it or `.write` them yourself 31 | (if you `.write` don't forget to `.end`). 32 | 33 | The stream will emit a stream of binary objects representing the split data. 34 | 35 | Pass in the optional `splitOn` argument to specify where to split the data. 36 | The default is your current operating systems EOL sequence (via `require('os').EOL`). 37 | 38 | For more examples of usage see `test.js`. 39 | 40 | ## Collaborators 41 | 42 | binary-split is only possible due to the excellent work of the following collaborators: 43 | 44 | - Max Ogden ([@maxogden](https://github.com/maxogden)) 45 | - Vladimir Agafonkin ([@mourner](https://github.com/mourner)) 46 | - Martin Raifer ([@tyrasd](https://github.com/tyrasd)) 47 | - Julian Gruber ([@juliangruber](https://github.com/juliangruber)) 48 | 49 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const { Transform } = require('stream') 2 | const { EOL } = require('os') 3 | 4 | module.exports = BinarySplit 5 | 6 | function firstMatch (buf, offset, matcher) { 7 | if (offset >= buf.length) return -1 8 | let i 9 | for (i = offset; i < buf.length; i++) { 10 | if (buf[i] === matcher[0]) { 11 | if (matcher.length > 1) { 12 | let fullMatch = true 13 | let j = i 14 | for (let k = 0; j < i + matcher.length; j++, k++) { 15 | if (buf[j] !== matcher[k]) { 16 | fullMatch = false 17 | break 18 | } 19 | } 20 | if (fullMatch) return j - matcher.length 21 | } else { 22 | break 23 | } 24 | } 25 | } 26 | 27 | const idx = i + matcher.length - 1 28 | return idx 29 | } 30 | 31 | function BinarySplit (splitOn = EOL) { 32 | const matcher = Buffer.from(splitOn) 33 | let buffered 34 | 35 | return new Transform({ 36 | readableObjectMode: true, 37 | 38 | transform (buf, enc, done) { 39 | let offset = 0 40 | let lastMatch = 0 41 | if (buffered) { 42 | buf = Buffer.concat([buffered, buf]) 43 | offset = buffered.length 44 | buffered = undefined 45 | } 46 | 47 | while (true) { 48 | const idx = firstMatch(buf, offset - matcher.length + 1, matcher) 49 | if (idx !== -1 && idx < buf.length) { 50 | if (lastMatch !== idx) { 51 | this.push(buf.slice(lastMatch, idx)) 52 | } 53 | offset = idx + matcher.length 54 | lastMatch = offset 55 | } else { 56 | buffered = buf.slice(lastMatch) 57 | break 58 | } 59 | } 60 | 61 | done() 62 | }, 63 | 64 | flush (done) { 65 | if (buffered && buffered.length > 0) this.push(buffered) 66 | done() 67 | } 68 | }) 69 | } 70 | -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | const test = require('tape') 2 | const fs = require('fs') 3 | const PassThrough = require('stream').PassThrough 4 | const split = require('./') 5 | 6 | function splitTest (matcher, cb) { 7 | if (!cb) { 8 | cb = matcher 9 | matcher = undefined 10 | } 11 | const splitter = split(matcher) 12 | const items = [] 13 | splitter.on('data', function (item) { 14 | items.push(item) 15 | }) 16 | splitter.on('error', function (e) { 17 | cb(e) 18 | }) 19 | splitter.on('end', function () { 20 | cb(null, items) 21 | }) 22 | return splitter 23 | } 24 | 25 | test('ldjson file', function (t) { 26 | fs.createReadStream('test.json').pipe(splitTest(function (err, items) { 27 | if (err) throw err 28 | t.equals(items.length, 3) 29 | t.end() 30 | })) 31 | }) 32 | 33 | test('custom matcher', function (t) { 34 | const splitStream = splitTest(' ', function (err, items) { 35 | if (err) throw err 36 | t.equals(items.length, 5) 37 | t.equals(items.join(' '), 'hello yes this is dog') 38 | t.end() 39 | }); 40 | 41 | ['hello yes ', 'this', ' is d', 'og'].map(function (chunk) { 42 | return Buffer.from(chunk) 43 | }).forEach(function (chunk) { 44 | splitStream.write(chunk) 45 | }) 46 | splitStream.end() 47 | }) 48 | 49 | test('long matcher', function (t) { 50 | const data = 'hello yes this is dog' 51 | const splitStream = splitTest('this', function (err, items) { 52 | if (err) throw err 53 | t.equals(items.length, 2) 54 | t.equals(items[0].toString(), 'hello yes ') 55 | t.equals(items[1].toString(), ' is dog') 56 | t.end() 57 | }) 58 | splitStream.write(Buffer.from(data)) 59 | splitStream.end() 60 | }) 61 | 62 | test('matcher at index 0 check', function (t) { 63 | const data = '\nhello\nmax' 64 | const splitStream = splitTest(function (err, items) { 65 | if (err) throw err 66 | 67 | t.equals(items.length, 2) 68 | t.equals(items[0].toString(), 'hello') 69 | t.equals(items[1].toString(), 'max') 70 | t.end() 71 | }) 72 | 73 | splitStream.write(Buffer.from(data)) 74 | splitStream.end() 75 | }) 76 | 77 | test('chunked input', function (t) { 78 | fs.createReadStream('test.json') 79 | .pipe(split('\n')) 80 | .pipe(split('i')) 81 | .pipe(splitTest(':', function (err, items) { 82 | if (err) throw err 83 | t.equals(items.length, 4) 84 | t.end() 85 | })) 86 | }) 87 | 88 | test('chunked input with long matcher', function (t) { 89 | fs.createReadStream('test.json') 90 | .pipe(split('\n')) 91 | .pipe(splitTest('hello', function (err, items) { 92 | if (err) throw err 93 | t.equals(items.length, 2) 94 | t.equals(items[0].toString(), '{"') 95 | t.end() 96 | })) 97 | }) 98 | 99 | test('lookbehind in multi character matcher', function (t) { 100 | const splitStream = splitTest('\r\n\r', function (err, items) { 101 | if (err) throw err 102 | t.equals(items.length, 2) 103 | t.equals(items[0].toString(), 'a') 104 | t.equals(items[1].toString(), 'b') 105 | t.end() 106 | }) 107 | 108 | splitStream.write('a\r') 109 | splitStream.write('\n') 110 | splitStream.write('\rb') 111 | splitStream.end() 112 | }) 113 | 114 | test('should not combine outputs', function (t) { 115 | const pt = new PassThrough() 116 | const stream = pt.pipe(split('.')) 117 | pt.write('a.b') 118 | pt.end('c.d') 119 | setImmediate(function () { 120 | t.equal(stream.read().toString(), 'a') 121 | t.equal(stream.read().toString(), 'bc') 122 | t.equal(stream.read().toString(), 'd') 123 | t.end() 124 | }) 125 | }) 126 | --------------------------------------------------------------------------------