├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── tests.yml ├── tests ├── sample.csv.gz ├── tests.js ├── ReadString.js ├── test_sliding.js ├── test_main.js ├── test_asObject.js ├── test_parser.js └── test_stringer.js ├── .gitignore ├── .prettierrc ├── .editorconfig ├── index.js ├── package.json ├── LICENSE ├── Stringer.js ├── AsObjects.js ├── README.md └── Parser.js /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: uhop 2 | buy_me_a_coffee: uhop 3 | -------------------------------------------------------------------------------- /tests/sample.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhop/stream-csv-as-json/HEAD/tests/sample.csv.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/* 2 | *.sublime-* 3 | report/* 4 | coverage/* 5 | .AppleDouble 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 160, 3 | "singleQuote": true, 4 | "bracketSpacing": false 5 | } 6 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | indent_style = space 9 | indent_size = 2 10 | -------------------------------------------------------------------------------- /tests/tests.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const unit = require('heya-unit'); 4 | 5 | require('./test_parser'); 6 | require('./test_sliding'); 7 | require('./test_main'); 8 | require('./test_stringer'); 9 | require('./test_asObject'); 10 | 11 | unit.run(); 12 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const Parser = require('./Parser'); 4 | const emit = require('stream-json/utils/emit'); 5 | 6 | const make = options => emit(new Parser(options)); 7 | 8 | make.Parser = Parser; 9 | make.parser = Parser.parser; 10 | 11 | module.exports = make; 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /tests/ReadString.js: -------------------------------------------------------------------------------- 1 | const {Readable} = require('stream'); 2 | 3 | class ReadString extends Readable { 4 | static readString(string, quant, options) { 5 | return new ReadString(string, quant, options); 6 | } 7 | 8 | constructor(string, quant, options) { 9 | super(options); 10 | this._string = string; 11 | this._quant = quant; 12 | } 13 | _read(size) { 14 | if (isNaN(this._quant)) { 15 | this.push(this._string, 'utf8'); 16 | } else { 17 | for (let i = 0; i < this._string.length; i += this._quant) { 18 | this.push(this._string.substr(i, this._quant), 'utf8'); 19 | } 20 | } 21 | this.push(null); 22 | } 23 | } 24 | ReadString.make = ReadString.readString; 25 | 26 | module.exports = ReadString; 27 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "stream-csv-as-json", 3 | "version": "1.0.5", 4 | "description": "Streams CSV files.", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "node tests/tests.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/uhop/stream-csv-as-json.git" 12 | }, 13 | "keywords": [ 14 | "scanner", 15 | "lexer", 16 | "tokenizer", 17 | "parser", 18 | "csv", 19 | "stream", 20 | "streaming" 21 | ], 22 | "author": "Eugene Lazutkin (http://lazutkin.com/)", 23 | "license": "BSD-3-Clause", 24 | "bugs": { 25 | "url": "https://github.com/uhop/stream-csv-as-json/issues" 26 | }, 27 | "homepage": "https://github.com/uhop/stream-csv-as-json#readme", 28 | "devDependencies": { 29 | "heya-unit": "^0.3.0", 30 | "stream-chain": "^2.2.5" 31 | }, 32 | "dependencies": { 33 | "stream-json": "^1.8.0" 34 | }, 35 | "files": [ 36 | "/*.js", 37 | "/utils" 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: ['*'] 9 | pull_request: 10 | branches: [master] 11 | 12 | jobs: 13 | tests: 14 | name: Node.js ${{matrix.node-version}} on ${{matrix.os}} 15 | runs-on: ${{matrix.os}} 16 | 17 | strategy: 18 | matrix: 19 | os: [ubuntu-latest] 20 | node-version: [18, 20, 22] 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | with: 25 | submodules: true 26 | - name: Setup Node.js ${{matrix.node-version}} 27 | uses: actions/setup-node@v1 28 | with: 29 | node-version: ${{matrix.node-version}} 30 | - name: Get NPM cache directory 31 | id: npm-cache 32 | run: echo "::set-output name=dir::$(npm config get cache)" 33 | - name: Cache node modules 34 | uses: actions/cache@v2 35 | with: 36 | path: ${{steps.npm-cache.outputs.dir}} 37 | key: ${{runner.os}}-node-${{hashFiles('**/package-lock.json')}} 38 | restore-keys: | 39 | ${{runner.os}}-node- 40 | ${{runner.os}}- 41 | - name: Install the package and run tests 42 | run: | 43 | npm ci 44 | npm run build --if-present 45 | npm test 46 | -------------------------------------------------------------------------------- /tests/test_sliding.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const unit = require('heya-unit'); 4 | 5 | const Assembler = require('stream-json/Assembler'); 6 | 7 | const Parser = require('../Parser'); 8 | const ReadString = require('./ReadString'); 9 | 10 | const runSlidingWindowTest = (t, quant) => { 11 | const async = t.startAsync('test_sliding: ' + quant); 12 | 13 | const table = [['1', '', '', '"'], ['2', 'three', 'four', 'five']], 14 | input = table.map(row => row.map(value => (/[,\r\n\"]/.test(value) ? '"' + value.replace('"', '""') + '"' : value)).join(',')).join('\r\n'), 15 | pipeline = new ReadString(input, quant).pipe(new Parser()), 16 | asm = new Assembler(), 17 | result = []; 18 | 19 | pipeline.on('data', token => { 20 | asm[token.name] && asm[token.name](token.value); 21 | asm.done && result.push(asm.current); 22 | }); 23 | pipeline.on('end', () => { 24 | eval(t.TEST('t.unify(result, table)')); 25 | async.done(); 26 | }); 27 | }; 28 | 29 | unit.add(module, [ 30 | function test_sliding_1(t) { 31 | runSlidingWindowTest(t, 1); 32 | }, 33 | function test_sliding_2(t) { 34 | runSlidingWindowTest(t, 2); 35 | }, 36 | function test_sliding_3(t) { 37 | runSlidingWindowTest(t, 3); 38 | }, 39 | function test_sliding_4(t) { 40 | runSlidingWindowTest(t, 4); 41 | }, 42 | function test_sliding_5(t) { 43 | runSlidingWindowTest(t, 5); 44 | }, 45 | function test_sliding_6(t) { 46 | runSlidingWindowTest(t, 6); 47 | }, 48 | function test_sliding_7(t) { 49 | runSlidingWindowTest(t, 7); 50 | }, 51 | function test_sliding_8(t) { 52 | runSlidingWindowTest(t, 8); 53 | }, 54 | function test_sliding_9(t) { 55 | runSlidingWindowTest(t, 9); 56 | }, 57 | function test_sliding_10(t) { 58 | runSlidingWindowTest(t, 10); 59 | }, 60 | function test_sliding_11(t) { 61 | runSlidingWindowTest(t, 11); 62 | }, 63 | function test_sliding_12(t) { 64 | runSlidingWindowTest(t, 12); 65 | } 66 | ]); 67 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This library is available under the terms of the modified BSD license. No external contributions 2 | are allowed under licenses which are fundamentally incompatible with the BSD license that this library is distributed under. 3 | 4 | The text of the BSD license is reproduced below. 5 | 6 | ------------------------------------------------------------------------------- 7 | The "New" BSD License: 8 | ********************** 9 | 10 | Copyright (c) 2005-2024, Eugene Lazutkin 11 | All rights reserved. 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are met: 15 | 16 | * Redistributions of source code must retain the above copyright notice, this 17 | list of conditions and the following disclaimer. 18 | * Redistributions in binary form must reproduce the above copyright notice, 19 | this list of conditions and the following disclaimer in the documentation 20 | and/or other materials provided with the distribution. 21 | * Neither the name of Eugene Lazutkin nor the names of other contributors 22 | may be used to endorse or promote products derived from this software 23 | without specific prior written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 26 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 27 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 28 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 29 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 31 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 32 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 33 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 34 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | -------------------------------------------------------------------------------- /tests/test_main.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const unit = require('heya-unit'); 4 | 5 | const {parser} = require('../index'); 6 | 7 | const emit = require('stream-json/utils/emit'); 8 | 9 | const ReadString = require('./ReadString'); 10 | 11 | unit.add(module, [ 12 | function test_main(t) { 13 | const async = t.startAsync('test_main'); 14 | 15 | const input = '1,,"",""""\r\n2,three,"four",'; 16 | 17 | const pipeline = emit(new ReadString(input).pipe(parser())); 18 | 19 | let values = 0, 20 | starts = 0, 21 | rows = 0; 22 | pipeline.on('startArray', () => ++rows); 23 | pipeline.on('startString', () => ++starts); 24 | pipeline.on('stringValue', () => ++values); 25 | pipeline.on('end', () => { 26 | eval(t.TEST('rows === 2')); 27 | eval(t.TEST('values === 8')); 28 | eval(t.TEST('values === starts')); 29 | async.done(); 30 | }); 31 | }, 32 | function test_main_no_values(t) { 33 | const async = t.startAsync('test_main_no_values'); 34 | 35 | const input = '1,,"",""","\r\n2,three,"four\r\n",\r\n'; 36 | 37 | const pipeline = emit(new ReadString(input).pipe(parser({packStrings: false}))); 38 | 39 | let starts = 0, 40 | rows = 0; 41 | pipeline.on('startArray', () => ++rows); 42 | pipeline.on('startString', () => ++starts); 43 | pipeline.on('stringValue', () => { 44 | t.test(false); // we shouldn't be here 45 | }); 46 | pipeline.on('end', () => { 47 | eval(t.TEST('rows === 2')); 48 | eval(t.TEST('starts === 8')); 49 | async.done(); 50 | }); 51 | }, 52 | function test_main_no_streaming(t) { 53 | const async = t.startAsync('test_main_no_streaming'); 54 | 55 | const input = '1,,"",""""\r\n2,three,"four",\r\n'; 56 | 57 | const pipeline = emit(new ReadString(input).pipe(parser({streamStrings: false}))); 58 | 59 | let values = 0, 60 | rows = 0; 61 | pipeline.on('startArray', () => ++rows); 62 | pipeline.on('startString', () => { 63 | t.test(false); // we shouldn't be here 64 | }); 65 | pipeline.on('stringValue', () => ++values); 66 | pipeline.on('end', () => { 67 | eval(t.TEST('rows === 2')); 68 | eval(t.TEST('values === 8')); 69 | async.done(); 70 | }); 71 | } 72 | ]); 73 | -------------------------------------------------------------------------------- /tests/test_asObject.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const unit = require('heya-unit'); 4 | 5 | const Assembler = require('stream-json/Assembler'); 6 | 7 | const {parser} = require('../Parser'); 8 | const {asObjects} = require('../AsObjects'); 9 | const ReadString = require('./ReadString'); 10 | 11 | unit.add(module, [ 12 | function test_asObject_simple(t) { 13 | const async = t.startAsync('test_asObject_simple'); 14 | 15 | const input = 'alpha,beta,gamma\r\n1,,"",""""\r\n2,three,"four",five\r\n', 16 | expected = [{alpha: '1', beta: '', gamma: '', field3: '"'}, {alpha: '2', beta: 'three', gamma: 'four', field3: 'five'}], 17 | result = []; 18 | 19 | const pipeline = new ReadString(input).pipe(parser()).pipe(asObjects()); 20 | const asm = new Assembler(); 21 | 22 | pipeline.on('data', token => { 23 | asm[token.name] && asm[token.name](token.value); 24 | asm.done && result.push(asm.current); 25 | }); 26 | pipeline.on('end', () => { 27 | eval(t.TEST('t.unify(result, expected)')); 28 | async.done(); 29 | }); 30 | }, 31 | function test_asObject_values(t) { 32 | const async = t.startAsync('test_asObject_values'); 33 | 34 | const input = 'alpha,beta,gamma\r\n1,,"",""""\r\n2,three,"four",five\r\n', 35 | expected = [{alpha: '1', beta: '', gamma: '', field3: '"'}, {alpha: '2', beta: 'three', gamma: 'four', field3: 'five'}], 36 | result = []; 37 | 38 | const pipeline = new ReadString(input).pipe(parser({useValues: true})).pipe(asObjects()); 39 | const asm = new Assembler(); 40 | 41 | pipeline.on('data', token => { 42 | asm[token.name] && asm[token.name](token.value); 43 | asm.done && result.push(asm.current); 44 | }); 45 | pipeline.on('end', () => { 46 | eval(t.TEST('t.unify(result, expected)')); 47 | async.done(); 48 | }); 49 | }, 50 | function test_asObject_empty_values(t) { 51 | const async = t.startAsync('test_asObject_empty_values'); 52 | 53 | const input = 'alpha,,gamma\r\n1,,"",""""\r\n2,three,"four",five\r\n', 54 | expected = [{alpha: '1', column1: '', gamma: '', column3: '"'}, {alpha: '2', column1: 'three', gamma: 'four', column3: 'five'}], 55 | result = []; 56 | 57 | const pipeline = new ReadString(input).pipe(parser({useValues: true})).pipe(asObjects({fieldPrefix: 'column'})); 58 | const asm = new Assembler(); 59 | 60 | pipeline.on('data', token => { 61 | asm[token.name] && asm[token.name](token.value); 62 | asm.done && result.push(asm.current); 63 | }); 64 | pipeline.on('end', () => { 65 | eval(t.TEST('t.unify(result, expected)')); 66 | async.done(); 67 | }); 68 | } 69 | ]); 70 | -------------------------------------------------------------------------------- /Stringer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const {Transform} = require('stream'); 4 | 5 | class Stringer extends Transform { 6 | static make(options) { 7 | return new Stringer(options); 8 | } 9 | 10 | constructor(options) { 11 | super(Object.assign({}, options, {writableObjectMode: true, readableObjectMode: false})); 12 | 13 | this._useStringValues = false; 14 | this._separator = ','; 15 | this._containsQuotables = /[,\r\n\"]/; 16 | if (options) { 17 | 'useValues' in options && (this._useStringValues = options.useValues); 18 | 'useStringValues' in options && (this._useStringValues = options.useStringValues); 19 | this._separator = options.separator || ','; 20 | const sep = this._separator.replace(/[#-.]|[[-^]|[?|{}]/g, '\\$&'); 21 | this._containsQuotables = new RegExp(this._containsQuotables.source.replace('[,', '[' + sep)); 22 | } 23 | 24 | if (this._useStringValues) { 25 | this._transform = this._valueTransform; 26 | } 27 | } 28 | 29 | _transform(chunk, _, callback) { 30 | switch (chunk.name) { 31 | case 'startArray': 32 | this._skipSeparator = true; 33 | break; 34 | case 'endArray': 35 | this.push('\r\n'); 36 | break; 37 | case 'startString': 38 | if (this._skipSeparator) { 39 | this._skipSeparator = false; 40 | } else { 41 | this.push(this._separator); 42 | } 43 | // intentional fall through 44 | case 'endString': 45 | this.push('"'); 46 | break; 47 | case 'stringChunk': 48 | this.push(chunk.value.replace('"', '""')); 49 | break; 50 | case 'stringValue': 51 | break; // skip 52 | default: 53 | return callback(new Error('Unexpected token: ' + chunk.name)); 54 | } 55 | callback(null); 56 | } 57 | 58 | _valueTransform(chunk, _, callback) { 59 | switch (chunk.name) { 60 | case 'startArray': 61 | this._skipSeparator = true; 62 | break; 63 | case 'endArray': 64 | this.push('\r\n'); 65 | break; 66 | case 'stringValue': 67 | if (this._skipSeparator) { 68 | this._skipSeparator = false; 69 | } else { 70 | this.push(this._separator); 71 | } 72 | const value = chunk.value; 73 | if (this._containsQuotables.test(value)) { 74 | this.push('"' + value.replace('"', '""') + '"'); 75 | } else { 76 | this.push(value); 77 | } 78 | break; 79 | case 'startString': 80 | this._transform = this._skipString; 81 | break; // skip 82 | default: 83 | return callback(new Error('Unexpected token: ' + chunk.name)); 84 | } 85 | callback(null); 86 | } 87 | 88 | _skipString(chunk, encoding, callback) { 89 | if (chunk.name === 'endString') { 90 | this._transform = this._valueTransform; 91 | } 92 | callback(null); 93 | } 94 | } 95 | Stringer.stringer = Stringer.make; 96 | Stringer.make.Constructor = Stringer; 97 | 98 | module.exports = Stringer; 99 | -------------------------------------------------------------------------------- /AsObjects.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const {Transform} = require('stream'); 4 | 5 | const withParser = require('stream-json/utils/withParser'); 6 | 7 | class AsObjects extends Transform { 8 | static make(options) { 9 | return new AsObjects(options); 10 | } 11 | 12 | static withParser(options) { 13 | return withParser(AsObjects.make, options); 14 | } 15 | 16 | constructor(options) { 17 | super(Object.assign({}, options, {writableObjectMode: true, readableObjectMode: true})); 18 | 19 | this._fieldPrefix = 'field'; 20 | this._useStringValues = false; 21 | this._packKeys = this._streamKeys = true; 22 | if (options) { 23 | 'packValues' in options && (this._packStrings = options.packValues); 24 | 'packKeys' in options && (this._packKeys = options.packKeys); 25 | 'streamValues' in options && (this._streamStrings = options.streamValues); 26 | 'streamKeys' in options && (this._streamKeys = options.streamKeys); 27 | 'useValues' in options && (this._useStringValues = options.useValues); 28 | 'useStringValues' in options && (this._useStringValues = options.useStringValues); 29 | 'fieldPrefix' in options && (this._fieldPrefix = options.fieldPrefix); 30 | } 31 | !this._packKeys && (this._streamKeys = true); 32 | 33 | this._useStringValues && (this._transform = this._valueTransform); 34 | this._keys = []; 35 | this._buffer = ''; 36 | this._index = 0; 37 | } 38 | 39 | _transform(chunk, _, callback) { 40 | switch (chunk.name) { 41 | case 'endArray': 42 | this._transform = this._transformToObject; 43 | break; 44 | case 'stringChunk': 45 | this._buffer += chunk.value; 46 | break; 47 | case 'endString': 48 | this._keys.push(this._buffer); 49 | this._buffer = ''; 50 | break; 51 | } 52 | callback(null); 53 | } 54 | 55 | _valueTransform(chunk, _, callback) { 56 | switch (chunk.name) { 57 | case 'endArray': 58 | this._transform = this._transformToObject; 59 | break; 60 | case 'stringValue': 61 | this._keys.push(chunk.value); 62 | break; 63 | } 64 | callback(null); 65 | } 66 | 67 | _transformToObject(chunk, encoding, callback) { 68 | switch (chunk.name) { 69 | case 'startArray': 70 | this.push({name: 'startObject'}); 71 | break; 72 | case 'endArray': 73 | this.push({name: 'endObject'}); 74 | this._index = 0; 75 | break; 76 | case 'startString': 77 | case 'stringValue': 78 | const key = (this._index < this._keys.length && this._keys[this._index]) || this._fieldPrefix + this._index; 79 | ++this._index; 80 | if (this._streamKeys) { 81 | this.push({name: 'startKey'}); 82 | this.push({name: 'stringChunk', value: key}); 83 | this.push({name: 'endKey'}); 84 | } 85 | this._packKeys && this.push({name: 'keyValue', value: key}); 86 | if (chunk.name === 'startString') { 87 | this._transform = this._passString; 88 | return this._transform(chunk, encoding, callback); 89 | } 90 | this.push(chunk); 91 | break; 92 | } 93 | callback(null); 94 | } 95 | 96 | _passString(chunk, _, callback) { 97 | if (this._expected) { 98 | const expected = this._expected; 99 | this._expected = ''; 100 | this._transform = this._transformToObject; 101 | if (expected === chunk.name) { 102 | this.push(chunk); 103 | } else { 104 | return this._transform(chunk, _, callback); 105 | } 106 | } else { 107 | this.push(chunk); 108 | if (chunk.name === 'endString') { 109 | this._expected = 'stringValue'; 110 | } 111 | } 112 | callback(null); 113 | } 114 | } 115 | AsObjects.asObjects = AsObjects.make; 116 | AsObjects.make.Constructor = AsObjects; 117 | 118 | module.exports = AsObjects; 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stream-csv-as-json [![NPM version][npm-img]][npm-url] 2 | 3 | [npm-img]: https://img.shields.io/npm/v/stream-csv-as-json.svg 4 | [npm-url]: https://npmjs.org/package/stream-csv-as-json 5 | 6 | `stream-csv-as-json` is a micro-library of node.js stream components with minimal dependencies for creating custom data processors oriented on processing huge CSV files while requiring a minimal memory footprint. It can parse CSV files far exceeding available memory. Even individual primitive data items can be streamed piece-wise. Streaming SAX-inspired event-based API is included as well. 7 | 8 | `stream-csv-as-json` is a companion project for [stream-json](https://www.npmjs.com/package/stream-json) and it is meant to be used with its filters, streamers and general infrastructure. 9 | 10 | Available components: 11 | 12 | * Streaming JSON [Parser](https://github.com/uhop/stream-csv-as-json/wiki/Parser). 13 | * It produces a SAX-like token stream. 14 | * Optionally it can pack individual values. 15 | * The [main module](https://github.com/uhop/stream-csv-as-json/wiki/Main-module) provides helpers to create a parser. 16 | * Essentials: 17 | * [AsObjects](https://github.com/uhop/stream-csv-as-json/wiki/AsObjects) uses the first row as a list of field names and produces rows as shallow objects with named fields. 18 | * [Stringer](https://github.com/uhop/stream-csv-as-json/wiki/Stringer) converts a token stream back into a JSON text stream. 19 | 20 | All components are meant to be building blocks to create flexible custom data processing pipelines. They can be extended and/or combined with custom code. They can be used together with [stream-chain](https://www.npmjs.com/package/stream-chain) and [stream-json](https://www.npmjs.com/package/stream-json) to simplify data processing. 21 | 22 | This toolkit is distributed under New BSD license. 23 | 24 | ## Introduction 25 | 26 | ```js 27 | const {chain} = require('stream-chain'); 28 | 29 | const {parser} = require('stream-csv-as-json'); 30 | const {asObjects} = require('stream-csv-as-json/AsObjects'); 31 | const {StreamValues} = require('stream-json/streamers/StreamValues'); 32 | 33 | const fs = require('fs'); 34 | const zlib = require('zlib'); 35 | 36 | const pipeline = chain([ 37 | fs.createReadStream('sample.csv.gz'), 38 | zlib.createGunzip(), 39 | parser(), 40 | asObjects(), 41 | streamValues(), 42 | data => { 43 | const value = data.value; 44 | return value && value.department === 'accounting' ? data : null; 45 | } 46 | ]); 47 | 48 | let counter = 0; 49 | pipeline.on('data', () => ++counter); 50 | pipeline.on('end', () => 51 | console.log(`The accounting department has ${counter} employees.`)); 52 | ``` 53 | 54 | See the full documentation in [Wiki](https://github.com/uhop/stream-csv-as-json/wiki). 55 | 56 | ## Installation 57 | 58 | ```bash 59 | npm install --save stream-csv-as-json 60 | # or: 61 | yarn add stream-csv-as-json 62 | ``` 63 | 64 | ## Use 65 | 66 | The whole library is organized as a set of small components, which can be combined to produce the most effective pipeline. All components are based on node.js [streams](http://nodejs.org/api/stream.html), and [events](http://nodejs.org/api/events.html). They implement all required standard APIs. It is easy to add your own components to solve your unique tasks. 67 | 68 | The code of all components is compact and simple. Please take a look at their source code to see how things are implemented, so you can produce your own components in no time. 69 | 70 | Obviously, if a bug is found, or a way to simplify existing components, or new generic components are created, which can be reused in a variety of projects, don't hesitate to open a ticket, and/or create a pull request. 71 | 72 | ## License 73 | 74 | BSD-3-Clause 75 | 76 | ## Release History 77 | 78 | - 1.0.5 *technical release: updated deps.* 79 | - 1.0.4 *technical release: updated deps.* 80 | - 1.0.3 *technical release: updated deps.* 81 | - 1.0.2 *technical release: updated deps, updated license's year.* 82 | - 1.0.1 *minor readme tweaks, added TypeScript typings and the badge.* 83 | - 1.0.0 *the first 1.0 release.* 84 | -------------------------------------------------------------------------------- /tests/test_parser.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const unit = require('heya-unit'); 4 | 5 | const fs = require('fs'); 6 | const path = require('path'); 7 | const zlib = require('zlib'); 8 | 9 | const {chain} = require('stream-chain'); 10 | 11 | const Assembler = require('stream-json/Assembler'); 12 | 13 | const Parser = require('../Parser'); 14 | const ReadString = require('./ReadString'); 15 | 16 | unit.add(module, [ 17 | function test_parser_low_level(t) { 18 | const async = t.startAsync('test_parser_low_level'); 19 | 20 | const input = ',x,\r\n"""\r\n"', 21 | result = [], 22 | expected = [ 23 | {name: 'startArray'}, 24 | {name: 'startString'}, 25 | {name: 'endString'}, 26 | {name: 'stringValue', value: ''}, 27 | {name: 'startString'}, 28 | {name: 'stringChunk', value: 'x'}, 29 | {name: 'endString'}, 30 | {name: 'stringValue', value: 'x'}, 31 | {name: 'startString'}, 32 | {name: 'endString'}, 33 | {name: 'stringValue', value: ''}, 34 | {name: 'endArray'}, 35 | {name: 'startArray'}, 36 | {name: 'startString'}, 37 | {name: 'stringChunk', value: '"'}, 38 | {name: 'stringChunk', value: '\r\n'}, 39 | {name: 'endString'}, 40 | {name: 'stringValue', value: '"\r\n'}, 41 | {name: 'endArray'} 42 | ]; 43 | 44 | const pipeline = new ReadString(input).pipe(new Parser()); 45 | 46 | pipeline.on('data', token => result.push(token)); 47 | pipeline.on('end', () => { 48 | eval(t.TEST('t.unify(result, expected)')); 49 | async.done(); 50 | }); 51 | }, 52 | function test_parser_simple(t) { 53 | const async = t.startAsync('test_parser_simple'); 54 | 55 | const input = '1,,"",""""\r\n2,three,"four",five\r\n', 56 | expected = [['1', '', '', '"'], ['2', 'three', 'four', 'five']], 57 | result = []; 58 | 59 | const pipeline = new ReadString(input).pipe(new Parser()); 60 | const asm = new Assembler(); 61 | 62 | pipeline.on('data', token => { 63 | asm[token.name] && asm[token.name](token.value); 64 | asm.done && result.push(asm.current); 65 | }); 66 | pipeline.on('end', () => { 67 | eval(t.TEST('t.unify(result, expected)')); 68 | async.done(); 69 | }); 70 | }, 71 | function test_parser_tricky_values(t) { 72 | const async = t.startAsync('test_parser_tricky_values'); 73 | 74 | const pipeline = chain([fs.createReadStream(path.resolve(__dirname, 'sample.csv.gz')), zlib.createGunzip(), new Parser()]); 75 | 76 | let rows = 0, 77 | empties = 0, 78 | valuesWithCrLf = 0, 79 | valuesWithDoubleQuote = 0; 80 | pipeline.on('data', data => { 81 | if (data.name === 'startArray') { 82 | ++rows; 83 | return; 84 | } 85 | if (data.name === 'stringValue') { 86 | const value = data.value; 87 | if (value) { 88 | if (/[\u000A\u000D]/.test(value)) { 89 | ++valuesWithCrLf; 90 | } 91 | if (/"/.test(value)) { 92 | ++valuesWithDoubleQuote; 93 | } 94 | } else { 95 | ++empties; 96 | } 97 | } 98 | }); 99 | pipeline.on('end', () => { 100 | eval(t.TEST('rows === 18126')); 101 | eval(t.TEST('empties === 159203')); 102 | eval(t.TEST('valuesWithCrLf === 1')); 103 | eval(t.TEST('valuesWithDoubleQuote === 1')); 104 | async.done(); 105 | }); 106 | }, 107 | function test_parser_separator(t) { 108 | const async = t.startAsync('test_parser_separator'); 109 | 110 | const input = '1||""|"""|"\r\n2|three|"four\r\n"|five\r\n', 111 | expected = [['1', '', '', '"|'], ['2', 'three', 'four\r\n', 'five']], 112 | result = []; 113 | 114 | const pipeline = new ReadString(input).pipe(new Parser({separator: '|'})); 115 | const asm = new Assembler(); 116 | 117 | pipeline.on('data', token => { 118 | asm[token.name] && asm[token.name](token.value); 119 | asm.done && result.push(asm.current); 120 | }); 121 | pipeline.on('end', () => { 122 | eval(t.TEST('t.unify(result, expected)')); 123 | async.done(); 124 | }); 125 | } 126 | ]); 127 | -------------------------------------------------------------------------------- /tests/test_stringer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const unit = require('heya-unit'); 4 | 5 | const {Writable} = require('stream'); 6 | 7 | const {parser} = require('../Parser'); 8 | const {stringer} = require('../Stringer'); 9 | const ReadString = require('./ReadString'); 10 | 11 | const toCsv = array => array.map(row => row.map(value => (/[,\r\n\"]/.test(value) ? '"' + value.replace('"', '""') + '"' : value)).join(',')).join('\r\n'); 12 | 13 | unit.add(module, [ 14 | function test_stringer_simple(t) { 15 | const async = t.startAsync('test_stringer_simple'); 16 | 17 | let result = ''; 18 | 19 | const table = [['1', '', '', '"'], ['2', 'three', 'four', 'five']], 20 | expected = '"1","","",""""\r\n"2","three","four","five"\r\n', 21 | pipeline = new ReadString(toCsv(table)) 22 | .pipe(parser()) 23 | .pipe(stringer()) 24 | .pipe( 25 | new Writable({ 26 | write(chunk, encoding, callback) { 27 | result += chunk.toString(); 28 | callback(null); 29 | } 30 | }) 31 | ); 32 | 33 | pipeline.on('finish', () => { 34 | eval(t.TEST('result === expected')); 35 | async.done(); 36 | }); 37 | }, 38 | function test_stringer_simple_quoted(t) { 39 | const async = t.startAsync('test_stringer_simple_quoted'); 40 | 41 | let result = ''; 42 | 43 | const table = [['1', ',', '', '"'], ['2', 'three\r\n', 'four', 'five']], 44 | expected = '"1",",","",""""\r\n"2","three\r\n","four","five"\r\n', 45 | pipeline = new ReadString(toCsv(table)) 46 | .pipe(parser()) 47 | .pipe(stringer()) 48 | .pipe( 49 | new Writable({ 50 | write(chunk, encoding, callback) { 51 | result += chunk.toString(); 52 | callback(null); 53 | } 54 | }) 55 | ); 56 | 57 | pipeline.on('finish', () => { 58 | eval(t.TEST('result === expected')); 59 | async.done(); 60 | }); 61 | }, 62 | function test_stringer_values(t) { 63 | const async = t.startAsync('test_stringer_simple'); 64 | 65 | let result = ''; 66 | 67 | const table = [['1', '', '', '"'], ['2', 'three', 'four', 'five']], 68 | expected = '1,,,""""\r\n2,three,four,five\r\n', 69 | pipeline = new ReadString(toCsv(table)) 70 | .pipe(parser()) 71 | .pipe(stringer({useValues: true})) 72 | .pipe( 73 | new Writable({ 74 | write(chunk, encoding, callback) { 75 | result += chunk.toString(); 76 | callback(null); 77 | } 78 | }) 79 | ); 80 | 81 | pipeline.on('finish', () => { 82 | eval(t.TEST('result === expected')); 83 | async.done(); 84 | }); 85 | }, 86 | function test_stringer_values_quoted(t) { 87 | const async = t.startAsync('test_stringer_values_quoted'); 88 | 89 | let result = ''; 90 | 91 | const table = [['1', ',', '', '"'], ['2', 'three\r\n', 'four', 'five']], 92 | expected = '1,",",,""""\r\n2,"three\r\n",four,five\r\n', 93 | pipeline = new ReadString(toCsv(table)) 94 | .pipe(parser()) 95 | .pipe(stringer({useValues: true})) 96 | .pipe( 97 | new Writable({ 98 | write(chunk, encoding, callback) { 99 | result += chunk.toString(); 100 | callback(null); 101 | } 102 | }) 103 | ); 104 | 105 | pipeline.on('finish', () => { 106 | eval(t.TEST('result === expected')); 107 | async.done(); 108 | }); 109 | }, 110 | function test_stringer_simple_quoted_separator(t) { 111 | const async = t.startAsync('test_stringer_simple_quoted_separator'); 112 | 113 | let result = ''; 114 | 115 | const table = [['1', '|', '', '"'], ['2', 'three\r\n', 'four', 'five']], 116 | expected = '"1"|"|"|""|""""\r\n"2"|"three\r\n"|"four"|"five"\r\n', 117 | pipeline = new ReadString(toCsv(table)) 118 | .pipe(parser()) 119 | .pipe(stringer({separator: '|'})) 120 | .pipe( 121 | new Writable({ 122 | write(chunk, encoding, callback) { 123 | result += chunk.toString(); 124 | callback(null); 125 | } 126 | }) 127 | ); 128 | 129 | pipeline.on('finish', () => { 130 | eval(t.TEST('result === expected')); 131 | async.done(); 132 | }); 133 | }, 134 | function test_stringer_values_quoted_separator(t) { 135 | const async = t.startAsync('test_stringer_values_quoted_separator'); 136 | 137 | let result = ''; 138 | 139 | const table = [['1', '|', '', '"'], ['2', 'three\r\n', 'four', 'five']], 140 | expected = '1|"|"||""""\r\n2|"three\r\n"|four|five\r\n', 141 | pipeline = new ReadString(toCsv(table)) 142 | .pipe(parser()) 143 | .pipe(stringer({useValues: true, separator: '|'})) 144 | .pipe( 145 | new Writable({ 146 | write(chunk, encoding, callback) { 147 | result += chunk.toString(); 148 | callback(null); 149 | } 150 | }) 151 | ); 152 | 153 | pipeline.on('finish', () => { 154 | eval(t.TEST('result === expected')); 155 | async.done(); 156 | }); 157 | }, 158 | function test_stringer_values_quoted_tabbed(t) { 159 | const async = t.startAsync('test_stringer_values_quoted_tabbed'); 160 | 161 | let result = ''; 162 | 163 | const table = [['1', '\t', '', '"', ''], ['2', 'three\r\n', 'four', 'five']], 164 | expected = '1\t"\t"\t\t""""\t\r\n2\t"three\r\n"\tfour\tfive\r\n', 165 | pipeline = new ReadString(toCsv(table)) 166 | .pipe(parser()) 167 | .pipe(stringer({useValues: true, separator: '\t'})) 168 | .pipe( 169 | new Writable({ 170 | write(chunk, encoding, callback) { 171 | result += chunk.toString(); 172 | callback(null); 173 | } 174 | }) 175 | ); 176 | 177 | pipeline.on('finish', () => { 178 | eval(t.TEST('result === expected')); 179 | async.done(); 180 | }); 181 | } 182 | ]); 183 | -------------------------------------------------------------------------------- /Parser.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const {Transform} = require('stream'); 4 | 5 | const patterns = { 6 | value: /^(?:\"|,|\n|\r|[\s\S])/, 7 | regularValue: /^(?:[^,\r\n]{1,256}|,|\n|\r)/, 8 | quotedValue: /^(?:[^\"]{1,256}|\")/, 9 | quotedContinuation: /^(?:\"|,|\n|\r)/ 10 | }; 11 | 12 | let noSticky = true; 13 | try { 14 | new RegExp('.', 'y'); 15 | noSticky = false; 16 | } catch (e) { 17 | // suppress 18 | } 19 | 20 | !noSticky && 21 | Object.keys(patterns).forEach(key => { 22 | let src = patterns[key].source.slice(1); // lop off ^ 23 | if (src.slice(0, 3) === '(?:' && src.slice(-1) === ')') { 24 | src = src.slice(3, -1); 25 | } 26 | patterns[key] = new RegExp(src, 'y'); 27 | }); 28 | 29 | class Parser extends Transform { 30 | static make(options) { 31 | return new Parser(options); 32 | } 33 | 34 | constructor(options) { 35 | super(Object.assign({}, options, {writableObjectMode: false, readableObjectMode: true})); 36 | 37 | this._packStrings = this._streamStrings = true; 38 | if (options) { 39 | 'packValues' in options && (this._packStrings = options.packValues); 40 | 'packStrings' in options && (this._packStrings = options.packStrings); 41 | 'streamValues' in options && (this._streamStrings = options.streamValues); 42 | 'streamStrings' in options && (this._streamStrings = options.streamStrings); 43 | } 44 | !this._packStrings && (this._streamStrings = true); 45 | 46 | this._separator = (options && options.separator) || ','; 47 | if (this._separator === ',') { 48 | this._patterns = patterns; 49 | } else { 50 | this._patterns = {}; 51 | const sep = this._separator.replace(/[#-.]|[[-^]|[?|{}]/g, '\\$&'), 52 | sepOr = '|' + sep + '|', 53 | sepNot = '[^' + sep; 54 | Object.keys(patterns).forEach(key => { 55 | this._patterns[key] = new RegExp(patterns[key].source.replace('|,|', sepOr).replace('[^,', sepNot), noSticky ? '' : 'y'); 56 | }); 57 | } 58 | 59 | this._buffer = ''; 60 | this._expect = 'value1'; 61 | this._expectLF = false; 62 | this._accumulator = ''; 63 | } 64 | 65 | _transform(chunk, encoding, callback) { 66 | this._buffer += chunk.toString(); 67 | this._processInput(callback); 68 | } 69 | 70 | _flush(callback) { 71 | switch (this._expect) { 72 | case 'quotedValue': 73 | return callback(new Error('Parser cannot parse input: expected a quoted value')); 74 | case 'value1': 75 | break; 76 | case 'value': 77 | if (this._streamStrings) { 78 | this.push({name: 'startString'}); 79 | this.push({name: 'endString'}); 80 | } 81 | this._packStrings && this.push({name: 'stringValue', value: ''}); 82 | this.push({name: 'endArray'}); 83 | break; 84 | default: 85 | this._streamStrings && this.push({name: 'endString'}); 86 | this._packStrings && this.push({name: 'stringValue', value: this._accumulator}); 87 | this.push({name: 'endArray'}); 88 | break; 89 | } 90 | callback(null); 91 | } 92 | 93 | _processInput(callback) { 94 | let match, 95 | value, 96 | index = 0; 97 | main: while (index < this._buffer.length) { 98 | switch (this._expect) { 99 | case 'value1': 100 | case 'value': 101 | this._patterns.value.lastIndex = index; 102 | match = this._patterns.value.exec(this._buffer); 103 | if (!match) { 104 | if (index < this._buffer.length) return callback(new Error('Parser cannot parse input: expected a value')); 105 | break main; // wait for more input 106 | } 107 | value = match[0]; 108 | this._expect === 'value1' && !(value === '\n' && this._expectLF) && this.push({name: 'startArray'}); 109 | switch (value) { 110 | case '"': 111 | this._streamStrings && this.push({name: 'startString'}); 112 | this._expect = 'quotedValue'; 113 | break; 114 | case '\n': 115 | if (this._expectLF) break; 116 | // intentional fall down 117 | case '\r': 118 | if (this._expect === 'value') { 119 | if (this._streamStrings) { 120 | this.push({name: 'startString'}); 121 | this.push({name: 'endString'}); 122 | } 123 | this._packStrings && this.push({name: 'stringValue', value: ''}); 124 | } 125 | this.push({name: 'endArray'}); 126 | this._expect = 'value1'; 127 | break; 128 | case this._separator: 129 | if (this._streamStrings) { 130 | this.push({name: 'startString'}); 131 | this.push({name: 'endString'}); 132 | } 133 | this._packStrings && this.push({name: 'stringValue', value: ''}); 134 | this._expect = 'value'; 135 | break; 136 | default: 137 | if (this._streamStrings) { 138 | this.push({name: 'startString'}); 139 | this.push({name: 'stringChunk', value}); 140 | } 141 | this._packStrings && (this._accumulator = value); 142 | this._expect = 'regularValue'; 143 | break; 144 | } 145 | this._expectLF = value === '\r'; 146 | if (noSticky) { 147 | this._buffer = this._buffer.slice(value.length); 148 | } else { 149 | index += value.length; 150 | } 151 | break; 152 | case 'regularValue': 153 | this._patterns.regularValue.lastIndex = index; 154 | match = this._patterns.regularValue.exec(this._buffer); 155 | if (!match) { 156 | if (index < this._buffer.length) return callback(new Error('Parser cannot parse input: a regular value')); 157 | break main; // wait for more input 158 | } 159 | value = match[0]; 160 | switch (value) { 161 | case this._separator: 162 | this._streamStrings && this.push({name: 'endString'}); 163 | if (this._packStrings) { 164 | this.push({name: 'stringValue', value: this._accumulator}); 165 | this._accumulator = ''; 166 | } 167 | this._expect = 'value'; 168 | break; 169 | case '\n': 170 | if (this._expectLF) break; 171 | // intentional fall down 172 | case '\r': 173 | this._streamStrings && this.push({name: 'endString'}); 174 | if (this._packStrings) { 175 | this.push({name: 'stringValue', value: this._accumulator}); 176 | this._accumulator = ''; 177 | } 178 | this.push({name: 'endArray'}); 179 | this._expect = 'value1'; 180 | break; 181 | default: 182 | this._streamStrings && this.push({name: 'stringChunk', value}); 183 | this._packStrings && (this._accumulator += value); 184 | break; 185 | } 186 | this._expectLF = value === '\r'; 187 | if (noSticky) { 188 | this._buffer = this._buffer.slice(value.length); 189 | } else { 190 | index += value.length; 191 | } 192 | break; 193 | case 'quotedValue': 194 | this._patterns.quotedValue.lastIndex = index; 195 | match = this._patterns.quotedValue.exec(this._buffer); 196 | if (!match) { 197 | if (index < this._buffer.length) return callback(new Error('Parser cannot parse input: expected a quoted value')); 198 | break main; // wait for more input 199 | } 200 | value = match[0]; 201 | if (value === '"') { 202 | this._expect = 'quotedContinuation'; 203 | } else { 204 | this._streamStrings && this.push({name: 'stringChunk', value}); 205 | this._packStrings && (this._accumulator += value); 206 | } 207 | if (noSticky) { 208 | this._buffer = this._buffer.slice(value.length); 209 | } else { 210 | index += value.length; 211 | } 212 | break; 213 | case 'quotedContinuation': 214 | this._patterns.quotedContinuation.lastIndex = index; 215 | match = this._patterns.quotedContinuation.exec(this._buffer); 216 | if (!match) { 217 | if (index < this._buffer.length) return callback(new Error("Parser cannot parse input: expected '\"', a separator, or EOL")); 218 | break main; // wait for more input 219 | } 220 | value = match[0]; 221 | if (value === '"') { 222 | this._streamStrings && this.push({name: 'stringChunk', value: '"'}); 223 | this._packStrings && (this._accumulator += '"'); 224 | this._expect = 'quotedValue'; 225 | } else { 226 | this._streamStrings && this.push({name: 'endString'}); 227 | if (this._packStrings) { 228 | this.push({name: 'stringValue', value: this._accumulator}); 229 | this._accumulator = ''; 230 | } 231 | if (value === this._separator) { 232 | this._expect = 'value'; 233 | } else { 234 | this.push({name: 'endArray'}); 235 | this._expect = 'value1'; 236 | } 237 | } 238 | this._expectLF = value === '\r'; 239 | if (noSticky) { 240 | this._buffer = this._buffer.slice(value.length); 241 | } else { 242 | index += value.length; 243 | } 244 | break; 245 | } 246 | } 247 | if (!noSticky && index) { 248 | this._buffer = this._buffer.slice(index); 249 | } 250 | callback(null); 251 | } 252 | } 253 | Parser.parser = Parser.make; 254 | Parser.make.Constructor = Parser; 255 | 256 | module.exports = Parser; 257 | --------------------------------------------------------------------------------