├── .gitattributes ├── .gitignore ├── .vscode └── launch.json ├── LICENSE ├── README.md ├── big.arrow ├── big.csv ├── csv-to-arrow-js.cpuprofile ├── index.js └── package.json /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | *.arrow filter=lfs diff=lfs merge=lfs -text 3 | *.cpuprofile filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (https://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # TypeScript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | # next.js build output 61 | .next 62 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "node", 9 | "request": "launch", 10 | "name": "Launch Program", 11 | "program": "${workspaceFolder}/index.js", 12 | "stopOnEntry": true 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Paul Taylor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # csv-to-arrow-js 2 | A proof of concept demo to transform CSV to Arrow in JS using [`csvtojson`](https://www.npmjs.com/package/csvtojson) and the new Arrow Builder stream APIs. 3 | 4 | See this PR for more information: https://github.com/apache/arrow/pull/4476 5 | 6 | ### Cloning 7 | 8 | This demo uses the file `big.csv` from the PapaParse examples. I've committed this file with [git large-file storage](https://git-lfs.github.com/). 9 | 10 | After installing `git-lfs`, run these commands to clone the repository and pull the large files: 11 | 12 | ```sh 13 | git clone https://github.com/trxcllnt/csv-to-arrow-js.git 14 | cd csv-to-arrow-js 15 | git lfs pull 16 | ``` 17 | 18 | If you can't install `git-lfs`, you can download the "Large file" from the [PapaParse demo page](https://www.papaparse.com/demo). 19 | 20 | ### Running 21 | 22 | ```sh 23 | # install the dependencies 24 | npm install 25 | # run the demo to convert big.csv to an Arrow RecordBatch stream. 26 | # The Arrow table is printed to the console with the `arrow2csv` utility 27 | npm start 28 | # or time how long it takes 29 | time node index.js > big.arrow 30 | ``` 31 | -------------------------------------------------------------------------------- /big.arrow: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5d0ec41fbdcd25a8654d014f3e2d96924b5c2b495267f3f1f4be557ec2ab66ec 3 | size 26650228 4 | -------------------------------------------------------------------------------- /big.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:223f010c44e8e91be9a55c82051e5886b8571b8601f7f00071d0ef45de7470c3 3 | size 50173280 4 | -------------------------------------------------------------------------------- /csv-to-arrow-js.cpuprofile: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0bd766f4c96625bde8e293eaeb0f73307888049601acab1069fee12ff64545de 3 | size 1058158 4 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const csvToJSON = require('csvtojson') 3 | const { AsyncIterable } = require('ix'); 4 | const { metrohash64 } = require('metrohash'); 5 | const { 6 | Bool, Utf8, Int64, Float64, Struct, Dictionary, Int32, 7 | Field, Builder, RecordBatch, RecordBatchWriter 8 | } = require('apache-arrow'); 9 | 10 | const csvToJSONStream = fs 11 | .createReadStream('./big.csv') 12 | .pipe(csvToJSON({}, { objectMode: true })); 13 | 14 | AsyncIterable.fromNodeStream(csvToJSONStream) 15 | // multicast the source CSV stream so we can share a single 16 | // underlying iterator between multiple consumers. 17 | .publish((JSONRows) => { 18 | return AsyncIterable.defer(async () => { 19 | // Determine the schema from the types of values present in the first row 20 | const rest = JSONRows.skip(1); 21 | const row0 = await JSONRows.first(); 22 | // This top-level Builder builds an Arrow StructVector, which is a 23 | // nested Vector that parents other Vectors, addressing them by 24 | // field name. The StructBuilder has built-in support for plucking 25 | // child values from arbitrary JS objects by key and writing them 26 | // into the child Vector Builders, which makes it the perfect type 27 | // to use to transpose a stream of JSON rows to a columnar layout 28 | const { type, ...otherBuilderOptions } = jsValueToArrowBuilderOptions(row0); 29 | const transform = Builder.throughAsyncIterable({ 30 | type, ...otherBuilderOptions, 31 | // flush chunks once their size grows beyond 64kb 32 | queueingStrategy: 'bytes', highWaterMark: 1 << 16, 33 | // null-value sentinels that will signify "null" slots 34 | nullValues: [null, undefined, 'n/a', 'NULL'], 35 | }); 36 | // Concatenate the first row with the rest of the rows, and 37 | // pipe them through the Arrow StructBuilder transform function 38 | return AsyncIterable.of(row0).concat(rest).pipe(transform); 39 | }); 40 | }) 41 | // Translate each Arrow StructVector chunk into a RecordBatch so it can be 42 | // flushed as an Arrow IPC Message by the RecordBatchStreamWriter transform stream 43 | .map((chunk) => RecordBatch.new(chunk.data.childData, chunk.type.children)) 44 | // Pipe each RecordBatch through the stream writer transform 45 | .pipe(RecordBatchWriter.throughNode()) 46 | // And finally, direct each Arrow IPC Message to stdout 47 | .pipe(process.stdout); 48 | 49 | 50 | // Naively translate JS values to their rough Arrow equivalents 51 | function jsValueToArrowBuilderOptions(value) { 52 | if (value) { 53 | switch (typeof value) { 54 | case 'bigint': 55 | return { type: new Int64() }; 56 | case 'boolean': 57 | return { type: new Bool() }; 58 | case 'number': 59 | return { type: new Float64() }; 60 | case 'string': 61 | return { type: new Dictionary(new Utf8(), new Int32()), dictionaryHashFunction: metrohash64 }; 62 | case 'object': 63 | 64 | const { childFields, childBuilderOptions } = Object.keys(value).reduce((memo, name) => { 65 | const { type, ...childBuilderOptions } = jsValueToArrowBuilderOptions(value[name]); 66 | if (type) { 67 | memo.childBuilderOptions.push(childBuilderOptions); 68 | memo.childFields.push(new Field(name, type, true)); 69 | } 70 | return memo; 71 | }, { childFields: [], childBuilderOptions: [] }); 72 | 73 | if (Array.isArray(value)) { 74 | return { type: new Struct(childFields), children: childBuilderOptions }; 75 | } 76 | 77 | return { 78 | type: new Struct(childFields), 79 | children: childBuilderOptions.reduce((children, childBuilderOptions, childIndex) => ({ 80 | ...children, [childFields[childIndex].name]: childBuilderOptions 81 | }), {}) 82 | }; 83 | } 84 | } 85 | return {}; 86 | } 87 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "csv-to-arrow-js", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "node index.js | arrow2csv" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "apache-arrow": "trxcllnt/apache-arrow-js", 13 | "csvtojson": "2.0.8", 14 | "ix": "2.5.3", 15 | "metrohash": "2.6.0" 16 | } 17 | } 18 | --------------------------------------------------------------------------------