├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── cli.js ├── fixtures ├── a.csv ├── all_hour.csv ├── all_hour.json ├── b.csv ├── blob.txt └── c.csv ├── images └── diff.png ├── index.js ├── package.json └── test.js /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # Compiled binary addons (http://nodejs.org/api/addons.html) 20 | build/Release 21 | 22 | # Dependency directory 23 | # Commenting this out is preferred by some people, see 24 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git- 25 | node_modules 26 | 27 | # Users Environment Variables 28 | .lock-wscript 29 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | 3 | node_js: 4 | - "0.12" 5 | 6 | install: 7 | - npm install 8 | 9 | before_script: 10 | - npm install npm 11 | 12 | script: 13 | - npm test 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Karissa McKelvey 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 
9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # knead 2 | 3 | ![dat](http://img.shields.io/badge/Development%20sponsored%20by-dat-green.svg?style=flat) 4 | 5 | ``` 6 | $ npm install -g knead 7 | ``` 8 | 9 | ##### Resolve data table conflicts one step at a time 10 | 11 | *Not Small Data.* When data conflicts are sufficiently troubling to resolve manually. 12 | 13 | ![diff](/images/diff.png) 14 | 15 | You can send two files into `knead`. You'll see a [daff](https://github.com/paulfitz/daff) for each chunk, with a prompt to keep the changes or not. Changes that are kept will be written to the given resolved file. 
#!/usr/bin/env node

// Command-line entry point for knead.
//
//   knead -
//     pipes stdin through knead and on to stdout
//   knead <base-file> <changed-file> <resolved-file> [--format] [--limit]
//     diffs the two input files and appends the merged output to <resolved-file>

var argv = require('minimist')(process.argv.slice(2))
var detect = require('detect-data-stream')
var formatData = require('format-data')
var diff = require('sorted-diff-stream')
var fs = require('fs')
var knead = require('./')

if (argv._.length !== 3 && argv._[0] !== '-') {
  usage()
  // Only an explicit request for help is a success; anything else is a usage error.
  process.exit(argv.help ? 0 : 1)
}

/**
 * Print the one-line synopsis, including the three required positional
 * arguments.
 */
function usage () {
  console.log('knead <base-file> <changed-file> <resolved-file> [--format csv,ndjson,json] [--limit <num>]')
}

/**
 * Equality callback handed to sorted-diff-stream: two rows are equal when
 * their JSON serialisations are identical.
 *
 * @param {*} a - row from the base file
 * @param {*} b - row from the changed file
 * @param {function(Error, boolean)} cb - receives (null, areEqual)
 */
function jsonEquals (a, b, cb) {
  cb(null, JSON.stringify(a) === JSON.stringify(b))
}

/**
 * Report a stream failure (e.g. a missing input file) and exit non-zero,
 * instead of crashing on an unhandled 'error' event.
 *
 * @param {Error} err
 */
function fail (err) {
  console.error(err.message)
  process.exit(1)
}

/**
 * Open a data file for reading and pipe it through detect-data-stream.
 * The error handler goes on the file stream itself because pipe() does not
 * forward errors downstream.
 *
 * @param {string} path - file to read
 * @returns {Stream} the detect() stream fed by the file
 */
function openInput (path) {
  var source = fs.createReadStream(path)
  source.on('error', fail)
  return source.pipe(detect())
}

var opts = {
  limit: argv.limit,
  strategy: 'rows'
}

if (argv._[0] === '-') {
  knead(process.stdin, opts).pipe(process.stdout)
} else {
  var localPath = argv._[0]
  var remotePath = argv._[1]
  var outPath = argv._[2]
  var format = argv.format || 'csv'

  if (fs.existsSync(outPath)) {
    console.log(outPath, 'exists. Appending to end of file.')
  } else {
    console.log('Creating new file', outPath)
  }

  // flags: 'a' appends, so an existing resolved file is never truncated.
  var outStream = fs.createWriteStream(outPath, {flags: 'a'})
  outStream.on('error', fail)

  var diffStream = diff(openInput(localPath), openInput(remotePath), jsonEquals)
  var kneadStream = knead(diffStream, opts).pipe(formatData(format))

  // pipe() respects backpressure and ends outStream once kneadStream ends,
  // which is what lets 'finish' fire below.
  kneadStream.pipe(outStream)

  // Writable streams emit 'finish' (not 'end') once everything is flushed.
  outStream.on('finish', function () {
    process.exit()
  })
}
Alaska",earthquake 8 | 2014-04-30T02:47:44.100Z,38.819,-122.7952,3.3,0.8,Md,12,75.6,0.00898315,0.02,nc,nc72212211,2014-04-30T03:17:09.173Z,"5km NW of The Geysers, California",earthquake 9 | 2014-04-30T02:46:11.100Z,37.9812,-122.054,14.7,2.1,Md,35,126,0.09881468,0.09,nc,nc72212206,2014-04-30T03:38:06.391Z,"1km E of Pacheco, California",earthquake 10 | 2014-04-30T02:44:49.000Z,61.3482,-151.3611,48.5,1.7,ml,,,,0.92,ak,ak11246285,2014-04-30T02:54:49.809Z,"73km N of Nikiski, Alaska",earthquake 11 | -------------------------------------------------------------------------------- /fixtures/all_hour.json: -------------------------------------------------------------------------------- 1 | {"time":"2014-04-30T03:34:57.000Z","latitude":"60.0366","longitude":"-141.2214","depth":"14.6","mag":"1.4","magType":"ml","nst":"","gap":"","dmin":"","rms":"1.51","net":"ak","id":"ak11246293","updated":"2014-04-30T03:39:27.956Z","place":"67km E of Cape Yakataga, Alaska","type":"earthquake"} 2 | {"time":"2014-04-30T03:16:54.860Z","latitude":"33.9233322","longitude":"-117.9376678","depth":"0.81","mag":"2.4","magType":"ml","nst":"71","gap":"51","dmin":"0.02487","rms":"0.35","net":"ci","id":"ci37218696","updated":"2014-04-30T03:35:24.239Z","place":"1km SE of La Habra, California","type":"earthquake"} 3 | {"time":"2014-04-30T03:03:09.000Z","latitude":"61.126","longitude":"-149.7035","depth":"28.2","mag":"1","magType":"ml","nst":"","gap":"","dmin":"","rms":"0.75","net":"ak","id":"ak11246291","updated":"2014-04-30T03:09:51.716Z","place":"14km SE of Anchorage, Alaska","type":"earthquake"} 4 | {"time":"2014-04-30T02:57:51.800Z","latitude":"37.3798","longitude":"-122.1912","depth":"4.5","mag":"1.3","magType":"Md","nst":"8","gap":"104.4","dmin":"0.01796631","rms":"0.02","net":"nc","id":"nc72212216","updated":"2014-04-30T03:28:05.271Z","place":"2km SSE of Ladera, California","type":"earthquake"} 5 | 
{"time":"2014-04-30T02:51:26.000Z","latitude":"60.2062","longitude":"-147.7298","depth":"0","mag":"1.8","magType":"ml","nst":"","gap":"","dmin":"","rms":"0.1","net":"ak","id":"ak11246289","updated":"2014-04-30T02:54:55.916Z","place":"82km SE of Whittier, Alaska","type":"earthquake"} 6 | {"time":"2014-04-30T02:48:54.000Z","latitude":"60.4899","longitude":"-149.4879","depth":"28.5","mag":"1.4","magType":"ml","nst":"","gap":"","dmin":"","rms":"0.54","net":"ak","id":"ak11246287","updated":"2014-04-30T02:54:51.149Z","place":"36km N of Bear Creek, Alaska","type":"earthquake"} 7 | {"time":"2014-04-30T02:47:44.100Z","latitude":"38.819","longitude":"-122.7952","depth":"3.3","mag":"0.8","magType":"Md","nst":"12","gap":"75.6","dmin":"0.00898315","rms":"0.02","net":"nc","id":"nc72212211","updated":"2014-04-30T03:17:09.173Z","place":"5km NW of The Geysers, California","type":"earthquake"} 8 | {"time":"2014-04-30T02:46:11.100Z","latitude":"37.9812","longitude":"-122.054","depth":"14.7","mag":"2.1","magType":"Md","nst":"35","gap":"126","dmin":"0.09881468","rms":"0.09","net":"nc","id":"nc72212206","updated":"2014-04-30T03:38:06.391Z","place":"1km E of Pacheco, California","type":"earthquake"} 9 | {"time":"2014-04-30T02:44:49.000Z","latitude":"61.3482","longitude":"-151.3611","depth":"48.5","mag":"1.7","magType":"ml","nst":"","gap":"","dmin":"","rms":"0.92","net":"ak","id":"ak11246285","updated":"2014-04-30T02:54:49.809Z","place":"73km N of Nikiski, Alaska","type":"earthquake"} -------------------------------------------------------------------------------- /fixtures/b.csv: -------------------------------------------------------------------------------- 1 | key,name 2 | 1,Max -------------------------------------------------------------------------------- /fixtures/blob.txt: -------------------------------------------------------------------------------- 1 | i am a blob 2 | -------------------------------------------------------------------------------- /fixtures/c.csv: 
-------------------------------------------------------------------------------- 1 | key,name 2 | 1,MAX -------------------------------------------------------------------------------- /images/diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/okdistribute/knead/0c73a58caa9795cd9bed9f9a7a07b402539c5794/images/diff.png -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var Batcher = require('byte-stream') 2 | var manualMergeStream = require('manual-merge-stream') 3 | 4 | module.exports = function (diffStream, opts, merge) { 5 | if (!opts) { 6 | opts = {} 7 | } 8 | var limit = (opts.limit || 1) * 2 9 | var batchStream = Batcher(limit) 10 | var opts = { 11 | vizFn: opts.vizFn, 12 | merge: merge 13 | } 14 | 15 | return diffStream.pipe(batchStream).pipe(manualMergeStream(opts)) 16 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "knead", 3 | "version": "3.2.1", 4 | "main": "index.js", 5 | "description": "Resolve data table conflicts one step at a time.", 6 | "scripts": { 7 | "test": "standard && tape test.js" 8 | }, 9 | "bin": { 10 | "knead": "cli.js" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "https://github.com/karissa/knead.git" 15 | }, 16 | "keywords": [ 17 | "diff", 18 | "merge", 19 | "streaming", 20 | "big", 21 | "visual", 22 | "data", 23 | "tabular", 24 | "dat", 25 | "csv" 26 | ], 27 | "author": "Karissa McKelvey (http://karissamck.com/)", 28 | "license": "BSD", 29 | "bugs": { 30 | "url": "https://github.com/karissa/knead/issues" 31 | }, 32 | "homepage": "https://github.com/karissa/knead", 33 | "dependencies": { 34 | "byte-stream": "^2.1.0", 35 | "detect-data-stream": "^1.0.0", 36 | "format-data": 
var test = require('tape')
var diff = require('sorted-diff-stream')
var DATA = require('conflict-spectrum')
var from = require('from2')
var diff2daff = require('diff2daff')
var diffs2string = require('diffs-to-string')

var knead = require('./')

var TABLES = DATA[0].json

/**
 * Returns `data` unchanged.
 *
 * NOTE(review): the {key, value} wrappers built by map() are thrown away, so
 * the rows reach the differ un-keyed. The first test's column assertions
 * rely on that, so confirm the intent before "fixing" it.
 */
function keyData (data) {
  data.map(function (row, position) {
    return {key: position, value: row}
  })
  return data
}

/**
 * Equality callback for sorted-diff-stream: rows match when their JSON
 * serialisations are identical.
 */
function jsonEquals (a, b, cb) {
  cb(null, JSON.stringify(a) === JSON.stringify(b))
}

/**
 * Diff stream over the second and third tables of the first
 * conflict-spectrum dataset.
 */
function diffTables () {
  var older = from.obj(keyData(TABLES[1]))
  var newer = from.obj(keyData(TABLES[2]))
  return diff(older, newer, jsonEquals)
}

test('knead from sorted-diff-stream using daff-stream', function (t) {
  var opts = {
    limit: 3,
    vizFn: diff2daff
  }

  knead(diffTables(), opts, function (diffs, visual, push, next) {
    var firstPair = diffs[0]
    var olderCols = Object.keys(firstPair[0])
    var newerCols = Object.keys(firstPair[1])

    t.deepEquals(olderCols, ['country', 'capital'])
    t.deepEquals(newerCols, ['country', 'code', 'capital'])
    t.same(typeof next, 'function')
    t.end()
  })
})

test('knead from sorted-diff-stream using the simple differ', function (t) {
  var opts = {
    limit: 10
  }

  knead(diffTables(), opts, function (tables, visual, push, next) {
    console.log(tables)
    console.log(visual)
    t.end()
  })
})