├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── QUESTIONS.md ├── create.js ├── datavore │ ├── README.md │ ├── create.js │ ├── datavore │ │ └── index.js │ ├── table.query.sum.js │ └── table.query.sum.multi.js ├── groupby.sum.js ├── groupby_sum.js ├── median.js ├── sum.js ├── where.js └── where_sum.js ├── lib ├── frame-index.js ├── frame.js ├── stream-reducers.js └── test.js ├── package.json ├── requirements.txt └── test ├── argmax.js ├── count.js ├── create.js ├── data ├── binary_matrix.py ├── generate.js ├── generate.py ├── groupby.count │ ├── operation.py │ └── small.json ├── groupby.mean │ ├── operation.py │ └── small.json ├── groupby.sum │ ├── operation.py │ └── small.json ├── groupby.where.sum │ ├── operation.py │ └── small.json ├── mean │ ├── operation.py │ └── small.json ├── where.in.sum │ ├── operation.py │ └── small.json └── where.mean │ ├── operation.py │ └── small.json ├── groupby.count.js ├── groupby.js ├── groupby.mean.js ├── groupby.sum.js ├── groupby.where.sum.js ├── join.js ├── mean.js ├── ungroup.js ├── where.in.sum.js ├── where.js └── where.mean.js /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | __pycache__ 3 | *.pyc 4 | 5 | test/data/*/00* 6 | 7 | # Logs 8 | logs 9 | *.log 10 | npm-debug.log* 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | 17 | # Directory for instrumented libs generated by jscoverage/JSCover 18 | lib-cov 19 | 20 | # Coverage directory used by tools like istanbul 21 | coverage 22 | 23 | # nyc test coverage 24 | .nyc_output 25 | 26 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 27 | .grunt 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (http://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules 37 | jspm_packages 38 | 39 | # Optional npm cache directory 40 | .npm 41 | 42 | # Optional REPL history 43 | .node_repl_history 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Dataship 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # frame 2 | 3 | a DataFrame for Javascript. 4 | 5 | _crunch numbers in Node or the Browser_ 6 | 7 | ## features 8 | * Interactive performance (<100ms) on millions of rows 9 | * Syntax similar to SQL and Pandas 10 | * Compatible with `PapaParse` and [`BabyParse`](https://github.com/Rich-Harris/BabyParse) 11 | 12 | ## examples 13 | Parse the [Iris](https://vincentarelbundock.github.io/Rdatasets/datasets.html) 14 | dataset (with [`BabyParse`](https://github.com/Rich-Harris/BabyParse)) and create a `Frame` from the result. 15 | 16 | ```javascript 17 | var baby = require('babyparse'), 18 | Frame = require('frame'); 19 | 20 | // parse the csv file 21 | config = {"header" :true, "dynamicTyping" : true, "skipEmptyLines" : true}; 22 | iris = baby.parseFiles('iris.csv', config).data; 23 | 24 | // create a frame from the parsed results 25 | frame = new Frame(iris); 26 | ``` 27 | ### groupby 28 | 29 | Group on `Species` and find the average value (`mean`) for `Sepal.Length`. 30 | ```javascript 31 | g = frame.groupby("Species"); 32 | g.mean("Sepal.Length"); 33 | ``` 34 | ```json 35 | { "virginica": 6.58799, "versicolor": 5.9360, "setosa": 5.006 } 36 | ``` 37 | Using the same grouping, find the average value for `Sepal.Width`. 38 | ```javascript 39 | g.mean("Sepal.Width"); 40 | ``` 41 | ```json 42 | { "virginica": 2.97399, "versicolor": 2.770, "setosa": 3.4279 } 43 | ``` 44 | 45 | ### where 46 | Filter by `Species` value `virginica` then find the average. 47 | ```javascript 48 | f = frame.where("Species", "virginica"); 49 | f.mean("Sepal.Length"); 50 | ``` 51 | ```json 52 | 6.58799 53 | ``` 54 | Get the number of rows that match the filter. 55 | ```javascript 56 | f.count(); 57 | ``` 58 | ```json 59 | 50 60 | ``` 61 | Columns can also be accessed directly (with the filter applied). 62 | ```javascript 63 | f["Species"] 64 | ``` 65 | ```javascript 66 | ["virginica", "virginica", "virginica", ..., "virginica"] 67 | ``` 68 | # tests 69 | Hundreds of tests verify correctness on millions of data points (against a Pandas reference). 70 | 71 | `npm run data && npm run test` 72 | 73 | # benchmarks 74 | `npm run bench` 75 | 76 | typical performance on one million rows 77 | 78 | operation | time 79 | ----------|------ 80 | `groupby` | 54ms 81 | `where` | 29ms 82 | `sum` | 5ms 83 | 84 | # design goals and inspiration 85 | 86 | * compatibility with [feather](https://github.com/wesm/feather) 87 | 88 | ## interface 89 | 90 | * pandas 91 | * R 92 | * Linq 93 | * rethinkDB 94 | * Matlab 95 | 96 | ## performance 97 | 98 | * [datavore](https://github.com/StanfordHCI/datavore) 99 | -------------------------------------------------------------------------------- /benchmark/QUESTIONS.md: -------------------------------------------------------------------------------- 1 | 2 | ### Why are my dv results not consistent with their benchmark webpage? 3 | because it slows down with consecutive runs, dropping to a quarter of initial performance. 4 | 5 | ok 1 table.query.sum: 1000000x3 6 | # 12.019 MFlops/sec ±16.51% n = 15 µ = 83ms : [0.022,0.02225,0.0935,0.092,0.0925,0.0925,0.09325,0.092,0.093,0.09275,0.09275,0.09225,0.09275,0.092,0.0925] 7 | 8 | 9 | ### Can I make Frame as fast as dv by encoding the strings? 10 | likely it will give a 3x speedup. 
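
A minimal sketch of what that encoding step could look like. The `encode` helper below is illustrative, not part of the library; `Frame` already accepts integer codes plus a decode key as its `(columns, keys)` arguments, as documented in `lib/frame.js`, and the column names in the usage comment are hypothetical.

```javascript
// map each distinct string to a small integer code and keep the
// lookup array as the decode key for that column
function encode(values){
	var key = [], codes = new Array(values.length);
	var seen = Object.create(null); // avoid prototype-key collisions
	for(var i = 0; i < values.length; i++){
		var v = values[i];
		if(!(v in seen)){
			seen[v] = key.length;
			key.push(v);
		}
		codes[i] = seen[v];
	}
	return {"codes" : codes, "key" : key};
}

// usage (hypothetical column):
// var e = encode(["a", "b", "a", "c"]);
// var frame = new Frame({"group-col" : e.codes}, {"group-col" : e.key});
```

The integer vs. string benchmark numbers below give a rough sense of the gap such an encoding would close.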
11 | 12 | #### integers 13 | ok 1 groupby.sum: 1000000x3 14 | # 13.952 MFlops/sec ±2.01% n = 29 µ = 72ms : [0.0645,0.0625,0.0635,0.062,0.0775,0.0725,0.0715,0.073,0.0725,0.073,0.0735,0.0745,0.0725,0.0725,0.074,0.073,0.0715,0.077,0.072,0.071,0.072,0.0715,0.0725,0.0735,0.0725,0.073,0.0745,0.0715,0.0735] 15 | 16 | #### strings 17 | ok 1 groupby.sum: 1000000x3 18 | # 4.120 MFlops/sec ±3.74% n = 14 µ = 243ms : [0.239,0.235,0.232,0.234,0.267,0.267,0.24,0.235,0.235,0.236,0.279,0.233,0.233,0.233] 19 | 20 | ### Is the FrameIndex.reduce faster than dv.query, when Frame.groupby has already been run? 21 | yes, but not quite faster than the ultra-fast first two runs of dv.query 22 | 23 | ok 1 sum: 1000000x3 24 | # 23.298 MFlops/sec ±1.40% n = 34 µ = 43ms : [0.037333333333333336,0.042333333333333334,0.042333333333333334,0.042666666666666665,0.042333333333333334,0.042666666666666665,0.042333333333333334,0.043333333333333335,0.042666666666666665,0.04566666666666667,0.044000000000000004,0.042666666666666665,0.044333333333333336,0.042666666666666665,0.04733333333333333,0.043666666666666666,0.042,0.042333333333333334,0.043000000000000003,0.042333333333333334,0.042666666666666665,0.042,0.044000000000000004,0.042666666666666665,0.042666666666666665,0.042666666666666665,0.042,0.042333333333333334,0.043000000000000003,0.048666666666666664,0.041666666666666664,0.041666666666666664,0.042333333333333334,0.043000000000000003] 25 | 26 | ### Can I make FrameIndex.reduce faster than the ultra-fast dv.query? 27 | 28 | try: 29 | 1. reproducing results 30 | 2. removing the function call 31 | 32 | 33 | ### Is the dv setup longer? 34 | 35 | ### Why is dv faster initially? 36 | -------------------------------------------------------------------------------- /benchmark/create.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | function createSetup(N, K, useStrings){ 6 | return function(event){ 7 | // generate data 8 | this.groupCol = gen.Array.int(N, K); 9 | this.valueCol = gen.Array.int(N, 100); 10 | 11 | // map to strings 12 | if(useStrings) 13 | this.groupCol = this.groupCol.map(i => ["a", "b", "c"][i]); 14 | 15 | }; 16 | } 17 | 18 | function test(){ 19 | 20 | // create frame 21 | var columnDict = { 22 | "group-col" : this.groupCol, 23 | "reduce-col" : this.valueCol 24 | }; 25 | 26 | this.frame = new Frame(columnDict); 27 | } 28 | 29 | var N = 100000, 30 | K = 3; 31 | 32 | var name = "create: " + N + "x" + K; 33 | benchtap(name, {"operations": N}, createSetup(N, K), test); 34 | 35 | 36 | name += " (strings)"; 37 | benchtap(name, {"operations": N}, createSetup(N, K, true), test); 38 | 39 | 40 | 41 | var N = 1000000; 42 | 43 | name = "create: " + N + "x" + K; 44 | benchtap(name, {"operations": N}, createSetup(N, K), test); 45 | 46 | 47 | name += " (strings)"; 48 | benchtap(name, {"operations": N}, createSetup(N, K, true), test); 49 | -------------------------------------------------------------------------------- /benchmark/datavore/README.md: -------------------------------------------------------------------------------- 1 | Comparison benchmarks of similar operations for [datavore](https://github.com/StanfordHCI/datavore) 2 | -------------------------------------------------------------------------------- /benchmark/datavore/create.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = 
require('../../generate'), 3 | dv = require('./datavore'); 4 | 5 | function createSetup(N, K, useStrings){ 6 | return function(event){ 7 | 8 | this.groupCol = gen.Array.int(N, K); 9 | this.valueCol = gen.Array.int(N, 100); 10 | 11 | if(useStrings) 12 | this.groupCol = this.groupCol.map(i => ["a", "b", "c"][i]); 13 | }; 14 | } 15 | 16 | function test(){ 17 | 18 | // create table 19 | var table = dv.table([ 20 | {name:"group-col", type:"nominal", values:this.groupCol}, 21 | {name:"reduce-col", type:"numeric", values:this.valueCol} 22 | ]); 23 | } 24 | 25 | 26 | // 1 hundred thousand data points/rows 27 | var N = 100000, 28 | K = 3; 29 | 30 | var name = "create: " + N + "x" + K; 31 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test); 32 | 33 | name += " (strings)"; 34 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test); 35 | 36 | // 1 million data points/rows 37 | var N = 1000000; 38 | 39 | name = "create: " + N + "x" + K; 40 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test); 41 | 42 | name += " (strings)"; 43 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test); 44 | -------------------------------------------------------------------------------- /benchmark/datavore/datavore/index.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | module.exports = (function() { 4 | /** 5 | * The top-level Datavore namespace. All public methods and fields should be 6 | * registered on this object. Note that core Datavore source is surrounded by an 7 | * anonymous function, so any other declared globals will not be visible outside 8 | * of core methods. This also allows multiple versions of Datavore to coexist, 9 | * since each version will see their own dv namespace. 10 | * 11 | * @namespace The top-level Datavore namespace, dv. 
12 | */ 13 | var dv = {version: "1.0.0"}; 14 | 15 | dv.array = function(n) { 16 | var a = Array(n); 17 | for (var i = n; --i >= 0;) { a[i] = 0; } 18 | return a; 19 | } 20 | 21 | // -- RANDOM NUMBER GENERATORS ------------------------------------------------ 22 | 23 | dv.rand = {}; 24 | 25 | dv.rand.uniform = function(min, max) { 26 | min = min || 0; 27 | max = max || 1; 28 | var delta = max - min; 29 | return function() { 30 | return min + delta * Math.random(); 31 | } 32 | }; 33 | 34 | dv.rand.integer = function(a, b) { 35 | if (b === undefined) { 36 | b = a; 37 | a = 0; 38 | } 39 | return function() { 40 | return a + Math.max(0, Math.floor(b * (Math.random() - 0.001))); 41 | } 42 | } 43 | 44 | dv.rand.normal = function(mean, stdev) { 45 | mean = mean || 0; 46 | stdev = stdev || 1; 47 | var next = undefined; 48 | return function() { 49 | var x = 0, y = 0, rds, c; 50 | if (next !== undefined) { 51 | x = next; 52 | next = undefined; 53 | return x; 54 | } 55 | do { 56 | x = Math.random() * 2 - 1; 57 | y = Math.random() * 2 - 1; 58 | rds = x * x + y * y; 59 | } while (rds == 0 || rds > 1); 60 | c = Math.sqrt(-2 * Math.log(rds) / rds); // Box-Muller transform 61 | next = mean + y * c * stdev; 62 | return mean + x * c * stdev; 63 | } 64 | } 65 | // -- DATA TABLE -------------------------------------------------------------- 66 | 67 | dv.type = { 68 | nominal: "nominal", 69 | ordinal: "ordinal", 70 | numeric: "numeric", 71 | unknown: "unknown" 72 | }; 73 | 74 | dv.table = function(input) 75 | { 76 | var table = []; // the data table 77 | 78 | table.addColumn = function(name, values, type, iscolumn) { 79 | type = type || dv.type.unknown; 80 | var compress = (type === dv.type.nominal || type === dv.type.ordinal); 81 | var vals = values; 82 | 83 | if (compress && !iscolumn) { 84 | vals = []; 85 | vals.lut = code(values); 86 | for (var i = 0, map=dict(vals.lut); i < values.length; ++i) { 87 | vals.push(map[values[i]]); 88 | } 89 | vals.get = function(idx) { return this.lut[this[idx]]; } 90 | } else if (!iscolumn) { 91 | vals.get = function(idx) { return this[idx]; } 92 | } 93 | vals.name = name; 94 | vals.index = table.length; 95 | vals.type = type; 96 | 97 | table.push(vals); 98 | table[name] = vals; 99 | }; 100 | 101 | table.removeColumn = function(col) { 102 | col = table[col] || null; 103 | if (col != null) { 104 | delete table[col.name]; 105 | table.splice(col.index, 1); 106 | } 107 | return col; 108 | }; 109 | 110 | table.rows = function() { return table[0] ? table[0].length : 0; }; 111 | 112 | table.cols = function() { return table.length; }; 113 | 114 | table.get = function(col, row) { return table[col].get(row); } 115 | 116 | table.dense_query = function(q) { 117 | var tab = q.where ? 
table.where(q.where) : table; 118 | var dims = [], sz = [1], hasDims = q.dims; 119 | if (hasDims) { 120 | sz = []; 121 | for (i = 0; i < q.dims.length; ++i) { 122 | var dim = q.dims[i], type = typeof dim; 123 | if (type === "string" || type === "number") { 124 | col = tab[dim]; 125 | } else if (dim.array) { 126 | col = dim.array(tab[dim.value]); 127 | } 128 | dims.push(col); 129 | sz.push(col.lut.length); 130 | } 131 | } 132 | 133 | var vals = q.vals, // aggregate query operators 134 | C = sz.reduce(function(a,b) { return a * b; }, 1), // cube cardinality 135 | N = tab[0].length, p, col, v, name, expr, // temp vars 136 | cnt, sum, ssq, min, max, // aggregate values 137 | _cnt, _sum, _ssq, _min, _max, // aggregate flags 138 | ctx = {}, emap = {}, exp = [], lut, // aggregate state vars 139 | i = 0, j = 0, k = 0, l = 0, idx = 0, len, slen = sz.length; // indices 140 | 141 | // Identify Requested Aggregates 142 | var star = false; 143 | for (i = 0; i < vals.length; ++i) { 144 | var req = vals[i].init(); 145 | for (expr in req) { 146 | if (expr == "*") { 147 | req[expr].map(function(func) { 148 | ctx[func] = dv.array(C); 149 | }); 150 | star = true; 151 | } else { 152 | idx = tab[expr].index; 153 | name = tab[expr].name; 154 | req[expr].map(function(func) { 155 | ctx[func + "_" + name] = (ctx[func + "_" + idx] = dv.array(C)); 156 | }); 157 | if (!emap[idx]) { 158 | emap[idx] = true; 159 | exp.push(idx); 160 | } 161 | } 162 | } 163 | } 164 | if (exp.length == 0 && star) { exp.push(-1) }; 165 | 166 | // Compute Cube Index Coefficients 167 | for (i = 0, p = [1]; i < slen; ++i) { 168 | p.push(p[i] * sz[i]); 169 | } 170 | 171 | // Execute Query: Compute Aggregates 172 | for (j = 0, len = exp.length; j < len; ++j) { 173 | expr = exp[j]; 174 | cnt = ctx["cnt"]; _cnt = (cnt && j==0); 175 | sum = ctx["sum_" + expr]; _sum = (sum !== undefined); 176 | ssq = ctx["ssq_" + expr]; _ssq = (ssq !== undefined); 177 | min = ctx["min_" + expr]; _min = (min !== undefined); 178 | max = ctx["max_" + expr]; _max = (max !== undefined); 179 | col = tab[expr]; 180 | outer: 181 | for (i = 0; i < N; ++i) { 182 | for (idx = 0, k = 0; k < slen; ++k) { 183 | // compute cube index 184 | l = (hasDims ? dims[k][i] : 0); 185 | if (l < 0) continue outer; 186 | idx += p[k] * l; 187 | } 188 | if (col) { v = col[i]; } 189 | if (_cnt) { cnt[idx] += 1; } 190 | if (_sum) { sum[idx] += v; } 191 | if (_ssq) { ssq[idx] += v * v; } 192 | if (_min && v < min[idx]) { min[idx] = v; } 193 | if (_max && v > max[idx]) { max[idx] = v; } 194 | } 195 | } 196 | 197 | // Generate Results 198 | var result = [], stride = 1, s, val, code = q.code || false; 199 | for (i = 0; i < dims.length; ++i) { 200 | col = []; 201 | lut = dims[i].lut; 202 | s = sz[i]; 203 | val = 0; 204 | for (j = 0, k = 0, c = -1; j < C; ++j, ++k) { 205 | if (k == stride) { k = 0; val = (val + 1) % s; } 206 | col[j] = code ? val : lut[val]; 207 | } 208 | stride *= s; 209 | col.unique = lut.length; 210 | result.push(col); 211 | } 212 | vals.map(function(op) { result.push(op.done(ctx)); }); 213 | return result; 214 | }; 215 | 216 | table.query = table.dense_query; 217 | 218 | table.sparse_query = function(q) { 219 | var tab = q.where ? 
table.where(q.where) : table; 220 | var dims = [], sz = [1], hasDims = q.dims; 221 | if (hasDims) { 222 | sz = []; 223 | for (i=0; i max[idx])) { 306 | max[idx] = v; 307 | } 308 | } 309 | } 310 | 311 | // Generate Results 312 | var rr = vals.map(function(op) { return op.done(ctx); }); 313 | var keys = rr[0]; 314 | if (rr.length > 1) { 315 | keys = {}; 316 | rr.forEach(function(o) { for (var k in o) keys[k] = 1; }); 317 | } 318 | var result = dims.map(function() { return []; }); 319 | vals.forEach(function() { result.push([]); }); 320 | len = dims.length; 321 | 322 | for (k in keys) { 323 | // map index i to dimensional indices 324 | var nn = C, uv, div; 325 | for (i = k, j = len; --j >= 0;) { 326 | uv = dims[j].lut.length; 327 | div = ~~(nn / uv); 328 | result[j].push(dims[j].lut[~~(i / div)]); 329 | i = i % div; 330 | nn = ~~(nn / uv); 331 | } 332 | for (j = 0; j < rr.length; ++j) { 333 | val = rr[j][k]; 334 | result[len + j].push(val === undefined ? 0 : val); 335 | } 336 | } 337 | return result; 338 | }; 339 | 340 | table.where = function(f) { 341 | var nrows = table.rows(), 342 | ncols = table.cols(); 343 | 344 | // initialize result table 345 | var result = dv.table([]); 346 | for (var i = 0; i < ncols; ++i) { 347 | result.push([]); 348 | result[i].name = table[i].name; 349 | result[i].type = table[i].type; 350 | result[i].index = i; 351 | result[table[i].name] = result[i]; 352 | if (table[i].lut) { result[i].lut = table[i].lut; } 353 | } 354 | 355 | // populate result table 356 | for (var row = 0, j = -1; row < nrows; ++row) { 357 | if (f(table, row)) { 358 | for (i = 0, ++j; i < ncols; ++i) { 359 | result[i][j] = table[i][row]; 360 | } 361 | } 362 | } 363 | return result; 364 | }; 365 | 366 | /** @private */ 367 | function code(a) { 368 | var c = [], d = {}, v; 369 | for (var i=0, len=a.length; i maxv) { maxv = val; } 517 | } 518 | if (minb) { minv = Math.floor(minv / step) * step; } 519 | if (maxb) { maxv = Math.ceil(maxv / step) * step; } 520 | } 521 | // compute index array 522 | var a = [], lut = (a.lut = []), 523 | range = (maxv - minv), unique = Math.ceil(range / step); 524 | for (i = 0; i < N; ++i) { 525 | val = values[i]; 526 | if (val < minv || val > maxv) { a.push(-1); } 527 | else if (val == maxv) { a.push(unique - 1); } 528 | else { a.push(~~((values[i] - minv) / step)); } 529 | } 530 | for (i = 0; i < unique; ++i) { 531 | // multiply b/c adding garners round-off error 532 | lut.push(minv + i * step); 533 | } 534 | return a; 535 | }; 536 | op.step = function(x) { 537 | if (x === undefined) return step; 538 | step = x; 539 | return op; 540 | }; 541 | op.min = function(x) { 542 | if (x === undefined) return min; 543 | min = x; 544 | return op; 545 | }; 546 | op.max = function(x) { 547 | if (x === undefined) return max; 548 | max = x; 549 | return op; 550 | }; 551 | op.value = expr; 552 | return op; 553 | }; 554 | 555 | dv.quantile = function(expr, n) { 556 | function search(array, value) { 557 | var low = 0, high = array.length - 1; 558 | while (low <= high) { 559 | var mid = (low + high) >> 1, midValue = array[mid]; 560 | if (midValue < value) { low = mid + 1; } 561 | else if (midValue > value) { high = mid - 1; } 562 | else { return mid; } 563 | } 564 | var i = -low - 1; 565 | return (i < 0) ? 
(-i - 1) : i; 566 | } 567 | 568 | var op = {}; 569 | op.array = function(values) { 570 | // get sorted data values 571 | var i, d = values.sorted; 572 | if (!d) { 573 | var cmp; 574 | if (values.type && values.type === "numeric") { 575 | cmp = function(a,b) { return a - b; } 576 | } else { 577 | cmp = function(a,b) { return a < b ? -1 : a > b ? 1 : 0; } 578 | } 579 | values.sorted = (d = values.slice().sort(cmp)); 580 | } 581 | // compute quantile boundaries 582 | var q = [d[0]], a = [], lut = (a.lut = []); 583 | for (i = 1; i <= n; ++i) { 584 | q[i] = d[~~(i * (d.length - 1) / n)]; 585 | lut.push(i - 1); 586 | } 587 | // iterate through data and label quantiles 588 | for (i = 0; i < values.length; ++i) { 589 | a.push(Math.max(0, search(q, values[i]) - 1)); 590 | } 591 | return a; 592 | } 593 | op.bins = function(x) { 594 | if (x === undefined) return n; 595 | n = x; 596 | return op; 597 | } 598 | op.value = expr; 599 | return op; 600 | }; 601 | 602 | return dv; })(); 603 | -------------------------------------------------------------------------------- /benchmark/datavore/table.query.sum.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../../generate'), 3 | dv = require('./datavore'); 4 | 5 | function createSetup(N, K, useStrings){ 6 | return function(event){ 7 | 8 | var groupCol = gen.Array.int(N, K); 9 | var valueCol = gen.Array.int(N, 100); 10 | 11 | if(useStrings) 12 | groupCol = groupCol.map(i => ["a", "b", "c"][i]); 13 | 14 | // create table 15 | this.table = dv.table([ 16 | {name:"group-col", type:"nominal", values:groupCol}, 17 | {name:"reduce-col", type:"numeric", values:valueCol} 18 | ]); 19 | 20 | // generate data 21 | /* 22 | this.table = dv.table(); 23 | this.table.addColumn("group-col", groupCol, dv.type.nominal); 24 | this.table.addColumn("reduce-col", valueCol, dv.type.numeric); 25 | */ 26 | 27 | }; 28 | } 29 | 30 | function test(){ 31 | 32 | var result = this.table.query({ 33 | "dims" : [0], 34 | "vals" : [dv.sum("reduce-col")] 35 | }); 36 | } 37 | 38 | 39 | var N = 100000, 40 | K = 3; 41 | 42 | var name = "table.query.sum: " + N + "x" + K; 43 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test); 44 | 45 | name += " (strings)"; 46 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test); 47 | 48 | 49 | var N = 1000000; 50 | 51 | name = "table.query.sum: " + N + "x" + K; 52 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test); 53 | 54 | name += " (strings)"; 55 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test); 56 | -------------------------------------------------------------------------------- /benchmark/datavore/table.query.sum.multi.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../../generate'), 3 | dv = require('./datavore'); 4 | 5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 6 | 7 | function createSetup(N, K, M, useStrings){ 8 | return function(event){ 9 | 10 | var columns = [ 11 | {"name" : "value", "type":"numeric", "values": gen.Array.int(N, 100)} 12 | ]; 13 | var names = []; 14 | for (var m = 0; m < M; m++){ 15 | var name = "id_"+m; 16 | var column = { 17 | "name" : name, 18 | "type" : "ordinal", 19 | "values" : gen.Array.int(N, K) 20 | }; 21 | 22 | // map to strings 23 | if(useStrings){ 24 | column.values = column.values.map(i => STRINGS[i]); 25 | } 26 | 27 | 
columns.push(column); 28 | 29 | 30 | names[m] = name; 31 | } 32 | 33 | // create table 34 | this.table = dv.table(columns); 35 | 36 | // generate data 37 | /* 38 | this.table = dv.table(); 39 | this.table.addColumn("group-col", groupCol, dv.type.nominal); 40 | this.table.addColumn("reduce-col", valueCol, dv.type.numeric); 41 | */ 42 | 43 | }; 44 | } 45 | 46 | 47 | function test(){ 48 | 49 | //var names = this.names; 50 | //"dims" : ["id_0", "id_1"], 51 | var result = this.table.query({ 52 | "dims" : ["id_0", "id_1", "id_2", "id_3"], 53 | "vals" : [dv.sum("value")] 54 | }); 55 | } 56 | 57 | 58 | var N = 100000, 59 | K = 3, 60 | M = 4; 61 | 62 | var name = "table.query.sum.multi: " + N + "x" + K + "x" + M; 63 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), test); 64 | 65 | name += " (strings)"; 66 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), test); 67 | 68 | 69 | var N = 1000000; 70 | 71 | name = "table.query.sum.multi: " + N + "x" + K + "x" + M; 72 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), test); 73 | 74 | name += " (strings)"; 75 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), test); 76 | -------------------------------------------------------------------------------- /benchmark/groupby.sum.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | 6 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 7 | 8 | // create a frame for multidimensional groupby 9 | function createSetup(N, K, M, useStrings){ 10 | return function(event){ 11 | // generate data 12 | var columns = { 13 | "value" : gen.Array.int(N, 100) 14 | }; 15 | var names = []; 16 | for (var m = 0; m < M; m++){ 17 | var name = "id_"+m; 18 | columns[name] = gen.Array.int(N, K); 19 | 20 | // map to strings 21 | if(useStrings){ 22 | columns[name] = columns[name].map(i => STRINGS[i]); 23 | } 24 | 25 | names[m] = name; 26 | } 27 | //console.log(names); 28 | 29 | 30 | // create frame 31 | this.frame = new Frame(columns); 32 | }; 33 | } 34 | 35 | 36 | var N = 100000, 37 | K = 3, 38 | M = 1; 39 | 40 | var groups = []; 41 | for(var i = 0; i < M; i ++) groups.push("id_"+i); 42 | 43 | var name = "groupby.sum: " + N + "x" + K + "x" + M; 44 | 45 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), function(){ 46 | 47 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]); 48 | var group = this.frame.groupby(groups); 49 | var result = group.sum("value"); 50 | }); 51 | 52 | name += " (strings)"; 53 | 54 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), function(){ 55 | 56 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]); 57 | var group = this.frame.groupby(groups); 58 | var result = group.sum("value"); 59 | }); 60 | 61 | N = 1000000; 62 | name = "groupby.sum: " + N + "x" + K + "x" + M; 63 | 64 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), function(){ 65 | 66 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]); 67 | var group = this.frame.groupby(groups); 68 | var result = group.sum("value"); 69 | }); 70 | 71 | name += " (strings)"; 72 | 73 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), function(){ 74 | 75 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]); 76 | var group = this.frame.groupby(groups); 77 | var result = group.sum("value"); 78 | }); 79 | 
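// M = 2: the setup now generates two id columns (id_0, id_1);
// `groups` (built above while M = 1) still lists only id_0, so the groupby below runs on that single column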
80 | M = 2; 81 | 82 | name = "groupby.sum: " + N + "x" + K + "x" + M; 83 | 84 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), function(){ 85 | 86 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]); 87 | var group = this.frame.groupby(groups); 88 | var result = group.sum("value"); 89 | }); 90 | 91 | K = 200; 92 | M = 2; 93 | name = "groupby.sum: " + N + "x" + K + "x" + M; 94 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), function(){ 95 | 96 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]); 97 | var group = this.frame.groupby(groups); 98 | var result = group.sum("value"); 99 | }); 100 | /* 101 | var tests = [ 102 | 0 103 | ]; 104 | 105 | var RTOL = 1e-05, // 1e-05 106 | ATOL = 1e-12; // 1e-12 107 | 108 | var dataDirectory = 'test/data/sum/', 109 | testFile = 'small.json'; 110 | 111 | var floader = require('floader'), 112 | dtest = require('../lib/test'); 113 | 114 | floader.load(dataDirectory + testFile, function(err, config){ 115 | 116 | var suite = JSON.parse(config); 117 | 118 | for(var j = 0; j < tests.length; j++){ 119 | 120 | var i = tests[j]; 121 | var prefix = String("0000" + (i + 1)).slice(-4); 122 | 123 | // directory containing matrix data files for current test 124 | var directory = dataDirectory + prefix + '/'; 125 | 126 | var test = suite[i]; 127 | 128 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 129 | var types = test.id.map(function(spec, i){ return spec['type'];}); 130 | 131 | var value_names = ["value_0"]; 132 | var value_types = [test.value[0].type]; 133 | 134 | var N = test.N; // number of rows 135 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 136 | 137 | var testName = "groupby.summulti: " + N + " x " + "(" + distincts.join(", ") + ")" 138 | //tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 139 | 140 | //var name = "groupby.sum.multi: " + N + "x" + K + "x" + M; 141 | 142 | benchtap(testName, {"operations" : 2*N}, 143 | createSetup(directory, names, types, value_names, value_types), 144 | function(event){ 145 | 146 | var g = this.frame.groupbymulti(names); 147 | var actual = g.summulti(value_names[0]); 148 | 149 | event.resolve(); 150 | }); 151 | } 152 | }); 153 | 154 | var OUT_FILENAME = "out.json"; 155 | 156 | function createSetup(directory, id_names, id_types, value_names, value_types){ 157 | return function(event){ 158 | 159 | var self = this; 160 | var names = id_names.concat(value_names); 161 | var types = id_types.concat(value_types); 162 | 163 | // which columns require a key file? 
164 | var key_names = id_names.filter(function(item, i){ 165 | return id_types[i] in dtest.string_types 166 | }); 167 | var key_types = id_types.filter(function(item, i){ 168 | return item in dtest.string_types 169 | }); 170 | 171 | console.log(directory); 172 | // load columns from files 173 | dtest.load(directory, names, types, function(err, columns){ 174 | 175 | if(err) return console.log(err); 176 | 177 | console.log("running setup."); 178 | // load key files 179 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 180 | 181 | floader.load(directory + OUT_FILENAME, function(err, out){ 182 | var expected = JSON.parse(out); 183 | 184 | var column_set = {}; 185 | for (var i = 0; i < names.length; i++){ 186 | var name = names[i]; 187 | var column = columns[i]; 188 | column_set[name] = column; 189 | } 190 | // keys map a small set of integers to other things (like strings) 191 | // they're a very simple form of fixed length coding 192 | var key_set = {}; 193 | for (var i = 0; i < keys.length; i++){ 194 | var name = key_names[i]; 195 | var key = keys[i]; 196 | key_set[name] = key; 197 | } 198 | 199 | self.frame = new Frame(column_set, key_set); 200 | 201 | event.resolve(); 202 | 203 | }); 204 | 205 | }); 206 | }); 207 | }; 208 | } 209 | */ 210 | -------------------------------------------------------------------------------- /benchmark/groupby_sum.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 6 | /* 7 | N - number of rows 8 | K - number of distinct values in id columns 9 | M - number of id columns 10 | */ 11 | function createSetup(N, K, M, useStrings){ 12 | return function(event){ 13 | // generate data 14 | var columns = { 15 | "value" : gen.Array.int(N, 100) 16 | }; 17 | var names = []; 18 | for (var m = 0; m < M; m++){ 19 | var name = "id_"+m; 20 | columns[name] = gen.Array.int(N, K); 21 | 22 | // map to strings 23 | if(useStrings){ 24 | columns[name] = columns[name].map(i => STRINGS[i]); 25 | } 26 | 27 | names[m] = name; 28 | } 29 | 30 | // create frame 31 | this.frame = new Frame(columns); 32 | // group on all id columns 33 | this.group = this.frame.groupby(names); 34 | }; 35 | } 36 | 37 | var N = 100000, 38 | K = 3, 39 | M = 1; 40 | 41 | var name = "sum: " + N + "x" + K + "x" + M; 42 | 43 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 44 | var result = this.group.sum("value"); 45 | }); 46 | 47 | /* 48 | name += " (strings)"; 49 | 50 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 51 | var result = this.group.reduce("reduce-col"); 52 | }); 53 | */ 54 | 55 | 56 | var N = 1000000; 57 | 58 | name = "sum: " + N + "x" + K + "x" + M; 59 | 60 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 61 | var result = this.group.sum("value"); 62 | }); 63 | 64 | M = 2; 65 | 66 | name = "sum: " + N + "x" + K + "x" + M; 67 | 68 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 69 | var result = this.group.sum("value"); 70 | }); 71 | 72 | /* 73 | name += " (strings)"; 74 | 75 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 76 | var result = this.group.reduce("reduce-col"); 77 | }); 78 | */ 79 | 80 | K = 200; 81 | M = 2; 82 | 83 | var name = "sum: " + N + "x" + K + "x" + M; 84 | 85 | benchtap(name, {"operations": N}, 
createSetup(N, K, M), function(){ 86 | var result = this.group.sum("value"); 87 | }); 88 | -------------------------------------------------------------------------------- /benchmark/median.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 6 | /* 7 | N - number of rows 8 | K - number of distinct values in id columns 9 | M - number of id columns 10 | */ 11 | function createSetup(N, K, M, useStrings){ 12 | return function(event){ 13 | // generate data 14 | var columns = { 15 | "value" : gen.Array.int(N, 100) 16 | }; 17 | var names = []; 18 | for (var m = 0; m < M; m++){ 19 | var name = "id_"+m; 20 | columns[name] = gen.Array.int(N, K); 21 | 22 | // map to strings 23 | if(useStrings){ 24 | columns[name] = columns[name].map(i => STRINGS[i]); 25 | } 26 | 27 | names[m] = name; 28 | } 29 | 30 | // create frame 31 | this.frame = new Frame(columns); 32 | }; 33 | } 34 | 35 | var N = 100000, 36 | K = 3, 37 | M = 1; 38 | 39 | var name = "median: " + N + "x" + K + "x" + M; 40 | 41 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 42 | var result = this.frame.median("value"); 43 | }); 44 | 45 | /* 46 | name += " (strings)"; 47 | 48 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 49 | var result = this.group.reduce("reduce-col"); 50 | }); 51 | */ 52 | 53 | 54 | var N = 1000000; 55 | 56 | name = "median: " + N + "x" + K + "x" + M; 57 | 58 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 59 | var result = this.frame.median("value"); 60 | }); 61 | 62 | /* 63 | name += " (strings)"; 64 | 65 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 66 | var result = this.group.reduce("reduce-col"); 67 | }); 68 | */ 69 | 70 | K = 200; 71 | M = 2; 72 | 73 | var name = "median: " + N + "x" + K + "x" + M; 74 | 75 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 76 | var result = this.frame.median("value"); 77 | }); 78 | -------------------------------------------------------------------------------- /benchmark/sum.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 6 | /* 7 | N - number of rows 8 | K - number of distinct values in id columns 9 | M - number of id columns 10 | */ 11 | function createSetup(N, K, M, useStrings){ 12 | return function(event){ 13 | // generate data 14 | var columns = { 15 | "value" : gen.Array.int(N, 100) 16 | }; 17 | var names = []; 18 | for (var m = 0; m < M; m++){ 19 | var name = "id_"+m; 20 | columns[name] = gen.Array.int(N, K); 21 | 22 | // map to strings 23 | if(useStrings){ 24 | columns[name] = columns[name].map(i => STRINGS[i]); 25 | } 26 | 27 | names[m] = name; 28 | } 29 | 30 | // create frame 31 | this.frame = new Frame(columns); 32 | }; 33 | } 34 | 35 | var N = 100000, 36 | K = 3, 37 | M = 1; 38 | 39 | var name = "sum: " + N + "x" + K + "x" + M; 40 | 41 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 42 | var result = this.frame.sum("value"); 43 | }); 44 | 45 | /* 46 | name += " (strings)"; 47 | 48 | benchtap(name, {"operations": N}, createSetup(N, K, true), 
function(){ 49 | var result = this.group.reduce("reduce-col"); 50 | }); 51 | */ 52 | 53 | 54 | var N = 1000000; 55 | 56 | name = "sum: " + N + "x" + K + "x" + M; 57 | 58 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 59 | var result = this.frame.sum("value"); 60 | }); 61 | 62 | /* 63 | name += " (strings)"; 64 | 65 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 66 | var result = this.group.reduce("reduce-col"); 67 | }); 68 | */ 69 | 70 | K = 200; 71 | M = 2; 72 | 73 | var name = "sum: " + N + "x" + K + "x" + M; 74 | 75 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 76 | var result = this.frame.sum("value"); 77 | }); 78 | -------------------------------------------------------------------------------- /benchmark/where.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 6 | /* 7 | N - number of rows 8 | K - number of distinct values in id columns 9 | M - number of id columns 10 | */ 11 | function createSetup(N, K, M, useStrings){ 12 | return function(event){ 13 | // generate data 14 | var columns = { 15 | "value" : gen.Array.int(N, 100) 16 | }; 17 | var names = []; 18 | for (var m = 0; m < M; m++){ 19 | var name = "id_"+m; 20 | columns[name] = gen.Array.int(N, K); 21 | 22 | // map to strings 23 | if(useStrings){ 24 | columns[name] = columns[name].map(i => STRINGS[i]); 25 | } 26 | 27 | names[m] = name; 28 | } 29 | 30 | // create frame 31 | this.frame = new Frame(columns); 32 | //this.frame.where(row => row["id_1"] == 1); 33 | //this.frame.where("id_1", id => id == 1); 34 | }; 35 | } 36 | 37 | var N = 100000, 38 | K = 3, 39 | M = 2; 40 | 41 | var name = "where.function: " + N + "x" + K + "x" + M; 42 | 43 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 44 | //var result = this.frame.where(row => row["id_0"] == 1); 45 | var result = this.frame.where("id_0", id => id == 1); 46 | }); 47 | 48 | /* 49 | name += " (strings)"; 50 | 51 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 52 | var result = this.group.reduce("reduce-col"); 53 | }); 54 | */ 55 | 56 | 57 | var N = 1000000; 58 | 59 | name = "where.function: " + N + "x" + K + "x" + M; 60 | 61 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 62 | //var result = this.frame.where(row => row["id_0"] == 1); 63 | var result = this.frame.where("id_0", id => id == 1); 64 | }); 65 | 66 | /* 67 | name += " (strings)"; 68 | 69 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 70 | var result = this.group.reduce("reduce-col"); 71 | }); 72 | */ 73 | 74 | N = 1000000; 75 | K = 200; 76 | M = 2; 77 | 78 | var name = "where.equal: " + N + "x" + K + "x" + M; 79 | 80 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 81 | //var result = this.frame.where(row => row["id_0"] == 1); 82 | var result = this.frame.where("id_0", 1); 83 | }); 84 | 85 | var name = "where.in: " + N + "x" + K + "x" + M; 86 | 87 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 88 | //var result = this.frame.where(row => row["id_0"] == 1); 89 | var result = this.frame.where("id_0", [0, 1, 3, 10, 12, 18, 101, 52, 23, 18, 7, 12, 154, 34, 117, 5]); 90 | }); 91 | -------------------------------------------------------------------------------- 
/benchmark/where_sum.js: -------------------------------------------------------------------------------- 1 | var benchtap = require('benchtap'), 2 | gen = require('../lib/test').generate, 3 | Frame = require('../lib/frame'); 4 | 5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]; 6 | /* 7 | N - number of rows 8 | K - number of distinct values in id columns 9 | M - number of id columns 10 | */ 11 | function createSetup(N, K, M, useStrings){ 12 | return function(event){ 13 | // generate data 14 | var columns = { 15 | "value" : gen.Array.int(N, 100) 16 | }; 17 | var names = []; 18 | for (var m = 0; m < M; m++){ 19 | var name = "id_"+m; 20 | columns[name] = gen.Array.int(N, K); 21 | 22 | // map to strings 23 | if(useStrings){ 24 | columns[name] = columns[name].map(i => STRINGS[i]); 25 | } 26 | 27 | names[m] = name; 28 | } 29 | 30 | // create frame 31 | this.frame = new Frame(columns); 32 | this.frame = this.frame.where("id_0", 0); 33 | //this.frame.where(row => row["id_1"] == 1); 34 | //this.frame.where("id_1", id => id == 1); 35 | }; 36 | } 37 | 38 | var N = 100000, 39 | K = 3, 40 | M = 2; 41 | 42 | var name = "where.sum: " + N + "x" + K + "x" + M; 43 | 44 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 45 | //var result = this.frame.where(row => row["id_0"] == 1); 46 | var result = this.frame.sum("value"); 47 | }); 48 | 49 | /* 50 | name += " (strings)"; 51 | 52 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 53 | var result = this.group.reduce("reduce-col"); 54 | }); 55 | */ 56 | 57 | 58 | var N = 1000000; 59 | 60 | name = "where.sum: " + N + "x" + K + "x" + M; 61 | 62 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 63 | //var result = this.frame.where(row => row["id_0"] == 1); 64 | var result = this.frame.sum("value"); 65 | }); 66 | 67 | /* 68 | name += " (strings)"; 69 | 70 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){ 71 | var result = this.group.reduce("reduce-col"); 72 | }); 73 | */ 74 | 75 | N = 1000000; 76 | K = 200; 77 | M = 2; 78 | 79 | var name = "where.sum: " + N + "x" + K + "x" + M; 80 | 81 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){ 82 | var result = this.frame.sum("value") 83 | }); 84 | -------------------------------------------------------------------------------- /lib/frame-index.js: -------------------------------------------------------------------------------- 1 | 2 | var reducers = require('./stream-reducers'); 3 | 4 | function isobject(obj){ return Object.prototype.toString.call(obj) === "[object Object]";} 5 | 6 | /* A heirarchical index for the Frame data structure, the result of a call to 7 | * Frame.groupby 8 | */ 9 | function FrameIndex(frame, index, groups){ 10 | this._frame = frame; 11 | this._index = index; 12 | this._groups = groups; 13 | } 14 | 15 | module.exports = FrameIndex; 16 | 17 | /* 18 | */ 19 | FrameIndex.prototype.columns = function(){ 20 | return this._frame.columns(); 21 | }; 22 | 23 | FrameIndex.prototype.groups = function(){ 24 | return this._groups; 25 | }; 26 | 27 | FrameIndex.prototype.count = function(){ 28 | var reduced = {}; 29 | var index = this._index; 30 | 31 | // depth first iteration 32 | var todo = [[index, reduced, 0]]; 33 | 34 | var result; 35 | while (todo.length > 0){ 36 | n = todo.pop();// object 37 | index = n[0]; 38 | result = n[1]; 39 | level = n[2]; 40 | 41 | var c, name; 42 | for(key in index){ // keys in object 43 | c = index[key]; 44 | name = 
this._groups[level]; 45 | 46 | // decode the key, if possible 47 | if(this._frame._keys && name in this._frame._keys){ 48 | decoder = this._frame._keys[name]; 49 | key = decoder[key]; 50 | } 51 | 52 | if(isobject(c)){ 53 | result[key] = {}; 54 | todo.push([c, result[key], level + 1]); 55 | } else { 56 | result[key] = c.length; // reduce 57 | } 58 | } 59 | } 60 | 61 | return reduced; 62 | 63 | }; 64 | 65 | FrameIndex.prototype.sum = function(selector){ 66 | return this.reduce(selector, reducers.sum); 67 | }; 68 | 69 | FrameIndex.prototype.reduce = function(selector, reducer, initial){ 70 | 71 | var reduced = {}; 72 | var index = this._index; 73 | var column = this._frame._cols[selector]; 74 | 75 | reducer = reducer || 76 | ((column.length > 0 && Object.prototype.toString.call(column[0]) == "[object Number]") ? 77 | reducers.sum : 78 | reducers.max); 79 | 80 | // depth first traversal 81 | var todo = [[index, reduced, 0]]; 82 | 83 | var result; 84 | while (todo.length > 0){ 85 | n = todo.pop();// object 86 | index = n[0]; 87 | result = n[1]; 88 | level = n[2]; 89 | 90 | var c, name; 91 | for(key in index){ // keys in object 92 | c = index[key]; 93 | group = this._groups[level]; 94 | 95 | // decode the key, if possible 96 | if(this._frame._keys && group in this._frame._keys){ 97 | decoder = this._frame._keys[group]; 98 | key = decoder[key]; 99 | } 100 | 101 | if(isobject(c)){ 102 | result[key] = {}; 103 | todo.push([c, result[key], level + 1]); 104 | } else { 105 | var indices = c; 106 | var value = indexreduce(column, indices, reducer, initial); 107 | 108 | result[key] = value; 109 | } 110 | } 111 | } 112 | 113 | return reduced; 114 | 115 | }; 116 | 117 | /* reduce a subset of an array given by a set of indices using a supplied 118 | reducing function. 
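If initial is omitted, the first indexed element seeds the reduction; an empty index set reduces to 0.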
119 | */ 120 | function indexreduce(column, indices, reducer, initial){ 121 | 122 | var start, 123 | value; 124 | 125 | // chose initial values and start of loop based on number of inputs and 126 | // supplied initial value 127 | if(initial !== void(0)){ 128 | start = 0; 129 | value = initial; 130 | } else if(indices.length > 0) { 131 | start = 1; 132 | value = column[indices[0]]; 133 | } else { 134 | start = 0; 135 | value = 0; 136 | } 137 | 138 | for(var i = start; i < indices.length; i++){ 139 | index = indices[i]; 140 | value = reducer(value, column[index], i); 141 | } 142 | 143 | return value; 144 | 145 | } 146 | -------------------------------------------------------------------------------- /lib/frame.js: -------------------------------------------------------------------------------- 1 | 2 | var reducers = require('./stream-reducers'); 3 | var BitArray = require('bit-array'); 4 | 5 | 6 | function isarray(obj){ return Object.prototype.toString.call(obj) === "[object Array]";} 7 | function isobject(obj){ return Object.prototype.toString.call(obj) === "[object Object]";} 8 | function isnumber(obj){ return Object.prototype.toString.call(obj) === "[object Number]";} 9 | function isinteger(num){ return num % 1 === 0;} 10 | function isstring(obj){ return Object.prototype.toString.call(obj) === "[object String]";} 11 | function isfunction(obj){ return Object.prototype.toString.call(obj) === "[object Function]"; } 12 | function isdate(obj){ return Object.prototype.toString.call(obj) === "[object Date]";} 13 | var typed_array_constructors = { 14 | "[object Int32Array]" : true, 15 | "[object Uint32Array]" : true, 16 | "[object Float32Array]" : true, 17 | "[object Int8Array]" : true, 18 | "[object Uint8Array]" : true, 19 | "[object Int16Array]" : true, 20 | "[object Uint16Array]" : true, 21 | "[object Float64Array]" : true 22 | } 23 | function istypedarray(obj){ 24 | var tag = Object.prototype.toString.call(obj); 25 | return tag in typed_array_constructors; 26 | } 27 | 28 | 29 | function shallowcopy(obj){ 30 | if(obj == null) return obj; // null or undefined 31 | 32 | var copy = {}; 33 | for(var key in obj){ 34 | copy[key] = obj[key]; 35 | } 36 | 37 | return copy; 38 | } 39 | 40 | //function isframe(obj){ return isarray(obj) && (obj.length == 0 || isobject(obj[0])); } 41 | 42 | 43 | /* A lightweight, high performance Columnar Data Store disguised as a Data Frame 44 | * 45 | * Interface similarity targets and inspiration: 46 | * pandas, R, Linq, rethinkDB, Matlab 47 | * 48 | * column names: 49 | * columns.values.tolist(), colnames(f), 50 | * 51 | * aggregation: 52 | * groupby, , , 53 | * 54 | * filtering: 55 | * 56 | * # References 57 | * https://github.com/StanfordHCI/datavore 58 | * http://vincentarelbundock.github.io/Rdatasets/datasets.html 59 | * https://galeascience.wordpress.com/2016/08/10/top-10-pandas-numpy-and-scipy-functions-on-github/ 60 | * https://github.com/visualfabriq/bquery/blob/master/bquery/khash.h 61 | * ## R 62 | * http://www.r-tutor.com/r-introduction/data-frame 63 | * https://www.datacamp.com/community/tutorials/15-easy-solutions-data-frame-problems-r#gs.ArNaS44 64 | * ## Pandas 65 | * http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html 66 | * http://chrisalbon.com/python/pandas_index_select_and_filter.html 67 | * ## Linq 68 | * https://msdn.microsoft.com/en-us/library/bb534304(v=vs.110).aspx?cs-save-lang=1&cs-lang=csharp#code-snippet-1 69 | */ 70 | /* Create a data frame object from some data, like the Pandas and R objects 71 | * of 
similar name. 72 | * 73 | * @examples 74 | * 75 | * // an array of row objects, like the output from babyparse and papaparse 76 | * 77 | * rows = 78 | [ 79 | { "name" : "Finn", "age" : 16, "title" : "Finn the Human"}, 80 | { "name" : "Jake", "age" : 32 , "title" : "Jake the Dog"}, 81 | { "name" : "Simon", "age" : 1043, "title" : "Ice King"}, 82 | { "name" : "Bonnibel", "age" : 827, "title" : "Princess Bubblegum"}, 83 | { "name" : "Marceline", "age" : 1004, "title" : "Marceline the Vampire Queen"} 84 | ]; 85 | * df = Frame(rows); 86 | * 87 | * // an object (dict) mapping column names to arrays of values 88 | * 89 | * columns = 90 | * { 91 | * "name" : ["Finn", "Jake", "Simon", "Bonnibel", "Marceline"], 92 | * "age" : [16, 32, 1043, 827, 1004], 93 | * "title" : ["Finn the Human", "Jake the Dog", "Ice King", "Princess Bubblegum", "Marceline the Vampire Queen"] 94 | * }; 95 | * 96 | * df = Frame(columns); 97 | * 98 | * // an optional keys argument allows string columns to be more compactly 99 | * // represented when duplicates are present 100 | * 101 | * columns = 102 | * { 103 | * "name" : [0, 1, 2, 3, 4], 104 | * "age" : [16, 32, 1043, 827, 1004], 105 | * "title" : [0, 1, 2, 3, 4] 106 | * }; 107 | * 108 | * keys = { 109 | * "name" : ["Finn", "Jake", "Simon", "Bonnibel", "Marceline"], 110 | * "title" : ["Finn the Human", "Jake the Dog", "Ice King", "Princess Bubblegum", "Marceline the Vampire Queen"] 111 | * } 112 | * 113 | * df = Frame(columns, keys); 114 | * 115 | */ 116 | function Frame(data, keys, index, groups, filters){ 117 | // f.constructor.name return "Frame" 118 | if(!(this instanceof Frame)) return new Frame(data, keys, index, groups, filters); 119 | 120 | if(Symbol && Symbol.toStringTag) this[Symbol.toStringTag] = 'Frame'; 121 | 122 | 123 | // TODO: deep copy index 124 | if(index){ 125 | Object.defineProperty(this, "_index", { 126 | "enumerable" : false, 127 | "value" : index 128 | }); 129 | } 130 | 131 | // was a filters argument provided? 132 | if(filters){ 133 | // yes, construct a single filter from the values 134 | var filter; 135 | for(key in filters){ 136 | if(filter == null){ 137 | filter = filters[key].copy(); 138 | } else { 139 | filter.and(filters[key]); 140 | } 141 | } 142 | // copy of all defined filters 143 | Object.defineProperty(this, "_filters", { 144 | "enumerable" : false, 145 | "value" : filters 146 | }); 147 | // single filter produced from combining all filters 148 | Object.defineProperty(this, "_filter", { 149 | "enumerable" : false, 150 | "value" : filter 151 | }); 152 | Object.defineProperty(this, "_count", { 153 | "enumerable" : false, 154 | "value" : filter.count() 155 | }); 156 | } 157 | if(groups){ 158 | Object.defineProperty(this, "_groups", { 159 | "enumerable" : false, 160 | "value" : groups.slice(0) 161 | }); 162 | } 163 | 164 | // do we have input? 165 | if(data == null){ 166 | // no, just return an empty Frame 167 | return; 168 | } 169 | 170 | // what type of data input do we have? 171 | if(isobject(data)){ 172 | // object, check it's values 173 | var column, length; 174 | 175 | for(var key in data){ 176 | column = data[key]; 177 | 178 | // are the items arrays? 
179 | if(isarray(column) || istypedarray(column)){ 180 | // yes, check for consistent lengths 181 | 182 | if(length == null){ 183 | length = column.length; 184 | } else if(length !== column.length){ 185 | throw new Error("Invalid data, arrays in object must be of equal length"); 186 | } 187 | } else { 188 | // no, invalid data 189 | throw new Error("Invalid data, must be array of rows or dict of columns"); 190 | } 191 | } 192 | 193 | Object.defineProperty(this, "length", { 194 | "enumerable" : false, 195 | "value" : length 196 | }); 197 | 198 | // all checks pass use data as columns 199 | Object.defineProperty(this, "_cols", { 200 | "enumerable" : false, 201 | "value" : shallowcopy(data) 202 | }); 203 | 204 | // do we also have a key/decoding object? 205 | if(keys && isobject(keys)){ 206 | 207 | // check validity 208 | for(var key in keys){ 209 | if(!(key in this._cols)) throw new Error("Invalid data, keys object doesn't match columns"); 210 | } 211 | 212 | Object.defineProperty(this, "_keys", { 213 | "enumerable" : false, 214 | "value" : shallowcopy(keys) 215 | }); 216 | } 217 | 218 | } else if(isarray(data)) { 219 | // array, check it's elements 220 | if(data.length == 0){ 221 | return; 222 | } 223 | 224 | Object.defineProperty(this, "length", { 225 | "enumerable" : false, 226 | "value" : data.length 227 | }); 228 | // all checks pass use data as columns 229 | Object.defineProperty(this, "_cols", { 230 | "enumerable" : false, 231 | "value" : {} 232 | }); 233 | 234 | var row; 235 | for(key in data[0]){ 236 | this._cols[key] = []; 237 | } 238 | for(var i = 0; i < data.length; i++){ 239 | row = data[i]; 240 | 241 | // are the rows objects? 242 | if(isobject(row)){ 243 | // yes 244 | for(key in this._cols){ 245 | if(key in row) 246 | this._cols[key][i] = row[key]; 247 | else 248 | this._cols[key][i] = null; 249 | } 250 | } else { 251 | // no, invalid data 252 | throw new Error("Invalid data, must be array of rows or dict of columns"); 253 | } 254 | } 255 | } 256 | 257 | // expose columns as properties 258 | for(name in this._cols){ 259 | addColumn(this, name); 260 | } 261 | } 262 | 263 | Object.defineProperty(Frame.prototype, "add", { 264 | enumerable: false, 265 | value : function(name, values){ 266 | 267 | if(this.length !== values.length) 268 | throw new Error("Invalid data, arrays in object must be of equal length"); 269 | 270 | this._cols[name] = values; 271 | addColumn(this, name); 272 | } 273 | }); 274 | 275 | // internal function for exposing a data column as a property on the Frame 276 | function addColumn(frame, name){ 277 | Object.defineProperty(frame, name, { 278 | enumerable : true, 279 | configurable: true, 280 | get: function(){ 281 | // decode? 
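// if this column has a decode key in _keys, map its stored integer codes back to their decoded values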
282 | var result = []; 283 | if(frame._keys && name in frame._keys){ 284 | // yes, get keys 285 | var keys = frame._keys[name]; 286 | 287 | // map data column onto decoded column 288 | // data column should be an array of indices into 289 | // the keys array 290 | var column = frame._cols[name]; 291 | result = new Array(column.length); 292 | for(var i = 0; i < column.length;i++){ 293 | result[i] = keys[column[i]]; 294 | } 295 | } else { 296 | // no, just return the column 297 | result = frame._cols[name]; 298 | } 299 | 300 | if(frame._filter){ 301 | return result.filter(function(item, i){ return frame._filter.get(i);}); 302 | } else { 303 | return result; 304 | } 305 | }, 306 | set : function(data){ 307 | if(!isarray(data)) throw new Error("data must be an array"); 308 | if(data.length != frame.length) throw new Error("array must match length"); 309 | 310 | if(frame._keys && name in frame._keys){ 311 | throw new Error("setting keyed column not supported yet"); 312 | } else { 313 | frame._cols[name] = data.slice(0); 314 | } 315 | } 316 | }); 317 | } 318 | 319 | /* 320 | // alternate syntax for toStringTag 321 | get [Symbol.toStringTag]() { 322 | return 'Validator'; 323 | } 324 | */ 325 | module.exports = Frame; 326 | 327 | /* 328 | Get column names 329 | */ 330 | Object.defineProperty(Frame.prototype, "columns", { 331 | enumerable: false, 332 | get : function(){ 333 | return Object.keys(this._cols); 334 | } 335 | }); 336 | 337 | Object.defineProperty(Frame.prototype, "rename", { 338 | enumerable: false, 339 | value : function(old_name, new_name){ 340 | if(!(old_name in this._cols)) 341 | throw new Error("Couldn't find a column named '" + selector + "'"); 342 | 343 | // copy column to new name 344 | var column = this._cols[old_name]; 345 | this._cols[new_name] = column; 346 | 347 | // delete old column 348 | delete this._cols[old_name]; 349 | delete this[old_name]; 350 | 351 | // rename any decode key 352 | if(this._keys && old_name in this._keys){ 353 | this._keys[new_name] = this._keys[old_name]; 354 | delete this._keys[old_name] 355 | } 356 | 357 | addColumn(this, new_name); 358 | 359 | } 360 | }) 361 | 362 | Object.defineProperty(Frame.prototype, "distinct", {"enumerable": false, "value" : distinct}); 363 | 364 | function distinct(selector){ 365 | if(!(selector in this._cols)) 366 | throw new Error("Couldn't find a column named '" + selector + "'"); 367 | 368 | var key; 369 | if(this._keys) key = this._keys[selector]; 370 | 371 | var column = this._cols[selector]; 372 | var set = {}; 373 | var value; 374 | for(var i = 0; i < column.length; i++){ 375 | if(key) value = key[column[i]]; 376 | else value = column[i]; 377 | if(this._filter){ 378 | if(this._filter.get(i)) set[value] = value; 379 | } else { 380 | set[value] = value; 381 | } 382 | } 383 | 384 | // this step enables non-string values 385 | var vals = []; 386 | for(key in set) vals.push(set[key]); 387 | 388 | return vals; 389 | }; 390 | 391 | Object.defineProperty(Frame.prototype, "where", {"enumerable" : false, "value" : where}); 392 | 393 | /* element of, takes an array as an argument 394 | create and return a function that takes a single argument and returns true if 395 | that argument is contained in the given array 396 | 397 | NOTE: null and undefined may both be present in arr, and will be distinct from one another 398 | */ 399 | function el(arr){ 400 | var set = {}; 401 | for (var i = 0; i < arr.length; i++) set[arr[i]] = true; 402 | return function(v){ return set[v] != null;}; 403 | } 404 | 405 | function eq(a){ 406 | 
return function(v){ return v == a; }; 407 | } 408 | 409 | function where(selector, condition){ 410 | 411 | if(!(selector in this._cols)) 412 | throw new Error("Couldn't find a column named '" + selector + "'"); 413 | 414 | var column = this._cols[selector]; 415 | var filter = new BitArray(this.length); 416 | 417 | var bits = filter.wordArray; 418 | var index = 0; 419 | var word = 0|0; 420 | var offset; 421 | var max = column.length - 1; 422 | 423 | if(isnumber(condition) || isstring(condition)){ 424 | // keyed selector column? 425 | if(isstring(condition) && this._keys && selector in this._keys){ 426 | // yes, encode condition 427 | var keys = this._keys[selector]; 428 | condition = keys.indexOf(condition); 429 | } 430 | for(var i = 0; i < bits.length; i++){ 431 | word = 0|0; 432 | offset = i * 32; 433 | var j = 31 + offset; 434 | if(j > max) j = max; 435 | for(; j >= offset; j--){ 436 | if(column[j] === condition) word |= 1; 437 | if(j > offset) word <<= 1; 438 | } 439 | bits[i] = word; 440 | } 441 | } else { 442 | if(isarray(condition) || istypedarray(condition)){ 443 | condition = el(condition); 444 | } 445 | if(this._keys && selector in this._keys){ 446 | // yes, encode condition 447 | var keys = this._keys[selector]; 448 | } 449 | 450 | var value; 451 | for(var i = 0; i < bits.length; i++){ 452 | word = 0|0; 453 | offset = i * 32; 454 | var j = 31 + offset; 455 | if(j > max) j = max; 456 | for(; j >= offset; j--){ 457 | if(keys) value = keys[column[j]]; 458 | else value = column[j]; 459 | if(condition(value)) word |= 1; 460 | if(j > offset) word <<= 1; 461 | } 462 | bits[i] = word; 463 | } 464 | } 465 | 466 | // create and return a new Frame with the new filter 467 | var filters = {}; 468 | if(this._filters){ 469 | Object.assign(filters, this._filters); 470 | } 471 | filters[selector] = filter; 472 | 473 | return new Frame(this._cols, this._keys, this._index, this._groups, filters); 474 | 475 | } 476 | 477 | Object.defineProperty(Frame.prototype, "join", {"enumerable" : false, "value" : join}); 478 | Object.defineProperty(Frame.prototype, "groupby", {"enumerable" : false, "value" : groupby}); 479 | Object.defineProperty(Frame.prototype, "ungroup", {"enumerable" : false, "value" : ungroup}); 480 | Object.defineProperty(Frame.prototype, "count", {"enumerable" : false, "value" : count}); 481 | Object.defineProperty(Frame.prototype, "argmax", {"enumerable" : false, "value": argmax}); 482 | Object.defineProperty(Frame.prototype, "argmin", {"enumerable" : false, "value": argmin}); 483 | Object.defineProperty(Frame.prototype, "min", {"enumerable" : false, "value": min}); 484 | Object.defineProperty(Frame.prototype, "max", {"enumerable" : false, "value": max}); 485 | Object.defineProperty(Frame.prototype, "sum", {"enumerable" : false, "value": sum}); 486 | Object.defineProperty(Frame.prototype, "mean", {"enumerable" : false, "value": mean}); 487 | Object.defineProperty(Frame.prototype, "median", {"enumerable" : false, "value": median}); 488 | Object.defineProperty(Frame.prototype, "reduce", {"enumerable" : false, "value": reduce}); 489 | 490 | 491 | /* use the partition method to find the median */ 492 | function median(selector){ 493 | 494 | var column = this._cols[selector]; 495 | var key = selector && this._keys ? 
this._keys[selector] : null; 496 | 497 | if (column.length == 0) return null; 498 | 499 | var p, m; 500 | 501 | middle = column.length / 2 | 0; 502 | 503 | var low = 0, 504 | high = column.length - 1; 505 | 506 | var i = 0; 507 | // partition the array 508 | while(p != middle && i < column.length){ 509 | i++; 510 | p = partition(column, low, high); 511 | 512 | if( p < middle) low = p + 1; 513 | else high = p - 1; 514 | } 515 | 516 | if(i == column.length){ 517 | console.error("Maximum partition reached"); 518 | } 519 | 520 | if(key) return key[column[p]]; 521 | else return column[p]; 522 | } 523 | 524 | /* partition an array, in place */ 525 | function partition(arr, low, high){ 526 | 527 | if (low >= high) return high; 528 | 529 | // choose a random index for the pivot 530 | var pivot = randint(low, high); 531 | 532 | // swap pivot into last location 533 | swap(arr, high, pivot); 534 | 535 | pivot = low; // location of pivot in result 536 | // scan array and swap elements less than pivot into low end 537 | for(var i = low; i < high; i++){ 538 | if (arr[i] < arr[high]){ 539 | swap(arr, i, pivot); 540 | pivot++; 541 | } 542 | } 543 | 544 | swap(arr, high, pivot); 545 | 546 | return pivot; 547 | 548 | } 549 | 550 | /* get random integer in the inclusive interval [a, b] 551 | a and b must be integers for correct performance 552 | */ 553 | function randint(a, b){ 554 | r = Math.random(); //[0, 1) 555 | return a + Math.floor((b - a + 1)*r); 556 | } 557 | 558 | function swap(arr, i, j){ 559 | var temp = arr[i]; 560 | arr[i] = arr[j]; 561 | arr[j] = temp; 562 | } 563 | 564 | function join(frame, link){ 565 | 566 | // verify length of link column 567 | if(link.length !== this.length) throw new Error("Length of link column must match frame."); 568 | 569 | if(!("_cols" in frame)) throw new Error("First argument must be a frame."); 570 | 571 | // duplicate columns and keys 572 | var columns = shallowcopy(this._cols), 573 | keys = shallowcopy(this._keys) || {}; 574 | 575 | // add virtual columns for each column in the joining frame 576 | for(name in frame._cols){ 577 | // skip columns with duplicate names 578 | if(name in columns) continue; 579 | 580 | // don't join encoded columns 581 | if(frame._keys && name in frame._keys) continue; 582 | 583 | // add link column as encoded column data 584 | columns[name] = link; 585 | // add joining frame column as key column 586 | keys[name] = frame._cols[name]; 587 | } 588 | 589 | return new Frame(columns, keys, this._index, this._groups, this._filters); 590 | 591 | } 592 | 593 | 594 | /* 595 | * group the data in the frame by a selector or set of selectors 596 | */ 597 | function groupby(){ 598 | 599 | if(arguments.length == 0) throw new Error("No arguments provided"); 600 | 601 | // collect arguments into list of selectors 602 | var selectors = [], 603 | arg; 604 | if(arguments.length === 1){ 605 | arg = arguments[0]; 606 | if(isstring(arg)) selectors = [arg]; 607 | else if(isarray(arg)) selectors = arg; 608 | } else { 609 | for(var i = 0; i < arguments.length; i++){ 610 | arg = arguments[i]; 611 | if(!isstring(arg)) throw new Error("Invalid arguments"); 612 | 613 | selectors.push(arg); 614 | } 615 | } 616 | 617 | var index = {}; 618 | if(this._index){ 619 | index = this._index; 620 | selectors = this._groups.concat(selectors); 621 | } 622 | 623 | // get references to all the columns involved in groups 624 | var columns = Array(selectors.length); 625 | var keys = {}; 626 | for (var m = 0; m < selectors.length; m++){ 627 | selector = selectors[m]; 628 | 629 | 
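// Each selector must name an existing column. Once the columns (and any
// decode keys) are collected, the rows are scanned to build a nested index:
// one level per selector, with leaf arrays of row indices. Illustrative
// sketch for groupby("a", "b") over a = [0, 0, 1], b = [1, 2, 1] (made-up):
//   { 0: { 1: [0], 2: [1] }, 1: { 1: [2] } }
// Reductions (count, sum, mean, ...) later walk this tree via treereduce.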
if(!(selector in this._cols)) 630 | throw new Error("Couldn't find a column named '" + selector + "'"); 631 | 632 | columns[m] = this._cols[selector]; 633 | if(this._keys && selector in this._keys) keys[m] = this._keys[selector]; 634 | } 635 | 636 | var N = columns[0].length; 637 | var path = Array(columns.length); 638 | // iterate through rows 639 | for(var i = 0; i < N; i++){ 640 | 641 | // compute distinct values for group columns describing the bin for 642 | // the current row 643 | for (var m = 0; m < columns.length; m++){ 644 | var column = columns[m]; 645 | if(m in keys) path[m] = keys[m][column[i]]; 646 | else path[m] = column[i]; 647 | } 648 | 649 | // add this row to the index using the group column values 650 | // by descending the hierarchy to the correct leaf 651 | var level = index; 652 | for(var j = 0; j < path.length - 1; j++){ 653 | 654 | key = path[j]; 655 | next = level[key]; 656 | if(next == null || isarray(next)){ 657 | next = {}; 658 | level[key] = next; 659 | } 660 | level = next; 661 | } 662 | 663 | // update array of row indices stored in leaf 664 | key = path[path.length - 1]; 665 | var arr = level[key]; 666 | if(arr == null){ 667 | level[key] = [i]; 668 | } else { 669 | arr[arr.length] = i; 670 | } 671 | } 672 | 673 | /* 674 | this._index = index; 675 | this._groups = selectors.slice(0); 676 | return this; 677 | */ 678 | return new Frame(this._cols, this._keys, index, selectors, this._filters); 679 | } 680 | 681 | /* remove the grouping created by the last remaining groupby selector */ 682 | function ungroup(){ 683 | if(this._index == null || this._groups.length < 1) 684 | throw new Error("Not enough groups") 685 | 686 | var frame = new Frame(this._cols, this._keys, null, null, this._filters); 687 | 688 | // handle special case of single group 689 | if(this._groups.length == 1) 690 | return frame; 691 | 692 | // for other cases do new groupby with one fewer groups 693 | return frame.groupby(this._groups.slice(0, -1)); 694 | } 695 | 696 | function count(){ 697 | if(this._index) return this.reduce(); 698 | 699 | if(this._filter) return this._count; 700 | 701 | return this.length; 702 | } 703 | 704 | function min(selector){ 705 | return this.reduce(selector, reducers.min); 706 | } 707 | 708 | function max(selector){ 709 | return this.reduce(selector, reducers.max); 710 | } 711 | 712 | function sum(selector){ 713 | return this.reduce(selector, reducers.sum); 714 | } 715 | 716 | function mean(selector){ 717 | return this.reduce(selector, reducers.mean); 718 | } 719 | 720 | function argmax(selector){ 721 | return this.reduce(selector, reducers.argmax); 722 | } 723 | 724 | function argmin(selector){ 725 | return this.reduce(selector, reducers.argmin); 726 | } 727 | 728 | function reduce(selector, reducer, initial){ 729 | 730 | var column = selector ? this._cols[selector] : null; 731 | var key = selector && this._keys ? this._keys[selector] : null; 732 | 733 | // choose default reduce, if none was supplied 734 | var is_numeric = column && column.length > 0 && Object.prototype.toString.call(column[0]) == "[object Number]"; 735 | reducer = reducer || (is_numeric ? 
reducers.sum : reducers.max); 736 | 737 | if(this._index){ 738 | return treereduce(column, key, this._index, this._keys, this._groups, this._filter, reducer, initial); 739 | } else if(this._filter) { 740 | return filterreduce(column, key, this._filter, reducer, initial); 741 | } else { 742 | return fullreduce(column, key, reducer, initial); 743 | } 744 | } 745 | 746 | function treereduce(column, rkey, index, keys, groups, filter, reducer, initial){ 747 | 748 | var reduced = {}; 749 | var parents = {}; 750 | 751 | // depth first traversal 752 | var todo = [[index, null, 0]]; 753 | var leaves = []; 754 | 755 | var result, pkey, level, n; 756 | while (todo.length > 0){ 757 | n = todo.pop();// object 758 | index = n[0]; 759 | pkey = n[1]; 760 | level = n[2]; 761 | result = {}; // container for this subtree in result 762 | 763 | var c, name; 764 | for(key in index){ // keys in object 765 | c = index[key]; 766 | group = groups[level]; 767 | 768 | // decode the key, if possible 769 | /* 770 | if(keys && group in keys){ 771 | decoder = keys[group]; 772 | key = decoder[key]; 773 | }*/ 774 | 775 | ckey = pkey ? pkey + "@" + key : key; 776 | 777 | if(isobject(c)){ 778 | todo.push([c, ckey, level + 1]); 779 | } else { 780 | var indices = c; 781 | var filtered = filterindices(indices, filter); 782 | if(filtered.length != 0){ 783 | var value; 784 | if(column){ 785 | value = subsetreduce(column, rkey, filtered, reducer, initial); 786 | } else { 787 | value = filtered.length; // default to count 788 | } 789 | leaves.push([ckey, value]); 790 | } 791 | } 792 | parents[ckey] = [pkey, result]; 793 | } 794 | } 795 | 796 | var root; 797 | while (leaves.length > 0){ 798 | n = leaves.pop(); 799 | ckey = n[0]; // composite key, parent + child 800 | value = n[1]; 801 | 802 | p = parents[ckey]; 803 | pkey = p[0]; 804 | index = p[1]; 805 | 806 | key = pkey ? ckey.slice(pkey.length + 1) : ckey; 807 | index[key] = value; 808 | if(pkey == null){ 809 | root = index; 810 | } else { 811 | leaves.push([pkey, index]); 812 | } 813 | } 814 | 815 | return root; 816 | }; 817 | 818 | function empty (obj){ 819 | for (var key in obj) { 820 | if (obj.hasOwnProperty(key)) { 821 | return false 822 | } 823 | } 824 | return true 825 | } 826 | 827 | function filterindices(indices, filter){ 828 | if(!filter) return indices; 829 | 830 | result = []; 831 | for(var i = 0; i < indices.length; i++){ 832 | index = indices[i]; 833 | if(filter.get(index)){ 834 | result.push(index); 835 | } 836 | } 837 | return result; 838 | } 839 | 840 | /* reduce a subset of an array given by a set of indices using a supplied 841 | reducing function. 842 | 843 | Extracting this code into a function produces an order of magnitude speedup. 844 | I don't know why. 
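A plausible (unverified) explanation: the extracted loop becomes a small,
monomorphic function that the JIT can optimize and inline on its own, instead
of sitting inside the much larger treereduce body where type feedback is mixed.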
845 | */ 846 | function subsetreduce(column, key, indices, reducer, initial){ 847 | 848 | var value = null; 849 | if(initial) value = initial; 850 | 851 | if(key){ 852 | for(var i = 0; i < indices.length; i++){ 853 | index = indices[i]; 854 | if(value === null) value = key[column[index]]; 855 | else value = reducer(value, key[column[index]], i); 856 | } 857 | } else { 858 | for(var i = 0; i < indices.length; i++){ 859 | index = indices[i]; 860 | if(value === null) value = column[index]; 861 | else value = reducer(value, column[index], i); 862 | } 863 | } 864 | 865 | return value || 0; 866 | } 867 | 868 | function filterreduce(column, key, filter, reducer, initial){ 869 | 870 | var value = null; 871 | if(initial) value = initial; 872 | 873 | var word, 874 | mask, 875 | cutoff; 876 | var bits = filter.wordArray; 877 | var total = 0; 878 | var max = column.length; 879 | 880 | if(key){ 881 | for(var i = 0; i < bits.length; i++){ 882 | word = bits[i]; 883 | if(word !== 0){ 884 | cutoff = (i + 1) * 32; 885 | if(cutoff > max) cutoff = max; 886 | mask = 1; 887 | for(var j = i * 32; j < cutoff; j++){ 888 | if((word & mask) !== 0) { 889 | if(value === null) value = key[column[j]]; 890 | else value = reducer(value, key[column[j]], total); 891 | total++; 892 | } 893 | mask <<= 1; 894 | } 895 | } 896 | } 897 | } else { 898 | for(var i = 0; i < bits.length; i++){ 899 | word = bits[i]; 900 | if(word !== 0){ 901 | cutoff = (i + 1) * 32; 902 | if(cutoff > max) cutoff = max; 903 | mask = 1; 904 | for(var j = i * 32; j < cutoff; j++){ 905 | if((word & mask) !== 0) { 906 | if(value === null) value = column[j]; 907 | else value = reducer(value, column[j], total); 908 | total++; 909 | } 910 | mask <<= 1; 911 | } 912 | } 913 | } 914 | } 915 | 916 | 917 | return value || 0; 918 | } 919 | 920 | function fullreduce(column, key, reducer, initial){ 921 | 922 | var start, 923 | value; 924 | 925 | // chose initial values and start of loop based on number of inputs and 926 | // supplied initial value 927 | if(initial !== void(0)){ 928 | start = 0; 929 | value = initial; 930 | } else if(column.length > 0) { 931 | start = 1; 932 | value = key ? key[column[0]] : column[0]; 933 | } else { 934 | start = 0; 935 | value = 0; 936 | } 937 | 938 | if(key){ 939 | for(var i = start; i < column.length; i++){ 940 | value = reducer(value, key[column[i]], i); 941 | } 942 | } else { 943 | for(var i = start; i < column.length; i++){ 944 | value = reducer(value, column[i], i); 945 | } 946 | } 947 | 948 | return value; 949 | } 950 | -------------------------------------------------------------------------------- /lib/stream-reducers.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = { 3 | "count" : count, 4 | "sum" : sum, 5 | "max" : max, 6 | "min" : min, 7 | "mean" : mean, 8 | "mode" : mode, 9 | "median" : median, 10 | "argmax" : argmax, 11 | "argmin" : argmin 12 | }; 13 | 14 | /* Array.prototype.reduce style function for finding the maximum 15 | * @examples 16 | * [1, 1, 1].reduce(ds.reduce.max); // => 1 17 | * [3, 1, 3, 5].reduce(ds.reduce.max); // => 5 18 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.max); // => 2 19 | */ 20 | function max(agg, val) { return agg > val ? 
agg : val; }; 21 | 22 | /* Array.prototype.reduce style function for finding the minimum 23 | * @examples 24 | * [1, 1, 1].reduce(ds.reduce.min); // => 1 25 | * [3, 1, 3, 5].reduce(ds.reduce.min); // => 1 26 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.min); // => 0 27 | */ 28 | function min(agg, val) { return agg < val ? agg : val; }; 29 | 30 | /* Array.prototype.reduce style function for finding the most common value 31 | * @examples 32 | * [1, 1, 1].reduce(ds.reduce.mode); // => 1 33 | * [1, 3, 3, 7].reduce(ds.reduce.mode); // => 3 34 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.mode); // => 1 35 | */ 36 | function mode(agg, val, n) { 37 | if(n === 0) return val; 38 | 39 | var self; 40 | if(n === 1){ 41 | // internal state hack (compatible with groupby) 42 | self = mode.state = {}; 43 | self.values = {}; 44 | self.values[agg] = 1; 45 | self.argmax = agg; 46 | } else { 47 | self = mode.state; 48 | } 49 | 50 | if(val in self.values) 51 | self.values[val] += 1; 52 | else 53 | self.values[val] = 1; 54 | 55 | if(self.values[val] > self.values[agg]) 56 | self.argmax = val; 57 | 58 | return self.argmax; 59 | } 60 | 61 | function argmax(agg, val, n){ 62 | var self; 63 | if(n === 0){ 64 | // internal state hack (compatible with groupby) 65 | self = argmax.state = {}; 66 | self.max = val; 67 | return 0; 68 | } 69 | 70 | if(n === 1){ 71 | if(argmax.state == null) self = argmax.state = {}; 72 | else self = argmax.state; 73 | // is this the first time we've called this function on this array? 74 | if(self.max != null && self.argmax == null){ 75 | // no 76 | } else { 77 | // yes 78 | self.max = agg; 79 | } 80 | self.argmax = 0; 81 | } else { 82 | self = argmax.state; 83 | } 84 | 85 | if(val > self.max){ 86 | self.max = val; 87 | self.argmax = n; 88 | } 89 | 90 | return self.argmax; 91 | } 92 | 93 | 94 | function argmin(agg, val, n){ 95 | var self; 96 | if(n === 0){ 97 | // internal state hack (compatible with groupby) 98 | self = argmin.state = {}; 99 | self.min = val; 100 | return 0; 101 | } 102 | 103 | if(n === 1){ 104 | if(argmin.state == null) self = argmin.state = {}; 105 | else self = argmin.state; 106 | // is this the first time we've called this function on this array? 107 | if(self.min != null && self.argmin == null){ 108 | // no 109 | } else { 110 | // yes 111 | self.min = agg; 112 | } 113 | self.argmin = 0; 114 | } else { 115 | self = argmin.state; 116 | } 117 | 118 | if(val < self.min){ 119 | self.min = val; 120 | self.argmin = n; 121 | } 122 | 123 | return self.argmin; 124 | } 125 | 126 | /* Array.prototype.reduce style function for finding the middle value 127 | * @examples 128 | * [1, 1, 1].reduce(ds.reduce.median); // => 1 129 | * [1, 3, 3, 7].reduce(ds.reduce.median); // => 3 130 | * [4, 1, 7].reduce(ds.reduce.median); // => 4 131 | * reduce({"a" : 4, "b" : 1, "c" : 7}, ds.reduce.median); // => 4 132 | 133 | DON'T USE THIS FUNCTION, IT'S VERY SLOW 134 | */ 135 | function median(agg, val, n) { 136 | if(n === 0) return val; 137 | 138 | if(n === 1){ 139 | // internal state hack (compatible with groupby) 140 | self = median.state = {}; 141 | self.values = [agg]; 142 | } else { 143 | self = median.state; 144 | } 145 | 146 | // insert the new value into the sorted array 147 | insert(self.values, val); 148 | 149 | var middle = self.values.length / 2 | 0; 150 | // even number of elements? 
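// (worked trace for [4, 1, 7], matching the @example above: the sorted buffer
// grows [4] -> [1, 4] -> [1, 4, 7]; the intermediate even-length step returns
// (1 + 4) / 2 = 2.5 and the final odd-length step returns 4)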
151 | if(self.values.length % 2 !== 0){ 152 | // no, return the middle one 153 | return self.values[middle]; 154 | } else { 155 | // yes, return the average of the middle two 156 | return (self.values[middle - 1] + self.values[middle]) / 2; 157 | } 158 | } 159 | 160 | /* Array.prototype.reduce style function for counting number of elements 161 | * @examples 162 | * [1, 1, 1].reduce(ds.reduce.count); // => 3 163 | * [3, 1, 3, 5].reduce(ds.reduce.count); // => 4 164 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.count); // => 3 165 | */ 166 | function count(agg, val, n){ return n + 1; }; 167 | 168 | /* Array.prototype.reduce style function for finding the sum 169 | * @examples 170 | * [1, 1, 1].reduce(ds.reduce.sum); // => 3 171 | * [3, 1, 3, 5].reduce(ds.reduce.sum); // => 12 172 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.sum); // => 3 173 | */ 174 | function sum(agg, val){ return agg + val; }; 175 | 176 | /* Array.prototype.reduce style function for finding the arithmetic mean 177 | * @examples 178 | * [1, 1, 1].reduce(ds.reduce.mean); // => 1 179 | * [3, 1, 3, 5].reduce(ds.reduce.mean); // => 3 180 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.mean); // => 1 181 | */ 182 | function mean(agg, val, n){ return (agg + ((val - agg)/(n + 1))); }; 183 | 184 | var d = function(a, b){ return a > b ? 1 : a < b ? -1 : 0;}; 185 | 186 | function insert(arr, el){ 187 | var index = binarySearch(arr, el, d); 188 | arr.splice(index, 0, el); 189 | 190 | return arr; 191 | }; 192 | 193 | var binarySearch = function binarySearch(arr, el, comparator) { 194 | 195 | var m = 0; 196 | var n = arr.length - 1; 197 | while (m <= n) { 198 | var k = (n + m) >> 1; 199 | var cmp = comparator(el, arr[k]); // comparator(arr[k], el); 200 | if (cmp > 0) { 201 | m = k + 1; 202 | } else if(cmp < 0) { 203 | n = k - 1; 204 | } else { 205 | return k; 206 | } 207 | } 208 | 209 | return m; 210 | } 211 | -------------------------------------------------------------------------------- /lib/test.js: -------------------------------------------------------------------------------- 1 | var async = require('async'), 2 | path = require('path'), 3 | floader = require('floader'), 4 | aloader = require('arrayloader'); 5 | 6 | test = {}; 7 | 8 | test.DEFAULT_TYPE = DEFAULT_TYPE = "int32"; 9 | 10 | test.type_map = type_map = { 11 | "int8" : ".i8", 12 | "uint8" : ".u8", 13 | "int16" : ".i16", 14 | "uint16" : ".u16", 15 | "int32" : ".i32", 16 | "uint32" : ".u32", 17 | "float32" : ".f32", 18 | "float64" : ".f64", 19 | "str8" : ".s8", 20 | "str16" : ".s16" 21 | }; 22 | 23 | test.extension_map = extension_map = { 24 | ".i8" : Int8Array, 25 | ".u8" : Uint8Array, 26 | ".i16" : Int16Array, 27 | ".u16" : Uint16Array, 28 | ".i32" : Int32Array, 29 | ".u32" : Uint32Array, 30 | ".f32" : Float32Array, 31 | ".f64" : Float64Array, 32 | ".s8" : Int8Array, 33 | ".s16" : Int16Array 34 | }; 35 | 36 | test.float_types = { 37 | "float32" : true, 38 | "float64" : true 39 | }; 40 | 41 | test.string_types = { 42 | "str8" : true, 43 | "str16" : true 44 | }; 45 | 46 | 47 | /* load a binary file as a TypedArray with the type given by the extension */ 48 | function loadArray(filePath, cb){ 49 | 50 | var ext = path.extname(filePath); 51 | ext = ext.toLowerCase(); 52 | 53 | if (ext in extension_map) 54 | constructor = extension_map[ext]; 55 | else 56 | constructor = Int32Array; 57 | 58 | return aloader.load(filePath, constructor, cb); 59 | } 60 | 61 | test.load = function(directory, names, types, callback){ 62 | 63 | // array of paths to matrix data files for 
current test 64 | var paths = names.map(function(name, i){ 65 | type = types[i]; 66 | if (!(type in type_map)) type = DEFAULT_TYPE; 67 | 68 | ext = type_map[types[i]]; 69 | 70 | return directory + name + ext; 71 | }); 72 | 73 | //console.log(testFiles); 74 | async.map(paths, loadArray, 75 | function(err, results){ 76 | 77 | if(err) return callback(err); 78 | 79 | callback(err, results); 80 | } 81 | ); 82 | } 83 | /* a key file is just a JSON array of strings 84 | the index of the string in the array is it's code 85 | */ 86 | function loadKey(filePath, cb){ 87 | 88 | floader.load(filePath, function(err, key){ 89 | if(err) return cb(err); 90 | 91 | return cb(null, JSON.parse(key)); 92 | }); 93 | } 94 | 95 | test.load_key = function(directory, names, types, callback){ 96 | 97 | // array of paths to matrix data files for current test 98 | var paths = names.map(function(name, i){ 99 | return directory + name + ".key"; 100 | }); 101 | 102 | //console.log(testFiles); 103 | async.map(paths, loadKey, 104 | function(err, results){ 105 | 106 | if(err) return callback(err); 107 | 108 | callback(err, results); 109 | } 110 | ); 111 | } 112 | 113 | function isobject(obj){ return Object.prototype.toString.call(obj) === "[object Object]";} 114 | 115 | /* is there a key in object 'a' not found in object 'b'? 116 | if so, return the first key that's not found 117 | if not, return null 118 | */ 119 | function diffkeys(a, b){ 120 | same_keys = true; 121 | for(key in a){ 122 | same_keys &= (key in b); 123 | if(!same_keys){ 124 | return key; 125 | } 126 | } 127 | 128 | return null; 129 | } 130 | 131 | /* comp is s comparison function for leaves 132 | a - actual 133 | b - expected 134 | */ 135 | function treediff(a, b, comp){ 136 | 137 | var p = "(r)"; 138 | var todo = [[a, b, p]]; // tuple of (a, b, p) 139 | var parents = { p : null }; 140 | 141 | var diff_key = null, 142 | diff_a = null, 143 | diff_b = null; 144 | var t; 145 | while (todo.length > 0 && !diff_key){ 146 | t = todo.pop(); 147 | n_a = t[0]; 148 | n_b = t[1]; 149 | p = t[2]; 150 | 151 | // are all the keys the same? 152 | diff_b = diffkeys(n_b, n_a); 153 | if(diff_b){ 154 | diff_key = p; 155 | break; 156 | } 157 | diff_a = diffkeys(n_a, n_b); 158 | if(diff_a){ 159 | diff_key = p; 160 | break; 161 | } 162 | 163 | // check children 164 | for(key in n_b){ 165 | // both objects/internal nodes? 166 | if(isobject(n_b[key]) && isobject(n_a[key])){ 167 | // yes, add to stack 168 | parents[key] = p; 169 | todo.push([n_a[key], n_b[key], key]); 170 | 171 | // both leaves? 172 | } else if(!isobject(n_b[key]) && !isobject(n_a[key])) { 173 | // yes, compare values 174 | if(!comp(n_b[key], n_a[key])){ 175 | diff_key = key; 176 | diff_a = n_a[key]; 177 | diff_b = n_b[key]; 178 | break; 179 | } 180 | } else { 181 | // one is leaf the other is internal 182 | diff_key = key; 183 | if(isobject(n_b)){ 184 | diff_a = n_a[key]; 185 | } else { 186 | diff_b = n_b[key]; 187 | } 188 | break; 189 | } 190 | } 191 | } 192 | 193 | var path; 194 | // difference found? 
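// (if so, the code below walks the recorded parents to rebuild the path from
// the root and returns {"path": [...], "a": actual, "b": expected};
// a null return means the two trees were equivalent)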
195 | if(diff_key){ 196 | // yes, reconstruct the path 197 | var n = diff_key; 198 | path = [n]; 199 | while(parents[n]){ 200 | n = parents[n]; 201 | path.push(n); 202 | } 203 | 204 | // diff_a and diff_b are both present on a leaf difference 205 | // only one is present for an internal node difference 206 | return {"path" : path.reverse(), "a" : diff_a, "b" : diff_b}; 207 | } 208 | 209 | return null; 210 | 211 | } 212 | 213 | test.assert = {}; 214 | test.assert.tree = {}; 215 | 216 | /* determine whether two trees are equivalent 217 | */ 218 | test.assert.tree.equal = function(t, a, b, msg) { 219 | var fail = treediff(a, b, function(a_n, b_n){ 220 | return a_n === b_n; 221 | }); 222 | 223 | msg = msg || 'trees should be equal'; 224 | return treeassert(t, fail, msg); 225 | }; 226 | 227 | /* determine whether two trees are approximately equivalent: 228 | internal nodes are identical 229 | leaves are within specified floating point tolerances 230 | */ 231 | test.assert.tree.allclose = function(t, a, b, msg, RTOL, ATOL) { 232 | RTOL= RTOL || 1e-05; // for 32 bit precision: 1e-06 233 | ATOL= ATOL || 1e-08; 234 | 235 | // treeequal with a floating point comparison function 236 | var fail = treediff(a, b, function(a_n, b_n){ 237 | return Math.abs(a_n - b_n) <= ATOL + RTOL * Math.abs(b_n) 238 | }); 239 | 240 | msg = msg || 'trees should be allclose'; 241 | return treeassert(t, fail, msg); 242 | }; 243 | 244 | test.assert.close = function(t, a, b, msg, RTOL, ATOL){ 245 | RTOL= RTOL || 1e-05; // for 32 bit precision: 1e-06 246 | ATOL= ATOL || 1e-08; 247 | 248 | // treeequal with a floating point comparison function 249 | var success = Math.abs(a - b) <= ATOL + RTOL * Math.abs(b) 250 | 251 | t._assert(success, { 252 | message : msg, 253 | operator : 'close', 254 | actual : a, 255 | expected : b, 256 | extra : null 257 | }); 258 | 259 | return success; 260 | } 261 | 262 | var NULL_PLACEHOLDER = "(null)"; 263 | function treeassert(t, fail, msg){ 264 | 265 | if(fail){ 266 | var actual = fail.path.join(" -> "), 267 | expected = fail.path.join(" -> "); 268 | 269 | fail.a = fail.a || NULL_PLACEHOLDER; 270 | fail.b = fail.b || NULL_PLACEHOLDER; 271 | actual += " -> " + fail.a; 272 | expected += " -> " + fail.b; 273 | } 274 | 275 | t._assert(!fail, { 276 | message : msg, 277 | operator : 'tree.equal', 278 | actual : actual, 279 | expected : expected, 280 | extra : null 281 | }); 282 | 283 | return !fail; 284 | }; 285 | 286 | test.generate = { 287 | "Array" : { 288 | "int" : randomIntArray, 289 | "float" : randomFloatArray 290 | } 291 | }; 292 | 293 | function randomIntArray(N, K){ 294 | 295 | var data = []; 296 | 297 | for(var i = 0; i < N; i++){ 298 | data.push(Math.random() * K | 0); 299 | } 300 | 301 | return data; 302 | } 303 | 304 | function randomFloatArray(N){ 305 | 306 | var data = []; 307 | 308 | for(var i = 0; i < N; i++){ 309 | data.push(Math.random() / Math.sqrt(N)); 310 | } 311 | 312 | return data; 313 | } 314 | 315 | module.exports = test; 316 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataship-frame", 3 | "version": "2.1.1", 4 | "description": "A Data Frame for Javascript. 
Crunch numbers in node and the browser.", 5 | "main": "lib/frame.js", 6 | "directories": { 7 | "test": "test" 8 | }, 9 | "scripts": { 10 | "data": "node test/data/generate.js", 11 | "test": "browserify test/*.js | testling -x $npm_config_browser", 12 | "dist": "mkdir -p dist && browserify lib/frame.js -s Frame > dist/frame.js", 13 | "bench": "browserify benchmark/*.js | testling -x $npm_config_browser", 14 | "bench-datavore": "browserify benchmark/datavore/*.js | testling -x $npm_config_browser" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/dataship/frame.git" 19 | }, 20 | "keywords": [ 21 | "dataframe", 22 | "statistics", 23 | "math", 24 | "pandas", 25 | "R" 26 | ], 27 | "author": "", 28 | "license": "MIT", 29 | "bugs": { 30 | "url": "https://github.com/dataship/frame/issues" 31 | }, 32 | "homepage": "https://github.com/dataship/frame#readme", 33 | "devDependencies": { 34 | "arrayloader": "^1.1.2", 35 | "async": "^2.1.5", 36 | "benchtap": "^1.0.0", 37 | "floader": "^1.0.1", 38 | "tape": "^4.6.3" 39 | }, 40 | "dependencies": { 41 | "bit-array": "^0.2.2" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | docopt 3 | -------------------------------------------------------------------------------- /test/argmax.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | // simple instructive test cases 5 | function simpleTestCases(){ 6 | 7 | tape("argmax works with integers", function(t){ 8 | t.plan(1); 9 | var frame = new Frame({ 10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 11 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 12 | }); 13 | 14 | var expected = 6; // 1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1 15 | 16 | var actual = frame.argmax("value"); 17 | 18 | t.equal(actual, expected); 19 | }); 20 | 21 | tape("argmax works with integers", function(t){ 22 | t.plan(1); 23 | var frame = new Frame({ 24 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1], 25 | "value" : [4, 2, 7, 1, 3, 6, 5, 2, 1, 7, 8] 26 | }); 27 | 28 | var expected = 10; // (4 + 2 + 7 + 1 + 3 + 6 + 5 + 2 + 1 + 7 + 8) / 11 29 | 30 | var actual = frame.argmax("value"); 31 | 32 | t.equal(actual, expected); 33 | }); 34 | 35 | tape("argmax works floats", function(t){ 36 | t.plan(1); 37 | var frame = new Frame({ 38 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 39 | "value" : [1.2, 6.4, 2.3, 12.1, 1.6, 3.5, 7.2, 2.1, 10.2] 40 | }); 41 | 42 | var expected = 3; // (1.2 + 6.4 + 2.3 + 12.1 + 1.6 + 3.5 + 7.2 + 2.1 + 10.2) / 9 43 | var actual = frame.argmax("value"); 44 | 45 | t.equal(actual, expected); 46 | }); 47 | 48 | tape("argmax works floats", function(t){ 49 | t.plan(1); 50 | var frame = new Frame({ 51 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 52 | "value" : [1.2, 6.4, 2.3, 1.1, 10.6, 3.5, 7.2, 2.1, 9.2] 53 | }); 54 | 55 | var expected = 4; // (1.2 + 6.4 + 2.3 + 12.1 + 1.6 + 3.5 + 7.2 + 2.1 + 10.2) / 9 56 | var actual = frame.argmax("value"); 57 | 58 | t.equal(actual, expected); 59 | }); 60 | 61 | tape("argmax wonky edge case", function(t){ 62 | t.plan(1); 63 | var frame = new Frame({ 64 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 65 | "value" : [11.2, 6.4, 2.3, 1.1, 10.6, 3.5, 7.2, 2.1, 9.2] 66 | }); 67 | 68 | // zero argmax 69 | var primer = frame.argmax("value"); 70 | 71 | var expected = 4; 72 | var frame2 = new Frame({ 73 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 74 | "value" : 
[1.2, 6.4, 2.3, 1.1, 10.6, 3.5, 7.2, 2.1, 9.2] 75 | }); 76 | 77 | var actual = frame2.argmax("value"); 78 | t.equal(actual, expected); 79 | }); 80 | } 81 | 82 | simpleTestCases(); 83 | /* 84 | var RTOL = 1e-05, // 1e-05 85 | ATOL = 1e-12; // 1e-12 86 | 87 | var dataDirectory = 'test/data/mean/', 88 | testFile = 'small.json'; 89 | 90 | var floader = require('floader'), 91 | dtest = require('../lib/test'); 92 | 93 | floader.load(dataDirectory + testFile, function(err, config){ 94 | 95 | var suite = JSON.parse(config); 96 | simpleTestCases(); 97 | 98 | for(var i = 0; i < suite.length; i++){ 99 | 100 | var prefix = String("0000" + (i + 1)).slice(-4); 101 | 102 | // directory containing matrix data files for current test 103 | var directory = dataDirectory + prefix + '/'; 104 | 105 | var test = suite[i]; 106 | 107 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 108 | var types = test.id.map(function(spec, i){ return spec['type'];}); 109 | 110 | var N = test.N; // number of rows 111 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 112 | 113 | var testName = "mean: " + N + " x " + "(" + distincts.join(", ") + ")" 114 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 115 | } 116 | }); 117 | 118 | var OUT_FILENAME = "out.json"; 119 | 120 | 121 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 122 | return function(t){ 123 | t.plan(1); 124 | 125 | var names = id_names.concat(value_names); 126 | var types = id_types.concat(value_types); 127 | 128 | // which columns require a key file? 129 | var key_names = id_names.filter(function(item, i){ 130 | return id_types[i] in dtest.string_types 131 | }); 132 | var key_types = id_types.filter(function(item, i){ 133 | return item in dtest.string_types 134 | }); 135 | 136 | // load columns from files 137 | dtest.load(directory, names, types, function(err, columns){ 138 | 139 | // load key files 140 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 141 | 142 | floader.load(directory + OUT_FILENAME, function(err, out){ 143 | var expected = JSON.parse(out); 144 | 145 | var column_set = {}; 146 | for (var i = 0; i < names.length; i++){ 147 | var name = names[i]; 148 | var column = columns[i]; 149 | column_set[name] = column; 150 | } 151 | // keys map a small set of integers to other things (like strings) 152 | // they're a very simple form of fixed length coding 153 | var key_set = {}; 154 | for (var i = 0; i < keys.length; i++){ 155 | var name = key_names[i]; 156 | var key = keys[i]; 157 | key_set[name] = key; 158 | } 159 | 160 | var frame = new Frame(column_set, key_set); 161 | 162 | //console.log(subset); 163 | var actual = frame.mean("value_0"); 164 | 165 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL); 166 | }); 167 | 168 | }); 169 | }); 170 | }; 171 | } 172 | */ 173 | -------------------------------------------------------------------------------- /test/count.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | tape("count gives length with no filter", function(t){ 5 | t.plan(1); 6 | 7 | var frame = new Frame({ 8 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 9 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 10 | }); 11 | 12 | var expected = 9; 13 | 14 | var actual = frame.count(); 15 | t.equals(actual, expected); 16 | }); 17 | 18 | tape("count works with where", function(t){ 19 | t.plan(1); 20 | 21 | var frame = new 
Frame({ 22 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 23 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 24 | }); 25 | 26 | //frame.where(row => row.id == 1); 27 | frame = frame.where("id", v => v == 1); 28 | 29 | var expected = 4; 30 | 31 | var actual = frame.count(); 32 | t.equals(actual, expected); 33 | }); 34 | 35 | tape("count works with where.equals", function(t){ 36 | t.plan(1); 37 | 38 | var frame = new Frame({ 39 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 40 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 41 | }); 42 | 43 | frame = frame.where("id", 1); 44 | 45 | var expected = 4; 46 | 47 | var actual = frame.count(); 48 | t.equals(actual, expected); 49 | }); 50 | 51 | tape("count works with where.in", function(t){ 52 | t.plan(1); 53 | 54 | var frame = new Frame({ 55 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 56 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 57 | }); 58 | 59 | frame = frame.where("id", [0, 2]); 60 | 61 | var expected = 6; 62 | 63 | var actual = frame.count(); 64 | t.equals(actual, expected); 65 | }); 66 | 67 | tape("count works with multiple where", function(t){ 68 | t.plan(1); 69 | 70 | var frame = new Frame({ 71 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 72 | "id_1" : [0, 0, 1, 1, 0, 1, 0, 0, 1], 73 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 74 | }); 75 | 76 | //frame.where(row => row.id == 1); 77 | frame = frame.where("id_1", id => id == 1); 78 | frame = frame.where("id_0", id => id == 1); 79 | 80 | var expected = 2; 81 | 82 | var actual = frame.count(); 83 | t.equals(actual, expected); 84 | }); 85 | 86 | 87 | /* 88 | function eq(a){ 89 | return function(v){ v == a; }; 90 | } 91 | 92 | function in(arr){ 93 | var set = {}; 94 | for (a in arr) set[a] = true; 95 | return function(v){ return v in set;}; 96 | }*/ 97 | -------------------------------------------------------------------------------- /test/create.js: -------------------------------------------------------------------------------- 1 | var test = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | test("access column from hidden property", function(t){ 5 | t.plan(1); 6 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 7 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 8 | 9 | var frame = new Frame({ 10 | "a" : a, 11 | "b" : b 12 | }); 13 | 14 | t.equals(JSON.stringify(frame._cols["a"]), JSON.stringify(a)); 15 | }); 16 | 17 | test("access keys from hidden property", function(t){ 18 | t.plan(1); 19 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 20 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 21 | var k = ["one", "two"]; 22 | 23 | var frame = new Frame({ 24 | "a" : a, 25 | "b" : b 26 | }, 27 | { 28 | "a" : k 29 | }); 30 | 31 | 32 | t.equals(JSON.stringify(frame._keys["a"]), JSON.stringify(k)); 33 | }); 34 | 35 | test("row based constructor creates columns correctly", function(t){ 36 | t.plan(2); 37 | var rows = [ 38 | {"a" : 0, "b" : 1}, 39 | {"a" : 0, "b" : 2}, 40 | {"a" : 0, "b" : 2}, 41 | {"a" : 1, "b" : 3}, 42 | {"a" : 1, "b" : 1}, 43 | {"a" : 0, "b" : 3}, 44 | {"a" : 1, "b" : 4}, 45 | {"a" : 0, "b" : 2}, 46 | {"a" : 1, "b" : 1}, 47 | ]; 48 | 49 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 50 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 51 | 52 | var frame = new Frame(rows); 53 | 54 | 55 | t.equals(JSON.stringify(frame._cols["a"]), JSON.stringify(a)); 56 | t.equals(JSON.stringify(frame._cols["b"]), JSON.stringify(b)); 57 | }); 58 | 59 | test("access column as property", function(t){ 60 | t.plan(1); 61 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 62 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 63 | 64 | var frame = new Frame({ 65 | "a" : a, 66 | "b" : b 67 | }); 68 | 69 | 70 | 
t.equals(JSON.stringify(frame["a"]), JSON.stringify(a)); 71 | }); 72 | 73 | test("accessing column as property decodes when key is present", function(t){ 74 | t.plan(1); 75 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 76 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 77 | var k = ["one", "two"]; 78 | 79 | var frame = new Frame({ 80 | "a" : a, 81 | "b" : b 82 | }, 83 | { 84 | "a" : k 85 | }); 86 | 87 | 88 | var expected = ["one", "one", "one", "two", "two", "one", "two", "one", "two"]; 89 | t.equals(JSON.stringify(frame["a"]), JSON.stringify(expected)); 90 | }); 91 | 92 | test("only columns are enumerable", function(t){ 93 | t.plan(2); 94 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 95 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 96 | 97 | var frame = new Frame({ 98 | "a" : a, 99 | "b" : b 100 | }); 101 | 102 | var expected = ["a", "b"]; 103 | 104 | t.equals(JSON.stringify(Object.keys(frame)), JSON.stringify(expected)); 105 | 106 | var found = []; 107 | 108 | for(name in frame){ 109 | found.push(name); 110 | } 111 | 112 | t.equals(JSON.stringify(found), JSON.stringify(expected)); 113 | }); 114 | 115 | test("Symbol.toStringTag correctly overridden", function(t){ 116 | t.plan(1); 117 | var frame = new Frame({ 118 | "a" : [0], 119 | "b" : [1] 120 | }); 121 | 122 | var expected = "[object Frame]"; 123 | 124 | t.equals(Object.prototype.toString.call(frame), expected); 125 | }); 126 | 127 | test("rename column correctly modifies frame properties", function(t){ 128 | t.plan(2); 129 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 130 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 131 | 132 | var frame = new Frame({ 133 | "a" : a, 134 | "b" : b 135 | }); 136 | 137 | var expected = ["a", "c"]; 138 | 139 | frame.rename("b", "c"); 140 | 141 | t.equals(JSON.stringify(Object.keys(frame)), JSON.stringify(expected)); 142 | 143 | var found = []; 144 | 145 | for(name in frame){ 146 | found.push(name); 147 | } 148 | 149 | t.equals(JSON.stringify(found), JSON.stringify(expected)); 150 | }); 151 | 152 | test("rename column correctly adds accessor", function(t){ 153 | t.plan(1); 154 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 155 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 156 | 157 | var frame = new Frame({ 158 | "a" : a, 159 | "b" : b 160 | }); 161 | 162 | var expected = b; 163 | 164 | frame.rename("b", "c"); 165 | 166 | t.equals(JSON.stringify(frame["c"]), JSON.stringify(expected)); 167 | }); 168 | 169 | test("rename column correctly converts key", function(t){ 170 | t.plan(1); 171 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 172 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 173 | 174 | var frame = new Frame({ 175 | "a" : a, 176 | "b" : b 177 | }, 178 | { 179 | "b" : ["zero", "one", "two", "three", "four"] 180 | }); 181 | 182 | var expected = ["one", "two", "two", "three", "one", "three", "four", "two", "one"]; 183 | 184 | frame.rename("b", "c"); 185 | 186 | t.equals(JSON.stringify(frame["c"]), JSON.stringify(expected)); 187 | }); 188 | 189 | test("setting via property accessor works correctly", function(t){ 190 | t.plan(1); 191 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 192 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 193 | 194 | var frame = new Frame({ 195 | "a" : a, 196 | "b" : b 197 | }); 198 | var c = [3, 4, 1, 0, 2, 1, 2, 3, 3]; 199 | 200 | frame["b"] = c; 201 | 202 | var expected = c.slice(0); 203 | t.equals(JSON.stringify(frame["b"]), JSON.stringify(expected)); 204 | }); 205 | 206 | test("distinct works correctly", function(t){ 207 | t.plan(2); 208 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 209 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 210 | 211 | var frame = new Frame({ 212 | "a" : 
a, 213 | "b" : b 214 | }); 215 | 216 | var expected = [1, 2, 3, 4]; 217 | var actual = frame.distinct("b"); 218 | 219 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 220 | 221 | var expected = [0, 1]; 222 | var actual = frame.distinct("a"); 223 | 224 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 225 | }); 226 | 227 | test("distinct works with keyed column", function(t){ 228 | t.plan(1); 229 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 230 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 231 | 232 | var frame = new Frame({ 233 | "a" : a, 234 | "b" : b 235 | }, { 236 | "a" : ["zero", "one"] 237 | }); 238 | 239 | var expected = ["zero", "one"]; 240 | var actual = frame.distinct("a"); 241 | 242 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 243 | }); 244 | 245 | test("distinct works with where", function(t){ 246 | t.plan(2); 247 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 248 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 249 | 250 | var frame = new Frame({ 251 | "a" : a, 252 | "b" : b 253 | }); 254 | 255 | var expected = [1, 3, 4]; 256 | frame = frame.where("a", 1); 257 | var actual = frame.distinct("b"); 258 | 259 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 260 | 261 | var expected = [1]; 262 | var actual = frame.distinct("a"); 263 | 264 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 265 | }); 266 | 267 | test("argmax works correctly", function(t){ 268 | t.plan(1); 269 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 270 | var b = [1, 2, 2, 3, 1, 0, 4, 2, 1]; 271 | 272 | var frame = new Frame({ 273 | "a" : a, 274 | "b" : b 275 | }); 276 | 277 | var expected = 6; 278 | var actual = frame.argmax("b"); 279 | 280 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 281 | 282 | }); 283 | 284 | test("argmin works correctly", function(t){ 285 | t.plan(1); 286 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 287 | var b = [1, 2, 2, 3, 1, 0, 4, 2, 1]; 288 | 289 | var frame = new Frame({ 290 | "a" : a, 291 | "b" : b 292 | }); 293 | 294 | var expected = 5; 295 | var actual = frame.argmin("b"); 296 | 297 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 298 | 299 | }); 300 | 301 | test("median works correctly", function(t){ 302 | t.plan(2); 303 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 304 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 1]; 305 | 306 | var frame = new Frame({ 307 | "a" : a, 308 | "b" : b 309 | }); 310 | 311 | var expected = 2; 312 | var actual = frame.median("b"); 313 | 314 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 315 | 316 | var expected = 0; 317 | var actual = frame.median("a"); 318 | 319 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 320 | }); 321 | 322 | test("min works correctly", function(t){ 323 | t.plan(2); 324 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 325 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 326 | 327 | var frame = new Frame({ 328 | "a" : a, 329 | "b" : b 330 | }); 331 | 332 | var expected = 1; 333 | var actual = frame.min("b"); 334 | 335 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 336 | 337 | var expected = 0; 338 | var actual = frame.min("a"); 339 | 340 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 341 | }); 342 | 343 | test("min works with where", function(t){ 344 | t.plan(2); 345 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 346 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 3]; 347 | 348 | var frame = new Frame({ 349 | "a" : a, 350 | "b" : b 351 | }); 352 | 353 | var expected = 3; 354 | frame = frame.where("a", 1); 355 | var actual = frame.min("b"); 356 | 357 | 
t.equals(JSON.stringify(actual), JSON.stringify(expected)); 358 | 359 | var expected = 1; 360 | var actual = frame.min("a"); 361 | 362 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 363 | }); 364 | 365 | test("min works correctly on ISO date strings", function(t){ 366 | t.plan(1); 367 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 368 | var b = ["2016-03-11", "2016-05-11", "2016-04-10", "2016-03-15", 369 | "2016-03-03", "2016-04-21", "2016-05-28", "2016-03-17", 370 | "2016-04-04"]; 371 | 372 | var frame = new Frame({ 373 | "a" : a, 374 | "b" : b 375 | }); 376 | 377 | var expected = "2016-03-03"; 378 | var actual = frame.min("b"); 379 | 380 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 381 | }); 382 | 383 | test("min works correctly on keyed column", function(t){ 384 | t.plan(1); 385 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 386 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 387 | var k = ["b", "a"]; 388 | 389 | var frame = new Frame({ 390 | "a" : a, 391 | "b" : b 392 | }, { 393 | "a" : k 394 | }); 395 | 396 | var expected = "a"; 397 | var actual = frame.min("a"); 398 | 399 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 400 | }); 401 | 402 | test("min works with where on keyed column", function(t){ 403 | t.plan(1); 404 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 405 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 3]; 406 | var k = ["b", "a"]; 407 | 408 | var frame = new Frame({ 409 | "a" : a, 410 | "b" : b 411 | }, { 412 | "a" : k 413 | }); 414 | 415 | var expected = "a"; 416 | frame = frame.where("b", 3); 417 | var actual = frame.min("a"); 418 | 419 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 420 | }); 421 | 422 | test("max works correctly", function(t){ 423 | t.plan(2); 424 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 425 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 426 | 427 | var frame = new Frame({ 428 | "a" : a, 429 | "b" : b 430 | }); 431 | 432 | var expected = 4; 433 | var actual = frame.max("b"); 434 | 435 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 436 | 437 | var expected = 1; 438 | var actual = frame.max("a"); 439 | 440 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 441 | }); 442 | 443 | test("max works correctly on ISO date strings", function(t){ 444 | t.plan(1); 445 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 446 | var b = ["2016-03-11", "2016-05-11", "2016-04-10", "2016-03-15", 447 | "2016-03-03", "2016-04-21", "2016-05-28", "2016-03-17", 448 | "2016-04-04"]; 449 | 450 | var frame = new Frame({ 451 | "a" : a, 452 | "b" : b 453 | }); 454 | 455 | var expected = "2016-05-28"; 456 | var actual = frame.max("b"); 457 | 458 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 459 | }); 460 | 461 | test("max works correctly on keyed column", function(t){ 462 | t.plan(1); 463 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 464 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 465 | var k = ["b", "a"]; 466 | 467 | var frame = new Frame({ 468 | "a" : a, 469 | "b" : b 470 | }, { 471 | "a" : k 472 | }); 473 | 474 | var expected = "b"; 475 | var actual = frame.max("a"); 476 | 477 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 478 | }); 479 | 480 | test("max works with where on keyed column", function(t){ 481 | t.plan(1); 482 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 483 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 3]; 484 | var k = ["b", "a"]; 485 | 486 | var frame = new Frame({ 487 | "a" : a, 488 | "b" : b 489 | }, { 490 | "a" : k 491 | }); 492 | 493 | var expected = "b"; 494 | frame = frame.where("b", 3); 495 | var actual = frame.max("a"); 496 | 497 | 
t.equals(JSON.stringify(actual), JSON.stringify(expected)); 498 | }); 499 | 500 | test("add creates new column", function(t){ 501 | t.plan(3); 502 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 503 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1]; 504 | var c = [2, 7, 2, 1, 9, 3, 2, 1, 1]; 505 | 506 | var frame = new Frame({ 507 | "a" : a, 508 | "b" : b 509 | }); 510 | 511 | var expected = ["a", "b", "c"]; 512 | 513 | frame.add("c", c); 514 | 515 | t.equals(JSON.stringify(Object.keys(frame)), JSON.stringify(expected)); 516 | 517 | var found = []; 518 | 519 | for(name in frame){ 520 | found.push(name); 521 | } 522 | 523 | t.equals(JSON.stringify(found), JSON.stringify(expected)); 524 | 525 | t.equals(JSON.stringify(frame["c"]), JSON.stringify(c)); 526 | }); 527 | -------------------------------------------------------------------------------- /test/data/binary_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create two randomly generated matrices, of the specified sizes and write them 3 | to JSON files. 4 | 5 | """ 6 | import json 7 | import numpy as np 8 | import os 9 | 10 | 11 | type_map = { 12 | '.i8' : np.int8, 13 | '.u8' : np.uint8, 14 | '.i16' : np.int16, 15 | '.u16' : np.uint16, 16 | '.i32' : np.int32, 17 | '.u32' : np.uint32, 18 | '.f32' : np.float32, 19 | '.i64' : np.int64, # not compatible with javascript 20 | '.u64' : np.uint64,# not compatible with javascript 21 | '.f64' : np.float64, 22 | '.s8' : np.int8, 23 | '.s16' : np.int16 24 | } 25 | 26 | def get_extension(path): 27 | filename, file_extension = os.path.splitext(path) 28 | return file_extension 29 | 30 | def read(path): 31 | 32 | extension = get_extension(path) 33 | if extension in type_map: 34 | dtype = type_map[extension] 35 | else: 36 | dtype=np.float32 37 | 38 | with open(path, 'rb') as f: 39 | matrix = np.fromfile(f, dtype=dtype) 40 | 41 | return matrix 42 | 43 | def write(path, matrix): 44 | 45 | extension = get_extension(path) 46 | if extension in type_map: 47 | dtype = type_map[extension] 48 | else: 49 | dtype=np.float32 50 | 51 | with open(path, 'wb') as f: 52 | f.write(matrix.astype(dtype=dtype).tostring()) 53 | 54 | return matrix 55 | -------------------------------------------------------------------------------- /test/data/generate.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | var spawn = require('child_process').spawn, 3 | async = require('async'); 4 | 5 | 6 | /* 7 | ./generate.py count/ count/small.json 8 | ./generate.py sum/ sum/small.json 9 | */ 10 | 11 | var tasks = [ 12 | ['generate.py', 'groupby.count/', 'groupby.count/small.json'], 13 | ['generate.py', 'groupby.sum/', 'groupby.sum/small.json'], 14 | ['generate.py', 'groupby.mean/', 'groupby.mean/small.json'], 15 | ['generate.py', 'groupby.where.sum/', 'groupby.where.sum/small.json'], 16 | ['generate.py', 'where.in.sum/', 'where.in.sum/small.json'], 17 | ['generate.py', 'mean/', 'mean/small.json'], 18 | ['generate.py', 'where.mean/', 'where.mean/small.json'] 19 | ]; 20 | var options = { 21 | "cwd" : __dirname, 22 | "stdio": ["inherit", "inherit", "inherit"] 23 | }; 24 | 25 | 26 | async.eachSeries(tasks, function(task, callback){ 27 | spawn('python', task, options).on('close', callback); 28 | }, 29 | function(){ 30 | // all done 31 | }); 32 | -------------------------------------------------------------------------------- /test/data/generate.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | """Create data for the test suite described by the given specification. 3 | Deleting the file out.json in a subdirectory will cause it to be recreated 4 | with existing data and new "args". Deleting all files in a subdirectory will 5 | case all data to be recreated. 6 | 7 | spec.json contains an array of objects, each object contains 8 | 9 | "N" - a number of rows to generate 10 | "id" - a list of id columns to generate 11 | K - number of distinct values to generate 12 | type - type of data to generate for column 13 | [{"K" : 3, "type": "int32"}, {"K" : 3, "type": "int32"}], 14 | "value" - a list of value columns to generate 15 | [{"K" : 100, "type": "int32"}, {"K" : 100, "type": "int32"}] 16 | 17 | NOTE: 64 bit integer types are not compatible with Javascript. 18 | This includes np.int64 and np.uint64 19 | 20 | Implementing test data generation for a new operation involves two things: 21 | 1. creation of a json spec 22 | 2. implementing an operation file with a single function named execute 23 | 24 | Usage: 25 | generate.py 26 | """ 27 | from docopt import docopt 28 | import os 29 | import sys 30 | import json 31 | import math 32 | import collections 33 | import numpy as np 34 | import binary_matrix 35 | 36 | OUT_FILENAME = "out.json" 37 | 38 | KEY_EXT = ".key" 39 | 40 | extension_map = { 41 | "int8" : ".i8", 42 | "uint8" : ".u8", 43 | "int16" : ".i16", 44 | "uint16" : ".u16", 45 | "int32" : ".i32", 46 | "uint32" : '.u32', 47 | "float32" : '.f32', 48 | "int64" : '.i64', # not compatible with javascript 49 | "uint64" : '.u64', # not compatible with javascript 50 | "float64" : '.f64' 51 | } 52 | 53 | # function adapted from this issue request on numpy 54 | # https://github.com/numpy/numpy/issues/3155 55 | def random_sample(size=None, dtype=np.float64): 56 | 57 | if type(dtype) == str or type(dtype) == unicode: 58 | dtype = np.dtype(dtype).type 59 | 60 | type_max = 1 << np.finfo(dtype).nmant 61 | sample = np.empty(size, dtype=dtype) 62 | sample[...] 
= np.random.randint(0, type_max, size=size) / dtype(type_max) 63 | if size is None: 64 | sample = sample[()] 65 | return sample 66 | 67 | int_types = set([ 68 | "int8", "uint8", 69 | "int16", "uint16", 70 | "int32", "uint32", 71 | "int64", "uint64"]) # not compatible with javascript 72 | float_types = set(["float32", "float64"]) 73 | 74 | def create_column(N, K, type="int32"): 75 | 76 | if type in int_types: 77 | return np.random.randint(0, K, N, dtype=type) 78 | 79 | if type in float_types: 80 | return K * random_sample(N, dtype=type) 81 | 82 | 83 | 84 | return np.random.randint(0, K, N, dtype="int32") 85 | 86 | def write_result(result, location): 87 | """write a dict to a file as a json document""" 88 | try: 89 | with open(location, 'w') as f: 90 | json.dump(result, f, indent=4) 91 | except Exception as e: 92 | print("Couldn't write output JSON file: {0}".format(e.message)) 93 | sys.exit(1) 94 | 95 | def write_code(result, location): 96 | write_result(result, location) 97 | 98 | def read_code(location): 99 | with open(location, 'r') as f: 100 | code = json.load(f) 101 | 102 | return code 103 | 104 | class queue(collections.deque): 105 | def pop(self): 106 | return self.popleft() 107 | def push(self, n): 108 | self.append(n) 109 | 110 | def gen_strings(N): 111 | chars = [chr(i) for i in range(ord('a'), ord('z') + 1)] 112 | L = int(math.ceil(math.log(N) / math.log(len(chars)))) 113 | 114 | results = queue([""]) 115 | 116 | for i in range(L): 117 | for j in range(len(results)): 118 | r = results.pop() 119 | 120 | for c in chars: 121 | results.push(r+c) 122 | 123 | return list(results)[:N] 124 | 125 | if __name__ == '__main__': 126 | arguments = docopt(__doc__, version='JSON Groupby Generator') 127 | 128 | # arguments parsed from Usage statement by docopt 129 | base_directory = os.path.join(arguments[''], '') 130 | test_file = arguments[''] 131 | 132 | sys.path.insert(0, './' + base_directory) 133 | 134 | operation = __import__("operation") 135 | 136 | with open(test_file, 'r') as f: 137 | try: 138 | tests = json.load(f) 139 | except Exception as e: 140 | print("Couldn't parse JSON configuration file: {0}".format(e.message)) 141 | sys.exit(1) 142 | 143 | 144 | for i in range(len(tests)): 145 | 146 | options = tests[i] 147 | N = options['N'] 148 | 149 | # test directory is a string of four numbers starting at 0001 150 | directory = base_directory + "{0:0>4}/".format(i + 1) 151 | 152 | if not os.path.exists(directory): 153 | os.makedirs(directory) 154 | 155 | # if a result exists, skip this data set 156 | if os.path.exists(directory + OUT_FILENAME): 157 | print("Skipping {0}".format(directory)) 158 | continue 159 | 160 | id_columns = {} 161 | for i in range(len(options['id'])): 162 | name = "id_{0}".format(i) 163 | spec = options['id'][i] 164 | K = spec['K'] 165 | dtype = spec['type'] 166 | if dtype[:3] == 'str': 167 | 168 | if K <= 256 and dtype == 'str8': 169 | dtype = "int8" 170 | extension = ".s8" 171 | elif K <= 65536 and dtype == 'str16': 172 | dtype = "int16" 173 | extension = ".s16" 174 | else: 175 | raise Exception("Too many strings!") 176 | 177 | if os.path.exists(directory + name + KEY_EXT) and os.path.exists(directory + name + extension): 178 | # read binary row file 179 | rows = binary_matrix.read(directory + name + extension) 180 | # read key file 181 | code = read_code(directory + name + KEY_EXT) 182 | 183 | else: 184 | rows = create_column(N, K, dtype) 185 | # map integers onto random strings 186 | code = gen_strings(K) 187 | # write key file 188 | write_code(code, directory + 
name + KEY_EXT) 189 | binary_matrix.write(directory + name + extension, rows) 190 | 191 | column = [code[index] for index in rows] 192 | 193 | else: 194 | if dtype not in extension_map: 195 | dtype = "int32" 196 | 197 | extension = extension_map[dtype] 198 | 199 | if os.path.exists(directory + name + extension): 200 | column = binary_matrix.read(directory + name + extension) 201 | else: 202 | column = create_column(N, K, dtype) 203 | binary_matrix.write(directory + name + extension, column) 204 | 205 | id_columns[name] = column 206 | 207 | value_columns = {} 208 | for i in range(len(options['value'])): 209 | name = "value_{0}".format(i) 210 | spec = options['value'][i] 211 | K = spec['K'] 212 | dtype = spec['type'] 213 | if dtype not in extension_map: 214 | dtype = "int32" 215 | 216 | extension = extension_map[dtype] 217 | if os.path.exists(directory + name + extension): 218 | column = binary_matrix.read(directory + name + extension) 219 | else: 220 | column = create_column(N, K, dtype) 221 | binary_matrix.write(directory + name + extension, column) 222 | 223 | value_columns[name] = column 224 | 225 | # run reduction 226 | arguments = options['arg'] if 'arg' in options else {} 227 | out = operation.execute(arguments, id_columns, value_columns) 228 | 229 | # write result 230 | #binary_matrix.write(directory + "out.arr", out.flatten()) 231 | write_result(out, directory + OUT_FILENAME) 232 | 233 | print("Created {0}".format(directory)) 234 | -------------------------------------------------------------------------------- /test/data/groupby.count/operation.py: -------------------------------------------------------------------------------- 1 | """count operation 2 | """ 3 | import pandas as pd 4 | 5 | def convert_to_dict(r): 6 | 7 | # returns a dictionary whose keys are tuples 8 | tupled = r.to_dict() 9 | 10 | # convert tuple keys to nested dictionaries 11 | dicted = {} 12 | for (t, k) in tupled.items(): 13 | level = dicted 14 | 15 | # create a nested dictionary for each item in the tuple 16 | for l in t[:-1]: 17 | if l in level: 18 | level = level[l] 19 | else: 20 | level[l] = {} 21 | level = level[l] 22 | 23 | # the last level points to the value 24 | l = t[-1] 25 | level[l] = k.item() # convert numpy type to python type 26 | 27 | return dicted 28 | 29 | def execute(options, id_columns, value_columns): 30 | 31 | columns = id_columns.copy() 32 | columns.update(value_columns) 33 | #print(columns) 34 | 35 | frame = pd.DataFrame(columns) 36 | 37 | g = frame.groupby(by=list(id_columns.keys())) 38 | return convert_to_dict(g.count()["value_0"]) 39 | -------------------------------------------------------------------------------- /test/data/groupby.count/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "int8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 8 | "value" : [{"K" : 100, "type" : "float32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 12 | "value" : [{"K" : 100, "type" : "int32"}] 13 | }, 14 | {"N" : 100000, 15 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 16 | "value" : [{"K" : 100, "type" : "int32"}] 17 | }, 18 | {"N" : 1000000, 19 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}], 20 | "value" : [{"K" : 100, "type" : "float64"}] 21 | } 22 | ] 23 | 
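The generator above stores `str8`/`str16` id columns (used by several of the specs that follow) as small integer codes (`int8`/`int16`) plus a key file listing one string per code, so decoding a column back to strings is a plain table lookup — the Python side does it with `column = [code[index] for index in rows]`. A minimal JavaScript sketch of the same lookup; the `decodeColumn` helper is illustrative only and not part of the library:

```javascript
// Minimal sketch (not the library's implementation): decode an
// integer-coded string column using its key array, mirroring the
// generator's `column = [code[index] for index in rows]`.
function decodeColumn(rows, key){
	var out = new Array(rows.length);
	for(var i = 0; i < rows.length; i++){
		out[i] = key[rows[i]];
	}
	return out;
}

// decodeColumn([0, 2, 1, 0], ["aa", "ab", "ac"]) -> ["aa", "ac", "ab", "aa"]
```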
-------------------------------------------------------------------------------- /test/data/groupby.mean/operation.py: -------------------------------------------------------------------------------- 1 | """groupby mean operation 2 | """ 3 | import pandas as pd 4 | 5 | def convert_to_dict(r): 6 | 7 | # returns a dictionary whose keys are tuples 8 | tupled = r.to_dict() 9 | 10 | # convert tuple keys to nested dictionaries 11 | dicted = {} 12 | for (t, k) in tupled.items(): 13 | level = dicted 14 | 15 | # create a nested dictionary for each item in the tuple 16 | for l in t[:-1]: 17 | if l in level: 18 | level = level[l] 19 | else: 20 | level[l] = {} 21 | level = level[l] 22 | 23 | # the last level points to the value 24 | l = t[-1] 25 | level[l] = k.item() # convert numpy type to python type 26 | 27 | return dicted 28 | 29 | def execute(options, id_columns, value_columns): 30 | 31 | columns = id_columns.copy() 32 | columns.update(value_columns) 33 | #print(columns) 34 | 35 | frame = pd.DataFrame(columns) 36 | 37 | g = frame.groupby(by=list(id_columns.keys())) 38 | return convert_to_dict(g.mean()["value_0"]) 39 | -------------------------------------------------------------------------------- /test/data/groupby.mean/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 8 | "value" : [{"K" : 100, "type" : "int32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 12 | "value" : [{"K" : 1000, "type" : "int32"}] 13 | }, 14 | {"N" : 10000, 15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}], 16 | "value" : [{"K" : 1000, "type" : "int32"}] 17 | }, 18 | {"N" : 10000, 19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}], 20 | "value" : [{"K" : 100, "type" : "float32"}] 21 | }, 22 | {"N" : 10000, 23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}], 24 | "value" : [{"K" : 100, "type" : "int32"}] 25 | }, 26 | {"N" : 100000, 27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 28 | "value" : [{"K" : 100, "type" : "int32"}] 29 | }, 30 | {"N" : 100000, 31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 32 | "value" : [{"K" : 100, "type" : "int32"}] 33 | }, 34 | {"N" : 100000, 35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 36 | "value" : [{"K" : 100, "type" : "float32"}] 37 | }, 38 | {"N" : 1000000, 39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}], 40 | "value" : [{"K" : 100, "type" : "float64"}] 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /test/data/groupby.sum/operation.py: -------------------------------------------------------------------------------- 1 | """sum operation 2 | """ 3 | import pandas as pd 4 | 5 | def convert_to_dict(r): 6 | 7 | # returns a dictionary whose keys are tuples 8 | tupled = r.to_dict() 9 | 10 | # convert tuple keys to nested dictionaries 11 | dicted = {} 12 | for (t, k) in tupled.items(): 13 | level = dicted 14 | 15 | # create a nested dictionary for each item in the tuple 16 | for l in t[:-1]: 17 | if l in level: 18 | level = level[l] 19 | else: 20 | level[l] = {} 21 | level = level[l] 22 | 23 | # the last level points to the value 24 | l = t[-1] 25 
| level[l] = k.item() # convert numpy type to python type 26 | 27 | return dicted 28 | 29 | def execute(options, id_columns, value_columns): 30 | 31 | columns = id_columns.copy() 32 | columns.update(value_columns) 33 | #print(columns) 34 | 35 | frame = pd.DataFrame(columns) 36 | 37 | g = frame.groupby(by=list(id_columns.keys())) 38 | return convert_to_dict(g.sum()["value_0"]) 39 | -------------------------------------------------------------------------------- /test/data/groupby.sum/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 8 | "value" : [{"K" : 100, "type" : "int32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 12 | "value" : [{"K" : 1000, "type" : "int32"}] 13 | }, 14 | {"N" : 10000, 15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}], 16 | "value" : [{"K" : 1000, "type" : "int32"}] 17 | }, 18 | {"N" : 10000, 19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}], 20 | "value" : [{"K" : 100, "type" : "float32"}] 21 | }, 22 | {"N" : 10000, 23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}], 24 | "value" : [{"K" : 100, "type" : "int32"}] 25 | }, 26 | {"N" : 100000, 27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 28 | "value" : [{"K" : 100, "type" : "int32"}] 29 | }, 30 | {"N" : 100000, 31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 32 | "value" : [{"K" : 100, "type" : "int32"}] 33 | }, 34 | {"N" : 100000, 35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 36 | "value" : [{"K" : 100, "type" : "float32"}] 37 | }, 38 | {"N" : 1000000, 39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}], 40 | "value" : [{"K" : 100, "type" : "float64"}] 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /test/data/groupby.where.sum/operation.py: -------------------------------------------------------------------------------- 1 | """sum operation 2 | """ 3 | import pandas as pd 4 | import math 5 | 6 | def convert_to_dict(r): 7 | 8 | # returns a dictionary whose keys are tuples 9 | tupled = r.to_dict() 10 | 11 | # convert tuple keys to nested dictionaries 12 | dicted = {} 13 | for (t, k) in tupled.items(): 14 | level = dicted 15 | 16 | # create a nested dictionary for each item in the tuple 17 | for l in t[:-1]: 18 | if l in level: 19 | level = level[l] 20 | else: 21 | level[l] = {} 22 | level = level[l] 23 | 24 | # the last level points to the value 25 | l = t[-1] 26 | level[l] = k.item() # convert numpy type to python type 27 | 28 | return dicted 29 | 30 | SAMPLE = 10 31 | 32 | def execute(options, id_columns, value_columns): 33 | ''' 34 | id_columns - a dictionary mapping names (strings) to numpy arrays 35 | value_columns - a dictionary mapping names (strings) to numpy arrays 36 | 37 | ''' 38 | 39 | columns = id_columns.copy() 40 | columns.update(value_columns) 41 | 42 | frame = pd.DataFrame(columns) 43 | 44 | id_name = "id_0" 45 | value_name = "value_0" 46 | 47 | # create a subset of the column values 48 | column = id_columns[id_name] 49 | uniques = set(column[:SAMPLE]) 50 | l = int(math.ceil(len(uniques)/2.0)) 51 | subset = sorted(list(uniques))[:l] 52 | #print(subset) 53 
| 54 | #frame.loc[frame[id_name] == 1, value_name].sum() 55 | #v = frame.loc[frame[id_name].isin(subset), value_name].sum() 56 | filtered = frame.loc[frame[id_name].isin(subset)] 57 | grouped = filtered.groupby(by=list(id_columns.keys())) 58 | 59 | return convert_to_dict(grouped.sum()["value_0"]) 60 | -------------------------------------------------------------------------------- /test/data/groupby.where.sum/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 8 | "value" : [{"K" : 100, "type" : "int32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 12 | "value" : [{"K" : 1000, "type" : "int32"}] 13 | }, 14 | {"N" : 10000, 15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}], 16 | "value" : [{"K" : 1000, "type" : "int32"}] 17 | }, 18 | {"N" : 10000, 19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}], 20 | "value" : [{"K" : 100, "type" : "float32"}] 21 | }, 22 | {"N" : 10000, 23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}], 24 | "value" : [{"K" : 100, "type" : "int32"}] 25 | }, 26 | {"N" : 100000, 27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 28 | "value" : [{"K" : 100, "type" : "int32"}] 29 | }, 30 | {"N" : 100000, 31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 32 | "value" : [{"K" : 100, "type" : "int32"}] 33 | }, 34 | {"N" : 100000, 35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 36 | "value" : [{"K" : 100, "type" : "float32"}] 37 | }, 38 | {"N" : 1000000, 39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}], 40 | "value" : [{"K" : 100, "type" : "float64"}] 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /test/data/mean/operation.py: -------------------------------------------------------------------------------- 1 | """mean operation 2 | find the mean (average) of a column 3 | """ 4 | import pandas as pd 5 | import math 6 | 7 | def execute(options, id_columns, value_columns): 8 | ''' 9 | id_columns - a dictionary mapping names (strings) to numpy arrays 10 | value_columns - a dictionary mapping names (strings) to numpy arrays 11 | 12 | ''' 13 | 14 | columns = id_columns.copy() 15 | columns.update(value_columns) 16 | 17 | frame = pd.DataFrame(columns) 18 | 19 | v = frame.mean()["value_0"] 20 | 21 | return v.item() # convert from numpy type to python type 22 | -------------------------------------------------------------------------------- /test/data/mean/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int8"}], 8 | "value" : [{"K" : 100, "type" : "int32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 100, "type" : "int8"}], 12 | "value" : [{"K" : 1000, "type" : "int32"}] 13 | }, 14 | {"N" : 10000, 15 | "id" : [{"K" : 100, "type" : "int8"}], 16 | "value" : [{"K" : 1000, "type" : "int32"}] 17 | }, 18 | {"N" : 10000, 19 | "id" : [{"K" : 3, "type" : "int16"}], 20 | "value" : [{"K" : 100, "type" : "float32"}] 21 | }, 22 | {"N" : 10000, 23 | "id" : 
[{"K" : 3, "type" : "int32"}], 24 | "value" : [{"K" : 100, "type" : "float32"}] 25 | }, 26 | {"N" : 100000, 27 | "id" : [{"K" : 3, "type" : "int32"}], 28 | "value" : [{"K" : 100, "type" : "int32"}] 29 | }, 30 | {"N" : 100000, 31 | "id" : [{"K" : 1000, "type" : "int32"}], 32 | "value" : [{"K" : 100, "type" : "int32"}] 33 | }, 34 | {"N" : 100000, 35 | "id" : [{"K" : 1000, "type" : "int32"}], 36 | "value" : [{"K" : 100, "type" : "float32"}] 37 | }, 38 | {"N" : 1000000, 39 | "id" : [{"K" : 3, "type" : "uint8"}], 40 | "value" : [{"K" : 100, "type" : "float64"}] 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /test/data/where.in.sum/operation.py: -------------------------------------------------------------------------------- 1 | """where.in sum operation 2 | filter by inclusion in a list, then sum the matches 3 | """ 4 | import pandas as pd 5 | import math 6 | 7 | SAMPLE = 10 8 | 9 | def execute(options, id_columns, value_columns): 10 | ''' 11 | id_columns - a dictionary mapping names (strings) to numpy arrays 12 | value_columns - a dictionary mapping names (strings) to numpy arrays 13 | 14 | ''' 15 | 16 | columns = id_columns.copy() 17 | columns.update(value_columns) 18 | 19 | frame = pd.DataFrame(columns) 20 | 21 | id_name = "id_0" 22 | value_name = "value_0" 23 | 24 | # create a subset of the column values 25 | column = id_columns[id_name] 26 | uniques = set(column[:SAMPLE]) 27 | l = int(math.ceil(len(uniques)/2.0)) 28 | subset = sorted(list(uniques))[:l] 29 | 30 | #frame.loc[frame[id_name] == 1, value_name].sum() 31 | v = frame.loc[frame[id_name].isin(subset), value_name].sum() 32 | 33 | return v.item() # convert from numpy type to python type 34 | -------------------------------------------------------------------------------- /test/data/where.in.sum/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 8 | "value" : [{"K" : 100, "type" : "int32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 12 | "value" : [{"K" : 1000, "type" : "int32"}] 13 | }, 14 | {"N" : 10000, 15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}], 16 | "value" : [{"K" : 1000, "type" : "int32"}] 17 | }, 18 | {"N" : 10000, 19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}], 20 | "value" : [{"K" : 100, "type" : "float32"}] 21 | }, 22 | {"N" : 10000, 23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}], 24 | "value" : [{"K" : 100, "type" : "int32"}] 25 | }, 26 | {"N" : 100000, 27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 28 | "value" : [{"K" : 100, "type" : "int32"}] 29 | }, 30 | {"N" : 100000, 31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 32 | "value" : [{"K" : 100, "type" : "int32"}] 33 | }, 34 | {"N" : 100000, 35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 36 | "value" : [{"K" : 100, "type" : "float32"}] 37 | }, 38 | {"N" : 1000000, 39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}], 40 | "value" : [{"K" : 100, "type" : "float64"}] 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /test/data/where.mean/operation.py: 
-------------------------------------------------------------------------------- 1 | """where mean operation 2 | filter by equality with a value, then take the mean of (average) the matches 3 | """ 4 | import pandas as pd 5 | import math 6 | 7 | SAMPLE = 10 8 | 9 | def execute(options, id_columns, value_columns): 10 | ''' 11 | id_columns - a dictionary mapping names (strings) to numpy arrays 12 | value_columns - a dictionary mapping names (strings) to numpy arrays 13 | 14 | ''' 15 | 16 | columns = id_columns.copy() 17 | columns.update(value_columns) 18 | 19 | frame = pd.DataFrame(columns) 20 | 21 | id_name = "id_0" 22 | value_name = "value_0" 23 | 24 | # create a subset of the column values 25 | column = id_columns[id_name] 26 | first = column[0] 27 | 28 | #frame.loc[frame[id_name] == 1, value_name].sum() 29 | v = frame.loc[frame[id_name] == first, value_name].mean() 30 | 31 | return v.item() # convert from numpy type to python type 32 | -------------------------------------------------------------------------------- /test/data/where.mean/small.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"N" : 10000, 3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}], 4 | "value" : [{"K" : 100, "type" : "int32"}] 5 | }, 6 | {"N" : 10000, 7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 8 | "value" : [{"K" : 100, "type" : "int32"}] 9 | }, 10 | {"N" : 10000, 11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}], 12 | "value" : [{"K" : 1000, "type" : "int32"}] 13 | }, 14 | {"N" : 10000, 15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}], 16 | "value" : [{"K" : 1000, "type" : "int32"}] 17 | }, 18 | {"N" : 10000, 19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}], 20 | "value" : [{"K" : 100, "type" : "float32"}] 21 | }, 22 | {"N" : 10000, 23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}], 24 | "value" : [{"K" : 100, "type" : "int32"}] 25 | }, 26 | {"N" : 100000, 27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}], 28 | "value" : [{"K" : 100, "type" : "int32"}] 29 | }, 30 | {"N" : 100000, 31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 32 | "value" : [{"K" : 100, "type" : "int32"}] 33 | }, 34 | {"N" : 100000, 35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}], 36 | "value" : [{"K" : 100, "type" : "float32"}] 37 | }, 38 | {"N" : 1000000, 39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}], 40 | "value" : [{"K" : 100, "type" : "float64"}] 41 | } 42 | ] 43 | -------------------------------------------------------------------------------- /test/groupby.count.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | 5 | tape("groupby.count", function(t){ 6 | t.plan(1); 7 | var frame = new Frame({ 8 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 9 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 10 | }); 11 | 12 | var expected = { 13 | 0 : 5, 14 | 1 : 4 15 | } 16 | 17 | var g = frame.groupby("id"); 18 | var actual = g.count(); 19 | 20 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 21 | 22 | }); 23 | 24 | tape("groupby.count", function(t){ 25 | t.plan(2); 26 | var frame = new Frame({ 27 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 28 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 29 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 30 | }); 31 | 32 | var expected = { 33 | "0" : { 
34 | "0" : 4, 35 | "1" : 1 36 | }, 37 | "1" : { 38 | "0" : 1, 39 | "1" : 3 40 | } 41 | }; 42 | 43 | 44 | var g = frame.groupby(["id_0", "id_1"]); 45 | var actual = g.count(); 46 | 47 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 48 | 49 | 50 | var g = frame.groupby("id_0", "id_1"); 51 | var actual = g.count(); 52 | 53 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 54 | }); 55 | 56 | 57 | 58 | var dataDirectory = 'test/data/groupby.count/', 59 | testFile = 'small.json'; 60 | 61 | var RTOL = 1e-05, // 1e-05 62 | ATOL = 1e-12; // 1e-12 63 | 64 | var floader = require('floader'), 65 | dtest = require('../lib/test'); 66 | 67 | floader.load(dataDirectory + testFile, function(err, config){ 68 | 69 | var suite = JSON.parse(config); 70 | 71 | for(var i = 0; i < suite.length; i++){ 72 | 73 | var prefix = String("0000" + (i + 1)).slice(-4); 74 | 75 | // directory containing matrix data files for current test 76 | var directory = dataDirectory + prefix + '/'; 77 | 78 | var test = suite[i]; 79 | /* 80 | "N" : 10000, 81 | "id" : [{"M" : 3, "strings" : false}, {"M" : 3, "strings" : false}], 82 | "value" : [{"M" : 100}, {"M" : 100}] 83 | */ 84 | 85 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 86 | var types = test.id.map(function(spec, i){ return spec['type'];}); 87 | 88 | var N = test.N; // number of rows 89 | distincts = test.id.map(function(spec, i){ return spec.K; }); 90 | 91 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")" 92 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 93 | } 94 | }); 95 | 96 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 97 | return function(t){ 98 | t.plan(1); 99 | 100 | var names = id_names.concat(value_names); 101 | var types = id_types.concat(value_types); 102 | // load columns from files 103 | dtest.load(directory, names, types, function(err, columns){ 104 | 105 | floader.load(directory + "out.json", function(err, out){ 106 | var expected = JSON.parse(out); 107 | 108 | var column_set = {}; 109 | for (var i = 0; i < names.length; i++){ 110 | var name = names[i]; 111 | var column = columns[i]; 112 | column_set[name] = column; 113 | } 114 | var frame = new Frame(column_set); 115 | 116 | var g = frame.groupby(id_names); 117 | var actual = g.count(); 118 | 119 | var assert; 120 | if(value_types[0] in dtest.float_types){ 121 | assert = dtest.assert.tree.allclose; 122 | } else { 123 | assert = dtest.assert.tree.equal; 124 | } 125 | 126 | assert(t, actual, expected, null, RTOL, ATOL); 127 | }); 128 | 129 | }); 130 | }; 131 | } 132 | -------------------------------------------------------------------------------- /test/groupby.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | tape("groupby has correct index", function(t){ 5 | t.plan(1); 6 | var frame = new Frame({ 7 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 8 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 9 | }); 10 | 11 | var expected = { 12 | "0" : [0, 1, 2, 5, 7], 13 | "1" : [3, 4, 6, 8] 14 | }; 15 | 16 | var g = frame.groupby("id"); 17 | var actual = g._index; 18 | 19 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 20 | 21 | }); 22 | 23 | tape("groupby with two arguments has correct index", function(t){ 24 | t.plan(1); 25 | var frame = new Frame({ 26 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 27 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 28 | "value" : 
[1, 2, 2, 3, 1, 3, 4, 2, 1] 29 | }); 30 | 31 | var expected = { 32 | "0" : { 33 | "0" : [0, 1, 5, 7], 34 | "1" : [2] 35 | }, 36 | "1" : { 37 | "0" : [4], 38 | "1" : [3, 6, 8] 39 | } 40 | }; 41 | 42 | var g = frame.groupby("id_0", "id_1"); 43 | var actual = g._index; 44 | 45 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 46 | }); 47 | 48 | tape("successive groupby has correct index", function(t){ 49 | t.plan(1); 50 | var frame = new Frame({ 51 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 52 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 53 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 54 | }); 55 | 56 | var expected = { 57 | "0" : { 58 | "0" : [0, 1, 5, 7], 59 | "1" : [2] 60 | }, 61 | "1" : { 62 | "0" : [4], 63 | "1" : [3, 6, 8] 64 | } 65 | }; 66 | 67 | var g = frame.groupby("id_0"); 68 | g = g.groupby("id_1"); 69 | var actual = g._index; 70 | 71 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 72 | }); 73 | 74 | 75 | var dataDirectory = 'test/data/groupby.count/', 76 | testFile = 'small.json'; 77 | 78 | var RTOL = 1e-05, // 1e-05 79 | ATOL = 1e-12; // 1e-12 80 | 81 | var floader = require('floader'), 82 | dtest = require('../lib/test'); 83 | 84 | floader.load(dataDirectory + testFile, function(err, config){ 85 | 86 | var suite = JSON.parse(config); 87 | 88 | for(var i = 0; i < suite.length; i++){ 89 | 90 | var prefix = String("0000" + (i + 1)).slice(-4); 91 | 92 | // directory containing matrix data files for current test 93 | var directory = dataDirectory + prefix + '/'; 94 | 95 | var test = suite[i]; 96 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 97 | var types = test.id.map(function(spec, i){ return spec['type'];}); 98 | 99 | var N = test.N; // number of rows 100 | distincts = test.id.map(function(spec, i){ return spec.K; }); 101 | 102 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")" 103 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 104 | } 105 | }); 106 | 107 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 108 | return function(t){ 109 | t.plan(1); 110 | 111 | var names = id_names.concat(value_names); 112 | var types = id_types.concat(value_types); 113 | // load columns from files 114 | dtest.load(directory, names, types, function(err, columns){ 115 | 116 | floader.load(directory + "out.json", function(err, out){ 117 | var expected = JSON.parse(out); 118 | 119 | var column_set = {}; 120 | for (var i = 0; i < names.length; i++){ 121 | var name = names[i]; 122 | var column = columns[i]; 123 | column_set[name] = column; 124 | } 125 | var frame = new Frame(column_set); 126 | 127 | var g = frame; 128 | for(var i = 0; i < id_names.length; i++){ 129 | id_name = id_names[i]; 130 | g = g.groupby(id_name); 131 | } 132 | var actual = g.count(); 133 | 134 | var assert; 135 | if(value_types[0] in dtest.float_types){ 136 | assert = dtest.assert.tree.allclose; 137 | } else { 138 | assert = dtest.assert.tree.equal; 139 | } 140 | 141 | assert(t, actual, expected, null, RTOL, ATOL); 142 | }); 143 | 144 | }); 145 | }; 146 | } 147 | -------------------------------------------------------------------------------- /test/groupby.mean.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'), 3 | dtest = require('../lib/test'); 4 | 5 | var RTOL = 1e-05, // 1e-05 6 | ATOL = 1e-12; // 1e-12 7 | 8 | // simple instructive test cases 9 | function simpleTestCases(){ 10 | 
11 | tape("groupby accepts single string", function(t){ 12 | t.plan(1); 13 | var frame = new Frame({ 14 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 15 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 16 | }); 17 | 18 | var expected = { 19 | 0: 2, // 1 + 2 + 2 + 3 + 2 20 | 1: 2.25 // 3 + 1 + 4 + 1 21 | }; 22 | 23 | frame = frame.groupby("id"); 24 | var actual = frame.mean("value"); 25 | 26 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL); 27 | 28 | }); 29 | 30 | tape("groupby accepts single string", function(t){ 31 | t.plan(1); 32 | var frame = new Frame({ 33 | "id" : [ 0, 0, 0, 1, 1, 0, 1, 0, 1], 34 | "value" : [1.4, 10.3, 24.2, 31.2, 1.9, 8.6, 4.7, 21.2, 7.4] 35 | }); 36 | 37 | var expected = { 38 | 0: 13.14, // 1.4 + 10.3 + 24.2 + 8.6 + 21.2 39 | 1: 11.3 // 31.2 + 1.9 + 4.7 + 7.4 40 | }; 41 | 42 | frame = frame.groupby("id"); 43 | var actual = frame.mean("value"); 44 | 45 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL); 46 | 47 | }); 48 | 49 | tape("groupby accepts single string argument over string variable", function(t){ 50 | t.plan(1); 51 | var frame = new Frame({ 52 | "id" : ["b", "a", "a", "a", "b", "a", "b", "a", "b"], 53 | "value" : [ 3, 1, 2, 2, 1, 3, 4, 2, 1] 54 | }); 55 | expected = { 56 | "a": 2, // 1 + 2 + 2 + 3 + 2 57 | "b": 2.25 // 3 + 1 + 4 + 1 58 | }; 59 | 60 | frame = frame.groupby("id"); 61 | var actual = frame.mean("value"); 62 | 63 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL); 64 | }); 65 | 66 | tape("groupby accepts array argument", function(t){ 67 | t.plan(1); 68 | var frame = new Frame({ 69 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 70 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 71 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 72 | }); 73 | 74 | var expected = { 75 | "0" : { 76 | "0" : 2, // 1 + 2 + 3 + 2 77 | "1" : 2 // 2 78 | }, 79 | "1" : { 80 | "0" : 1, // 1 81 | "1" : 2.6666666666 // 3 + 4 + 1 82 | } 83 | }; 84 | 85 | frame = frame.groupby(["id_0", "id_1"]); 86 | var actual = frame.mean("value"); 87 | 88 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL); 89 | }); 90 | 91 | tape("groupby accepts multiple string arguments", function(t){ 92 | t.plan(1); 93 | var frame = new Frame({ 94 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 95 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 96 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 97 | }); 98 | 99 | var expected = { 100 | "0" : { 101 | "0" : 2, // 1 + 2 + 3 + 2 102 | "1" : 2 // 2 103 | }, 104 | "1" : { 105 | "0" : 1, // 1 106 | "1" : 2.6666666666 // 3 + 4 + 1 107 | } 108 | }; 109 | 110 | 111 | frame = frame.groupby("id_0", "id_1"); 112 | var actual = frame.mean("value"); 113 | 114 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL); 115 | }); 116 | 117 | tape("mean works without groupby", function(t){ 118 | t.plan(1); 119 | var frame = new Frame({ 120 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 121 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 122 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 123 | }); 124 | 125 | var expected = 2.11111111111111; // (1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1) / 9 126 | 127 | var actual = frame.mean("value"); 128 | 129 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL); 130 | }); 131 | 132 | tape("mean works without groupby", function(t){ 133 | t.plan(1); 134 | var frame = new Frame({ 135 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 136 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 137 | "value" : [3.5, 4.0, 2.1, 3.4, 1.3, 3.8, 4.2, 2.0, 1.5] 138 | }); 139 | 140 | var expected = 2.8666666666; // (3.5, 4.0, 2.1, 3.4, 1.3, 3.8, 4.2, 2.0, 1.5) / 9 141 
| 142 | var actual = frame.mean("value"); 143 | 144 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL); 145 | }); 146 | } 147 | 148 | 149 | var dataDirectory = 'test/data/groupby.mean/', 150 | testFile = 'small.json'; 151 | 152 | var floader = require('floader'), 153 | dtest = require('../lib/test'); 154 | 155 | floader.load(dataDirectory + testFile, function(err, config){ 156 | 157 | var suite = JSON.parse(config); 158 | simpleTestCases(); 159 | 160 | for(var i = 0; i < suite.length; i++){ 161 | 162 | var prefix = String("0000" + (i + 1)).slice(-4); 163 | 164 | // directory containing matrix data files for current test 165 | var directory = dataDirectory + prefix + '/'; 166 | 167 | var test = suite[i]; 168 | 169 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 170 | var types = test.id.map(function(spec, i){ return spec['type'];}); 171 | 172 | var N = test.N; // number of rows 173 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 174 | 175 | var testName = "groupby.mean: " + N + " x " + "(" + distincts.join(", ") + ")" 176 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 177 | } 178 | }); 179 | 180 | var OUT_FILENAME = "out.json"; 181 | 182 | 183 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 184 | return function(t){ 185 | t.plan(1); 186 | 187 | var names = id_names.concat(value_names); 188 | var types = id_types.concat(value_types); 189 | 190 | // which columns require a key file? 191 | var key_names = id_names.filter(function(item, i){ 192 | return id_types[i] in dtest.string_types 193 | }); 194 | var key_types = id_types.filter(function(item, i){ 195 | return item in dtest.string_types 196 | }); 197 | 198 | // load columns from files 199 | dtest.load(directory, names, types, function(err, columns){ 200 | 201 | // load key files 202 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 203 | 204 | floader.load(directory + OUT_FILENAME, function(err, out){ 205 | var expected = JSON.parse(out); 206 | 207 | var column_set = {}; 208 | for (var i = 0; i < names.length; i++){ 209 | var name = names[i]; 210 | var column = columns[i]; 211 | column_set[name] = column; 212 | } 213 | // keys map a small set of integers to other things (like strings) 214 | // they're a very simple form of fixed length coding 215 | var key_set = {}; 216 | for (var i = 0; i < keys.length; i++){ 217 | var name = key_names[i]; 218 | var key = keys[i]; 219 | key_set[name] = key; 220 | } 221 | 222 | var frame = new Frame(column_set, key_set); 223 | 224 | var g = frame.groupby(id_names); 225 | var actual = g.mean(value_names[0]); 226 | 227 | var assert = dtest.assert.tree.allclose; 228 | 229 | //console.log(actual); 230 | assert(t, actual, expected, null, RTOL, ATOL); 231 | }); 232 | 233 | }); 234 | }); 235 | }; 236 | } 237 | -------------------------------------------------------------------------------- /test/groupby.sum.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | // simple instructive test cases 5 | function simpleTestCases(){ 6 | 7 | tape("groupby accepts single string", function(t){ 8 | t.plan(1); 9 | var frame = new Frame({ 10 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 11 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 12 | }); 13 | 14 | var expected = { 15 | 0: 10, // 1 + 2 + 2 + 3 + 2 16 | 1: 9 // 3 + 1 + 4 + 1 17 | }; 18 | 19 | frame = frame.groupby("id"); 20 | 
var actual = frame.sum("value"); 21 | 22 | t.equals(JSON.stringify(actual), JSON.stringify(expected), "reduce"); 23 | 24 | }); 25 | 26 | tape("groupby accepts single string argument over string variable", function(t){ 27 | t.plan(1); 28 | var frame = new Frame({ 29 | "id" : ["b", "a", "a", "a", "b", "a", "b", "a", "b"], 30 | "value" : [ 3, 1, 2, 2, 1, 3, 4, 2, 1] 31 | }); 32 | expected = { 33 | "a": 10, // 1 + 2 + 2 + 3 + 2 34 | "b": 9 // 3 + 1 + 4 + 1 35 | }; 36 | 37 | frame = frame.groupby("id"); 38 | var actual = frame.sum("value"); 39 | 40 | t.equals(JSON.stringify(actual), JSON.stringify(expected), "reduce"); 41 | 42 | }); 43 | 44 | tape("groupby accepts array argument", function(t){ 45 | t.plan(1); 46 | var frame = new Frame({ 47 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 48 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 49 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 50 | }); 51 | 52 | var expected = { 53 | "0" : { 54 | "0" : 8, // 1 + 2 + 3 + 2 55 | "1" : 2 // 2 56 | }, 57 | "1" : { 58 | "0" : 1, // 1 59 | "1" : 8 // 3 + 4 + 1 60 | } 61 | }; 62 | 63 | 64 | frame = frame.groupby(["id_0", "id_1"]); 65 | var actual = frame.sum("value"); 66 | 67 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 68 | }); 69 | 70 | tape("groupby accepts multiple string arguments", function(t){ 71 | t.plan(1); 72 | var frame = new Frame({ 73 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 74 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 75 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 76 | }); 77 | 78 | var expected = { 79 | "0" : { 80 | "0" : 8, // 1 + 2 + 3 + 2 81 | "1" : 2 // 2 82 | }, 83 | "1" : { 84 | "0" : 1, // 1 85 | "1" : 8 // 3 + 4 + 1 86 | } 87 | }; 88 | 89 | 90 | frame = frame.groupby("id_0", "id_1"); 91 | var actual = frame.sum("value"); 92 | 93 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 94 | }); 95 | 96 | tape("sum works without groupby", function(t){ 97 | t.plan(1); 98 | var frame = new Frame({ 99 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 100 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 101 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 102 | }); 103 | 104 | var expected = 19; // 1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1 105 | 106 | var actual = frame.sum("value"); 107 | 108 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 109 | }); 110 | 111 | tape("groupby sum, reduce over keyed column", function(t){ 112 | t.plan(1); 113 | var frame = new Frame({ 114 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 115 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 116 | },{ 117 | "value" : [1, 2, 3, 4, 5] 118 | }); 119 | 120 | var expected = { 121 | 0 : 15, // 2 + 3 + 3 + 4 + 3 122 | 1 : 13 // 4 + 2 + 5 + 2 123 | } 124 | 125 | var g = frame.groupby("id"); 126 | var actual = g.sum("value"); 127 | 128 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 129 | 130 | }); 131 | } 132 | 133 | var RTOL = 1e-05, // 1e-05 134 | ATOL = 1e-12; // 1e-12 135 | 136 | var dataDirectory = 'test/data/groupby.sum/', 137 | testFile = 'small.json'; 138 | 139 | var floader = require('floader'), 140 | dtest = require('../lib/test'); 141 | 142 | floader.load(dataDirectory + testFile, function(err, config){ 143 | 144 | var suite = JSON.parse(config); 145 | simpleTestCases(); 146 | 147 | for(var i = 0; i < suite.length; i++){ 148 | 149 | var prefix = String("0000" + (i + 1)).slice(-4); 150 | 151 | // directory containing matrix data files for current test 152 | var directory = dataDirectory + prefix + '/'; 153 | 154 | var test = suite[i]; 155 | 156 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 157 | var types = 
test.id.map(function(spec, i){ return spec['type'];}); 158 | 159 | var N = test.N; // number of rows 160 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 161 | 162 | var testName = "groupby.sum: " + N + " x " + "(" + distincts.join(", ") + ")" 163 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 164 | } 165 | }); 166 | 167 | var OUT_FILENAME = "out.json"; 168 | 169 | 170 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 171 | return function(t){ 172 | t.plan(1); 173 | 174 | var names = id_names.concat(value_names); 175 | var types = id_types.concat(value_types); 176 | 177 | // which columns require a key file? 178 | var key_names = id_names.filter(function(item, i){ 179 | return id_types[i] in dtest.string_types 180 | }); 181 | var key_types = id_types.filter(function(item, i){ 182 | return item in dtest.string_types 183 | }); 184 | 185 | // load columns from files 186 | dtest.load(directory, names, types, function(err, columns){ 187 | 188 | // load key files 189 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 190 | 191 | floader.load(directory + OUT_FILENAME, function(err, out){ 192 | var expected = JSON.parse(out); 193 | 194 | var column_set = {}; 195 | for (var i = 0; i < names.length; i++){ 196 | var name = names[i]; 197 | var column = columns[i]; 198 | column_set[name] = column; 199 | } 200 | // keys map a small set of integers to other things (like strings) 201 | // they're a very simple form of fixed length coding 202 | var key_set = {}; 203 | for (var i = 0; i < keys.length; i++){ 204 | var name = key_names[i]; 205 | var key = keys[i]; 206 | key_set[name] = key; 207 | } 208 | 209 | var frame = new Frame(column_set, key_set); 210 | 211 | var g = frame.groupby(id_names); 212 | var actual = g.sum(value_names[0]); 213 | 214 | var assert; 215 | if(value_types[0] in dtest.float_types){ 216 | assert = dtest.assert.tree.allclose; 217 | } else { 218 | assert = dtest.assert.tree.equal; 219 | } 220 | 221 | //console.log(actual); 222 | assert(t, actual, expected, null, RTOL, ATOL); 223 | }); 224 | 225 | }); 226 | }); 227 | }; 228 | } 229 | -------------------------------------------------------------------------------- /test/groupby.where.sum.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | // simple instructive test cases 5 | function simpleTestCases(){ 6 | 7 | tape("sum works with where before groupby", function(t){ 8 | t.plan(1); 9 | var frame = new Frame({ 10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 11 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 12 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 13 | }); 14 | 15 | var expected = { 16 | "0" : 1, // 1 17 | "1" : 8 // 3 + 4 + 1 18 | }; 19 | 20 | var actual = frame.where("id_0", 1).groupby("id_1").sum("value"); 21 | 22 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 23 | }); 24 | 25 | tape("sum works with where before groupby", function(t){ 26 | t.plan(1); 27 | var frame = new Frame({ 28 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 29 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 30 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 31 | }); 32 | 33 | var expected = { 34 | "0" : 8, 35 | "1" : 2 36 | }; 37 | 38 | var actual = frame.where("id_0", 0).groupby("id_1").sum("value"); 39 | 40 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 41 | }); 42 | 43 | tape("sum works with groupby before where", function(t){ 44 | 
t.plan(1); 45 | var frame = new Frame({ 46 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 47 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 48 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 49 | }); 50 | 51 | var expected = { 52 | "0" : 1, 53 | "1" : 8 54 | }; 55 | 56 | var actual = frame.groupby("id_1").where("id_0", 1).sum("value"); 57 | 58 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 59 | }); 60 | 61 | 62 | 63 | tape("sum works with where.in before groupby", function(t){ 64 | t.plan(1); 65 | var frame = new Frame({ 66 | "id_0" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 67 | "id_1" : [0, 0, 1, 0, 0, 0, 1, 1, 1], 68 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 69 | }); 70 | 71 | var expected = { 72 | "0" : 6, // 1 + 2 + 3 73 | "1" : 8 // 2 + 4 + 2 74 | }; 75 | frame = frame.where("id_0", [0, 2]).groupby("id_1"); 76 | var actual = frame.sum("value"); 77 | 78 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 79 | }); 80 | } 81 | 82 | //simpleTestCases(); 83 | 84 | var SAMPLE = 10; 85 | function numberCompare(a, b){ return a - b; } 86 | // get a predefined subset of a column (matches test data generation) 87 | function generate_subset(column){ 88 | //column = id_columns[id_name] 89 | var uniques = {}; 90 | for(var i = 0; i < SAMPLE; i++){ 91 | uniques[column[i]] = column[i]; 92 | } 93 | var keys = Object.keys(uniques); 94 | var subset = keys.map(function(k){ return uniques[k]}); 95 | 96 | l = Math.ceil(subset.length / 2); 97 | return subset.sort(numberCompare).slice(0, l); 98 | } 99 | 100 | var RTOL = 1e-05, // 1e-05 101 | ATOL = 1e-12; // 1e-12 102 | 103 | var dataDirectory = 'test/data/groupby.where.sum/', 104 | testFile = 'small.json'; 105 | 106 | var floader = require('floader'), 107 | dtest = require('../lib/test'); 108 | 109 | floader.load(dataDirectory + testFile, function(err, config){ 110 | 111 | var suite = JSON.parse(config); 112 | simpleTestCases(); 113 | 114 | for(var i = 0; i < suite.length; i++){ 115 | 116 | var prefix = String("0000" + (i + 1)).slice(-4); 117 | 118 | // directory containing matrix data files for current test 119 | var directory = dataDirectory + prefix + '/'; 120 | 121 | var test = suite[i]; 122 | 123 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 124 | var types = test.id.map(function(spec, i){ return spec['type'];}); 125 | 126 | var N = test.N; // number of rows 127 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 128 | 129 | var testName = "groupby.where.sum: " + N + " x " + "(" + distincts.join(", ") + ")" 130 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 131 | } 132 | }); 133 | 134 | var OUT_FILENAME = "out.json"; 135 | 136 | 137 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 138 | return function(t){ 139 | t.plan(1); 140 | 141 | var names = id_names.concat(value_names); 142 | var types = id_types.concat(value_types); 143 | 144 | // which columns require a key file? 
145 | var key_names = id_names.filter(function(item, i){ 146 | return id_types[i] in dtest.string_types 147 | }); 148 | var key_types = id_types.filter(function(item, i){ 149 | return item in dtest.string_types 150 | }); 151 | 152 | // load columns from files 153 | dtest.load(directory, names, types, function(err, columns){ 154 | 155 | // load key files 156 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 157 | 158 | floader.load(directory + OUT_FILENAME, function(err, out){ 159 | var expected = JSON.parse(out); 160 | 161 | var column_set = {}; 162 | for (var i = 0; i < names.length; i++){ 163 | var name = names[i]; 164 | var column = columns[i]; 165 | column_set[name] = column; 166 | } 167 | // keys map a small set of integers to other things (like strings) 168 | // they're a very simple form of fixed length coding 169 | var key_set = {}; 170 | for (var i = 0; i < keys.length; i++){ 171 | var name = key_names[i]; 172 | var key = keys[i]; 173 | key_set[name] = key; 174 | } 175 | 176 | var frame = new Frame(column_set, key_set); 177 | 178 | var subset = generate_subset(column_set["id_0"]); 179 | //console.log(subset); 180 | frame = frame.where("id_0", subset).groupby(id_names); 181 | 182 | var actual = frame.sum(value_names[0]); 183 | 184 | var assert; 185 | if(value_types[0] in dtest.float_types){ 186 | assert = dtest.assert.tree.allclose; 187 | } else { 188 | assert = dtest.assert.tree.equal; 189 | } 190 | 191 | //console.log(actual); 192 | var success = assert(t, actual, expected, null, RTOL, ATOL); 193 | /* 194 | if(!success){ 195 | console.log(actual); 196 | console.log(expected); 197 | }*/ 198 | }); 199 | 200 | }); 201 | }); 202 | }; 203 | } 204 | -------------------------------------------------------------------------------- /test/join.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | tape("join to smaller frame produces correct virtual column", function(t){ 5 | t.plan(1); 6 | var frame0 = new Frame({ 7 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 8 | }); 9 | 10 | //console.log(JSON.stringify(frame0._cols)); 11 | var frame1 = new Frame({ 12 | "value1" : [1, 2] 13 | }); 14 | 15 | var link = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 16 | 17 | var joined = frame0.join(frame1, link); 18 | 19 | var expected = [1, 1, 1, 2, 2, 1, 2, 1, 2]; // 1 + 1 + 1 + 2 + 2 + 1 + 2 + 1 + 2 20 | 21 | var actual = joined["value1"]; 22 | 23 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 24 | 25 | }); 26 | 27 | tape("join to smaller frame produces correct sum", function(t){ 28 | t.plan(1); 29 | var frame0 = new Frame({ 30 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 31 | }); 32 | 33 | var frame1 = new Frame({ 34 | "value1" : [1, 2] 35 | }); 36 | 37 | var link = [0, 0, 0, 1, 1, 0, 1, 0, 1]; 38 | 39 | var joined = frame0.join(frame1, link); 40 | 41 | var expected = 13; // 1 + 1 + 1 + 2 + 2 + 1 + 2 + 1 + 2 42 | 43 | var actual = joined.sum("value1"); 44 | 45 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 46 | 47 | }); 48 | 49 | tape("join to larger frame produces correct virtual column", function(t){ 50 | t.plan(1); 51 | var frame0 = new Frame({ 52 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 53 | }); 54 | 55 | //console.log(JSON.stringify(frame0._cols)); 56 | var frame1 = new Frame({ 57 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 58 | }); 59 | 60 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11]; 61 | 62 | var joined = frame0.join(frame1, link); 63 | 64 | var 
expected = [10, 2, 13, 3, 4, 8, 11, 6, 12]; // 1 + 1 + 1 + 2 + 2 + 1 + 2 + 1 + 2 65 | 66 | var actual = joined["value1"]; 67 | 68 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 69 | 70 | }); 71 | 72 | tape("join to larger frame produces correct argmax and argmin", function(t){ 73 | t.plan(2); 74 | var frame0 = new Frame({ 75 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 76 | }); 77 | 78 | //console.log(JSON.stringify(frame0._cols)); 79 | var frame1 = new Frame({ 80 | "value1" : [5, 2, 13, 4, 6, 1, 7, 8, 9, 10, 11, 12, 3] 81 | }); 82 | 83 | var link = [9, 3, 12, 2, 1, 7, 10, 5, 11]; 84 | 85 | var joined = frame0.join(frame1, link); 86 | 87 | var expected = 3; 88 | 89 | var actual = joined.argmax("value1"); 90 | 91 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 92 | 93 | var expected = 7; 94 | 95 | var actual = joined.argmin("value1"); 96 | 97 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 98 | }); 99 | 100 | tape("join to larger frame produces correct sum", function(t){ 101 | t.plan(1); 102 | var frame0 = new Frame({ 103 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 104 | }); 105 | 106 | //console.log(JSON.stringify(frame0._cols)); 107 | var frame1 = new Frame({ 108 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 109 | }); 110 | 111 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11]; 112 | 113 | var joined = frame0.join(frame1, link); 114 | 115 | var expected = 69; // 10 + 2 + 13 + 3 + 4 + 8 + 11 + 6 + 12 116 | 117 | var actual = joined.sum("value1"); 118 | 119 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 120 | 121 | }); 122 | 123 | tape("join with where produces correct sum", function(t){ 124 | t.plan(1); 125 | var frame0 = new Frame({ 126 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 127 | }); 128 | 129 | //console.log(JSON.stringify(frame0._cols)); 130 | var frame1 = new Frame({ 131 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 132 | }); 133 | 134 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11]; 135 | 136 | var joined = frame0.join(frame1, link); 137 | 138 | var filtered = joined.where("value0", function(v){ return v > 2; }); 139 | var expected = 22; // 3 + 8 + 11 140 | 141 | var actual = filtered.sum("value1"); 142 | 143 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 144 | 145 | }); 146 | 147 | tape("join with where produces correct argmax", function(t){ 148 | t.plan(2); 149 | var frame0 = new Frame({ 150 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 151 | }); 152 | 153 | //console.log(JSON.stringify(frame0._cols)); 154 | var frame1 = new Frame({ 155 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 156 | }); 157 | 158 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11]; 159 | 160 | var joined = frame0.join(frame1, link); 161 | 162 | var filtered = joined.where("value0", function(v){ return v > 2; }); 163 | var expected = 2; // 3 + 8 + 11 164 | 165 | var actual = filtered.argmax("value1"); 166 | 167 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 168 | 169 | var expected = 11; 170 | 171 | var argmax = actual; 172 | var actual = filtered["value1"][argmax]; 173 | 174 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 175 | }); 176 | 177 | /* 178 | tape("groupby has correct index", function(t){ 179 | t.plan(1); 180 | var frame = new Frame({ 181 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 182 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 183 | }); 184 | 185 | var expected = { 186 | "0" : [0, 1, 2, 5, 7], 187 | "1" : [3, 4, 6, 8] 188 | }; 189 | 190 | var g = frame.groupby("id"); 191 | var actual = g._index; 192 | 
193 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 194 | 195 | }); 196 | 197 | tape("groupby with two arguments has correct index", function(t){ 198 | t.plan(1); 199 | var frame = new Frame({ 200 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 201 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 202 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 203 | }); 204 | 205 | var expected = { 206 | "0" : { 207 | "0" : [0, 1, 5, 7], 208 | "1" : [2] 209 | }, 210 | "1" : { 211 | "0" : [4], 212 | "1" : [3, 6, 8] 213 | } 214 | }; 215 | 216 | var g = frame.groupby("id_0", "id_1"); 217 | var actual = g._index; 218 | 219 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 220 | }); 221 | 222 | tape("successive groupby has correct index", function(t){ 223 | t.plan(1); 224 | var frame = new Frame({ 225 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 226 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 227 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 228 | }); 229 | 230 | var expected = { 231 | "0" : { 232 | "0" : [0, 1, 5, 7], 233 | "1" : [2] 234 | }, 235 | "1" : { 236 | "0" : [4], 237 | "1" : [3, 6, 8] 238 | } 239 | }; 240 | 241 | var g = frame.groupby("id_0"); 242 | g = g.groupby("id_1"); 243 | var actual = g._index; 244 | 245 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 246 | }); 247 | 248 | */ 249 | /* 250 | var dataDirectory = 'test/data/groupby.count/', 251 | testFile = 'small.json'; 252 | 253 | var RTOL = 1e-05, // 1e-05 254 | ATOL = 1e-12; // 1e-12 255 | 256 | var floader = require('floader'), 257 | dtest = require('../lib/test'); 258 | 259 | floader.load(dataDirectory + testFile, function(err, config){ 260 | 261 | var suite = JSON.parse(config); 262 | 263 | for(var i = 0; i < suite.length; i++){ 264 | 265 | var prefix = String("0000" + (i + 1)).slice(-4); 266 | 267 | // directory containing matrix data files for current test 268 | var directory = dataDirectory + prefix + '/'; 269 | 270 | var test = suite[i]; 271 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 272 | var types = test.id.map(function(spec, i){ return spec['type'];}); 273 | 274 | var N = test.N; // number of rows 275 | distincts = test.id.map(function(spec, i){ return spec.K; }); 276 | 277 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")" 278 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 279 | } 280 | }); 281 | 282 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 283 | return function(t){ 284 | t.plan(1); 285 | 286 | var names = id_names.concat(value_names); 287 | var types = id_types.concat(value_types); 288 | // load columns from files 289 | dtest.load(directory, names, types, function(err, columns){ 290 | 291 | floader.load(directory + "out.json", function(err, out){ 292 | var expected = JSON.parse(out); 293 | 294 | var column_set = {}; 295 | for (var i = 0; i < names.length; i++){ 296 | var name = names[i]; 297 | var column = columns[i]; 298 | column_set[name] = column; 299 | } 300 | var frame = new Frame(column_set); 301 | 302 | var g = frame; 303 | for(var i = 0; i < id_names.length; i++){ 304 | id_name = id_names[i]; 305 | g = g.groupby(id_name); 306 | } 307 | var actual = g.count(); 308 | 309 | var assert; 310 | if(value_types[0] in dtest.float_types){ 311 | assert = dtest.assert.tree.allclose; 312 | } else { 313 | assert = dtest.assert.tree.equal; 314 | } 315 | 316 | assert(t, actual, expected, null, RTOL, ATOL); 317 | }); 318 | 319 | }); 320 | }; 321 | } 322 | */ 323 | 
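The join tests above attach `frame1` to `frame0` through a `link` array of row indices; conceptually the joined (virtual) column is a gather over that link. A minimal sketch of the idea — the `gather` helper is illustrative, not the library's implementation:

```javascript
// Conceptual sketch: row i of the joined column is frame1's column
// value at index link[i].
function gather(column, link){
	var out = new Array(link.length);
	for(var i = 0; i < link.length; i++){
		out[i] = column[link[i]];
	}
	return out;
}

// Matches the first join test above:
// gather([1, 2], [0, 0, 0, 1, 1, 0, 1, 0, 1]) -> [1, 1, 1, 2, 2, 1, 2, 1, 2]
```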
-------------------------------------------------------------------------------- /test/mean.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | // simple instructive test cases 5 | function simpleTestCases(){ 6 | 7 | tape("mean works with integers", function(t){ 8 | t.plan(1); 9 | var frame = new Frame({ 10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 11 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 12 | }); 13 | 14 | var expected = 2.111111111; // 1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1 15 | 16 | var actual = frame.mean("value"); 17 | 18 | dtest.assert.close(t, actual, expected); 19 | }); 20 | 21 | tape("mean works with integers", function(t){ 22 | t.plan(1); 23 | var frame = new Frame({ 24 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1], 25 | "value" : [4, 2, 7, 1, 3, 6, 5, 2, 1, 7, 8] 26 | }); 27 | 28 | var expected = 4.1818181818; // (4 + 2 + 7 + 1 + 3 + 6 + 5 + 2 + 1 + 7 + 8) / 11 29 | 30 | var actual = frame.mean("value"); 31 | 32 | dtest.assert.close(t, actual, expected); 33 | }); 34 | 35 | tape("mean works floats", function(t){ 36 | t.plan(1); 37 | var frame = new Frame({ 38 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 39 | "value" : [1.2, 6.4, 2.3, 12.1, 1.6, 3.5, 7.2, 2.1, 10.2] 40 | }); 41 | 42 | var expected = 5.177777777777779; // (1.2 + 6.4 + 2.3 + 12.1 + 1.6 + 3.5 + 7.2 + 2.1 + 10.2) / 9 43 | var actual = frame.mean("value"); 44 | 45 | dtest.assert.close(t, actual, expected); 46 | }); 47 | } 48 | 49 | //simpleTestCases(); 50 | 51 | var RTOL = 1e-05, // 1e-05 52 | ATOL = 1e-12; // 1e-12 53 | 54 | var dataDirectory = 'test/data/mean/', 55 | testFile = 'small.json'; 56 | 57 | var floader = require('floader'), 58 | dtest = require('../lib/test'); 59 | 60 | floader.load(dataDirectory + testFile, function(err, config){ 61 | 62 | var suite = JSON.parse(config); 63 | simpleTestCases(); 64 | 65 | for(var i = 0; i < suite.length; i++){ 66 | 67 | var prefix = String("0000" + (i + 1)).slice(-4); 68 | 69 | // directory containing matrix data files for current test 70 | var directory = dataDirectory + prefix + '/'; 71 | 72 | var test = suite[i]; 73 | 74 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 75 | var types = test.id.map(function(spec, i){ return spec['type'];}); 76 | 77 | var N = test.N; // number of rows 78 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 79 | 80 | var testName = "mean: " + N + " x " + "(" + distincts.join(", ") + ")" 81 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 82 | } 83 | }); 84 | 85 | var OUT_FILENAME = "out.json"; 86 | 87 | 88 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 89 | return function(t){ 90 | t.plan(1); 91 | 92 | var names = id_names.concat(value_names); 93 | var types = id_types.concat(value_types); 94 | 95 | // which columns require a key file? 
96 | var key_names = id_names.filter(function(item, i){ 97 | return id_types[i] in dtest.string_types 98 | }); 99 | var key_types = id_types.filter(function(item, i){ 100 | return item in dtest.string_types 101 | }); 102 | 103 | // load columns from files 104 | dtest.load(directory, names, types, function(err, columns){ 105 | 106 | // load key files 107 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 108 | 109 | floader.load(directory + OUT_FILENAME, function(err, out){ 110 | var expected = JSON.parse(out); 111 | 112 | var column_set = {}; 113 | for (var i = 0; i < names.length; i++){ 114 | var name = names[i]; 115 | var column = columns[i]; 116 | column_set[name] = column; 117 | } 118 | // keys map a small set of integers to other things (like strings) 119 | // they're a very simple form of fixed length coding 120 | var key_set = {}; 121 | for (var i = 0; i < keys.length; i++){ 122 | var name = key_names[i]; 123 | var key = keys[i]; 124 | key_set[name] = key; 125 | } 126 | 127 | var frame = new Frame(column_set, key_set); 128 | 129 | //console.log(subset); 130 | var actual = frame.mean("value_0"); 131 | 132 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL); 133 | }); 134 | 135 | }); 136 | }); 137 | }; 138 | } 139 | -------------------------------------------------------------------------------- /test/ungroup.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | tape("ungroup single groupby has correct index", function(t){ 5 | t.plan(1); 6 | var frame = new Frame({ 7 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 8 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 9 | }); 10 | 11 | var expected; // undefined 12 | 13 | var g = frame.groupby("id"); 14 | var g = g.ungroup(); 15 | var actual = g._index; 16 | 17 | t.equals(actual, expected); 18 | 19 | }); 20 | 21 | tape("ungroup on multiple groupby has correct index", function(t){ 22 | t.plan(1); 23 | var frame = new Frame({ 24 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 25 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 26 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 27 | }); 28 | 29 | 30 | var expected = { 31 | "0" : [0, 1, 2, 5, 7], 32 | "1" : [3, 4, 6, 8] 33 | }; 34 | 35 | var g = frame.groupby("id_0", "id_1"); 36 | g = g.ungroup(); 37 | var actual = g._index; 38 | 39 | t.equals(JSON.stringify(actual), JSON.stringify(expected)); 40 | }); 41 | 42 | tape("successive ungroup on multiple groupby has correct index", function(t){ 43 | t.plan(1); 44 | var frame = new Frame({ 45 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 46 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 47 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 48 | }); 49 | 50 | var expected; // undefined 51 | 52 | var g = frame.groupby("id_0", "id_1"); 53 | g = g.ungroup(); 54 | g = g.ungroup(); 55 | var actual = g._index; 56 | 57 | t.equals(actual, expected); 58 | }); 59 | 60 | /* 61 | var dataDirectory = 'test/data/groupby.count/', 62 | testFile = 'small.json'; 63 | 64 | var RTOL = 1e-05, // 1e-05 65 | ATOL = 1e-12; // 1e-12 66 | 67 | var floader = require('floader'), 68 | dtest = require('../lib/test'); 69 | 70 | floader.load(dataDirectory + testFile, function(err, config){ 71 | 72 | var suite = JSON.parse(config); 73 | 74 | for(var i = 0; i < suite.length; i++){ 75 | 76 | var prefix = String("0000" + (i + 1)).slice(-4); 77 | 78 | // directory containing matrix data files for current test 79 | var directory = dataDirectory + prefix + '/'; 80 | 81 | var test = suite[i]; 82 | var names = 
test.id.map(function(spec, i){ return "id_" + i;}); 83 | var types = test.id.map(function(spec, i){ return spec['type'];}); 84 | 85 | var N = test.N; // number of rows 86 | distincts = test.id.map(function(spec, i){ return spec.K; }); 87 | 88 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")" 89 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 90 | } 91 | }); 92 | 93 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 94 | return function(t){ 95 | t.plan(1); 96 | 97 | var names = id_names.concat(value_names); 98 | var types = id_types.concat(value_types); 99 | // load columns from files 100 | dtest.load(directory, names, types, function(err, columns){ 101 | 102 | floader.load(directory + "out.json", function(err, out){ 103 | var expected = JSON.parse(out); 104 | 105 | var column_set = {}; 106 | for (var i = 0; i < names.length; i++){ 107 | var name = names[i]; 108 | var column = columns[i]; 109 | column_set[name] = column; 110 | } 111 | var frame = new Frame(column_set); 112 | 113 | var g = frame; 114 | for(var i = 0; i < id_names.length; i++){ 115 | id_name = id_names[i]; 116 | g = g.groupby(id_name); 117 | } 118 | var actual = g.count(); 119 | 120 | var assert; 121 | if(value_types[0] in dtest.float_types){ 122 | assert = dtest.assert.tree.allclose; 123 | } else { 124 | assert = dtest.assert.tree.equal; 125 | } 126 | 127 | assert(t, actual, expected, null, RTOL, ATOL); 128 | }); 129 | 130 | }); 131 | }; 132 | } 133 | */ 134 | -------------------------------------------------------------------------------- /test/where.in.sum.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | // simple instructive test cases 5 | function simpleTestCases(){ 6 | 7 | tape("sum works with where", function(t){ 8 | t.plan(1); 9 | var frame = new Frame({ 10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 11 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 12 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 13 | }); 14 | 15 | var expected = 9; // 3 + 1 + 4 + 1 16 | 17 | var actual = frame.where("id_0", 1).sum("value"); 18 | 19 | t.equals(actual, expected); 20 | }); 21 | 22 | tape("sum works with where", function(t){ 23 | t.plan(1); 24 | var frame = new Frame({ 25 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 26 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 27 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 28 | }); 29 | 30 | var expected = 10; // 1 + 2 + 2 + 3 + 2 31 | 32 | var actual = frame.where("id_0", 0).sum("value"); 33 | 34 | t.equals(actual, expected); 35 | }); 36 | 37 | tape("where does not modify sum on original Frame", function(t){ 38 | t.plan(1); 39 | var frame = new Frame({ 40 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 41 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 42 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 43 | }); 44 | 45 | var expected = 19; // 1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1 46 | 47 | var fw = frame.where("id_0", 0); 48 | var actual = frame.sum("value"); 49 | 50 | t.equals(actual, expected); 51 | }); 52 | 53 | tape("sum works with multiple wheres", function(t){ 54 | t.plan(1); 55 | var frame = new Frame({ 56 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 57 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 58 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 59 | }); 60 | 61 | var expected = 8; // 3 + 4 + 1 62 | var actual = frame.where("id_0", 1).where("id_1", 1).sum("value"); 63 | 64 | t.equals(actual, expected); 65 | }); 66 | 67 | 68 | tape("sum works 
with where in", function(t){ 69 | t.plan(1); 70 | var frame = new Frame({ 71 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 72 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 73 | }); 74 | 75 | var expected = 14; // 1 + 2 + 2 + 3 + 4 + 2 76 | frame = frame.where("id", [0, 2]); 77 | var actual = frame.sum("value"); 78 | 79 | t.equals(actual, expected); 80 | }); 81 | 82 | 83 | tape("sum works with where in undefined", function(t){ 84 | t.plan(1); 85 | var frame = new Frame({ 86 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 87 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 88 | }); 89 | 90 | var a; // undefined 91 | var expected = 14; // 1 + 2 + 2 + 3 + 4 + 2 92 | frame = frame.where("id", [0, 2, a]); 93 | var actual = frame.sum("value"); 94 | 95 | t.equals(actual, expected); 96 | }); 97 | } 98 | 99 | //simpleTestCases(); 100 | 101 | var SAMPLE = 10; 102 | function numberCompare(a, b){ return a - b; } 103 | // get a predefined subset of a column (matches test data generation) 104 | function generate_subset(column){ 105 | //column = id_columns[id_name] 106 | var uniques = {}; 107 | for(var i = 0; i < SAMPLE; i++){ 108 | uniques[column[i]] = column[i]; 109 | } 110 | var keys = Object.keys(uniques); 111 | var subset = keys.map(function(k){ return uniques[k]}); 112 | 113 | l = Math.ceil(subset.length / 2); 114 | return subset.sort(numberCompare).slice(0, l); 115 | } 116 | 117 | var RTOL = 1e-05, // 1e-05 118 | ATOL = 1e-12; // 1e-12 119 | 120 | var dataDirectory = 'test/data/where.in.sum/', 121 | testFile = 'small.json'; 122 | 123 | var floader = require('floader'), 124 | dtest = require('../lib/test'); 125 | 126 | floader.load(dataDirectory + testFile, function(err, config){ 127 | 128 | var suite = JSON.parse(config); 129 | simpleTestCases(); 130 | 131 | for(var i = 0; i < suite.length; i++){ 132 | 133 | var prefix = String("0000" + (i + 1)).slice(-4); 134 | 135 | // directory containing matrix data files for current test 136 | var directory = dataDirectory + prefix + '/'; 137 | 138 | var test = suite[i]; 139 | 140 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 141 | var types = test.id.map(function(spec, i){ return spec['type'];}); 142 | 143 | var N = test.N; // number of rows 144 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 145 | 146 | var testName = "where.in.sum: " + N + " x " + "(" + distincts.join(", ") + ")" 147 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 148 | } 149 | }); 150 | 151 | var OUT_FILENAME = "out.json"; 152 | 153 | 154 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 155 | return function(t){ 156 | t.plan(1); 157 | 158 | var names = id_names.concat(value_names); 159 | var types = id_types.concat(value_types); 160 | 161 | // which columns require a key file? 
162 | var key_names = id_names.filter(function(item, i){ 163 | return id_types[i] in dtest.string_types 164 | }); 165 | var key_types = id_types.filter(function(item, i){ 166 | return item in dtest.string_types 167 | }); 168 | 169 | // load columns from files 170 | dtest.load(directory, names, types, function(err, columns){ 171 | 172 | // load key files 173 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 174 | 175 | floader.load(directory + OUT_FILENAME, function(err, out){ 176 | var expected = JSON.parse(out); 177 | 178 | var column_set = {}; 179 | for (var i = 0; i < names.length; i++){ 180 | var name = names[i]; 181 | var column = columns[i]; 182 | column_set[name] = column; 183 | } 184 | // keys map a small set of integers to other things (like strings) 185 | // they're a very simple form of fixed length coding 186 | var key_set = {}; 187 | for (var i = 0; i < keys.length; i++){ 188 | var name = key_names[i]; 189 | var key = keys[i]; 190 | key_set[name] = key; 191 | } 192 | 193 | var frame = new Frame(column_set, key_set); 194 | 195 | var subset = generate_subset(column_set["id_0"]); 196 | //console.log(subset); 197 | frame = frame.where("id_0", subset); 198 | var actual = frame.sum("value_0"); 199 | 200 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL); 201 | }); 202 | 203 | }); 204 | }); 205 | }; 206 | } 207 | -------------------------------------------------------------------------------- /test/where.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | BitArray = require('bit-array'), 3 | Frame = require('../lib/frame'); 4 | 5 | tape("where creates correct filter", function(t){ 6 | t.plan(1); 7 | 8 | var frame = new Frame({ 9 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 10 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 11 | }); 12 | 13 | //frame.where(row => row.id == 1); 14 | frame = frame.where("id", v => v == 1); 15 | 16 | var expected = new BitArray(9); 17 | 18 | expected.set(3, true); 19 | expected.set(4, true); 20 | expected.set(6, true); 21 | expected.set(8, true); 22 | 23 | var actual = frame._filter; 24 | t.equals(actual.toString(), expected.toString()); 25 | }); 26 | 27 | tape("where with numerical argument creates correct filter", function(t){ 28 | t.plan(1); 29 | 30 | var frame = new Frame({ 31 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 32 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 33 | }); 34 | 35 | frame = frame.where("id", 1); 36 | 37 | var expected = new BitArray(9); 38 | 39 | expected.set(3, true); 40 | expected.set(4, true); 41 | expected.set(6, true); 42 | expected.set(8, true); 43 | 44 | var actual = frame._filter; 45 | t.equals(actual.toString(), expected.toString()); 46 | }); 47 | 48 | tape("where with array argument creates correct filter", function(t){ 49 | t.plan(1); 50 | 51 | var frame = new Frame({ 52 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 53 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 54 | }); 55 | 56 | frame = frame.where("id", [0, 2]); 57 | 58 | var expected = new BitArray(9); 59 | 60 | expected.set(0, true); 61 | expected.set(1, true); 62 | expected.set(2, true); 63 | expected.set(5, true); 64 | expected.set(6, true); 65 | expected.set(7, true); 66 | 67 | var actual = frame._filter; 68 | t.equals(actual.toString(), expected.toString()); 69 | }); 70 | 71 | tape("where creates second filter correctly", function(t){ 72 | t.plan(1); 73 | 74 | var frame = new Frame({ 75 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 76 | "id_1" : [0, 0, 1, 1, 0, 1, 0, 0, 1], 77 | "value" : [1, 2, 2, 3, 1, 
3, 4, 2, 1] 78 | }); 79 | 80 | //frame.where(row => row.id == 1); 81 | frame = frame.where("id_1", id => id == 1); 82 | frame = frame.where("id_0", id => id == 1); 83 | 84 | var expected = new BitArray(9); 85 | 86 | expected.set(3, true); 87 | expected.set(8, true); 88 | 89 | var actual = frame._filter; 90 | t.equals(actual.toString(), expected.toString()); 91 | }); 92 | 93 | tape("where filters column via accessor", function(t){ 94 | t.plan(1); 95 | 96 | var frame = new Frame({ 97 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 98 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 99 | }); 100 | 101 | //frame.where(row => row.id == 1); 102 | frame = frame.where("id", v => v == 1); 103 | 104 | var expected = [3, 1, 4, 1]; 105 | 106 | 107 | var actual = frame["value"]; 108 | t.equals(actual.toString(), expected.toString()); 109 | }); 110 | 111 | tape("where filters keyed column via accessor", function(t){ 112 | t.plan(1); 113 | 114 | var columns = { 115 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 116 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1] 117 | }; 118 | var keys = { 119 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"] 120 | }; 121 | 122 | var frame = new Frame(columns, keys); 123 | 124 | frame = frame.where("id", v => v == 1); 125 | 126 | var expected = ["red", "fish", "blue", "fish"]; 127 | 128 | 129 | var actual = frame["value"]; 130 | t.equals(actual.toString(), expected.toString()); 131 | }); 132 | 133 | tape("where accepts string filter on keyed column", function(t){ 134 | t.plan(1); 135 | 136 | var columns = { 137 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 138 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1] 139 | }; 140 | var keys = { 141 | "id" : ["thoreau", "seuss"], 142 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"] 143 | }; 144 | 145 | var frame = new Frame(columns, keys); 146 | 147 | frame = frame.where("id", "thoreau"); 148 | 149 | var expected = ["add", "fish", "to", "my", "fare"]; 150 | 151 | 152 | var actual = frame["value"]; 153 | t.equals(actual.toString(), expected.toString()); 154 | }); 155 | 156 | tape("where accepts function with string on keyed column", function(t){ 157 | t.plan(1); 158 | 159 | var columns = { 160 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 161 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1] 162 | }; 163 | var keys = { 164 | "id" : ["thoreau", "seuss"], 165 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"] 166 | }; 167 | 168 | var frame = new Frame(columns, keys); 169 | 170 | frame = frame.where("id", v => v == "seuss"); 171 | 172 | var expected = ["red", "fish", "blue", "fish"]; 173 | 174 | var actual = frame["value"]; 175 | t.equals(actual.toString(), expected.toString()); 176 | }); 177 | 178 | tape("where filter can be modified", function(t){ 179 | t.plan(2); 180 | 181 | var columns = { 182 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 183 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1] 184 | }; 185 | var keys = { 186 | "id" : ["thoreau", "seuss"], 187 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"] 188 | }; 189 | 190 | var frame = new Frame(columns, keys); 191 | 192 | 193 | frame = frame.where("id", "thoreau"); 194 | var expected = ["add", "fish", "to", "my", "fare"]; 195 | 196 | var actual = frame["value"]; 197 | t.equals(actual.toString(), expected.toString()); 198 | 199 | frame = frame.where("id", v => v == "seuss"); 200 | var expected = ["red", "fish", "blue", "fish"]; 201 | 202 | var actual = frame["value"]; 203 | t.equals(actual.toString(), expected.toString()); 204 | }); 205 | /* 206 | function eq(a){ 207 | return function(v){ v == a; }; 208 | } 209 | 
210 | function in(arr){ 211 | var set = {}; 212 | for (a in arr) set[a] = true; 213 | return function(v){ return v in set;}; 214 | }*/ 215 | -------------------------------------------------------------------------------- /test/where.mean.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'), 2 | Frame = require('../lib/frame'); 3 | 4 | // simple instructive test cases 5 | function simpleTestCases(){ 6 | 7 | tape("mean works with where", function(t){ 8 | t.plan(1); 9 | var frame = new Frame({ 10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 11 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 12 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 13 | }); 14 | 15 | var expected = 2.25; // 3 + 1 + 4 + 1 16 | 17 | var actual = frame.where("id_0", 1).mean("value"); 18 | 19 | dtest.assert.close(t, actual, expected); 20 | }); 21 | 22 | tape("mean works with where", function(t){ 23 | t.plan(1); 24 | var frame = new Frame({ 25 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 26 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 27 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 28 | }); 29 | 30 | var expected = 2; // 1 + 2 + 2 + 3 + 2 31 | 32 | var actual = frame.where("id_0", 0).mean("value"); 33 | 34 | dtest.assert.close(t, actual, expected); 35 | }); 36 | 37 | tape("where does not modify mean on original Frame", function(t){ 38 | t.plan(1); 39 | var frame = new Frame({ 40 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 41 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 42 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 43 | }); 44 | 45 | var expected = 2.1111111111; // (1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1) / 9 46 | 47 | var fw = frame.where("id_0", 0); 48 | var actual = frame.mean("value"); 49 | 50 | dtest.assert.close(t, actual, expected); 51 | }); 52 | 53 | tape("mean works with multiple wheres", function(t){ 54 | t.plan(1); 55 | var frame = new Frame({ 56 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1], 57 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1], 58 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 59 | }); 60 | 61 | var expected = 2.666666666; // (3 + 4 + 1) / 3 62 | var actual = frame.where("id_0", 1).where("id_1", 1).mean("value"); 63 | 64 | dtest.assert.close(t, actual, expected); 65 | }); 66 | 67 | 68 | tape("mean works with where in", function(t){ 69 | t.plan(1); 70 | var frame = new Frame({ 71 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1], 72 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1] 73 | }); 74 | 75 | var expected = 2.3333333333333; // 1 + 2 + 2 + 3 + 4 + 2 76 | frame = frame.where("id", [0, 2]); 77 | var actual = frame.mean("value"); 78 | 79 | dtest.assert.close(t, actual, expected); 80 | }); 81 | } 82 | 83 | //simpleTestCases(); 84 | 85 | var RTOL = 1e-05, // 1e-05 86 | ATOL = 1e-12; // 1e-12 87 | 88 | var dataDirectory = 'test/data/where.mean/', 89 | testFile = 'small.json'; 90 | 91 | var floader = require('floader'), 92 | dtest = require('../lib/test'); 93 | 94 | floader.load(dataDirectory + testFile, function(err, config){ 95 | 96 | var suite = JSON.parse(config); 97 | simpleTestCases(); 98 | 99 | for(var i = 0; i < suite.length; i++){ 100 | 101 | var prefix = String("0000" + (i + 1)).slice(-4); 102 | 103 | // directory containing matrix data files for current test 104 | var directory = dataDirectory + prefix + '/'; 105 | 106 | var test = suite[i]; 107 | 108 | var names = test.id.map(function(spec, i){ return "id_" + i;}); 109 | var types = test.id.map(function(spec, i){ return spec['type'];}); 110 | 111 | var N = test.N; // number of rows 112 | var distincts = test.id.map(function(spec, i){ return spec.K; }); 113 | 114 | var 
testName = "where.mean: " + N + " x " + "(" + distincts.join(", ") + ")" 115 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type])); 116 | } 117 | }); 118 | 119 | var OUT_FILENAME = "out.json"; 120 | 121 | 122 | function generateTestCase(directory, id_names, id_types, value_names, value_types){ 123 | return function(t){ 124 | t.plan(1); 125 | 126 | var names = id_names.concat(value_names); 127 | var types = id_types.concat(value_types); 128 | 129 | // which columns require a key file? 130 | var key_names = id_names.filter(function(item, i){ 131 | return id_types[i] in dtest.string_types 132 | }); 133 | var key_types = id_types.filter(function(item, i){ 134 | return item in dtest.string_types 135 | }); 136 | 137 | // load columns from files 138 | dtest.load(directory, names, types, function(err, columns){ 139 | 140 | // load key files 141 | dtest.load_key(directory, key_names, key_types, function(err, keys){ 142 | 143 | floader.load(directory + OUT_FILENAME, function(err, out){ 144 | var expected = JSON.parse(out); 145 | 146 | var column_set = {}; 147 | for (var i = 0; i < names.length; i++){ 148 | var name = names[i]; 149 | var column = columns[i]; 150 | column_set[name] = column; 151 | } 152 | // keys map a small set of integers to other things (like strings) 153 | // they're a very simple form of fixed length coding 154 | var key_set = {}; 155 | for (var i = 0; i < keys.length; i++){ 156 | var name = key_names[i]; 157 | var key = keys[i]; 158 | key_set[name] = key; 159 | } 160 | 161 | var frame = new Frame(column_set, key_set); 162 | 163 | var value = column_set["id_0"][0]; 164 | //console.log(subset); 165 | frame = frame.where("id_0", value); 166 | var actual = frame.mean("value_0"); 167 | 168 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL); 169 | }); 170 | 171 | }); 172 | }); 173 | }; 174 | } 175 | --------------------------------------------------------------------------------