├── .gitignore
├── LICENSE
├── README.md
├── benchmark
│   ├── QUESTIONS.md
│   ├── create.js
│   ├── datavore
│   │   ├── README.md
│   │   ├── create.js
│   │   ├── datavore
│   │   │   └── index.js
│   │   ├── table.query.sum.js
│   │   └── table.query.sum.multi.js
│   ├── groupby.sum.js
│   ├── groupby_sum.js
│   ├── median.js
│   ├── sum.js
│   ├── where.js
│   └── where_sum.js
├── lib
│   ├── frame-index.js
│   ├── frame.js
│   ├── stream-reducers.js
│   └── test.js
├── package.json
├── requirements.txt
└── test
    ├── argmax.js
    ├── count.js
    ├── create.js
    ├── data
    │   ├── binary_matrix.py
    │   ├── generate.js
    │   ├── generate.py
    │   ├── groupby.count
    │   │   ├── operation.py
    │   │   └── small.json
    │   ├── groupby.mean
    │   │   ├── operation.py
    │   │   └── small.json
    │   ├── groupby.sum
    │   │   ├── operation.py
    │   │   └── small.json
    │   ├── groupby.where.sum
    │   │   ├── operation.py
    │   │   └── small.json
    │   ├── mean
    │   │   ├── operation.py
    │   │   └── small.json
    │   ├── where.in.sum
    │   │   ├── operation.py
    │   │   └── small.json
    │   └── where.mean
    │       ├── operation.py
    │       └── small.json
    ├── groupby.count.js
    ├── groupby.js
    ├── groupby.mean.js
    ├── groupby.sum.js
    ├── groupby.where.sum.js
    ├── join.js
    ├── mean.js
    ├── ungroup.js
    ├── where.in.sum.js
    ├── where.js
    └── where.mean.js
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | __pycache__
3 | *.pyc
4 |
5 | test/data/*/00*
6 |
7 | # Logs
8 | logs
9 | *.log
10 | npm-debug.log*
11 |
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 |
17 | # Directory for instrumented libs generated by jscoverage/JSCover
18 | lib-cov
19 |
20 | # Coverage directory used by tools like istanbul
21 | coverage
22 |
23 | # nyc test coverage
24 | .nyc_output
25 |
26 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
27 | .grunt
28 |
29 | # node-waf configuration
30 | .lock-wscript
31 |
32 | # Compiled binary addons (http://nodejs.org/api/addons.html)
33 | build/Release
34 |
35 | # Dependency directories
36 | node_modules
37 | jspm_packages
38 |
39 | # Optional npm cache directory
40 | .npm
41 |
42 | # Optional REPL history
43 | .node_repl_history
44 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Dataship
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # frame
2 |
 3 | a DataFrame for JavaScript.
4 |
5 | _crunch numbers in Node or the Browser_
6 |
7 | ## features
8 | * Interactive performance (<100ms) on millions of rows
9 | * Syntax similar to SQL and Pandas
10 | * Compatible with `PapaParse` and [`BabyParse`](https://github.com/Rich-Harris/BabyParse)
11 |
12 | ## examples
13 | Parse the [Iris](https://vincentarelbundock.github.io/Rdatasets/datasets.html)
14 | dataset (with [`BabyParse`](https://github.com/Rich-Harris/BabyParse)) and create a `Frame` from the result.
15 |
16 | ```javascript
17 | var baby = require('babyparse'),
18 | Frame = require('frame');
19 |
20 | // parse the csv file
21 | config = {"header" :true, "dynamicTyping" : true, "skipEmptyLines" : true};
22 | iris = baby.parseFiles('iris.csv', config).data;
23 |
24 | // create a frame from the parsed results
25 | frame = new Frame(iris);
26 | ```
27 | ### groupby
28 |
29 | Group on `Species` and find the average value (`mean`) for `Sepal.Length`.
30 | ```javascript
31 | g = frame.groupby("Species");
32 | g.mean("Sepal.Length");
33 | ```
34 | ```json
35 | { "virginica": 6.58799, "versicolor": 5.9360, "setosa": 5.006 }
36 | ```
37 | Using the same grouping, find the average value for `Sepal.Width`.
38 | ```javascript
39 | g.mean("Sepal.Width");
40 | ```
41 | ```json
42 | { "virginica": 2.97399, "versicolor": 2.770, "setosa": 3.4279 }
43 | ```
44 |
45 | ### where
 46 | Filter rows where `Species` is `virginica`, then find the average `Sepal.Length`.
47 | ```javascript
48 | f = frame.where("Species", "virginica");
49 | f.mean("Sepal.Length");
50 | ```
51 | ```json
52 | 6.58799
53 | ```
54 | Get the number of rows that match the filter.
55 | ```javascript
56 | f.count();
57 | ```
58 | ```json
59 | 50
60 | ```
61 | Columns can also be accessed directly (with the filter applied).
62 | ```javascript
63 | f["Species"]
64 | ```
65 | ```javascript
66 | ["virginica", "virginica", "virginica", ..., "virginica"]
67 | ```
 68 | ## tests
69 | Hundreds of tests verify correctness on millions of data points (against a Pandas reference).
70 |
71 | `npm run data && npm run test`
72 |
 73 | ## benchmarks
74 | `npm run bench`
75 |
76 | typical performance on one million rows
77 |
78 | operation | time
79 | ----------|------
80 | `groupby` | 54ms
81 | `where` | 29ms
82 | `sum` | 5ms
83 |
 84 | ## design goals and inspiration
85 |
86 | * compatibility with [feather](https://github.com/wesm/feather)
87 |
 88 | ### interface
89 |
90 | * pandas
91 | * R
92 | * Linq
93 | * rethinkDB
94 | * Matlab
95 |
 96 | ### performance
97 |
98 | * [datavore](https://github.com/StanfordHCI/datavore)
99 |
--------------------------------------------------------------------------------
/benchmark/QUESTIONS.md:
--------------------------------------------------------------------------------
1 |
2 | ### Why are my dv results not consistent with their benchmark webpage?
 3 | Because dv slows down with consecutive runs, dropping to about a quarter of its initial performance.
4 |
5 | ok 1 table.query.sum: 1000000x3
6 | # 12.019 MFlops/sec ±16.51% n = 15 µ = 83ms : [0.022,0.02225,0.0935,0.092,0.0925,0.0925,0.09325,0.092,0.093,0.09275,0.09275,0.09225,0.09275,0.092,0.0925]
7 |
8 |
9 | ### Can I make Frame as fast as dv by encoding the strings?
 10 | Likely; encoding should give roughly a 3x speedup (see the sketch below).
11 |
12 | #### integers
13 | ok 1 groupby.sum: 1000000x3
14 | # 13.952 MFlops/sec ±2.01% n = 29 µ = 72ms : [0.0645,0.0625,0.0635,0.062,0.0775,0.0725,0.0715,0.073,0.0725,0.073,0.0735,0.0745,0.0725,0.0725,0.074,0.073,0.0715,0.077,0.072,0.071,0.072,0.0715,0.0725,0.0735,0.0725,0.073,0.0745,0.0715,0.0735]
15 |
16 | #### strings
17 | ok 1 groupby.sum: 1000000x3
18 | # 4.120 MFlops/sec ±3.74% n = 14 µ = 243ms : [0.239,0.235,0.232,0.234,0.267,0.267,0.24,0.235,0.235,0.236,0.279,0.233,0.233,0.233]
19 |
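A rough sketch of what that encoding could look like (illustrative code, not part of the benchmark suite), using the `(columns, keys)` constructor form documented in `lib/frame.js`:

```javascript
var Frame = require('../lib/frame');

// Encode a string column into integer codes plus a decode key:
// codes[i] indexes into key, so key[codes[i]] === values[i].
function encode(values){
	var key = [], codes = new Array(values.length), seen = {};
	for(var i = 0; i < values.length; i++){
		var v = values[i];
		if(!Object.prototype.hasOwnProperty.call(seen, v)){
			seen[v] = key.length;
			key.push(v);
		}
		codes[i] = seen[v];
	}
	return {"codes" : codes, "key" : key};
}

// illustrative columns
var groupCol = ["a", "b", "c", "a", "b"],
    valueCol = [1, 2, 3, 4, 5];

var encoded = encode(groupCol);

// pass the integer codes as the column and the key as its decoder
var frame = new Frame(
	{"id_0" : encoded.codes, "value" : valueCol},
	{"id_0" : encoded.key}
);
```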
20 | ### Is the FrameIndex.reduce faster than dv.query, when Frame.groupby has already been run?
 21 | Yes, but not quite as fast as the ultra-fast first two runs of dv.query.
22 |
23 | ok 1 sum: 1000000x3
24 | # 23.298 MFlops/sec ±1.40% n = 34 µ = 43ms : [0.037333333333333336,0.042333333333333334,0.042333333333333334,0.042666666666666665,0.042333333333333334,0.042666666666666665,0.042333333333333334,0.043333333333333335,0.042666666666666665,0.04566666666666667,0.044000000000000004,0.042666666666666665,0.044333333333333336,0.042666666666666665,0.04733333333333333,0.043666666666666666,0.042,0.042333333333333334,0.043000000000000003,0.042333333333333334,0.042666666666666665,0.042,0.044000000000000004,0.042666666666666665,0.042666666666666665,0.042666666666666665,0.042,0.042333333333333334,0.043000000000000003,0.048666666666666664,0.041666666666666664,0.041666666666666664,0.042333333333333334,0.043000000000000003]
25 |
26 | ### Can I make FrameIndex.reduce faster than the ultra-fast dv.query?
27 |
 28 | Things to try:
 29 | 1. reproducing the results
 30 | 2. removing the per-element function call (sketched below)
31 |
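For (2), a minimal sketch (an assumption, not existing code) of what an inlined reduction could look like, compared to the generic `indexreduce` in `lib/frame-index.js`:

```javascript
// Sum a column over a group's row indices without a per-element reducer call.
function indexsum(column, indices){
	var value = 0;
	for(var i = 0; i < indices.length; i++){
		value += column[indices[i]];
	}
	return value;
}
```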
32 |
33 | ### Is the dv setup longer?
34 |
35 | ### Why is dv faster initially?
36 |
--------------------------------------------------------------------------------
/benchmark/create.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 | function createSetup(N, K, useStrings){
6 | return function(event){
7 | // generate data
8 | this.groupCol = gen.Array.int(N, K);
9 | this.valueCol = gen.Array.int(N, 100);
10 |
11 | // map to strings
12 | if(useStrings)
13 | this.groupCol = this.groupCol.map(i => ["a", "b", "c"][i]);
14 |
15 | };
16 | }
17 |
18 | function test(){
19 |
20 | // create frame
21 | var columnDict = {
22 | "group-col" : this.groupCol,
23 | "reduce-col" : this.valueCol
24 | };
25 |
26 | this.frame = new Frame(columnDict);
27 | }
28 |
29 | var N = 100000,
30 | K = 3;
31 |
32 | var name = "create: " + N + "x" + K;
33 | benchtap(name, {"operations": N}, createSetup(N, K), test);
34 |
35 |
36 | name += " (strings)";
37 | benchtap(name, {"operations": N}, createSetup(N, K, true), test);
38 |
39 |
40 |
41 | var N = 1000000;
42 |
43 | name = "create: " + N + "x" + K;
44 | benchtap(name, {"operations": N}, createSetup(N, K), test);
45 |
46 |
47 | name += " (strings)";
48 | benchtap(name, {"operations": N}, createSetup(N, K, true), test);
49 |
--------------------------------------------------------------------------------
/benchmark/datavore/README.md:
--------------------------------------------------------------------------------
1 | Comparison benchmarks of similar operations for [datavore](https://github.com/StanfordHCI/datavore)
2 |
--------------------------------------------------------------------------------
/benchmark/datavore/create.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../../generate'),
3 | dv = require('./datavore');
4 |
5 | function createSetup(N, K, useStrings){
6 | return function(event){
7 |
8 | this.groupCol = gen.Array.int(N, K);
9 | this.valueCol = gen.Array.int(N, 100);
10 |
11 | if(useStrings)
12 | this.groupCol = this.groupCol.map(i => ["a", "b", "c"][i]);
13 | };
14 | }
15 |
16 | function test(){
17 |
18 | // create table
19 | var table = dv.table([
20 | {name:"group-col", type:"nominal", values:this.groupCol},
21 | {name:"reduce-col", type:"numeric", values:this.valueCol}
22 | ]);
23 | }
24 |
25 |
26 | // 1 hundred thousand data points/rows
27 | var N = 100000,
28 | K = 3;
29 |
30 | var name = "create: " + N + "x" + K;
31 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test);
32 |
33 | name += " (strings)";
34 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test);
35 |
36 | // 1 million data points/rows
37 | var N = 1000000;
38 |
39 | name = "create: " + N + "x" + K;
40 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test);
41 |
42 | name += " (strings)";
43 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test);
44 |
--------------------------------------------------------------------------------
/benchmark/datavore/datavore/index.js:
--------------------------------------------------------------------------------
1 |
2 |
3 | module.exports = (function() {
4 | /**
5 | * The top-level Datavore namespace. All public methods and fields should be
6 | * registered on this object. Note that core Datavore source is surrounded by an
7 | * anonymous function, so any other declared globals will not be visible outside
8 | * of core methods. This also allows multiple versions of Datavore to coexist,
9 | * since each version will see their own dv namespace.
10 | *
11 | * @namespace The top-level Datavore namespace, dv.
12 | */
13 | var dv = {version: "1.0.0"};
14 |
15 | dv.array = function(n) {
16 | var a = Array(n);
17 | for (var i = n; --i >= 0;) { a[i] = 0; }
18 | return a;
19 | }
20 |
21 | // -- RANDOM NUMBER GENERATORS ------------------------------------------------
22 |
23 | dv.rand = {};
24 |
25 | dv.rand.uniform = function(min, max) {
26 | min = min || 0;
27 | max = max || 1;
28 | var delta = max - min;
29 | return function() {
30 | return min + delta * Math.random();
31 | }
32 | };
33 |
34 | dv.rand.integer = function(a, b) {
35 | if (b === undefined) {
36 | b = a;
37 | a = 0;
38 | }
39 | return function() {
40 | return a + Math.max(0, Math.floor(b * (Math.random() - 0.001)));
41 | }
42 | }
43 |
44 | dv.rand.normal = function(mean, stdev) {
45 | mean = mean || 0;
46 | stdev = stdev || 1;
47 | var next = undefined;
48 | return function() {
49 | var x = 0, y = 0, rds, c;
50 | if (next !== undefined) {
51 | x = next;
52 | next = undefined;
53 | return x;
54 | }
55 | do {
56 | x = Math.random() * 2 - 1;
57 | y = Math.random() * 2 - 1;
58 | rds = x * x + y * y;
59 | } while (rds == 0 || rds > 1);
60 | c = Math.sqrt(-2 * Math.log(rds) / rds); // Box-Muller transform
61 | next = mean + y * c * stdev;
62 | return mean + x * c * stdev;
63 | }
64 | }
65 | // -- DATA TABLE --------------------------------------------------------------
66 |
67 | dv.type = {
68 | nominal: "nominal",
69 | ordinal: "ordinal",
70 | numeric: "numeric",
71 | unknown: "unknown"
72 | };
73 |
74 | dv.table = function(input)
75 | {
76 | var table = []; // the data table
77 |
78 | table.addColumn = function(name, values, type, iscolumn) {
79 | type = type || dv.type.unknown;
80 | var compress = (type === dv.type.nominal || type === dv.type.ordinal);
81 | var vals = values;
82 |
83 | if (compress && !iscolumn) {
84 | vals = [];
85 | vals.lut = code(values);
86 | for (var i = 0, map=dict(vals.lut); i < values.length; ++i) {
87 | vals.push(map[values[i]]);
88 | }
89 | vals.get = function(idx) { return this.lut[this[idx]]; }
90 | } else if (!iscolumn) {
91 | vals.get = function(idx) { return this[idx]; }
92 | }
93 | vals.name = name;
94 | vals.index = table.length;
95 | vals.type = type;
96 |
97 | table.push(vals);
98 | table[name] = vals;
99 | };
100 |
101 | table.removeColumn = function(col) {
102 | col = table[col] || null;
103 | if (col != null) {
104 | delete table[col.name];
105 | table.splice(col.index, 1);
106 | }
107 | return col;
108 | };
109 |
110 | table.rows = function() { return table[0] ? table[0].length : 0; };
111 |
112 | table.cols = function() { return table.length; };
113 |
114 | table.get = function(col, row) { return table[col].get(row); }
115 |
116 | table.dense_query = function(q) {
117 | var tab = q.where ? table.where(q.where) : table;
118 | var dims = [], sz = [1], hasDims = q.dims;
119 | if (hasDims) {
120 | sz = [];
121 | for (i = 0; i < q.dims.length; ++i) {
122 | var dim = q.dims[i], type = typeof dim;
123 | if (type === "string" || type === "number") {
124 | col = tab[dim];
125 | } else if (dim.array) {
126 | col = dim.array(tab[dim.value]);
127 | }
128 | dims.push(col);
129 | sz.push(col.lut.length);
130 | }
131 | }
132 |
133 | var vals = q.vals, // aggregate query operators
134 | C = sz.reduce(function(a,b) { return a * b; }, 1), // cube cardinality
135 | N = tab[0].length, p, col, v, name, expr, // temp vars
136 | cnt, sum, ssq, min, max, // aggregate values
137 | _cnt, _sum, _ssq, _min, _max, // aggregate flags
138 | ctx = {}, emap = {}, exp = [], lut, // aggregate state vars
139 | i = 0, j = 0, k = 0, l = 0, idx = 0, len, slen = sz.length; // indices
140 |
141 | // Identify Requested Aggregates
142 | var star = false;
143 | for (i = 0; i < vals.length; ++i) {
144 | var req = vals[i].init();
145 | for (expr in req) {
146 | if (expr == "*") {
147 | req[expr].map(function(func) {
148 | ctx[func] = dv.array(C);
149 | });
150 | star = true;
151 | } else {
152 | idx = tab[expr].index;
153 | name = tab[expr].name;
154 | req[expr].map(function(func) {
155 | ctx[func + "_" + name] = (ctx[func + "_" + idx] = dv.array(C));
156 | });
157 | if (!emap[idx]) {
158 | emap[idx] = true;
159 | exp.push(idx);
160 | }
161 | }
162 | }
163 | }
164 | if (exp.length == 0 && star) { exp.push(-1) };
165 |
166 | // Compute Cube Index Coefficients
167 | for (i = 0, p = [1]; i < slen; ++i) {
168 | p.push(p[i] * sz[i]);
169 | }
170 |
171 | // Execute Query: Compute Aggregates
172 | for (j = 0, len = exp.length; j < len; ++j) {
173 | expr = exp[j];
174 | cnt = ctx["cnt"]; _cnt = (cnt && j==0);
175 | sum = ctx["sum_" + expr]; _sum = (sum !== undefined);
176 | ssq = ctx["ssq_" + expr]; _ssq = (ssq !== undefined);
177 | min = ctx["min_" + expr]; _min = (min !== undefined);
178 | max = ctx["max_" + expr]; _max = (max !== undefined);
179 | col = tab[expr];
180 | outer:
181 | for (i = 0; i < N; ++i) {
182 | for (idx = 0, k = 0; k < slen; ++k) {
183 | // compute cube index
184 | l = (hasDims ? dims[k][i] : 0);
185 | if (l < 0) continue outer;
186 | idx += p[k] * l;
187 | }
188 | if (col) { v = col[i]; }
189 | if (_cnt) { cnt[idx] += 1; }
190 | if (_sum) { sum[idx] += v; }
191 | if (_ssq) { ssq[idx] += v * v; }
192 | if (_min && v < min[idx]) { min[idx] = v; }
193 | if (_max && v > max[idx]) { max[idx] = v; }
194 | }
195 | }
196 |
197 | // Generate Results
198 | var result = [], stride = 1, s, val, code = q.code || false;
199 | for (i = 0; i < dims.length; ++i) {
200 | col = [];
201 | lut = dims[i].lut;
202 | s = sz[i];
203 | val = 0;
204 | for (j = 0, k = 0, c = -1; j < C; ++j, ++k) {
205 | if (k == stride) { k = 0; val = (val + 1) % s; }
206 | col[j] = code ? val : lut[val];
207 | }
208 | stride *= s;
209 | col.unique = lut.length;
210 | result.push(col);
211 | }
212 | vals.map(function(op) { result.push(op.done(ctx)); });
213 | return result;
214 | };
215 |
216 | table.query = table.dense_query;
217 |
218 | table.sparse_query = function(q) {
219 | var tab = q.where ? table.where(q.where) : table;
220 | var dims = [], sz = [1], hasDims = q.dims;
221 | if (hasDims) {
222 | sz = [];
223 | for (i=0; i max[idx])) {
306 | max[idx] = v;
307 | }
308 | }
309 | }
310 |
311 | // Generate Results
312 | var rr = vals.map(function(op) { return op.done(ctx); });
313 | var keys = rr[0];
314 | if (rr.length > 1) {
315 | keys = {};
316 | rr.forEach(function(o) { for (var k in o) keys[k] = 1; });
317 | }
318 | var result = dims.map(function() { return []; });
319 | vals.forEach(function() { result.push([]); });
320 | len = dims.length;
321 |
322 | for (k in keys) {
323 | // map index i to dimensional indices
324 | var nn = C, uv, div;
325 | for (i = k, j = len; --j >= 0;) {
326 | uv = dims[j].lut.length;
327 | div = ~~(nn / uv);
328 | result[j].push(dims[j].lut[~~(i / div)]);
329 | i = i % div;
330 | nn = ~~(nn / uv);
331 | }
332 | for (j = 0; j < rr.length; ++j) {
333 | val = rr[j][k];
334 | result[len + j].push(val === undefined ? 0 : val);
335 | }
336 | }
337 | return result;
338 | };
339 |
340 | table.where = function(f) {
341 | var nrows = table.rows(),
342 | ncols = table.cols();
343 |
344 | // initialize result table
345 | var result = dv.table([]);
346 | for (var i = 0; i < ncols; ++i) {
347 | result.push([]);
348 | result[i].name = table[i].name;
349 | result[i].type = table[i].type;
350 | result[i].index = i;
351 | result[table[i].name] = result[i];
352 | if (table[i].lut) { result[i].lut = table[i].lut; }
353 | }
354 |
355 | // populate result table
356 | for (var row = 0, j = -1; row < nrows; ++row) {
357 | if (f(table, row)) {
358 | for (i = 0, ++j; i < ncols; ++i) {
359 | result[i][j] = table[i][row];
360 | }
361 | }
362 | }
363 | return result;
364 | };
365 |
366 | /** @private */
367 | function code(a) {
368 | var c = [], d = {}, v;
369 | for (var i=0, len=a.length; i maxv) { maxv = val; }
517 | }
518 | if (minb) { minv = Math.floor(minv / step) * step; }
519 | if (maxb) { maxv = Math.ceil(maxv / step) * step; }
520 | }
521 | // compute index array
522 | var a = [], lut = (a.lut = []),
523 | range = (maxv - minv), unique = Math.ceil(range / step);
524 | for (i = 0; i < N; ++i) {
525 | val = values[i];
526 | if (val < minv || val > maxv) { a.push(-1); }
527 | else if (val == maxv) { a.push(unique - 1); }
528 | else { a.push(~~((values[i] - minv) / step)); }
529 | }
530 | for (i = 0; i < unique; ++i) {
531 | // multiply b/c adding garners round-off error
532 | lut.push(minv + i * step);
533 | }
534 | return a;
535 | };
536 | op.step = function(x) {
537 | if (x === undefined) return step;
538 | step = x;
539 | return op;
540 | };
541 | op.min = function(x) {
542 | if (x === undefined) return min;
543 | min = x;
544 | return op;
545 | };
546 | op.max = function(x) {
547 | if (x === undefined) return max;
548 | max = x;
549 | return op;
550 | };
551 | op.value = expr;
552 | return op;
553 | };
554 |
555 | dv.quantile = function(expr, n) {
556 | function search(array, value) {
557 | var low = 0, high = array.length - 1;
558 | while (low <= high) {
559 | var mid = (low + high) >> 1, midValue = array[mid];
560 | if (midValue < value) { low = mid + 1; }
561 | else if (midValue > value) { high = mid - 1; }
562 | else { return mid; }
563 | }
564 | var i = -low - 1;
565 | return (i < 0) ? (-i - 1) : i;
566 | }
567 |
568 | var op = {};
569 | op.array = function(values) {
570 | // get sorted data values
571 | var i, d = values.sorted;
572 | if (!d) {
573 | var cmp;
574 | if (values.type && values.type === "numeric") {
575 | cmp = function(a,b) { return a - b; }
576 | } else {
577 | cmp = function(a,b) { return a < b ? -1 : a > b ? 1 : 0; }
578 | }
579 | values.sorted = (d = values.slice().sort(cmp));
580 | }
581 | // compute quantile boundaries
582 | var q = [d[0]], a = [], lut = (a.lut = []);
583 | for (i = 1; i <= n; ++i) {
584 | q[i] = d[~~(i * (d.length - 1) / n)];
585 | lut.push(i - 1);
586 | }
587 | // iterate through data and label quantiles
588 | for (i = 0; i < values.length; ++i) {
589 | a.push(Math.max(0, search(q, values[i]) - 1));
590 | }
591 | return a;
592 | }
593 | op.bins = function(x) {
594 | if (x === undefined) return n;
595 | n = x;
596 | return op;
597 | }
598 | op.value = expr;
599 | return op;
600 | };
601 |
602 | return dv; })();
603 |
--------------------------------------------------------------------------------
/benchmark/datavore/table.query.sum.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../../generate'),
3 | dv = require('./datavore');
4 |
5 | function createSetup(N, K, useStrings){
6 | return function(event){
7 |
8 | var groupCol = gen.Array.int(N, K);
9 | var valueCol = gen.Array.int(N, 100);
10 |
11 | if(useStrings)
12 | groupCol = groupCol.map(i => ["a", "b", "c"][i]);
13 |
14 | // create table
15 | this.table = dv.table([
16 | {name:"group-col", type:"nominal", values:groupCol},
17 | {name:"reduce-col", type:"numeric", values:valueCol}
18 | ]);
19 |
20 | // generate data
21 | /*
22 | this.table = dv.table();
23 | this.table.addColumn("group-col", groupCol, dv.type.nominal);
24 | this.table.addColumn("reduce-col", valueCol, dv.type.numeric);
25 | */
26 |
27 | };
28 | }
29 |
30 | function test(){
31 |
32 | var result = this.table.query({
33 | "dims" : [0],
34 | "vals" : [dv.sum("reduce-col")]
35 | });
36 | }
37 |
38 |
39 | var N = 100000,
40 | K = 3;
41 |
42 | var name = "table.query.sum: " + N + "x" + K;
43 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test);
44 |
45 | name += " (strings)";
46 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test);
47 |
48 |
49 | var N = 1000000;
50 |
51 | name = "table.query.sum: " + N + "x" + K;
52 | benchtap(name, {"operations" : 2*N}, createSetup(N, K), test);
53 |
54 | name += " (strings)";
55 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, true), test);
56 |
--------------------------------------------------------------------------------
/benchmark/datavore/table.query.sum.multi.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../../generate'),
3 | dv = require('./datavore');
4 |
5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
6 |
7 | function createSetup(N, K, M, useStrings){
8 | return function(event){
9 |
10 | var columns = [
11 | {"name" : "value", "type":"numeric", "values": gen.Array.int(N, 100)}
12 | ];
13 | var names = [];
14 | for (var m = 0; m < M; m++){
15 | var name = "id_"+m;
16 | var column = {
17 | "name" : name,
18 | "type" : "ordinal",
19 | "values" : gen.Array.int(N, K)
20 | };
21 |
22 | // map to strings
23 | if(useStrings){
24 | column.values = column.values.map(i => STRINGS[i]);
25 | }
26 |
27 | columns.push(column);
28 |
29 |
30 | names[m] = name;
31 | }
32 |
33 | // create table
34 | this.table = dv.table(columns);
35 |
36 | // generate data
37 | /*
38 | this.table = dv.table();
39 | this.table.addColumn("group-col", groupCol, dv.type.nominal);
40 | this.table.addColumn("reduce-col", valueCol, dv.type.numeric);
41 | */
42 |
43 | };
44 | }
45 |
46 |
47 | function test(){
48 |
49 | //var names = this.names;
50 | //"dims" : ["id_0", "id_1"],
51 | var result = this.table.query({
52 | "dims" : ["id_0", "id_1", "id_2", "id_3"],
53 | "vals" : [dv.sum("value")]
54 | });
55 | }
56 |
57 |
58 | var N = 100000,
59 | K = 3,
60 | M = 4;
61 |
62 | var name = "table.query.sum.multi: " + N + "x" + K + "x" + M;
63 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), test);
64 |
65 | name += " (strings)";
66 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), test);
67 |
68 |
69 | var N = 1000000;
70 |
71 | name = "table.query.sum.multi: " + N + "x" + K + "x" + M;
72 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), test);
73 |
74 | name += " (strings)";
75 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), test);
76 |
--------------------------------------------------------------------------------
/benchmark/groupby.sum.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 |
6 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
7 |
8 | // create a frame for multidimensional groupby
9 | function createSetup(N, K, M, useStrings){
10 | return function(event){
11 | // generate data
12 | var columns = {
13 | "value" : gen.Array.int(N, 100)
14 | };
15 | var names = [];
16 | for (var m = 0; m < M; m++){
17 | var name = "id_"+m;
18 | columns[name] = gen.Array.int(N, K);
19 |
20 | // map to strings
21 | if(useStrings){
22 | columns[name] = columns[name].map(i => STRINGS[i]);
23 | }
24 |
25 | names[m] = name;
26 | }
27 | //console.log(names);
28 |
29 |
30 | // create frame
31 | this.frame = new Frame(columns);
32 | };
33 | }
34 |
35 |
36 | var N = 100000,
37 | K = 3,
38 | M = 1;
39 |
40 | var groups = [];
41 | for(var i = 0; i < M; i ++) groups.push("id_"+i);
42 |
43 | var name = "groupby.sum: " + N + "x" + K + "x" + M;
44 |
45 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), function(){
46 |
47 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]);
48 | var group = this.frame.groupby(groups);
49 | var result = group.sum("value");
50 | });
51 |
52 | name += " (strings)";
53 |
54 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), function(){
55 |
56 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]);
57 | var group = this.frame.groupby(groups);
58 | var result = group.sum("value");
59 | });
60 |
61 | N = 1000000;
62 | name = "groupby.sum: " + N + "x" + K + "x" + M;
63 |
64 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), function(){
65 |
66 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]);
67 | var group = this.frame.groupby(groups);
68 | var result = group.sum("value");
69 | });
70 |
71 | name += " (strings)";
72 |
73 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), function(){
74 |
75 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]);
76 | var group = this.frame.groupby(groups);
77 | var result = group.sum("value");
78 | });
79 |
80 | M = 2;
81 |
82 | name = "groupby.sum: " + N + "x" + K + "x" + M;
83 |
84 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M), function(){
85 |
86 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]);
87 | var group = this.frame.groupby(groups);
88 | var result = group.sum("value");
89 | });
90 |
91 | K = 200;
92 | M = 2;
93 | name = "groupby.sum: " + N + "x" + K + "x" + M;
94 | benchtap(name, {"operations" : 2*N}, createSetup(N, K, M, true), function(){
95 |
96 | //var group = this.frame.groupbymulti(["group-col0", "group-col1"]);
97 | var group = this.frame.groupby(groups);
98 | var result = group.sum("value");
99 | });
100 | /*
101 | var tests = [
102 | 0
103 | ];
104 |
105 | var RTOL = 1e-05, // 1e-05
106 | ATOL = 1e-12; // 1e-12
107 |
108 | var dataDirectory = 'test/data/sum/',
109 | testFile = 'small.json';
110 |
111 | var floader = require('floader'),
112 | dtest = require('../lib/test');
113 |
114 | floader.load(dataDirectory + testFile, function(err, config){
115 |
116 | var suite = JSON.parse(config);
117 |
118 | for(var j = 0; j < tests.length; j++){
119 |
120 | var i = tests[j];
121 | var prefix = String("0000" + (i + 1)).slice(-4);
122 |
123 | // directory containing matrix data files for current test
124 | var directory = dataDirectory + prefix + '/';
125 |
126 | var test = suite[i];
127 |
128 | var names = test.id.map(function(spec, i){ return "id_" + i;});
129 | var types = test.id.map(function(spec, i){ return spec['type'];});
130 |
131 | var value_names = ["value_0"];
132 | var value_types = [test.value[0].type];
133 |
134 | var N = test.N; // number of rows
135 | var distincts = test.id.map(function(spec, i){ return spec.K; });
136 |
137 | var testName = "groupby.summulti: " + N + " x " + "(" + distincts.join(", ") + ")"
138 | //tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
139 |
140 | //var name = "groupby.sum.multi: " + N + "x" + K + "x" + M;
141 |
142 | benchtap(testName, {"operations" : 2*N},
143 | createSetup(directory, names, types, value_names, value_types),
144 | function(event){
145 |
146 | var g = this.frame.groupbymulti(names);
147 | var actual = g.summulti(value_names[0]);
148 |
149 | event.resolve();
150 | });
151 | }
152 | });
153 |
154 | var OUT_FILENAME = "out.json";
155 |
156 | function createSetup(directory, id_names, id_types, value_names, value_types){
157 | return function(event){
158 |
159 | var self = this;
160 | var names = id_names.concat(value_names);
161 | var types = id_types.concat(value_types);
162 |
163 | // which columns require a key file?
164 | var key_names = id_names.filter(function(item, i){
165 | return id_types[i] in dtest.string_types
166 | });
167 | var key_types = id_types.filter(function(item, i){
168 | return item in dtest.string_types
169 | });
170 |
171 | console.log(directory);
172 | // load columns from files
173 | dtest.load(directory, names, types, function(err, columns){
174 |
175 | if(err) return console.log(err);
176 |
177 | console.log("running setup.");
178 | // load key files
179 | dtest.load_key(directory, key_names, key_types, function(err, keys){
180 |
181 | floader.load(directory + OUT_FILENAME, function(err, out){
182 | var expected = JSON.parse(out);
183 |
184 | var column_set = {};
185 | for (var i = 0; i < names.length; i++){
186 | var name = names[i];
187 | var column = columns[i];
188 | column_set[name] = column;
189 | }
190 | // keys map a small set of integers to other things (like strings)
191 | // they're a very simple form of fixed length coding
192 | var key_set = {};
193 | for (var i = 0; i < keys.length; i++){
194 | var name = key_names[i];
195 | var key = keys[i];
196 | key_set[name] = key;
197 | }
198 |
199 | self.frame = new Frame(column_set, key_set);
200 |
201 | event.resolve();
202 |
203 | });
204 |
205 | });
206 | });
207 | };
208 | }
209 | */
210 |
--------------------------------------------------------------------------------
/benchmark/groupby_sum.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
6 | /*
7 | N - number of rows
8 | K - number of distinct values in id columns
9 | M - number of id columns
10 | */
11 | function createSetup(N, K, M, useStrings){
12 | return function(event){
13 | // generate data
14 | var columns = {
15 | "value" : gen.Array.int(N, 100)
16 | };
17 | var names = [];
18 | for (var m = 0; m < M; m++){
19 | var name = "id_"+m;
20 | columns[name] = gen.Array.int(N, K);
21 |
22 | // map to strings
23 | if(useStrings){
24 | columns[name] = columns[name].map(i => STRINGS[i]);
25 | }
26 |
27 | names[m] = name;
28 | }
29 |
30 | // create frame
31 | this.frame = new Frame(columns);
32 | // group on all id columns
33 | this.group = this.frame.groupby(names);
34 | };
35 | }
36 |
37 | var N = 100000,
38 | K = 3,
39 | M = 1;
40 |
41 | var name = "sum: " + N + "x" + K + "x" + M;
42 |
43 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
44 | var result = this.group.sum("value");
45 | });
46 |
47 | /*
48 | name += " (strings)";
49 |
50 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
51 | var result = this.group.reduce("reduce-col");
52 | });
53 | */
54 |
55 |
56 | var N = 1000000;
57 |
58 | name = "sum: " + N + "x" + K + "x" + M;
59 |
60 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
61 | var result = this.group.sum("value");
62 | });
63 |
64 | M = 2;
65 |
66 | name = "sum: " + N + "x" + K + "x" + M;
67 |
68 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
69 | var result = this.group.sum("value");
70 | });
71 |
72 | /*
73 | name += " (strings)";
74 |
75 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
76 | var result = this.group.reduce("reduce-col");
77 | });
78 | */
79 |
80 | K = 200;
81 | M = 2;
82 |
83 | var name = "sum: " + N + "x" + K + "x" + M;
84 |
85 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
86 | var result = this.group.sum("value");
87 | });
88 |
--------------------------------------------------------------------------------
/benchmark/median.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
6 | /*
7 | N - number of rows
8 | K - number of distinct values in id columns
9 | M - number of id columns
10 | */
11 | function createSetup(N, K, M, useStrings){
12 | return function(event){
13 | // generate data
14 | var columns = {
15 | "value" : gen.Array.int(N, 100)
16 | };
17 | var names = [];
18 | for (var m = 0; m < M; m++){
19 | var name = "id_"+m;
20 | columns[name] = gen.Array.int(N, K);
21 |
22 | // map to strings
23 | if(useStrings){
24 | columns[name] = columns[name].map(i => STRINGS[i]);
25 | }
26 |
27 | names[m] = name;
28 | }
29 |
30 | // create frame
31 | this.frame = new Frame(columns);
32 | };
33 | }
34 |
35 | var N = 100000,
36 | K = 3,
37 | M = 1;
38 |
39 | var name = "median: " + N + "x" + K + "x" + M;
40 |
41 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
42 | var result = this.frame.median("value");
43 | });
44 |
45 | /*
46 | name += " (strings)";
47 |
48 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
49 | var result = this.group.reduce("reduce-col");
50 | });
51 | */
52 |
53 |
54 | var N = 1000000;
55 |
56 | name = "median: " + N + "x" + K + "x" + M;
57 |
58 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
59 | var result = this.frame.median("value");
60 | });
61 |
62 | /*
63 | name += " (strings)";
64 |
65 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
66 | var result = this.group.reduce("reduce-col");
67 | });
68 | */
69 |
70 | K = 200;
71 | M = 2;
72 |
73 | var name = "median: " + N + "x" + K + "x" + M;
74 |
75 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
76 | var result = this.frame.median("value");
77 | });
78 |
--------------------------------------------------------------------------------
/benchmark/sum.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
6 | /*
7 | N - number of rows
8 | K - number of distinct values in id columns
9 | M - number of id columns
10 | */
11 | function createSetup(N, K, M, useStrings){
12 | return function(event){
13 | // generate data
14 | var columns = {
15 | "value" : gen.Array.int(N, 100)
16 | };
17 | var names = [];
18 | for (var m = 0; m < M; m++){
19 | var name = "id_"+m;
20 | columns[name] = gen.Array.int(N, K);
21 |
22 | // map to strings
23 | if(useStrings){
24 | columns[name] = columns[name].map(i => STRINGS[i]);
25 | }
26 |
27 | names[m] = name;
28 | }
29 |
30 | // create frame
31 | this.frame = new Frame(columns);
32 | };
33 | }
34 |
35 | var N = 100000,
36 | K = 3,
37 | M = 1;
38 |
39 | var name = "sum: " + N + "x" + K + "x" + M;
40 |
41 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
42 | var result = this.frame.sum("value");
43 | });
44 |
45 | /*
46 | name += " (strings)";
47 |
48 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
49 | var result = this.group.reduce("reduce-col");
50 | });
51 | */
52 |
53 |
54 | var N = 1000000;
55 |
56 | name = "sum: " + N + "x" + K + "x" + M;
57 |
58 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
59 | var result = this.frame.sum("value");
60 | });
61 |
62 | /*
63 | name += " (strings)";
64 |
65 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
66 | var result = this.group.reduce("reduce-col");
67 | });
68 | */
69 |
70 | K = 200;
71 | M = 2;
72 |
73 | var name = "sum: " + N + "x" + K + "x" + M;
74 |
75 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
76 | var result = this.frame.sum("value");
77 | });
78 |
--------------------------------------------------------------------------------
/benchmark/where.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
6 | /*
7 | N - number of rows
8 | K - number of distinct values in id columns
9 | M - number of id columns
10 | */
11 | function createSetup(N, K, M, useStrings){
12 | return function(event){
13 | // generate data
14 | var columns = {
15 | "value" : gen.Array.int(N, 100)
16 | };
17 | var names = [];
18 | for (var m = 0; m < M; m++){
19 | var name = "id_"+m;
20 | columns[name] = gen.Array.int(N, K);
21 |
22 | // map to strings
23 | if(useStrings){
24 | columns[name] = columns[name].map(i => STRINGS[i]);
25 | }
26 |
27 | names[m] = name;
28 | }
29 |
30 | // create frame
31 | this.frame = new Frame(columns);
32 | //this.frame.where(row => row["id_1"] == 1);
33 | //this.frame.where("id_1", id => id == 1);
34 | };
35 | }
36 |
37 | var N = 100000,
38 | K = 3,
39 | M = 2;
40 |
41 | var name = "where.function: " + N + "x" + K + "x" + M;
42 |
43 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
44 | //var result = this.frame.where(row => row["id_0"] == 1);
45 | var result = this.frame.where("id_0", id => id == 1);
46 | });
47 |
48 | /*
49 | name += " (strings)";
50 |
51 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
52 | var result = this.group.reduce("reduce-col");
53 | });
54 | */
55 |
56 |
57 | var N = 1000000;
58 |
59 | name = "where.function: " + N + "x" + K + "x" + M;
60 |
61 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
62 | //var result = this.frame.where(row => row["id_0"] == 1);
63 | var result = this.frame.where("id_0", id => id == 1);
64 | });
65 |
66 | /*
67 | name += " (strings)";
68 |
69 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
70 | var result = this.group.reduce("reduce-col");
71 | });
72 | */
73 |
74 | N = 1000000;
75 | K = 200;
76 | M = 2;
77 |
78 | var name = "where.equal: " + N + "x" + K + "x" + M;
79 |
80 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
81 | //var result = this.frame.where(row => row["id_0"] == 1);
82 | var result = this.frame.where("id_0", 1);
83 | });
84 |
85 | var name = "where.in: " + N + "x" + K + "x" + M;
86 |
87 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
88 | //var result = this.frame.where(row => row["id_0"] == 1);
89 | var result = this.frame.where("id_0", [0, 1, 3, 10, 12, 18, 101, 52, 23, 18, 7, 12, 154, 34, 117, 5]);
90 | });
91 |
--------------------------------------------------------------------------------
/benchmark/where_sum.js:
--------------------------------------------------------------------------------
1 | var benchtap = require('benchtap'),
2 | gen = require('../lib/test').generate,
3 | Frame = require('../lib/frame');
4 |
5 | var STRINGS = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"];
6 | /*
7 | N - number of rows
8 | K - number of distinct values in id columns
9 | M - number of id columns
10 | */
11 | function createSetup(N, K, M, useStrings){
12 | return function(event){
13 | // generate data
14 | var columns = {
15 | "value" : gen.Array.int(N, 100)
16 | };
17 | var names = [];
18 | for (var m = 0; m < M; m++){
19 | var name = "id_"+m;
20 | columns[name] = gen.Array.int(N, K);
21 |
22 | // map to strings
23 | if(useStrings){
24 | columns[name] = columns[name].map(i => STRINGS[i]);
25 | }
26 |
27 | names[m] = name;
28 | }
29 |
30 | // create frame
31 | this.frame = new Frame(columns);
32 | this.frame = this.frame.where("id_0", 0);
33 | //this.frame.where(row => row["id_1"] == 1);
34 | //this.frame.where("id_1", id => id == 1);
35 | };
36 | }
37 |
38 | var N = 100000,
39 | K = 3,
40 | M = 2;
41 |
42 | var name = "where.sum: " + N + "x" + K + "x" + M;
43 |
44 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
45 | //var result = this.frame.where(row => row["id_0"] == 1);
46 | var result = this.frame.sum("value");
47 | });
48 |
49 | /*
50 | name += " (strings)";
51 |
52 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
53 | var result = this.group.reduce("reduce-col");
54 | });
55 | */
56 |
57 |
58 | var N = 1000000;
59 |
60 | name = "where.sum: " + N + "x" + K + "x" + M;
61 |
62 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
63 | //var result = this.frame.where(row => row["id_0"] == 1);
64 | var result = this.frame.sum("value");
65 | });
66 |
67 | /*
68 | name += " (strings)";
69 |
70 | benchtap(name, {"operations": N}, createSetup(N, K, true), function(){
71 | var result = this.group.reduce("reduce-col");
72 | });
73 | */
74 |
75 | N = 1000000;
76 | K = 200;
77 | M = 2;
78 |
79 | var name = "where.sum: " + N + "x" + K + "x" + M;
80 |
81 | benchtap(name, {"operations": N}, createSetup(N, K, M), function(){
82 | var result = this.frame.sum("value")
83 | });
84 |
--------------------------------------------------------------------------------
/lib/frame-index.js:
--------------------------------------------------------------------------------
1 |
2 | var reducers = require('./stream-reducers');
3 |
4 | function isobject(obj){ return Object.prototype.toString.call(obj) === "[object Object]";}
5 |
 6 | /* A hierarchical index for the Frame data structure, the result of a call to
7 | * Frame.groupby
8 | */
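/* Usage sketch (illustrative values, following the README examples):
 *
 *   var f = new Frame({"id" : [0, 1, 0], "value" : [2, 3, 5]});
 *   var g = f.groupby("id");   // g is a FrameIndex
 *   g.sum("value");            // => { "0": 7, "1": 3 }
 *   g.count();                 // => { "0": 2, "1": 1 }
 */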
9 | function FrameIndex(frame, index, groups){
10 | this._frame = frame;
11 | this._index = index;
12 | this._groups = groups;
13 | }
14 |
15 | module.exports = FrameIndex;
16 |
17 | /*
18 | */
19 | FrameIndex.prototype.columns = function(){
20 | return this._frame.columns();
21 | };
22 |
23 | FrameIndex.prototype.groups = function(){
24 | return this._groups;
25 | };
26 |
27 | FrameIndex.prototype.count = function(){
28 | var reduced = {};
29 | var index = this._index;
30 |
31 | // depth first iteration
32 | var todo = [[index, reduced, 0]];
33 |
34 | var result;
35 | while (todo.length > 0){
36 | n = todo.pop();// object
37 | index = n[0];
38 | result = n[1];
39 | level = n[2];
40 |
41 | var c, name;
42 | for(key in index){ // keys in object
43 | c = index[key];
44 | name = this._groups[level];
45 |
46 | // decode the key, if possible
47 | if(this._frame._keys && name in this._frame._keys){
48 | decoder = this._frame._keys[name];
49 | key = decoder[key];
50 | }
51 |
52 | if(isobject(c)){
53 | result[key] = {};
54 | todo.push([c, result[key], level + 1]);
55 | } else {
56 | result[key] = c.length; // reduce
57 | }
58 | }
59 | }
60 |
61 | return reduced;
62 |
63 | };
64 |
65 | FrameIndex.prototype.sum = function(selector){
66 | return this.reduce(selector, reducers.sum);
67 | };
68 |
69 | FrameIndex.prototype.reduce = function(selector, reducer, initial){
70 |
71 | var reduced = {};
72 | var index = this._index;
73 | var column = this._frame._cols[selector];
74 |
75 | reducer = reducer ||
76 | ((column.length > 0 && Object.prototype.toString.call(column[0]) == "[object Number]") ?
77 | reducers.sum :
78 | reducers.max);
79 |
80 | // depth first traversal
81 | var todo = [[index, reduced, 0]];
82 |
83 | var result;
84 | while (todo.length > 0){
85 | n = todo.pop();// object
86 | index = n[0];
87 | result = n[1];
88 | level = n[2];
89 |
90 | var c, name;
91 | for(key in index){ // keys in object
92 | c = index[key];
93 | group = this._groups[level];
94 |
95 | // decode the key, if possible
96 | if(this._frame._keys && group in this._frame._keys){
97 | decoder = this._frame._keys[group];
98 | key = decoder[key];
99 | }
100 |
101 | if(isobject(c)){
102 | result[key] = {};
103 | todo.push([c, result[key], level + 1]);
104 | } else {
105 | var indices = c;
106 | var value = indexreduce(column, indices, reducer, initial);
107 |
108 | result[key] = value;
109 | }
110 | }
111 | }
112 |
113 | return reduced;
114 |
115 | };
116 |
117 | /* reduce a subset of an array given by a set of indices using a supplied
118 | reducing function.
119 | */
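// Example (illustrative): indexreduce([10, 20, 30, 40], [0, 2], function(a, b){ return a + b; })
// seeds the accumulator with column[0] (10), folds in column[2] (30), and returns 40.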
120 | function indexreduce(column, indices, reducer, initial){
121 |
122 | var start,
123 | value;
124 |
 125 | // choose the initial value and loop start based on the number of indices and the
126 | // supplied initial value
127 | if(initial !== void(0)){
128 | start = 0;
129 | value = initial;
130 | } else if(indices.length > 0) {
131 | start = 1;
132 | value = column[indices[0]];
133 | } else {
134 | start = 0;
135 | value = 0;
136 | }
137 |
138 | for(var i = start; i < indices.length; i++){
139 | index = indices[i];
140 | value = reducer(value, column[index], i);
141 | }
142 |
143 | return value;
144 |
145 | }
146 |
--------------------------------------------------------------------------------
/lib/frame.js:
--------------------------------------------------------------------------------
1 |
2 | var reducers = require('./stream-reducers');
3 | var BitArray = require('bit-array');
4 |
5 |
6 | function isarray(obj){ return Object.prototype.toString.call(obj) === "[object Array]";}
7 | function isobject(obj){ return Object.prototype.toString.call(obj) === "[object Object]";}
8 | function isnumber(obj){ return Object.prototype.toString.call(obj) === "[object Number]";}
9 | function isinteger(num){ return num % 1 === 0;}
10 | function isstring(obj){ return Object.prototype.toString.call(obj) === "[object String]";}
11 | function isfunction(obj){ return Object.prototype.toString.call(obj) === "[object Function]"; }
12 | function isdate(obj){ return Object.prototype.toString.call(obj) === "[object Date]";}
13 | var typed_array_constructors = {
14 | "[object Int32Array]" : true,
15 | "[object Uint32Array]" : true,
16 | "[object Float32Array]" : true,
17 | "[object Int8Array]" : true,
18 | "[object Uint8Array]" : true,
19 | "[object Int16Array]" : true,
20 | "[object Uint16Array]" : true,
21 | "[object Float64Array]" : true
22 | }
23 | function istypedarray(obj){
24 | var tag = Object.prototype.toString.call(obj);
25 | return tag in typed_array_constructors;
26 | }
27 |
28 |
29 | function shallowcopy(obj){
30 | if(obj == null) return obj; // null or undefined
31 |
32 | var copy = {};
33 | for(var key in obj){
34 | copy[key] = obj[key];
35 | }
36 |
37 | return copy;
38 | }
39 |
40 | //function isframe(obj){ return isarray(obj) && (obj.length == 0 || isobject(obj[0])); }
41 |
42 |
43 | /* A lightweight, high performance Columnar Data Store disguised as a Data Frame
44 | *
45 | * Interface similarity targets and inspiration:
46 | * pandas, R, Linq, rethinkDB, Matlab
47 | *
48 | * column names:
49 | * columns.values.tolist(), colnames(f),
50 | *
51 | * aggregation:
52 | * groupby, , ,
53 | *
54 | * filtering:
55 | *
56 | * # References
57 | * https://github.com/StanfordHCI/datavore
58 | * http://vincentarelbundock.github.io/Rdatasets/datasets.html
59 | * https://galeascience.wordpress.com/2016/08/10/top-10-pandas-numpy-and-scipy-functions-on-github/
60 | * https://github.com/visualfabriq/bquery/blob/master/bquery/khash.h
61 | * ## R
62 | * http://www.r-tutor.com/r-introduction/data-frame
63 | * https://www.datacamp.com/community/tutorials/15-easy-solutions-data-frame-problems-r#gs.ArNaS44
64 | * ## Pandas
65 | * http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html
66 | * http://chrisalbon.com/python/pandas_index_select_and_filter.html
67 | * ## Linq
68 | * https://msdn.microsoft.com/en-us/library/bb534304(v=vs.110).aspx?cs-save-lang=1&cs-lang=csharp#code-snippet-1
69 | */
70 | /* Create a data frame object from some data, like the Pandas and R objects
71 | * of similar name.
72 | *
73 | * @examples
74 | *
75 | * // an array of row objects, like the output from babyparse and papaparse
76 | *
77 | * rows =
78 | [
79 | { "name" : "Finn", "age" : 16, "title" : "Finn the Human"},
80 | { "name" : "Jake", "age" : 32 , "title" : "Jake the Dog"},
81 | { "name" : "Simon", "age" : 1043, "title" : "Ice King"},
82 | { "name" : "Bonnibel", "age" : 827, "title" : "Princess Bubblegum"},
83 | { "name" : "Marceline", "age" : 1004, "title" : "Marceline the Vampire Queen"}
84 | ];
85 | * df = Frame(rows);
86 | *
87 | * // an object (dict) mapping column names to arrays of values
88 | *
89 | * columns =
90 | * {
91 | * "name" : ["Finn", "Jake", "Simon", "Bonnibel", "Marceline"],
92 | * "age" : [16, 32, 1043, 827, 1004],
93 | * "title" : ["Finn the Human", "Jake the Dog", "Ice King", "Princess Bubblegum", "Marceline the Vampire Queen"]
94 | * };
95 | *
96 | * df = Frame(columns);
97 | *
98 | * // an optional keys argument allows string columns to be more compactly
99 | * // represented when duplicates are present
100 | *
101 | * columns =
102 | * {
103 | * "name" : [0, 1, 2, 3, 4],
104 | * "age" : [16, 32, 1043, 827, 1004],
105 | * "title" : [0, 1, 2, 3, 4]
106 | * };
107 | *
108 | * keys = {
109 | * "name" : ["Finn", "Jake", "Simon", "Bonnibel", "Marceline"],
110 | * "title" : ["Finn the Human", "Jake the Dog", "Ice King", "Princess Bubblegum", "Marceline the Vampire Queen"]
111 | * }
112 | *
113 | * df = Frame(columns, keys);
114 | *
115 | */
116 | function Frame(data, keys, index, groups, filters){
117 | // f.constructor.name return "Frame"
118 | if(!(this instanceof Frame)) return new Frame(data, keys, index, groups, filters);
119 |
120 | if(Symbol && Symbol.toStringTag) this[Symbol.toStringTag] = 'Frame';
121 |
122 |
123 | // TODO: deep copy index
124 | if(index){
125 | Object.defineProperty(this, "_index", {
126 | "enumerable" : false,
127 | "value" : index
128 | });
129 | }
130 |
131 | // was a filters argument provided?
132 | if(filters){
133 | // yes, construct a single filter from the values
134 | var filter;
135 | for(key in filters){
136 | if(filter == null){
137 | filter = filters[key].copy();
138 | } else {
139 | filter.and(filters[key]);
140 | }
141 | }
142 | // copy of all defined filters
143 | Object.defineProperty(this, "_filters", {
144 | "enumerable" : false,
145 | "value" : filters
146 | });
147 | // single filter produced from combining all filters
148 | Object.defineProperty(this, "_filter", {
149 | "enumerable" : false,
150 | "value" : filter
151 | });
152 | Object.defineProperty(this, "_count", {
153 | "enumerable" : false,
154 | "value" : filter.count()
155 | });
156 | }
157 | if(groups){
158 | Object.defineProperty(this, "_groups", {
159 | "enumerable" : false,
160 | "value" : groups.slice(0)
161 | });
162 | }
163 |
164 | // do we have input?
165 | if(data == null){
166 | // no, just return an empty Frame
167 | return;
168 | }
169 |
170 | // what type of data input do we have?
171 | if(isobject(data)){
 172 | // object, check its values
173 | var column, length;
174 |
175 | for(var key in data){
176 | column = data[key];
177 |
178 | // are the items arrays?
179 | if(isarray(column) || istypedarray(column)){
180 | // yes, check for consistent lengths
181 |
182 | if(length == null){
183 | length = column.length;
184 | } else if(length !== column.length){
185 | throw new Error("Invalid data, arrays in object must be of equal length");
186 | }
187 | } else {
188 | // no, invalid data
189 | throw new Error("Invalid data, must be array of rows or dict of columns");
190 | }
191 | }
192 |
193 | Object.defineProperty(this, "length", {
194 | "enumerable" : false,
195 | "value" : length
196 | });
197 |
198 | // all checks pass use data as columns
199 | Object.defineProperty(this, "_cols", {
200 | "enumerable" : false,
201 | "value" : shallowcopy(data)
202 | });
203 |
204 | // do we also have a key/decoding object?
205 | if(keys && isobject(keys)){
206 |
207 | // check validity
208 | for(var key in keys){
209 | if(!(key in this._cols)) throw new Error("Invalid data, keys object doesn't match columns");
210 | }
211 |
212 | Object.defineProperty(this, "_keys", {
213 | "enumerable" : false,
214 | "value" : shallowcopy(keys)
215 | });
216 | }
217 |
218 | } else if(isarray(data)) {
 219 | // array, check its elements
220 | if(data.length == 0){
221 | return;
222 | }
223 |
224 | Object.defineProperty(this, "length", {
225 | "enumerable" : false,
226 | "value" : data.length
227 | });
228 | // all checks pass use data as columns
229 | Object.defineProperty(this, "_cols", {
230 | "enumerable" : false,
231 | "value" : {}
232 | });
233 |
234 | var row;
235 | for(key in data[0]){
236 | this._cols[key] = [];
237 | }
238 | for(var i = 0; i < data.length; i++){
239 | row = data[i];
240 |
241 | // are the rows objects?
242 | if(isobject(row)){
243 | // yes
244 | for(key in this._cols){
245 | if(key in row)
246 | this._cols[key][i] = row[key];
247 | else
248 | this._cols[key][i] = null;
249 | }
250 | } else {
251 | // no, invalid data
252 | throw new Error("Invalid data, must be array of rows or dict of columns");
253 | }
254 | }
255 | }
256 |
257 | // expose columns as properties
258 | for(name in this._cols){
259 | addColumn(this, name);
260 | }
261 | }
262 |
263 | Object.defineProperty(Frame.prototype, "add", {
264 | enumerable: false,
265 | value : function(name, values){
266 |
267 | if(this.length !== values.length)
268 | throw new Error("Invalid data, arrays in object must be of equal length");
269 |
270 | this._cols[name] = values;
271 | addColumn(this, name);
272 | }
273 | });
274 |
275 | // internal function for exposing a data column as a property on the Frame
276 | function addColumn(frame, name){
277 | Object.defineProperty(frame, name, {
278 | enumerable : true,
279 | configurable: true,
280 | get: function(){
281 | // decode?
282 | var result = [];
283 | if(frame._keys && name in frame._keys){
284 | // yes, get keys
285 | var keys = frame._keys[name];
286 |
287 | // map data column onto decoded column
288 | // data column should be an array of indices into
289 | // the keys array
290 | var column = frame._cols[name];
291 | result = new Array(column.length);
292 | for(var i = 0; i < column.length;i++){
293 | result[i] = keys[column[i]];
294 | }
295 | } else {
296 | // no, just return the column
297 | result = frame._cols[name];
298 | }
299 |
300 | if(frame._filter){
301 | return result.filter(function(item, i){ return frame._filter.get(i);});
302 | } else {
303 | return result;
304 | }
305 | },
306 | set : function(data){
307 | if(!isarray(data)) throw new Error("data must be an array");
308 | if(data.length != frame.length) throw new Error("array must match length");
309 |
310 | if(frame._keys && name in frame._keys){
311 | throw new Error("setting keyed column not supported yet");
312 | } else {
313 | frame._cols[name] = data.slice(0);
314 | }
315 | }
316 | });
317 | }
318 |
319 | /*
320 | // alternate syntax for toStringTag
321 | get [Symbol.toStringTag]() {
322 | 		return 'Frame';
323 | }
324 | */
325 | module.exports = Frame;
326 |
327 | /*
328 | Get column names
329 | */
330 | Object.defineProperty(Frame.prototype, "columns", {
331 | enumerable: false,
332 | get : function(){
333 | return Object.keys(this._cols);
334 | }
335 | });
336 |
337 | Object.defineProperty(Frame.prototype, "rename", {
338 | enumerable: false,
339 | value : function(old_name, new_name){
340 | if(!(old_name in this._cols))
341 | 			throw new Error("Couldn't find a column named '" + old_name + "'");
342 |
343 | // copy column to new name
344 | var column = this._cols[old_name];
345 | this._cols[new_name] = column;
346 |
347 | // delete old column
348 | delete this._cols[old_name];
349 | delete this[old_name];
350 |
351 | // rename any decode key
352 | if(this._keys && old_name in this._keys){
353 | this._keys[new_name] = this._keys[old_name];
354 | delete this._keys[old_name]
355 | }
356 |
357 | addColumn(this, new_name);
358 |
359 | }
360 | })
361 |
362 | Object.defineProperty(Frame.prototype, "distinct", {"enumerable": false, "value" : distinct});
363 |
364 | function distinct(selector){
365 | if(!(selector in this._cols))
366 | throw new Error("Couldn't find a column named '" + selector + "'");
367 |
368 | var key;
369 | if(this._keys) key = this._keys[selector];
370 |
371 | var column = this._cols[selector];
372 | var set = {};
373 | var value;
374 | for(var i = 0; i < column.length; i++){
375 | if(key) value = key[column[i]];
376 | else value = column[i];
377 | if(this._filter){
378 | if(this._filter.get(i)) set[value] = value;
379 | } else {
380 | set[value] = value;
381 | }
382 | }
383 |
384 | // this step enables non-string values
385 | var vals = [];
386 | for(key in set) vals.push(set[key]);
387 |
388 | return vals;
389 | };
390 |
391 | Object.defineProperty(Frame.prototype, "where", {"enumerable" : false, "value" : where});
392 |
393 | /* element of, takes an array as an argument
394 | create and return a function that takes a single argument and returns true if
395 | that argument is contained in the given array
396 |
397 | NOTE: null and undefined may both be present in arr, and will be distinct from one another
398 | */
399 | function el(arr){
400 | var set = {};
401 | for (var i = 0; i < arr.length; i++) set[arr[i]] = true;
402 | return function(v){ return set[v] != null;};
403 | }
404 |
405 | function eq(a){
406 | return function(v){ return v == a; };
407 | }
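
/* a few illustrative calls (not part of the public API) showing how these predicate
   helpers behave:
     el([2, 5])(5);  // => true, 5 is in the array
     el([2, 5])(3);  // => false
     eq(1)("1");     // => true, eq uses loose equality (==)
*/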
408 |
409 | function where(selector, condition){
410 |
411 | if(!(selector in this._cols))
412 | throw new Error("Couldn't find a column named '" + selector + "'");
413 |
414 | var column = this._cols[selector];
415 | var filter = new BitArray(this.length);
416 |
417 | var bits = filter.wordArray;
418 | var index = 0;
419 | var word = 0|0;
420 | var offset;
421 | var max = column.length - 1;
422 |
423 | if(isnumber(condition) || isstring(condition)){
424 | // keyed selector column?
425 | if(isstring(condition) && this._keys && selector in this._keys){
426 | // yes, encode condition
427 | var keys = this._keys[selector];
428 | condition = keys.indexOf(condition);
429 | }
430 | for(var i = 0; i < bits.length; i++){
431 | word = 0|0;
432 | offset = i * 32;
433 | var j = 31 + offset;
434 | if(j > max) j = max;
435 | for(; j >= offset; j--){
436 | if(column[j] === condition) word |= 1;
437 | if(j > offset) word <<= 1;
438 | }
439 | bits[i] = word;
440 | }
441 | } else {
442 | if(isarray(condition) || istypedarray(condition)){
443 | condition = el(condition);
444 | }
445 | if(this._keys && selector in this._keys){
446 | 			// keyed selector column; get the key table so values are decoded before testing
447 | var keys = this._keys[selector];
448 | }
449 |
450 | var value;
451 | for(var i = 0; i < bits.length; i++){
452 | word = 0|0;
453 | offset = i * 32;
454 | var j = 31 + offset;
455 | if(j > max) j = max;
456 | for(; j >= offset; j--){
457 | if(keys) value = keys[column[j]];
458 | else value = column[j];
459 | if(condition(value)) word |= 1;
460 | if(j > offset) word <<= 1;
461 | }
462 | bits[i] = word;
463 | }
464 | }
465 |
466 | // create and return a new Frame with the new filter
467 | var filters = {};
468 | if(this._filters){
469 | Object.assign(filters, this._filters);
470 | }
471 | filters[selector] = filter;
472 |
473 | return new Frame(this._cols, this._keys, this._index, this._groups, filters);
474 |
475 | }
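
/* usage sketch (illustrative; the column names and values below are made up):
     var f = new Frame({"id": [0, 1, 1, 0], "value": [5, 6, 7, 8]});
     f.where("id", 1).sum("value");                               // equality   => 13
     f.where("id", [0]).sum("value");                             // membership => 13
     f.where("value", function(v){ return v > 6; }).sum("value"); // predicate  => 15
*/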
476 |
477 | Object.defineProperty(Frame.prototype, "join", {"enumerable" : false, "value" : join});
478 | Object.defineProperty(Frame.prototype, "groupby", {"enumerable" : false, "value" : groupby});
479 | Object.defineProperty(Frame.prototype, "ungroup", {"enumerable" : false, "value" : ungroup});
480 | Object.defineProperty(Frame.prototype, "count", {"enumerable" : false, "value" : count});
481 | Object.defineProperty(Frame.prototype, "argmax", {"enumerable" : false, "value": argmax});
482 | Object.defineProperty(Frame.prototype, "argmin", {"enumerable" : false, "value": argmin});
483 | Object.defineProperty(Frame.prototype, "min", {"enumerable" : false, "value": min});
484 | Object.defineProperty(Frame.prototype, "max", {"enumerable" : false, "value": max});
485 | Object.defineProperty(Frame.prototype, "sum", {"enumerable" : false, "value": sum});
486 | Object.defineProperty(Frame.prototype, "mean", {"enumerable" : false, "value": mean});
487 | Object.defineProperty(Frame.prototype, "median", {"enumerable" : false, "value": median});
488 | Object.defineProperty(Frame.prototype, "reduce", {"enumerable" : false, "value": reduce});
489 |
490 |
491 | /* use the partition method to find the median */
492 | function median(selector){
493 |
494 | var column = this._cols[selector];
495 | var key = selector && this._keys ? this._keys[selector] : null;
496 |
497 | if (column.length == 0) return null;
498 |
499 | 	var p, middle;
500 | 
501 | 	middle = column.length / 2 | 0;
502 |
503 | var low = 0,
504 | high = column.length - 1;
505 |
506 | var i = 0;
507 | // partition the array
508 | while(p != middle && i < column.length){
509 | i++;
510 | p = partition(column, low, high);
511 |
512 | if( p < middle) low = p + 1;
513 | else high = p - 1;
514 | }
515 |
516 | if(i == column.length){
517 | 		console.error("Maximum number of partition iterations reached");
518 | }
519 |
520 | if(key) return key[column[p]];
521 | else return column[p];
522 | }
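
/* illustrative walk-through (assumed data, not from the test suite): for the column
   [1, 2, 2, 3, 4, 3, 4, 2, 1], middle is 4 (length / 2 | 0). partition() is applied
   repeatedly, narrowing [low, high], until a pivot lands exactly at index 4; the value
   there is 2, the median of the sorted column [1, 1, 2, 2, 2, 3, 3, 4, 4].
   NOTE: partition() swaps elements in place, so median() reorders the column data.
*/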
523 |
524 | /* partition an array, in place */
525 | function partition(arr, low, high){
526 |
527 | if (low >= high) return high;
528 |
529 | // choose a random index for the pivot
530 | var pivot = randint(low, high);
531 |
532 | // swap pivot into last location
533 | swap(arr, high, pivot);
534 |
535 | pivot = low; // location of pivot in result
536 | // scan array and swap elements less than pivot into low end
537 | for(var i = low; i < high; i++){
538 | if (arr[i] < arr[high]){
539 | swap(arr, i, pivot);
540 | pivot++;
541 | }
542 | }
543 |
544 | swap(arr, high, pivot);
545 |
546 | return pivot;
547 |
548 | }
549 |
550 | /* get random integer in the inclusive interval [a, b]
551 |    a and b must be integers for correct results
552 | */
553 | function randint(a, b){
554 | 	var r = Math.random(); // [0, 1)
555 | return a + Math.floor((b - a + 1)*r);
556 | }
557 |
558 | function swap(arr, i, j){
559 | var temp = arr[i];
560 | arr[i] = arr[j];
561 | arr[j] = temp;
562 | }
563 |
564 | function join(frame, link){
565 |
566 | // verify length of link column
567 | if(link.length !== this.length) throw new Error("Length of link column must match frame.");
568 |
569 | if(!("_cols" in frame)) throw new Error("First argument must be a frame.");
570 |
571 | // duplicate columns and keys
572 | var columns = shallowcopy(this._cols),
573 | keys = shallowcopy(this._keys) || {};
574 |
575 | // add virtual columns for each column in the joining frame
576 | 	for(var name in frame._cols){
577 | // skip columns with duplicate names
578 | if(name in columns) continue;
579 |
580 | // don't join encoded columns
581 | if(frame._keys && name in frame._keys) continue;
582 |
583 | // add link column as encoded column data
584 | columns[name] = link;
585 | // add joining frame column as key column
586 | keys[name] = frame._cols[name];
587 | }
588 |
589 | return new Frame(columns, keys, this._index, this._groups, this._filters);
590 |
591 | }
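
/* usage sketch (illustrative; the frames and names below are made up):
     var orders   = new Frame({"qty": [2, 1, 3]});
     var products = new Frame({"title": ["pen", "ink"]});
     var link     = [0, 1, 0]; // row i of orders refers to row link[i] of products
     var joined   = orders.join(products, link);
     joined.title; // => ["pen", "ink", "pen"], decoded through the key table without copying
*/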
592 |
593 |
594 | /*
595 | * group the data in the frame by a selector or set of selectors
596 | */
597 | function groupby(){
598 |
599 | if(arguments.length == 0) throw new Error("No arguments provided");
600 |
601 | // collect arguments into list of selectors
602 | var selectors = [],
603 | arg;
604 | if(arguments.length === 1){
605 | arg = arguments[0];
606 | if(isstring(arg)) selectors = [arg];
607 | else if(isarray(arg)) selectors = arg;
608 | } else {
609 | for(var i = 0; i < arguments.length; i++){
610 | arg = arguments[i];
611 | if(!isstring(arg)) throw new Error("Invalid arguments");
612 |
613 | selectors.push(arg);
614 | }
615 | }
616 |
617 | var index = {};
618 | if(this._index){
619 | index = this._index;
620 | selectors = this._groups.concat(selectors);
621 | }
622 |
623 | // get references to all the columns involved in groups
624 | var columns = Array(selectors.length);
625 | var keys = {};
626 | for (var m = 0; m < selectors.length; m++){
627 | 		var selector = selectors[m];
628 |
629 | if(!(selector in this._cols))
630 | throw new Error("Couldn't find a column named '" + selector + "'");
631 |
632 | columns[m] = this._cols[selector];
633 | if(this._keys && selector in this._keys) keys[m] = this._keys[selector];
634 | }
635 |
636 | var N = columns[0].length;
637 | var path = Array(columns.length);
638 | // iterate through rows
639 | for(var i = 0; i < N; i++){
640 |
641 | // compute distinct values for group columns describing the bin for
642 | // the current row
643 | for (var m = 0; m < columns.length; m++){
644 | var column = columns[m];
645 | if(m in keys) path[m] = keys[m][column[i]];
646 | else path[m] = column[i];
647 | }
648 |
649 | // add this row to the index using the group column values
650 | // by descending the hierarchy to the correct leaf
651 | var level = index;
652 | for(var j = 0; j < path.length - 1; j++){
653 |
654 | 			var key = path[j];
655 | 			var next = level[key];
656 | if(next == null || isarray(next)){
657 | next = {};
658 | level[key] = next;
659 | }
660 | level = next;
661 | }
662 |
663 | // update array of row indices stored in leaf
664 | key = path[path.length - 1];
665 | var arr = level[key];
666 | if(arr == null){
667 | level[key] = [i];
668 | } else {
669 | arr[arr.length] = i;
670 | }
671 | }
672 |
673 | /*
674 | this._index = index;
675 | this._groups = selectors.slice(0);
676 | return this;
677 | */
678 | return new Frame(this._cols, this._keys, index, selectors, this._filters);
679 | }
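
/* illustrative sketch (assumed data) of the index built above:
     id_0 = [0, 0, 1, 1], id_1 = [0, 1, 0, 1]
     frame.groupby("id_0", "id_1")._index
     // => { 0 : { 0 : [0], 1 : [1] },
     //      1 : { 0 : [2], 1 : [3] } }
   internal nodes are distinct group values, leaves are arrays of row indices
*/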
680 |
681 | /* remove the grouping created by the last remaining groupby selector */
682 | function ungroup(){
683 | if(this._index == null || this._groups.length < 1)
684 | throw new Error("Not enough groups")
685 |
686 | var frame = new Frame(this._cols, this._keys, null, null, this._filters);
687 |
688 | // handle special case of single group
689 | if(this._groups.length == 1)
690 | return frame;
691 |
692 | // for other cases do new groupby with one fewer groups
693 | return frame.groupby(this._groups.slice(0, -1));
694 | }
695 |
696 | function count(){
697 | if(this._index) return this.reduce();
698 |
699 | if(this._filter) return this._count;
700 |
701 | return this.length;
702 | }
703 |
704 | function min(selector){
705 | return this.reduce(selector, reducers.min);
706 | }
707 |
708 | function max(selector){
709 | return this.reduce(selector, reducers.max);
710 | }
711 |
712 | function sum(selector){
713 | return this.reduce(selector, reducers.sum);
714 | }
715 |
716 | function mean(selector){
717 | return this.reduce(selector, reducers.mean);
718 | }
719 |
720 | function argmax(selector){
721 | return this.reduce(selector, reducers.argmax);
722 | }
723 |
724 | function argmin(selector){
725 | return this.reduce(selector, reducers.argmin);
726 | }
727 |
728 | function reduce(selector, reducer, initial){
729 |
730 | var column = selector ? this._cols[selector] : null;
731 | var key = selector && this._keys ? this._keys[selector] : null;
732 |
733 | // choose default reduce, if none was supplied
734 | var is_numeric = column && column.length > 0 && Object.prototype.toString.call(column[0]) == "[object Number]";
735 | reducer = reducer || (is_numeric ? reducers.sum : reducers.max);
736 |
737 | if(this._index){
738 | return treereduce(column, key, this._index, this._keys, this._groups, this._filter, reducer, initial);
739 | } else if(this._filter) {
740 | return filterreduce(column, key, this._filter, reducer, initial);
741 | } else {
742 | return fullreduce(column, key, reducer, initial);
743 | }
744 | }
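
/* usage sketch (illustrative): on a plain (ungrouped, unfiltered) frame any function
   with the (acc, value, n) signature used in lib/stream-reducers.js can be passed
   directly, e.g. a sum of squares:
     frame.reduce("value", function(acc, v){ return acc + v * v; }, 0);
*/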
745 |
746 | function treereduce(column, rkey, index, keys, groups, filter, reducer, initial){
747 |
748 | var reduced = {};
749 | var parents = {};
750 |
751 | // depth first traversal
752 | var todo = [[index, null, 0]];
753 | var leaves = [];
754 |
755 | 	var result, pkey, level, n, p, key, ckey, group, value;
756 | while (todo.length > 0){
757 | n = todo.pop();// object
758 | index = n[0];
759 | pkey = n[1];
760 | level = n[2];
761 | result = {}; // container for this subtree in result
762 |
763 | var c, name;
764 | for(key in index){ // keys in object
765 | c = index[key];
766 | group = groups[level];
767 |
768 | // decode the key, if possible
769 | /*
770 | if(keys && group in keys){
771 | decoder = keys[group];
772 | key = decoder[key];
773 | }*/
774 |
775 | ckey = pkey ? pkey + "@" + key : key;
776 |
777 | if(isobject(c)){
778 | todo.push([c, ckey, level + 1]);
779 | } else {
780 | var indices = c;
781 | var filtered = filterindices(indices, filter);
782 | if(filtered.length != 0){
783 | var value;
784 | if(column){
785 | value = subsetreduce(column, rkey, filtered, reducer, initial);
786 | } else {
787 | value = filtered.length; // default to count
788 | }
789 | leaves.push([ckey, value]);
790 | }
791 | }
792 | parents[ckey] = [pkey, result];
793 | }
794 | }
795 |
796 | var root;
797 | while (leaves.length > 0){
798 | n = leaves.pop();
799 | ckey = n[0]; // composite key, parent + child
800 | value = n[1];
801 |
802 | p = parents[ckey];
803 | pkey = p[0];
804 | index = p[1];
805 |
806 | key = pkey ? ckey.slice(pkey.length + 1) : ckey;
807 | index[key] = value;
808 | if(pkey == null){
809 | root = index;
810 | } else {
811 | leaves.push([pkey, index]);
812 | }
813 | }
814 |
815 | return root;
816 | };
817 |
818 | function empty (obj){
819 | for (var key in obj) {
820 | if (obj.hasOwnProperty(key)) {
821 | return false
822 | }
823 | }
824 | return true
825 | }
826 |
827 | function filterindices(indices, filter){
828 | if(!filter) return indices;
829 |
830 | 	var result = [];
831 | for(var i = 0; i < indices.length; i++){
832 | 		var index = indices[i];
833 | if(filter.get(index)){
834 | result.push(index);
835 | }
836 | }
837 | return result;
838 | }
839 |
840 | /* reduce a subset of an array given by a set of indices using a supplied
841 | reducing function.
842 |
843 |    Extracting this code into a function produces an order of magnitude speedup.
844 |    I don't know why; possibly the small, monomorphic function is easier for the JIT to optimize.
845 | */
846 | function subsetreduce(column, key, indices, reducer, initial){
847 |
848 | var value = null;
849 | if(initial) value = initial;
850 |
851 | if(key){
852 | for(var i = 0; i < indices.length; i++){
853 | 			var index = indices[i];
854 | if(value === null) value = key[column[index]];
855 | else value = reducer(value, key[column[index]], i);
856 | }
857 | } else {
858 | for(var i = 0; i < indices.length; i++){
859 | 			var index = indices[i];
860 | if(value === null) value = column[index];
861 | else value = reducer(value, column[index], i);
862 | }
863 | }
864 |
865 | return value || 0;
866 | }
867 |
868 | function filterreduce(column, key, filter, reducer, initial){
869 |
870 | var value = null;
871 | if(initial) value = initial;
872 |
873 | var word,
874 | mask,
875 | cutoff;
876 | var bits = filter.wordArray;
877 | var total = 0;
878 | var max = column.length;
879 |
880 | if(key){
881 | for(var i = 0; i < bits.length; i++){
882 | word = bits[i];
883 | if(word !== 0){
884 | cutoff = (i + 1) * 32;
885 | if(cutoff > max) cutoff = max;
886 | mask = 1;
887 | for(var j = i * 32; j < cutoff; j++){
888 | if((word & mask) !== 0) {
889 | if(value === null) value = key[column[j]];
890 | else value = reducer(value, key[column[j]], total);
891 | total++;
892 | }
893 | mask <<= 1;
894 | }
895 | }
896 | }
897 | } else {
898 | for(var i = 0; i < bits.length; i++){
899 | word = bits[i];
900 | if(word !== 0){
901 | cutoff = (i + 1) * 32;
902 | if(cutoff > max) cutoff = max;
903 | mask = 1;
904 | for(var j = i * 32; j < cutoff; j++){
905 | if((word & mask) !== 0) {
906 | if(value === null) value = column[j];
907 | else value = reducer(value, column[j], total);
908 | total++;
909 | }
910 | mask <<= 1;
911 | }
912 | }
913 | }
914 | }
915 |
916 |
917 | return value || 0;
918 | }
919 |
920 | function fullreduce(column, key, reducer, initial){
921 |
922 | var start,
923 | value;
924 |
925 | 	// choose initial values and start of loop based on number of inputs and
926 | // supplied initial value
927 | if(initial !== void(0)){
928 | start = 0;
929 | value = initial;
930 | } else if(column.length > 0) {
931 | start = 1;
932 | value = key ? key[column[0]] : column[0];
933 | } else {
934 | start = 0;
935 | value = 0;
936 | }
937 |
938 | if(key){
939 | for(var i = start; i < column.length; i++){
940 | value = reducer(value, key[column[i]], i);
941 | }
942 | } else {
943 | for(var i = start; i < column.length; i++){
944 | value = reducer(value, column[i], i);
945 | }
946 | }
947 |
948 | return value;
949 | }
950 |
--------------------------------------------------------------------------------
/lib/stream-reducers.js:
--------------------------------------------------------------------------------
1 |
2 | module.exports = {
3 | "count" : count,
4 | "sum" : sum,
5 | "max" : max,
6 | "min" : min,
7 | "mean" : mean,
8 | "mode" : mode,
9 | "median" : median,
10 | "argmax" : argmax,
11 | "argmin" : argmin
12 | };
13 |
14 | /* Array.prototype.reduce style function for finding the maximum
15 | * @examples
16 | * [1, 1, 1].reduce(ds.reduce.max); // => 1
17 | * [3, 1, 3, 5].reduce(ds.reduce.max); // => 5
18 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.max); // => 2
19 | */
20 | function max(agg, val) { return agg > val ? agg : val; };
21 |
22 | /* Array.prototype.reduce style function for finding the minimum
23 | * @examples
24 | * [1, 1, 1].reduce(ds.reduce.min); // => 1
25 | * [3, 1, 3, 5].reduce(ds.reduce.min); // => 1
26 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.min); // => 0
27 | */
28 | function min(agg, val) { return agg < val ? agg : val; };
29 |
30 | /* Array.prototype.reduce style function for finding the most common value
31 | * @examples
32 | * [1, 1, 1].reduce(ds.reduce.mode); // => 1
33 | * [1, 3, 3, 7].reduce(ds.reduce.mode); // => 3
34 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.mode); // => 1
35 | */
36 | function mode(agg, val, n) {
37 | if(n === 0) return val;
38 |
39 | var self;
40 | if(n === 1){
41 | // internal state hack (compatible with groupby)
42 | self = mode.state = {};
43 | self.values = {};
44 | self.values[agg] = 1;
45 | self.argmax = agg;
46 | } else {
47 | self = mode.state;
48 | }
49 |
50 | if(val in self.values)
51 | self.values[val] += 1;
52 | else
53 | self.values[val] = 1;
54 |
55 | if(self.values[val] > self.values[agg])
56 | self.argmax = val;
57 |
58 | return self.argmax;
59 | }
60 |
61 | function argmax(agg, val, n){
62 | var self;
63 | if(n === 0){
64 | // internal state hack (compatible with groupby)
65 | self = argmax.state = {};
66 | self.max = val;
67 | return 0;
68 | }
69 |
70 | if(n === 1){
71 | if(argmax.state == null) self = argmax.state = {};
72 | else self = argmax.state;
73 | // is this the first time we've called this function on this array?
74 | if(self.max != null && self.argmax == null){
75 | // no
76 | } else {
77 | // yes
78 | self.max = agg;
79 | }
80 | self.argmax = 0;
81 | } else {
82 | self = argmax.state;
83 | }
84 |
85 | if(val > self.max){
86 | self.max = val;
87 | self.argmax = n;
88 | }
89 |
90 | return self.argmax;
91 | }
92 |
93 |
94 | function argmin(agg, val, n){
95 | var self;
96 | if(n === 0){
97 | // internal state hack (compatible with groupby)
98 | self = argmin.state = {};
99 | self.min = val;
100 | return 0;
101 | }
102 |
103 | if(n === 1){
104 | if(argmin.state == null) self = argmin.state = {};
105 | else self = argmin.state;
106 | // is this the first time we've called this function on this array?
107 | if(self.min != null && self.argmin == null){
108 | // no
109 | } else {
110 | // yes
111 | self.min = agg;
112 | }
113 | self.argmin = 0;
114 | } else {
115 | self = argmin.state;
116 | }
117 |
118 | if(val < self.min){
119 | self.min = val;
120 | self.argmin = n;
121 | }
122 |
123 | return self.argmin;
124 | }
125 |
126 | /* Array.prototype.reduce style function for finding the middle value
127 | * @examples
128 | * [1, 1, 1].reduce(ds.reduce.median); // => 1
129 | * [1, 3, 3, 7].reduce(ds.reduce.median); // => 3
130 | * [4, 1, 7].reduce(ds.reduce.median); // => 4
131 | * reduce({"a" : 4, "b" : 1, "c" : 7}, ds.reduce.median); // => 4
132 |
133 | DON'T USE THIS FUNCTION, IT'S VERY SLOW
134 | */
135 | function median(agg, val, n) {
136 | if(n === 0) return val;
137 |
138 | if(n === 1){
139 | // internal state hack (compatible with groupby)
140 | 		var self = median.state = {};
141 | self.values = [agg];
142 | } else {
143 | self = median.state;
144 | }
145 |
146 | // insert the new value into the sorted array
147 | insert(self.values, val);
148 |
149 | var middle = self.values.length / 2 | 0;
150 | // even number of elements?
151 | if(self.values.length % 2 !== 0){
152 | // no, return the middle one
153 | return self.values[middle];
154 | } else {
155 | // yes, return the average of the middle two
156 | return (self.values[middle - 1] + self.values[middle]) / 2;
157 | }
158 | }
159 |
160 | /* Array.prototype.reduce style function for counting number of elements
161 | * @examples
162 | * [1, 1, 1].reduce(ds.reduce.count); // => 3
163 | * [3, 1, 3, 5].reduce(ds.reduce.count); // => 4
164 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.count); // => 3
165 | */
166 | function count(agg, val, n){ return n + 1; };
167 |
168 | /* Array.prototype.reduce style function for finding the sum
169 | * @examples
170 | * [1, 1, 1].reduce(ds.reduce.sum); // => 3
171 | * [3, 1, 3, 5].reduce(ds.reduce.sum); // => 12
172 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.sum); // => 3
173 | */
174 | function sum(agg, val){ return agg + val; };
175 |
176 | /* Array.prototype.reduce style function for finding the arithmetic mean
177 | * @examples
178 | * [1, 1, 1].reduce(ds.reduce.mean); // => 1
179 | * [3, 1, 3, 5].reduce(ds.reduce.mean); // => 3
180 | * reduce({"a" : 1, "b" : 0, "c" : 2}, ds.reduce.mean); // => 1
181 | */
182 | function mean(agg, val, n){ return (agg + ((val - agg)/(n + 1))); };
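
/* the running-mean recurrence above follows from
     mean_{n+1} = (n * mean_n + val) / (n + 1) = mean_n + (val - mean_n) / (n + 1)
   where agg holds mean_n, the mean of the n values seen so far */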
183 |
184 | var d = function(a, b){ return a > b ? 1 : a < b ? -1 : 0;};
185 |
186 | function insert(arr, el){
187 | var index = binarySearch(arr, el, d);
188 | arr.splice(index, 0, el);
189 |
190 | return arr;
191 | };
192 |
193 | var binarySearch = function binarySearch(arr, el, comparator) {
194 |
195 | var m = 0;
196 | var n = arr.length - 1;
197 | while (m <= n) {
198 | var k = (n + m) >> 1;
199 | var cmp = comparator(el, arr[k]); // comparator(arr[k], el);
200 | if (cmp > 0) {
201 | m = k + 1;
202 | } else if(cmp < 0) {
203 | n = k - 1;
204 | } else {
205 | return k;
206 | }
207 | }
208 |
209 | return m;
210 | }
211 |
--------------------------------------------------------------------------------
/lib/test.js:
--------------------------------------------------------------------------------
1 | var async = require('async'),
2 | path = require('path'),
3 | floader = require('floader'),
4 | aloader = require('arrayloader');
5 |
6 | test = {};
7 |
8 | test.DEFAULT_TYPE = DEFAULT_TYPE = "int32";
9 |
10 | test.type_map = type_map = {
11 | "int8" : ".i8",
12 | "uint8" : ".u8",
13 | "int16" : ".i16",
14 | "uint16" : ".u16",
15 | "int32" : ".i32",
16 | "uint32" : ".u32",
17 | "float32" : ".f32",
18 | "float64" : ".f64",
19 | "str8" : ".s8",
20 | "str16" : ".s16"
21 | };
22 |
23 | test.extension_map = extension_map = {
24 | ".i8" : Int8Array,
25 | ".u8" : Uint8Array,
26 | ".i16" : Int16Array,
27 | ".u16" : Uint16Array,
28 | ".i32" : Int32Array,
29 | ".u32" : Uint32Array,
30 | ".f32" : Float32Array,
31 | ".f64" : Float64Array,
32 | ".s8" : Int8Array,
33 | ".s16" : Int16Array
34 | };
35 |
36 | test.float_types = {
37 | "float32" : true,
38 | "float64" : true
39 | };
40 |
41 | test.string_types = {
42 | "str8" : true,
43 | "str16" : true
44 | };
45 |
46 |
47 | /* load a binary file as a TypedArray with the type given by the extension */
48 | function loadArray(filePath, cb){
49 |
50 | var ext = path.extname(filePath);
51 | ext = ext.toLowerCase();
52 |
53 | 	var constructor;
54 | 	if (ext in extension_map) constructor = extension_map[ext];
55 | 	else constructor = Int32Array;
56 | 
57 |
58 | return aloader.load(filePath, constructor, cb);
59 | }
60 |
61 | test.load = function(directory, names, types, callback){
62 |
63 | // array of paths to matrix data files for current test
64 | var paths = names.map(function(name, i){
65 | 		var type = types[i];
66 | 		if (!(type in type_map)) type = DEFAULT_TYPE;
67 | 
68 | 		var ext = type_map[type];
69 |
70 | return directory + name + ext;
71 | });
72 |
73 | //console.log(testFiles);
74 | async.map(paths, loadArray,
75 | function(err, results){
76 |
77 | if(err) return callback(err);
78 |
79 | callback(err, results);
80 | }
81 | );
82 | }
83 | /* a key file is just a JSON array of strings
84 |    the index of the string in the array is its code
85 | */
86 | function loadKey(filePath, cb){
87 |
88 | floader.load(filePath, function(err, key){
89 | if(err) return cb(err);
90 |
91 | return cb(null, JSON.parse(key));
92 | });
93 | }
94 |
95 | test.load_key = function(directory, names, types, callback){
96 |
97 | 	// array of paths to key files for current test
98 | var paths = names.map(function(name, i){
99 | return directory + name + ".key";
100 | });
101 |
102 | //console.log(testFiles);
103 | async.map(paths, loadKey,
104 | function(err, results){
105 |
106 | if(err) return callback(err);
107 |
108 | callback(err, results);
109 | }
110 | );
111 | }
112 |
113 | function isobject(obj){ return Object.prototype.toString.call(obj) === "[object Object]";}
114 |
115 | /* is there a key in object 'a' not found in object 'b'?
116 | if so, return the first key that's not found
117 | if not, return null
118 | */
119 | function diffkeys(a, b){
120 | 	var same_keys = true;
121 | 	for(var key in a){
122 | same_keys &= (key in b);
123 | if(!same_keys){
124 | return key;
125 | }
126 | }
127 |
128 | return null;
129 | }
130 |
131 | /* comp is a comparison function for leaves
132 | a - actual
133 | b - expected
134 | */
135 | function treediff(a, b, comp){
136 |
137 | var p = "(r)";
138 | var todo = [[a, b, p]]; // tuple of (a, b, p)
139 | 	var parents = {}; parents[p] = null;
140 |
141 | var diff_key = null,
142 | diff_a = null,
143 | diff_b = null;
144 | 	var t, n_a, n_b;
145 | while (todo.length > 0 && !diff_key){
146 | t = todo.pop();
147 | n_a = t[0];
148 | n_b = t[1];
149 | p = t[2];
150 |
151 | // are all the keys the same?
152 | diff_b = diffkeys(n_b, n_a);
153 | if(diff_b){
154 | diff_key = p;
155 | break;
156 | }
157 | diff_a = diffkeys(n_a, n_b);
158 | if(diff_a){
159 | diff_key = p;
160 | break;
161 | }
162 |
163 | // check children
164 | 		for(var key in n_b){
165 | // both objects/internal nodes?
166 | if(isobject(n_b[key]) && isobject(n_a[key])){
167 | // yes, add to stack
168 | parents[key] = p;
169 | todo.push([n_a[key], n_b[key], key]);
170 |
171 | // both leaves?
172 | } else if(!isobject(n_b[key]) && !isobject(n_a[key])) {
173 | // yes, compare values
174 | if(!comp(n_b[key], n_a[key])){
175 | diff_key = key;
176 | diff_a = n_a[key];
177 | diff_b = n_b[key];
178 | break;
179 | }
180 | } else {
181 | // one is leaf the other is internal
182 | diff_key = key;
183 | if(isobject(n_b)){
184 | diff_a = n_a[key];
185 | } else {
186 | diff_b = n_b[key];
187 | }
188 | break;
189 | }
190 | }
191 | }
192 |
193 | var path;
194 | // difference found?
195 | if(diff_key){
196 | // yes, reconstruct the path
197 | var n = diff_key;
198 | path = [n];
199 | while(parents[n]){
200 | n = parents[n];
201 | path.push(n);
202 | }
203 |
204 | // diff_a and diff_b are both present on a leaf difference
205 | // only one is present for an internal node difference
206 | return {"path" : path.reverse(), "a" : diff_a, "b" : diff_b};
207 | }
208 |
209 | return null;
210 |
211 | }
212 |
213 | test.assert = {};
214 | test.assert.tree = {};
215 |
216 | /* determine whether two trees are equivalent
217 | */
218 | test.assert.tree.equal = function(t, a, b, msg) {
219 | var fail = treediff(a, b, function(a_n, b_n){
220 | return a_n === b_n;
221 | });
222 |
223 | msg = msg || 'trees should be equal';
224 | return treeassert(t, fail, msg);
225 | };
226 |
227 | /* determine whether two trees are approximately equivalent:
228 | internal nodes are identical
229 | leaves are within specified floating point tolerances
230 | */
231 | test.assert.tree.allclose = function(t, a, b, msg, RTOL, ATOL) {
232 | RTOL= RTOL || 1e-05; // for 32 bit precision: 1e-06
233 | ATOL= ATOL || 1e-08;
234 |
235 | // treeequal with a floating point comparison function
236 | var fail = treediff(a, b, function(a_n, b_n){
237 | return Math.abs(a_n - b_n) <= ATOL + RTOL * Math.abs(b_n)
238 | });
239 |
240 | msg = msg || 'trees should be allclose';
241 | return treeassert(t, fail, msg);
242 | };
243 |
244 | test.assert.close = function(t, a, b, msg, RTOL, ATOL){
245 | RTOL= RTOL || 1e-05; // for 32 bit precision: 1e-06
246 | ATOL= ATOL || 1e-08;
247 |
248 | // treeequal with a floating point comparison function
249 | var success = Math.abs(a - b) <= ATOL + RTOL * Math.abs(b)
250 |
251 | t._assert(success, {
252 | message : msg,
253 | operator : 'close',
254 | actual : a,
255 | expected : b,
256 | extra : null
257 | });
258 |
259 | return success;
260 | }
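
/* illustrative example of the tolerance check above: with the defaults RTOL = 1e-05 and
   ATOL = 1e-08, comparing a = 1.000001 to b = 1.0 passes, since
   |a - b| = 1e-06 <= 1e-08 + 1e-05 * |1.0| */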
261 |
262 | var NULL_PLACEHOLDER = "(null)";
263 | function treeassert(t, fail, msg){
264 |
265 | if(fail){
266 | var actual = fail.path.join(" -> "),
267 | expected = fail.path.join(" -> ");
268 |
269 | fail.a = fail.a || NULL_PLACEHOLDER;
270 | fail.b = fail.b || NULL_PLACEHOLDER;
271 | actual += " -> " + fail.a;
272 | expected += " -> " + fail.b;
273 | }
274 |
275 | t._assert(!fail, {
276 | message : msg,
277 | operator : 'tree.equal',
278 | actual : actual,
279 | expected : expected,
280 | extra : null
281 | });
282 |
283 | return !fail;
284 | };
285 |
286 | test.generate = {
287 | "Array" : {
288 | "int" : randomIntArray,
289 | "float" : randomFloatArray
290 | }
291 | };
292 |
293 | function randomIntArray(N, K){
294 |
295 | var data = [];
296 |
297 | for(var i = 0; i < N; i++){
298 | data.push(Math.random() * K | 0);
299 | }
300 |
301 | return data;
302 | }
303 |
304 | function randomFloatArray(N){
305 |
306 | var data = [];
307 |
308 | for(var i = 0; i < N; i++){
309 | data.push(Math.random() / Math.sqrt(N));
310 | }
311 |
312 | return data;
313 | }
314 |
315 | module.exports = test;
316 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "dataship-frame",
3 | "version": "2.1.1",
4 | "description": "A Data Frame for Javascript. Crunch numbers in node and the browser.",
5 | "main": "lib/frame.js",
6 | "directories": {
7 | "test": "test"
8 | },
9 | "scripts": {
10 | "data": "node test/data/generate.js",
11 | "test": "browserify test/*.js | testling -x $npm_config_browser",
12 | "dist": "mkdir -p dist && browserify lib/frame.js -s Frame > dist/frame.js",
13 | "bench": "browserify benchmark/*.js | testling -x $npm_config_browser",
14 | "bench-datavore": "browserify benchmark/datavore/*.js | testling -x $npm_config_browser"
15 | },
16 | "repository": {
17 | "type": "git",
18 | "url": "git+https://github.com/dataship/frame.git"
19 | },
20 | "keywords": [
21 | "dataframe",
22 | "statistics",
23 | "math",
24 | "pandas",
25 | "R"
26 | ],
27 | "author": "",
28 | "license": "MIT",
29 | "bugs": {
30 | "url": "https://github.com/dataship/frame/issues"
31 | },
32 | "homepage": "https://github.com/dataship/frame#readme",
33 | "devDependencies": {
34 | "arrayloader": "^1.1.2",
35 | "async": "^2.1.5",
36 | "benchtap": "^1.0.0",
37 | "floader": "^1.0.1",
38 | "tape": "^4.6.3"
39 | },
40 | "dependencies": {
41 | "bit-array": "^0.2.2"
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | docopt
3 |
--------------------------------------------------------------------------------
/test/argmax.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | // simple instructive test cases
5 | function simpleTestCases(){
6 |
7 | tape("argmax works with integers", function(t){
8 | t.plan(1);
9 | var frame = new Frame({
10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
11 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
12 | });
13 |
14 | 		var expected = 6; // index of the maximum value, 4
15 |
16 | var actual = frame.argmax("value");
17 |
18 | t.equal(actual, expected);
19 | });
20 |
21 | tape("argmax works with integers", function(t){
22 | t.plan(1);
23 | var frame = new Frame({
24 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1],
25 | "value" : [4, 2, 7, 1, 3, 6, 5, 2, 1, 7, 8]
26 | });
27 |
28 | 		var expected = 10; // index of the maximum value, 8
29 |
30 | var actual = frame.argmax("value");
31 |
32 | t.equal(actual, expected);
33 | });
34 |
35 | 	tape("argmax works with floats", function(t){
36 | t.plan(1);
37 | var frame = new Frame({
38 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
39 | "value" : [1.2, 6.4, 2.3, 12.1, 1.6, 3.5, 7.2, 2.1, 10.2]
40 | });
41 |
42 | 		var expected = 3; // index of the maximum value, 12.1
43 | var actual = frame.argmax("value");
44 |
45 | t.equal(actual, expected);
46 | });
47 |
48 | 	tape("argmax works with floats", function(t){
49 | t.plan(1);
50 | var frame = new Frame({
51 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
52 | "value" : [1.2, 6.4, 2.3, 1.1, 10.6, 3.5, 7.2, 2.1, 9.2]
53 | });
54 |
55 | 		var expected = 4; // index of the maximum value, 10.6
56 | var actual = frame.argmax("value");
57 |
58 | t.equal(actual, expected);
59 | });
60 |
61 | tape("argmax wonky edge case", function(t){
62 | t.plan(1);
63 | var frame = new Frame({
64 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
65 | "value" : [11.2, 6.4, 2.3, 1.1, 10.6, 3.5, 7.2, 2.1, 9.2]
66 | });
67 |
68 | 		// prime the reducer's internal state with a frame whose maximum is at index 0
69 | var primer = frame.argmax("value");
70 |
71 | var expected = 4;
72 | var frame2 = new Frame({
73 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
74 | "value" : [1.2, 6.4, 2.3, 1.1, 10.6, 3.5, 7.2, 2.1, 9.2]
75 | });
76 |
77 | var actual = frame2.argmax("value");
78 | t.equal(actual, expected);
79 | });
80 | }
81 |
82 | simpleTestCases();
83 | /*
84 | var RTOL = 1e-05, // 1e-05
85 | ATOL = 1e-12; // 1e-12
86 |
87 | var dataDirectory = 'test/data/mean/',
88 | testFile = 'small.json';
89 |
90 | var floader = require('floader'),
91 | dtest = require('../lib/test');
92 |
93 | floader.load(dataDirectory + testFile, function(err, config){
94 |
95 | var suite = JSON.parse(config);
96 | simpleTestCases();
97 |
98 | for(var i = 0; i < suite.length; i++){
99 |
100 | var prefix = String("0000" + (i + 1)).slice(-4);
101 |
102 | // directory containing matrix data files for current test
103 | var directory = dataDirectory + prefix + '/';
104 |
105 | var test = suite[i];
106 |
107 | var names = test.id.map(function(spec, i){ return "id_" + i;});
108 | var types = test.id.map(function(spec, i){ return spec['type'];});
109 |
110 | var N = test.N; // number of rows
111 | var distincts = test.id.map(function(spec, i){ return spec.K; });
112 |
113 | var testName = "mean: " + N + " x " + "(" + distincts.join(", ") + ")"
114 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
115 | }
116 | });
117 |
118 | var OUT_FILENAME = "out.json";
119 |
120 |
121 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
122 | return function(t){
123 | t.plan(1);
124 |
125 | var names = id_names.concat(value_names);
126 | var types = id_types.concat(value_types);
127 |
128 | // which columns require a key file?
129 | var key_names = id_names.filter(function(item, i){
130 | return id_types[i] in dtest.string_types
131 | });
132 | var key_types = id_types.filter(function(item, i){
133 | return item in dtest.string_types
134 | });
135 |
136 | // load columns from files
137 | dtest.load(directory, names, types, function(err, columns){
138 |
139 | // load key files
140 | dtest.load_key(directory, key_names, key_types, function(err, keys){
141 |
142 | floader.load(directory + OUT_FILENAME, function(err, out){
143 | var expected = JSON.parse(out);
144 |
145 | var column_set = {};
146 | for (var i = 0; i < names.length; i++){
147 | var name = names[i];
148 | var column = columns[i];
149 | column_set[name] = column;
150 | }
151 | // keys map a small set of integers to other things (like strings)
152 | // they're a very simple form of fixed length coding
153 | var key_set = {};
154 | for (var i = 0; i < keys.length; i++){
155 | var name = key_names[i];
156 | var key = keys[i];
157 | key_set[name] = key;
158 | }
159 |
160 | var frame = new Frame(column_set, key_set);
161 |
162 | //console.log(subset);
163 | var actual = frame.mean("value_0");
164 |
165 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL);
166 | });
167 |
168 | });
169 | });
170 | };
171 | }
172 | */
173 |
--------------------------------------------------------------------------------
/test/count.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | tape("count gives length with no filter", function(t){
5 | t.plan(1);
6 |
7 | var frame = new Frame({
8 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
9 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
10 | });
11 |
12 | var expected = 9;
13 |
14 | var actual = frame.count();
15 | t.equals(actual, expected);
16 | });
17 |
18 | tape("count works with where", function(t){
19 | t.plan(1);
20 |
21 | var frame = new Frame({
22 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
23 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
24 | });
25 |
26 | //frame.where(row => row.id == 1);
27 | frame = frame.where("id", v => v == 1);
28 |
29 | var expected = 4;
30 |
31 | var actual = frame.count();
32 | t.equals(actual, expected);
33 | });
34 |
35 | tape("count works with where.equals", function(t){
36 | t.plan(1);
37 |
38 | var frame = new Frame({
39 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
40 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
41 | });
42 |
43 | frame = frame.where("id", 1);
44 |
45 | var expected = 4;
46 |
47 | var actual = frame.count();
48 | t.equals(actual, expected);
49 | });
50 |
51 | tape("count works with where.in", function(t){
52 | t.plan(1);
53 |
54 | var frame = new Frame({
55 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
56 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
57 | });
58 |
59 | frame = frame.where("id", [0, 2]);
60 |
61 | var expected = 6;
62 |
63 | var actual = frame.count();
64 | t.equals(actual, expected);
65 | });
66 |
67 | tape("count works with multiple where", function(t){
68 | t.plan(1);
69 |
70 | var frame = new Frame({
71 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
72 | "id_1" : [0, 0, 1, 1, 0, 1, 0, 0, 1],
73 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
74 | });
75 |
76 | //frame.where(row => row.id == 1);
77 | frame = frame.where("id_1", id => id == 1);
78 | frame = frame.where("id_0", id => id == 1);
79 |
80 | var expected = 2;
81 |
82 | var actual = frame.count();
83 | t.equals(actual, expected);
84 | });
85 |
86 |
87 | /*
88 | function eq(a){
89 | return function(v){ v == a; };
90 | }
91 |
92 | function in(arr){
93 | var set = {};
94 | for (a in arr) set[a] = true;
95 | return function(v){ return v in set;};
96 | }*/
97 |
--------------------------------------------------------------------------------
/test/create.js:
--------------------------------------------------------------------------------
1 | var test = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | test("access column from hidden property", function(t){
5 | t.plan(1);
6 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
7 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
8 |
9 | var frame = new Frame({
10 | "a" : a,
11 | "b" : b
12 | });
13 |
14 | t.equals(JSON.stringify(frame._cols["a"]), JSON.stringify(a));
15 | });
16 |
17 | test("access keys from hidden property", function(t){
18 | t.plan(1);
19 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
20 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
21 | var k = ["one", "two"];
22 |
23 | var frame = new Frame({
24 | "a" : a,
25 | "b" : b
26 | },
27 | {
28 | "a" : k
29 | });
30 |
31 |
32 | t.equals(JSON.stringify(frame._keys["a"]), JSON.stringify(k));
33 | });
34 |
35 | test("row based constructor creates columns correctly", function(t){
36 | t.plan(2);
37 | var rows = [
38 | {"a" : 0, "b" : 1},
39 | {"a" : 0, "b" : 2},
40 | {"a" : 0, "b" : 2},
41 | {"a" : 1, "b" : 3},
42 | {"a" : 1, "b" : 1},
43 | {"a" : 0, "b" : 3},
44 | {"a" : 1, "b" : 4},
45 | {"a" : 0, "b" : 2},
46 | {"a" : 1, "b" : 1},
47 | ];
48 |
49 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
50 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
51 |
52 | var frame = new Frame(rows);
53 |
54 |
55 | t.equals(JSON.stringify(frame._cols["a"]), JSON.stringify(a));
56 | t.equals(JSON.stringify(frame._cols["b"]), JSON.stringify(b));
57 | });
58 |
59 | test("access column as property", function(t){
60 | t.plan(1);
61 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
62 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
63 |
64 | var frame = new Frame({
65 | "a" : a,
66 | "b" : b
67 | });
68 |
69 |
70 | t.equals(JSON.stringify(frame["a"]), JSON.stringify(a));
71 | });
72 |
73 | test("accessing column as property decodes when key is present", function(t){
74 | t.plan(1);
75 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
76 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
77 | var k = ["one", "two"];
78 |
79 | var frame = new Frame({
80 | "a" : a,
81 | "b" : b
82 | },
83 | {
84 | "a" : k
85 | });
86 |
87 |
88 | var expected = ["one", "one", "one", "two", "two", "one", "two", "one", "two"];
89 | t.equals(JSON.stringify(frame["a"]), JSON.stringify(expected));
90 | });
91 |
92 | test("only columns are enumerable", function(t){
93 | t.plan(2);
94 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
95 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
96 |
97 | var frame = new Frame({
98 | "a" : a,
99 | "b" : b
100 | });
101 |
102 | var expected = ["a", "b"];
103 |
104 | t.equals(JSON.stringify(Object.keys(frame)), JSON.stringify(expected));
105 |
106 | var found = [];
107 |
108 | for(name in frame){
109 | found.push(name);
110 | }
111 |
112 | t.equals(JSON.stringify(found), JSON.stringify(expected));
113 | });
114 |
115 | test("Symbol.toStringTag correctly overridden", function(t){
116 | t.plan(1);
117 | var frame = new Frame({
118 | "a" : [0],
119 | "b" : [1]
120 | });
121 |
122 | var expected = "[object Frame]";
123 |
124 | t.equals(Object.prototype.toString.call(frame), expected);
125 | });
126 |
127 | test("rename column correctly modifies frame properties", function(t){
128 | t.plan(2);
129 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
130 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
131 |
132 | var frame = new Frame({
133 | "a" : a,
134 | "b" : b
135 | });
136 |
137 | var expected = ["a", "c"];
138 |
139 | frame.rename("b", "c");
140 |
141 | t.equals(JSON.stringify(Object.keys(frame)), JSON.stringify(expected));
142 |
143 | var found = [];
144 |
145 | for(name in frame){
146 | found.push(name);
147 | }
148 |
149 | t.equals(JSON.stringify(found), JSON.stringify(expected));
150 | });
151 |
152 | test("rename column correctly adds accessor", function(t){
153 | t.plan(1);
154 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
155 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
156 |
157 | var frame = new Frame({
158 | "a" : a,
159 | "b" : b
160 | });
161 |
162 | var expected = b;
163 |
164 | frame.rename("b", "c");
165 |
166 | t.equals(JSON.stringify(frame["c"]), JSON.stringify(expected));
167 | });
168 |
169 | test("rename column correctly converts key", function(t){
170 | t.plan(1);
171 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
172 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
173 |
174 | var frame = new Frame({
175 | "a" : a,
176 | "b" : b
177 | },
178 | {
179 | "b" : ["zero", "one", "two", "three", "four"]
180 | });
181 |
182 | var expected = ["one", "two", "two", "three", "one", "three", "four", "two", "one"];
183 |
184 | frame.rename("b", "c");
185 |
186 | t.equals(JSON.stringify(frame["c"]), JSON.stringify(expected));
187 | });
188 |
189 | test("setting via property accessor works correctly", function(t){
190 | t.plan(1);
191 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
192 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
193 |
194 | var frame = new Frame({
195 | "a" : a,
196 | "b" : b
197 | });
198 | var c = [3, 4, 1, 0, 2, 1, 2, 3, 3];
199 |
200 | frame["b"] = c;
201 |
202 | var expected = c.slice(0);
203 | t.equals(JSON.stringify(frame["b"]), JSON.stringify(expected));
204 | });
205 |
206 | test("distinct works correctly", function(t){
207 | t.plan(2);
208 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
209 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
210 |
211 | var frame = new Frame({
212 | "a" : a,
213 | "b" : b
214 | });
215 |
216 | var expected = [1, 2, 3, 4];
217 | var actual = frame.distinct("b");
218 |
219 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
220 |
221 | var expected = [0, 1];
222 | var actual = frame.distinct("a");
223 |
224 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
225 | });
226 |
227 | test("distinct works with keyed column", function(t){
228 | t.plan(1);
229 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
230 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
231 |
232 | var frame = new Frame({
233 | "a" : a,
234 | "b" : b
235 | }, {
236 | "a" : ["zero", "one"]
237 | });
238 |
239 | var expected = ["zero", "one"];
240 | var actual = frame.distinct("a");
241 |
242 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
243 | });
244 |
245 | test("distinct works with where", function(t){
246 | t.plan(2);
247 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
248 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
249 |
250 | var frame = new Frame({
251 | "a" : a,
252 | "b" : b
253 | });
254 |
255 | var expected = [1, 3, 4];
256 | frame = frame.where("a", 1);
257 | var actual = frame.distinct("b");
258 |
259 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
260 |
261 | var expected = [1];
262 | var actual = frame.distinct("a");
263 |
264 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
265 | });
266 |
267 | test("argmax works correctly", function(t){
268 | t.plan(1);
269 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
270 | var b = [1, 2, 2, 3, 1, 0, 4, 2, 1];
271 |
272 | var frame = new Frame({
273 | "a" : a,
274 | "b" : b
275 | });
276 |
277 | var expected = 6;
278 | var actual = frame.argmax("b");
279 |
280 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
281 |
282 | });
283 |
284 | test("argmin works correctly", function(t){
285 | t.plan(1);
286 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
287 | var b = [1, 2, 2, 3, 1, 0, 4, 2, 1];
288 |
289 | var frame = new Frame({
290 | "a" : a,
291 | "b" : b
292 | });
293 |
294 | var expected = 5;
295 | var actual = frame.argmin("b");
296 |
297 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
298 |
299 | });
300 |
301 | test("median works correctly", function(t){
302 | t.plan(2);
303 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
304 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 1];
305 |
306 | var frame = new Frame({
307 | "a" : a,
308 | "b" : b
309 | });
310 |
311 | var expected = 2;
312 | var actual = frame.median("b");
313 |
314 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
315 |
316 | var expected = 0;
317 | var actual = frame.median("a");
318 |
319 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
320 | });
321 |
322 | test("min works correctly", function(t){
323 | t.plan(2);
324 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
325 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
326 |
327 | var frame = new Frame({
328 | "a" : a,
329 | "b" : b
330 | });
331 |
332 | var expected = 1;
333 | var actual = frame.min("b");
334 |
335 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
336 |
337 | var expected = 0;
338 | var actual = frame.min("a");
339 |
340 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
341 | });
342 |
343 | test("min works with where", function(t){
344 | t.plan(2);
345 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
346 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 3];
347 |
348 | var frame = new Frame({
349 | "a" : a,
350 | "b" : b
351 | });
352 |
353 | var expected = 3;
354 | frame = frame.where("a", 1);
355 | var actual = frame.min("b");
356 |
357 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
358 |
359 | var expected = 1;
360 | var actual = frame.min("a");
361 |
362 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
363 | });
364 |
365 | test("min works correctly on ISO date strings", function(t){
366 | t.plan(1);
367 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
368 | var b = ["2016-03-11", "2016-05-11", "2016-04-10", "2016-03-15",
369 | "2016-03-03", "2016-04-21", "2016-05-28", "2016-03-17",
370 | "2016-04-04"];
371 |
372 | var frame = new Frame({
373 | "a" : a,
374 | "b" : b
375 | });
376 |
377 | var expected = "2016-03-03";
378 | var actual = frame.min("b");
379 |
380 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
381 | });
382 |
383 | test("min works correctly on keyed column", function(t){
384 | t.plan(1);
385 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
386 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
387 | var k = ["b", "a"];
388 |
389 | var frame = new Frame({
390 | "a" : a,
391 | "b" : b
392 | }, {
393 | "a" : k
394 | });
395 |
396 | var expected = "a";
397 | var actual = frame.min("a");
398 |
399 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
400 | });
401 |
402 | test("min works with where on keyed column", function(t){
403 | t.plan(1);
404 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
405 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 3];
406 | var k = ["b", "a"];
407 |
408 | var frame = new Frame({
409 | "a" : a,
410 | "b" : b
411 | }, {
412 | "a" : k
413 | });
414 |
415 | var expected = "a";
416 | frame = frame.where("b", 3);
417 | var actual = frame.min("a");
418 |
419 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
420 | });
421 |
422 | test("max works correctly", function(t){
423 | t.plan(2);
424 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
425 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
426 |
427 | var frame = new Frame({
428 | "a" : a,
429 | "b" : b
430 | });
431 |
432 | var expected = 4;
433 | var actual = frame.max("b");
434 |
435 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
436 |
437 | var expected = 1;
438 | var actual = frame.max("a");
439 |
440 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
441 | });
442 |
443 | test("max works correctly on ISO date strings", function(t){
444 | t.plan(1);
445 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
446 | var b = ["2016-03-11", "2016-05-11", "2016-04-10", "2016-03-15",
447 | "2016-03-03", "2016-04-21", "2016-05-28", "2016-03-17",
448 | "2016-04-04"];
449 |
450 | var frame = new Frame({
451 | "a" : a,
452 | "b" : b
453 | });
454 |
455 | var expected = "2016-05-28";
456 | var actual = frame.max("b");
457 |
458 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
459 | });
460 |
461 | test("max works correctly on keyed column", function(t){
462 | t.plan(1);
463 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
464 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
465 | var k = ["b", "a"];
466 |
467 | var frame = new Frame({
468 | "a" : a,
469 | "b" : b
470 | }, {
471 | "a" : k
472 | });
473 |
474 | var expected = "b";
475 | var actual = frame.max("a");
476 |
477 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
478 | });
479 |
480 | test("max works with where on keyed column", function(t){
481 | t.plan(1);
482 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
483 | var b = [1, 2, 2, 3, 4, 3, 4, 2, 3];
484 | var k = ["b", "a"];
485 |
486 | var frame = new Frame({
487 | "a" : a,
488 | "b" : b
489 | }, {
490 | "a" : k
491 | });
492 |
493 | var expected = "b";
494 | frame = frame.where("b", 3);
495 | var actual = frame.max("a");
496 |
497 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
498 | });
499 |
500 | test("add creates new column", function(t){
501 | t.plan(3);
502 | var a = [0, 0, 0, 1, 1, 0, 1, 0, 1];
503 | var b = [1, 2, 2, 3, 1, 3, 4, 2, 1];
504 | var c = [2, 7, 2, 1, 9, 3, 2, 1, 1];
505 |
506 | var frame = new Frame({
507 | "a" : a,
508 | "b" : b
509 | });
510 |
511 | var expected = ["a", "b", "c"];
512 |
513 | frame.add("c", c);
514 |
515 | t.equals(JSON.stringify(Object.keys(frame)), JSON.stringify(expected));
516 |
517 | var found = [];
518 |
519 | for(name in frame){
520 | found.push(name);
521 | }
522 |
523 | t.equals(JSON.stringify(found), JSON.stringify(expected));
524 |
525 | t.equals(JSON.stringify(frame["c"]), JSON.stringify(c));
526 | });
527 |
--------------------------------------------------------------------------------
/test/data/binary_matrix.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Create two randomly generated matrices, of the specified sizes and write them
3 | to JSON files.
4 |
5 | """
6 | import json
7 | import numpy as np
8 | import os
9 |
10 |
11 | type_map = {
12 | '.i8' : np.int8,
13 | '.u8' : np.uint8,
14 | '.i16' : np.int16,
15 | '.u16' : np.uint16,
16 | '.i32' : np.int32,
17 | '.u32' : np.uint32,
18 | '.f32' : np.float32,
19 | '.i64' : np.int64, # not compatible with javascript
20 | '.u64' : np.uint64,# not compatible with javascript
21 | '.f64' : np.float64,
22 | '.s8' : np.int8,
23 | '.s16' : np.int16
24 | }
25 |
26 | def get_extension(path):
27 | filename, file_extension = os.path.splitext(path)
28 | return file_extension
29 |
30 | def read(path):
31 |
32 | extension = get_extension(path)
33 | if extension in type_map:
34 | dtype = type_map[extension]
35 | else:
36 | dtype=np.float32
37 |
38 | with open(path, 'rb') as f:
39 | matrix = np.fromfile(f, dtype=dtype)
40 |
41 | return matrix
42 |
43 | def write(path, matrix):
44 |
45 | extension = get_extension(path)
46 | if extension in type_map:
47 | dtype = type_map[extension]
48 | else:
49 | dtype=np.float32
50 |
51 | with open(path, 'wb') as f:
52 | f.write(matrix.astype(dtype=dtype).tostring())
53 |
54 | return matrix
55 |
--------------------------------------------------------------------------------
/test/data/generate.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | var spawn = require('child_process').spawn,
3 | async = require('async');
4 |
5 |
6 | /*
7 | ./generate.py count/ count/small.json
8 | ./generate.py sum/ sum/small.json
9 | */
10 |
11 | var tasks = [
12 | ['generate.py', 'groupby.count/', 'groupby.count/small.json'],
13 | ['generate.py', 'groupby.sum/', 'groupby.sum/small.json'],
14 | ['generate.py', 'groupby.mean/', 'groupby.mean/small.json'],
15 | ['generate.py', 'groupby.where.sum/', 'groupby.where.sum/small.json'],
16 | ['generate.py', 'where.in.sum/', 'where.in.sum/small.json'],
17 | ['generate.py', 'mean/', 'mean/small.json'],
18 | ['generate.py', 'where.mean/', 'where.mean/small.json']
19 | ];
20 | var options = {
21 | "cwd" : __dirname,
22 | "stdio": ["inherit", "inherit", "inherit"]
23 | };
24 |
25 |
26 | async.eachSeries(tasks, function(task, callback){
27 | spawn('python', task, options).on('close', callback);
28 | },
29 | function(){
30 | // all done
31 | });
32 |
--------------------------------------------------------------------------------
/test/data/generate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Create data for the test suite described by the given specification.
3 | Deleting the file out.json in a subdirectory will cause it to be recreated
4 | with the existing data and any new "arg" values. Deleting all files in a
5 | subdirectory will cause all of its data to be recreated.
6 |
7 | spec.json contains an array of objects, each object contains
8 |
9 | "N" - a number of rows to generate
10 | "id" - a list of id columns to generate
11 | K - number of distinct values to generate
12 | type - type of data to generate for column
13 | [{"K" : 3, "type": "int32"}, {"K" : 3, "type": "int32"}],
14 | "value" - a list of value columns to generate
15 | [{"K" : 100, "type": "int32"}, {"K" : 100, "type": "int32"}]
16 |
17 | NOTE: 64 bit integer types are not compatible with Javascript.
18 | This includes np.int64 and np.uint64
19 |
20 | Implementing test data generation for a new operation involves two things:
21 | 1. creation of a json spec
22 | 2. implementing an operation file with a single function named execute
23 |
24 | Usage:
25 |     generate.py <base_directory> <test_file>
26 | """
27 | from docopt import docopt
28 | import os
29 | import sys
30 | import json
31 | import math
32 | import collections
33 | import numpy as np
34 | import binary_matrix
35 |
36 | OUT_FILENAME = "out.json"
37 |
38 | KEY_EXT = ".key"
39 |
40 | extension_map = {
41 | "int8" : ".i8",
42 | "uint8" : ".u8",
43 | "int16" : ".i16",
44 | "uint16" : ".u16",
45 | "int32" : ".i32",
46 | "uint32" : '.u32',
47 | "float32" : '.f32',
48 | "int64" : '.i64', # not compatible with javascript
49 | "uint64" : '.u64', # not compatible with javascript
50 | "float64" : '.f64'
51 | }
52 |
53 | # function adapted from this issue request on numpy
54 | # https://github.com/numpy/numpy/issues/3155
55 | def random_sample(size=None, dtype=np.float64):
56 |
57 |     if isinstance(dtype, str):
58 | dtype = np.dtype(dtype).type
59 |
60 | type_max = 1 << np.finfo(dtype).nmant
61 | sample = np.empty(size, dtype=dtype)
62 | sample[...] = np.random.randint(0, type_max, size=size) / dtype(type_max)
63 | if size is None:
64 | sample = sample[()]
65 | return sample
66 |
67 | int_types = set([
68 | "int8", "uint8",
69 | "int16", "uint16",
70 | "int32", "uint32",
71 | "int64", "uint64"]) # not compatible with javascript
72 | float_types = set(["float32", "float64"])
73 |
74 | def create_column(N, K, type="int32"):
75 |
76 | if type in int_types:
77 | return np.random.randint(0, K, N, dtype=type)
78 |
79 | if type in float_types:
80 | return K * random_sample(N, dtype=type)
81 |
82 |
83 |
84 | return np.random.randint(0, K, N, dtype="int32")
85 |
86 | def write_result(result, location):
87 | """write a dict to a file as a json document"""
88 | try:
89 | with open(location, 'w') as f:
90 | json.dump(result, f, indent=4)
91 | except Exception as e:
92 |         print("Couldn't write output JSON file: {0}".format(e))
93 | sys.exit(1)
94 |
95 | def write_code(result, location):
96 | write_result(result, location)
97 |
98 | def read_code(location):
99 | with open(location, 'r') as f:
100 | code = json.load(f)
101 |
102 | return code
103 |
104 | class queue(collections.deque):
105 | def pop(self):
106 | return self.popleft()
107 | def push(self, n):
108 | self.append(n)
109 |
110 | def gen_strings(N): # return N distinct strings over 'a'-'z', all of the same length
111 | chars = [chr(i) for i in range(ord('a'), ord('z') + 1)]
112 | L = int(math.ceil(math.log(N) / math.log(len(chars))))
113 |
114 | results = queue([""])
115 |
116 | for i in range(L):
117 | for j in range(len(results)):
118 | r = results.pop()
119 |
120 | for c in chars:
121 | results.push(r+c)
122 |
123 | return list(results)[:N]
124 |
125 | if __name__ == '__main__':
126 | arguments = docopt(__doc__, version='JSON Groupby Generator')
127 |
128 | # arguments parsed from Usage statement by docopt
129 |     base_directory = os.path.join(arguments['<base_directory>'], '')
130 |     test_file = arguments['<test_file>']
131 |
132 | sys.path.insert(0, './' + base_directory)
133 |
134 | operation = __import__("operation")
135 |
136 | with open(test_file, 'r') as f:
137 | try:
138 | tests = json.load(f)
139 | except Exception as e:
140 |             print("Couldn't parse JSON configuration file: {0}".format(e))
141 | sys.exit(1)
142 |
143 |
144 | for i in range(len(tests)):
145 |
146 | options = tests[i]
147 | N = options['N']
148 |
149 | # test directory is a string of four numbers starting at 0001
150 | directory = base_directory + "{0:0>4}/".format(i + 1)
151 |
152 | if not os.path.exists(directory):
153 | os.makedirs(directory)
154 |
155 | # if a result exists, skip this data set
156 | if os.path.exists(directory + OUT_FILENAME):
157 | print("Skipping {0}".format(directory))
158 | continue
159 |
160 | id_columns = {}
161 | for i in range(len(options['id'])):
162 | name = "id_{0}".format(i)
163 | spec = options['id'][i]
164 | K = spec['K']
165 | dtype = spec['type']
166 | if dtype[:3] == 'str':
167 |
168 | if K <= 256 and dtype == 'str8':
169 | dtype = "int8"
170 | extension = ".s8"
171 | elif K <= 65536 and dtype == 'str16':
172 | dtype = "int16"
173 | extension = ".s16"
174 | else:
175 | raise Exception("Too many strings!")
176 |
177 | if os.path.exists(directory + name + KEY_EXT) and os.path.exists(directory + name + extension):
178 | # read binary row file
179 | rows = binary_matrix.read(directory + name + extension)
180 | # read key file
181 | code = read_code(directory + name + KEY_EXT)
182 |
183 | else:
184 | rows = create_column(N, K, dtype)
185 | # map integers onto random strings
186 | code = gen_strings(K)
187 | # write key file
188 | write_code(code, directory + name + KEY_EXT)
189 | binary_matrix.write(directory + name + extension, rows)
190 |
191 | column = [code[index] for index in rows]
192 |
193 | else:
194 | if dtype not in extension_map:
195 | dtype = "int32"
196 |
197 | extension = extension_map[dtype]
198 |
199 | if os.path.exists(directory + name + extension):
200 | column = binary_matrix.read(directory + name + extension)
201 | else:
202 | column = create_column(N, K, dtype)
203 | binary_matrix.write(directory + name + extension, column)
204 |
205 | id_columns[name] = column
206 |
207 | value_columns = {}
208 | for i in range(len(options['value'])):
209 | name = "value_{0}".format(i)
210 | spec = options['value'][i]
211 | K = spec['K']
212 | dtype = spec['type']
213 | if dtype not in extension_map:
214 | dtype = "int32"
215 |
216 | extension = extension_map[dtype]
217 | if os.path.exists(directory + name + extension):
218 | column = binary_matrix.read(directory + name + extension)
219 | else:
220 | column = create_column(N, K, dtype)
221 | binary_matrix.write(directory + name + extension, column)
222 |
223 | value_columns[name] = column
224 |
225 | # run reduction
226 | arguments = options['arg'] if 'arg' in options else {}
227 | out = operation.execute(arguments, id_columns, value_columns)
228 |
229 | # write result
230 | #binary_matrix.write(directory + "out.arr", out.flatten())
231 | write_result(out, directory + OUT_FILENAME)
232 |
233 | print("Created {0}".format(directory))
234 |
--------------------------------------------------------------------------------
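
As the docstring above says, adding a new operation to the generator takes a JSON spec plus an operation.py exposing a single execute(options, id_columns, value_columns) function. A minimal sketch of such a module; the "max" reduction is purely illustrative (it is not one of the existing test/data operations) and it assumes a spec with a single id column, so the result keys are plain scalars:

    """max operation (illustrative sketch only)"""
    import pandas as pd

    def execute(options, id_columns, value_columns):
        # generate.py passes plain dicts of numpy arrays; merge them into one DataFrame
        columns = id_columns.copy()
        columns.update(value_columns)
        frame = pd.DataFrame(columns)

        # group on every id column and reduce the first value column
        reduced = frame.groupby(by=list(id_columns.keys())).max()["value_0"]

        # convert numpy scalars to plain python types so the result
        # survives json.dump in write_result
        out = {}
        for key, value in reduced.to_dict().items():
            out[str(key)] = value.item() if hasattr(value, "item") else value
        return out

With more than one id column the existing files instead use convert_to_dict (see the operation.py modules below) to turn the tuple-keyed result into nested dictionaries.
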
/test/data/groupby.count/operation.py:
--------------------------------------------------------------------------------
1 | """count operation
2 | """
3 | import pandas as pd
4 |
5 | def convert_to_dict(r):
6 |
7 | # returns a dictionary whose keys are tuples
8 | tupled = r.to_dict()
9 |
10 | # convert tuple keys to nested dictionaries
11 | dicted = {}
12 | for (t, k) in tupled.items():
13 | level = dicted
14 |
15 | # create a nested dictionary for each item in the tuple
16 | for l in t[:-1]:
17 | if l in level:
18 | level = level[l]
19 | else:
20 | level[l] = {}
21 | level = level[l]
22 |
23 | # the last level points to the value
24 | l = t[-1]
25 | level[l] = k.item() # convert numpy type to python type
26 |
27 | return dicted
28 |
29 | def execute(options, id_columns, value_columns):
30 |
31 | columns = id_columns.copy()
32 | columns.update(value_columns)
33 | #print(columns)
34 |
35 | frame = pd.DataFrame(columns)
36 |
37 | g = frame.groupby(by=list(id_columns.keys()))
38 | return convert_to_dict(g.count()["value_0"])
39 |
--------------------------------------------------------------------------------
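
The convert_to_dict helper above (repeated in the other operation.py modules) reshapes the tuple-keyed dict that Series.to_dict() yields for a multi-level groupby into nested plain dicts, one level per id column. A worked example of the shape, with the values written as plain ints for readability (the real values are numpy scalars, which is why the helper calls .item() on them):

    # groupby(["id_0", "id_1"]).count()["value_0"].to_dict() gives tuple keys:
    tupled = {(0, 0): 4, (0, 1): 1, (1, 0): 1, (1, 1): 3}

    # convert_to_dict nests one dictionary level per id column:
    nested = {0: {0: 4, 1: 1}, 1: {0: 1, 1: 3}}

    # json.dump then writes the integer keys as strings, producing the tree shape
    # the javascript tests compare against:
    # {"0": {"0": 4, "1": 1}, "1": {"0": 1, "1": 3}}
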
/test/data/groupby.count/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "int8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
8 | "value" : [{"K" : 100, "type" : "float32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
12 | "value" : [{"K" : 100, "type" : "int32"}]
13 | },
14 | {"N" : 100000,
15 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
16 | "value" : [{"K" : 100, "type" : "int32"}]
17 | },
18 | {"N" : 1000000,
19 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}],
20 | "value" : [{"K" : 100, "type" : "float64"}]
21 | }
22 | ]
23 |
--------------------------------------------------------------------------------
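
For reference, generate.py turns each entry of the spec above into one zero-padded subdirectory of generated files; deleting out.json re-runs only the reduction over the existing columns, while deleting everything regenerates the data. Roughly what the first entry above would produce (the values themselves are random):

    # test/data/groupby.count/0001/id_0.i8      10000 int8 group labels drawn from {0, 1, 2}
    # test/data/groupby.count/0001/id_1.i8      the second id column, same form
    # test/data/groupby.count/0001/value_0.i32  10000 int32 values drawn from [0, 100)
    # test/data/groupby.count/0001/out.json     expected result written by operation.execute
    #
    # str8/str16 id columns would additionally get a .s8/.s16 code file plus a .key file
    # mapping the stored integer codes back to the generated strings
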
/test/data/groupby.mean/operation.py:
--------------------------------------------------------------------------------
1 | """groupby mean operation
2 | """
3 | import pandas as pd
4 |
5 | def convert_to_dict(r):
6 |
7 | # returns a dictionary whose keys are tuples
8 | tupled = r.to_dict()
9 |
10 | # convert tuple keys to nested dictionaries
11 | dicted = {}
12 | for (t, k) in tupled.items():
13 | level = dicted
14 |
15 | # create a nested dictionary for each item in the tuple
16 | for l in t[:-1]:
17 | if l in level:
18 | level = level[l]
19 | else:
20 | level[l] = {}
21 | level = level[l]
22 |
23 | # the last level points to the value
24 | l = t[-1]
25 | level[l] = k.item() # convert numpy type to python type
26 |
27 | return dicted
28 |
29 | def execute(options, id_columns, value_columns):
30 |
31 | columns = id_columns.copy()
32 | columns.update(value_columns)
33 | #print(columns)
34 |
35 | frame = pd.DataFrame(columns)
36 |
37 | g = frame.groupby(by=list(id_columns.keys()))
38 | return convert_to_dict(g.mean()["value_0"])
39 |
--------------------------------------------------------------------------------
/test/data/groupby.mean/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
8 | "value" : [{"K" : 100, "type" : "int32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
12 | "value" : [{"K" : 1000, "type" : "int32"}]
13 | },
14 | {"N" : 10000,
15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}],
16 | "value" : [{"K" : 1000, "type" : "int32"}]
17 | },
18 | {"N" : 10000,
19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}],
20 | "value" : [{"K" : 100, "type" : "float32"}]
21 | },
22 | {"N" : 10000,
23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}],
24 | "value" : [{"K" : 100, "type" : "int32"}]
25 | },
26 | {"N" : 100000,
27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
28 | "value" : [{"K" : 100, "type" : "int32"}]
29 | },
30 | {"N" : 100000,
31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
32 | "value" : [{"K" : 100, "type" : "int32"}]
33 | },
34 | {"N" : 100000,
35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
36 | "value" : [{"K" : 100, "type" : "float32"}]
37 | },
38 | {"N" : 1000000,
39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}],
40 | "value" : [{"K" : 100, "type" : "float64"}]
41 | }
42 | ]
43 |
--------------------------------------------------------------------------------
/test/data/groupby.sum/operation.py:
--------------------------------------------------------------------------------
1 | """sum operation
2 | """
3 | import pandas as pd
4 |
5 | def convert_to_dict(r):
6 |
7 | # returns a dictionary whose keys are tuples
8 | tupled = r.to_dict()
9 |
10 | # convert tuple keys to nested dictionaries
11 | dicted = {}
12 | for (t, k) in tupled.items():
13 | level = dicted
14 |
15 | # create a nested dictionary for each item in the tuple
16 | for l in t[:-1]:
17 | if l in level:
18 | level = level[l]
19 | else:
20 | level[l] = {}
21 | level = level[l]
22 |
23 | # the last level points to the value
24 | l = t[-1]
25 | level[l] = k.item() # convert numpy type to python type
26 |
27 | return dicted
28 |
29 | def execute(options, id_columns, value_columns):
30 |
31 | columns = id_columns.copy()
32 | columns.update(value_columns)
33 | #print(columns)
34 |
35 | frame = pd.DataFrame(columns)
36 |
37 | g = frame.groupby(by=list(id_columns.keys()))
38 | return convert_to_dict(g.sum()["value_0"])
39 |
--------------------------------------------------------------------------------
/test/data/groupby.sum/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
8 | "value" : [{"K" : 100, "type" : "int32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
12 | "value" : [{"K" : 1000, "type" : "int32"}]
13 | },
14 | {"N" : 10000,
15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}],
16 | "value" : [{"K" : 1000, "type" : "int32"}]
17 | },
18 | {"N" : 10000,
19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}],
20 | "value" : [{"K" : 100, "type" : "float32"}]
21 | },
22 | {"N" : 10000,
23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}],
24 | "value" : [{"K" : 100, "type" : "int32"}]
25 | },
26 | {"N" : 100000,
27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
28 | "value" : [{"K" : 100, "type" : "int32"}]
29 | },
30 | {"N" : 100000,
31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
32 | "value" : [{"K" : 100, "type" : "int32"}]
33 | },
34 | {"N" : 100000,
35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
36 | "value" : [{"K" : 100, "type" : "float32"}]
37 | },
38 | {"N" : 1000000,
39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}],
40 | "value" : [{"K" : 100, "type" : "float64"}]
41 | }
42 | ]
43 |
--------------------------------------------------------------------------------
/test/data/groupby.where.sum/operation.py:
--------------------------------------------------------------------------------
1 | """sum operation
2 | """
3 | import pandas as pd
4 | import math
5 |
6 | def convert_to_dict(r):
7 |
8 | # returns a dictionary whose keys are tuples
9 | tupled = r.to_dict()
10 |
11 | # convert tuple keys to nested dictionaries
12 | dicted = {}
13 | for (t, k) in tupled.items():
14 | level = dicted
15 |
16 | # create a nested dictionary for each item in the tuple
17 | for l in t[:-1]:
18 | if l in level:
19 | level = level[l]
20 | else:
21 | level[l] = {}
22 | level = level[l]
23 |
24 | # the last level points to the value
25 | l = t[-1]
26 | level[l] = k.item() # convert numpy type to python type
27 |
28 | return dicted
29 |
30 | SAMPLE = 10
31 |
32 | def execute(options, id_columns, value_columns):
33 | '''
34 | id_columns - a dictionary mapping names (strings) to numpy arrays
35 | value_columns - a dictionary mapping names (strings) to numpy arrays
36 |
37 | '''
38 |
39 | columns = id_columns.copy()
40 | columns.update(value_columns)
41 |
42 | frame = pd.DataFrame(columns)
43 |
44 | id_name = "id_0"
45 | value_name = "value_0"
46 |
47 | # create a subset of the column values
48 | column = id_columns[id_name]
49 | uniques = set(column[:SAMPLE])
50 | l = int(math.ceil(len(uniques)/2.0))
51 | subset = sorted(list(uniques))[:l]
52 | #print(subset)
53 |
54 | #frame.loc[frame[id_name] == 1, value_name].sum()
55 | #v = frame.loc[frame[id_name].isin(subset), value_name].sum()
56 | filtered = frame.loc[frame[id_name].isin(subset)]
57 | grouped = filtered.groupby(by=list(id_columns.keys()))
58 |
59 | return convert_to_dict(grouped.sum()["value_0"])
60 |
--------------------------------------------------------------------------------
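
The SAMPLE/subset logic above (also used in where.in.sum/operation.py, and mirrored by generate_subset in test/groupby.where.sum.js) picks a deterministic half of the distinct values seen in the first ten rows of id_0, so the python generator and the javascript tests filter on exactly the same values. A small worked example with a made-up column:

    import math

    SAMPLE = 10

    column = [2, 0, 1, 2, 2, 0, 1, 0, 2, 1, 5, 7]  # only the first SAMPLE entries matter
    uniques = set(column[:SAMPLE])                  # {0, 1, 2}
    l = int(math.ceil(len(uniques) / 2.0))          # ceil(3 / 2) = 2
    subset = sorted(list(uniques))[:l]              # [0, 1]

    # rows whose id_0 is in [0, 1] are then kept, grouped on the id columns, and summed
    print(subset)
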
/test/data/groupby.where.sum/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
8 | "value" : [{"K" : 100, "type" : "int32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
12 | "value" : [{"K" : 1000, "type" : "int32"}]
13 | },
14 | {"N" : 10000,
15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}],
16 | "value" : [{"K" : 1000, "type" : "int32"}]
17 | },
18 | {"N" : 10000,
19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}],
20 | "value" : [{"K" : 100, "type" : "float32"}]
21 | },
22 | {"N" : 10000,
23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}],
24 | "value" : [{"K" : 100, "type" : "int32"}]
25 | },
26 | {"N" : 100000,
27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
28 | "value" : [{"K" : 100, "type" : "int32"}]
29 | },
30 | {"N" : 100000,
31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
32 | "value" : [{"K" : 100, "type" : "int32"}]
33 | },
34 | {"N" : 100000,
35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
36 | "value" : [{"K" : 100, "type" : "float32"}]
37 | },
38 | {"N" : 1000000,
39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}],
40 | "value" : [{"K" : 100, "type" : "float64"}]
41 | }
42 | ]
43 |
--------------------------------------------------------------------------------
/test/data/mean/operation.py:
--------------------------------------------------------------------------------
1 | """mean operation
2 | find the mean (average) of a column
3 | """
4 | import pandas as pd
5 | import math
6 |
7 | def execute(options, id_columns, value_columns):
8 | '''
9 | id_columns - a dictionary mapping names (strings) to numpy arrays
10 | value_columns - a dictionary mapping names (strings) to numpy arrays
11 |
12 | '''
13 |
14 | columns = id_columns.copy()
15 | columns.update(value_columns)
16 |
17 | frame = pd.DataFrame(columns)
18 |
19 | v = frame.mean()["value_0"]
20 |
21 | return v.item() # convert from numpy type to python type
22 |
--------------------------------------------------------------------------------
/test/data/mean/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int8"}],
8 | "value" : [{"K" : 100, "type" : "int32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 100, "type" : "int8"}],
12 | "value" : [{"K" : 1000, "type" : "int32"}]
13 | },
14 | {"N" : 10000,
15 | "id" : [{"K" : 100, "type" : "int8"}],
16 | "value" : [{"K" : 1000, "type" : "int32"}]
17 | },
18 | {"N" : 10000,
19 | "id" : [{"K" : 3, "type" : "int16"}],
20 | "value" : [{"K" : 100, "type" : "float32"}]
21 | },
22 | {"N" : 10000,
23 | "id" : [{"K" : 3, "type" : "int32"}],
24 | "value" : [{"K" : 100, "type" : "float32"}]
25 | },
26 | {"N" : 100000,
27 | "id" : [{"K" : 3, "type" : "int32"}],
28 | "value" : [{"K" : 100, "type" : "int32"}]
29 | },
30 | {"N" : 100000,
31 | "id" : [{"K" : 1000, "type" : "int32"}],
32 | "value" : [{"K" : 100, "type" : "int32"}]
33 | },
34 | {"N" : 100000,
35 | "id" : [{"K" : 1000, "type" : "int32"}],
36 | "value" : [{"K" : 100, "type" : "float32"}]
37 | },
38 | {"N" : 1000000,
39 | "id" : [{"K" : 3, "type" : "uint8"}],
40 | "value" : [{"K" : 100, "type" : "float64"}]
41 | }
42 | ]
43 |
--------------------------------------------------------------------------------
/test/data/where.in.sum/operation.py:
--------------------------------------------------------------------------------
1 | """where.in sum operation
2 | filter by inclusion in a list, then sum the matches
3 | """
4 | import pandas as pd
5 | import math
6 |
7 | SAMPLE = 10
8 |
9 | def execute(options, id_columns, value_columns):
10 | '''
11 | id_columns - a dictionary mapping names (strings) to numpy arrays
12 | value_columns - a dictionary mapping names (strings) to numpy arrays
13 |
14 | '''
15 |
16 | columns = id_columns.copy()
17 | columns.update(value_columns)
18 |
19 | frame = pd.DataFrame(columns)
20 |
21 | id_name = "id_0"
22 | value_name = "value_0"
23 |
24 | # create a subset of the column values
25 | column = id_columns[id_name]
26 | uniques = set(column[:SAMPLE])
27 | l = int(math.ceil(len(uniques)/2.0))
28 | subset = sorted(list(uniques))[:l]
29 |
30 | #frame.loc[frame[id_name] == 1, value_name].sum()
31 | v = frame.loc[frame[id_name].isin(subset), value_name].sum()
32 |
33 | return v.item() # convert from numpy type to python type
34 |
--------------------------------------------------------------------------------
/test/data/where.in.sum/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
8 | "value" : [{"K" : 100, "type" : "int32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
12 | "value" : [{"K" : 1000, "type" : "int32"}]
13 | },
14 | {"N" : 10000,
15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}],
16 | "value" : [{"K" : 1000, "type" : "int32"}]
17 | },
18 | {"N" : 10000,
19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}],
20 | "value" : [{"K" : 100, "type" : "float32"}]
21 | },
22 | {"N" : 10000,
23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}],
24 | "value" : [{"K" : 100, "type" : "int32"}]
25 | },
26 | {"N" : 100000,
27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
28 | "value" : [{"K" : 100, "type" : "int32"}]
29 | },
30 | {"N" : 100000,
31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
32 | "value" : [{"K" : 100, "type" : "int32"}]
33 | },
34 | {"N" : 100000,
35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
36 | "value" : [{"K" : 100, "type" : "float32"}]
37 | },
38 | {"N" : 1000000,
39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}],
40 | "value" : [{"K" : 100, "type" : "float64"}]
41 | }
42 | ]
43 |
--------------------------------------------------------------------------------
/test/data/where.mean/operation.py:
--------------------------------------------------------------------------------
1 | """where mean operation
2 | filter by equality with a value, then take the mean (average) of the matches
3 | """
4 | import pandas as pd
5 | import math
6 |
7 | SAMPLE = 10
8 |
9 | def execute(options, id_columns, value_columns):
10 | '''
11 | id_columns - a dictionary mapping names (strings) to numpy arrays
12 | value_columns - a dictionary mapping names (strings) to numpy arrays
13 |
14 | '''
15 |
16 | columns = id_columns.copy()
17 | columns.update(value_columns)
18 |
19 | frame = pd.DataFrame(columns)
20 |
21 | id_name = "id_0"
22 | value_name = "value_0"
23 |
24 | # create a subset of the column values
25 | column = id_columns[id_name]
26 | first = column[0]
27 |
28 | #frame.loc[frame[id_name] == 1, value_name].sum()
29 | v = frame.loc[frame[id_name] == first, value_name].mean()
30 |
31 | return v.item() # convert from numpy type to python type
32 |
--------------------------------------------------------------------------------
/test/data/where.mean/small.json:
--------------------------------------------------------------------------------
1 | [
2 | {"N" : 10000,
3 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "uint8"}],
4 | "value" : [{"K" : 100, "type" : "int32"}]
5 | },
6 | {"N" : 10000,
7 | "id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
8 | "value" : [{"K" : 100, "type" : "int32"}]
9 | },
10 | {"N" : 10000,
11 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 3, "type" : "str8"}],
12 | "value" : [{"K" : 1000, "type" : "int32"}]
13 | },
14 | {"N" : 10000,
15 | "id" : [{"K" : 100, "type" : "int8"}, {"K" : 100, "type" : "str8"}],
16 | "value" : [{"K" : 1000, "type" : "int32"}]
17 | },
18 | {"N" : 10000,
19 | "id" : [{"K" : 3, "type" : "int16"}, {"K" : 3, "type" : "uint16"}],
20 | "value" : [{"K" : 100, "type" : "float32"}]
21 | },
22 | {"N" : 10000,
23 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "uint32"}],
24 | "value" : [{"K" : 100, "type" : "int32"}]
25 | },
26 | {"N" : 100000,
27 | "id" : [{"K" : 3, "type" : "int32"}, {"K" : 3, "type" : "int32"}],
28 | "value" : [{"K" : 100, "type" : "int32"}]
29 | },
30 | {"N" : 100000,
31 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
32 | "value" : [{"K" : 100, "type" : "int32"}]
33 | },
34 | {"N" : 100000,
35 | "id" : [{"K" : 1000, "type" : "int32"}, {"K" : 300, "type" : "str16"}],
36 | "value" : [{"K" : 100, "type" : "float32"}]
37 | },
38 | {"N" : 1000000,
39 | "id" : [{"K" : 3, "type" : "uint8"}, {"K" : 3, "type" : "int32"}],
40 | "value" : [{"K" : 100, "type" : "float64"}]
41 | }
42 | ]
43 |
--------------------------------------------------------------------------------
/test/groupby.count.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 |
5 | tape("groupby.count", function(t){
6 | t.plan(1);
7 | var frame = new Frame({
8 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
9 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
10 | });
11 |
12 | var expected = {
13 | 0 : 5,
14 | 1 : 4
15 | }
16 |
17 | var g = frame.groupby("id");
18 | var actual = g.count();
19 |
20 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
21 |
22 | });
23 |
24 | tape("groupby.count", function(t){
25 | t.plan(2);
26 | var frame = new Frame({
27 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
28 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
29 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
30 | });
31 |
32 | var expected = {
33 | "0" : {
34 | "0" : 4,
35 | "1" : 1
36 | },
37 | "1" : {
38 | "0" : 1,
39 | "1" : 3
40 | }
41 | };
42 |
43 |
44 | var g = frame.groupby(["id_0", "id_1"]);
45 | var actual = g.count();
46 |
47 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
48 |
49 |
50 | var g = frame.groupby("id_0", "id_1");
51 | var actual = g.count();
52 |
53 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
54 | });
55 |
56 |
57 |
58 | var dataDirectory = 'test/data/groupby.count/',
59 | testFile = 'small.json';
60 |
61 | var RTOL = 1e-05, // 1e-05
62 | ATOL = 1e-12; // 1e-12
63 |
64 | var floader = require('floader'),
65 | dtest = require('../lib/test');
66 |
67 | floader.load(dataDirectory + testFile, function(err, config){
68 |
69 | var suite = JSON.parse(config);
70 |
71 | for(var i = 0; i < suite.length; i++){
72 |
73 | var prefix = String("0000" + (i + 1)).slice(-4);
74 |
75 | // directory containing matrix data files for current test
76 | var directory = dataDirectory + prefix + '/';
77 |
78 | var test = suite[i];
79 | /*
80 | "N" : 10000,
81 | 		"id" : [{"K" : 3, "type" : "int8"}, {"K" : 3, "type" : "int8"}],
82 | 		"value" : [{"K" : 100, "type" : "int32"}]
83 | */
84 |
85 | var names = test.id.map(function(spec, i){ return "id_" + i;});
86 | var types = test.id.map(function(spec, i){ return spec['type'];});
87 |
88 | var N = test.N; // number of rows
89 | 		var distincts = test.id.map(function(spec, i){ return spec.K; });
90 |
91 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")"
92 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
93 | }
94 | });
95 |
96 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
97 | return function(t){
98 | t.plan(1);
99 |
100 | var names = id_names.concat(value_names);
101 | var types = id_types.concat(value_types);
102 | // load columns from files
103 | dtest.load(directory, names, types, function(err, columns){
104 |
105 | floader.load(directory + "out.json", function(err, out){
106 | var expected = JSON.parse(out);
107 |
108 | var column_set = {};
109 | for (var i = 0; i < names.length; i++){
110 | var name = names[i];
111 | var column = columns[i];
112 | column_set[name] = column;
113 | }
114 | var frame = new Frame(column_set);
115 |
116 | var g = frame.groupby(id_names);
117 | var actual = g.count();
118 |
119 | var assert;
120 | if(value_types[0] in dtest.float_types){
121 | assert = dtest.assert.tree.allclose;
122 | } else {
123 | assert = dtest.assert.tree.equal;
124 | }
125 |
126 | assert(t, actual, expected, null, RTOL, ATOL);
127 | });
128 |
129 | });
130 | };
131 | }
132 |
--------------------------------------------------------------------------------
/test/groupby.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | tape("groupby has correct index", function(t){
5 | t.plan(1);
6 | var frame = new Frame({
7 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
8 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
9 | });
10 |
11 | var expected = {
12 | "0" : [0, 1, 2, 5, 7],
13 | "1" : [3, 4, 6, 8]
14 | };
15 |
16 | var g = frame.groupby("id");
17 | var actual = g._index;
18 |
19 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
20 |
21 | });
22 |
23 | tape("groupby with two arguments has correct index", function(t){
24 | t.plan(1);
25 | var frame = new Frame({
26 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
27 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
28 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
29 | });
30 |
31 | var expected = {
32 | "0" : {
33 | "0" : [0, 1, 5, 7],
34 | "1" : [2]
35 | },
36 | "1" : {
37 | "0" : [4],
38 | "1" : [3, 6, 8]
39 | }
40 | };
41 |
42 | var g = frame.groupby("id_0", "id_1");
43 | var actual = g._index;
44 |
45 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
46 | });
47 |
48 | tape("successive groupby has correct index", function(t){
49 | t.plan(1);
50 | var frame = new Frame({
51 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
52 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
53 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
54 | });
55 |
56 | var expected = {
57 | "0" : {
58 | "0" : [0, 1, 5, 7],
59 | "1" : [2]
60 | },
61 | "1" : {
62 | "0" : [4],
63 | "1" : [3, 6, 8]
64 | }
65 | };
66 |
67 | var g = frame.groupby("id_0");
68 | g = g.groupby("id_1");
69 | var actual = g._index;
70 |
71 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
72 | });
73 |
74 |
75 | var dataDirectory = 'test/data/groupby.count/',
76 | testFile = 'small.json';
77 |
78 | var RTOL = 1e-05, // 1e-05
79 | ATOL = 1e-12; // 1e-12
80 |
81 | var floader = require('floader'),
82 | dtest = require('../lib/test');
83 |
84 | floader.load(dataDirectory + testFile, function(err, config){
85 |
86 | var suite = JSON.parse(config);
87 |
88 | for(var i = 0; i < suite.length; i++){
89 |
90 | var prefix = String("0000" + (i + 1)).slice(-4);
91 |
92 | // directory containing matrix data files for current test
93 | var directory = dataDirectory + prefix + '/';
94 |
95 | var test = suite[i];
96 | var names = test.id.map(function(spec, i){ return "id_" + i;});
97 | var types = test.id.map(function(spec, i){ return spec['type'];});
98 |
99 | var N = test.N; // number of rows
100 | 		var distincts = test.id.map(function(spec, i){ return spec.K; });
101 |
102 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")"
103 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
104 | }
105 | });
106 |
107 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
108 | return function(t){
109 | t.plan(1);
110 |
111 | var names = id_names.concat(value_names);
112 | var types = id_types.concat(value_types);
113 | // load columns from files
114 | dtest.load(directory, names, types, function(err, columns){
115 |
116 | floader.load(directory + "out.json", function(err, out){
117 | var expected = JSON.parse(out);
118 |
119 | var column_set = {};
120 | for (var i = 0; i < names.length; i++){
121 | var name = names[i];
122 | var column = columns[i];
123 | column_set[name] = column;
124 | }
125 | var frame = new Frame(column_set);
126 |
127 | var g = frame;
128 | for(var i = 0; i < id_names.length; i++){
129 | 					var id_name = id_names[i];
130 | g = g.groupby(id_name);
131 | }
132 | var actual = g.count();
133 |
134 | var assert;
135 | if(value_types[0] in dtest.float_types){
136 | assert = dtest.assert.tree.allclose;
137 | } else {
138 | assert = dtest.assert.tree.equal;
139 | }
140 |
141 | assert(t, actual, expected, null, RTOL, ATOL);
142 | });
143 |
144 | });
145 | };
146 | }
147 |
--------------------------------------------------------------------------------
/test/groupby.mean.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame'),
3 | dtest = require('../lib/test');
4 |
5 | var RTOL = 1e-05, // 1e-05
6 | ATOL = 1e-12; // 1e-12
7 |
8 | // simple instructive test cases
9 | function simpleTestCases(){
10 |
11 | tape("groupby accepts single string", function(t){
12 | t.plan(1);
13 | var frame = new Frame({
14 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
15 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
16 | });
17 |
18 | var expected = {
19 | 			0: 2,    // (1 + 2 + 2 + 3 + 2) / 5
20 | 			1: 2.25  // (3 + 1 + 4 + 1) / 4
21 | };
22 |
23 | frame = frame.groupby("id");
24 | var actual = frame.mean("value");
25 |
26 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL);
27 |
28 | });
29 |
30 | tape("groupby accepts single string", function(t){
31 | t.plan(1);
32 | var frame = new Frame({
33 | "id" : [ 0, 0, 0, 1, 1, 0, 1, 0, 1],
34 | "value" : [1.4, 10.3, 24.2, 31.2, 1.9, 8.6, 4.7, 21.2, 7.4]
35 | });
36 |
37 | var expected = {
38 | 			0: 13.14, // (1.4 + 10.3 + 24.2 + 8.6 + 21.2) / 5
39 | 			1: 11.3   // (31.2 + 1.9 + 4.7 + 7.4) / 4
40 | };
41 |
42 | frame = frame.groupby("id");
43 | var actual = frame.mean("value");
44 |
45 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL);
46 |
47 | });
48 |
49 | tape("groupby accepts single string argument over string variable", function(t){
50 | t.plan(1);
51 | var frame = new Frame({
52 | "id" : ["b", "a", "a", "a", "b", "a", "b", "a", "b"],
53 | "value" : [ 3, 1, 2, 2, 1, 3, 4, 2, 1]
54 | });
55 | 		var expected = {
56 | 			"a": 2,    // (1 + 2 + 2 + 3 + 2) / 5
57 | 			"b": 2.25  // (3 + 1 + 4 + 1) / 4
58 | };
59 |
60 | frame = frame.groupby("id");
61 | var actual = frame.mean("value");
62 |
63 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL);
64 | });
65 |
66 | tape("groupby accepts array argument", function(t){
67 | t.plan(1);
68 | var frame = new Frame({
69 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
70 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
71 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
72 | });
73 |
74 | var expected = {
75 | "0" : {
76 | 				"0" : 2, // (1 + 2 + 3 + 2) / 4
77 | 				"1" : 2 // 2
78 | 			},
79 | 			"1" : {
80 | 				"0" : 1, // 1
81 | 				"1" : 2.6666666666 // (3 + 4 + 1) / 3
82 | }
83 | };
84 |
85 | frame = frame.groupby(["id_0", "id_1"]);
86 | var actual = frame.mean("value");
87 |
88 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL);
89 | });
90 |
91 | tape("groupby accepts multiple string arguments", function(t){
92 | t.plan(1);
93 | var frame = new Frame({
94 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
95 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
96 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
97 | });
98 |
99 | var expected = {
100 | "0" : {
101 | 				"0" : 2, // (1 + 2 + 3 + 2) / 4
102 | 				"1" : 2 // 2
103 | 			},
104 | 			"1" : {
105 | 				"0" : 1, // 1
106 | 				"1" : 2.6666666666 // (3 + 4 + 1) / 3
107 | }
108 | };
109 |
110 |
111 | frame = frame.groupby("id_0", "id_1");
112 | var actual = frame.mean("value");
113 |
114 | dtest.assert.tree.allclose(t, actual, expected, "mean", RTOL, ATOL);
115 | });
116 |
117 | tape("mean works without groupby", function(t){
118 | t.plan(1);
119 | var frame = new Frame({
120 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
121 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
122 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
123 | });
124 |
125 | var expected = 2.11111111111111; // (1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1) / 9
126 |
127 | var actual = frame.mean("value");
128 |
129 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL);
130 | });
131 |
132 | tape("mean works without groupby", function(t){
133 | t.plan(1);
134 | var frame = new Frame({
135 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
136 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
137 | "value" : [3.5, 4.0, 2.1, 3.4, 1.3, 3.8, 4.2, 2.0, 1.5]
138 | });
139 |
140 | 		var expected = 2.8666666666; // (3.5 + 4.0 + 2.1 + 3.4 + 1.3 + 3.8 + 4.2 + 2.0 + 1.5) / 9
141 |
142 | var actual = frame.mean("value");
143 |
144 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL);
145 | });
146 | }
147 |
148 |
149 | var dataDirectory = 'test/data/groupby.mean/',
150 | testFile = 'small.json';
151 |
152 | var floader = require('floader'),
153 | dtest = require('../lib/test');
154 |
155 | floader.load(dataDirectory + testFile, function(err, config){
156 |
157 | var suite = JSON.parse(config);
158 | simpleTestCases();
159 |
160 | for(var i = 0; i < suite.length; i++){
161 |
162 | var prefix = String("0000" + (i + 1)).slice(-4);
163 |
164 | // directory containing matrix data files for current test
165 | var directory = dataDirectory + prefix + '/';
166 |
167 | var test = suite[i];
168 |
169 | var names = test.id.map(function(spec, i){ return "id_" + i;});
170 | var types = test.id.map(function(spec, i){ return spec['type'];});
171 |
172 | var N = test.N; // number of rows
173 | var distincts = test.id.map(function(spec, i){ return spec.K; });
174 |
175 | var testName = "groupby.mean: " + N + " x " + "(" + distincts.join(", ") + ")"
176 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
177 | }
178 | });
179 |
180 | var OUT_FILENAME = "out.json";
181 |
182 |
183 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
184 | return function(t){
185 | t.plan(1);
186 |
187 | var names = id_names.concat(value_names);
188 | var types = id_types.concat(value_types);
189 |
190 | // which columns require a key file?
191 | var key_names = id_names.filter(function(item, i){
192 | return id_types[i] in dtest.string_types
193 | });
194 | var key_types = id_types.filter(function(item, i){
195 | return item in dtest.string_types
196 | });
197 |
198 | // load columns from files
199 | dtest.load(directory, names, types, function(err, columns){
200 |
201 | // load key files
202 | dtest.load_key(directory, key_names, key_types, function(err, keys){
203 |
204 | floader.load(directory + OUT_FILENAME, function(err, out){
205 | var expected = JSON.parse(out);
206 |
207 | var column_set = {};
208 | for (var i = 0; i < names.length; i++){
209 | var name = names[i];
210 | var column = columns[i];
211 | column_set[name] = column;
212 | }
213 | // keys map a small set of integers to other things (like strings)
214 | // they're a very simple form of fixed length coding
215 | var key_set = {};
216 | for (var i = 0; i < keys.length; i++){
217 | var name = key_names[i];
218 | var key = keys[i];
219 | key_set[name] = key;
220 | }
221 |
222 | var frame = new Frame(column_set, key_set);
223 |
224 | var g = frame.groupby(id_names);
225 | var actual = g.mean(value_names[0]);
226 |
227 | var assert = dtest.assert.tree.allclose;
228 |
229 | //console.log(actual);
230 | assert(t, actual, expected, null, RTOL, ATOL);
231 | });
232 |
233 | });
234 | });
235 | };
236 | }
237 |
--------------------------------------------------------------------------------
/test/groupby.sum.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | // simple instructive test cases
5 | function simpleTestCases(){
6 |
7 | tape("groupby accepts single string", function(t){
8 | t.plan(1);
9 | var frame = new Frame({
10 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
11 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
12 | });
13 |
14 | var expected = {
15 | 0: 10, // 1 + 2 + 2 + 3 + 2
16 | 1: 9 // 3 + 1 + 4 + 1
17 | };
18 |
19 | frame = frame.groupby("id");
20 | var actual = frame.sum("value");
21 |
22 | t.equals(JSON.stringify(actual), JSON.stringify(expected), "reduce");
23 |
24 | });
25 |
26 | tape("groupby accepts single string argument over string variable", function(t){
27 | t.plan(1);
28 | var frame = new Frame({
29 | "id" : ["b", "a", "a", "a", "b", "a", "b", "a", "b"],
30 | "value" : [ 3, 1, 2, 2, 1, 3, 4, 2, 1]
31 | });
32 | 		var expected = {
33 | "a": 10, // 1 + 2 + 2 + 3 + 2
34 | "b": 9 // 3 + 1 + 4 + 1
35 | };
36 |
37 | frame = frame.groupby("id");
38 | var actual = frame.sum("value");
39 |
40 | t.equals(JSON.stringify(actual), JSON.stringify(expected), "reduce");
41 |
42 | });
43 |
44 | tape("groupby accepts array argument", function(t){
45 | t.plan(1);
46 | var frame = new Frame({
47 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
48 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
49 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
50 | });
51 |
52 | var expected = {
53 | "0" : {
54 | "0" : 8, // 1 + 2 + 3 + 2
55 | "1" : 2 // 2
56 | },
57 | "1" : {
58 | "0" : 1, // 1
59 | "1" : 8 // 3 + 4 + 1
60 | }
61 | };
62 |
63 |
64 | frame = frame.groupby(["id_0", "id_1"]);
65 | var actual = frame.sum("value");
66 |
67 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
68 | });
69 |
70 | tape("groupby accepts multiple string arguments", function(t){
71 | t.plan(1);
72 | var frame = new Frame({
73 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
74 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
75 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
76 | });
77 |
78 | var expected = {
79 | "0" : {
80 | "0" : 8, // 1 + 2 + 3 + 2
81 | "1" : 2 // 2
82 | },
83 | "1" : {
84 | "0" : 1, // 1
85 | "1" : 8 // 3 + 4 + 1
86 | }
87 | };
88 |
89 |
90 | frame = frame.groupby("id_0", "id_1");
91 | var actual = frame.sum("value");
92 |
93 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
94 | });
95 |
96 | tape("sum works without groupby", function(t){
97 | t.plan(1);
98 | var frame = new Frame({
99 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
100 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
101 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
102 | });
103 |
104 | var expected = 19; // 1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1
105 |
106 | var actual = frame.sum("value");
107 |
108 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
109 | });
110 |
111 | tape("groupby sum, reduce over keyed column", function(t){
112 | t.plan(1);
113 | var frame = new Frame({
114 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
115 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
116 | },{
117 | "value" : [1, 2, 3, 4, 5]
118 | });
119 |
120 | var expected = {
121 | 0 : 15, // 2 + 3 + 3 + 4 + 3
122 | 1 : 13 // 4 + 2 + 5 + 2
123 | }
124 |
125 | var g = frame.groupby("id");
126 | var actual = g.sum("value");
127 |
128 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
129 |
130 | });
131 | }
132 |
133 | var RTOL = 1e-05, // 1e-05
134 | ATOL = 1e-12; // 1e-12
135 |
136 | var dataDirectory = 'test/data/groupby.sum/',
137 | testFile = 'small.json';
138 |
139 | var floader = require('floader'),
140 | dtest = require('../lib/test');
141 |
142 | floader.load(dataDirectory + testFile, function(err, config){
143 |
144 | var suite = JSON.parse(config);
145 | simpleTestCases();
146 |
147 | for(var i = 0; i < suite.length; i++){
148 |
149 | var prefix = String("0000" + (i + 1)).slice(-4);
150 |
151 | // directory containing matrix data files for current test
152 | var directory = dataDirectory + prefix + '/';
153 |
154 | var test = suite[i];
155 |
156 | var names = test.id.map(function(spec, i){ return "id_" + i;});
157 | var types = test.id.map(function(spec, i){ return spec['type'];});
158 |
159 | var N = test.N; // number of rows
160 | var distincts = test.id.map(function(spec, i){ return spec.K; });
161 |
162 | var testName = "groupby.sum: " + N + " x " + "(" + distincts.join(", ") + ")"
163 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
164 | }
165 | });
166 |
167 | var OUT_FILENAME = "out.json";
168 |
169 |
170 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
171 | return function(t){
172 | t.plan(1);
173 |
174 | var names = id_names.concat(value_names);
175 | var types = id_types.concat(value_types);
176 |
177 | // which columns require a key file?
178 | var key_names = id_names.filter(function(item, i){
179 | return id_types[i] in dtest.string_types
180 | });
181 | var key_types = id_types.filter(function(item, i){
182 | return item in dtest.string_types
183 | });
184 |
185 | // load columns from files
186 | dtest.load(directory, names, types, function(err, columns){
187 |
188 | // load key files
189 | dtest.load_key(directory, key_names, key_types, function(err, keys){
190 |
191 | floader.load(directory + OUT_FILENAME, function(err, out){
192 | var expected = JSON.parse(out);
193 |
194 | var column_set = {};
195 | for (var i = 0; i < names.length; i++){
196 | var name = names[i];
197 | var column = columns[i];
198 | column_set[name] = column;
199 | }
200 | // keys map a small set of integers to other things (like strings)
201 | // they're a very simple form of fixed length coding
202 | var key_set = {};
203 | for (var i = 0; i < keys.length; i++){
204 | var name = key_names[i];
205 | var key = keys[i];
206 | key_set[name] = key;
207 | }
208 |
209 | var frame = new Frame(column_set, key_set);
210 |
211 | var g = frame.groupby(id_names);
212 | var actual = g.sum(value_names[0]);
213 |
214 | var assert;
215 | if(value_types[0] in dtest.float_types){
216 | assert = dtest.assert.tree.allclose;
217 | } else {
218 | assert = dtest.assert.tree.equal;
219 | }
220 |
221 | //console.log(actual);
222 | assert(t, actual, expected, null, RTOL, ATOL);
223 | });
224 |
225 | });
226 | });
227 | };
228 | }
229 |
--------------------------------------------------------------------------------
/test/groupby.where.sum.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | // simple instructive test cases
5 | function simpleTestCases(){
6 |
7 | tape("sum works with where before groupby", function(t){
8 | t.plan(1);
9 | var frame = new Frame({
10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
11 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
12 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
13 | });
14 |
15 | var expected = {
16 | "0" : 1, // 1
17 | "1" : 8 // 3 + 4 + 1
18 | };
19 |
20 | var actual = frame.where("id_0", 1).groupby("id_1").sum("value");
21 |
22 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
23 | });
24 |
25 | tape("sum works with where before groupby", function(t){
26 | t.plan(1);
27 | var frame = new Frame({
28 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
29 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
30 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
31 | });
32 |
33 | var expected = {
34 | "0" : 8,
35 | "1" : 2
36 | };
37 |
38 | var actual = frame.where("id_0", 0).groupby("id_1").sum("value");
39 |
40 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
41 | });
42 |
43 | tape("sum works with groupby before where", function(t){
44 | t.plan(1);
45 | var frame = new Frame({
46 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
47 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
48 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
49 | });
50 |
51 | var expected = {
52 | "0" : 1,
53 | "1" : 8
54 | };
55 |
56 | var actual = frame.groupby("id_1").where("id_0", 1).sum("value");
57 |
58 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
59 | });
60 |
61 |
62 |
63 | tape("sum works with where.in before groupby", function(t){
64 | t.plan(1);
65 | var frame = new Frame({
66 | "id_0" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
67 | "id_1" : [0, 0, 1, 0, 0, 0, 1, 1, 1],
68 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
69 | });
70 |
71 | var expected = {
72 | "0" : 6, // 1 + 2 + 3
73 | "1" : 8 // 2 + 4 + 2
74 | };
75 | frame = frame.where("id_0", [0, 2]).groupby("id_1");
76 | var actual = frame.sum("value");
77 |
78 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
79 | });
80 | }
81 |
82 | //simpleTestCases();
83 |
84 | var SAMPLE = 10;
85 | function numberCompare(a, b){ return a - b; }
86 | // get a predefined subset of a column (matches test data generation)
87 | function generate_subset(column){
88 | 	// take the distinct values among the first SAMPLE entries of the column
89 | var uniques = {};
90 | for(var i = 0; i < SAMPLE; i++){
91 | uniques[column[i]] = column[i];
92 | }
93 | var keys = Object.keys(uniques);
94 | var subset = keys.map(function(k){ return uniques[k]});
95 |
96 | 	var l = Math.ceil(subset.length / 2);
97 | return subset.sort(numberCompare).slice(0, l);
98 | }
99 |
100 | var RTOL = 1e-05, // 1e-05
101 | ATOL = 1e-12; // 1e-12
102 |
103 | var dataDirectory = 'test/data/groupby.where.sum/',
104 | testFile = 'small.json';
105 |
106 | var floader = require('floader'),
107 | dtest = require('../lib/test');
108 |
109 | floader.load(dataDirectory + testFile, function(err, config){
110 |
111 | var suite = JSON.parse(config);
112 | simpleTestCases();
113 |
114 | for(var i = 0; i < suite.length; i++){
115 |
116 | var prefix = String("0000" + (i + 1)).slice(-4);
117 |
118 | // directory containing matrix data files for current test
119 | var directory = dataDirectory + prefix + '/';
120 |
121 | var test = suite[i];
122 |
123 | var names = test.id.map(function(spec, i){ return "id_" + i;});
124 | var types = test.id.map(function(spec, i){ return spec['type'];});
125 |
126 | var N = test.N; // number of rows
127 | var distincts = test.id.map(function(spec, i){ return spec.K; });
128 |
129 | var testName = "groupby.where.sum: " + N + " x " + "(" + distincts.join(", ") + ")"
130 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
131 | }
132 | });
133 |
134 | var OUT_FILENAME = "out.json";
135 |
136 |
137 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
138 | return function(t){
139 | t.plan(1);
140 |
141 | var names = id_names.concat(value_names);
142 | var types = id_types.concat(value_types);
143 |
144 | // which columns require a key file?
145 | var key_names = id_names.filter(function(item, i){
146 | return id_types[i] in dtest.string_types
147 | });
148 | var key_types = id_types.filter(function(item, i){
149 | return item in dtest.string_types
150 | });
151 |
152 | // load columns from files
153 | dtest.load(directory, names, types, function(err, columns){
154 |
155 | // load key files
156 | dtest.load_key(directory, key_names, key_types, function(err, keys){
157 |
158 | floader.load(directory + OUT_FILENAME, function(err, out){
159 | var expected = JSON.parse(out);
160 |
161 | var column_set = {};
162 | for (var i = 0; i < names.length; i++){
163 | var name = names[i];
164 | var column = columns[i];
165 | column_set[name] = column;
166 | }
167 | // keys map a small set of integers to other things (like strings)
168 | // they're a very simple form of fixed length coding
169 | var key_set = {};
170 | for (var i = 0; i < keys.length; i++){
171 | var name = key_names[i];
172 | var key = keys[i];
173 | key_set[name] = key;
174 | }
175 |
176 | var frame = new Frame(column_set, key_set);
177 |
178 | var subset = generate_subset(column_set["id_0"]);
179 | //console.log(subset);
180 | frame = frame.where("id_0", subset).groupby(id_names);
181 |
182 | var actual = frame.sum(value_names[0]);
183 |
184 | var assert;
185 | if(value_types[0] in dtest.float_types){
186 | assert = dtest.assert.tree.allclose;
187 | } else {
188 | assert = dtest.assert.tree.equal;
189 | }
190 |
191 | //console.log(actual);
192 | var success = assert(t, actual, expected, null, RTOL, ATOL);
193 | /*
194 | if(!success){
195 | console.log(actual);
196 | console.log(expected);
197 | }*/
198 | });
199 |
200 | });
201 | });
202 | };
203 | }
204 |
--------------------------------------------------------------------------------
/test/join.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | tape("join to smaller frame produces correct virtual column", function(t){
5 | t.plan(1);
6 | var frame0 = new Frame({
7 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
8 | });
9 |
10 | //console.log(JSON.stringify(frame0._cols));
11 | var frame1 = new Frame({
12 | "value1" : [1, 2]
13 | });
14 |
15 | var link = [0, 0, 0, 1, 1, 0, 1, 0, 1];
16 |
17 | var joined = frame0.join(frame1, link);
18 |
19 | 	var expected = [1, 1, 1, 2, 2, 1, 2, 1, 2]; // frame1["value1"][link[i]] for each row i
20 |
21 | var actual = joined["value1"];
22 |
23 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
24 |
25 | });
26 |
27 | tape("join to smaller frame produces correct sum", function(t){
28 | t.plan(1);
29 | var frame0 = new Frame({
30 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
31 | });
32 |
33 | var frame1 = new Frame({
34 | "value1" : [1, 2]
35 | });
36 |
37 | var link = [0, 0, 0, 1, 1, 0, 1, 0, 1];
38 |
39 | var joined = frame0.join(frame1, link);
40 |
41 | var expected = 13; // 1 + 1 + 1 + 2 + 2 + 1 + 2 + 1 + 2
42 |
43 | var actual = joined.sum("value1");
44 |
45 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
46 |
47 | });
48 |
49 | tape("join to larger frame produces correct virtual column", function(t){
50 | t.plan(1);
51 | var frame0 = new Frame({
52 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
53 | });
54 |
55 | //console.log(JSON.stringify(frame0._cols));
56 | var frame1 = new Frame({
57 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
58 | });
59 |
60 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11];
61 |
62 | var joined = frame0.join(frame1, link);
63 |
64 | 	var expected = [10, 2, 13, 3, 4, 8, 11, 6, 12]; // frame1["value1"][link[i]] for each row i
65 |
66 | var actual = joined["value1"];
67 |
68 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
69 |
70 | });
71 |
72 | tape("join to larger frame produces correct argmax and argmin", function(t){
73 | t.plan(2);
74 | var frame0 = new Frame({
75 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
76 | });
77 |
78 | //console.log(JSON.stringify(frame0._cols));
79 | var frame1 = new Frame({
80 | "value1" : [5, 2, 13, 4, 6, 1, 7, 8, 9, 10, 11, 12, 3]
81 | });
82 |
83 | var link = [9, 3, 12, 2, 1, 7, 10, 5, 11];
84 |
85 | var joined = frame0.join(frame1, link);
86 |
87 | var expected = 3;
88 |
89 | var actual = joined.argmax("value1");
90 |
91 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
92 |
93 | var expected = 7;
94 |
95 | var actual = joined.argmin("value1");
96 |
97 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
98 | });
99 |
100 | tape("join to larger frame produces correct sum", function(t){
101 | t.plan(1);
102 | var frame0 = new Frame({
103 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
104 | });
105 |
106 | //console.log(JSON.stringify(frame0._cols));
107 | var frame1 = new Frame({
108 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
109 | });
110 |
111 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11];
112 |
113 | var joined = frame0.join(frame1, link);
114 |
115 | var expected = 69; // 10 + 2 + 13 + 3 + 4 + 8 + 11 + 6 + 12
116 |
117 | var actual = joined.sum("value1");
118 |
119 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
120 |
121 | });
122 |
123 | tape("join with where produces correct sum", function(t){
124 | t.plan(1);
125 | var frame0 = new Frame({
126 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
127 | });
128 |
129 | //console.log(JSON.stringify(frame0._cols));
130 | var frame1 = new Frame({
131 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
132 | });
133 |
134 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11];
135 |
136 | var joined = frame0.join(frame1, link);
137 |
138 | var filtered = joined.where("value0", function(v){ return v > 2; });
139 | var expected = 22; // 3 + 8 + 11
140 |
141 | var actual = filtered.sum("value1");
142 |
143 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
144 |
145 | });
146 |
147 | tape("join with where produces correct argmax", function(t){
148 | t.plan(2);
149 | var frame0 = new Frame({
150 | "value0" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
151 | });
152 |
153 | //console.log(JSON.stringify(frame0._cols));
154 | var frame1 = new Frame({
155 | "value1" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
156 | });
157 |
158 | var link = [9, 1, 12, 2, 3, 7, 10, 5, 11];
159 |
160 | var joined = frame0.join(frame1, link);
161 |
162 | var filtered = joined.where("value0", function(v){ return v > 2; });
162 | var expected = 2; // filtered value1 is [3, 8, 11]; the max, 11, sits at index 2
164 |
165 | var actual = filtered.argmax("value1");
166 |
167 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
168 |
169 | var expected = 11;
170 |
171 | var argmax = actual;
172 | var actual = filtered["value1"][argmax];
173 |
174 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
175 | });
176 |
177 | /*
178 | tape("groupby has correct index", function(t){
179 | t.plan(1);
180 | var frame = new Frame({
181 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
182 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
183 | });
184 |
185 | var expected = {
186 | "0" : [0, 1, 2, 5, 7],
187 | "1" : [3, 4, 6, 8]
188 | };
189 |
190 | var g = frame.groupby("id");
191 | var actual = g._index;
192 |
193 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
194 |
195 | });
196 |
197 | tape("groupby with two arguments has correct index", function(t){
198 | t.plan(1);
199 | var frame = new Frame({
200 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
201 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
202 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
203 | });
204 |
205 | var expected = {
206 | "0" : {
207 | "0" : [0, 1, 5, 7],
208 | "1" : [2]
209 | },
210 | "1" : {
211 | "0" : [4],
212 | "1" : [3, 6, 8]
213 | }
214 | };
215 |
216 | var g = frame.groupby("id_0", "id_1");
217 | var actual = g._index;
218 |
219 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
220 | });
221 |
222 | tape("successive groupby has correct index", function(t){
223 | t.plan(1);
224 | var frame = new Frame({
225 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
226 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
227 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
228 | });
229 |
230 | var expected = {
231 | "0" : {
232 | "0" : [0, 1, 5, 7],
233 | "1" : [2]
234 | },
235 | "1" : {
236 | "0" : [4],
237 | "1" : [3, 6, 8]
238 | }
239 | };
240 |
241 | var g = frame.groupby("id_0");
242 | g = g.groupby("id_1");
243 | var actual = g._index;
244 |
245 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
246 | });
247 |
248 | */
249 | /*
250 | var dataDirectory = 'test/data/groupby.count/',
251 | testFile = 'small.json';
252 |
253 | var RTOL = 1e-05, // 1e-05
254 | ATOL = 1e-12; // 1e-12
255 |
256 | var floader = require('floader'),
257 | dtest = require('../lib/test');
258 |
259 | floader.load(dataDirectory + testFile, function(err, config){
260 |
261 | var suite = JSON.parse(config);
262 |
263 | for(var i = 0; i < suite.length; i++){
264 |
265 | var prefix = String("0000" + (i + 1)).slice(-4);
266 |
267 | // directory containing matrix data files for current test
268 | var directory = dataDirectory + prefix + '/';
269 |
270 | var test = suite[i];
271 | var names = test.id.map(function(spec, i){ return "id_" + i;});
272 | var types = test.id.map(function(spec, i){ return spec['type'];});
273 |
274 | var N = test.N; // number of rows
275 | distincts = test.id.map(function(spec, i){ return spec.K; });
276 |
277 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")"
278 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
279 | }
280 | });
281 |
282 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
283 | return function(t){
284 | t.plan(1);
285 |
286 | var names = id_names.concat(value_names);
287 | var types = id_types.concat(value_types);
288 | // load columns from files
289 | dtest.load(directory, names, types, function(err, columns){
290 |
291 | floader.load(directory + "out.json", function(err, out){
292 | var expected = JSON.parse(out);
293 |
294 | var column_set = {};
295 | for (var i = 0; i < names.length; i++){
296 | var name = names[i];
297 | var column = columns[i];
298 | column_set[name] = column;
299 | }
300 | var frame = new Frame(column_set);
301 |
302 | var g = frame;
303 | for(var i = 0; i < id_names.length; i++){
304 | id_name = id_names[i];
305 | g = g.groupby(id_name);
306 | }
307 | var actual = g.count();
308 |
309 | var assert;
310 | if(value_types[0] in dtest.float_types){
311 | assert = dtest.assert.tree.allclose;
312 | } else {
313 | assert = dtest.assert.tree.equal;
314 | }
315 |
316 | assert(t, actual, expected, null, RTOL, ATOL);
317 | });
318 |
319 | });
320 | };
321 | }
322 | */
323 |
--------------------------------------------------------------------------------
/test/mean.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | // simple instructive test cases
5 | function simpleTestCases(){
6 |
7 | tape("mean works with integers", function(t){
8 | t.plan(1);
9 | var frame = new Frame({
10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
11 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
12 | });
13 |
14 | var expected = 2.111111111; // (1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1) / 9
15 |
16 | var actual = frame.mean("value");
17 |
18 | dtest.assert.close(t, actual, expected);
19 | });
20 |
21 | tape("mean works with integers", function(t){
22 | t.plan(1);
23 | var frame = new Frame({
24 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1],
25 | "value" : [4, 2, 7, 1, 3, 6, 5, 2, 1, 7, 8]
26 | });
27 |
28 | var expected = 4.1818181818; // (4 + 2 + 7 + 1 + 3 + 6 + 5 + 2 + 1 + 7 + 8) / 11
29 |
30 | var actual = frame.mean("value");
31 |
32 | dtest.assert.close(t, actual, expected);
33 | });
34 |
35 | tape("mean works floats", function(t){
36 | t.plan(1);
37 | var frame = new Frame({
38 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
39 | "value" : [1.2, 6.4, 2.3, 12.1, 1.6, 3.5, 7.2, 2.1, 10.2]
40 | });
41 |
42 | var expected = 5.177777777777779; // (1.2 + 6.4 + 2.3 + 12.1 + 1.6 + 3.5 + 7.2 + 2.1 + 10.2) / 9
43 | var actual = frame.mean("value");
44 |
45 | dtest.assert.close(t, actual, expected);
46 | });
47 | }
48 |
49 | //simpleTestCases();
50 |
51 | var RTOL = 1e-05, // 1e-05
52 | ATOL = 1e-12; // 1e-12
53 |
54 | var dataDirectory = 'test/data/mean/',
55 | testFile = 'small.json';
56 |
57 | var floader = require('floader'),
58 | dtest = require('../lib/test');
59 |
60 | floader.load(dataDirectory + testFile, function(err, config){
61 |
62 | var suite = JSON.parse(config);
63 | simpleTestCases();
64 |
65 | for(var i = 0; i < suite.length; i++){
66 |
67 | var prefix = String("0000" + (i + 1)).slice(-4);
68 |
69 | // directory containing matrix data files for current test
70 | var directory = dataDirectory + prefix + '/';
71 |
72 | var test = suite[i];
73 |
74 | var names = test.id.map(function(spec, i){ return "id_" + i;});
75 | var types = test.id.map(function(spec, i){ return spec['type'];});
76 |
77 | var N = test.N; // number of rows
78 | var distincts = test.id.map(function(spec, i){ return spec.K; });
79 |
80 | var testName = "mean: " + N + " x " + "(" + distincts.join(", ") + ")";
81 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
82 | }
83 | });
84 |
85 | var OUT_FILENAME = "out.json";
86 |
87 |
88 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
89 | return function(t){
90 | t.plan(1);
91 |
92 | var names = id_names.concat(value_names);
93 | var types = id_types.concat(value_types);
94 |
95 | // which columns require a key file?
96 | var key_names = id_names.filter(function(item, i){
97 | return id_types[i] in dtest.string_types
98 | });
99 | var key_types = id_types.filter(function(item, i){
100 | return item in dtest.string_types
101 | });
102 |
103 | // load columns from files
104 | dtest.load(directory, names, types, function(err, columns){
105 |
106 | // load key files
107 | dtest.load_key(directory, key_names, key_types, function(err, keys){
108 |
109 | floader.load(directory + OUT_FILENAME, function(err, out){
110 | var expected = JSON.parse(out);
111 |
112 | var column_set = {};
113 | for (var i = 0; i < names.length; i++){
114 | var name = names[i];
115 | var column = columns[i];
116 | column_set[name] = column;
117 | }
118 | // keys map a small set of integers to other things (like strings)
119 | // they're a very simple form of fixed length coding
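// e.g. (hypothetical) a key ["red", "green", "blue"] decodes the integer
// column [0, 2, 1] into the strings ["red", "blue", "green"]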
120 | var key_set = {};
121 | for (var i = 0; i < keys.length; i++){
122 | var name = key_names[i];
123 | var key = keys[i];
124 | key_set[name] = key;
125 | }
126 |
127 | var frame = new Frame(column_set, key_set);
128 |
129 | //console.log(subset);
130 | var actual = frame.mean("value_0");
131 |
132 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL);
133 | });
134 |
135 | });
136 | });
137 | };
138 | }
139 |
--------------------------------------------------------------------------------
/test/ungroup.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | tape("ungroup single groupby has correct index", function(t){
5 | t.plan(1);
6 | var frame = new Frame({
7 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
8 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
9 | });
10 |
11 | var expected; // undefined
12 |
13 | var g = frame.groupby("id");
14 | g = g.ungroup();
15 | var actual = g._index;
16 |
17 | t.equals(actual, expected);
18 |
19 | });
20 |
21 | tape("ungroup on multiple groupby has correct index", function(t){
22 | t.plan(1);
23 | var frame = new Frame({
24 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
25 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
26 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
27 | });
28 |
29 |
30 | var expected = {
31 | "0" : [0, 1, 2, 5, 7],
32 | "1" : [3, 4, 6, 8]
33 | };
34 |
35 | var g = frame.groupby("id_0", "id_1");
36 | g = g.ungroup();
37 | var actual = g._index;
38 |
39 | t.equals(JSON.stringify(actual), JSON.stringify(expected));
40 | });
41 |
42 | tape("successive ungroup on multiple groupby has correct index", function(t){
43 | t.plan(1);
44 | var frame = new Frame({
45 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
46 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
47 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
48 | });
49 |
50 | var expected; // undefined
51 |
52 | var g = frame.groupby("id_0", "id_1");
53 | g = g.ungroup();
54 | g = g.ungroup();
55 | var actual = g._index;
56 |
57 | t.equals(actual, expected);
58 | });
59 |
60 | /*
61 | var dataDirectory = 'test/data/groupby.count/',
62 | testFile = 'small.json';
63 |
64 | var RTOL = 1e-05, // 1e-05
65 | ATOL = 1e-12; // 1e-12
66 |
67 | var floader = require('floader'),
68 | dtest = require('../lib/test');
69 |
70 | floader.load(dataDirectory + testFile, function(err, config){
71 |
72 | var suite = JSON.parse(config);
73 |
74 | for(var i = 0; i < suite.length; i++){
75 |
76 | var prefix = String("0000" + (i + 1)).slice(-4);
77 |
78 | // directory containing matrix data files for current test
79 | var directory = dataDirectory + prefix + '/';
80 |
81 | var test = suite[i];
82 | var names = test.id.map(function(spec, i){ return "id_" + i;});
83 | var types = test.id.map(function(spec, i){ return spec['type'];});
84 |
85 | var N = test.N; // number of rows
86 | distincts = test.id.map(function(spec, i){ return spec.K; });
87 |
88 | var testName = "groupby.count: " + N + " x " + "(" + distincts.join(", ") + ")"
89 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
90 | }
91 | });
92 |
93 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
94 | return function(t){
95 | t.plan(1);
96 |
97 | var names = id_names.concat(value_names);
98 | var types = id_types.concat(value_types);
99 | // load columns from files
100 | dtest.load(directory, names, types, function(err, columns){
101 |
102 | floader.load(directory + "out.json", function(err, out){
103 | var expected = JSON.parse(out);
104 |
105 | var column_set = {};
106 | for (var i = 0; i < names.length; i++){
107 | var name = names[i];
108 | var column = columns[i];
109 | column_set[name] = column;
110 | }
111 | var frame = new Frame(column_set);
112 |
113 | var g = frame;
114 | for(var i = 0; i < id_names.length; i++){
115 | id_name = id_names[i];
116 | g = g.groupby(id_name);
117 | }
118 | var actual = g.count();
119 |
120 | var assert;
121 | if(value_types[0] in dtest.float_types){
122 | assert = dtest.assert.tree.allclose;
123 | } else {
124 | assert = dtest.assert.tree.equal;
125 | }
126 |
127 | assert(t, actual, expected, null, RTOL, ATOL);
128 | });
129 |
130 | });
131 | };
132 | }
133 | */
134 |
--------------------------------------------------------------------------------
/test/where.in.sum.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | // simple instructive test cases
5 | function simpleTestCases(){
6 |
7 | tape("sum works with where", function(t){
8 | t.plan(1);
9 | var frame = new Frame({
10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
11 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
12 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
13 | });
14 |
15 | var expected = 9; // 3 + 1 + 4 + 1
16 |
17 | var actual = frame.where("id_0", 1).sum("value");
18 |
19 | t.equals(actual, expected);
20 | });
21 |
22 | tape("sum works with where", function(t){
23 | t.plan(1);
24 | var frame = new Frame({
25 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
26 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
27 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
28 | });
29 |
30 | var expected = 10; // 1 + 2 + 2 + 3 + 2
31 |
32 | var actual = frame.where("id_0", 0).sum("value");
33 |
34 | t.equals(actual, expected);
35 | });
36 |
37 | tape("where does not modify sum on original Frame", function(t){
38 | t.plan(1);
39 | var frame = new Frame({
40 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
41 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
42 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
43 | });
44 |
45 | var expected = 19; // 1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1
46 |
47 | var fw = frame.where("id_0", 0);
48 | var actual = frame.sum("value");
49 |
50 | t.equals(actual, expected);
51 | });
52 |
53 | tape("sum works with multiple wheres", function(t){
54 | t.plan(1);
55 | var frame = new Frame({
56 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
57 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
58 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
59 | });
60 |
61 | var expected = 8; // 3 + 4 + 1
62 | var actual = frame.where("id_0", 1).where("id_1", 1).sum("value");
63 |
64 | t.equals(actual, expected);
65 | });
66 |
67 |
68 | tape("sum works with where in", function(t){
69 | t.plan(1);
70 | var frame = new Frame({
71 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
72 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
73 | });
74 |
75 | var expected = 14; // 1 + 2 + 2 + 3 + 4 + 2
76 | frame = frame.where("id", [0, 2]);
77 | var actual = frame.sum("value");
78 |
79 | t.equals(actual, expected);
80 | });
81 |
82 |
83 | tape("sum works with where in undefined", function(t){
84 | t.plan(1);
85 | var frame = new Frame({
86 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
87 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
88 | });
89 |
90 | var a; // undefined
91 | var expected = 14; // 1 + 2 + 2 + 3 + 4 + 2
92 | frame = frame.where("id", [0, 2, a]);
93 | var actual = frame.sum("value");
94 |
95 | t.equals(actual, expected);
96 | });
97 | }
98 |
99 | //simpleTestCases();
100 |
101 | var SAMPLE = 10;
102 | function numberCompare(a, b){ return a - b; }
103 | // get a predefined subset of a column (matches test data generation)
104 | function generate_subset(column){
105 | //column = id_columns[id_name]
106 | var uniques = {};
107 | for(var i = 0; i < SAMPLE; i++){
108 | uniques[column[i]] = column[i];
109 | }
110 | var keys = Object.keys(uniques);
111 | var subset = keys.map(function(k){ return uniques[k]});
112 |
113 | var l = Math.ceil(subset.length / 2);
114 | return subset.sort(numberCompare).slice(0, l);
115 | }
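// A quick worked example with hypothetical data (not drawn from the suite):
// if the first SAMPLE values of a column are [2, 5, 2, 7, 5, 1, 7, 2, 1, 5],
// the uniques are {1, 2, 5, 7}, ceil(4 / 2) = 2, and generate_subset returns [1, 2].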
116 |
117 | var RTOL = 1e-05, // 1e-05
118 | ATOL = 1e-12; // 1e-12
119 |
120 | var dataDirectory = 'test/data/where.in.sum/',
121 | testFile = 'small.json';
122 |
123 | var floader = require('floader'),
124 | dtest = require('../lib/test');
125 |
126 | floader.load(dataDirectory + testFile, function(err, config){
127 |
128 | var suite = JSON.parse(config);
129 | simpleTestCases();
130 |
131 | for(var i = 0; i < suite.length; i++){
132 |
133 | var prefix = String("0000" + (i + 1)).slice(-4);
134 |
135 | // directory containing matrix data files for current test
136 | var directory = dataDirectory + prefix + '/';
137 |
138 | var test = suite[i];
139 |
140 | var names = test.id.map(function(spec, i){ return "id_" + i;});
141 | var types = test.id.map(function(spec, i){ return spec['type'];});
142 |
143 | var N = test.N; // number of rows
144 | var distincts = test.id.map(function(spec, i){ return spec.K; });
145 |
146 | var testName = "where.in.sum: " + N + " x " + "(" + distincts.join(", ") + ")";
147 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
148 | }
149 | });
150 |
151 | var OUT_FILENAME = "out.json";
152 |
153 |
154 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
155 | return function(t){
156 | t.plan(1);
157 |
158 | var names = id_names.concat(value_names);
159 | var types = id_types.concat(value_types);
160 |
161 | // which columns require a key file?
162 | var key_names = id_names.filter(function(item, i){
163 | return id_types[i] in dtest.string_types
164 | });
165 | var key_types = id_types.filter(function(item, i){
166 | return item in dtest.string_types
167 | });
168 |
169 | // load columns from files
170 | dtest.load(directory, names, types, function(err, columns){
171 |
172 | // load key files
173 | dtest.load_key(directory, key_names, key_types, function(err, keys){
174 |
175 | floader.load(directory + OUT_FILENAME, function(err, out){
176 | var expected = JSON.parse(out);
177 |
178 | var column_set = {};
179 | for (var i = 0; i < names.length; i++){
180 | var name = names[i];
181 | var column = columns[i];
182 | column_set[name] = column;
183 | }
184 | // keys map a small set of integers to other things (like strings)
185 | // they're a very simple form of fixed length coding
186 | var key_set = {};
187 | for (var i = 0; i < keys.length; i++){
188 | var name = key_names[i];
189 | var key = keys[i];
190 | key_set[name] = key;
191 | }
192 |
193 | var frame = new Frame(column_set, key_set);
194 |
195 | var subset = generate_subset(column_set["id_0"]);
196 | //console.log(subset);
197 | frame = frame.where("id_0", subset);
198 | var actual = frame.sum("value_0");
199 |
200 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL);
201 | });
202 |
203 | });
204 | });
205 | };
206 | }
207 |
--------------------------------------------------------------------------------
/test/where.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | BitArray = require('bit-array'),
3 | Frame = require('../lib/frame');
4 |
5 | tape("where creates correct filter", function(t){
6 | t.plan(1);
7 |
8 | var frame = new Frame({
9 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
10 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
11 | });
12 |
13 | //frame.where(row => row.id == 1);
14 | frame = frame.where("id", v => v == 1);
15 |
16 | var expected = new BitArray(9);
17 |
18 | expected.set(3, true);
19 | expected.set(4, true);
20 | expected.set(6, true);
21 | expected.set(8, true);
22 |
23 | var actual = frame._filter;
24 | t.equals(actual.toString(), expected.toString());
25 | });
26 |
27 | tape("where with numerical argument creates correct filter", function(t){
28 | t.plan(1);
29 |
30 | var frame = new Frame({
31 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
32 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
33 | });
34 |
35 | frame = frame.where("id", 1);
36 |
37 | var expected = new BitArray(9);
38 |
39 | expected.set(3, true);
40 | expected.set(4, true);
41 | expected.set(6, true);
42 | expected.set(8, true);
43 |
44 | var actual = frame._filter;
45 | t.equals(actual.toString(), expected.toString());
46 | });
47 |
48 | tape("where with array argument creates correct filter", function(t){
49 | t.plan(1);
50 |
51 | var frame = new Frame({
52 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
53 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
54 | });
55 |
56 | frame = frame.where("id", [0, 2]);
57 |
58 | var expected = new BitArray(9);
59 |
60 | expected.set(0, true);
61 | expected.set(1, true);
62 | expected.set(2, true);
63 | expected.set(5, true);
64 | expected.set(6, true);
65 | expected.set(7, true);
66 |
67 | var actual = frame._filter;
68 | t.equals(actual.toString(), expected.toString());
69 | });
70 |
71 | tape("where creates second filter correctly", function(t){
72 | t.plan(1);
73 |
74 | var frame = new Frame({
75 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
76 | "id_1" : [0, 0, 1, 1, 0, 1, 0, 0, 1],
77 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
78 | });
79 |
80 | //frame.where(row => row.id == 1);
81 | frame = frame.where("id_1", id => id == 1);
82 | frame = frame.where("id_0", id => id == 1);
83 |
84 | var expected = new BitArray(9);
85 |
86 | expected.set(3, true);
87 | expected.set(8, true);
88 |
89 | var actual = frame._filter;
90 | t.equals(actual.toString(), expected.toString());
91 | });
92 |
93 | tape("where filters column via accessor", function(t){
94 | t.plan(1);
95 |
96 | var frame = new Frame({
97 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
98 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
99 | });
100 |
101 | //frame.where(row => row.id == 1);
102 | frame = frame.where("id", v => v == 1);
103 |
104 | var expected = [3, 1, 4, 1];
105 |
106 |
107 | var actual = frame["value"];
108 | t.equals(actual.toString(), expected.toString());
109 | });
110 |
111 | tape("where filters keyed column via accessor", function(t){
112 | t.plan(1);
113 |
114 | var columns = {
115 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
116 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1]
117 | };
118 | var keys = {
119 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"]
120 | };
121 |
122 | var frame = new Frame(columns, keys);
123 |
124 | frame = frame.where("id", v => v == 1);
125 |
126 | var expected = ["red", "fish", "blue", "fish"];
127 |
128 |
129 | var actual = frame["value"];
130 | t.equals(actual.toString(), expected.toString());
131 | });
132 |
133 | tape("where accepts string filter on keyed column", function(t){
134 | t.plan(1);
135 |
136 | var columns = {
137 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
138 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1]
139 | };
140 | var keys = {
141 | "id" : ["thoreau", "seuss"],
142 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"]
143 | };
144 |
145 | var frame = new Frame(columns, keys);
146 |
147 | frame = frame.where("id", "thoreau");
148 |
149 | var expected = ["add", "fish", "to", "my", "fare"];
150 |
151 |
152 | var actual = frame["value"];
153 | t.equals(actual.toString(), expected.toString());
154 | });
155 |
156 | tape("where accepts function with string on keyed column", function(t){
157 | t.plan(1);
158 |
159 | var columns = {
160 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
161 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1]
162 | };
163 | var keys = {
164 | "id" : ["thoreau", "seuss"],
165 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"]
166 | };
167 |
168 | var frame = new Frame(columns, keys);
169 |
170 | frame = frame.where("id", v => v == "seuss");
171 |
172 | var expected = ["red", "fish", "blue", "fish"];
173 |
174 | var actual = frame["value"];
175 | t.equals(actual.toString(), expected.toString());
176 | });
177 |
178 | tape("where filter can be modified", function(t){
179 | t.plan(2);
180 |
181 | var columns = {
182 | "id" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
183 | "value" : [6, 1, 5, 3, 1, 2, 4, 0, 1]
184 | };
185 | var keys = {
186 | "id" : ["thoreau", "seuss"],
187 | "value" : ["fare", "fish", "my", "red", "blue", "to", "add"]
188 | };
189 |
190 | var frame = new Frame(columns, keys);
191 |
192 |
193 | frame = frame.where("id", "thoreau");
194 | var expected = ["add", "fish", "to", "my", "fare"];
195 |
196 | var actual = frame["value"];
197 | t.equals(actual.toString(), expected.toString());
198 |
199 | frame = frame.where("id", v => v == "seuss");
200 | var expected = ["red", "fish", "blue", "fish"];
201 |
202 | var actual = frame["value"];
203 | t.equals(actual.toString(), expected.toString());
204 | });
205 | /*
206 | function eq(a){
207 | return function(v){ v == a; };
208 | }
209 |
210 | function in(arr){
211 | var set = {};
212 | for (a in arr) set[a] = true;
213 | return function(v){ return v in set;};
214 | }*/
215 |
--------------------------------------------------------------------------------
/test/where.mean.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape'),
2 | Frame = require('../lib/frame');
3 |
4 | // simple instructive test cases
5 | function simpleTestCases(){
6 |
7 | tape("mean works with where", function(t){
8 | t.plan(1);
9 | var frame = new Frame({
10 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
11 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
12 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
13 | });
14 |
15 | var expected = 2.25; // (3 + 1 + 4 + 1) / 4
16 |
17 | var actual = frame.where("id_0", 1).mean("value");
18 |
19 | dtest.assert.close(t, actual, expected);
20 | });
21 |
22 | tape("mean works with where", function(t){
23 | t.plan(1);
24 | var frame = new Frame({
25 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
26 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
27 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
28 | });
29 |
30 | var expected = 2; // (1 + 2 + 2 + 3 + 2) / 5
31 |
32 | var actual = frame.where("id_0", 0).mean("value");
33 |
34 | dtest.assert.close(t, actual, expected);
35 | });
36 |
37 | tape("where does not modify mean on original Frame", function(t){
38 | t.plan(1);
39 | var frame = new Frame({
40 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
41 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
42 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
43 | });
44 |
45 | var expected = 2.1111111111; // (1 + 2 + 2 + 3 + 1 + 3 + 4 + 2 + 1) / 9
46 |
47 | var fw = frame.where("id_0", 0);
48 | var actual = frame.mean("value");
49 |
50 | dtest.assert.close(t, actual, expected);
51 | });
52 |
53 | tape("mean works with multiple wheres", function(t){
54 | t.plan(1);
55 | var frame = new Frame({
56 | "id_0" : [0, 0, 0, 1, 1, 0, 1, 0, 1],
57 | "id_1" : [0, 0, 1, 1, 0, 0, 1, 0, 1],
58 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
59 | });
60 |
61 | var expected = 2.666666666; // (3 + 4 + 1) / 3
62 | var actual = frame.where("id_0", 1).where("id_1", 1).mean("value");
63 |
64 | dtest.assert.close(t, actual, expected);
65 | });
66 |
67 |
68 | tape("mean works with where in", function(t){
69 | t.plan(1);
70 | var frame = new Frame({
71 | "id" : [0, 2, 0, 1, 1, 0, 2, 0, 1],
72 | "value" : [1, 2, 2, 3, 1, 3, 4, 2, 1]
73 | });
74 |
75 | var expected = 2.3333333333333; // (1 + 2 + 2 + 3 + 4 + 2) / 6
76 | frame = frame.where("id", [0, 2]);
77 | var actual = frame.mean("value");
78 |
79 | dtest.assert.close(t, actual, expected);
80 | });
81 | }
82 |
83 | //simpleTestCases();
84 |
85 | var RTOL = 1e-05, // 1e-05
86 | ATOL = 1e-12; // 1e-12
87 |
88 | var dataDirectory = 'test/data/where.mean/',
89 | testFile = 'small.json';
90 |
91 | var floader = require('floader'),
92 | dtest = require('../lib/test');
93 |
94 | floader.load(dataDirectory + testFile, function(err, config){
95 |
96 | var suite = JSON.parse(config);
97 | simpleTestCases();
98 |
99 | for(var i = 0; i < suite.length; i++){
100 |
101 | var prefix = String("0000" + (i + 1)).slice(-4);
102 |
103 | // directory containing matrix data files for current test
104 | var directory = dataDirectory + prefix + '/';
105 |
106 | var test = suite[i];
107 |
108 | var names = test.id.map(function(spec, i){ return "id_" + i;});
109 | var types = test.id.map(function(spec, i){ return spec['type'];});
110 |
111 | var N = test.N; // number of rows
112 | var distincts = test.id.map(function(spec, i){ return spec.K; });
113 |
114 | var testName = "where.mean: " + N + " x " + "(" + distincts.join(", ") + ")";
115 | tape(testName, generateTestCase(directory, names, types, ["value_0"], [test.value[0].type]));
116 | }
117 | });
118 |
119 | var OUT_FILENAME = "out.json";
120 |
121 |
122 | function generateTestCase(directory, id_names, id_types, value_names, value_types){
123 | return function(t){
124 | t.plan(1);
125 |
126 | var names = id_names.concat(value_names);
127 | var types = id_types.concat(value_types);
128 |
129 | // which columns require a key file?
130 | var key_names = id_names.filter(function(item, i){
131 | return id_types[i] in dtest.string_types
132 | });
133 | var key_types = id_types.filter(function(item, i){
134 | return item in dtest.string_types
135 | });
136 |
137 | // load columns from files
138 | dtest.load(directory, names, types, function(err, columns){
139 |
140 | // load key files
141 | dtest.load_key(directory, key_names, key_types, function(err, keys){
142 |
143 | floader.load(directory + OUT_FILENAME, function(err, out){
144 | var expected = JSON.parse(out);
145 |
146 | var column_set = {};
147 | for (var i = 0; i < names.length; i++){
148 | var name = names[i];
149 | var column = columns[i];
150 | column_set[name] = column;
151 | }
152 | // keys map a small set of integers to other things (like strings)
153 | // they're a very simple form of fixed length coding
154 | var key_set = {};
155 | for (var i = 0; i < keys.length; i++){
156 | var name = key_names[i];
157 | var key = keys[i];
158 | key_set[name] = key;
159 | }
160 |
161 | var frame = new Frame(column_set, key_set);
162 |
163 | var value = column_set["id_0"][0];
164 | //console.log(value);
165 | frame = frame.where("id_0", value);
166 | var actual = frame.mean("value_0");
167 |
168 | dtest.assert.close(t, actual, expected, "close", RTOL, ATOL);
169 | });
170 |
171 | });
172 | });
173 | };
174 | }
175 |
--------------------------------------------------------------------------------