├── .gitignore
├── .travis.yml
├── README.md
├── lib
├── chart.js
└── datakit.js
├── package.json
├── plot.png
└── spec
├── support
└── jasmine.json
└── test
├── test.csv
├── test2.csv
└── testSpec.js
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | node_modules
3 | *.swp
4 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 | - '0.12'
4 | - '0.11'
5 | - '0.10'
6 | - 'iojs'
7 | before_script:
8 | - 'npm i -g jasmine'
9 | deploy:
10 | provider: npm
11 | email: ne2210@columbia.edu
12 | api_key:
13 | secure: WDjnZSc8q9Oh2ro0Gpoa75/s2Nb4k1CguyZeQwibt3dl2+QveuTdbg9mAsKmbwSBSy+e+EWiZI5/C0j+pDGlZy/pjqse7c1VGcLzVqeYrSbJtvFXKl5eEUXbM0tuvw9ASKlHGM7P88xY/oe8osqoXmUnkjUHPWsXaTHhsG3HqDc=
14 | on:
15 | tags: true
16 | repo: NathanEpstein/datakit
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # datakit
2 |
3 |
4 |
5 |
6 | ## About
7 | A lightweight library/framework for data analysis in JavaScript.
8 |
9 | ## Usage
10 |
11 | ```npm install datakitjs --save```
12 |
13 | ## Documentation & Examples
14 |
15 | ### Reading, Filtering, & Plotting Data
16 | ```javascript
17 | var dk = require('datakitjs');
18 |
19 | //READ A CSV FILE
20 |
21 | //file.csv
22 | // COL1, COL2
23 | // val11, val12
24 | // val21, val22
25 |
26 | dk.csv('file.csv', function(data) {
27 | console.log(data);
28 | });
29 |
30 | //Output:
31 | //[{ COL1: val11, COL2: val12 }, { COL1: val21, COL2: val22 }]
32 |
33 |
34 | //GET A COLUMN FROM AN ARRAY OF ROW OBJECTS
35 | dk.csv('file.csv', function(data) {
36 | var c2 = dk.col(data, 'COL2');
37 | console.log(c2);
38 | });
39 |
40 | //Output:
41 | //[val12, val22]
42 |
43 | // By default, dk.csv will convert all values to strings. You can convert select
44 | // columns to numbers by passing an array of column names to 'dk.numeric'.
45 |
46 | //file2.csv
47 | // COL1, COL2
48 | // val11, 1
49 | // val21, 2
50 |
51 | dk.csv('file2.csv', function(data) {
52 | var d = dk.numeric(data, ['COL2'], 0) // The third parameter is the value used to fill
53 | // in blank cells. Its default value is 0.
54 | var c2 = dk.col(d, 'COL2');
55 | console.log(c2);
56 | });
57 |
58 | //Output:
59 | //[1, 2]
60 |
61 |
62 | //PLOT ARRAY(S) OF DATA
63 |
64 | var chart = new dk.Chart({
65 | //optional config
66 | height: 500,
67 | width: 500,
68 | xLab: 'x-Axis Label',
69 | yLab: 'y-Axis Label'
70 | });
71 |
72 | chart.addDataSet({
73 | x: [1, 2, 3],
74 | y: [4, 5, 6],
75 | z: [2, 3, 5],
76 | colors: ['blue', 'green', 'red']
77 | }).addDataSet({
78 | x: [1, 10],
79 | y: [2, -1],
80 | type: 'line'
81 | }).addDataSet({
82 | x: [10, 5, 1],
83 | y: [4, 5, 2],
84 | labels: ["first", "second", "third"]
85 | }).plot();
86 | ```
87 |
88 | ### Statistical Methods
89 |
90 | ```javascript
91 | var dk = require('datakitjs');
92 |
93 | //MEAN OF AN ARRAY
94 | dk.mean([1, 2, 3]); //returns 2
95 |
96 | //STANDARD DEVIATION AND VARIANCE OF AN ARRAY
97 | dk.sd([1, 2, 3]); //returns 1
98 | dk.vari([1, 2, 3]); //returns 1
99 |
100 | //COVARIANCE OF TWO ARRAYS
101 | dk.cov([1, 2, 3], [3, 2, 1]); //returns -1
102 |
103 | //SIMPLE LINEAR REGRESSION
104 |
105 | var x = [1, 2, 3];
106 | var y = [2, 1, 3];
107 |
108 | var model = dk.reg(x, y);
109 |
110 | // model.f is a function that returns the estimated y for an input x (estimated via standard OLS regression)
111 | // model.f = function(x) {
112 | // return (a + b * x);
113 | // };
114 |
115 | // model.pts is an array of the estimated y for each element of x
116 | // model.pts = [1.5, 2, 2.5];
117 |
118 | // model.endPoints is an object with the coordinates of the boundary points
119 | // model.endPoints = { x1: 1, x2: 3, y1: 1.5, y2: 2.5 };
120 |
121 | ```
122 |
123 | ### Convenience Methods
124 | ```javascript
125 | var dk = require('datakitjs');
126 |
127 | //GENERATE AN ARRAY WITH A SEQUENCE OF NUMBERS
128 |
129 | dk.seq(1, 5); //returns [1, 2, 3, 4, 5]
130 |
131 | dk.seq(0, 1, 0.25); //returns [0, 0.25, 0.5, 0.75, 1]
132 |
133 | //GENERATE AN ARRAY WITH REPEATED VALUE
134 |
135 | dk.rep(1, 5); //returns [1, 1, 1, 1, 1]
136 |
137 | //CHECK IF NUMBERS ARE CLOSE
138 | dk.isclose(0, Math.pow(10, -15)); //returns true
139 |
140 | dk.isclose(0, Math.pow(10, -5)); //returns false
141 |
142 | //SUM AN ARRAY OF NUMBERS
143 | //uses Kahan summation
144 |
145 | dk.sum([1, 2, 3]); //returns 6
146 |
147 | //PRODUCT OF AN ARRAY OF NUMBERS
148 | //implementation from 'Accurate Floating Point Product' - Stef Graillat
149 |
150 | dk.prod([1, 2, 3]); //returns 6
151 |
152 | //MAX AND MIN OF AN ARRAY
153 | var x = [1, 2, 3];
154 | dk.min(x); //returns 1
155 | dk.max(x); //returns 3
156 |
157 | ```
158 |
159 | ### Random Numbers
160 | ```javascript
161 | var dk = require('datakitjs');
162 |
163 | //GET AN ARRAY OF EXPONENTIALLY DISTRIBUTED VALUES
164 |
165 | dk.exp(3, 1); //returns [0.3584189321510761, 1.0466439500242446, 0.08887770301056963]
166 |
167 |
168 | //GET AN ARRAY OF NORMALLY DISTRIBUTED VALUES
169 |
170 | dk.norm(3, 0, 1); //returns [-1.709768103193772, 0.23530041388459744, 0.4431320382580479]
171 |
172 | //GET AN ARRAY OF UNIFORMLY DISTRIBUTED VALUES
173 |
174 | dk.uni(3); //returns [0.30658303829841316, 0.1601463456172496, 0.8538850131444633]
175 |
176 | ```
177 |
178 | ## Testing
179 |
180 | Just run `npm test` to run the tests.
181 |
182 |
183 | ## Contributing
184 |
185 | Contributions of additional methods for random number generation, data filtration, convenience functions, and common statistical analyses are welcome. Just add tests following the structure in `spec/test/testSpec.js`.
186 |
187 | ## License
188 |
189 | **The MIT License (MIT)**
190 |
191 | > Copyright (c) 2015 Nathan Epstein
192 | >
193 | > Permission is hereby granted, free of charge, to any person obtaining a copy
194 | > of this software and associated documentation files (the "Software"), to deal
195 | > in the Software without restriction, including without limitation the rights
196 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
197 | > copies of the Software, and to permit persons to whom the Software is
198 | > furnished to do so, subject to the following conditions:
199 | >
200 | > The above copyright notice and this permission notice shall be included in
201 | > all copies or substantial portions of the Software.
202 | >
203 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
204 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
205 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
206 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
207 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
208 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
209 | > THE SOFTWARE.
210 |
211 |
--------------------------------------------------------------------------------
/lib/chart.js:
--------------------------------------------------------------------------------
/**
 * Chart that draws scatter, bubble, line and text-label datasets into an SVG.
 *
 * Relies on a global `d3` object providing `d3.select`, `d3.scale.linear`
 * and `d3.svg.axis`.
 *
 * @param {Object} [config] Optional settings:
 *   - height, width: SVG size in pixels (each defaults to 500)
 *   - selector: selector of the element the SVG is appended to (default 'body')
 *   - xLab, yLab: axis label text
 *
 * Datasets are plain objects with `x` and `y` arrays plus optional fields:
 *   - type: 'line' draws straight segments between consecutive points
 *   - labels: array of strings drawn at each point (optional `color`)
 *   - z: array controlling the circle radius (bubble chart)
 *   - colors: array of fill colors, one per circle
 */
var Chart = function(config) {
  var self = this;

  // datasets stay private; they are only reachable via addDataSet()/datasets()
  var datasets = [];

  self.config = config || {};
  self.config.height = self.config.height || 500;
  self.config.width = self.config.width || 500;
  // the drawing area ("buffer") is smaller than the SVG to leave room for axes and labels
  self.config.bufferHeight = self.config.height - Math.min(200, self.config.height * 0.2);
  self.config.bufferWidth = self.config.width - Math.min(200, self.config.width * 0.4);

  /**
   * Builds the SVG, the scales and the axes, then draws every dataset.
   * @returns {Chart} this chart, for chaining
   * @throws {Error} if no dataset has been added
   */
  this.render = function() {
    if (datasets.length === 0) {
      // without data there are no bounds to build the scales from
      throw new Error('No datasets to plot.');
    }
    buildCanvas();
    buildMapping();
    buildAxes();
    plotDataSets();
    return self;
  };

  /** @returns {Array} a shallow copy of the registered datasets */
  this.datasets = function() {
    return datasets.slice();
  };

  /**
   * Registers a dataset to be drawn by render().
   * @returns {Chart} this chart, for chaining
   */
  this.addDataSet = function(dataset) {
    datasets.push(dataset);
    return self;
  };

  // appends the SVG and the translated group that holds everything drawn
  var buildCanvas = function() {
    self.canvas = d3.select(self.config.selector || 'body')
      .append('svg')
      .attr('height', self.config.height)
      .attr('width', self.config.width);

    self.buffer = self.canvas.append('g');

    // center the drawing area inside the SVG
    var xTranslate = (self.config.width - self.config.bufferWidth) / 2;
    var yTranslate = (self.config.height - self.config.bufferHeight) / 2;

    self.buffer.attr(
      'transform',
      'translate(' + xTranslate + ', ' + yTranslate + ')'
    );
  };

  // builds linear scales spanning the x/y extent of all datasets
  var buildMapping = function() {
    var xMin = mini(datasets[0].x);
    var yMin = mini(datasets[0].y);
    var xMax = maxi(datasets[0].x);
    var yMax = maxi(datasets[0].y);

    datasets.slice(1).forEach(function(dataset) {
      xMin = Math.min(mini(dataset.x), xMin);
      yMin = Math.min(mini(dataset.y), yMin);
      xMax = Math.max(maxi(dataset.x), xMax);
      yMax = Math.max(maxi(dataset.y), yMax);
    });

    self.xMap = d3.scale.linear()
      .domain([xMin, xMax])
      .range([0, self.config.bufferWidth]);

    // domain is reversed because SVG y coordinates grow downwards
    self.yMap = d3.scale.linear()
      .domain([yMax, yMin])
      .range([0, self.config.bufferHeight]);
  };

  // draws both axes and their text labels
  var buildAxes = function() {
    var xAxis = d3.svg.axis()
      .scale(self.xMap);

    var yAxis = d3.svg.axis()
      .scale(self.yMap)
      .orient('left');

    self.buffer.append('g')
      .attr('transform', 'translate(0,' + self.config.bufferHeight + ')')
      .call(xAxis);

    self.buffer.append('g')
      .call(yAxis);

    self.buffer.append('text')
      .attr('x', self.config.bufferWidth * 0.5)
      .attr('y', self.config.bufferHeight + 50)
      .text(self.config.xLab)
      .attr('text-anchor', 'middle');

    self.buffer.append('text')
      .attr('x', -self.config.bufferHeight * 0.5)
      .attr('y', -50)
      .attr('transform', 'rotate(-90)')
      .text(self.config.yLab)
      .attr('text-anchor', 'middle');
  };

  // draws one black segment between each pair of consecutive points
  var plotLine = function(dataset) {
    for (var i = 1; i < dataset.x.length; i++) {
      self.buffer.append('line')
        .attr('stroke-width', 1)
        .attr('stroke', 'black')
        .attr('x1', self.xMap(dataset.x[i - 1]))
        .attr('x2', self.xMap(dataset.x[i]))
        .attr('y1', self.yMap(dataset.y[i - 1]))
        .attr('y2', self.yMap(dataset.y[i]));
    }
  };

  // draws each label centered on its point
  var plotLabels = function(dataset) {
    for (var i = 0; i < dataset.x.length; i++) {
      self.buffer.append('text')
        .attr('x', self.xMap(dataset.x[i]))
        .attr('y', self.yMap(dataset.y[i]))
        .text(dataset.labels[i])
        .attr('text-anchor', 'middle')
        .attr('stroke', dataset.color || 'black');
    }
  };

  // draws one circle per point; with `z` the radius scales with the z value
  var plotScatter = function(dataset) {
    var hasZ = typeof dataset.z !== 'undefined';
    var hasColors = typeof dataset.colors !== 'undefined';
    var area = self.config.height * self.config.width;

    // z bounds do not change per point, so compute them once
    var zMin = hasZ ? mini(dataset.z) : null;
    var zMax = hasZ ? maxi(dataset.z) : null;

    for (var i = 0; i < dataset.x.length; i++) {
      var radius;
      if (hasZ) {
        var minSize = area * 0.000025;
        var sizeMultiplier = area * (0.0001);
        // when every z is equal the range is 0; use the minimum size
        // instead of dividing by zero (which would give a NaN radius)
        var proportionOfMaxValue = zMax === zMin ?
          0 :
          (dataset.z[i] - zMin) / (zMax - zMin);
        radius = minSize + sizeMultiplier * proportionOfMaxValue;
      }
      else {
        radius = area * (0.00001);
      }

      self.buffer.append('circle')
        .attr('r', radius)
        .attr('cx', self.xMap(dataset.x[i]))
        .attr('cy', self.yMap(dataset.y[i]))
        // bubbles are translucent so overlapping circles stay visible
        .attr('opacity', hasZ ? 0.3 : 1)
        .attr('fill', hasColors ? dataset.colors[i] : 'none')
        .attr('stroke', 'black');
    }
  };

  // dispatches each dataset to the matching drawing routine
  var plotDataSets = function() {
    datasets.forEach(function(dataset) {
      if (dataset.type === 'line') {
        plotLine(dataset);
      }
      else if (typeof dataset.labels !== 'undefined') {
        plotLabels(dataset);
      }
      else {
        // make a scatter plot if not a line or a label set
        plotScatter(dataset);
      }
    });
  };

  function mini(arr) {
    return Math.min.apply(null, arr);
  }

  function maxi(arr) {
    return Math.max.apply(null, arr);
  }
};
179 |
180 | module.exports = Chart;
181 |
182 |
--------------------------------------------------------------------------------
/lib/datakit.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | var fs = require('fs');
3 | var exec = require('child_process').exec;
4 | var express = require('express');
5 |
6 | // SUMMARY STATISTICS/CONVENIENCE METHODS
7 |
8 | //check if args are close (implementation from numpy)
/**
 * Tests whether two numbers are approximately equal, using the asymmetric
 * criterion |a - b| <= atol + rtol * |b|.
 *
 * @param {number} a value to test
 * @param {number} b reference value
 * @param {number} [rtol=1e-5] relative tolerance
 * @param {number} [atol=1e-8] absolute tolerance
 * @returns {boolean} true when a is within tolerance of b
 */
var isclose = module.exports.isclose = function(a, b, rtol, atol) {
  rtol = rtol === undefined ? Math.pow(10, -5) : rtol;
  atol = atol === undefined ? Math.pow(10, -8) : atol;

  // An infinity is only close to an identical infinity. Without this guard
  // the formula reports every finite `a` as close to an infinite `b`
  // (Infinity <= Infinity) and equal infinities as not close (NaN <= ...).
  if (Math.abs(a) === Infinity || Math.abs(b) === Infinity) {
    return a === b;
  }
  return (Math.abs(a - b) <= (atol + rtol * Math.abs(b)));
};
14 | //array sum (Kahan summation algorithm)
/**
 * Adds up the elements of an array with Kahan (compensated) summation.
 *
 * @param {number[]} arr values to add
 * @returns {number} the sum; 0 for an empty array
 */
var sum = module.exports.sum = function(arr) {
  var total = 0;
  // running estimate of the low-order bits lost by previous additions
  var compensation = 0;
  var idx = 0;

  while (idx < arr.length) {
    var corrected = arr[idx] - compensation;
    var next = total + corrected;
    // (next - total) is what was actually added; the difference from
    // `corrected` is the rounding error carried into the next step
    compensation = (next - total) - corrected;
    total = next;
    idx += 1;
  }
  return total;
};
26 |
27 | // from 'Accurate Floating Point Product' - Stef Graillat
28 | // EF split of float into 2 parts
/**
 * Splits a float into a high part and a low part whose sum is the input.
 *
 * @param {number} val value to split
 * @returns {number[]} [high, low]
 */
var split = function(val) {
  // multiplying by 2^27 + 1 and subtracting back rounds away the low bits
  var scaled = (Math.pow(2, 27) + 1) * val;
  var high = scaled - (scaled - val);
  return [high, val - high];
};
36 | // EFT of the product of 2 floats
/**
 * Error-free transformation of a product: returns the floating point
 * product of a and b together with a correction term for its rounding error.
 *
 * @param {number} a first factor
 * @param {number} b second factor
 * @returns {number[]} [product, roundingError]
 */
var twoProd = function(a, b) {
  var product = a * b;
  var partsA = split(a);
  var partsB = split(b);
  var aHigh = partsA[0];
  var aLow = partsA[1];
  var bHigh = partsB[0];
  var bLow = partsB[1];

  // the order of the subtractions is significant; do not re-associate
  var roundingError = aLow * bLow - (((product - aHigh * bHigh) - aLow * bHigh) - aHigh * bLow);
  return [product, roundingError];
};
44 | //array product (compensated product method)
/**
 * Multiplies the elements of an array using a compensated product: the
 * rounding error of every multiplication is tracked and added back at the end.
 *
 * @param {number[]} arr values to multiply
 * @returns {number} the product; 1 (the empty product) for an empty array
 */
var prod = module.exports.prod = function(arr) {
  // arr[0] would be undefined here and turn the result into NaN
  if (arr.length === 0) {
    return 1;
  }

  var p_ = arr[0];
  // accumulated rounding error of the running product
  var e_ = 0;
  for (var i = 1; i < arr.length; i++) {
    var step = twoProd(p_, arr[i]);
    p_ = step[0];
    // scale the old error by the new factor and add this step's error
    e_ = e_ * arr[i] + step[1];
  }
  return (p_ + e_);
};
55 | //array mean
/**
 * Arithmetic mean of an array, computed as sum(arr) * (1 / arr.length)
 * through the module's own `sum` and `prod`.
 *
 * @param {number[]} arr values to average
 * @returns {number} the mean
 */
var mean = module.exports.mean = function(arr) {
  var total = sum(arr);
  var reciprocalCount = 1 / arr.length;
  return prod([total, reciprocalCount]);
};
59 | //array max and min
/**
 * Smallest element of an array.
 *
 * Folds with Math.min instead of Math.min.apply(null, arr), because apply
 * passes every element as a function argument and throws a RangeError once
 * the array exceeds the engine's argument limit.
 *
 * @param {number[]} arr values to scan
 * @returns {number} the minimum; Infinity for an empty or missing array,
 *   NaN if any element is not numeric
 */
var min = module.exports.min = function(arr) {
  var smallest = Infinity;
  // Math.min.apply(null, undefined) returned Infinity; keep that behavior
  if (arr == null) {
    return smallest;
  }
  for (var i = 0; i < arr.length; i++) {
    smallest = Math.min(smallest, arr[i]);
  }
  return smallest;
};
/**
 * Largest element of an array.
 *
 * Folds with Math.max instead of Math.max.apply(null, arr), because apply
 * passes every element as a function argument and throws a RangeError once
 * the array exceeds the engine's argument limit.
 *
 * @param {number[]} arr values to scan
 * @returns {number} the maximum; -Infinity for an empty or missing array,
 *   NaN if any element is not numeric
 */
var max = module.exports.max = function(arr) {
  var largest = -Infinity;
  // Math.max.apply(null, undefined) returned -Infinity; keep that behavior
  if (arr == null) {
    return largest;
  }
  for (var i = 0; i < arr.length; i++) {
    largest = Math.max(largest, arr[i]);
  }
  return largest;
};
66 | //mean shifted covariance to stabilize against catastrophic cancellation
/**
 * Sample covariance of two arrays (divides by n - 1), computed from the
 * deviations of each element from its array's mean.
 *
 * @param {number[]} arr1 first sample; its length determines n
 * @param {number[]} arr2 second sample
 * @returns {number} the covariance, or 0 when arr1 has fewer than 2 elements
 */
var cov = module.exports.cov = function(arr1, arr2) {
  var count = arr1.length;
  if (count < 2) {
    return 0;
  }

  var center1 = mean(arr1);
  var center2 = mean(arr2);
  var total = 0;
  var idx = 0;

  while (idx < count) {
    // each term is divided by (n - 1) before being accumulated
    total += (arr1[idx] - center1) * (arr2[idx] - center2) / (count - 1);
    idx += 1;
  }
  return total;
};
81 | //std deviation and variance
/**
 * Sample variance of an array: the covariance of the array with itself.
 *
 * @param {number[]} arr sample values
 * @returns {number} the variance
 */
var vari = module.exports.vari = function(arr) {
  var selfCovariance = cov(arr, arr);
  return selfCovariance;
};
/**
 * Sample standard deviation of an array: the square root of the
 * covariance of the array with itself.
 *
 * @param {number[]} arr sample values
 * @returns {number} the standard deviation
 */
var sd = module.exports.sd = function(arr) {
  var selfCovariance = cov(arr, arr);
  return Math.sqrt(selfCovariance);
};
88 |
89 | //READ AND MANIPULATE DATA
90 |
91 | //read csv
/**
 * Reads a comma-separated file and passes its rows to `callback` as an
 * array of objects keyed by the header line. All values are strings.
 *
 * Fields are split on every comma; quoted fields are not supported.
 *
 * @param {string} path file to read
 * @param {function(Array, Error=)} callback receives the row objects. If the
 *   file cannot be read it receives an empty array and the read error as a
 *   second argument.
 */
var csv = module.exports.csv = function(path, callback) {
  fs.readFile(path, function(err, data) {
    if (err) {
      // Previously the error was dropped and callers just saw []. Keep that
      // first argument for compatibility, but surface the error as well.
      callback([], err);
      return;
    }

    var lines = String(data)
      .replace(/\r/g, '')
      .split('\n');

    // a file ending in a newline would otherwise produce a bogus last row
    while (lines.length > 1 && lines[lines.length - 1] === '') {
      lines.pop();
    }

    var colnames = lines[0].split(',');

    var res = lines.slice(1).map(function(line) {
      var rowObj = {};
      line.split(',').forEach(function(el, j) {
        rowObj[colnames[j]] = el;
      });
      return rowObj;
    });
    callback(res);
  });
};
112 |
113 | //Given an array of objects (arr), get all values associated with a key (key).
/**
 * Extracts one column from an array of row objects.
 *
 * @param {Object[]} arr row objects
 * @param {string} key property to read from every row
 * @returns {Array} the value of `key` for each row, in row order
 */
var col = module.exports.col = function(arr, key) {
  return arr.reduce(function(values, row) {
    values.push(row[key]);
    return values;
  }, []);
};
121 |
122 | // Assumes headings is an array of headings. Would be nice if it could also be an
123 | // array of column numbers.
/**
 * Converts the given columns of every row to numbers, in place.
 *
 * @param {Object[]} data row objects; they are modified and returned
 * @param {string[]} headings names of the columns to convert
 * @param {*} [replaceBlanks=0] value stored in cells that are empty strings
 *   (any falsy value falls back to 0)
 * @returns {Object[]} the same `data` array
 * @throws {Error} if `headings` is missing or empty
 */
var numeric = module.exports.numeric = function(data, headings, replaceBlanks) {
  // check for a missing argument BEFORE reading .length, otherwise a
  // TypeError is raised instead of the intended message
  if (headings == null || headings.length === 0) {
    throw new Error('No headings supplied to numeric.');
  }

  var fill = replaceBlanks || 0;
  var includeNaN = false;

  for (var i = 0; i < headings.length; i++) {
    var head = headings[i];
    for (var j = 0; j < data.length; j++) {
      var row = data[j];
      if (row[head] === '') {
        row[head] = fill;
      }
      else {
        row[head] = Number(row[head]);
      }
      if (isNaN(row[head])) includeNaN = true;
    }
  }

  if (includeNaN === true) console.log("Warning: Some values are NaN.");

  return data;
};
149 |
150 | //RANDOM NUMBERS
151 |
152 | //array of exponential random variables
/**
 * Generates exponentially distributed random values by applying
 * -log(U) / lambda to uniform draws.
 *
 * @param {number} n how many values to generate
 * @param {number} [lambda=1] rate parameter (a falsy value falls back to 1)
 * @returns {number[]} n random values
 */
var exp = module.exports.exp = function(n, lambda) {
  lambda = lambda || 1;
  var res = [];
  for (var i = 0; i < n; i++) {
    // Math.random() is in [0, 1) and may return exactly 0, which would give
    // -log(0) = Infinity. 1 - Math.random() is in (0, 1] and equally uniform.
    var U = 1 - Math.random();
    res.push(-Math.log(U) / lambda);
  }
  return res;
};
162 |
163 | //array of uniform random variables
/**
 * Generates an array of values drawn from Math.random().
 *
 * @param {number} n how many values to generate
 * @returns {number[]} n random values
 */
var uni = module.exports.uni = function(n) {
  var draws = [];
  var remaining = 0;
  while (remaining < n) {
    draws.push(Math.random());
    remaining += 1;
  }
  return draws;
};
171 |
/**
 * Generates normally distributed random values with the Box-Muller
 * transform.
 *
 * @param {number} n how many values to generate
 * @param {number} [mu=0] mean
 * @param {number} [sig=1] standard deviation (a falsy value, including 0,
 *   falls back to 1)
 * @returns {number[]} n random values
 */
var norm = module.exports.norm = function(n, mu, sig) {
  mu = mu || 0;
  sig = sig || 1;

  // makes a pair of normals with specified parameters via Box-Muller
  function box(mu_, sig_) {
    // Math.random() may return exactly 0, and log(0) = -Infinity would
    // produce infinite or NaN samples; 1 - Math.random() lies in (0, 1]
    var u1 = 1 - Math.random();
    var u2 = Math.random();
    var radius = Math.sqrt(-2 * Math.log(u1));
    var angle = 2 * u2 * Math.PI;
    var z1 = radius * Math.cos(angle);
    var z2 = radius * Math.sin(angle);
    return [(mu_ + (sig_ * z1)), (mu_ + (sig_ * z2))];
  }

  var res = [];
  var iter;
  if (n % 2 === 0) {
    iter = n / 2;
  }
  else {
    // values come in pairs, so an odd n takes one value from an extra pair
    iter = (n - 1) / 2;
    res.push(box(mu, sig).pop());
  }
  for (var i = 0; i < iter; i++) {
    res = res.concat(box(mu, sig));
  }
  return res;
};
197 |
198 | //SEQUENCE METHOD
/**
 * Generates a sequence from `start` towards `end` (inclusive when reached
 * exactly) in steps of `incr`.
 *
 * @param {number} start first value
 * @param {number} end bound of the sequence
 * @param {number} [incr=1] step; a falsy value (including 0) falls back to 1.
 *   A negative step produces a descending sequence.
 * @returns {number[]} the sequence; empty when `end` lies in the opposite
 *   direction of the step
 */
var seq = module.exports.seq = function(start, end, incr) {
  var res = [];
  var num = start;
  incr = incr || 1;

  if (incr > 0) {
    while (num <= end) {
      res.push(num);
      num += incr;
    }
  }
  else {
    // With a negative step the `num <= end` test never becomes false when
    // start <= end (an endless loop); count down towards `end` instead.
    while (num >= end) {
      res.push(num);
      num += incr;
    }
  }
  return res;
};
210 |
211 | // LINEAR REGRESSION
/**
 * Fits a simple linear regression y = alpha + beta * x, with
 * beta = cov(x, y) / vari(x) and alpha = mean(y) - beta * mean(x).
 *
 * @param {number[]} x explanatory values
 * @param {number[]} y response values
 * @returns {Object} an object with
 *   - f: function returning the estimated y for an input x
 *   - pts: the estimated y for each element of x
 *   - endPoints: {x1, x2, y1, y2}, the fitted line at min(x) and max(x)
 */
var reg = module.exports.reg = function(x, y) {
  var slope = cov(x, y) / vari(x);
  var intercept = mean(y) - (slope * mean(x));

  var predict = function(input) {
    return (intercept + (slope * input));
  };

  // estimated y for every observed x
  var fitted = x.reduce(function(estimates, point) {
    estimates.push(predict(point));
    return estimates;
  }, []);

  var lowest = min(x);
  var highest = max(x);

  return {
    f: predict,
    pts: fitted,
    endPoints: {
      x1: lowest,
      x2: highest,
      y1: predict(lowest),
      y2: predict(highest)
    }
  };
};
240 |
241 | //REPEATED VALUES
/**
 * Builds an array containing the same value repeated n times.
 *
 * @param {*} val value to repeat
 * @param {number} n number of repetitions
 * @returns {Array} array of length n filled with `val`
 */
var rep = module.exports.rep = function(val, n) {
  var copies = [];
  var count = 0;
  while (count < n) {
    copies.push(val);
    count += 1;
  }
  return copies;
};
249 |
250 |
251 | //PLOTTING
// Module-level state for plotting. Neither `server` nor `html` is assigned
// here; presumably they are set by the plotting code further down in this
// file (not confirmed from this section).
var server;
var html;

// express application whose root route responds with the current `html`
var app = express();
app.get('/', function(request, response) {
  response.send(html);
});
257 |
258 | var Chart = module.exports.Chart = require('./chart.js');
259 |
260 | Chart.prototype.plot = function() {
261 | var self = this;
262 | var head = '