111 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | [Build Status](https://travis-ci.org/skale-me/skale)
4 | [Windows Build Status](https://ci.appveyor.com/project/skaleme/skale)
5 | [npm Package](https://www.npmjs.com/package/skale)
6 |
7 |
8 | High performance distributed data processing and machine learning.
9 |
10 | Skale provides a high-level API in JavaScript and an optimized
11 | parallel execution engine on top of NodeJS.
12 |
13 | ## Features
14 | * Pure JavaScript implementation of a Spark-like engine
15 | * Multiple data sources: filesystems, databases, cloud (S3, Azure)
16 | * Multiple data formats: CSV, JSON, Columnar (Parquet)...
17 | * 50 high-level operators to build parallel apps
18 | * Machine learning: scalable classification, regression, clustering
19 | * Run interactively in a NodeJS REPL shell
20 | * Docker [ready](https://github.com/skale-me/skale/blob/master/docker/), simple local mode or full distributed mode
21 | * Very fast, see [benchmark](https://github.com/skale-me/skale/blob/master/benchmark/)
22 |
23 | ## Quickstart
24 | ```sh
25 | npm install skale
26 | ```
27 |
28 | Word count example:
29 |
30 | ```javascript
31 | var sc = require('skale').context();
32 |
33 | sc.textFile('/my/path/*.txt')
34 | .flatMap(line => line.split(' '))
35 | .map(word => [word, 1])
36 | .reduceByKey((a, b) => a + b, 0)
37 | .count(function (err, result) {
38 | console.log(result);
39 | sc.end();
40 | });
41 | ```
42 |
43 | ### Local mode
44 | In local mode, worker processes are automatically forked and
45 | communicate with the application through a child process IPC channel.
46 | This is the simplest way to operate, and it allows the use of all
47 | available machine cores.
48 |
49 | To run in local mode, just execute your app script:
50 | ```sh
51 | node my_app.js
52 | ```
53 |
54 | or with debug traces:
55 | ```sh
56 | SKALE_DEBUG=2 node my_app.js
57 | ```
58 |
59 | ### Distributed mode
60 | In distributed mode, a cluster server process and worker processes
61 | must be started prior to starting the app. Processes communicate with
62 | each other via raw TCP or via WebSockets.
63 |
64 | To run in distributed cluster mode, first start a cluster server
65 | on `server_host`:
66 | ```sh
67 | ./bin/server.js
68 | ```
69 |
70 | On each worker host, start a worker controller process which connects
71 | to the server:
72 | ```sh
73 | ./bin/worker.js -H server_host
74 | ```
75 |
76 | Then run your app, setting the cluster server host in the environment:
77 | ```sh
78 | SKALE_HOST=server_host node my_app.js
79 | ```
80 |
81 | The same with debug traces:
82 | ```sh
83 | SKALE_HOST=server_host SKALE_DEBUG=2 node my_app.js
84 | ```
85 |
86 | ## Resources
87 |
88 | * [Contributing guide](https://github.com/skale-me/skale/blob/master/CONTRIBUTING.md)
89 | * [Gitter](https://gitter.im/skale-me/skale-engine) for support and
90 | discussion
91 | * [Mailing list](https://groups.google.com/forum/#!forum/skale)
92 | for discussion about use and development
93 |
94 | ## Authors
95 |
96 | The original authors of skale are [Cedric Artigue](https://github.com/CedricArtigue) and [Marc Vertes](https://github.com/mvertes).
97 |
98 | [List of all
99 | contributors](https://github.com/skale-me/skale/graphs/contributors)
100 |
101 | ## License
102 |
103 | [Apache-2.0](https://github.com/skale-me/skale/blob/master/LICENSE)
104 |
105 | ## Credits
106 |
107 |
108 |
--------------------------------------------------------------------------------
/test/data/iris.csv:
--------------------------------------------------------------------------------
1 | Sepal length,Sepal Width,Petal length,Petal width,Species
2 | 5.1,3.5,1.4,0.2,Iris setosa
3 | 4.9,3.0,1.4,0.2,Iris setosa
4 | 4.7,3.2,1.3,0.2,Iris setosa
5 | 4.6,3.1,1.5,0.2,Iris setosa
6 | 5.0,3.6,1.4,0.2,Iris setosa
7 | 5.4,3.9,1.7,0.4,Iris setosa
8 | 4.6,3.4,1.4,0.3,Iris setosa
9 | 5.0,3.4,1.5,0.2,Iris setosa
10 | 4.4,2.9,1.4,0.2,Iris setosa
11 | 4.9,3.1,1.5,0.1,Iris setosa
12 | 5.4,3.7,1.5,0.2,Iris setosa
13 | 4.8,3.4,1.6,0.2,Iris setosa
14 | 4.8,3.0,1.4,0.1,Iris setosa
15 | 4.3,3.0,1.1,0.1,Iris setosa
16 | 5.8,4.0,1.2,0.2,Iris setosa
17 | 5.7,4.4,1.5,0.4,Iris setosa
18 | 5.4,3.9,1.3,0.4,Iris setosa
19 | 5.1,3.5,1.4,0.3,Iris setosa
20 | 5.7,3.8,1.7,0.3,Iris setosa
21 | 5.1,3.8,1.5,0.3,Iris setosa
22 | 5.4,3.4,1.7,0.2,Iris setosa
23 | 5.1,3.7,1.5,0.4,Iris setosa
24 | 4.6,3.6,1.0,0.2,Iris setosa
25 | 5.1,3.3,1.7,0.5,Iris setosa
26 | 4.8,3.4,1.9,0.2,Iris setosa
27 | 5.0,3.0,1.6,0.2,Iris setosa
28 | 5.0,3.4,1.6,0.4,Iris setosa
29 | 5.2,3.5,1.5,0.2,Iris setosa
30 | 5.2,3.4,1.4,0.2,Iris setosa
31 | 4.7,3.2,1.6,0.2,Iris setosa
32 | 4.8,3.1,1.6,0.2,Iris setosa
33 | 5.4,3.4,1.5,0.4,Iris setosa
34 | 5.2,4.1,1.5,0.1,Iris setosa
35 | 5.5,4.2,1.4,0.2,Iris setosa
36 | 4.9,3.1,1.5,0.1,Iris setosa
37 | 5.0,3.2,1.2,0.2,Iris setosa
38 | 5.5,3.5,1.3,0.2,Iris setosa
39 | 4.9,3.1,1.5,0.1,Iris setosa
40 | 4.4,3.0,1.3,0.2,Iris setosa
41 | 5.1,3.4,1.5,0.2,Iris setosa
42 | 5.0,3.5,1.3,0.3,Iris setosa
43 | 4.5,2.3,1.3,0.3,Iris setosa
44 | 4.4,3.2,1.3,0.2,Iris setosa
45 | 5.0,3.5,1.6,0.6,Iris setosa
46 | 5.1,3.8,1.9,0.4,Iris setosa
47 | 4.8,3.0,1.4,0.3,Iris setosa
48 | 5.1,3.8,1.6,0.2,Iris setosa
49 | 4.6,3.2,1.4,0.2,Iris setosa
50 | 5.3,3.7,1.5,0.2,Iris setosa
51 | 5.0,3.3,1.4,0.2,Iris setosa
52 | 7.0,3.2,4.7,1.4,Iris versicolor
53 | 6.4,3.2,4.5,1.5,Iris versicolor
54 | 6.9,3.1,4.9,1.5,Iris versicolor
55 | 5.5,2.3,4.0,1.3,Iris versicolor
56 | 6.5,2.8,4.6,1.5,Iris versicolor
57 | 5.7,2.8,4.5,1.3,Iris versicolor
58 | 6.3,3.3,4.7,1.6,Iris versicolor
59 | 4.9,2.4,3.3,1.0,Iris versicolor
60 | 6.6,2.9,4.6,1.3,Iris versicolor
61 | 5.2,2.7,3.9,1.4,Iris versicolor
62 | 5.0,2.0,3.5,1.0,Iris versicolor
63 | 5.9,3.0,4.2,1.5,Iris versicolor
64 | 6.0,2.2,4.0,1.0,Iris versicolor
65 | 6.1,2.9,4.7,1.4,Iris versicolor
66 | 5.6,2.9,3.6,1.3,Iris versicolor
67 | 6.7,3.1,4.4,1.4,Iris versicolor
68 | 5.6,3.0,4.5,1.5,Iris versicolor
69 | 5.8,2.7,4.1,1.0,Iris versicolor
70 | 6.2,2.2,4.5,1.5,Iris versicolor
71 | 5.6,2.5,3.9,1.1,Iris versicolor
72 | 5.9,3.2,4.8,1.8,Iris versicolor
73 | 6.1,2.8,4.0,1.3,Iris versicolor
74 | 6.3,2.5,4.9,1.5,Iris versicolor
75 | 6.1,2.8,4.7,1.2,Iris versicolor
76 | 6.4,2.9,4.3,1.3,Iris versicolor
77 | 6.6,3.0,4.4,1.4,Iris versicolor
78 | 6.8,2.8,4.8,1.4,Iris versicolor
79 | 6.7,3.0,5.0,1.7,Iris versicolor
80 | 6.0,2.9,4.5,1.5,Iris versicolor
81 | 5.7,2.6,3.5,1.0,Iris versicolor
82 | 5.5,2.4,3.8,1.1,Iris versicolor
83 | 5.5,2.4,3.7,1.0,Iris versicolor
84 | 5.8,2.7,3.9,1.2,Iris versicolor
85 | 6.0,2.7,5.1,1.6,Iris versicolor
86 | 5.4,3.0,4.5,1.5,Iris versicolor
87 | 6.0,3.4,4.5,1.6,Iris versicolor
88 | 6.7,3.1,4.7,1.5,Iris versicolor
89 | 6.3,2.3,4.4,1.3,Iris versicolor
90 | 5.6,3.0,4.1,1.3,Iris versicolor
91 | 5.5,2.5,4.0,1.3,Iris versicolor
92 | 5.5,2.6,4.4,1.2,Iris versicolor
93 | 6.1,3.0,4.6,1.4,Iris versicolor
94 | 5.8,2.6,4.0,1.2,Iris versicolor
95 | 5.0,2.3,3.3,1.0,Iris versicolor
96 | 5.6,2.7,4.2,1.3,Iris versicolor
97 | 5.7,3.0,4.2,1.2,Iris versicolor
98 | 5.7,2.9,4.2,1.3,Iris versicolor
99 | 6.2,2.9,4.3,1.3,Iris versicolor
100 | 5.1,2.5,3.0,1.1,Iris versicolor
101 | 5.7,2.8,4.1,1.3,Iris versicolor
102 | 6.3,3.3,6.0,2.5,Iris virginica
103 | 5.8,2.7,5.1,1.9,Iris virginica
104 | 7.1,3.0,5.9,2.1,Iris virginica
105 | 6.3,2.9,5.6,1.8,Iris virginica
106 | 6.5,3.0,5.8,2.2,Iris virginica
107 | 7.6,3.0,6.6,2.1,Iris virginica
108 | 4.9,2.5,4.5,1.7,Iris virginica
109 | 7.3,2.9,6.3,1.8,Iris virginica
110 | 6.7,2.5,5.8,1.8,Iris virginica
111 | 7.2,3.6,6.1,2.5,Iris virginica
112 | 6.5,3.2,5.1,2.0,Iris virginica
113 | 6.4,2.7,5.3,1.9,Iris virginica
114 | 6.8,3.0,5.5,2.1,Iris virginica
115 | 5.7,2.5,5.0,2.0,Iris virginica
116 | 5.8,2.8,5.1,2.4,Iris virginica
117 | 6.4,3.2,5.3,2.3,Iris virginica
118 | 6.5,3.0,5.5,1.8,Iris virginica
119 | 7.7,3.8,6.7,2.2,Iris virginica
120 | 7.7,2.6,6.9,2.3,Iris virginica
121 | 6.0,2.2,5.0,1.5,Iris virginica
122 | 6.9,3.2,5.7,2.3,Iris virginica
123 | 5.6,2.8,4.9,2.0,Iris virginica
124 | 7.7,2.8,6.7,2.0,Iris virginica
125 | 6.3,2.7,4.9,1.8,Iris virginica
126 | 6.7,3.3,5.7,2.1,Iris virginica
127 | 7.2,3.2,6.0,1.8,Iris virginica
128 | 6.2,2.8,4.8,1.8,Iris virginica
129 | 6.1,3.0,4.9,1.8,Iris virginica
130 | 6.4,2.8,5.6,2.1,Iris virginica
131 | 7.2,3.0,5.8,1.6,Iris virginica
132 | 7.4,2.8,6.1,1.9,Iris virginica
133 | 7.9,3.8,6.4,2.0,Iris virginica
134 | 6.4,2.8,5.6,2.2,Iris virginica
135 | 6.3,2.8,5.1,1.5,Iris virginica
136 | 6.1,2.6,5.6,1.4,Iris virginica
137 | 7.7,3.0,6.1,2.3,Iris virginica
138 | 6.3,3.4,5.6,2.4,Iris virginica
139 | 6.4,3.1,5.5,1.8,Iris virginica
140 | 6.0,3.0,4.8,1.8,Iris virginica
141 | 6.9,3.1,5.4,2.1,Iris virginica
142 | 6.7,3.1,5.6,2.4,Iris virginica
143 | 6.9,3.1,5.1,2.3,Iris virginica
144 | 5.8,2.7,5.1,1.9,Iris virginica
145 | 6.8,3.2,5.9,2.3,Iris virginica
146 | 6.7,3.3,5.7,2.5,Iris virginica
147 | 6.7,3.0,5.2,2.3,Iris virginica
148 | 6.3,2.5,5.0,1.9,Iris virginica
149 | 6.5,3.0,5.2,2.0,Iris virginica
150 | 6.2,3.4,5.4,2.3,Iris virginica
151 | 5.9,3.0,5.1,1.8,Iris virginica
152 |
--------------------------------------------------------------------------------
/examples/ml/clustering/iris.csv:
--------------------------------------------------------------------------------
1 | Sepal length,Sepal Width,Petal length,Petal width,Species
2 | 5.1,3.5,1.4,0.2,Iris setosa
3 | 4.9,3.0,1.4,0.2,Iris setosa
4 | 4.7,3.2,1.3,0.2,Iris setosa
5 | 4.6,3.1,1.5,0.2,Iris setosa
6 | 5.0,3.6,1.4,0.2,Iris setosa
7 | 5.4,3.9,1.7,0.4,Iris setosa
8 | 4.6,3.4,1.4,0.3,Iris setosa
9 | 5.0,3.4,1.5,0.2,Iris setosa
10 | 4.4,2.9,1.4,0.2,Iris setosa
11 | 4.9,3.1,1.5,0.1,Iris setosa
12 | 5.4,3.7,1.5,0.2,Iris setosa
13 | 4.8,3.4,1.6,0.2,Iris setosa
14 | 4.8,3.0,1.4,0.1,Iris setosa
15 | 4.3,3.0,1.1,0.1,Iris setosa
16 | 5.8,4.0,1.2,0.2,Iris setosa
17 | 5.7,4.4,1.5,0.4,Iris setosa
18 | 5.4,3.9,1.3,0.4,Iris setosa
19 | 5.1,3.5,1.4,0.3,Iris setosa
20 | 5.7,3.8,1.7,0.3,Iris setosa
21 | 5.1,3.8,1.5,0.3,Iris setosa
22 | 5.4,3.4,1.7,0.2,Iris setosa
23 | 5.1,3.7,1.5,0.4,Iris setosa
24 | 4.6,3.6,1.0,0.2,Iris setosa
25 | 5.1,3.3,1.7,0.5,Iris setosa
26 | 4.8,3.4,1.9,0.2,Iris setosa
27 | 5.0,3.0,1.6,0.2,Iris setosa
28 | 5.0,3.4,1.6,0.4,Iris setosa
29 | 5.2,3.5,1.5,0.2,Iris setosa
30 | 5.2,3.4,1.4,0.2,Iris setosa
31 | 4.7,3.2,1.6,0.2,Iris setosa
32 | 4.8,3.1,1.6,0.2,Iris setosa
33 | 5.4,3.4,1.5,0.4,Iris setosa
34 | 5.2,4.1,1.5,0.1,Iris setosa
35 | 5.5,4.2,1.4,0.2,Iris setosa
36 | 4.9,3.1,1.5,0.1,Iris setosa
37 | 5.0,3.2,1.2,0.2,Iris setosa
38 | 5.5,3.5,1.3,0.2,Iris setosa
39 | 4.9,3.1,1.5,0.1,Iris setosa
40 | 4.4,3.0,1.3,0.2,Iris setosa
41 | 5.1,3.4,1.5,0.2,Iris setosa
42 | 5.0,3.5,1.3,0.3,Iris setosa
43 | 4.5,2.3,1.3,0.3,Iris setosa
44 | 4.4,3.2,1.3,0.2,Iris setosa
45 | 5.0,3.5,1.6,0.6,Iris setosa
46 | 5.1,3.8,1.9,0.4,Iris setosa
47 | 4.8,3.0,1.4,0.3,Iris setosa
48 | 5.1,3.8,1.6,0.2,Iris setosa
49 | 4.6,3.2,1.4,0.2,Iris setosa
50 | 5.3,3.7,1.5,0.2,Iris setosa
51 | 5.0,3.3,1.4,0.2,Iris setosa
52 | 7.0,3.2,4.7,1.4,Iris versicolor
53 | 6.4,3.2,4.5,1.5,Iris versicolor
54 | 6.9,3.1,4.9,1.5,Iris versicolor
55 | 5.5,2.3,4.0,1.3,Iris versicolor
56 | 6.5,2.8,4.6,1.5,Iris versicolor
57 | 5.7,2.8,4.5,1.3,Iris versicolor
58 | 6.3,3.3,4.7,1.6,Iris versicolor
59 | 4.9,2.4,3.3,1.0,Iris versicolor
60 | 6.6,2.9,4.6,1.3,Iris versicolor
61 | 5.2,2.7,3.9,1.4,Iris versicolor
62 | 5.0,2.0,3.5,1.0,Iris versicolor
63 | 5.9,3.0,4.2,1.5,Iris versicolor
64 | 6.0,2.2,4.0,1.0,Iris versicolor
65 | 6.1,2.9,4.7,1.4,Iris versicolor
66 | 5.6,2.9,3.6,1.3,Iris versicolor
67 | 6.7,3.1,4.4,1.4,Iris versicolor
68 | 5.6,3.0,4.5,1.5,Iris versicolor
69 | 5.8,2.7,4.1,1.0,Iris versicolor
70 | 6.2,2.2,4.5,1.5,Iris versicolor
71 | 5.6,2.5,3.9,1.1,Iris versicolor
72 | 5.9,3.2,4.8,1.8,Iris versicolor
73 | 6.1,2.8,4.0,1.3,Iris versicolor
74 | 6.3,2.5,4.9,1.5,Iris versicolor
75 | 6.1,2.8,4.7,1.2,Iris versicolor
76 | 6.4,2.9,4.3,1.3,Iris versicolor
77 | 6.6,3.0,4.4,1.4,Iris versicolor
78 | 6.8,2.8,4.8,1.4,Iris versicolor
79 | 6.7,3.0,5.0,1.7,Iris versicolor
80 | 6.0,2.9,4.5,1.5,Iris versicolor
81 | 5.7,2.6,3.5,1.0,Iris versicolor
82 | 5.5,2.4,3.8,1.1,Iris versicolor
83 | 5.5,2.4,3.7,1.0,Iris versicolor
84 | 5.8,2.7,3.9,1.2,Iris versicolor
85 | 6.0,2.7,5.1,1.6,Iris versicolor
86 | 5.4,3.0,4.5,1.5,Iris versicolor
87 | 6.0,3.4,4.5,1.6,Iris versicolor
88 | 6.7,3.1,4.7,1.5,Iris versicolor
89 | 6.3,2.3,4.4,1.3,Iris versicolor
90 | 5.6,3.0,4.1,1.3,Iris versicolor
91 | 5.5,2.5,4.0,1.3,Iris versicolor
92 | 5.5,2.6,4.4,1.2,Iris versicolor
93 | 6.1,3.0,4.6,1.4,Iris versicolor
94 | 5.8,2.6,4.0,1.2,Iris versicolor
95 | 5.0,2.3,3.3,1.0,Iris versicolor
96 | 5.6,2.7,4.2,1.3,Iris versicolor
97 | 5.7,3.0,4.2,1.2,Iris versicolor
98 | 5.7,2.9,4.2,1.3,Iris versicolor
99 | 6.2,2.9,4.3,1.3,Iris versicolor
100 | 5.1,2.5,3.0,1.1,Iris versicolor
101 | 5.7,2.8,4.1,1.3,Iris versicolor
102 | 6.3,3.3,6.0,2.5,Iris virginica
103 | 5.8,2.7,5.1,1.9,Iris virginica
104 | 7.1,3.0,5.9,2.1,Iris virginica
105 | 6.3,2.9,5.6,1.8,Iris virginica
106 | 6.5,3.0,5.8,2.2,Iris virginica
107 | 7.6,3.0,6.6,2.1,Iris virginica
108 | 4.9,2.5,4.5,1.7,Iris virginica
109 | 7.3,2.9,6.3,1.8,Iris virginica
110 | 6.7,2.5,5.8,1.8,Iris virginica
111 | 7.2,3.6,6.1,2.5,Iris virginica
112 | 6.5,3.2,5.1,2.0,Iris virginica
113 | 6.4,2.7,5.3,1.9,Iris virginica
114 | 6.8,3.0,5.5,2.1,Iris virginica
115 | 5.7,2.5,5.0,2.0,Iris virginica
116 | 5.8,2.8,5.1,2.4,Iris virginica
117 | 6.4,3.2,5.3,2.3,Iris virginica
118 | 6.5,3.0,5.5,1.8,Iris virginica
119 | 7.7,3.8,6.7,2.2,Iris virginica
120 | 7.7,2.6,6.9,2.3,Iris virginica
121 | 6.0,2.2,5.0,1.5,Iris virginica
122 | 6.9,3.2,5.7,2.3,Iris virginica
123 | 5.6,2.8,4.9,2.0,Iris virginica
124 | 7.7,2.8,6.7,2.0,Iris virginica
125 | 6.3,2.7,4.9,1.8,Iris virginica
126 | 6.7,3.3,5.7,2.1,Iris virginica
127 | 7.2,3.2,6.0,1.8,Iris virginica
128 | 6.2,2.8,4.8,1.8,Iris virginica
129 | 6.1,3.0,4.9,1.8,Iris virginica
130 | 6.4,2.8,5.6,2.1,Iris virginica
131 | 7.2,3.0,5.8,1.6,Iris virginica
132 | 7.4,2.8,6.1,1.9,Iris virginica
133 | 7.9,3.8,6.4,2.0,Iris virginica
134 | 6.4,2.8,5.6,2.2,Iris virginica
135 | 6.3,2.8,5.1,1.5,Iris virginica
136 | 6.1,2.6,5.6,1.4,Iris virginica
137 | 7.7,3.0,6.1,2.3,Iris virginica
138 | 6.3,3.4,5.6,2.4,Iris virginica
139 | 6.4,3.1,5.5,1.8,Iris virginica
140 | 6.0,3.0,4.8,1.8,Iris virginica
141 | 6.9,3.1,5.4,2.1,Iris virginica
142 | 6.7,3.1,5.6,2.4,Iris virginica
143 | 6.9,3.1,5.1,2.3,Iris virginica
144 | 5.8,2.7,5.1,1.9,Iris virginica
145 | 6.8,3.2,5.9,2.3,Iris virginica
146 | 6.7,3.3,5.7,2.5,Iris virginica
147 | 6.7,3.0,5.2,2.3,Iris virginica
148 | 6.3,2.5,5.0,1.9,Iris virginica
149 | 6.5,3.0,5.2,2.0,Iris virginica
150 | 6.2,3.4,5.4,2.3,Iris virginica
151 | 5.9,3.0,5.1,1.8,Iris virginica
152 |
--------------------------------------------------------------------------------
/lib/task.js:
--------------------------------------------------------------------------------
1 | 'use strict';
2 |
3 | const fs = require('fs');
4 | const http = require('http');
5 |
6 | const mkdirp = require('mkdirp');
7 |
8 | const uuidPattern = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-4][0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
9 |
10 | /* global dlog */
11 |
12 | module.exports = Task;
13 |
14 | // function Task(basedir, jobId, nodes, datasetId, pid, action) {
15 | function Task(init) {
16 | this.basedir = init.basedir;
17 | this.bundle = init.bundle;
18 | this.datasetId = init.datasetId;
19 | this.pid = init.pid;
20 | this.nodes = init.nodes;
21 | this.action = init.action;
22 | this.outputStreamOk = true;
23 | this.files = {}; // object in which we store shuffle file information to be sent back to the master
24 | // this.lib; // handler to libraries required on worker side (which cannot be serialized)
25 | // this.mm; // handler to worker side memory manager instance
26 | // this.grid; // handler to socket object instance
27 | }
28 |
29 | Task.prototype.run = function(done) {
30 | const pipeline = [];
31 | const self = this;
32 | const mm = this.mm;
33 | const action = this.action;
34 | const p = this.pid;
35 | const blocksToRegister = [];
36 | let tmpPart = action ? this.nodes[this.datasetId].partitions[p] : this.nodes[this.datasetId].shufflePartitions[p];
37 | let tmpDataset = this.nodes[tmpPart.datasetId];
38 |
39 | mkdirp.sync(this.basedir + 'export');
40 | mkdirp.sync(this.basedir + 'import');
41 | mkdirp.sync(this.basedir + 'shuffle');
42 |
43 | // Propagate environment settings from master
44 | if (this.env) {
45 | //log('env:', this.env);
46 | for (let e in this.env) {
47 | if (this.env[e]) process.env[e] = this.env[e];
48 | else delete process.env[e];
49 | }
50 | }
51 |
52 | // Inject user dependencies
53 | for (let m in this.modules) {
54 | this.lib[m] = this.modules[m];
55 | }
56 |
57 | if (action) {
58 | if (action.opt._foreach) {
59 | pipeline.push({transform: function foreach(data) {
60 | for (let i = 0; i < data.length; i++) action.src(data[i], action.opt, self);
61 | }});
62 | } else {
63 | pipeline.push({transform: function aggregate(data) {
64 | for (let i = 0; i < data.length; i++)
65 | action.init = action.src(action.init, data[i], action.opt, self);
66 | }});
67 | }
68 | }
69 |
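// Walk the dataset lineage backwards from the target partition: prepend each
// parent dataset transform to the pipeline until a partition already cached in
// memory, or a source partition (without parent), is found. Partitions of
// persistent datasets met on the way are collected in blocksToRegister so they
// can be registered in the memory manager once computed.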
70 | let tmpPartAvailable;
71 | for (;;) {
72 | tmpPartAvailable = mm.isAvailable(tmpPart); // is partition available in memory
73 | if (!tmpPartAvailable && tmpDataset.persistent) { // if data must be stored in memory
74 | if ((action !== undefined) || (tmpDataset.id !== this.datasetId)) {
75 | // do not persist when there is no action and this is the shuffle dataset itself
76 | blocksToRegister.push(tmpPart); // register block inside memory manager
77 | pipeline.unshift(tmpPart); // add it to pipeline
78 | tmpPart.mm = this.mm;
79 | }
80 | }
81 | if (tmpPartAvailable || (tmpPart.parentDatasetId === undefined)) break; // source partition found
82 | pipeline.unshift(tmpDataset); // else add current dataset transform to pipeline
83 | tmpPart = this.nodes[tmpPart.parentDatasetId].partitions[tmpPart.parentPartitionIndex];
84 | tmpDataset = this.nodes[tmpPart.datasetId];
85 | }
86 |
87 | // Pre-iterate actions
88 | if (action) {
89 | if (action.opt._preIterate) {
90 | action.opt._preIterate(action.opt, this, tmpPart.partitionIndex);
91 | }
92 | }
93 |
94 | // Iterate actions
95 | const start = Date.now();
96 | if (tmpPartAvailable) mm.partitions[tmpPart.datasetId + '.' + tmpPart.partitionIndex].iterate(this, tmpPart.partitionIndex, pipeline, iterateDone);
97 | else this.nodes[tmpPart.datasetId].iterate(this, tmpPart.partitionIndex, pipeline, iterateDone);
98 |
99 | // Post-iterate actions
100 | function iterateDone() {
101 | dlog(start, 'iterate');
102 | blocksToRegister.map(function(block) {mm.register(block);});
103 | if (action) {
104 | if (action.opt._postIterate) {
105 | action.opt._postIterate(action.init, action.opt, self, tmpPart.partitionIndex, function () {
106 | done({data: {host: self.grid.host.uuid, path: self.exportFile}});
107 | });
108 | } else done({data: action.init});
109 | } else {
110 | const start1 = Date.now();
111 | self.nodes[self.datasetId].spillToDisk(self, function() {
112 | done({pid: self.pid, files: self.files});
113 | dlog(start1, 'spillToDisk');
114 | });
115 | }
116 | }
117 | };
118 |
119 | // Get a readable stream for shuffle or source file.
120 | // First, attempt to read from local filesystem
121 | // If not present, attempt to access an HTTP server
122 | // If HTTP server not available, use skale transport through skale server
123 | Task.prototype.getReadStream = function (fileObj, opt, done) {
124 | if (fs.existsSync(fileObj.path)) return done(null, fs.createReadStream(fileObj.path, opt));
125 | // Default host is master
126 | if (!fileObj.host) fileObj.host = this.grid.muuid;
127 | if (uuidPattern.test(fileObj.host))
128 | return done(null, this.grid.createStreamFrom(fileObj.host, {cmd: 'sendFile', path: fileObj.path, opt: opt}));
129 | const url = 'http://' + fileObj.host + fileObj.path;
130 | http.get(url, function (res) {
131 | done(null, res);
132 | });
133 | };
134 |
135 | // Same as getReadStream above, but returns a stream synchronously.
136 | // This may be more expensive, as it requires an additional pass-through stream
137 | Task.prototype.getReadStreamSync = function (fileObj, opt) {
138 | const fs = this.lib.fs;
139 | if (fs.existsSync(fileObj.path)) return fs.createReadStream(fileObj.path, opt);
140 | if (!fileObj.host) fileObj.host = this.grid.muuid;
141 | return this.grid.createStreamFrom(fileObj.host, {cmd: 'sendFile', path: fileObj.path, opt: opt});
142 | };
143 |
--------------------------------------------------------------------------------
/ml/sgd-linear-model.js:
--------------------------------------------------------------------------------
1 | // Regularized Linear models trained by Stochastic Gradient Descent (SGD)
2 | // Authors: M. Vertes (current), C. Artigue (preliminary)
3 | // License: Apache License 2.0
4 |
5 | 'use strict';
6 |
7 | const thenify = require('thenify');
8 |
9 | module.exports = SGDLinearModel;
10 |
11 | function SGDLinearModel(options) {
12 | if (!(this instanceof SGDLinearModel))
13 | return new SGDLinearModel(options);
14 | options = options || {};
15 | this.weights = options.weights || [];
16 | this.stepSize = options.stepSize || 1;
17 | this.regParam = options.regParam || 0.001;
18 | this.fitIntercept = options.fitIntercept !== undefined ? options.fitIntercept : true;
19 | this.proba = options.proba || false;
20 | this.intercept = 0;
21 |
22 | if (!options.penalty) this.regularize = regularizeL2;
23 | else if (options.penalty === 'l2') this.regularize = regularizeL2;
24 | else if (options.penalty === 'l1') this.regularize = regularizeL1;
25 | else if (options.penalty === 'none') this.regularize = regularizeNone;
26 | else throw new Error('Invalid penalty parameter: ' + options.penalty);
27 |
28 | if (!options.loss) this.loss = hingeLoss;
29 | else if (options.loss === 'hinge') this.loss = hingeLoss;
30 | else if (options.loss === 'log') this.loss = logisticLoss;
31 | else if (options.loss === 'square') this.loss = squaredLoss;
32 | else throw new Error('Invalid loss parameter: ' + options.loss);
33 |
34 | // For now prediction returns a soft output, TODO: include threshold and hard output
35 | this.predict = function (point) {
36 | let margin = this.intercept;
37 | for (let i = 0; i < this.weights.length; i++)
38 | margin += (this.weights[i] || 0) * (point[i] || 0);
39 | if (this.proba)
40 | return 1 / (1 + Math.exp(-margin));
41 | return margin;
42 | };
43 | }
44 |
45 | // A training iteration for a stochastic gradient descent classifier consists of:
46 | // - computing the loss (price of inaccuracy) for each label/features pair of the training set
47 | // - finalizing the gradient (sum and average the loss per feature)
48 | // - regularizing the weights using a penalty function of the gradient
49 |
50 | SGDLinearModel.prototype.fit = thenify(function (trainingSet, nIterations, callback) {
51 | const self = this;
52 | let iter = 0;
53 |
54 | if (this.fitIntercept)
55 | trainingSet = trainingSet.map(a => {a[1].unshift(1); return a;});
56 |
57 | iterate();
58 |
59 | function iterate() {
60 | trainingSet
61 | .map(self.loss, self.weights)
62 | .aggregate(
63 | // Compute total loss per feature and number of samples
64 | (a, b) => {
65 | for (let i = 0; i < b.length; i++)
66 | a[0][i] = (a[0][i] || 0) + (b[i] || 0);
67 | a[1]++;
68 | return a;
69 | },
70 | (a, b) => {
71 | for (let i = 0; i < b[0].length; i++)
72 | a[0][i] = (a[0][i] || 0) + (b[0][i] || 0);
73 | a[1] += b[1];
74 | return a;
75 | },
76 | [[], 0],
77 | function (err, result) {
78 | const iterStepSize = self.stepSize / Math.sqrt(iter + 1);
79 | self.regularize(self.weights, result, iterStepSize, self.regParam);
80 | if (++iter === nIterations) {
81 | if (self.fitIntercept)
82 | self.intercept = self.weights.shift();
83 | callback();
84 | } else iterate();
85 | }
86 | );
87 | }
88 | });
89 |
90 | // No regularization: plain averaged-gradient step (with squared loss this corresponds to ordinary least squares)
91 | function regularizeNone(weights, gradientCount) {
92 | const [gradient, count] = gradientCount;
93 |
94 | for (let i = 0; i < gradient.length; i++) {
95 | let grad = (gradient[i] || 0) / count;
96 | weights[i] = (weights[i] || 0) - grad;
97 | }
98 | }
99 |
100 | // L1, a.k.a Lasso
101 | function regularizeL1(weights, gradientCount, stepSize, regParam) {
102 | const [gradient, count] = gradientCount;
103 |
104 | for (let i = 0; i < gradient.length; i++) {
105 | let grad = (gradient[i] || 0) / count;
106 | weights[i] = weights[i] || 0;
107 | weights[i] -= stepSize * (grad + regParam * (weights[i] > 0 ? 1 : -1)); // subgradient of regParam * |w|
108 | }
109 | }
110 |
111 | // L2, a.k.a ridge regression
112 | function regularizeL2(weights, gradientCount, stepSize, regParam) {
113 | const [gradient, count] = gradientCount;
114 |
115 | for (let i = 0; i < gradient.length; i++) {
116 | let grad = (gradient[i] || 0) / count;
117 | weights[i] = weights[i] || 0;
118 | weights[i] -= stepSize * (grad + regParam * weights[i]);
119 | }
120 | }
121 |
122 | // TODO #1: elastic-net regularizer: combine L1 and L2 with an
123 | // alpha parameter in range [0, 1] where 1 => L1, 0 => L2,
124 | // in between: (alpha * L1) + ((1-alpha) * L2)
125 | // May be merge L1 and L2 functions
126 |
127 | // TODO #2: for each regularizer: set weight to 0 if regularization
128 | // crosses 0 (sign change), to achieve feature selection (sparse models)
129 |
130 | function hingeLoss(p, weights) {
131 | const [label, features] = p;
132 | const grad = [];
133 | let dotProd = 0;
134 |
135 | for (let i = 0; i < features.length; i++)
136 | dotProd += (features[i] || 0) * (weights[i] || 0);
137 |
138 | if (label * dotProd < 1)
139 | for (let i = 0; i < features.length; i++)
140 | grad[i] = -label * (features[i] || 0);
141 | else
142 | for (let i = 0; i < features.length; i++)
143 | grad[i] = 0;
144 |
145 | return grad;
146 | }
147 |
148 | // valid for labels in [-1, 1]
149 | function logisticLoss(p, weights) {
150 | const [label, features] = p;
151 | const grad = [];
152 | let dotProd = 0;
153 |
154 | for (let i = 0; i < features.length; i++)
155 | dotProd += (features[i] || 0) * (weights[i] || 0);
156 |
157 | const tmp = 1 / (1 + Math.exp(-dotProd)) - label;
158 |
159 | for (let i = 0; i < features.length; i++)
160 | grad[i] = (features[i] || 0) * tmp;
161 |
162 | return grad;
163 | }
164 |
165 | function squaredLoss(p, weights) {
166 | const [label, features] = p;
167 | const grad = [];
168 | let dotProd = 0;
169 |
170 | for (let i = 0; i < features.length; i++)
171 | dotProd += (features[i] || 0) * (weights[i] || 0);
172 |
173 | for (let i = 0; i < features.length; i++)
174 | grad[i] = (dotProd - label) * (features[i] || 0);
175 |
176 | return grad;
177 | }
178 |
--------------------------------------------------------------------------------
/lib/readsplit.js:
--------------------------------------------------------------------------------
1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0
2 |
3 | const fs = require('fs');
4 | //var Lines = require('../lib/lines.js');
5 |
6 | /**
7 |  * Compute an N-way logical split of a given file
8 | */
9 | function splitLocalFile(file, N, callback) {
10 | const split = [];
11 | const size = fs.statSync(file).size;
12 | const maxBlockSize = Math.ceil(size / (N || 1));
13 | let start = 0;
14 |
15 | // var str = fs.readFileSync(file, {encoding: 'utf8'}).replace(/\n/g, '*');
16 | // console.log(str)
17 | while (start < size) {
18 | // console.log('Split n° %d = %s', split.length, str.substr(start, maxBlockSize + 1))
19 | split.push({index: split.length, chunk: [{path: file, opt: {start: start, end: start + maxBlockSize}}]});
20 | start += maxBlockSize + 1;
21 | }
22 |
23 | callback(split);
24 | }
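
// Illustrative usage (not part of the original source): split a local file into
// 4 byte-range chunks, then read one split with readSplit() below, which handles
// lines that straddle the boundary between adjacent splits:
//
//   splitLocalFile('/tmp/data.csv', 4, function (split) {
//     readSplit(split, 0, line => console.log(line), () => console.log('done'),
//       (part, opt) => fs.createReadStream(part.path, opt));
//   });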
25 |
26 | function splitDistributedFile(file, N, callback) { // emulates a distributed file for now
27 | callback([
28 | {
29 | index: 0, chunk: [
30 | {path: './test.dat', opt: {start: 0, end: 9}},
31 | {path: './test.dat', opt: {start: 10, end: 19}}
32 | ]
33 | }, {
34 | index: 1,
35 | chunk: [
36 | {path: './test.dat', opt: {start: 20, end: 29}},
37 | {path: './test.dat', opt: {start: 30}}
38 | ]
39 | }
40 | ]);
41 | }
42 |
43 | function getFirstLine(split, chunk_buffer, s, getStream, done) {
44 | // console.log('Split n° ' + (s - 1) + ' seeks end of line starting with : ' + chunk_buffer.replace(/\n/g, '*'))
45 | const isLastSplit = (split[s].index === (split.length - 1));
46 | let p = 0;
47 | let firstLineFound = false;
48 | let firstLine;
49 |
50 | function readPart(part, partDone) {
51 | const isLastPart = (p === split[s].chunk.length - 1);
52 | //const rs = fs.createReadStream(part.path, part.opt);
53 | const rs = getStream(part, part.opt);
54 |
55 | function processChunk(chunk) {
56 | const lines = (chunk_buffer + chunk).split(/\r\n|\r|\n/);
57 | chunk_buffer = lines.pop();
58 | if (lines.length > 0) {
59 | firstLine = lines[0];
60 | firstLineFound = true;
61 | //rs.destroy();
62 | } else rs.once('data', processChunk);
63 | }
64 |
65 | rs.once('data', processChunk);
66 |
67 | rs.on('end', function () {
68 | if (firstLineFound) done(firstLine);
69 | else if (!isLastPart) partDone();
70 | else if (isLastSplit) done(chunk_buffer);
71 | else {
72 | getFirstLine(split, chunk_buffer, s + 1, getStream, done);
73 | }
74 | });
75 | }
76 |
77 | function end() {
78 | if (++p < split[s].chunk.length) readPart(split[s].chunk[p], end);
79 | }
80 |
81 | readPart(split[s].chunk[p], end);
82 | }
83 |
84 | function readSplit(split, s, processLine, splitDone, getStream) {
85 | if (split.length === 0) return splitDone();
86 | const isFirstSplit = (split[s].index === 0);
87 | const isLastSplit = (split[s].index === (split.length - 1));
88 | let chunk_buffer = '';
89 | let p = 0;
90 | let hasToSkipFirstLine = isFirstSplit ? false : undefined;
91 | let firstLineFound = isFirstSplit ? true : false;
92 |
93 | function readPart(part, partDone) {
94 | const isFirstPart = (p === 0);
95 | const isLastPart = (p === split[s].chunk.length - 1);
96 | const opt = (!isFirstSplit && isFirstPart) ? {start: part.opt.start - 1, end: part.opt.end} : part.opt;
97 | //const rs = fs.createReadStream(part.path, opt);
98 | const rs = getStream(part, opt);
99 | let chunkLastChar = '';
100 |
101 | function processChunkOnce(chunk) {
102 | // console.log('Split n° %d found chunk = %s', s, String(chunk).replace(/\n/g, '*'))
103 | if (hasToSkipFirstLine === undefined) {
104 | chunk = String(chunk);
105 | hasToSkipFirstLine = (chunk.charAt(0) !== '\n');
106 | // console.log('Has to skip first line = ' + hasToSkipFirstLine)
107 | chunk = chunk.substr(1);
108 | // console.log('Chunk after first byte test = ' + chunk)
109 | if (!hasToSkipFirstLine) firstLineFound = true;
110 | }
111 | const str = (chunk_buffer + chunk);
112 | chunkLastChar = str.charAt(str.length - 1);
113 | const lines = str.split(/\r\n|\r|\n/);
114 | chunk_buffer = lines.pop();
115 | if (lines.length) {
116 | firstLineFound = true;
117 | const start = hasToSkipFirstLine ? 1 : 0;
118 | for (let i = start; i < lines.length; i++) processLine(lines[i]);
119 | if (lines.length === 1) chunkLastChar = '';
120 | rs.on('data', processChunk);
121 | // console.log('Found first line')
122 | } else rs.once('data', processChunkOnce);
123 | }
124 |
125 | const processChunk = function(chunk) {
126 | const str = chunk_buffer + chunk;
127 | chunkLastChar = str.charAt(str.length - 1);
128 | const lines = str.split(/\r\n|\r|\n/);
129 | chunk_buffer = lines.pop();
130 | for (let i = 0; i < lines.length; ++i) processLine(lines[i]);
131 | };
132 |
133 | rs.on('end', function () {
134 | // console.log(chunk_buffer)
135 | if (!isLastPart) return partDone();
136 | if (isLastSplit) {
137 | if (!firstLineFound) {
138 | firstLineFound = true;
139 | if (!hasToSkipFirstLine) processLine(chunk_buffer);
140 | } else processLine(chunk_buffer);
141 | splitDone();
142 | } else {
143 | if (!firstLineFound) {
144 | if (chunkLastChar === '\n') {
145 | firstLineFound = true;
146 | if (!hasToSkipFirstLine) processLine(chunk_buffer);
147 | }
148 | splitDone();
149 | } else {
150 | if (chunkLastChar === '\n') {
151 | processLine(chunk_buffer);
152 | splitDone();
153 | } else {
154 | if (chunk_buffer === '') {
155 | splitDone();
156 | } else {
157 | getFirstLine(split, chunk_buffer, s + 1, getStream, function(firstline) {
158 | processLine(firstline);
159 | splitDone();
160 | });
161 | }
162 | }
163 | }
164 | }
165 | });
166 |
167 | rs.once('data', processChunkOnce);
168 | }
169 |
170 | function end() {
171 | if (++p < split[s].chunk.length)
172 | readPart(split[s].chunk[p], end);
173 | }
174 |
175 | readPart(split[s].chunk[p], end);
176 | }
177 |
178 | module.exports.splitLocalFile = splitLocalFile;
179 | module.exports.splitDistributedFile = splitDistributedFile;
180 | module.exports.readSplit = readSplit;
181 |
--------------------------------------------------------------------------------
/lib/worker-local.js:
--------------------------------------------------------------------------------
1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0
2 |
3 | // worker module
4 |
5 | 'use strict';
6 |
7 | const fs = require('fs');
8 | const zlib = require('zlib');
9 | const url = require('url');
10 | const stream = require('stream');
11 |
12 | const mkdirp = require('mkdirp');
13 | const uuid = require('uuid');
14 | const S3 = require('aws-sdk/clients/s3');
15 | const azure = require('azure-storage');
16 | const parquet = require('./stub-parquet.js');
17 |
18 | const Dataset = require('./dataset.js');
19 | const Task = require('./task.js');
20 | const sizeOf = require('./rough-sizeof.js');
21 | const Lines = require('./lines.js');
22 | const readSplit = require('./readsplit.js').readSplit;
23 |
24 | const workerId = process.argv[2];
25 | let memory = process.argv[3];
26 |
27 | const mm = new MemoryManager(memory);
28 |
29 | const start = Date.now();
30 |
31 | if (process.env.SKALE_RANDOM_SEED)
32 | Dataset.setRandomSeed(process.env.SKALE_RANDOM_SEED);
33 |
34 | process.title = 'skale-worker-' + workerId;
35 |
36 | process.on('disconnect', function () {
37 | log('disconnected, exit');
38 | process.exit();
39 | });
40 |
41 | process.on('message', function (msg) {
42 | if (typeof msg === 'object' && msg.req) {
43 | switch (msg.req.cmd) {
44 | case 'runTask':
45 | runTask(msg);
46 | break;
47 | case 'runztask':
48 | runztask(msg);
49 | break;
50 | }
51 | }
52 | });
53 |
54 | function runztask(msg) {
55 | const file = msg.req.args;
56 | fs.readFile(file, function (err, data) {
57 | fs.unlink(file, function () {});
58 | if (err) throw new Error(err);
59 | zlib.gunzip(data, {chunkSize: 65536}, function (err, data) {
60 | if (err) throw new Error(err);
61 | msg.req.args = data;
62 | runTask(msg);
63 | });
64 | });
65 | }
66 |
67 | function runTask(msg) {
68 | const task = parseTask(msg.req.args);
69 | task.workerId = workerId;
70 | task.grid = {host: {}};
71 | task.mm = mm;
72 | // Expose system core dependencies explicitly for user-evaluated code in workers
73 | // Those dependencies do not need to be serialized
74 | global.azure = azure;
75 | global.S3 = S3;
76 | global.log = log;
77 | global.dlog = dlog;
78 | global.Lines = Lines;
79 | global.mkdirp = mkdirp;
80 | global.mm = mm;
81 | global.parquet = parquet;
82 | global.readSplit = readSplit;
83 | global.uuid = uuid;
84 |
85 | global.fs = fs;
86 | global.stream = stream;
87 | global.url = url;
88 | global.zlib = zlib;
89 |
90 | // Indirect eval to install the user dependencies bundle in the worker global context
91 | (0, eval)(task.bundle);
92 | task.run(function (result) {
93 | delete msg.req.args;
94 | msg.result = result;
95 | msg.result.workerId = workerId;
96 | process.send(msg);
97 | if (global.gc) {
98 | setImmediate(function () {
99 | const gcs = Date.now();
100 | global.gc();
101 | dlog(gcs, 'gc');
102 | });
103 | }
104 | else log('no global.gc');
105 | });
106 | }
107 |
108 | function parseTask(str) {
109 | //var i, j, n, ref;
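// The JSON reviver below turns serialized function sources back into callable
// functions: classic 'function (...) {...}' strings are rebuilt with the Function
// constructor, while ES6 arrow functions (e.g. 'x => x + 1') are revived through
// an indirect eval.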
110 | const task = JSON.parse(str, function (key, value) {
111 | if (typeof value === 'string') {
112 | // String value can be a regular function or an ES6 arrow function
113 | if (value.substring(0, 8) == 'function') {
114 | const args = value.match(/\(([^)]*)/)[1];
115 | const body = value.replace(/^function\s*[^)]*\)\s*{/, '').replace(/}$/, '');
116 | value = new Function(args, body);
117 | } else if (value.match(/^\s*\(\s*[^(][^)]*\)\s*=>/) || value.match(/^\s*\w+\s*=>/))
118 | value = ('indirect', eval)(value);
119 | }
120 | return value;
121 | });
122 |
123 | for (let i in task.nodes) {
124 | const n = task.nodes[i];
125 | for (let j in n.dependencies) {
126 | const ref = n.dependencies[j];
127 | n.dependencies[j] = task.nodes[ref];
128 | }
129 | for (let j in n.partitions) {
130 | Object.setPrototypeOf(task.nodes[i].partitions[j], Dataset.Partition.prototype);
131 | task.nodes[i].partitions[j].count = 0;
132 | task.nodes[i].partitions[j].bsize = 0;
133 | task.nodes[i].partitions[j].tsize = 0;
134 | task.nodes[i].partitions[j].skip = false;
135 | }
136 | if (n.type) {
137 | Object.setPrototypeOf(task.nodes[i], Dataset[n.type].prototype);
138 | }
139 | if (n.partitioner && n.partitioner.type) {
140 | Object.setPrototypeOf(n.partitioner, Dataset[n.partitioner.type].prototype);
141 | }
142 | }
143 | Object.setPrototypeOf(task, Task.prototype);
144 | //log('task:', JSON.stringify(task, null, 2));
145 | return task;
146 | }
147 |
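// MemoryManager keeps a rough account of worker memory (in MB), splitting the
// budget between cached partitions (storage), shuffle buffers and collect buffers,
// and indexes in-memory partitions by 'datasetId.partitionIndex'.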
148 | function MemoryManager(memory = 1024) {
149 | const Mb = 1024 * 1024;
150 | const MAX_MEMORY = (memory - 100) * Mb;
151 | const maxStorageMemory = MAX_MEMORY * 0.4;
152 | const maxShuffleMemory = MAX_MEMORY * 0.2;
153 | const maxCollectMemory = MAX_MEMORY * 0.2;
154 |
155 | this.storageMemory = 0;
156 | this.shuffleMemory = 0;
157 | this.collectMemory = 0;
158 | this.sizeOf = sizeOf;
159 |
160 | this.storageFull = function () {return (this.storageMemory > maxStorageMemory);};
161 | this.shuffleFull = function () {return (this.shuffleMemory > maxShuffleMemory);};
162 | this.collectFull = function () {return (this.collectMemory > maxCollectMemory);};
163 |
164 | this.partitions = {};
165 | this.register = function (partition) {
166 | const key = partition.datasetId + '.' + partition.partitionIndex;
167 | if (!(key in this.partitions)) this.partitions[key] = partition;
168 | };
169 |
170 | this.unregister = function (partition) {
171 | this.partitions[partition.datasetId + '.' + partition.partitionIndex] = undefined;
172 | };
173 |
174 | this.isAvailable = function (partition) {
175 | return (this.partitions[partition.datasetId + '.' + partition.partitionIndex] !== undefined);
176 | };
177 | }
178 |
179 | let log;
180 | let dlog;
181 | if (process.env.SKALE_DEBUG > 1) {
182 | log = function log() {
183 | const args = Array.prototype.slice.call(arguments);
184 | args.unshift('[worker-' + process.argv[2] + ' ' + (Date.now() - start) / 1000 + 's]');
185 | console.error.apply(null, args);
186 | };
187 | dlog = function dlog() {
188 | const args = Array.prototype.slice.call(arguments);
189 | const now = Date.now();
190 | const lstart = args.shift();
191 | args.unshift('[worker-' + process.argv[2] + ' ' + (now - start) / 1000 + 's]');
192 | args.push('in ' + (now - lstart) / 1000 + 's');
193 | console.error.apply(null, args);
194 | };
195 | } else {
196 | dlog = log = function noop() {};
197 | }
198 |
--------------------------------------------------------------------------------
/examples/ml/binary-classification/adult.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | 'use strict';
4 |
5 | // Wrap code in an ES7 async function, to limit callback nesting
6 | (async function main() {
7 |
8 | // Adult dataset processing as per http://scg.sdsu.edu/dataset-adult_r/
9 | // In this example we:
10 | // - train a logistic regression on the adult training set.
11 | // - evaluate the model on the adult test set
12 | // - generate ROC curves as png images
13 |
14 | const sc = require('skale').context();
15 | const ml = require('skale/ml');
16 | const plot = require('plotter').plot; // Todo: should be replaced by D3
17 |
18 | // Todo: features should be automatically extracted from dataset + type schema
19 | const metadata = {
20 | workclass: [
21 | 'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov',
22 | 'State-gov', 'Without-pay', 'Never-worked'
23 | ],
24 | education: [
25 | 'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm',
26 | 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate',
27 | '5th-6th', 'Preschool'
28 | ],
29 | maritalstatus: [
30 | 'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
31 | 'Married-spouse-absent', 'Married-AF-spouse'
32 | ],
33 | occupation: [
34 | 'Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
35 | 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical',
36 | 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv',
37 | 'Armed-Forces'
38 | ],
39 | relationship: [
40 | 'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'
41 | ],
42 | race: [
43 | 'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'
44 | ],
45 | sex: [
46 | 'Female', 'Male'
47 | ],
48 | nativecountry: [
49 | 'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
50 | 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China',
51 | 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica',
52 | 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic',
53 | 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala',
54 | 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador',
55 | 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'
56 | ]
57 | };
58 |
59 | function featurize(data, metadata) {
60 | const label = (data[14] === '>50K' || data[14] === '>50K.') ? 1 : -1;
61 | const features = [
62 | Number(data[0]), // 1 age
63 | metadata.workclass.indexOf(data[1]), // 2 workclass
64 | Number(data[2]), // 3 fnlwgt
65 | // metadata.education.indexOf(data[3]), // education (redundant with education-num)
66 | Number(data[4]), // 4 education-num
67 | metadata.maritalstatus.indexOf(data[5]), // 5 marital-status
68 | metadata.occupation.indexOf(data[6]), // 6 occupation
69 | metadata.relationship.indexOf(data[7]), // 7 relationship
70 | metadata.race.indexOf(data[8]), // 8 race
71 | metadata.sex.indexOf(data[9]), // 9 sex
72 | Number(data[10]), // 10 capital-gain
73 | Number(data[11]), // 11 capital-loss
74 | Number(data[12]), // 12 hours-per-week
75 | metadata.nativecountry.indexOf(data[13]) // 13 native-country
76 | ];
77 | return [label, features];
78 | }
79 |
80 | const trainingSet = sc.textFile(__dirname + '/dataset/adult-0*.csv')
81 | .filter(a => a[0] !== 'a') // filter out header
82 | .map(line => line.split(',').map(str => str.trim())) // split csv lines
83 | .filter(data => data.length === 15 && data.indexOf('?') === -1) // remove incomplete data
84 | .map(featurize, metadata) // transform string data to number
85 | .persist();
86 |
87 | const testSet = sc.textFile(__dirname + '/dataset/adult-1*.csv')
88 | .filter(a => a[0] !== 'a') // filter out header
89 | .map(line => line.split(',').map(str => str.trim())) // split csv lines
90 | .filter(data => data.length === 15 && data.indexOf('?') === -1) // remove incomplete data
91 | .map(featurize, metadata); // transform string data to number
92 |
93 | // Standardize features to zero mean and unit variance
94 | const scaler = new ml.StandardScaler();
95 |
96 | await scaler.fit(trainingSet.map(point => point[1]));
97 |
98 | // Use scaler to standardize training and test datasets
99 | const trainingSetStd = trainingSet
100 | .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler);
101 |
102 | const testSetStd = testSet
103 | .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler);
104 |
105 | // Train logistic regression with SGD on standardized training set
106 | const nIterations = 10;
107 | const parameters = {loss: 'log', penalty: 'l2', regParam: 0.001, stepSize: 1, proba: true};
108 | const model = new ml.SGDLinearModel(parameters);
109 |
110 | await model.fit(trainingSetStd, nIterations);
111 |
112 | const predictionAndLabels = testSetStd.map((p, model) => [model.predict(p[1]), p[0]], model);
113 | const metrics = await ml.classificationMetrics(predictionAndLabels, {steps: 100});
114 |
115 | console.log('model weights:', model.weights);
116 | console.log('intercept:', model.intercept);
117 | console.log('PR AUC:', metrics.auPR);
118 | console.log('ROC AUC:', metrics.auROC);
119 | console.log('ROC curve: roc.png');
120 | console.log('Best threshold (F1 max):', metrics.threshold);
121 | sc.end();
122 |
123 | // Plot ROC curve
124 | const xy = {};
125 | for (let i = 0; i < metrics.rates.length; i++)
126 | xy[metrics.rates[i].fpr || '0.00000000000'] = metrics.rates[i].recall;
127 | const data = {};
128 | data['regParam: ' + parameters.regParam + ', stepSize: ' + parameters.stepSize] = xy;
129 | data['Random'] = {0: 0, 1: 1};
130 | plot({
131 | title: 'Logistic Regression ROC Curve',
132 | data: data,
133 | filename: 'roc.png',
134 | });
135 |
136 | // Plot PR curve
137 | const xy0 = {};
138 | for (let i = 0; i < metrics.rates.length; i++)
139 | xy0[metrics.rates[i].recall || '0.00000000000'] = metrics.rates[i].precision;
140 | const data0 = {};
141 | data0['regParam: ' + parameters.regParam + ', stepSize: ' + parameters.stepSize] = xy0;
142 | plot({
143 | title: 'Logistic Regression PR Curve',
144 | data: data0,
145 | filename: 'pr.png',
146 | });
147 |
148 | })(); // main
149 |
--------------------------------------------------------------------------------
/docs/machine-learning.md:
--------------------------------------------------------------------------------
1 | # Machine Learning module
2 |
3 | The Machine Learning (ML) module provides scalable functions for
4 | supervised (classification, regression) and unsupervised (clustering)
5 | statistical learning on top of skale datasets and distributed
6 | map-reduce engine.
7 |
8 | The module can be loaded using:
9 | ```js
10 | var ml = require('skale/ml')
11 | ```
12 |
13 | ## classificationMetrics(measures[, options][, done])
14 |
15 | This [action] computes various metrics to measure classification performance.
16 |
17 | - *measures*: a dataset where entries are in the form
18 | `[prediction, label]` with *prediction* and *label* being numbers where
19 | only the sign of the value is used: positive means true, negative means false.
20 | - *options*: an optional *Object* with the following fields:
21 | - *steps*: integer *Number* defining the number of points in the Receiver
22 |     Operating Characteristic (ROC) curve. Default: 10
23 | - *done*: an optional callback of the form `function(error, result)`
24 | which is called at completion. *result* is an object with the following fields:
25 |   - *rates*: an array of *steps* entries of raw confusion matrix values
26 | - *auROC*: area under ROC curve, using the trapezoidal rule
27 | - *auPR*: area under Precision Recall curve, using the trapezoidal rule
28 |
29 | Example:
30 | ```js
31 | var model = new ml.SGDLinearModel();
32 | await model.fit(trainingSet);
33 | var predictionAndLabels = testSet.map((p, model) => [model.predict(p[1]), p[0]], model);
34 | var metrics = await ml.classificationMetrics(predictionAndLabels)
35 | console.log('ROC AUC:', metrics.auROC);
36 | // 0.869
37 | ```
38 |
39 | ## KMeans(nbClusters[, options])
40 |
41 | Creates a clustering model fitted via the [K-Means] algorithm.
42 |
43 | - *nbClusters*: *Number*, specifying the number of clusters in the model
44 | - *options*: an optional *Object* with the following fields:
45 | - *maxMse*: *Number* defining the maximum mean square error between cluster
46 |     centers since the previous iteration, used to stop iterating. Defaults to 1e-7.
47 | - *maxIterations*: *Number* defining the maximum number of iterations. Default: 100.
48 | - *means*: an initial array of vectors (arrays) of numbers, default undefined.
49 |
50 | Example:
51 | ```js
52 | const dataset = sc.parallelize([
53 | [1, 2], [1, 4], [1, 0],
54 | [4, 2], [4, 4], [4, 0]
55 | ]);
56 | const kmeans = ml.KMeans(2);
57 | await kmeans.fit(dataset);
58 | kmeans.means
59 | // [ [ 2.5, 1 ], [ 2.5, 4 ] ]
60 | kmeans.predict([0, 0])
61 | // 0
62 | kmeans.predict([4, 4])
63 | // 1
64 | ```
65 |
66 | ### kmeans.fit(trainingSet[, done])
67 |
68 | This [action] updates the *kmeans* model by fitting it to the input
69 | dataset *trainingSet*. The *done()* callback is called at completion
70 | if provided, otherwise an [ES6 promise] is returned.
71 |
72 | - *trainingSet*: a dataset where entries are in the following format:
73 | `[feature0, feature1, ...]` with *featureN* being a float number.
74 | - *done*: an optional callback of the form `function(error)`
75 | which is called at completion.
76 |
77 | ### kmeans.predict(sample)
78 |
79 | Returns the closest cluster index for the *sample*.
80 |
81 | - *sample*: an *Array* with the format `[feature0, feature1, ...]`
82 | with *featureN* being a float number.
83 |
84 | ## SGDLinearModel([options])
85 |
86 | Creates a regularized linear model fitted via [stochastic
87 | gradient descent] learning. Such a model can be used either for
88 | regression or classification, as the training method is identical;
89 | only the prediction changes. SGD is sensitive to the scaling
90 | of the features. For best results, the data should have zero mean and
91 | unit variance, which can be achieved with [ml.StandardScaler].
92 |
93 | The model it fits is controlled by the *loss* option; by default,
94 | it fits a linear [support vector machine] (SVM). A regularization term
95 | is added to the loss, by default the squared Euclidean norm L2.
96 |
97 | - *options*: an *Object* with the following fields:
98 | - *fitIntercept*: *Boolean* indicating whether to include an intercept. Default: *true*
99 | - *loss*: *String* specifying the [loss function] to be used. Possible values are:
100 | - `hinge`: (default), gives a linear SVM
101 | - `log`: gives logistic loss, a probabilistic classifier
102 | - `square`: gives square loss fit
103 | - *penalty*: *String* specifying the [regularization] term. Possible values are:
104 |   - `l2`: (default) squared Euclidean norm L2, the standard regularizer for linear SVM models
105 | - `l1`: absolute norm L1, might bring sparsity to the model, not achievable with `l2`
106 | - `none`: zero penalty
107 | - *proba*: *Boolean* (default *false*). If *true*, predict returns a probability rather than a raw number. Only applicable when the logistic loss is selected.
108 | - *regParam*: *Number* >= 0, defaults to 0.001, defines the trade-off between the
109 | two goals of minimizing the loss (i.e. training error) and minimizing model complexity
110 | (i.e. to avoid overfitting)
111 | - *stepSize*: *Number* >= 0, defaults to 1, defines the initial step size of the gradient
112 | descent
113 |
114 | Example:
115 | ```js
116 | const trainingSet = sc.parallelize([
117 | [1, [0.5, -0.7]],
118 | [-1, [-0.5, 0.7]]
119 | ]);
120 | const sgd = new ml.SGDLinearModel()
121 | await sgd.fit(trainingSet, 2)
122 | sgd.weights
123 | // [ 0.8531998372026804, -1.1944797720837526 ]
124 | sgd.predict([2, -2])
125 | // 0.9836229103782058
126 | ```
127 |
128 | ### sgd.fit(trainingSet, iterations[, done])
129 |
130 | This [action] updates the *sgd* model by fitting it to the
131 | input dataset *trainingSet*. The *done()* callback is called at
132 | completion if provided, otherwise an [ES6 promise] is returned.
133 |
134 | - *trainingSet*: a dataset where entries are in the following format:
135 | `[label, [feature0, feature1, ...]]` with *label* being either 1 or -1,
136 | and *featureN* being a float number, preferably with zero mean and
137 | unit variance (in range [-1, 1]). Sparse vectors with undefined features
138 | are supported.
139 | - *done*: an optional callback of the form `function(error)`
140 | which is called at completion.
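
A minimal sketch of a sparse training entry (values are illustrative, reusing the `sc` context and the `sgd` model from the example above); undefined features are simply treated as 0 by the loss functions:

```js
const sparseSet = sc.parallelize([
  [ 1, [0.5, undefined, -0.7]],   // feature index 1 left undefined: sparse vector
  [-1, [-0.5, 0.2, 0.7]]
]);
await sgd.fit(sparseSet, 2);
```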
141 |
142 | ### sgd.predict(sample)
143 |
144 | Predicts a label for a given *sample*, returning a numerical value
145 | which can be converted to a label: -1 if negative, or 1 if positive.
146 |
147 | If selected loss is `log`, the returned value can be interpreted as
148 | a probability of the corresponding label.
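
For instance, a small sketch (reusing the `sgd` model fitted above) of turning the raw prediction into a hard label:

```js
const raw = sgd.predict([2, -2]);
const label = raw >= 0 ? 1 : -1;  // hard label from the sign
// with loss 'log' and proba: true, raw can be read as the probability of label 1
```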
149 |
150 | ## StandardScaler()
151 |
152 | Creates a standard scaler which standardizes features by removing
153 | the mean and scaling to unit variance.
154 |
155 | Centering and scaling happen independently on each feature by
156 | computing the relevant statistics on the samples in the training
157 | set.
158 |
159 | Standardization of datasets is a common requirement for many machine
160 | learning estimators. They might behave badly if the individual
161 | features do not more or less look like standard normally distributed
162 | data: Gaussian with zero mean and unit variance.
163 |
164 | Example:
165 | ```js
166 | var data = sc.parallelize([[0, 0], [0, 0], [1, 1], [1, 1]]);
167 | var scaler = new ml.StandardScaler();
168 | await scaler.fit(data);
169 | scaler
170 | //StandardScaler {
171 | // transform: [Function],
172 | // count: 4,
173 | // mean: [ 0.5, 0.5 ],
174 | // std: [ 0.5, 0.5 ] }
175 | var scaled = data.map((p, scaler) => scaler.transform(p), scaler)
176 | console.log(await scaled.collect());
177 | // [ [ -1, -1 ], [ -1, -1 ], [ 1, 1 ], [ 1, 1 ] ]
178 | scaler.transform([2, 2])
179 | // [ 3, 3 ]
180 | ```
181 |
182 | ### scaler.fit(trainingSet[, done])
183 |
184 | This [action] updates *scaler* by computing the mean and std of
185 | *trainingSet* to be used for later scaling. The *done()* callback
186 | is called at completion if provided, otherwise an [ES6 promise] is
187 | returned.
188 |
189 | - *trainingSet*: a dataset where entries are in the format
190 | `[feature0, feature1, ...]` with *featureN* being a *Number*
191 | - *done*: an optional callback of the form `function (error)` which
192 | is called at completion.
193 |
194 | ### scaler.transform(sample)
195 |
196 | Returns the standardized scaled value of *sample*.
197 |
198 | - *sample*: an *Array* with the format `[feature0, feature1, ...]`
199 | with *featureN* being a float number.
200 |
201 | [readable stream]: https://nodejs.org/api/stream.html#stream_class_stream_readable
202 | [ES6 promise]: https://promisesaplus.com
203 | [action]: concepts#actions
204 | [K-Means]: https://en.wikipedia.org/wiki/K-means_clustering
205 | [loss function]: https://en.wikipedia.org/wiki/Loss_functions_for_classification
206 | [logistic regression]: https://en.wikipedia.org/wiki/Logistic_regression
207 | [ml.StandardScaler]: #mlstandardscaler
208 | [parquet]: https://parquet.apache.org
209 | [regularization]: https://en.wikipedia.org/wiki/Regularization_(mathematics)
210 | [stochastic gradient descent]: https://en.wikipedia.org/wiki/Stochastic_gradient_descent
211 | [support vector machine]: https://en.wikipedia.org/wiki/Support_vector_machine
212 |
--------------------------------------------------------------------------------
/docs/skale-hackers-guide.md:
--------------------------------------------------------------------------------
1 | # Skale Hacker's Guide
2 |
3 | ## Introduction
4 |
5 | Skale is a fast and general-purpose distributed data processing system. It provides a high-level API in JavaScript and an optimized parallel execution engine.
6 |
7 | This document gives an overview of its design and architecture, then some details on internals and code organisation, and finally presents how to extend various parts of the engine.
8 |
9 | It is assumed that the reader is already familiar with using skale and with the [reference guide], at least the [core concepts].
10 |
11 | ## Architecture
12 |
13 | This section describes the core architecture of skale. At a high level, a skale application consists of a *master* program which launches various parallel tasks on *worker* nodes. The tasks read and write *datasets*, i.e. arrays of data of arbitrary size, split into *partitions* distributed over the workers.
14 |
15 | ### Master
16 |
17 | The corresponding code is in [context-local.js] for the standalone mode, or [context.js] for the distributed mode, the only difference between the two being the way workers are created and connected to the master.
18 |
19 | In a nutshell, the master performs the following:
20 |
21 | 1. Creates a new skale [context] object to hold the state of cluster, datasets and tasks, then in this context:
22 | 2. Allocates a new cluster, i.e. an array of [workers]: connected slave processes on each worker host (1 process per CPU).
23 | 3. [Compiles then runs] an execution graph derived from the user code, the *job*, consisting of a sequence of *stages*. This compilation is only triggered when an *action* is encountered, i.e. evaluation is *lazy* (see the sketch after this list).
24 | 4. For each stage, [runs the next task]: serialize and send the stage code and metadata about input dataset partitions to the next free worker, trigger execution, wait for the result, and repeat until all of the stage's tasks are completed.
25 |
26 | *Stage explanation here*
27 |
28 | ### Worker
29 |
30 | The corresponding code is in [worker-local.js] for the standalone mode and [worker.js] for the distributed mode. The common part is implemented in [task.js].
31 |
32 | A worker performs the following:
33 |
34 | 1. Connects to the master and waits for the next task to execute, then for each task:
35 | 2. Selects the input partition(s); possible cases are:
36 |    - in-memory local partition computed from a previous stage, already loaded
37 |    - on-disk local partition computed from a previous stage, spilled to disk
38 |    - remote partition stored on a separate worker (post-shuffle)
39 |    - external data source, through a source connector
40 | 3. Iterates over the partition(s), applying to each record a *pipeline* of functions as defined by the user for the current stage, for example a filter function followed by a mapper function followed by a reducer function (see the sketch after this list)
41 | 4. The last function of the pipeline is either an *action* (a function returning data to the master), or a pre-shuffle function (saving data on disk for remote access at the start of the next stage, i.e. post-shuffle)
42 | 5. At the end of the task, a result is sent to the master, usually metadata for output files, used by the next stage or by the final combiner action
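
Conceptually, the pipeline in step 3 behaves as if the stage's user functions were composed into a single per-record function. The following is a simplified illustration of that idea, not the actual task.js implementation:

```javascript
// Simplified illustration of per-record pipelining for a stage made of a
// filter, a mapper and a per-partition reducer (not the actual engine code).
function runStageOnPartition(partition, filterFn, mapFn, reduceFn, init) {
  let acc = init;
  for (const record of partition) {     // 3. iterate over the partition
    if (!filterFn(record)) continue;    //    narrow step: filter
    acc = reduceFn(acc, mapFn(record)); //    narrow steps: map, then reduce
  }
  return acc;                           // 5. partial result sent back to the master
}

// Example: keep even numbers, multiply by 10, then sum => 60
runStageOnPartition([1, 2, 3, 4], x => x % 2 === 0, x => x * 10, (a, b) => a + b, 0);
```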
43 |
44 | *Explain communication model here, for data transfers and remote procedure calls*
45 |
46 | ### Datasets
47 |
48 | The main abstraction provided by skale is a *dataset*, which is similar to a Javascript array, but partitioned across the workers so that it can be operated on in parallel.
49 |
50 | A dataset object is always created first on the master side, either by a *source* function, which returns a dataset from an external input or from scratch, or by a *transformation* function, which takes a dataset as input and outputs a new dataset.
51 |
52 | The same code, in [dataset.js], is loaded both in the master and in the workers. A dataset object instantiated on the master is replicated on each worker through the task [serialization] and [deserialization] process.
53 |
54 | From an object oriented perspective, all *sources* and *transformations*, being dataset constructors, are classes which derive from the *Dataset* class, whereas *actions*, which operate on a dataset object, are simply methods of the *Dataset* class.
55 |
56 | Dataset objects have methods that run either on the master side or on the worker side (never on both); the following table provides a summary of these:
57 |
58 | |Dataset method | on master | on worker | type | description |
59 | |-------------------|-----------|-----------|------|-------------|
60 | |getPartitions | ✓ | | source, post-shuffle transform| Allocate output dataset partitions |
61 | |getPreferedLocation| ✓ | | source | Return the preferred worker for a given partition |
62 | |iterate | | ✓ | source, post-shuffle transform| Iterate the stage pipeline over partition entries|
63 | |transform | | ✓ | transform | Apply a custom operation on each input dataset entry, pre-shuffle|
64 | |spillToDisk | | ✓ | pre-shuffle transform | Dump partition data to disk during pre-shuffle, for the next stage|
65 |
66 | ### Local standalone mode
67 |
68 | The standalone mode is the default operating mode. All the processes, master and workers, run on the local host, using the [cluster] core NodeJS module. This mode is the simplest to operate: no dependency, and no server nor cluster setup and management required. It is used as any standard NodeJS package: simply `require('skale')`, and that's it.
69 |
70 | This mode is perfect for development, fast prototyping and tests on a single machine (e.g. a laptop). For unlimited scalability, see the distributed mode below.
71 |
72 | ### Distributed mode
73 |
74 | The distributed mode allows running the exact same code as in standalone mode over a network of multiple machines, thus achieving horizontal scalability.
75 |
76 | The distributed mode involves two executables, which must be running prior to launching application programs:
77 |
78 | - a `skale-server` process, which is the access point to which the `master` (user application) and the `workers` (running slaves) connect, either by direct TCP connections or by websockets.
79 | - a `skale-worker` process, which is a worker controller running on each machine of the computing cluster and connecting to the `skale-server`. The worker controller spawns worker processes on demand (typically one per CPU) each time a new job is submitted.
80 |
81 | To run in distributed mode, the environment variable `SKALE_HOST` must be set to the `skale-server` hostname or IP address. If unset, the application will run in standalone mode. Multiple applications, each with its own set of master and worker processes, can run simultaneously using the same server and worker controllers.
82 |
83 | Although not mandatory, running an external HTTP server on worker hosts, exposing skale temporary files, allows efficient peer-to-peer shuffle data transfer between workers. If not available, this traffic will go through the centralized `skale-server`. Any external HTTP server such as nginx, apache or busybox httpd, or even NodeJS (although not the most efficient for static file serving) will do.
84 |
85 | For further details, see command line help for `skale-worker` and `skale-server`.
86 |
87 | ## Adding a new source
88 |
89 | A source returns a dataset from an external input or from scratch. For example, to be able to process data from Kafka in a parallel manner, i.e. one topic partition per worker, one has to implement a Kafka source in skale.
90 |
91 | Adding a new source is a matter of:
92 |
93 | - Deriving a new class from the Dataset class, see for example [TextLocal], which implements a textFile source from the local filesystem
94 | - Providing a `getPartitions` method prototype, which allocates a fixed number of partitions, see [TextLocal.getPartitions] as an example of allocating one partition per file. This method runs on the master, when triggered by the action, prior to dispatching tasks to workers
95 | - Optionally providing a `getPreferedLocation` method prototype, to select a given worker according to your source semantics. If not provided, the master will by default dispatch the partition to the next free worker at execution time.
96 | - Providing an `iterate` method prototype, which operates this time on the worker to execute the stage pipeline on each partition entry. See for example [TextLocal.iterate] and [iterateStream], which processes each line of a [readable stream]. If the partition can be mapped to a readable stream, as is the case for many NodeJS connectors, one can just reuse `iterateStream` as is.
97 | - Exposing the source in the API, either by extending [textFile] to process a new URL protocol, or by adding a new source method to the context, see for example [parallelize].
98 |
99 | ## Adding a new transform
100 |
101 | A new transform can be implemented either by deriving a new class from the Dataset class and providing the dataset methods listed in the table above, or by composing existing transform methods to build a new one, see for example [distinct]; a small sketch of the composition approach follows.
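
The following is an illustrative, user-level sketch of the composition approach. The `firstByKey` helper and its semantics are assumptions for illustration, not an existing skale transform; it is built purely from the documented `reduceByKey` transform:

```javascript
// Hypothetical composed transform: keep one (arbitrary) value per key of a
// [k, v] dataset, built only from existing transforms (as distinct is).
function firstByKey(dataset) {
  return dataset.reduceByKey((acc, v) => (acc === null ? v : acc), null);
}
```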
102 |
103 | *Here give details on narrow vs wide transforms and impact on implementation*
104 |
105 | ## Adding a new action
106 |
107 | [reference guide]: https://github.com/skale-me/skale/blob/0.7.0/doc/skale-API.md
108 | [core concepts]: https://github.com/skale-me/skale/blob/0.7.0/doc/skale-API.md#core-concepts
109 | [context-local.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/context-local.js
110 | [context.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js
111 | [context]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L22
112 | [workers]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L51-L53
113 | [Compiles then runs]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L223
114 | [runs the next task]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L129
115 | [worker-local.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/worker-local.js
116 | [worker.js]: https://github.com/skale-me/skale/blob/0.7.0/bin/worker.js
117 | [task.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/task.js
118 | [dataset.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js
119 | [serialization]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L141
120 | [deserialization]: https://github.com/skale-me/skale/blob/0.7.0/bin/worker.js#L275
121 | [cluster]: https://nodejs.org/dist/latest-v8.x/docs/api/cluster.html
122 | [TextLocal]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L911-L919
123 | [TextLocal.getPartitions]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L921-L941
124 | [TextLocal.iterate]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L943
125 | [iterateStream]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L800
126 | [readable stream]: https://nodejs.org/api/stream.html#stream_class_stream_readable
127 | [textFile]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L112-121
128 | [parallelize]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L107
129 | [distinct]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L121-L125
130 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/bin/worker.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0
4 |
5 | 'use strict';
6 |
7 | const child_process = require('child_process');
8 | const fs = require('fs');
9 | const os = require('os');
10 | const cluster = require('cluster');
11 | const url = require('url');
12 | const zlib = require('zlib');
13 | const stream = require('stream');
14 |
15 | const mkdirp = require('mkdirp');
16 | const uuid = require('uuid');
17 | const S3 = require('aws-sdk/clients/s3');
18 | const azure = require('azure-storage');
19 | const parquet = require('../lib/stub-parquet.js');
20 |
21 | const SkaleClient = require('../lib/client.js');
22 | const Dataset = require('../lib/dataset.js');
23 | const Task = require('../lib/task.js');
24 | const Lines = require('../lib/lines.js');
25 | const sizeOf = require('../lib/rough-sizeof.js');
26 | const readSplit = require('../lib/readsplit.js').readSplit;
27 |
28 | const opt = require('node-getopt').create([
29 | ['h', 'help', 'print this help text'],
30 | ['d', 'debug', 'print debug traces'],
31 | ['m', 'memory=ARG', 'set max memory in MB for workers'],
32 | ['M', 'MyHost=ARG', 'advertised hostname (peer-to-peer)'],
33 | ['n', 'nworker=ARG', 'number of workers (default: number of cpus)'],
34 |   ['r', 'retry=ARG', 'number of connection retries (default 0)'],
35 |   ['s', 'slow', 'disable peer-to-peer file transfers through HTTP'],
36 | ['G', 'forcegc', 'workers force garbage collect at end of task'],
37 | ['H', 'Host=ARG', 'server hostname (default localhost)'],
38 | ['P', 'Port=ARG', 'server port (default 12346)'],
39 | ['V', 'version', 'print version']
40 | ]).bindHelp().parseSystem();
41 |
42 | if (opt.options.version) {
43 | const pkg = require('../package');
44 | return console.log(pkg.name + '-' + pkg.version);
45 | }
46 |
47 | const debug = opt.options.debug || false;
48 | const forceGc = opt.options.forcegc || false;
49 | const nworkers = +opt.options.nworker || +(process.env.SKALE_WORKER_PER_HOST ? process.env.SKALE_WORKER_PER_HOST : os.cpus().length);
50 | const memory = +(opt.options.memory || process.env.SKALE_MEMORY);
51 | const mm = new MemoryManager(memory);
52 | const start = Date.now();
53 | let cgrid;
54 | let hostname;
55 | let log;
56 | let dlog;
57 |
58 | if (!opt.options.slow)
59 | hostname = opt.options.MyHost || os.hostname();
60 |
61 | if (process.env.SKALE_DEBUG > 1) {
62 | log = function () {
63 | const args = Array.prototype.slice.call(arguments);
64 | args.unshift('[worker-controller ' + (Date.now() - start) / 1000 + 's]');
65 | console.error.apply(null, args);
66 | };
67 | } else {
68 | log = function () {};
69 | }
70 |
71 | if (cluster.isMaster) {
72 | process.title = 'skale-worker-controller';
73 | if (memory)
74 | cluster.setupMaster({execArgv: ['--expose-gc', '--max_old_space_size=' + memory]});
75 | cluster.on('exit', handleExit);
76 | const cpus = os.cpus();
77 | cgrid = new SkaleClient({
78 | debug: debug,
79 | retry: opt.options.retry,
80 | host: opt.options.Host,
81 | port: opt.options.Port,
82 | data: {
83 | type: 'worker-controller',
84 | hostname: hostname,
85 | nworkers: nworkers,
86 | ncpus: cpus.length,
87 | memory: os.totalmem(),
88 | platform: os.platform(),
89 | arch: os.arch(),
90 | cpumodel: cpus[0].model,
91 | cpuspeed: cpus[0].speed
92 | }
93 | });
94 | cgrid.on('connect', startWorkers);
95 | cgrid.on('getWorker', startWorkers);
96 | cgrid.on('close', process.exit);
97 | cgrid.on('sendFile', function (msg) {
98 | fs.createReadStream(msg.path, msg.opt).pipe(cgrid.createStreamTo(msg));
99 | });
100 | // Periodic stats
101 | fs.mkdir('/tmp/skale', function () {});
102 | setInterval(function () {
103 | const stats = { nworkers: Object.keys(cluster.workers).length };
104 | fs.writeFile('/tmp/skale/worker-controller-stats', JSON.stringify(stats), function () {});
105 | }, 3000);
106 | log('worker controller ready');
107 | } else {
108 | runWorker(opt.options.Host, opt.options.Port);
109 | }
110 |
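// Fork one worker process per requested slot and relay the workers' 'rm'
// messages to clean up per-job temporary directories exactly once.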
111 | function startWorkers(msg) {
112 | const worker = [];
113 | const removed = {};
114 | const n = msg.n || nworkers;
115 |
116 | log('worker-controller host', cgrid.uuid);
117 | for (let i = 0; i < n; i++) {
118 | worker[i] = cluster.fork({wsid: msg.wsid, rank: i, puuid: cgrid.uuid});
119 | }
120 | worker.forEach(function (w) {
121 | w.on('message', function (msg) {
122 | switch (msg.cmd) {
123 | case 'rm':
124 | if (msg.dir && !removed[msg.dir]) {
125 | removed[msg.dir] = true;
126 | child_process.execFile('/bin/rm', ['-rf', msg.dir]);
127 | }
128 | break;
129 | default:
130 | console.error('unexpected msg', msg);
131 | }
132 | });
133 | });
134 | }
135 |
136 | function handleExit(worker, code, signal) {
137 | log('worker pid', worker.process.pid, ', exited:', signal || code);
138 | }
139 |
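// Worker process entry point: connect to the skale server, register as a worker,
// then serve runTask/runztask requests from the master until the session closes.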
140 | function runWorker(host, port) {
141 | const start = Date.now();
142 | let wid = process.env.rank;
143 | let basedir;
144 | let log;
145 |
146 | if (process.env.SKALE_DEBUG > 1) {
147 | log = function () {
148 | const args = Array.prototype.slice.call(arguments);
149 | args.unshift('[worker-' + wid + ' ' + (Date.now() - start) / 1000 + 's]');
150 | console.error.apply(null, args);
151 | };
152 | dlog = function() {
153 | const args = Array.prototype.slice.call(arguments);
154 | const now = Date.now();
155 | const lstart = args.shift();
156 | args.unshift('[worker-' + wid + ' ' + (now - start) / 1000 + 's]');
157 | args.push('in ' + (now - lstart) / 1000 + 's');
158 | console.error.apply(null, args);
159 | };
160 | } else {
161 | dlog = log = function () {};
162 | }
163 | if (process.env.SKALE_RANDOM_SEED)
164 |     Dataset.setRandomSeed(process.env.SKALE_RANDOM_SEED);
165 | process.on('uncaughtException', function (err) {
166 | grid.send(grid.muuid, {cmd: 'workerError', args: err.stack});
167 | process.exit(2);
168 | });
169 |
170 | const grid = new SkaleClient({
171 | debug: debug,
172 | host: host,
173 | port: port,
174 | data: {
175 | ncpu: os.cpus().length,
176 | os: os.type(),
177 | arch: os.arch(),
178 | usedmem: process.memoryUsage().rss,
179 | totalmem: os.totalmem(),
180 | hostname: hostname || process.env.puuid,
181 | type: 'worker',
182 | wsid: Number(process.env.wsid),
183 | jobId: ''
184 | }
185 | }, function (err, res) {
186 | log('id:', res.id, ', uuid:', res.uuid);
187 | wid = 'w' + res.id;
188 | grid.host = {uuid: res.uuid, id: res.id};
189 | process.title = 'skale-worker_w' + res.id;
190 | });
191 |
192 | grid.on('error', function (err) {
193 | console.error('grid error', err);
194 | process.exit(2);
195 | });
196 |
197 | function runTask(msg) {
198 | grid.muuid = msg.data.master_uuid;
199 | const task = parseTask(msg.data.args);
200 | basedir = task.basedir;
201 | // set worker side dependencies
202 | task.workerId = 'w' + grid.id;
203 | task.mm = mm;
204 | task.grid = grid;
205 | // Set dependencies in global scope for user evaluated code in workers
206 | global.azure = azure;
207 | global.S3 = S3;
208 | global.dlog = dlog;
209 | global.log = log;
210 | global.Lines = Lines;
211 | global.mkdirp = mkdirp;
212 | global.mm = mm;
213 | global.parquet = parquet;
214 | global.readSplit = readSplit;
215 | global.uuid = uuid;
216 |
217 | global.fs = fs;
218 | global.stream = stream;
219 | global.url = url;
220 | global.zlib = zlib;
221 |
222 | // Indirect Eval to set user dependencies bundle in the worker global context
223 | (0, eval)(task.bundle);
224 | task.run(function(result) {
225 | result.workerId = task.workerId;
226 | grid.reply(msg, null, result);
227 | if (global.gc && forceGc) {
228 | setImmediate(function () {
229 | const gcs = Date.now();
230 | global.gc();
231 | dlog(gcs, 'gc');
232 | });
233 | }
234 | else log('no global.gc');
235 | });
236 | }
237 |
238 | function runztask(msg) {
239 | //log('runztask msg', msg);
240 | const file = msg.data.args;
241 | grid.muuid = msg.data.master_uuid;
242 |
243 | const s = getReadStreamSync({path: file});
244 | let data = Buffer.concat([]);
245 |
246 | s.on('data', function (chunk) {
247 | data = Buffer.concat([data, chunk]);
248 | });
249 | s.on('end', function () {
250 | //log('end stream ztask');
251 | zlib.gunzip(data, {chunkSize: 65536}, function (err, data) {
252 | if (err) throw new Error(err);
253 | msg.data.args = data;
254 | runTask(msg);
255 | });
256 | });
257 |
258 | function getReadStreamSync(fileObj, opt) {
259 | if (fs.existsSync(fileObj.path)) return fs.createReadStream(fileObj.path, opt);
260 | if (!fileObj.host) fileObj.host = grid.muuid;
261 | return grid.createStreamFrom(fileObj.host, {cmd: 'sendFile', path: fileObj.path, opt: opt});
262 | }
263 | }
264 |
265 | const request = { runTask: runTask, runztask: runztask };
266 |
267 | grid.on('remoteClose', function () {
268 | process.send({cmd: 'rm', dir: basedir});
269 | process.exit();
270 | });
271 |
272 | grid.on('request', function (msg) {
273 | try {
274 | request[msg.data.cmd](msg);
275 | } catch (error) {
276 | console.error(error.stack);
277 | grid.reply(msg, error, null);
278 | }
279 | });
280 |
281 | grid.on('sendFile', function (msg) {
282 | fs.createReadStream(msg.path, msg.opt).pipe(grid.createStreamTo(msg));
283 | });
284 | }
285 |
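// Track memory budgets for the storage, shuffle and collect areas (fixed fractions
// of the configured worker memory) and keep a registry of in-memory partitions.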
286 | function MemoryManager(memory = 1024) {
287 | const Mb = 1024 * 1024;
288 | const MAX_MEMORY = (memory - 100) * Mb;
289 | const maxStorageMemory = MAX_MEMORY * 0.4;
290 | const maxShuffleMemory = MAX_MEMORY * 0.2;
291 | const maxCollectMemory = MAX_MEMORY * 0.2;
292 |
293 | this.storageMemory = 0;
294 | this.shuffleMemory = 0;
295 | this.collectMemory = 0;
296 | this.sizeOf = sizeOf;
297 |
298 | this.storageFull = function () {return (this.storageMemory > maxStorageMemory);};
299 | this.shuffleFull = function () {return (this.shuffleMemory > maxShuffleMemory);};
300 | this.collectFull = function () {return (this.collectMemory > maxCollectMemory);};
301 |
302 | this.partitions = {};
303 | this.register = function (partition) {
304 | const key = partition.datasetId + '.' + partition.partitionIndex;
305 | if (!(key in this.partitions)) this.partitions[key] = partition;
306 | };
307 |
308 | this.unregister = function (partition) {
309 | this.partitions[partition.datasetId + '.' + partition.partitionIndex] = undefined;
310 | };
311 |
312 | this.isAvailable = function (partition) {
313 | return (this.partitions[partition.datasetId + '.' + partition.partitionIndex] !== undefined);
314 | };
315 | }
316 |
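// Rebuild a Task object from its serialized JSON form: revive plain and arrow
// functions, re-link dataset dependencies, and restore the prototypes of datasets,
// partitions and partitioners.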
317 | function parseTask(str) {
318 | const task = JSON.parse(str, function (key, value) {
319 | if (typeof value === 'string') {
320 | // String value can be a regular function or an ES6 arrow function
321 | if (value.substring(0, 8) === 'function') {
322 | const args = value.match(/\(([^)]*)/)[1];
323 | const body = value.replace(/^function\s*[^)]*\)\s*{/, '').replace(/}$/, '');
324 | value = new Function(args, body);
325 | } else if (value.match(/^\s*\(\s*[^(][^)]*\)\s*=>/) || value.match(/^\s*\w+\s*=>/))
326 | value = ('indirect', eval)(value);
327 | }
328 | return value;
329 | });
330 |
331 | for (let i in task.nodes) {
332 | const n = task.nodes[i];
333 | for (let j in n.dependencies) {
334 | const ref = n.dependencies[j];
335 | n.dependencies[j] = task.nodes[ref];
336 | }
337 | for (let j in n.partitions) {
338 | Object.setPrototypeOf(task.nodes[i].partitions[j], Dataset.Partition.prototype);
339 | task.nodes[i].partitions[j].count = 0;
340 | task.nodes[i].partitions[j].bsize = 0;
341 | task.nodes[i].partitions[j].tsize = 0;
342 | task.nodes[i].partitions[j].skip = false;
343 | }
344 | if (n.type) {
345 | Object.setPrototypeOf(task.nodes[i], Dataset[n.type].prototype);
346 | }
347 | if (n.partitioner && n.partitioner.type) {
348 | Object.setPrototypeOf(n.partitioner, Dataset[n.partitioner.type].prototype);
349 | }
350 | }
351 | Object.setPrototypeOf(task, Task.prototype);
352 | //log('task:', JSON.stringify(task, null, 2));
353 | return task;
354 | }
355 |
--------------------------------------------------------------------------------
/docs/concepts.md:
--------------------------------------------------------------------------------
1 | ## Introduction
2 |
3 | Skale is a fast and general purpose distributed data processing
4 | system. It provides a high-level API in Javascript and an optimized
5 | parallel execution engine.
6 |
7 | A Skale application consists of a *master* program that runs the
8 | user code and executes various *parallel operations* on a cluster
9 | of *workers*.
10 |
11 | The main abstraction Skale provides is a *dataset*, which is similar
12 | to a Javascript *array*, but partitioned across the workers so that
13 | it can be operated on in parallel.
14 |
15 | There are several ways to create a dataset: *parallelizing* an existing
16 | array in the master program, or referencing a dataset in a distributed
17 | storage system (such as HDFS), or *streaming* the content of any
18 | source that can be processed through Node.js *Streams*. We call
19 | *source* a function which initializes a dataset.
20 |
21 | Datasets support two kinds of operations: *transformations*, which create
22 | a new dataset from an existing one, and *actions*, which
23 | return a value to the *master* program after running a computation
24 | on the dataset.
25 |
26 | For example, `map` is a transformation that applies a function to
27 | each element of a dataset, returning a new dataset. On the other
28 | hand, `reduce` is an action that aggregates all elements of a dataset
29 | using some function, and returns the final result to the master.
30 |
31 | *Sources* and *transformations* in Skale are *lazy*. They do not
32 | start right away, but are triggered by *actions*, thus allowing
33 | efficient pipelined execution and optimized data transfers.
34 |
35 | A first example:
36 |
37 | ```javascript
38 | var sc = require('skale').context(); // create a new context
39 | sc.parallelize([1, 2, 3, 4]). // source
40 | map(function (x) {return x+1}). // transform
41 | reduce(function (a, b) {return a+b}, 0). // action
42 | then(console.log); // process result: 14
43 | ```
44 |
45 | ## Core concepts
46 |
47 | As stated above, a program can be considered as a workflow of steps,
48 | each step consisting of a transformation which takes one or more
49 | datasets as input (the parents) and outputs a new dataset (the child).
50 |
51 | ### Partitioning
52 |
53 | Datasets are divided into several partitions, so each partition can
54 | be assigned to a separate worker, and processing can occur concurrently
55 | in a distributed and parallel system.
56 |
57 | The consequence of this partitioning is that two types of transformations
58 | exist:
59 |
60 | - *Narrow* transformations, where each partition of the parent dataset
61 | is used by at most one partition of the child dataset. This is the
62 | case for example for `map()` or `filter()`, where each dataset entry
63 |   is processed independently of the others.
64 | Partitions are decoupled, no synchronization
65 | between workers is required, and narrow transformations can be
66 | pipelined on each worker.
67 |
68 | - *Wide* transformations, where multiple child partitions may depend
69 | on one parent partition. This is the case for example for `sortBy()`
70 |   or `groupByKey()`. Data needs to be exchanged between workers, or
71 |   *shuffled*, in order to complete the transformation. This introduces
72 | synchronization points which prevent pipelining.
73 |
74 | ### Pipeline stages and shuffles
75 |
76 | Internally, each wide transformation consists of a pre-shuffle and
77 | a post-shuffle part. All sequences of steps from source to pre-shuffle,
78 | or from post-shuffle to next pre-shuffle or action, are thus only
79 | narrow transformations, or pipelined stages (the most efficient
80 | pattern). A skale program is therefore simply a sequence of stages
81 | and shuffles, shuffles being global serialization points.
82 |
83 | It is important to grasp this concept, as it sets the limit on the
84 | level of parallelism that can be achieved by a given piece of code.
85 |
86 | The synoptic table of [transformations](#transformations) indicates
87 | for each transformation if it is narrow or wide (shuffle).
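
For example, in the following sketch the shuffle introduced by `groupByKey()` splits the job into two stages: everything before it is pipelined on each worker as the first stage, and everything after it runs once the shuffle has completed:

```javascript
var sc = require('skale').context();

sc.parallelize([['a', 1], ['b', 2], ['a', 3]]) // source, start of stage 1
  .mapValues(v => v * 10)                      // narrow transform, pipelined in stage 1
  .groupByKey()                                // wide transform: shuffle, ends stage 1
  .collect()                                   // action: triggers lazy compilation and stage 2
  .then(function (result) {
    console.log(result);                       // e.g. [ ['a', [10, 30]], ['b', [20]] ]
    sc.end();
  });
```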
88 |
89 | ## Working with datasets
90 |
91 | ### Sources
92 |
93 | After having initialized a cluster context using [skale.context()],
94 | one can create a dataset using the following sources:
95 |
96 | | Source Name | Description |
97 | | ----------------------------| ------------------------------------------------------ |
98 | |[lineStream(stream)] | Create a dataset from a text stream |
99 | |[objectStream(stream)] | Create a dataset from an object stream |
100 | |[parallelize(array)] | Create a dataset from an array |
101 | |[range(start,end,step)] | Create a dataset containing integers from start to end |
102 | |[source(size,callback,args)] | Create a dataset from a custom source function |
103 | |[textFile(path, options)] | Create a dataset from a text file |
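
For example, any Node.js readable stream can back a dataset. The sketch below assumes a local gzip-compressed text file named `access.log.gz`; the file name is purely illustrative:

```javascript
var fs = require('fs');
var zlib = require('zlib');
var sc = require('skale').context();

// Build a dataset of text lines from a stream, here a gunzipped log file.
var lines = sc.lineStream(fs.createReadStream('access.log.gz').pipe(zlib.createGunzip()));

lines.count().then(function (n) {
  console.log(n, 'lines');
  sc.end();
});
```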
104 |
105 | ### Transformations
106 |
107 | Transformations operate on a dataset and return a new dataset. Note that some
108 | transformations operate only on datasets where each element is a two-element
109 | array of key and value (a `[k,v]` dataset):
110 |
111 | [[Ki,Vi], ..., [Kj, Vj]]
112 |
113 | A special transformation `persist()` enables one to *persist* a dataset
114 | in memory, allowing efficient reuse across parallel operations.
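
For instance, persisting an intermediate dataset lets two different actions reuse it without recomputing the preceding transformations (a small sketch):

```javascript
var sc = require('skale').context();

var squares = sc.range(0, 1000).map(x => x * x).persist(); // cache computed partitions

squares.reduce((a, b) => a + b, 0)     // first action: triggers the computation
  .then(function (sum) {
    console.log('sum:', sum);
    return squares.count();            // second action: reuses the persisted partitions
  })
  .then(function (n) {
    console.log('count:', n);
    sc.end();
  });
```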
115 |
116 | |Transformation Name |Description |In |Out |Shuffle|
117 | |----------------------------------|-----------------------------------------------------------------------|-----------|-------------|-------|
118 | |[aggregateByKey(func, func, init)]|Reduce and combine by key using functions |[k,v] |[k,v] |yes |
119 | |[cartesian(other)] |Perform a cartesian product with the other dataset |v w |[v,w] |yes |
120 | |[coGroup(other)] |Group data from both datasets sharing the same key |[k,v] [k,w]|[k,[[v],[w]]]|yes |
121 | |[distinct()] |Return a dataset where duplicates are removed |v |w |yes |
122 | |[filter(func)] |Return a dataset of elements on which function returns true |v |w |no |
123 | |[flatMap(func)] |Pass the dataset elements to a function which returns a sequence |v |w |no |
124 | |[flatMapValues(func)] |Pass the dataset [k,v] elements to a function without changing the keys|[k,v] |[k,w] |no |
125 | |[groupByKey()] |Group values with the same key |[k,v] |[k,[v]] |yes |
126 | |[intersection(other)] |Return a dataset containing only elements found in both datasets |v w |v |yes |
127 | |[join(other)] |Perform an inner join between 2 datasets |[k,v] |[k,[v,w]] |yes |
128 | |[leftOuterJoin(other)] |Join 2 datasets, keeping all keys of the first even without a match in the other |[k,v] |[k,[v,w]] |yes |
129 | |[rightOuterJoin(other)] |Join 2 datasets, keeping all keys of the other even without a match in the first |[k,v] |[k,[v,w]] |yes |
130 | |[keys()] |Return a dataset of just the keys |[k,v] |k |no |
131 | |[map(func)] |Return a dataset where elements are passed through a function |v |w |no |
132 | |[mapValues(func)] |Map a function to the value field of key-value dataset |[k,v] |[k,w] |no |
133 | |[reduceByKey(func, init)] |Combine values with the same key |[k,v] |[k,w] |yes |
134 | |[partitionBy(partitioner)] |Partition using the partitioner |v |v |yes |
135 | |[persist()] |Idempotent, keep content of dataset in cache for further reuse |v |v |no |
136 | |[sample(rep, frac)] |Sample a dataset, with or without replacement |v |w |no |
137 | |[sortBy(func)] |Sort a dataset |v |v |yes |
138 | |[sortByKey()] |Sort a [k,v] dataset |[k,v] |[k,v] |yes |
139 | |[subtract(other)] |Return the elements of the first dataset not present in the other |v w |v |yes |
140 | |[union(other)] |Return a dataset containing elements from both datasets |v |v w |no |
141 | |[values()] |Return a dataset of just the values |[k,v] |v |no |
142 |
143 | ### Actions
144 |
145 | Actions operate on a dataset and send back results to the *master*. Results
146 | are always produced asynchronously and sent to an optional callback function,
147 | or, alternatively, delivered through a returned [ES6 promise].
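
Both invocation styles on the same action, as a quick illustration:

```javascript
var sc = require('skale').context();
var ds = sc.parallelize([1, 2, 3, 4]);

// Callback style
ds.count(function (err, n) {
  if (err) throw err;
  console.log('count via callback:', n);

  // Promise style
  ds.count().then(function (n2) {
    console.log('count via promise:', n2);
    sc.end();
  });
});
```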
148 |
149 | | Action Name |Description |out |
150 | |----------------------------------|------------------------------------------------------------------|-------------------|
151 | |[aggregate(func, func, init)] |Similar to reduce() but may return a different type |value |
152 | |[collect()] |Return the content of dataset |array of elements |
153 | |[count()] |Return the number of elements from dataset |number |
154 | |[countByKey()] |Return the number of occurrences for each key in a `[k,v]` dataset|array of [k,number]|
155 | |[countByValue()] |Return the number of occurrences of elements from dataset |array of [v,number]|
156 | |[first()] |Return the first element of the dataset |value |
157 | |[forEach(func)] |Apply the provided function to each element of the dataset |empty |
158 | |[lookup(k)] |Return the list of values `v` for key `k` in a `[k,v]` dataset |array of v |
159 | |[reduce(func, init)] |Aggregates dataset elements using a function into one value |value |
160 | |[save(url)] |Save the content of a dataset to an url |empty |
161 | |[stream()] |Stream out a dataset |stream |
162 | |[take(num)] |Return the first `num` elements of dataset |array of value |
163 | |[takeSample(withReplacement, num)]|Return a sample of `num` elements of dataset |array of value |
164 | |[top(num)] |Return the top `num` elements of dataset |array of value |
165 |
166 | [ES6 promise]: https://promisesaplus.com
167 | [skale.context()]: skale-API.md#skalecontextconfig
168 |
169 | [lineStream(stream)]: skale-API#sclinestreaminput_stream
170 | [objectStream(stream)]: skale-API#scobjectstreaminput_stream
171 | [parallelize(array)]: skale-API#scparallelizearray
172 | [range(start,end,step)]: skale-API#scrangestart-end-step
173 | [source(size,callback,args)]: skale-API#scsourcesize-callback-args
174 | [textFile(path, options)]: skale-API#sctextfilepath-options
175 |
176 | [aggregateByKey(func, func, init)]: skale-API#dsaggregatebykeyreducer-combiner-init-obj
177 | [cartesian(other)]: skale-API#dscartesianother
178 | [coGroup(other)]: skale-API#dscogroupother
179 | [distinct()]: skale-API#dsdistinct
180 | [filter(func)]: skale-API#dsfilterfilter-obj
181 | [flatMap(func)]: skale-API#dsflatmapflatmapper-obj
182 | [flatMapValues(func)]: skale-API#dsflatmapvaluesflatmapper-obj
183 | [groupByKey()]: skale-API#dsgroupbykey
184 | [intersection(other)]: skale-API#dsintersectionother
185 | [join(other)]: skale-API#dsjoinother
186 | [leftOuterJoin(other)]: skale-API#dsleftouterjoinother
187 | [rightOuterJoin(other)]: skale-API#dsrightouterjoinother
188 | [keys()]: skale-API#dskeys
189 | [map(func)]: skale-API#dsmapmapper-obj
190 | [mapValues(func)]: skale-API#dsmapvaluesmapper-obj
191 | [reduceByKey(func, init)]: skale-API#dsreducebykeyreducer-init-obj
192 | [partitionBy(partitioner)]: skale-API#dspartitionbypartitioner
193 | [persist()]: skale-API#dspersist
194 | [sample(rep, frac)]: skale-API#dssamplewithreplacement-frac
195 | [sortBy(func)]: skale-API#dssortbykeyfunc-ascending
196 | [sortByKey()]: skale-API#dssortbykeyascending
197 | [subtract(other)]: skale-API#dssubtractother
198 | [union(other)]: skale-API#dsunionother
199 | [values()]: skale-API#dsvalues
200 |
201 | [aggregate(func, func, init)]: skale-API#dsaggregatereducer-combiner-init-obj-done
202 | [collect()]: skale-API#dscollectdone
203 | [count()]: skale-API#dscountdone
204 | [countByKey()]: skale-API#dscountbykeydone
205 | [countByValue()]: skale-API#dscountbyvaluedone
206 | [first()]: skale-API#dsfirstdone
207 | [forEach(func)]: skale-API#dsforeachcallback-obj-done
208 | [lookup(k)]: skale-API#dslookupk-done
209 | [reduce(func, init)]: skale-API#dsreducereducer-init-obj-done
210 | [save(url)]: skale-API#dssaveurl-options-done
211 | [stream()]: skale-API#dsstreamopt
212 | [take(num)]: skale-API#dstakenum-done
213 | [takeSample(withReplacement, num)]: skale-API#dstakesamplewithreplacement-num-done
214 | [top(num)]: skale-API#dstopnum-done
215 |
--------------------------------------------------------------------------------