├── .eslintignore ├── examples ├── basic │ ├── kv.data │ ├── kv2.data │ ├── xxx.gz │ ├── textFile-parquet.js │ ├── textFile-gzip.js │ ├── count.js │ ├── arrow.js │ ├── takeSample.js │ ├── textFile-take.js │ ├── forEach.js │ ├── stream.js │ ├── sample.js │ ├── env.js │ ├── take.js │ ├── top.js │ ├── first.js │ ├── collect.js │ ├── parallelize.js │ ├── aggregate.js │ ├── lookup.js │ ├── range.js │ ├── keys.js │ ├── values.js │ ├── countByKey.js │ ├── distinct.js │ ├── save-parquet.js │ ├── map.js │ ├── wordcount.js │ ├── lineStream.js │ ├── intersection.js │ ├── sortByKey.js │ ├── leftOuterJoin.js │ ├── save.js │ ├── subtract.js │ ├── rightOuterJoin.js │ ├── sortBy.js │ ├── filter.js │ ├── groupByKey.js │ ├── s3file.js │ ├── textFile.js │ ├── partitionBy.js │ ├── union.js │ ├── flatMap.js │ ├── reduceByKey.js │ ├── coGroupStreams.js │ ├── join.js │ ├── persist.js │ ├── coGroup.js │ ├── aggregateByKey.js │ ├── mapValues.js │ ├── r2.js │ ├── flatMapValues.js │ ├── countByValue.js │ └── cartesian.js ├── node_modules │ └── skale │ │ ├── ml.js │ │ └── index.js └── ml │ ├── binary-classification │ ├── pr.png │ ├── roc.png │ ├── dataset │ │ └── README.md │ └── adult.js │ ├── linear-regression │ └── regression.js │ └── clustering │ ├── README.md │ ├── iris.js │ └── iris.csv ├── test ├── node_modules │ └── skale │ │ ├── ml.js │ │ └── index.js ├── data │ ├── iris.csv.gz │ ├── split-gz │ │ ├── iris-00.csv.gz │ │ ├── iris-01.csv.gz │ │ ├── iris-02.csv.gz │ │ └── iris-03.csv.gz │ ├── split │ │ ├── iris-03.csv │ │ ├── iris-00.csv │ │ ├── iris-01.csv │ │ └── iris-02.csv │ └── iris.csv ├── dep.js ├── forEach.js ├── collect.js ├── parallelize.js ├── arrow.js ├── filter.js ├── takeSample.js ├── Makefile ├── distinct.js ├── union.js ├── groupByKey.js ├── intersection.js ├── subtract.js ├── sortByKey.js ├── count.js ├── sample.js ├── sortBy.js ├── reduce.js ├── reduceByKey.js ├── lookup.js ├── keys.js ├── countByKey.js ├── ml │ ├── kmeans.js │ ├── sgd-linear-model.js │ └── standard-scaler.js ├── range.js ├── countByValue.js ├── flatMap.js ├── coGroup.js ├── map.js ├── aggregateByKey.js ├── 0_require.js ├── aggregate.js ├── cartesian.js ├── save-csv.js ├── join.js ├── stream.js ├── textFile.js ├── take.js ├── textFile-azure.js ├── textFile-s3.js ├── save.js ├── save-s3.js └── save-azure.js ├── benchmark ├── node_modules │ └── skale.js ├── logreg-1.png ├── logreg-10.png ├── gen_data.js ├── sparkLR.py ├── skaleLR.js └── README.md ├── docs ├── images │ ├── favicon.png │ ├── logo-skale.png │ └── logo.svg ├── Makefile ├── index.md ├── machine-learning.md ├── skale-hackers-guide.md └── concepts.md ├── .travis.yml ├── .gitignore ├── .npmignore ├── docker ├── Dockerfile ├── docker-compose.yml ├── Makefile ├── run.sh └── README.md ├── Makefile ├── bin ├── shell.js └── worker.js ├── lib ├── stub-parquet.js ├── lines.js ├── rough-sizeof.js ├── task.js ├── readsplit.js └── worker-local.js ├── ml ├── index.js ├── standard-scaler.js ├── kmeans.js ├── classification-metrics.js └── sgd-linear-model.js ├── appveyor.yml ├── index.js ├── mkdocs.yml ├── Roadmap.md ├── package.json ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── README.md └── LICENSE /.eslintignore: -------------------------------------------------------------------------------- 1 | site/ 2 | -------------------------------------------------------------------------------- /examples/basic/kv.data: -------------------------------------------------------------------------------- 1 | 1 1 2 | 1 1 3 | 2 3 4 | 2 4 5 | 3 5 
-------------------------------------------------------------------------------- /examples/basic/kv2.data: -------------------------------------------------------------------------------- 1 | 0 5 2 | 1 6 3 | 2 7 4 | 3 9 5 | 0 9 -------------------------------------------------------------------------------- /examples/node_modules/skale/ml.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../../../ml'); 2 | -------------------------------------------------------------------------------- /test/node_modules/skale/ml.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../../../ml'); 2 | -------------------------------------------------------------------------------- /benchmark/node_modules/skale.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../../index.js'); 2 | -------------------------------------------------------------------------------- /examples/node_modules/skale/index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('../../../index.js'); 2 | -------------------------------------------------------------------------------- /examples/basic/xxx.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/examples/basic/xxx.gz -------------------------------------------------------------------------------- /test/data/iris.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/test/data/iris.csv.gz -------------------------------------------------------------------------------- /test/dep.js: -------------------------------------------------------------------------------- 1 | function add3(a) { 2 | return a + 3; 3 | } 4 | 5 | module.exports = add3; 6 | -------------------------------------------------------------------------------- /benchmark/logreg-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/benchmark/logreg-1.png -------------------------------------------------------------------------------- /benchmark/logreg-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/benchmark/logreg-10.png -------------------------------------------------------------------------------- /docs/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/docs/images/favicon.png -------------------------------------------------------------------------------- /docs/images/logo-skale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/docs/images/logo-skale.png -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: 2 | - linux 3 | - osx 4 | language: node_js 5 | node_js: 6 | - node 7 | - '8' 8 | - '6' 9 | -------------------------------------------------------------------------------- /test/data/split-gz/iris-00.csv.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/skale-me/skale/HEAD/test/data/split-gz/iris-00.csv.gz -------------------------------------------------------------------------------- /test/data/split-gz/iris-01.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/test/data/split-gz/iris-01.csv.gz -------------------------------------------------------------------------------- /test/data/split-gz/iris-02.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/test/data/split-gz/iris-02.csv.gz -------------------------------------------------------------------------------- /test/data/split-gz/iris-03.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/test/data/split-gz/iris-03.csv.gz -------------------------------------------------------------------------------- /examples/ml/binary-classification/pr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/examples/ml/binary-classification/pr.png -------------------------------------------------------------------------------- /examples/ml/binary-classification/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skale-me/skale/HEAD/examples/ml/binary-classification/roc.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .DS_Store 3 | /node_modules/ 4 | /site/ 5 | npm-debug.log 6 | tags 7 | yarn-error.log 8 | bin/tmp/*.js 9 | .idea 10 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .travis.yml 2 | Roadmap.md 3 | appveyor.yml 4 | benchmark/ 5 | docs/ 6 | docker/ 7 | examples/ 8 | site/ 9 | test/ 10 | tape-test/ 11 | -------------------------------------------------------------------------------- /examples/basic/textFile-parquet.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.textFile(process.argv[2]).stream({end: true}).pipe(process.stdout); 6 | -------------------------------------------------------------------------------- /examples/basic/textFile-gzip.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.textFile(__dirname + '/xxx.gz').count().then(function (res) {console.log(res); sc.end();}); 6 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:edge 2 | 3 | RUN apk add --no-cache nodejs nodejs-npm; \ 4 | npm install -g skale; \ 5 | apk del nodejs-npm; \ 6 | adduser -D skale 7 | 8 | ADD run.sh / 9 | 10 | ENTRYPOINT [ "/run.sh" ] 11 | CMD [ "sh" ] 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | browserify --node -i browserify -i node-parquet -i bufferutil
-i utf-8-validate -s skale -o skale-node-bundle.js index.js 3 | browserify -i browserify -i node-parquet -i bufferutil -i utf-8-validate -s skale -o skale-web-bundle.js index.js 4 | -------------------------------------------------------------------------------- /examples/basic/count.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([1, 2, 3, 4]).count() 6 | .then(function(data) { 7 | console.log(data); 8 | console.assert(data === 4); 9 | sc.end(); 10 | }); 11 | -------------------------------------------------------------------------------- /examples/basic/arrow.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.range(6).map((a) => a*a).reduce((a,b) => a+b, 0) 6 | .then(function (res) { 7 | console.log(res); 8 | console.assert(res === 55); 9 | sc.end(); 10 | }); 11 | -------------------------------------------------------------------------------- /examples/basic/takeSample.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | process.env.SKALE_RANDOM_SEED = 'skale'; 4 | 5 | const sc = require('skale').context(); 6 | 7 | sc.range(100) 8 | .takeSample(false, 4, function(err, res) { 9 | console.log(res); 10 | sc.end(); 11 | }); 12 | -------------------------------------------------------------------------------- /examples/basic/textFile-take.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const file = __dirname + '/kv.data'; 6 | 7 | sc.textFile(file) 8 | .take(1) 9 | .then(function (res) { 10 | console.log(res); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /examples/basic/forEach.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | //sc.range(5).forEach((a, b) => console.log('# b', b), () => { 6 | sc.range(5).forEach((b) => console.log('# b', b), () => { 7 | console.log('done'); 8 | sc.end(); 9 | }); 10 | -------------------------------------------------------------------------------- /examples/basic/stream.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | // const s = sc.range(20).stream({gzip: true}); 5 | //const s = sc.range(20).stream(); 6 | const s = sc.range(20).stream({end: true}); 7 | s.pipe(process.stdout); 8 | //s.on('end', sc.end); 9 | -------------------------------------------------------------------------------- /examples/basic/sample.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | process.env.SKALE_RANDOM_SEED = 'skale'; 4 | 5 | const sc = require('skale').context(); 6 | 7 | sc.range(100) 8 | .sample(false, 0.1) 9 | .collect(function(err, res) { 10 | console.log(res); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /test/forEach.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('forEach', function (t) { 5 | t.plan(1); 6 | 7 | 
sc.range(5).forEach((b) => console.log('# b', b), () => { 8 | t.pass('nothing on master'); 9 | sc.end(); 10 | }); 11 | }); 12 | -------------------------------------------------------------------------------- /examples/basic/env.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.env.MY_VAR = 'Hello'; 6 | 7 | sc.range(5). 8 | map(function (i) {return process.env.MY_VAR + i;}). 9 | collect(function (err, res) { 10 | console.log(res); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /examples/basic/take.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([1, 2, 3, 4], 2) 6 | .take(2).then(function(res) { 7 | console.log(res); 8 | console.assert(JSON.stringify(res) === JSON.stringify([1, 2])); 9 | sc.end(); 10 | }); 11 | -------------------------------------------------------------------------------- /examples/basic/top.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([1, 2, 3, 4], 2) 6 | .top(2).then(function(res) { 7 | console.log(res); 8 | console.assert(JSON.stringify(res) === JSON.stringify([3, 4])); 9 | sc.end(); 10 | }); 11 | -------------------------------------------------------------------------------- /test/collect.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('collect', function (t) { 5 | t.plan(1); 6 | 7 | sc.range(6) 8 | .collect(function (err, res) { 9 | t.deepEqual(res, [0, 1, 2, 3, 4, 5]); 10 | sc.end(); 11 | }); 12 | }); 13 | -------------------------------------------------------------------------------- /test/parallelize.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('parallelize', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([0, 1, 2, 3]).collect(function (err, data) { 8 | t.deepEqual(data, [0, 1, 2, 3]); 9 | sc.end(); 10 | }); 11 | }); 12 | -------------------------------------------------------------------------------- /examples/basic/first.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([[1,2],[3,4],[3,6]]). 6 | first(). 
7 | then(function(res) { 8 | console.log(res); 9 | console.assert(JSON.stringify(res) === JSON.stringify([1, 2])); 10 | sc.end(); 11 | }); 12 | -------------------------------------------------------------------------------- /examples/basic/collect.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([[1, 2], [3, 4]], 2) 6 | .collect(function(err, res) { 7 | console.log(res); 8 | console.assert(JSON.stringify(res) === JSON.stringify([[1, 2], [3, 4]])); 9 | sc.end(); 10 | }); 11 | -------------------------------------------------------------------------------- /examples/basic/parallelize.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([1, 2, 3, 4, 5]) 6 | .collect(function(err, res) { 7 | console.log(res); 8 | console.assert(JSON.stringify(res) === JSON.stringify([1, 2, 3, 4, 5])); 9 | sc.end(); 10 | }); 11 | -------------------------------------------------------------------------------- /examples/basic/aggregate.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function sum(a, b) {return a + b;} 6 | 7 | sc.parallelize([1, 2, 3, 4], 2) 8 | .reduce(sum, 0).then(function(res) { 9 | console.log(res); 10 | console.assert(res === 10); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /examples/basic/lookup.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([[1, 2], [3, 4], [3, 6]]) 6 | .lookup(3) 7 | .then(function(res) { 8 | console.log(res); 9 | console.assert(JSON.stringify(res) === JSON.stringify([4, 6])); 10 | sc.end(); 11 | }); 12 | -------------------------------------------------------------------------------- /examples/basic/range.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.range(10).map(a => a * 2).collect().then(console.log); 6 | 7 | sc.range(10, -5, -3).collect().then(console.log); 8 | 9 | sc.range(-4, 3).collect(function(err, res) { 10 | console.log(res); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /bin/shell.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('../').context(); 4 | const ml = require('../ml'); 5 | const addAwait = require('await-outside').addAwaitOutsideToReplServer; 6 | const repl = require('repl').start({ prompt: 'skale> ' }); 7 | 8 | addAwait(repl); 9 | repl.context.sc = sc; 10 | repl.context.ml = ml; 11 | -------------------------------------------------------------------------------- /lib/stub-parquet.js: -------------------------------------------------------------------------------- 1 | // Stub node-parquet binary module. 
2 | 'use strict'; 3 | 4 | try { 5 | module.exports = require('node-parquet'); 6 | } catch (err) { 7 | module.exports = { ParquetReader: stub, ParquetWriter: stub }; 8 | } 9 | 10 | function stub() { 11 | throw new Error('Missing module, run "npm install node-parquet"'); 12 | } 13 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | 5 | skale-server: 6 | image: skale/skale 7 | command: skale-server 8 | ports: 9 | - 12346:12346 10 | - 81:80 11 | 12 | skale-worker: 13 | image: skale/skale 14 | command: skale-worker -H skale-server 15 | ports: 16 | - 82:80 17 | -------------------------------------------------------------------------------- /test/arrow.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('arrow function', function (t) { 5 | t.plan(1); 6 | 7 | sc.range(6) 8 | .map((a) => a*a) 9 | .reduce((a,b) => a+b, 0) 10 | .then(function (res) { 11 | t.equal(res, 55); 12 | sc.end(); 13 | }); 14 | }); 15 | -------------------------------------------------------------------------------- /test/filter.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('filter', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([1, 2, 3, 4]) 8 | .filter(a => a % 2) 9 | .collect(function (err, res) { 10 | t.deepEqual(res, [1, 3]); 11 | sc.end(); 12 | }); 13 | }); 14 | -------------------------------------------------------------------------------- /test/takeSample.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('takeSample', function (t) { 5 | t.plan(1); 6 | 7 | sc.range(100) 8 | .takeSample(false, 4, function(err, res) { 9 | console.log(res); 10 | t.ok(res.length == 4); 11 | sc.end(); 12 | }); 13 | }); 14 | -------------------------------------------------------------------------------- /examples/basic/keys.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([[1, 2], [2, 4], [4, 6]]) 6 | .keys() 7 | .collect(function(err, res) { 8 | console.log(res); 9 | res.sort(); 10 | console.assert(JSON.stringify(res) === JSON.stringify([1, 2, 4])); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /examples/basic/values.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([[1,2],[2,4],[4,6]]) 6 | .values() 7 | .collect(function(err, res) { 8 | console.log(res); 9 | res.sort(); 10 | console.assert(JSON.stringify(res) === JSON.stringify([2,4,6])); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /examples/basic/countByKey.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([[1, 2], [3, 4], [3, 6]]) 6 | .countByKey() 7 | .then(function(res) { 8 | console.log(res); 9 | res.sort(); 10 |
console.assert(JSON.stringify(res) === JSON.stringify([[1, 1], [3, 2]])); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /examples/basic/distinct.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([ 1, 2, 3, 1, 4, 3, 5 ]). 6 | distinct(). 7 | collect(function(err, res) { 8 | console.log(res); 9 | res.sort(); 10 | console.assert(JSON.stringify(res) === JSON.stringify([1, 2, 3, 4, 5])); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | default: standalone distributed 2 | 3 | standalone: 4 | @SKALE_HOST= SKALE_WORKERS=2 tape "./**/*.js" 5 | 6 | distributed: 7 | @../bin/server.js -l 2 >/tmp/skale-server.log 2>&1 & pid=$$!; \ 8 | SKALE_HOST=localhost tape "./**/*.js"; \ 9 | res=$$?; kill $$pid; exit $$res 10 | 11 | %: 12 | SKALE_HOST= SKALE_WORKERS=2 tape "./**/*$@*.js" 13 | -------------------------------------------------------------------------------- /test/distinct.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('distinct', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([1, 2, 3, 1, 4, 3, 5]) 8 | .distinct() 9 | .collect(function (err, res) { 10 | t.deepEqual(res.sort(), [1, 2, 3, 4, 5]); 11 | sc.end(); 12 | }); 13 | }); 14 | 15 | -------------------------------------------------------------------------------- /test/union.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('union', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([1, 2, 3, 4]) 8 | .union(sc.parallelize([5, 6, 7, 8])) 9 | .collect(function (err, res) { 10 | t.deepEqual(res, [1, 2, 3, 4, 5, 6, 7, 8]); 11 | sc.end(); 12 | }); 13 | }); 14 | -------------------------------------------------------------------------------- /examples/basic/save-parquet.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const schema = { 6 | int1: {type: 'int32'}, 7 | int2: {type: 'int32'} 8 | }; 9 | 10 | sc.range(900). 11 | map(a => [a, 2 * a]). 
12 | save('/tmp/truc', {parquet: {schema: schema}}, (err, res) => { 13 | console.log(res); 14 | sc.end(); 15 | }); 16 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | 3 | # When editing docs, run 'make edit' then browse http://localhost:8000 4 | edit: 5 | docker run --rm -it -p 8000:8000 -v $${PWD%/*}:/docs squidfunk/mkdocs-material 6 | 7 | # Maintainer only, to deploy static doc website on github pages 8 | deploy: 9 | docker run --rm -it -v $$HOME/.ssh:/root/.ssh -v $${PWD%/*}:/docs squidfunk/mkdocs-material gh-deploy 10 | -------------------------------------------------------------------------------- /examples/basic/map.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function by2(a, args) {return a * 2 * args.bias;} 6 | function sum(a, b) {return a + b;} 7 | 8 | sc.parallelize([1, 2, 3, 4]) 9 | .map(by2, {bias: 2}) 10 | .reduce(sum, 0, function(err, res) { 11 | console.log(res); 12 | console.assert(res === 40); 13 | sc.end(); 14 | }); 15 | -------------------------------------------------------------------------------- /examples/basic/wordcount.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const file = process.argv[2] || '/etc/hosts'; 6 | 7 | sc.textFile(file) 8 | .flatMap(line => line.split(' ')) 9 | .map(word => [word, 1]) 10 | .reduceByKey((a, b) => a + b, 0) 11 | .count() 12 | .then(function (res) { 13 | console.log(res); 14 | sc.end(); 15 | }); 16 | -------------------------------------------------------------------------------- /test/groupByKey.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('groupByKey', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([['hello', 1], ['hello', 2], ['world', 1]]) 8 | .groupByKey() 9 | .collect(function (err, res) { 10 | t.deepEqual(res.sort(), [['hello', [1, 2]], ['world', [1]]]); 11 | sc.end(); 12 | }); 13 | }); 14 | -------------------------------------------------------------------------------- /examples/basic/lineStream.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | const sc = require('skale').context(); 5 | 6 | const stream = fs.createReadStream(__dirname + '/kv.data'); 7 | 8 | sc.lineStream(stream) 9 | .collect(function(err, res) { 10 | console.log(res); 11 | console.assert(JSON.stringify(res) === JSON.stringify(['1 1', '1 1', '2 3', '2 4', '3 5'])); 12 | sc.end(); 13 | }); 14 | -------------------------------------------------------------------------------- /examples/basic/intersection.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9]). 6 | intersection(sc.parallelize([5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])).
7 | collect(function(err, res) { 8 | console.log(res); 9 | res.sort(); 10 | console.assert(JSON.stringify(res) === JSON.stringify([5, 6, 7, 8, 9])); 11 | sc.end(); 12 | }); 13 | -------------------------------------------------------------------------------- /test/intersection.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('intersection', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9]) 8 | .intersection(sc.parallelize([5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])) 9 | .collect(function(err, res) { 10 | t.deepEqual(res.sort(), [5, 6, 7, 8, 9]); 11 | sc.end(); 12 | }); 13 | }); 14 | -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | VERSION = 1.2.0-0 2 | IMAGE = skale/skale 3 | 4 | all: 5 | 6 | image: 7 | docker build -t $(IMAGE):$(VERSION) . 8 | docker tag $(IMAGE):$(VERSION) $(IMAGE):latest 9 | 10 | rmi: 11 | docker rmi $(IMAGE):$(VERSION) $(IMAGE):latest 12 | 13 | test: 14 | docker run --rm -ti $(IMAGE) sh 15 | 16 | # Publish only if you are image owner 17 | publish: 18 | docker push $(IMAGE):$(VERSION) 19 | docker push $(IMAGE):latest 20 | -------------------------------------------------------------------------------- /examples/basic/sortByKey.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [['world', 2], ['cedric', 3], ['hello', 1]]; 6 | const nPartitions = 2; 7 | 8 | sc.parallelize(data, nPartitions) 9 | .sortByKey() 10 | .collect(function(err, res) { 11 | console.log(res); 12 | console.assert(JSON.stringify(res) === JSON.stringify([['cedric', 3], ['hello', 1], ['world', 2]])); 13 | sc.end(); 14 | }); 15 | -------------------------------------------------------------------------------- /ml/index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | 'use strict'; 4 | 5 | const StandardScaler = require('./standard-scaler'); 6 | const classificationMetrics = require('./classification-metrics'); 7 | const SGDLinearModel = require('./sgd-linear-model'); 8 | const KMeans = require('./kmeans'); 9 | 10 | module.exports = { 11 | StandardScaler, 12 | classificationMetrics, 13 | SGDLinearModel, 14 | KMeans 15 | }; 16 | -------------------------------------------------------------------------------- /test/subtract.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const d1 = [[1, 1], [1, 1], [2, 3], [2, 4], [3, 5]]; 5 | const d2 = [[1, 1], [1, 1], [2, 3]]; 6 | 7 | t.test('subtract', function (t) { 8 | t.plan(1); 9 | 10 | sc.parallelize(d1) 11 | .subtract(sc.parallelize(d2)) 12 | .collect(function(err, res) { 13 | t.deepEqual(res.sort(), [[2, 4], [3, 5]]); 14 | sc.end(); 15 | }); 16 | }); 17 | -------------------------------------------------------------------------------- /examples/basic/leftOuterJoin.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const da1 = sc.parallelize([[10, 1], [20, 2]]); 6 | const da2 = sc.parallelize([[10, 'world'], [30, 3]]); 
7 | 8 | da1.leftOuterJoin(da2) 9 | .collect(function(err, res) { 10 | console.log(res); 11 | res.sort(); 12 | console.assert(JSON.stringify(res) === JSON.stringify([[10, [1, 'world']], [20, [2, null]]])); 13 | sc.end(); 14 | }); 15 | -------------------------------------------------------------------------------- /examples/basic/save.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | //sc.range(900).save('/tmp/truc', {gzip: true}, (err, res) => { 6 | //sc.range(900).save('/tmp/truc', {stream: true}, (err, res) => { 7 | //sc.range(900).save('s3://skale-demo/test/s1', {gzip: false, stream: true}, (err, res) => { 8 | sc.range(900).save('/tmp/truc', {gzip: true, stream: true}, (err, res) => { 9 | console.log(res); 10 | sc.end(); 11 | }); 12 | -------------------------------------------------------------------------------- /examples/basic/subtract.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const d1 = [[1, 1], [1, 1], [2, 3], [2, 4], [3, 5]]; 6 | const d2 = [[1, 1], [1, 1], [2, 3]]; 7 | 8 | sc.parallelize(d1) 9 | .subtract(sc.parallelize(d2)) 10 | .collect(function(err, res) { 11 | console.log(res); 12 | res.sort(); 13 | console.assert(JSON.stringify(res) === JSON.stringify([[2, 4], [3, 5]])); 14 | sc.end(); 15 | }); 16 | -------------------------------------------------------------------------------- /examples/basic/rightOuterJoin.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const da1 = sc.parallelize([[10, 1], [20, 2]]); 6 | const da2 = sc.parallelize([[10, 'world'], [30, 3]]); 7 | 8 | da1.rightOuterJoin(da2) 9 | .collect(function(err, res) { 10 | console.log(res); 11 | res.sort(); 12 | console.assert(JSON.stringify(res) === JSON.stringify([[10, [1, 'world']], [30, [null, 3]]])); 13 | sc.end(); 14 | }); 15 | -------------------------------------------------------------------------------- /test/sortByKey.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const data = [['world', 2], ['cedric', 3], ['hello', 1]]; 5 | const nPartitions = 2; 6 | 7 | t.test('sortByKey', function (t) { 8 | t.plan(1); 9 | 10 | sc.parallelize(data, nPartitions) 11 | .sortByKey() 12 | .collect(function(err, res) { 13 | t.deepEqual(res, [['cedric', 3], ['hello', 1], ['world', 2]]); 14 | sc.end(); 15 | }); 16 | }); 17 | -------------------------------------------------------------------------------- /test/count.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('count callback', function (t) { 5 | t.plan(1); 6 | 7 | sc.range(6) 8 | .count(function (err, res) { 9 | t.equal(res, 6); 10 | }); 11 | }); 12 | 13 | t.test('count promise', function (t) { 14 | t.plan(1); 15 | 16 | sc.range(6) 17 | .count() 18 | .then(function (res) { 19 | t.equal(res, 6); 20 | sc.end(); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /examples/basic/sortBy.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 
| 5 | const data = [4, 6, 10, 5, 1, 2, 9, 7, 3, 0]; 6 | const nPartitions = 3; 7 | 8 | function keyFunc(data) {return data;} 9 | 10 | sc.parallelize(data, nPartitions) 11 | .sortBy(keyFunc) 12 | .collect(function(err, res) { 13 | console.log(res); 14 | console.assert(JSON.stringify(res) === JSON.stringify([0, 1, 2, 3, 4, 5, 6, 7, 9, 10])); 15 | sc.end(); 16 | }); 17 | -------------------------------------------------------------------------------- /test/sample.js: -------------------------------------------------------------------------------- 1 | process.env.SKALE_RANDOM_SEED = 'skale'; 2 | 3 | const t = require('tape'); 4 | const sc = require('skale').context(); 5 | 6 | t.test('sample', function (t) { 7 | t.plan(1); 8 | 9 | sc.env.SKALE_RANDOM_SEED = process.env.SKALE_RANDOM_SEED; 10 | 11 | sc.range(100) 12 | .sample(false, 0.1) 13 | .collect(function(err, res) { 14 | console.log(res); 15 | t.ok(res.length > 0 && res.length < 20); 16 | sc.end(); 17 | }); 18 | }); 19 | -------------------------------------------------------------------------------- /test/sortBy.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const data = [4, 6, 10, 5, 1, 2, 9, 7, 3, 0]; 5 | const nPartitions = 3; 6 | 7 | function keyFunc(data) {return data;} 8 | 9 | t.test('sortBy', function (t) { 10 | t.plan(1); 11 | 12 | sc.parallelize(data, nPartitions) 13 | .sortBy(keyFunc) 14 | .collect(function(err, res) { 15 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 9, 10]); 16 | sc.end(); 17 | }); 18 | }); 19 | -------------------------------------------------------------------------------- /examples/basic/filter.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function combiner(a, b) {return a.concat(b);} 7 | 8 | function filter(a) {return a % 2;} 9 | 10 | sc.parallelize([1, 2, 3, 4]) 11 | .filter(filter) 12 | .aggregate(reducer, combiner, [], function(err, res) { 13 | console.log(res); 14 | console.assert(JSON.stringify(res) === JSON.stringify([1, 3])); 15 | sc.end(); 16 | }); 17 | -------------------------------------------------------------------------------- /examples/basic/groupByKey.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [['hello', 1], ['hello', 1], ['world', 1]]; 6 | const nPartitions = 1; 7 | 8 | const a = sc.parallelize(data, nPartitions).groupByKey().persist(); 9 | 10 | a.collect(function(err, res) { 11 | console.log(res); 12 | console.log('First ok!'); 13 | a.collect(function(err, res) { 14 | console.log(res); 15 | console.log('Second ok !'); 16 | sc.end(); 17 | }); 18 | }); 19 | -------------------------------------------------------------------------------- /examples/basic/s3file.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | const input = sc.textFile('s3://skale-demo/datasets/*-ny.json.gz'); 5 | //const input = sc.textFile('s3://skale-demo/datasets/restaurants-ny.json.gz'); 6 | //const input = sc.textFile('s3://skale-demo/datasets/restaurants-ny.json'); 7 | //const s = input.stream(); 8 | //s.pipe(process.stdout); 9 | //s.on('end', sc.end); 10 | 11 | input.count(function (err, res) { 12 | 
console.log(res); 13 | sc.end(); 14 | }); 15 | -------------------------------------------------------------------------------- /examples/basic/textFile.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function combiner(a, b) {return a.concat(b);} 7 | 8 | const file = __dirname + '/kv.data'; 9 | 10 | sc.textFile(file) 11 | .aggregate(reducer, combiner, [], function(err, res) { 12 | console.log(res); 13 | res.sort(); 14 | console.assert(JSON.stringify(res) === JSON.stringify(['1 1', '1 1', '2 3', '2 4', '3 5'])); 15 | sc.end(); 16 | }); 17 | -------------------------------------------------------------------------------- /examples/basic/partitionBy.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const skale = require('skale'); 4 | const sc = skale.context(); 5 | 6 | const data = [['hello', 1], ['world', 1], ['hello', 2], ['world', 2], ['cedric', 3]]; 7 | 8 | sc.parallelize(data) 9 | .partitionBy(new skale.HashPartitioner(3)) 10 | .collect(function(err, res) { 11 | console.log(res); 12 | console.assert(JSON.stringify(res) === JSON.stringify([['world', 1], ['world', 2],['hello', 1],['hello', 2],['cedric', 3]])); 13 | sc.end(); 14 | }); 15 | -------------------------------------------------------------------------------- /examples/basic/union.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function combiner(a, b) {return a.concat(b);} 7 | 8 | const a = sc.parallelize([1, 2, 3, 4]); 9 | const b = sc.parallelize([5, 6, 7, 8]); 10 | 11 | a.union(b).aggregate(reducer, combiner, [], function(err, res) { 12 | console.log(res); 13 | res.sort(); 14 | console.assert(JSON.stringify(res) === JSON.stringify([1, 2, 3, 4, 5, 6, 7, 8])); 15 | sc.end(); 16 | }); 17 | -------------------------------------------------------------------------------- /examples/basic/flatMap.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function combiner(a, b) {return a.concat(b);} 7 | 8 | function dup(a) {return [a, a];} 9 | 10 | sc.parallelize([1, 2, 3, 4]) 11 | .flatMap(dup) 12 | .aggregate(reducer, combiner, [], function(err, res) { 13 | console.log(res); 14 | res.sort(); 15 | console.assert(JSON.stringify(res) === JSON.stringify([1, 1, 2, 2, 3, 3, 4, 4])); 16 | sc.end(); 17 | }); 18 | -------------------------------------------------------------------------------- /examples/basic/reduceByKey.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [['hello', 1], ['hello', 1], ['world', 1]]; 6 | const nPartitions = 2; 7 | 8 | const init = 0; 9 | 10 | function reducer(a, b) {return a + b;} 11 | 12 | sc.parallelize(data, nPartitions) 13 | .reduceByKey(reducer, init) 14 | .collect(function(err, res) { 15 | console.log(res); 16 | console.assert(JSON.stringify(res) === JSON.stringify([['hello', 2], ['world', 1]])); 17 | sc.end(); 18 | }); 19 | -------------------------------------------------------------------------------- /examples/basic/coGroupStreams.js: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | const sc = require('skale').context(); 5 | 6 | const s1 = sc.lineStream(fs.createReadStream(__dirname + '/kv.data')).map(line => line.split(' ')); 7 | const s2 = sc.lineStream(fs.createReadStream(__dirname + '/kv2.data')).map(line =>line.split(' ')); 8 | 9 | s1.coGroup(s2).collect(function(err, res) { 10 | console.log(res[0]); 11 | console.log(res[1]); 12 | console.log(res[2]); 13 | console.log(res[3]); 14 | sc.end(); 15 | }); 16 | -------------------------------------------------------------------------------- /examples/basic/join.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [['hello', 1], ['world', 2], ['world', 3]]; 6 | const data2 = [['cedric', 3], ['world', 4]]; 7 | const nPartitions = 4; 8 | 9 | const a = sc.parallelize(data, nPartitions); 10 | const b = sc.parallelize(data2, nPartitions); 11 | 12 | a.join(b).collect(function(err, res) { 13 | console.log(res); 14 | console.assert(JSON.stringify(res) === JSON.stringify([['world', [2, 4]],['world',[3, 4]]])); 15 | sc.end(); 16 | }); 17 | -------------------------------------------------------------------------------- /test/reduce.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('reduce callback', function (t) { 5 | t.plan(1); 6 | sc.parallelize([1, 2, 3, 4], 2) 7 | .reduce((a, b) => a + b, 0, function(err, res) { 8 | t.equal(res, 10); 9 | }); 10 | }); 11 | 12 | t.test('reduce promise', function (t) { 13 | t.plan(1); 14 | sc.parallelize([1, 2, 3, 4], 2) 15 | .reduce((a, b) => a + b, 0) 16 | .then(function(res) { 17 | t.equal(res, 10); 18 | sc.end(); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /test/reduceByKey.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const data = [['hello', 1], ['hello', 1], ['world', 1]]; 5 | const nPartitions = 2; 6 | 7 | const init = 0; 8 | 9 | function reducer(a, b) {return a + b;} 10 | 11 | t.test('reduceByKey', function (t) { 12 | t.plan(1); 13 | 14 | sc.parallelize(data, nPartitions) 15 | .reduceByKey(reducer, init) 16 | .collect(function(err, res) { 17 | t.deepEqual(res.sort(), [['hello', 2], ['world', 1]]); 18 | sc.end(); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /test/lookup.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('lookup callback', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([[1, 2], [3, 4], [3, 6]]) 8 | .lookup(3, function(err, res) { 9 | t.deepEqual(res, [4, 6]); 10 | }); 11 | }); 12 | 13 | t.test('lookup promise', function (t) { 14 | t.plan(1); 15 | 16 | sc.parallelize([[1, 2], [3, 4], [3, 6]]) 17 | .lookup(3) 18 | .then(function(res) { 19 | t.deepEqual(res, [4, 6]); 20 | sc.end(); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /examples/basic/persist.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc 
= require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function combiner(a, b) {return a.concat(b);} 7 | 8 | const a = sc.parallelize([1, 2, 3, 4], 2).persist(); 9 | 10 | a.aggregate(reducer, combiner, [], function(err, res) { 11 | console.log('First Time !'); 12 | console.log(res); 13 | 14 | a.aggregate(reducer, combiner, [], function(err, res) { 15 | console.log('\nSecond Time !'); 16 | console.log(res); 17 | sc.end(); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | nodejs_version: "8" 3 | SKALE_WORKERS: "2" 4 | SKALE_MEMORY: "512" 5 | AWS_ACCESS_KEY_ID: 6 | secure: d3GA6U28GOVQvQy9pjKpnqkiQ8GfJgyqggLrfRnfRsQ= 7 | AWS_SECRET_ACCESS_KEY: 8 | secure: MntXe1/M33xEgIZimiBsEi89oyRqTIIK3s65U4qLxfqhKmZGWZYKOt8ctWndSbhN 9 | 10 | init: 11 | - git config --global core.autocrlf true 12 | 13 | install: 14 | - ps: Install-Product node $env:nodejs_version 15 | - npm install 16 | 17 | test_script: 18 | - node --version 19 | - npm --version 20 | - npm run tape 21 | 22 | build: off 23 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Docker entrypoint (pid 1), run as root 3 | 4 | [ "$1" = "sh" ] && exec "$@" 5 | 6 | webserver() { 7 | mkdir -p /www/tmp 8 | ln -sf /tmp/skale /www/tmp/skale 9 | httpd -h /www 10 | } 11 | 12 | trap 'echo terminated; kill $pid' SIGTERM 13 | 14 | case $1 in 15 | (skale-server|skale-worker) 16 | webserver 17 | log=/var/log/$1.log 18 | [ -f $log ] && mv $log $log.old 19 | cmd="cd; ulimit -c unlimited; env; echo $@; exec $@" 20 | su -s /bin/sh -c "$cmd" skale 2>&1 | tee /var/log/$1.log & pid=$! 21 | wait $pid 22 | ;; 23 | esac 24 | -------------------------------------------------------------------------------- /examples/ml/binary-classification/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Adult dataset 2 | 3 | Predict whether income exceeds $50K/yr based on census data. Also 4 | known as "Census Income" dataset. 5 | 6 | Imported from [Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/adult). 
7 | 8 | The dataset is split in multiple CSV files (less than 512 kB) with headers, so: 9 | 10 | - file schema and features may be automatically extracted 11 | - programs operate on multiple partitions, as on large datasets 12 | - individual files can be loaded and formatted within the browser 13 | -------------------------------------------------------------------------------- /test/keys.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('keys', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([[1, 2], [2, 4], [4, 6]]) 8 | .keys() 9 | .collect(function(err, res) { 10 | t.deepEqual(res.sort(), [1, 2, 4]); 11 | }); 12 | }); 13 | 14 | t.test('values', function (t) { 15 | t.plan(1); 16 | 17 | sc.parallelize([[1, 2], [2, 4], [4, 6]]) 18 | .values() 19 | .collect(function(err, res) { 20 | t.deepEqual(res.sort(), [2, 4, 6]); 21 | sc.end(); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /examples/basic/coGroup.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [['hello', 1], ['world', 2], ['cedric', 3], ['cedric', 4]]; 6 | const data2 = [['cedric', 3], ['world', 4], ['test', 5]]; 7 | const nPartitions = 2; 8 | 9 | const a = sc.parallelize(data, nPartitions); 10 | const b = sc.parallelize(data2, nPartitions); 11 | 12 | a.coGroup(b).collect(function(err, res) { 13 | console.log(res); 14 | console.log(res[0]); 15 | console.log(res[1]); 16 | console.log(res[2]); 17 | console.log(res[3]); 18 | sc.end(); 19 | }); 20 | -------------------------------------------------------------------------------- /examples/basic/aggregateByKey.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [['hello', 1], ['hello', 1], ['world', 1]]; 6 | const nPartitions = 2; 7 | 8 | const init = 0; 9 | 10 | function reducer(a, b) {return a + b;} 11 | function combiner(a, b) {return a + b;} 12 | 13 | sc.parallelize(data, nPartitions) 14 | .aggregateByKey(reducer, combiner, init) 15 | .collect(function(err, res) { 16 | console.log(res); 17 | console.assert(JSON.stringify(res) === JSON.stringify([['hello', 2], ['world', 1]])); 18 | sc.end(); 19 | }); 20 | -------------------------------------------------------------------------------- /test/countByKey.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('countByKey callback', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([[1, 2], [3, 4], [3, 6]]) 8 | .countByKey(function (err, res) { 9 | t.deepEqual(res.sort(), [[1, 1], [3, 2]]); 10 | }); 11 | }); 12 | 13 | t.test('countByKey promise', function (t) { 14 | t.plan(1); 15 | 16 | sc.parallelize([[1, 2], [3, 4], [3, 6]]) 17 | .countByKey() 18 | .then(function(res) { 19 | t.deepEqual(res.sort(), [[1, 1], [3, 2]]); 20 | sc.end(); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /examples/basic/mapValues.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function 
combiner(a, b) {return a.concat(b);} 7 | 8 | function by2(a) {return a * 2;} 9 | 10 | sc.parallelize([['hello', 1], ['world', 2], ['cedric', 3], ['test', 4]]) 11 | .mapValues(by2) 12 | .aggregate(reducer, combiner, [], function(err, res) { 13 | console.log(res); 14 | res.sort(); 15 | console.assert(JSON.stringify(res) === JSON.stringify([['cedric', 6], ['hello', 2], ['test', 8], ['world', 4]])); 16 | sc.end(); 17 | }); 18 | -------------------------------------------------------------------------------- /examples/basic/r2.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [ 6 | ['hello', [12, 'param1', 'param2']], 7 | ['hello', [10, 'param3', 'param4']], 8 | ['world', [5, 'param5', 'param6']] 9 | ]; 10 | const nPartitions = 1; 11 | 12 | const init = [0, []]; 13 | 14 | function reducer(a, b) { 15 | a[0] += b[0]; 16 | a[1].push([b[1], b[2]]); 17 | return a; 18 | } 19 | 20 | sc.parallelize(data, nPartitions) 21 | .reduceByKey(reducer, init) 22 | .collect(function(err, res) { 23 | console.log(res[0][0], res[0][1]); 24 | sc.end(); 25 | }); 26 | -------------------------------------------------------------------------------- /test/ml/kmeans.js: -------------------------------------------------------------------------------- 1 | process.env.SKALE_RANDOM_SEED = 1; 2 | 3 | const t = require('tape'); 4 | const sc = require('skale').context(); 5 | const ml = require('skale/ml'); 6 | 7 | t.test('kmeans', function (t) { 8 | t.plan(2); 9 | 10 | const dataset = sc.parallelize([ 11 | [1, 2], [1, 4], [1, 0], 12 | [4, 2], [4, 4], [4, 0] 13 | ]); 14 | const kmeans = ml.KMeans(2); 15 | kmeans.fit(dataset, function (err) { 16 | t.ok(!err, 'kmeans.fit() returns no error'); 17 | t.ok(kmeans.predict([0, 0]) !== kmeans.predict([4, 4]), 'predictions are correct'); 18 | sc.end(); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /test/range.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('range 1 arg', function (t) { 5 | t.plan(1); 6 | sc.range(4).collect(function (err, data) { 7 | t.deepEqual(data, [0, 1, 2, 3]); 8 | }); 9 | }); 10 | 11 | t.test('range 2 args', function (t) { 12 | t.plan(1); 13 | sc.range(2, 4).collect(function (err, data) { 14 | t.deepEqual(data, [2, 3]); 15 | }); 16 | }); 17 | 18 | t.test('range 3 args', function (t) { 19 | t.plan(1); 20 | sc.range(10, -5, -3).collect(function (err, data) { 21 | t.deepEqual(data, [10, 7, 4, 1, -2]); 22 | sc.end(); 23 | }); 24 | }); 25 | -------------------------------------------------------------------------------- /test/countByValue.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('countByValue callback', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([[1, 2], [3, 4], [1, 2], [3, 4]]) 8 | .countByValue(function (err, res) { 9 | t.deepEqual(res.sort(), [[[1, 2], 2], [[3, 4], 2]]); 10 | }); 11 | }); 12 | 13 | t.test('countByValue promise', function (t) { 14 | t.plan(1); 15 | 16 | sc.parallelize([[1, 2], [3, 4], [1, 2], [3, 4]]) 17 | .countByValue() 18 | .then(function(res) { 19 | t.deepEqual(res.sort(), [[[1, 2], 2], [[3, 4], 2]]); 20 | sc.end(); 21 | }); 22 | }); 23 | 
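A note on the calling convention exercised by the tests above (count.js, reduce.js, lookup.js, countByKey.js, countByValue.js): each skale action accepts an optional node-style callback as its last argument and, when the callback is omitted, returns a promise instead. The following is a minimal illustrative sketch of the two styles, not a file from this repository; it assumes a local context with default workers:

const sc = require('skale').context();

// Callback style: the action takes a node-style (err, result) callback
// as its last argument.
sc.parallelize([1, 2, 3, 4]).collect(function (err, res) {
  console.log(res); // [1, 2, 3, 4]

  // Promise style: omit the callback and consume the returned promise.
  sc.range(4).collect()
    .then(res2 => console.log(res2)) // [0, 1, 2, 3]
    .then(() => sc.end());           // release the context when done
});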
-------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | 'use strict'; 4 | 5 | const ContextRemote = require('./lib/context.js'); 6 | const ContextLocal = require('./lib/context-local.js'); 7 | const Dataset = require('./lib/dataset.js'); 8 | 9 | function Context(args) { 10 | args = args || {}; 11 | if (args.host || process.env.SKALE_HOST) return ContextRemote(args); 12 | return ContextLocal(args); 13 | } 14 | 15 | module.exports = { 16 | Context: Context, 17 | context: Context, 18 | HashPartitioner: Dataset.HashPartitioner, 19 | RangePartitioner: Dataset.RangePartitioner, 20 | Source: Dataset.Source 21 | }; 22 | -------------------------------------------------------------------------------- /test/ml/sgd-linear-model.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | const ml = require('skale/ml'); 4 | 5 | t.test('SGDLinearModel', function (t) { 6 | t.plan(3); 7 | 8 | const trainingSet = sc.parallelize([ 9 | [1, [0.5, -0.7]], 10 | [-1, [-0.5, 0.7]] 11 | ]); 12 | const sgd = new ml.SGDLinearModel(); 13 | 14 | sgd.fit(trainingSet, 2, function (err) { 15 | t.ok(!err, 'sgd.fit() returns no error'); 16 | t.deepEqual(sgd.weights, [0.8531998372026804, -1.1944797720837526], 'sgd weights are correct'); 17 | t.ok(sgd.predict([2, -2]) > 0, 'sgd prediction is correct'); 18 | sc.end(); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /examples/ml/linear-regression/regression.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | 'use strict'; 4 | 5 | (async function main() { 6 | 7 | const sc = require('skale').context(); 8 | const ml = require('skale/ml'); 9 | 10 | const labelFeatures = sc.textFile(__dirname + '/sample_linear_regression_data.txt') 11 | .map(a => { 12 | const b = a.split(' '); 13 | return [Number(b.shift()), b.map(a => Number(a.split(':').pop()))]; 14 | }); 15 | //console.log(await labelFeatures.take(1)); 16 | 17 | const model = ml.SGDLinearModel({}); 18 | await model.fit(labelFeatures, 10); 19 | console.log('model:', model); 20 | 21 | sc.end(); 22 | 23 | })(); // main 24 | -------------------------------------------------------------------------------- /examples/basic/flatMapValues.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | function reducer(a, b) {a.push(b); return a;} 6 | function combiner(a, b) {return a.concat(b);} 7 | 8 | function dup(a) {return [a, a];} 9 | 10 | sc.parallelize([['hello', 1], ['world', 2], ['cedric', 3], ['test', 4]]) 11 | .flatMapValues(dup) 12 | .aggregate(reducer, combiner, [], function(err, res) { 13 | console.log(res); 14 | res.sort(); 15 | console.assert(JSON.stringify(res) === JSON.stringify([ [ 'cedric', 3 ],[ 'cedric', 3 ],[ 'hello', 1 ],[ 'hello', 1 ],[ 'test', 4 ],[ 'test', 4 ],[ 'world', 2 ],[ 'world', 2 ] ])); 16 | sc.end(); 17 | }); 18 | -------------------------------------------------------------------------------- /examples/basic/countByValue.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data 
= [[1, 1], [1, 1], [2, 3], [2, 4], [3, 5]]; 6 | 7 | const nPartitions = 1; 8 | 9 | function valueFlatMapper(e) { 10 | const out = []; 11 | for (let i = e; i <= 5; i++) out.push(i); 12 | return out; 13 | } 14 | 15 | sc.parallelize(data, nPartitions) 16 | .flatMapValues(valueFlatMapper) 17 | .countByValue() 18 | .then(function(res) { 19 | console.log(res); 20 | console.assert(JSON.stringify(res) === JSON.stringify([[[1, 1], 2], [[1, 2], 2], [[1, 3], 2], [[1, 4], 2], [[1, 5], 2], [[2, 3], 1], [[2, 4], 2], [[2, 5], 2], [[3, 5], 1]])); 21 | sc.end(); 22 | }); 23 | -------------------------------------------------------------------------------- /test/flatMap.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('flatMap', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([1, 2, 3, 4]) 8 | .flatMap(a => [a, a]) 9 | .collect(function (err, res) { 10 | t.deepEqual(res, [1, 1, 2, 2, 3, 3, 4, 4]); 11 | }); 12 | }); 13 | 14 | t.test('flatMapValues', function (t) { 15 | t.plan(1); 16 | 17 | sc.parallelize([['hello', 1], ['world', 2]]) 18 | .flatMapValues(a => [a, 2 * a]) 19 | .collect(function (err, res) { 20 | t.deepEqual(res, [ 21 | ['hello', 1], ['hello', 2], 22 | ['world', 2], ['world', 4] 23 | ]); 24 | sc.end(); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/coGroup.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('coGroup', function (t) { 5 | t.plan(1); 6 | 7 | const data = [['hello', 1], ['world', 2], ['cedric', 3], ['cedric', 4]]; 8 | const data2 = [['cedric', 3], ['world', 4], ['test', 5]]; 9 | const nPartitions = 2; 10 | 11 | const a = sc.parallelize(data, nPartitions); 12 | const b = sc.parallelize(data2, nPartitions); 13 | 14 | a.coGroup(b).collect(function (err, res) { 15 | t.deepEqual(res.sort(), [ 16 | ['cedric', [[3, 4], [3]]], 17 | ['hello', [[1], []]], 18 | ['test', [[], [5]]], 19 | ['world', [[2], [4]]], 20 | ]); 21 | sc.end(); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /test/map.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | function by2(a, args) {return a * 2 * args.bias;} 5 | function sum(a, b) {return a + b;} 6 | 7 | t.test('map', function (t) { 8 | t.plan(1); 9 | 10 | sc.parallelize([1, 2, 3, 4]) 11 | .map(by2, {bias: 2}) 12 | .reduce(sum, 0, function(err, res) { 13 | t.equal(res, 40); 14 | }); 15 | }); 16 | 17 | t.test('mapValues', function (t) { 18 | t.plan(1); 19 | 20 | sc.parallelize([['hello', 1], ['world', 2], ['test', 4]]) 21 | .mapValues(a => a * 2) 22 | .collect(function (err, res) { 23 | t.deepEqual(res.sort(), [['hello', 2], ['test', 8], ['world', 4]]); 24 | sc.end(); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/aggregateByKey.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const data = [['hello', 1], ['hello', 1], ['world', 1]]; 5 | const nPartitions = 2; 6 | 7 | const init = 0; 8 | 9 | function reducer(a, b) {return a + b;} 10 | function combiner(a, b) {return a + b;} 11 | 12 | t.test('aggregateByKey', 
function (t) { 13 | t.plan(1); 14 | 15 | sc.parallelize(data, nPartitions) 16 | .aggregateByKey(reducer, combiner, init) 17 | .collect(function(err, res) { 18 | t.deepEqual(res, [['hello', 2], ['world', 1]]); 19 | sc.end(); 20 | }); 21 | }); 22 | 23 | // TODO: test passing args in combiner / reducer 24 | 25 | // TODO: test using worker context in combiner / reducer 26 | -------------------------------------------------------------------------------- /test/0_require.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('env', function (t) { 5 | t.plan(1); 6 | 7 | sc.env.MY_VAR = 'hello'; 8 | sc.range(5) 9 | .map(a => process.env.MY_VAR + a) 10 | .collect(function (err, res) { 11 | t.equal(res[0], 'hello0', 'env is propagated to workers'); 12 | }); 13 | }); 14 | 15 | t.test('require', function (t) { 16 | t.plan(1); 17 | 18 | sc.require({add3: './dep.js'}) 19 | .range(4) 20 | .map(a => add3(a)) // eslint-disable-line no-undef 21 | .collect(function (err, res) { 22 | t.deepEquals(res, [3, 4, 5, 6], 'dependency is injected in workers'); 23 | sc.end(); 24 | }); 25 | }); 26 | 27 | -------------------------------------------------------------------------------- /examples/ml/clustering/README.md: -------------------------------------------------------------------------------- 1 | # Clustering example 2 | 3 | This example showcases unsupervised clustering with [K-Means]. 4 | 5 | We use the [iris flower dataset], which consists of 50 samples from 6 | each of three species of Iris (Iris setosa, Iris virginica and Iris 7 | versicolor). Four features were measured from each sample: the 8 | length and the width of the sepals and petals, in centimetres. Data 9 | are in [iris.csv](iris.csv). 10 | 11 | We train an unsupervised clustering model on the numerical features, 12 | and check the predicted data (cluster number) against the dataset labels 13 | to evaluate performance.
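In code, that flow is essentially what [iris.js](iris.js) in this directory does. Here is a condensed sketch (assuming the local skale context and the bundled `skale/ml` module, as wired up by this repo's examples/node_modules shim):

    const sc = require('skale').context();
    const ml = require('skale/ml');

    (async function () {
      const data = sc.textFile(__dirname + '/iris.csv')
        .filter(line => line[0] !== 'S')               // drop the CSV header line
        .map(line => line.split(','))
        .map(cols => [cols.pop(), cols.map(Number)])   // [species, numeric features]
        .persist();

      const model = new ml.KMeans(3);                  // one cluster per species
      await model.fit(data.map(a => a[1]));            // train on the features only

      // Pair each true species label with its predicted cluster index
      const predicted = data.map((a, model) => [a[0], model.predict(a[1])], model);
      console.log(await predicted.collect());
      sc.end();
    })();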
14 | 15 | [K-Means]: https://en.wikipedia.org/wiki/K-means_clustering 16 | [iris flower dataset]: https://en.wikipedia.org/wiki/Iris_flower_data_set 17 | -------------------------------------------------------------------------------- /lib/lines.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | 'use strict'; 4 | 5 | const stream = require('stream'); 6 | const util = require('util'); 7 | 8 | const Lines = module.exports = function Lines(opt) { 9 | if (!(this instanceof Lines)) return new Lines(opt); 10 | stream.Transform.call(this, {objectMode: true}); 11 | this._buf = ''; 12 | }; 13 | util.inherits(Lines, stream.Transform); 14 | 15 | Lines.prototype._transform = function (chunk, encoding, done) { 16 | const data = this._buf + chunk.toString(); 17 | const lines = data.split('\n'); 18 | this._buf = lines.pop(); 19 | done(null, lines); 20 | }; 21 | 22 | Lines.prototype._flush = function (done) { 23 | if (this._buf) this.push([this._buf]); 24 | done(); 25 | }; 26 | -------------------------------------------------------------------------------- /test/aggregate.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('aggregate callback', function (t) { 5 | t.plan(1); 6 | 7 | sc.parallelize([3, 5, 2, 7, 4, 8]) 8 | .aggregate( 9 | (a, v) => [a[0] + v, a[1] + 1], 10 | (a1, a2) => [a1[0] + a2[0], a1[1] + a2[1]], 11 | [0, 0], 12 | function (err, res) { 13 | t.equal(res[0] / res[1], 29 / 6); 14 | } 15 | ); 16 | }); 17 | 18 | t.test('aggregate promise', function (t) { 19 | t.plan(1); 20 | 21 | sc.parallelize([3, 5, 2, 7, 4, 8]) 22 | .aggregate( 23 | (a, v) => [a[0] + v, a[1] + 1], 24 | (a1, a2) => [a1[0] + a2[0], a1[1] + a2[1]], 25 | [0, 0] 26 | ) 27 | .then(function(res) { 28 | t.equal(res[0] / res[1], 29 / 6); 29 | sc.end(); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /examples/ml/clustering/iris.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | 'use strict'; 4 | 5 | (async function main() { 6 | 7 | const sc = require('skale').context(); 8 | const ml = require('skale/ml'); 9 | 10 | const rawdata = sc.textFile(__dirname + '/iris.csv'); 11 | //console.log(await rawdata.collect()); 12 | 13 | const data = rawdata 14 | .filter(a => a[0] !== 'S') 15 | .map(a => a.split(',')) 16 | .map(a => [a.pop(), a.map(Number)]) // [species, array of numeric features] 17 | .persist(); 18 | console.log(await data.map(a => a[1]).collect()); 19 | 20 | const model = new ml.KMeans(3); 21 | await model.fit(data.map(a => a[1])); 22 | console.log(model); 23 | 24 | const predicted = data.map((a, model) => [a[0], model.predict(a[1])], model); 25 | console.log(await predicted.collect()); 26 | sc.end(); 27 | 28 | })(); // main 29 | -------------------------------------------------------------------------------- /test/ml/standard-scaler.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | const ml = require('skale/ml'); 4 | 5 | t.test('standard scaler', function (t) { 6 | t.plan(4); 7 | 8 | const data = sc.parallelize([[0, 0], [0, 0], [1, 1], [1, 1]]); 9 | const scaler = new ml.StandardScaler(); 10 | 11 | scaler.fit(data, function (err) { 12 | t.ok(!err, 'scaler.fit() 
returns no error'); 13 | t.deepEqual(scaler.mean, [0.5, 0.5], 'scaler mean vector is correct'); 14 | t.deepEqual(scaler.std, [0.5, 0.5], 'scaler standard deviation vector is correct'); 15 | 16 | const scaled = data.map((p, scaler) => scaler.transform(p), scaler); 17 | scaled.collect(function (err, res) { 18 | t.deepEqual(res, [[-1, -1], [-1, -1], [1, 1], [1, 1]], 'scaled data is correct'); 19 | sc.end(); 20 | }); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /examples/basic/cartesian.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const sc = require('skale').context(); 4 | 5 | const data = [1, 2, 3, 4, 5, 6]; 6 | const data2 = [7, 8, 9, 10, 11, 12]; 7 | const nPartitions = 3; 8 | 9 | const a = sc.parallelize(data, nPartitions); 10 | const b = sc.parallelize(data2, nPartitions); 11 | 12 | a.cartesian(b) 13 | .collect(function(err, res) { 14 | res.sort(); 15 | console.log(res); 16 | console.assert(JSON.stringify(res) === JSON.stringify([ 17 | [1, 10], [1, 11], [1, 12], [1, 7], [1, 8], [1, 9], 18 | [2, 10], [2, 11], [2, 12], [2, 7], [2, 8], [2, 9], 19 | [3, 10], [3, 11], [3, 12], [3, 7], [3, 8], [3, 9], 20 | [4, 10], [4, 11], [4, 12], [4, 7], [4, 8], [4, 9], 21 | [5, 10], [5, 11], [5, 12], [5, 7], [5, 8], [5, 9], 22 | [6, 10], [6, 11], [6, 12], [6, 7], [6, 8], [6, 9] 23 | ])); 24 | sc.end(); 25 | }); 26 | -------------------------------------------------------------------------------- /lib/rough-sizeof.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | 'use strict'; 4 | 5 | module.exports = function sizeof(obj) { 6 | let size = 0; 7 | 8 | function sizeOf(obj) { 9 | if (obj === undefined || obj === null) return size; 10 | switch (typeof obj) { 11 | case 'number': 12 | size += 8; 13 | break; 14 | case 'string': 15 | size += obj.length * 2; 16 | break; 17 | case 'boolean': 18 | size += 4; 19 | break; 20 | case 'object': 21 | if (obj instanceof Array) { 22 | size += 8 * obj.length; 23 | for (let i = 0; i < obj.length; i++) sizeOf(obj[i]); 24 | } else { 25 | for (let i in obj) { 26 | size += i.length * 2; 27 | sizeOf(obj[i]); 28 | } 29 | } 30 | break; 31 | } 32 | return size; 33 | } 34 | return sizeOf(obj); 35 | }; 36 | -------------------------------------------------------------------------------- /test/cartesian.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('cartesian', function (t) { 5 | const data = [1, 2, 3, 4, 5, 6]; 6 | const data2 = [7, 8, 9, 10, 11, 12]; 7 | const nPartitions = 3; 8 | 9 | const a = sc.parallelize(data, nPartitions); 10 | const b = sc.parallelize(data2, nPartitions); 11 | 12 | t.plan(1); 13 | 14 | a.cartesian(b) 15 | .collect(function(err, res) { 16 | res.sort(); 17 | t.deepEqual(res, [ 18 | [1, 10], [1, 11], [1, 12], [1, 7], [1, 8], [1, 9], 19 | [2, 10], [2, 11], [2, 12], [2, 7], [2, 8], [2, 9], 20 | [3, 10], [3, 11], [3, 12], [3, 7], [3, 8], [3, 9], 21 | [4, 10], [4, 11], [4, 12], [4, 7], [4, 8], [4, 9], 22 | [5, 10], [5, 11], [5, 12], [5, 7], [5, 8], [5, 9], 23 | [6, 10], [6, 11], [6, 12], [6, 7], [6, 8], [6, 9] 24 | ]); 25 | sc.end(); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /test/node_modules/skale/index.js: 
-------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const ContextRemote = require('../../../lib/context.js'); 4 | const ContextLocal = require('../../../lib/context-local.js'); 5 | const Dataset = require('../../../lib/dataset.js'); 6 | 7 | // In test mode, cache already opened context in global var 8 | // so only one context is active for the whole test session 9 | // no matter the number of test files 10 | 11 | function Context(args) { 12 | if (global._sc) { 13 | global._scn++; 14 | } else { 15 | global._scn = 0; 16 | args = args || {}; 17 | if (args.host || process.env.SKALE_HOST) 18 | global._sc = ContextRemote(args); 19 | else 20 | global._sc = ContextLocal(args); 21 | } 22 | return global._sc; 23 | } 24 | 25 | module.exports = { 26 | Context: Context, 27 | context: Context, 28 | HashPartitioner: Dataset.HashPartitioner, 29 | RangePartitioner: Dataset.RangePartitioner, 30 | Source: Dataset.Source 31 | }; 32 | -------------------------------------------------------------------------------- /test/save-csv.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const t = require('tape'); 3 | const rimraf = require('rimraf'); 4 | const sc = require('skale').context(); 5 | 6 | const savedir = '/tmp/skale-test/save'; 7 | 8 | t.test('save csv', function (t) { 9 | t.plan(4); 10 | 11 | rimraf(savedir, function (err) { 12 | t.ok(!err, 'delete previous saved data'); 13 | sc.range(10) 14 | .map(a => [a, a, a]) 15 | .save(savedir, {stream: true, csv: true}, function (err) { 16 | t.ok(!err, 'save returns no error'); 17 | t.ok(fs.existsSync(savedir + '/0.csv'), 'saved filename is correct'); 18 | sc.textFile(savedir + '/') 19 | .collect(function (err, res) { 20 | t.deepEqual(res, [ 21 | '0;0;0', '1;1;1', '2;2;2', '3;3;3', '4;4;4', 22 | '5;5;5', '6;6;6', '7;7;7', '8;8;8', '9;9;9' 23 | ], 'saved content is correct'); 24 | sc.end(); 25 | }); 26 | }); 27 | }); 28 | }); 29 | -------------------------------------------------------------------------------- /test/join.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const data = [['hello', 1], ['world', 2], ['world', 3]]; 5 | const data2 = [['cedric', 3], ['world', 4]]; 6 | 7 | t.test('join', function (t) { 8 | t.plan(1); 9 | 10 | sc.parallelize(data) 11 | .join(sc.parallelize(data2)) 12 | .collect(function(err, res) { 13 | t.deepEqual(res, [['world', [2, 4]], ['world', [3, 4]]]); 14 | }); 15 | }); 16 | 17 | t.test('leftOuterJoin', function (t) { 18 | t.plan(1); 19 | 20 | sc.parallelize(data) 21 | .leftOuterJoin(sc.parallelize(data2)) 22 | .collect(function(err, res) { 23 | t.deepEqual(res.sort(), [['hello', [1, null]], ['world', [2, 4]], ['world', [3, 4]]]); 24 | }); 25 | }); 26 | 27 | t.test('rightOuterJoin', function (t) { 28 | t.plan(1); 29 | 30 | sc.parallelize(data) 31 | .rightOuterJoin(sc.parallelize(data2)) 32 | .collect(function(err, res) { 33 | t.deepEqual(res.sort(), [['cedric', [null, 3]], ['world', [2, 4]], ['world', [3, 4]]]); 34 | sc.end(); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /test/stream.js: -------------------------------------------------------------------------------- 1 | const stream = require('stream'); 2 | const zlib = require('zlib'); 3 | const t = require('tape'); 4 | const sc = require('skale').context(); 5 | 6 | t.test('stream', 
function (t) { 7 | t.plan(2); 8 | 9 | let res = ''; 10 | const s = sc.range(10).stream(); 11 | 12 | t.ok(s instanceof stream.Readable, 'ds.stream() returns a readable stream'); 13 | 14 | s.on('data', function (data) {res += data.toString();}); 15 | s.on('end', function () { 16 | t.equal(res, '0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n', 'data read is correct'); 17 | }); 18 | }); 19 | 20 | t.test('stream gzip', function (t) { 21 | t.plan(2); 22 | 23 | let res = ''; 24 | const s = sc.range(10).stream({gzip: true}); 25 | 26 | t.ok(s instanceof stream.Readable, 'ds.stream() returns a readable stream'); 27 | 28 | const rs = s.pipe(zlib.createGunzip()); 29 | 30 | rs.on('data', function (data) {res += data.toString();}); 31 | rs.on('end', function () { 32 | t.equal(res, '0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n', 'gunzip data read is correct'); 33 | sc.end(); 34 | }); 35 | }); 36 | -------------------------------------------------------------------------------- /test/data/split/iris-03.csv: -------------------------------------------------------------------------------- 1 | 6.0,2.2,5.0,1.5,Iris virginica 2 | 6.9,3.2,5.7,2.3,Iris virginica 3 | 5.6,2.8,4.9,2.0,Iris virginica 4 | 7.7,2.8,6.7,2.0,Iris virginica 5 | 6.3,2.7,4.9,1.8,Iris virginica 6 | 6.7,3.3,5.7,2.1,Iris virginica 7 | 7.2,3.2,6.0,1.8,Iris virginica 8 | 6.2,2.8,4.8,1.8,Iris virginica 9 | 6.1,3.0,4.9,1.8,Iris virginica 10 | 6.4,2.8,5.6,2.1,Iris virginica 11 | 7.2,3.0,5.8,1.6,Iris virginica 12 | 7.4,2.8,6.1,1.9,Iris virginica 13 | 7.9,3.8,6.4,2.0,Iris virginica 14 | 6.4,2.8,5.6,2.2,Iris virginica 15 | 6.3,2.8,5.1,1.5,Iris virginica 16 | 6.1,2.6,5.6,1.4,Iris virginica 17 | 7.7,3.0,6.1,2.3,Iris virginica 18 | 6.3,3.4,5.6,2.4,Iris virginica 19 | 6.4,3.1,5.5,1.8,Iris virginica 20 | 6.0,3.0,4.8,1.8,Iris virginica 21 | 6.9,3.1,5.4,2.1,Iris virginica 22 | 6.7,3.1,5.6,2.4,Iris virginica 23 | 6.9,3.1,5.1,2.3,Iris virginica 24 | 5.8,2.7,5.1,1.9,Iris virginica 25 | 6.8,3.2,5.9,2.3,Iris virginica 26 | 6.7,3.3,5.7,2.5,Iris virginica 27 | 6.7,3.0,5.2,2.3,Iris virginica 28 | 6.3,2.5,5.0,1.9,Iris virginica 29 | 6.5,3.0,5.2,2.0,Iris virginica 30 | 6.2,3.4,5.4,2.3,Iris virginica 31 | 5.9,3.0,5.1,1.8,Iris virginica 32 | -------------------------------------------------------------------------------- /test/textFile.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('textFile local file', function (t) { 5 | t.plan(1); 6 | sc.textFile(__dirname + '/data/iris.csv') 7 | .count(function (err, res) { 8 | t.ok(res === 151); 9 | }); 10 | }); 11 | 12 | t.test('textFile compressed file', function (t) { 13 | t.plan(1); 14 | sc.textFile(__dirname + '/data/iris.csv.gz') 15 | .count(function (err, res) { 16 | t.ok(res === 151); 17 | }); 18 | }); 19 | 20 | t.test('textFile dir', function (t) { 21 | t.plan(1); 22 | sc.textFile(__dirname + '/data/split/') 23 | .count(function (err, res) { 24 | t.ok(res === 151); 25 | }); 26 | }); 27 | 28 | t.test('textFile compressed dir', function (t) { 29 | t.plan(1); 30 | sc.textFile(__dirname + '/data/split-gz/') 31 | .count(function (err, res) { 32 | t.ok(res === 151); 33 | }); 34 | }); 35 | 36 | t.test('textFile multiple files', function (t) { 37 | t.plan(1); 38 | sc.textFile(__dirname + '/data/split/iris-*.csv') 39 | .count(function (err, res) { 40 | t.ok(res === 151); 41 | sc.end(); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /mkdocs.yml: 
-------------------------------------------------------------------------------- 1 | # Project information 2 | site_name: Skale 3 | site_description: Skale documentation 4 | site_author: 'Skale authors' 5 | site_url: 'https://skale-me.github.io/skale/' 6 | dev_addr: 0.0.0.0:8000 7 | 8 | google_analytics: 9 | - 'UA-75822605-1' 10 | - 'auto' 11 | 12 | # Repository 13 | repo_name: 'skale-me/skale' 14 | repo_url: 'https://github.com/skale-me/skale/' 15 | 16 | # Copyright 17 | copyright: 'Copyright © 2016 Luca-SAS' 18 | 19 | # Configuration 20 | theme: 21 | name: 'material' 22 | language: 'en' 23 | logo: 'images/logo.svg' 24 | favicon: 'images/favicon.png' 25 | palette: 26 | primary: 'indigo' 27 | accent: 'indigo' 28 | font: 29 | text: 'Roboto' 30 | code: 'Roboto mono' 31 | 32 | # Customization 33 | extra: 34 | social: 35 | - type: 'github' 36 | link: 'https://github.com/skale-me' 37 | 38 | # Extensions 39 | markdown_extensions: 40 | - admonition 41 | - codehilite: 42 | guess_lang: false 43 | - toc: 44 | permalink: true 45 | 46 | # Content 47 | docs_dir: docs 48 | 49 | pages: 50 | - About Skale: 'index.md' 51 | - Programming guide: 52 | - 'concepts.md' 53 | - 'skale-API.md' 54 | - 'machine-learning.md' 55 | - 'skale-hackers-guide.md' 56 | -------------------------------------------------------------------------------- /test/take.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | t.test('first callback', function (t) { 5 | t.plan(1); 6 | 7 | sc.range(100) 8 | .first(function (err, res) { 9 | t.equal(res, 0); 10 | }); 11 | }); 12 | 13 | t.test('first promise', function (t) { 14 | t.plan(1); 15 | 16 | sc.range(100) 17 | .first() 18 | .then(function (res) { 19 | t.equal(res, 0); 20 | }); 21 | }); 22 | 23 | t.test('take callback', function (t) { 24 | t.plan(1); 25 | 26 | sc.range(100) 27 | .take(3, function (err, res) { 28 | t.deepEqual(res, [0, 1, 2]); 29 | }); 30 | }); 31 | 32 | t.test('take promise', function (t) { 33 | t.plan(1); 34 | 35 | sc.range(100) 36 | .take(3) 37 | .then(function (res) { 38 | t.deepEqual(res, [0, 1, 2]); 39 | }); 40 | }); 41 | 42 | t.test('top callback', function (t) { 43 | t.plan(1); 44 | 45 | sc.range(100) 46 | .top(3, function (err, res) { 47 | t.deepEqual(res, [97, 98, 99]); 48 | }); 49 | }); 50 | 51 | t.test('top promise', function (t) { 52 | t.plan(1); 53 | 54 | sc.range(100) 55 | .top(3) 56 | .then(function (res) { 57 | t.deepEqual(res, [97, 98, 99]); 58 | sc.end(); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /docs/images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /benchmark/gen_data.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | 5 | if (process.argv.length !== 4) { 6 | console.log('Usage: gen_data.js file size_in_MB'); 7 | process.exit(1); 8 | } 9 | 10 | const file = process.argv[2]; 11 | const D = 16; 12 | const maxSize = process.argv[3] * 1024 * 1024; 13 | 14 | const rng = new Random(); 15 | const fd = fs.createWriteStream(file); 16 | let fileSize = 0; 17 | 18 | function writeChunk() { 19 | let line = ''; 20 | for (let i = 0; i < 500; i++) { 21 | line += 2 * Math.round(Math.abs(rng.randn(1))) - 1; 22 | line += ' ' + rng.randn(D).join(' ') + 
'\n'; 23 | } 24 | const lineSize = Buffer.byteLength(line, 'utf8'); 25 | if ((fileSize + lineSize) > maxSize) fd.end(); 26 | else fd.write(line, function() {fileSize += lineSize; writeChunk();}); 27 | } 28 | 29 | writeChunk(); 30 | 31 | function Random(initSeed) { 32 | this.seed = initSeed || 1; 33 | 34 | this.next = function () { 35 | const x = Math.sin(this.seed++) * 10000; 36 | return (x - Math.floor(x)) * 2 - 1; 37 | }; 38 | 39 | this.reset = function () { 40 | this.seed = initSeed; 41 | }; 42 | 43 | this.randn = function (N) { 44 | const w = new Array(N); 45 | for (let i = 0; i < N; i++) 46 | w[i] = this.next(); 47 | return w; 48 | }; 49 | } 50 | -------------------------------------------------------------------------------- /test/data/split/iris-00.csv: -------------------------------------------------------------------------------- 1 | Sepal length,Sepal Width,Petal length,Petal width,Species 2 | 5.1,3.5,1.4,0.2,Iris setosa 3 | 4.9,3.0,1.4,0.2,Iris setosa 4 | 4.7,3.2,1.3,0.2,Iris setosa 5 | 4.6,3.1,1.5,0.2,Iris setosa 6 | 5.0,3.6,1.4,0.2,Iris setosa 7 | 5.4,3.9,1.7,0.4,Iris setosa 8 | 4.6,3.4,1.4,0.3,Iris setosa 9 | 5.0,3.4,1.5,0.2,Iris setosa 10 | 4.4,2.9,1.4,0.2,Iris setosa 11 | 4.9,3.1,1.5,0.1,Iris setosa 12 | 5.4,3.7,1.5,0.2,Iris setosa 13 | 4.8,3.4,1.6,0.2,Iris setosa 14 | 4.8,3.0,1.4,0.1,Iris setosa 15 | 4.3,3.0,1.1,0.1,Iris setosa 16 | 5.8,4.0,1.2,0.2,Iris setosa 17 | 5.7,4.4,1.5,0.4,Iris setosa 18 | 5.4,3.9,1.3,0.4,Iris setosa 19 | 5.1,3.5,1.4,0.3,Iris setosa 20 | 5.7,3.8,1.7,0.3,Iris setosa 21 | 5.1,3.8,1.5,0.3,Iris setosa 22 | 5.4,3.4,1.7,0.2,Iris setosa 23 | 5.1,3.7,1.5,0.4,Iris setosa 24 | 4.6,3.6,1.0,0.2,Iris setosa 25 | 5.1,3.3,1.7,0.5,Iris setosa 26 | 4.8,3.4,1.9,0.2,Iris setosa 27 | 5.0,3.0,1.6,0.2,Iris setosa 28 | 5.0,3.4,1.6,0.4,Iris setosa 29 | 5.2,3.5,1.5,0.2,Iris setosa 30 | 5.2,3.4,1.4,0.2,Iris setosa 31 | 4.7,3.2,1.6,0.2,Iris setosa 32 | 4.8,3.1,1.6,0.2,Iris setosa 33 | 5.4,3.4,1.5,0.4,Iris setosa 34 | 5.2,4.1,1.5,0.1,Iris setosa 35 | 5.5,4.2,1.4,0.2,Iris setosa 36 | 4.9,3.1,1.5,0.1,Iris setosa 37 | 5.0,3.2,1.2,0.2,Iris setosa 38 | 5.5,3.5,1.3,0.2,Iris setosa 39 | 4.9,3.1,1.5,0.1,Iris setosa 40 | 4.4,3.0,1.3,0.2,Iris setosa 41 | -------------------------------------------------------------------------------- /test/textFile-azure.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const skip = process.env.AZURE_STORAGE_CONNECTION_STRING ? 
false : true; 5 | 6 | t.test('textFile azure file', {skip: skip}, function (t) { 7 | t.plan(1); 8 | sc.textFile('wasb://skalejs/test/iris.csv') 9 | .count(function (err, res) { 10 | t.ok(res === 151); 11 | }); 12 | }); 13 | 14 | t.test('textFile azure compressed file', {skip: skip}, function (t) { 15 | t.plan(1); 16 | sc.textFile('wasb://skalejs/test/iris.csv.gz') 17 | .count(function (err, res) { 18 | t.ok(res === 151); 19 | }); 20 | }); 21 | 22 | t.test('textFile azure dir', {skip: skip}, function (t) { 23 | t.plan(1); 24 | sc.textFile('wasb://skalejs/split/') 25 | .count(function (err, res) { 26 | t.ok(res === 151); 27 | }); 28 | }); 29 | 30 | t.test('textFile azure compressed dir', {skip: skip}, function (t) { 31 | t.plan(1); 32 | sc.textFile('wasb://skalejs/splitgz/') 33 | .count(function (err, res) { 34 | t.ok(res === 151); 35 | }); 36 | }); 37 | 38 | t.test('textFile azure multiple files', {skip: skip}, function (t) { 39 | t.plan(1); 40 | sc.textFile('wasb://skalejs/split/iris-*.csv') 41 | .count(function (err, res) { 42 | t.ok(res === 151); 43 | sc.end(); 44 | }); 45 | }); 46 | 47 | if (skip) sc.end(); 48 | -------------------------------------------------------------------------------- /test/textFile-s3.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const sc = require('skale').context(); 3 | 4 | const skip = process.env.AWS_ACCESS_KEY_ID ? false : true; 5 | 6 | t.test('textFile s3 file', {skip: skip}, function (t) { 7 | t.plan(1); 8 | sc.textFile('s3://skale-test-eu-west-1/test/iris.csv') 9 | .count(function (err, res) { 10 | t.ok(res === 151); 11 | }); 12 | }); 13 | 14 | t.test('textFile s3 compressed file', {skip: skip}, function (t) { 15 | t.plan(1); 16 | sc.textFile('s3://skale-test-eu-west-1/test/iris.csv.gz') 17 | .count(function (err, res) { 18 | t.ok(res === 151); 19 | }); 20 | }); 21 | 22 | t.test('textFile s3 dir', {skip: skip}, function (t) { 23 | t.plan(1); 24 | sc.textFile('s3://skale-test-eu-west-1/test/split/') 25 | .count(function (err, res) { 26 | t.ok(res === 151); 27 | }); 28 | }); 29 | 30 | t.test('textFile s3 compressed dir', {skip: skip}, function (t) { 31 | t.plan(1); 32 | sc.textFile('s3://skale-test-eu-west-1/test/split-gz/') 33 | .count(function (err, res) { 34 | t.ok(res === 151); 35 | }); 36 | }); 37 | 38 | t.test('textFile s3 multiple files', {skip: skip}, function (t) { 39 | t.plan(1); 40 | sc.textFile('s3://skale-test-eu-west-1/test/split/iris-*.csv') 41 | .count(function (err, res) { 42 | t.ok(res === 151); 43 | sc.end(); 44 | }); 45 | }); 46 | 47 | if (skip) sc.end(); 48 | -------------------------------------------------------------------------------- /test/data/split/iris-01.csv: -------------------------------------------------------------------------------- 1 | 5.1,3.4,1.5,0.2,Iris setosa 2 | 5.0,3.5,1.3,0.3,Iris setosa 3 | 4.5,2.3,1.3,0.3,Iris setosa 4 | 4.4,3.2,1.3,0.2,Iris setosa 5 | 5.0,3.5,1.6,0.6,Iris setosa 6 | 5.1,3.8,1.9,0.4,Iris setosa 7 | 4.8,3.0,1.4,0.3,Iris setosa 8 | 5.1,3.8,1.6,0.2,Iris setosa 9 | 4.6,3.2,1.4,0.2,Iris setosa 10 | 5.3,3.7,1.5,0.2,Iris setosa 11 | 5.0,3.3,1.4,0.2,Iris setosa 12 | 7.0,3.2,4.7,1.4,Iris versicolor 13 | 6.4,3.2,4.5,1.5,Iris versicolor 14 | 6.9,3.1,4.9,1.5,Iris versicolor 15 | 5.5,2.3,4.0,1.3,Iris versicolor 16 | 6.5,2.8,4.6,1.5,Iris versicolor 17 | 5.7,2.8,4.5,1.3,Iris versicolor 18 | 6.3,3.3,4.7,1.6,Iris versicolor 19 | 4.9,2.4,3.3,1.0,Iris versicolor 20 | 6.6,2.9,4.6,1.3,Iris versicolor 21 | 5.2,2.7,3.9,1.4,Iris versicolor 22 | 
5.0,2.0,3.5,1.0,Iris versicolor 23 | 5.9,3.0,4.2,1.5,Iris versicolor 24 | 6.0,2.2,4.0,1.0,Iris versicolor 25 | 6.1,2.9,4.7,1.4,Iris versicolor 26 | 5.6,2.9,3.6,1.3,Iris versicolor 27 | 6.7,3.1,4.4,1.4,Iris versicolor 28 | 5.6,3.0,4.5,1.5,Iris versicolor 29 | 5.8,2.7,4.1,1.0,Iris versicolor 30 | 6.2,2.2,4.5,1.5,Iris versicolor 31 | 5.6,2.5,3.9,1.1,Iris versicolor 32 | 5.9,3.2,4.8,1.8,Iris versicolor 33 | 6.1,2.8,4.0,1.3,Iris versicolor 34 | 6.3,2.5,4.9,1.5,Iris versicolor 35 | 6.1,2.8,4.7,1.2,Iris versicolor 36 | 6.4,2.9,4.3,1.3,Iris versicolor 37 | 6.6,3.0,4.4,1.4,Iris versicolor 38 | 6.8,2.8,4.8,1.4,Iris versicolor 39 | 6.7,3.0,5.0,1.7,Iris versicolor 40 | 6.0,2.9,4.5,1.5,Iris versicolor 41 | -------------------------------------------------------------------------------- /test/data/split/iris-02.csv: -------------------------------------------------------------------------------- 1 | 5.7,2.6,3.5,1.0,Iris versicolor 2 | 5.5,2.4,3.8,1.1,Iris versicolor 3 | 5.5,2.4,3.7,1.0,Iris versicolor 4 | 5.8,2.7,3.9,1.2,Iris versicolor 5 | 6.0,2.7,5.1,1.6,Iris versicolor 6 | 5.4,3.0,4.5,1.5,Iris versicolor 7 | 6.0,3.4,4.5,1.6,Iris versicolor 8 | 6.7,3.1,4.7,1.5,Iris versicolor 9 | 6.3,2.3,4.4,1.3,Iris versicolor 10 | 5.6,3.0,4.1,1.3,Iris versicolor 11 | 5.5,2.5,4.0,1.3,Iris versicolor 12 | 5.5,2.6,4.4,1.2,Iris versicolor 13 | 6.1,3.0,4.6,1.4,Iris versicolor 14 | 5.8,2.6,4.0,1.2,Iris versicolor 15 | 5.0,2.3,3.3,1.0,Iris versicolor 16 | 5.6,2.7,4.2,1.3,Iris versicolor 17 | 5.7,3.0,4.2,1.2,Iris versicolor 18 | 5.7,2.9,4.2,1.3,Iris versicolor 19 | 6.2,2.9,4.3,1.3,Iris versicolor 20 | 5.1,2.5,3.0,1.1,Iris versicolor 21 | 5.7,2.8,4.1,1.3,Iris versicolor 22 | 6.3,3.3,6.0,2.5,Iris virginica 23 | 5.8,2.7,5.1,1.9,Iris virginica 24 | 7.1,3.0,5.9,2.1,Iris virginica 25 | 6.3,2.9,5.6,1.8,Iris virginica 26 | 6.5,3.0,5.8,2.2,Iris virginica 27 | 7.6,3.0,6.6,2.1,Iris virginica 28 | 4.9,2.5,4.5,1.7,Iris virginica 29 | 7.3,2.9,6.3,1.8,Iris virginica 30 | 6.7,2.5,5.8,1.8,Iris virginica 31 | 7.2,3.6,6.1,2.5,Iris virginica 32 | 6.5,3.2,5.1,2.0,Iris virginica 33 | 6.4,2.7,5.3,1.9,Iris virginica 34 | 6.8,3.0,5.5,2.1,Iris virginica 35 | 5.7,2.5,5.0,2.0,Iris virginica 36 | 5.8,2.8,5.1,2.4,Iris virginica 37 | 6.4,3.2,5.3,2.3,Iris virginica 38 | 6.5,3.0,5.5,1.8,Iris virginica 39 | 7.7,3.8,6.7,2.2,Iris virginica 40 | 7.7,2.6,6.9,2.3,Iris virginica 41 | -------------------------------------------------------------------------------- /test/save.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const t = require('tape'); 3 | const rimraf = require('rimraf'); 4 | const sc = require('skale').context(); 5 | 6 | const savedir = '/tmp/skale-test/save'; 7 | 8 | t.test('save', function (t) { 9 | t.plan(4); 10 | 11 | rimraf(savedir, function (err) { 12 | t.ok(!err, 'delete previous saved data'); 13 | sc.range(10) 14 | .save(savedir, function (err) { 15 | t.ok(!err, 'save returns no error'); 16 | t.ok(fs.existsSync(savedir + '/0'), 'saved filename is correct'); 17 | sc.textFile(savedir + '/') 18 | .map(a => JSON.parse(a)) 19 | .collect(function (err, res) { 20 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'saved content is correct'); 21 | }); 22 | }); 23 | }); 24 | }); 25 | 26 | t.test('save gzip', function (t) { 27 | t.plan(4); 28 | 29 | rimraf(savedir, function (err) { 30 | t.ok(!err, 'delete previous saved data'); 31 | sc.range(10) 32 | .save(savedir, {gzip: true}, function (err) { 33 | t.ok(!err, 'save returns no error'); 34 | t.ok(fs.existsSync(savedir + '/0.gz'), 
'saved filename is correct'); 35 | sc.textFile(savedir + '/') 36 | .map(a => JSON.parse(a)) 37 | .collect(function (err, res) { 38 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'saved content is correct'); 39 | sc.end(); 40 | }); 41 | }); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /Roadmap.md: -------------------------------------------------------------------------------- 1 | # Skale-engine Roadmap 2 | 3 | *Last updated December 29 2016* 4 | 5 | This document describes the high-level features the Skale-engine 6 | maintainers have decided to prioritize in the near to medium term. 7 | 8 | ## Schema description for datasets 9 | 10 | This would be useful for several purposes: 11 | 12 | - support of columnar formats, such as Parquet 13 | - integration with a query language compatible with B.I. tools 14 | - optimization of dataset serialization and transfers 15 | 16 | ## Add support for Parquet 17 | 18 | *Status: in progress* 19 | 20 | Parquet is a columnar storage format from the Apache Software 21 | Foundation available to any project in the Hadoop ecosystem. 22 | 23 | A separate NodeJS module supporting the Apache Parquet format, both for 24 | reading and writing, is first required. 25 | 26 | Such an experimental module has been started by skale 27 | maintainers [here](https://github.com/mvertes/node-parquet). 28 | 29 | ## Add support for Avro 30 | 31 | Avro is a data serialization system from the Apache Software 32 | Foundation, which provides rich data structures and a compact, fast 33 | binary data format. 34 | 35 | It is well suited to structured data where a schema is required to 36 | encode and decode values. 37 | 38 | A pure Javascript implementation exists 39 | [here](https://github.com/mtth/avsc). 40 | 41 | ## Add realtime streaming capabilities 42 | 43 | The current processing model is *action* driven, suitable for batch 44 | or micro-batch processing. See if it is possible to apply the same 45 | API, or at least a subset, to *source* driven processing better 46 | suited for realtime data processing, while retaining skale 47 | scalability and efficiency. 48 | -------------------------------------------------------------------------------- /benchmark/sparkLR.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple Logistic regression algorithm. 
3 | """ 4 | 5 | from math import exp 6 | from math import sqrt 7 | import sys 8 | import math 9 | 10 | from pyspark import SparkContext 11 | 12 | stepSize = 1 13 | regParam = 1 14 | D = 16 # Number of features 15 | weights = [0 for i in range(D)] # Initial weights 16 | 17 | def parsePoint(line): 18 | values = [float(s) for s in line.split(' ')] 19 | return [values[0], values[1:]] 20 | 21 | def logisticLossGradient(point): 22 | grad = [] 23 | dotprod = 0 24 | label = point[0] 25 | features = point[1] 26 | for i in range(0, D): 27 | dotprod += features[i] * weights[i] 28 | tmp = 1 / (1 + exp(-dotprod)) - label 29 | for i in range(0, D): 30 | grad += [features[i] * tmp] 31 | return grad 32 | 33 | def mySum(a, b): 34 | for i in range (0, D): 35 | a[i] += b[i] 36 | return a 37 | 38 | if __name__ == "__main__": 39 | if len(sys.argv) != 3: 40 | print >> sys.stderr, "Usage: sparkLR " 41 | exit(-1) 42 | 43 | sc = SparkContext(appName="pysparkLR") 44 | points = sc.textFile(sys.argv[1]).map(parsePoint).persist() 45 | 46 | N = points.count() 47 | 48 | iterations = int(sys.argv[2]) 49 | for i in range(0, iterations): 50 | gradient = points.map(logisticLossGradient).reduce(mySum) 51 | iterStepSize = stepSize / sqrt(i + 1) 52 | for j in range(0, D): 53 | weights[j] -= iterStepSize * (gradient[j] / N + regParam * weights[j]) 54 | # format and output weights to stdout 55 | line = str(weights[0]) 56 | for i in range (1, D): 57 | line += " " + str(weights[i]) 58 | sys.stdout.write(line + "\n") 59 | sc.stop() 60 | -------------------------------------------------------------------------------- /benchmark/skaleLR.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 'use strict'; 3 | 4 | const sc = require('skale').context(); 5 | 6 | function logisticLossGradient(p, weights) { 7 | const grad = []; 8 | const label = p[0]; 9 | const features = p[1]; 10 | let dotProd = 0; 11 | 12 | for (let i = 0; i < features.length; i++) 13 | dotProd += features[i] * weights[i]; 14 | 15 | const tmp = 1 / (1 + Math.exp(-dotProd)) - label; 16 | 17 | for (let i = 0; i < features.length; i++) 18 | grad[i] = features[i] * tmp; 19 | return grad; 20 | } 21 | 22 | function sum(a, b) { 23 | for (let i = 0; i < b.length; i++) 24 | a[i] += b[i]; 25 | return a; 26 | } 27 | 28 | function featurize(line) { 29 | const tmp = line.split(' ').map(Number); 30 | const label = tmp.shift(); // [-1,1] labels 31 | const features = tmp; 32 | return [label, features]; 33 | } 34 | 35 | const file = process.argv[2]; 36 | const nIterations = +process.argv[3] || 10; 37 | const points = sc.textFile(file).map(featurize).persist(); 38 | const D = 16; 39 | const stepSize = 1; 40 | const regParam = 1; 41 | 42 | const zero = Array(D).fill(0); 43 | const weights = Array(D).fill(0); 44 | 45 | if (!file) throw 'Usage: lr.js file [nIterations]'; 46 | 47 | points.count(function (err, data) { 48 | const N = data; 49 | let i = 0; 50 | 51 | function iterate() { 52 | points.map(logisticLossGradient, weights) 53 | .reduce(sum, zero) 54 | .then(function(gradient) { 55 | const iss = stepSize / Math.sqrt(i + 1); 56 | for (let j = 0; j < weights.length; j++) { 57 | weights[j] -= iss * (gradient[j] / N + regParam * weights[j]); 58 | } 59 | if (++i < nIterations) return iterate(); 60 | console.log(weights); 61 | sc.end(); 62 | }); 63 | } 64 | iterate(); 65 | }); 66 | -------------------------------------------------------------------------------- /ml/standard-scaler.js: 
-------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | const thenify = require('thenify'); 4 | 5 | module.exports = StandardScaler; 6 | 7 | function StandardScaler() { 8 | // Transform is defined here to be automatically serialized 9 | // along with the object instance and be usable inside worker callbacks. 10 | this.transform = function (point) { 11 | let pointStd = []; 12 | for (let i = 0; i < point.length; i++) 13 | pointStd[i] = (point[i] - this.mean[i]) / this.std[i]; 14 | return pointStd; 15 | }; 16 | this.mean; // set by fit() 17 | this.std; // set by fit() 18 | } 19 | 20 | function meanReducer(acc, features) { 21 | for (let i = 0; i < features.length; i++) 22 | acc.sum[i] = (acc.sum[i] || 0) + features[i]; 23 | acc.count++; 24 | return acc; 25 | } 26 | 27 | function meanCombiner(a, b) { 28 | if (a.sum.length === 0) return b; 29 | for (let i = 0; i < b.sum.length; i++) 30 | a.sum[i] += b.sum[i]; 31 | a.count += b.count; 32 | return a; 33 | } 34 | 35 | function stddevReducer(acc, features) { 36 | for (let i = 0; i < features.length; i++) { 37 | let delta = features[i] - acc.mean[i]; 38 | acc.sum[i] = (acc.sum[i] || 0) + delta * delta; 39 | } 40 | return acc; 41 | } 42 | 43 | function stddevCombiner(a, b) { 44 | if (a.sum.length === 0) return b; 45 | for (let i = 0; i < b.sum.length; i++) 46 | a.sum[i] += b.sum[i]; 47 | return a; 48 | } 49 | 50 | StandardScaler.prototype.fit = thenify(function (points, done) { 51 | const self = this; 52 | 53 | // Compute the mean of each feature 54 | points.aggregate(meanReducer, meanCombiner, {sum: [], count: 0}, function(err, data) { 55 | self.count = data.count; 56 | self.mean = []; 57 | for (let i = 0; i < data.sum.length; i++) 58 | self.mean[i] = data.sum[i] / data.count; 59 | 60 | // Now that we have the mean of each feature, let's compute their standard deviation 61 | points.aggregate(stddevReducer, stddevCombiner, {sum: [], mean: self.mean}, function(err, res) { 62 | self.std = []; 63 | for (let i = 0; i < res.sum.length; i++) 64 | self.std[i] = Math.sqrt(res.sum[i] / self.count); 65 | done(); 66 | }); 67 | }); 68 | }); 69 | -------------------------------------------------------------------------------- /test/save-s3.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const aws = require('aws-sdk'); 3 | const sc = require('skale').context(); 4 | 5 | const skip = process.env.CI || (process.env.AWS_ACCESS_KEY_ID ? false : true); 6 | const s3 = skip ? 
null : new aws.S3({httpOptions: {timeout: 3600000}, signatureVersion: 'v4'}); 7 | const savedir = 's3://skale-test-eu-west-1/test/save'; 8 | 9 | t.test('save s3', {skip: skip}, function (t) { 10 | t.plan(3); 11 | 12 | deleteS3Dir('skale-test-eu-west-1', 'test/save/', function (err) { 13 | t.ok(!err, 'delete S3 previous saved test data'); 14 | sc.range(10) 15 | .save(savedir, function (err) { 16 | t.ok(!err, 'save returns no error'); 17 | sc.textFile(savedir + '/') 18 | .map(a => JSON.parse(a)) 19 | .collect(function (err, res) { 20 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'saved content is correct'); 21 | }); 22 | }); 23 | }); 24 | }); 25 | 26 | t.test('save s3 gzip', {skip: skip}, function (t) { 27 | t.plan(3); 28 | 29 | deleteS3Dir('skale-test-eu-west-1', 'test/save/', function (err) { 30 | t.ok(!err, 'delete S3 previous saved test data'); 31 | sc.range(10) 32 | .save(savedir, {gzip: true}, function (err) { 33 | t.ok(!err, 'save returns no error'); 34 | sc.textFile(savedir + '/') 35 | .map(a => JSON.parse(a)) 36 | .collect(function (err, res) { 37 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'saved content is correct'); 38 | sc.end(); 39 | }); 40 | }); 41 | }); 42 | }); 43 | 44 | if (skip) sc.end(); 45 | 46 | function deleteS3Dir(bucket, prefix, done) { 47 | function getList(list, token, done) { 48 | s3.listObjectsV2({ 49 | Bucket: bucket, 50 | Prefix: prefix, 51 | ContinuationToken: token 52 | }, function (err, data) { 53 | if (err) throw new Error('s3.listObjectsV2 failed'); 54 | list = list.concat(data.Contents); 55 | if (data.IsTruncated) 56 | return getList(list, data.NextContinuationToken, done); 57 | done(err, list); 58 | }); 59 | } 60 | 61 | getList([], null, function (err, res) { 62 | if (!res || !res.length) return done(); 63 | s3.deleteObjects({ 64 | Bucket: bucket, 65 | Delete: { 66 | Objects: res.map(o => ({Key: o.Key})) 67 | } 68 | }, done); 69 | }); 70 | } 71 | -------------------------------------------------------------------------------- /test/save-azure.js: -------------------------------------------------------------------------------- 1 | const t = require('tape'); 2 | const azure = require('azure-storage'); 3 | const sc = require('skale').context(); 4 | 5 | //const skip = process.env.CI || (process.env.AZURE_STORAGE_CONNECTION_STRING ? false : true); 6 | const skip = true; 7 | const retry = new azure.ExponentialRetryPolicyFilter(); 8 | const az = skip ? 
null : azure.createBlobService().withFilter(retry); 9 | const savedir = 'wasb://skalejs/save'; 10 | 11 | t.test('save azure', {skip: skip}, function (t) { 12 | t.plan(3); 13 | 14 | deleteAzureDir('save', '', function (err) { 15 | t.ok(!err, 'delete previous saved test data'); 16 | sc.range(10) 17 | .save(savedir, function (err) { 18 | t.ok(!err, 'save returns no error'); 19 | sc.textFile(savedir + '/') 20 | .map(a => JSON.parse(a)) 21 | .collect(function (err, res) { 22 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'saved content is correct'); 23 | }); 24 | }); 25 | }); 26 | }); 27 | 28 | t.test('save azure gzip', {skip: skip}, function (t) { 29 | t.plan(3); 30 | 31 | deleteAzureDir('save', '', function (err) { 32 | t.ok(!err, 'delete previous saved test data'); 33 | sc.range(10) 34 | .save(savedir, {gzip: true}, function (err) { 35 | t.ok(!err, 'save returns no error'); 36 | sc.textFile(savedir + '/') 37 | .map(a => JSON.parse(a)) 38 | .collect(function (err, res) { 39 | t.deepEqual(res, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'saved content is correct'); 40 | sc.end(); 41 | }); 42 | }); 43 | }); 44 | }); 45 | 46 | if (skip) sc.end(); 47 | 48 | function deleteAzureDir(container, prefix, done) { 49 | function getList(list, token, done) { 50 | az.listBlobsSegmentedWithPrefix(container, prefix, token, function (err, data) { 51 | if (err) throw new Error('az.listBlobsSegmented failed'); 52 | list = list.concat(data.entries); 53 | if (data.continuationToken) 54 | return getList(list, data.continuationToken, done); 55 | done(err, list); 56 | }); 57 | } 58 | 59 | getList([], null, function (err, res) { 60 | if (!res || !res.length) return done(); 61 | let toDelete = res.length; 62 | res.forEach(function (element) { 63 | az.deleteBlob(container, element.name, function (err) { 64 | if (--toDelete <= 0) done(err); 65 | }); 66 | }); 67 | }); 68 | } 69 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "skale", 3 | "version": "1.2.2", 4 | "license": "Apache-2.0", 5 | "description": "parallel and distributed data processing engine", 6 | "main": "index.js", 7 | "scripts": { 8 | "start": "pf=/tmp/skale-server.pid; test -f $pf && exit 1; bin/server.js -l 0 & echo $! 
> $pf", 9 | "stop": "pf=/tmp/skale-server.pid; test -f $pf || exit 1; kill $(cat $pf); rm -f $pf", 10 | "pretest": "eslint .", 11 | "tape": "tape \"test/**/*.js\"", 12 | "test": "make -C test", 13 | "version": "github_changelog_generator --future-release $npm_package_version" 14 | }, 15 | "bin": { 16 | "skale-server": "./bin/server.js", 17 | "skale-worker": "./bin/worker.js", 18 | "skale-shell": "./bin/shell.js" 19 | }, 20 | "repository": "skale-me/skale", 21 | "bugs": { 22 | "url": "https://github.com/skale-me/skale/issues" 23 | }, 24 | "keywords": [ 25 | "big data", 26 | "ETL", 27 | "distributed", 28 | "data processing", 29 | "machine learning", 30 | "cloud", 31 | "S3", 32 | "azure", 33 | "parallel", 34 | "cluster", 35 | "hpc" 36 | ], 37 | "author": "Skale team", 38 | "dependencies": { 39 | "await-outside": "^2.1.2", 40 | "aws-sdk": "^2.382.0", 41 | "azure-storage": "^2.10.2", 42 | "browserify": "^16.2.3", 43 | "callsite": "^1.0.0", 44 | "merge2": "^1.2.3", 45 | "micromatch": "^3.1.10", 46 | "mkdirp": "^0.5.1", 47 | "node-getopt": "^0.3.2", 48 | "resolve": "^1.9.0", 49 | "rimraf": "^2.6.2", 50 | "seedrandom": "^2.4.4", 51 | "thenify": "^3.3.0", 52 | "uuid": "^3.3.2", 53 | "websocket-stream": "^5.1.2", 54 | "ws": "^6.1.2" 55 | }, 56 | "devDependencies": { 57 | "eslint": "^5.11.0", 58 | "plotter": "^0.5.0", 59 | "tape": "^4.9.1" 60 | }, 61 | "peerDependencies": { 62 | "node-parquet": "^0.2.4" 63 | }, 64 | "engines": { 65 | "node": ">=6.0" 66 | }, 67 | "eslintConfig": { 68 | "rules": { 69 | "indent": [ 70 | 2, 71 | 2 72 | ], 73 | "quotes": [ 74 | 2, 75 | "single" 76 | ], 77 | "semi": [ 78 | 2, 79 | "always" 80 | ], 81 | "no-var": 2, 82 | "no-console": 0 83 | }, 84 | "env": { 85 | "es6": true, 86 | "node": true 87 | }, 88 | "parserOptions": { 89 | "ecmaVersion": 2017 90 | }, 91 | "extends": "eslint:recommended" 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Skale Docker 2 | 3 | This directory contains a sample Dockerfile for skale container, based on [Alpine](https://hub.docker.com/_/alpine/) container. 4 | 5 | It also contains a sample Compose file to deploy a minimal stack on a single host or a docker swarm. 6 | 7 | This docker configuration is not meant for production, ok for evaluation or experimentation. 8 | 9 | ## Installing 10 | 11 | As a prerequisite, [Docker](https://docker.com) must be installed, in version v1.12.0 or higher. 12 | 13 | To download this docker image from the public docker hub: 14 | 15 | $ docker pull skale/skale 16 | 17 | To re-build this image from the dockerfile: 18 | 19 | $ docker build -t skale/skale . 20 | 21 | ## Deploying on a single host 22 | 23 | This can be done simply with `docker-compose` and the provided `docker-compose.yml` file: 24 | 25 | $ docker-compose up 26 | 27 | ## Deploying on a cluster 28 | 29 | The provided image and compose files are compatible to run distributed skale using the docker engine in [swarm mode](https://docs.docker.com/engine/swarm/). 
30 | 31 | First create a cluster of docker machines in a swarm (see docker [documentation](https://docs.docker.com/engine/swarm/swarm-tutorial/create-swarm/)). 32 | 33 | Once a docker swarm is ready, one can deploy a skale stack using the `stack` command and the same `docker-compose.yml` file: 34 | 35 | $ docker stack deploy -c docker-compose.yml skale 36 | 37 | Then you can adjust the size of the skale cluster by setting the number of worker controllers: 38 | 39 | $ docker service scale skale_skale-worker=3 40 | 41 | There should be one instance of skale-worker per host. During jobs, each worker controller will spawn as many worker processes as CPUs on each host. 42 | 43 | ## Running programs 44 | 45 | To execute skale programs on the previously deployed skale stack, the `SKALE_HOST` environment variable must point to the cluster public address, i.e. the one given by `docker info` on the docker host (or the swarm master): 46 | 47 | $ docker info | grep 'Node Address' 48 | Node Address: 192.168.99.101 49 | 50 | For example, to run a sample program from the examples directory: 51 | 52 | $ SKALE_DEBUG=2 SKALE_HOST=192.168.99.101 ../examples/parallelize.js 53 | [master 0.050s] workers: 3 54 | [master 0.054s] start result stage, partitions: 3 55 | [master 0.067s] part 0 from worker-w17 (1/3) 56 | [master 0.075s] part 1 from worker-w18 (2/3) 57 | [master 0.080s] part 2 from worker-w19 (3/3) 58 | [ 1, 2, 3, 4, 5 ] 59 | -------------------------------------------------------------------------------- /ml/kmeans.js: -------------------------------------------------------------------------------- 1 | // Unsupervised clustering model using K-means 2 | // Authors: M. Vertes (current), C. Artigue (preliminary) 3 | // License: Apache License 2.0 4 | 5 | 'use strict'; 6 | 7 | const thenify = require('thenify'); 8 | 9 | function KMeans(nClusters, options) { 10 | if (!(this instanceof KMeans)) 11 | return new KMeans(nClusters, options); 12 | options = options || {}; 13 | this.nClusters = nClusters; 14 | this.maxMse = options.maxMse || 0.0000001; 15 | this.means = options.means; 16 | this.maxIterations = options.maxIterations || 100; 17 | 18 | // Return the cluster index whose center is closest to the element (euclidean norm) 19 | // The function is defined on the instance (vs the prototype) so it is serialized to workers 20 | this.predict = function (element) { 21 | let means = this.means; 22 | let smallestSn = Infinity; 23 | let smallestSnIdx; 24 | for (let i = 0; i < means.length; i++) { 25 | let sn = 0; 26 | for (let j = 0; j < element.length; j++) { 27 | let delta = element[j] - means[i][j]; 28 | sn += delta * delta; 29 | } 30 | if (sn < smallestSn) { 31 | smallestSnIdx = i; 32 | smallestSn = sn; 33 | } 34 | } 35 | return smallestSnIdx; 36 | }; 37 | } 38 | 39 | KMeans.prototype.fit = thenify(function(trainingSet, done) { 40 | const self = this; 41 | let iter = 0; 42 | 43 | if (self.means === undefined) { 44 | trainingSet.takeSample(false, self.nClusters, function (err, means) { 45 | self.means = means; 46 | iterate(); 47 | }); 48 | } else iterate(); 49 | 50 | function accumulate(a, b) { 51 | a.sum += b.sum; 52 | for (let i = 0; i < b.data.length; i++) 53 | a.data[i] += b.data[i]; 54 | return a; 55 | } 56 | 57 | function iterate() { 58 | trainingSet 59 | .map((a, self) => [self.predict(a), {data: a, sum: 1}], self) 60 | .reduceByKey(accumulate, {data: Array(self.means.length).fill(0), sum: 0}) 61 | .map(a => a[1].data.map(e => e / a[1].sum)) 62 | .collect(function (err, means) { 63 | let mse = 0; 64 | for (let i = 0; i < 
self.nClusters; i++) { 65 | for (let j = 0; j < means[i].length; j++) { 66 | let delta = means[i][j] - self.means[i][j]; 67 | mse += delta * delta; 68 | } 69 | } 70 | self.means = means; 71 | if (mse < self.maxMse || iter++ > self.maxIterations) 72 | return done(); 73 | iterate(); 74 | }); 75 | } 76 | }); 77 | 78 | module.exports = KMeans; 79 | -------------------------------------------------------------------------------- /ml/classification-metrics.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | const thenify = require('thenify'); 4 | 5 | function reducer(acc, point) { 6 | let [prediction, label] = point; 7 | for (let i = 0; i < acc.length; i++) { 8 | let threshold = i / acc.length; 9 | if (prediction > threshold) { 10 | if (label > 0) 11 | acc[i].tp++; // True positive 12 | else 13 | acc[i].fp++; // False positive 14 | } else { 15 | if (label > 0) 16 | acc[i].fn++; // False negative 17 | else 18 | acc[i].tn++; // True negative 19 | } 20 | } 21 | return acc; 22 | } 23 | 24 | function combiner(acc1, acc2) { 25 | for (let i = 0; i < acc1.length; i++) { 26 | acc1[i].tp += acc2[i].tp; 27 | acc1[i].tn += acc2[i].tn; 28 | acc1[i].fp += acc2[i].fp; 29 | acc1[i].fn += acc2[i].fn; 30 | } 31 | return acc1; 32 | } 33 | 34 | // Compute area under curve, where curve is an Array of Objects {x, y} 35 | function areaUnder(curve, sortx) { 36 | const sorted = sortx ? curve.sort((a, b) => a.x - b.x) : curve; 37 | let auc = 0; 38 | let {x, y} = sorted[0]; 39 | 40 | for (let i = 0; i < sorted.length; i++) { 41 | let e = sorted[i]; 42 | auc += (e.x - x) * (y + (e.y - y) / 2); 43 | x = e.x; 44 | y = e.y; 45 | } 46 | return auc; 47 | } 48 | 49 | const classificationMetrics = thenify(function (points, options, callback) { 50 | options = options || {}; 51 | const steps = Number(options.steps) || 10; 52 | const init = Array.from({length: steps}, () => ({tp: 0, tn: 0, fp: 0, fn: 0})); // One distinct confusion matrix per threshold (fill() would share a single object) 53 | 54 | points.aggregate(reducer, combiner, init, function (error, result) { 55 | result.map((e, i) => { 56 | e.threshold = i / steps; 57 | e.precision = e.tp / (e.tp + e.fp); // Also called Positive Predictive Value (PPV) 58 | e.recall = e.tp / (e.tp + e.fn); // Also called True Positive Rate (TPR) or sensitivity 59 | e.accuracy = (e.tp + e.tn) / (e.tp + e.tn + e.fp + e.fn); 60 | e.specificity = e.tn / (e.tn + e.fp); // Also called True Negative Rate (TNR) 61 | e.fpr = e.fp / (e.fp + e.tn); 62 | e.f1 = 2 / (1 / e.recall + 1 / e.precision); // F1 measure 63 | e.J = e.recall + e.specificity - 1; // Youden's J statistic 64 | return e; 65 | }); 66 | const auROC = areaUnder(result.map(a => ({x: a.fpr, y: a.recall})), true); 67 | const auPR = areaUnder(result.map(a => ({x: a.recall, y: a.precision})), true); 68 | const maxF1 = result.reduce((a, b) => a.f1 > b.f1 ? a : b, result[0]); 69 | callback(null, {rates: result, auROC: auROC, auPR: auPR, threshold: maxF1.threshold}); 70 | }); 71 | }); 72 | 73 | module.exports = classificationMetrics; 74 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # A first benchmark 2 | 3 | In this first benchmark, the goal is to evaluate the performance 4 | of the skale data processing engine. 5 | 6 | For that, we use an iterative machine learning application: 7 | logistic regression. 
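For reference, the per-point quantity that both implementations compute in their `logisticLossGradient` function is the feature vector scaled by the residual of the sigmoid prediction (in LaTeX notation):

    \nabla_w \ell(x, y) = \left( \sigma(w \cdot x) - y \right) x, \qquad \sigma(z) = \frac{1}{1 + e^{-z}}

Each iteration sums this gradient over all points, averages it by dividing by N, and applies an L2-regularized update with step size stepSize / sqrt(i + 1), which is exactly the weight update loop found in both sparkLR.py and skaleLR.js.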
8 | 9 | ## Overview 10 | 11 | The same algorithm is implemented in both NodeJS/Skale ([skaleLR.js]) 12 | and Python/Spark ([sparkLR.py]). Note that we do not use the spark-ml 13 | library or the skale-ml library, because our focus here is on 14 | core engine performance, and we also want this test to remain 15 | standalone and stable across multiple versions of Skale and Spark. 16 | 17 | We use the exact same input file for both, stored on the local file 18 | system, and run the test using the same host. We randomly 19 | generated data using the [gen_data.js] program. 20 | 21 | ## Results 22 | 23 | Tests are performed on an AWS EC2 m4.4xlarge instance, with 16 cores and 64 GB RAM. 24 | The spark environment is spark-1.6.1, java-1.8.0_31, Python-3.4.2 (from https://hub.docker.com/r/gettyimages/spark/). 25 | The skale environment is skale-0.4.5, nodejs-4.4.3. 26 | 27 | The spark test is run using the following command: 28 | 29 | $ spark-submit --executor-memory 60G sparkLR.py 30 | 31 | The skale test is run using: 32 | 33 | $ skale-run --worker 16 --memory 4000 -- 34 | 35 | The first set of results is for a 1 GB input file, with 3.829 million entries of 16 features. 36 | 37 | |iterations | skale (sec) | spark (sec) | speedup | 38 | |-----------|-------------|-------------|---------| 39 | |1 | 4.6 | 19.8 | 4.3 | 40 | |4 | 5 | 34.8 | 7.0 | 41 | |10 | 6.5 | 65.9 | 10.1 | 42 | |20 | 8.7 | 116 | 13.3 | 43 | |70 | 20 | 369 | 18.5 | 44 | |100 | 26 | 522 | 20.1 | 45 | 46 | ![logreg1](logreg-1.png) 47 | 48 | The second set of results is for a 10 GB input file, with 38.29 million entries of 16 features. 49 | 50 | |iterations | skale (sec) | spark (sec) | speedup | 51 | |-----------|-------------|-------------|---------| 52 | |1 | 43.9 | 164.9 | 3.8 | 53 | |4 | 48.1 | 455 | 9.5 | 54 | |10 | 58.8 | 1038 | 17.7 | 55 | |20 | 82.2 | 2010 | 24.5 | 56 | |50 | 170 | 4927 | 29 | 57 | |100 | 224 | 9772 | 43.6 | 58 | 59 | ![logreg1](logreg-10.png) 60 | 61 | ## Call for contribution 62 | 63 | It would be nice to have a spark/scala version of this benchmark, which 64 | could possibly perform better than the spark/python version. 65 | 66 | [skaleLR.js]: skaleLR.js 67 | [sparkLR.py]: sparkLR.py 68 | [gen_data.js]: gen_data.js 69 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Interested in contributing to skale? We'd love 4 | your help. Skale-engine is an open source project, built one 5 | contribution at a time by users just like you. 6 | 7 | The [Code of Conduct] details the bare minimum behavior 8 | expectations required from contributors. 9 | 10 | ## Where to get help or report a problem 11 | 12 | * If you have a question about using skale, start a discussion 13 | on [gitter] or on [google group]. Please do not open issues for 14 | questions or support requests. 15 | * If you think you have found a bug within skale, open an 16 | [issue]. Do not forget to check that it doesn't already exist 17 | in our [issue database]. 18 | * If you want to learn more about skale internals, architecture and 19 | how to extend skale, see the 20 | [Skale Hacker's Guide](docs/skale-hackers-guide.md) 21 | * If you have a suggestion for improvement or a new feature, create 22 | a [pull request] so it can be discussed and reviewed by the 23 | community and project committers. Even the project committers 24 | submit their code this way.
25 | 26 | ## Submitting a pull request 27 | 28 | * Create your own [fork] on github, then checkout your fork 29 | * Write your code in your local copy. It's good practice to create 30 | a branch for each new issue you work on, although not compulsory 31 | * Your code must follow existing coding style, and tests must pass. 32 | To check coding style, run `npm run lint`. The [coding style] of skale 33 | is the same as in core NodeJS. 34 | To run the tests, first run `npm install`, then `npm test` 35 | * If the tests pass, you can commit changes to your fork and then 36 | create a pull request from there. Reference any relevant issue by 37 | including its number in the message, e.g. #123 38 | 39 | ## Writing documentation 40 | 41 | The [documentation guidelines] from Google provide a good reference 42 | for writing consistent and good technical documents, in particular 43 | [API documentation rules]. 44 | 45 | Note: skale documentation was started before knowing this standard, 46 | thus is not yet fully compliant! Please help us to write better 47 | docs. 48 | 49 | ## Coding rules 50 | 51 | In addition to applying the already mentioned [coding style], 52 | the following conventions should be applied as well: 53 | 54 | * Use `const` instead of `var` for declarations, whenever possible 55 | * Use `let` instead of `var` if reference must be reassigned 56 | * Use array or object destructuring to set variables from array or 57 | object: `let [a, b] = [1, 2, 3]` 58 | * Use arrow functions in callbacks, where applicable: `map`, `reduce`, 59 | `aggregate`, etc 60 | 61 | [Code of Conduct]: CODE_OF_CONDUCT.md 62 | [coding style]: https://github.com/felixge/node-style-guide 63 | [gitter]: https://gitter.im/skale-me/skale 64 | [google group]: https://groups.google.com/forum/#!forum/skale 65 | [issue database]: https://github.com/skale-me/skale/issues 66 | [issue]: https://github.com/skale-me/skale/issues/new 67 | [pull request]: #submitting-a-pull-request 68 | [fork]: https://github.com/skale-me/skale 69 | [documentation guidelines]: https://developers.google.com/style/ 70 | [API documentation rules]: https://developers.google.com/style/api-reference-comments 71 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at contact@skale.me. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Development activity has stopped, and this project is now archived.** 2 | 3 | ![logo](docs/images/logo-skale.png) 4 | 5 | [![Build Status](https://travis-ci.org/skale-me/skale.svg?branch=master)](https://travis-ci.org/skale-me/skale) 6 | [![Build Status](https://ci.appveyor.com/api/projects/status/github/skale-me/skale?svg=true)](https://ci.appveyor.com/project/skaleme/skale) 7 | [![npm badge](https://img.shields.io/npm/v/skale.svg)](https://www.npmjs.com/package/skale) 8 | 9 | 10 | High performance distributed data processing and machine learning. 11 | 12 | Skale provides a high-level API in Javascript and an optimized 13 | parallel execution engine on top of NodeJS. 14 | 15 | ## Features 16 | * Pure javascript implementation of a Spark-like engine 17 | * Multiple data sources: filesystems, databases, cloud (S3, Azure) 18 | * Multiple data formats: CSV, JSON, Columnar (Parquet)... 19 | * 50 high level operators to build parallel apps 20 | * Machine learning: scalable classification, regression, clustering 21 | * Run interactively in a NodeJS REPL shell 22 | * Docker [ready](docker/), simple local mode or full distributed mode 23 | * Very fast, see [benchmark](benchmark/) 24 | 25 | ## Quickstart 26 | ```sh 27 | npm install skale 28 | ``` 29 | 30 | Word count example: 31 | 32 | ```javascript 33 | var sc = require('skale').context(); 34 | 35 | sc.textFile('/my/path/*.txt') 36 | .flatMap(line => line.split(' ')) 37 | .map(word => [word, 1]) 38 | .reduceByKey((a, b) => a + b, 0) 39 | .count(function (err, result) { 40 | console.log(result); 41 | sc.end(); 42 | }); 43 | ``` 44 | 45 | ### Local mode 46 | In local mode, worker processes are automatically forked and 47 | communicate with the app through a child process IPC channel. This is 48 | the simplest way to operate, and it allows the app to use all 49 | available machine cores. 50 | 51 | To run in local mode, just execute your app script: 52 | ```sh 53 | node my_app.js 54 | ``` 55 | 56 | or with debug traces: 57 | ```sh 58 | SKALE_DEBUG=2 node my_app.js 59 | ``` 60 | 61 | ### Distributed mode 62 | In distributed mode, a cluster server process and worker processes 63 | must be started before starting the app. Processes communicate with each 64 | other via raw TCP or via websockets.
65 | 66 | To run in distributed cluster mode, first start a cluster server 67 | on `server_host`: 68 | ```sh 69 | ./bin/server.js 70 | ``` 71 | 72 | On each worker host, start a worker controller process which connects 73 | to the server: 74 | ```sh 75 | ./bin/worker.js -H server_host 76 | ``` 77 | 78 | Then run your app, setting the cluster server host in the environment: 79 | ```sh 80 | SKALE_HOST=server_host node my_app.js 81 | ``` 82 | 83 | The same with debug traces: 84 | ```sh 85 | SKALE_HOST=server_host SKALE_DEBUG=2 node my_app.js 86 | ``` 87 | 88 | ## Resources 89 | 90 | * [Contributing guide](CONTRIBUTING.md) 91 | * [Documentation](https://skale-me.github.io/skale) 92 | * [Gitter](https://gitter.im/skale-me/skale-engine) for support and 93 | discussion 94 | * [Mailing list](https://groups.google.com/forum/#!forum/skale) 95 | for discussion about use and development 96 | 97 | ## Authors 98 | 99 | The original authors of skale are [Cedric Artigue](https://github.com/CedricArtigue) and [Marc Vertes](https://github.com/mvertes). 100 | 101 | [List of all 102 | contributors](https://github.com/skale-me/skale/graphs/contributors) 103 | 104 | ## License 105 | 106 | [Apache-2.0](LICENSE) 107 | 108 | ## Credits 109 | 110 |
Logo Icon made by Smashicons from www.flaticon.com is licensed by CC 3.0 BY
111 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ![logo](images/logo-skale.png) 2 | 3 | [![Build Status](https://travis-ci.org/skale-me/skale.svg?branch=master)](https://travis-ci.org/skale-me/skale) 4 | [![Build Status](https://ci.appveyor.com/api/projects/status/github/skale-me/skale?svg=true)](https://ci.appveyor.com/project/skaleme/skale) 5 | [![npm badge](https://img.shields.io/npm/v/skale.svg)](https://www.npmjs.com/package/skale) 6 | 7 | 8 | High performance distributed data processing and machine learning. 9 | 10 | Skale provides a high-level API in Javascript and an optimized 11 | parallel execution engine on top of NodeJS. 12 | 13 | ## Features 14 | * Pure javascript implementation of a Spark-like engine 15 | * Multiple data sources: filesystems, databases, cloud (S3, Azure) 16 | * Multiple data formats: CSV, JSON, Columnar (Parquet)... 17 | * 50 high level operators to build parallel apps 18 | * Machine learning: scalable classification, regression, clustering 19 | * Run interactively in a NodeJS REPL shell 20 | * Docker [ready](https://github.com/skale-me/skale/blob/master/docker/), simple local mode or full distributed mode 21 | * Very fast, see [benchmark](https://github.com/skale-me/skale/blob/master/benchmark/) 22 | 23 | ## Quickstart 24 | ```sh 25 | npm install skale 26 | ``` 27 | 28 | Word count example: 29 | 30 | ```javascript 31 | var sc = require('skale').context(); 32 | 33 | sc.textFile('/my/path/*.txt') 34 | .flatMap(line => line.split(' ')) 35 | .map(word => [word, 1]) 36 | .reduceByKey((a, b) => a + b, 0) 37 | .count(function (err, result) { 38 | console.log(result); 39 | sc.end(); 40 | }); 41 | ``` 42 | 43 | ### Local mode 44 | In local mode, worker processes are automatically forked and 45 | communicate with the app through a child process IPC channel. This is 46 | the simplest way to operate, and it allows the app to use all 47 | available machine cores. 48 | 49 | To run in local mode, just execute your app script: 50 | ```sh 51 | node my_app.js 52 | ``` 53 | 54 | or with debug traces: 55 | ```sh 56 | SKALE_DEBUG=2 node my_app.js 57 | ``` 58 | 59 | ### Distributed mode 60 | In distributed mode, a cluster server process and worker processes 61 | must be started before starting the app. Processes communicate with each 62 | other via raw TCP or via websockets. 63 | 64 | To run in distributed cluster mode, first start a cluster server 65 | on `server_host`: 66 | ```sh 67 | ./bin/server.js 68 | ``` 69 | 70 | On each worker host, start a worker controller process which connects 71 | to the server: 72 | ```sh 73 | ./bin/worker.js -H server_host 74 | ``` 75 | 76 | Then run your app, setting the cluster server host in the environment: 77 | ```sh 78 | SKALE_HOST=server_host node my_app.js 79 | ``` 80 | 81 | The same with debug traces: 82 | ```sh 83 | SKALE_HOST=server_host SKALE_DEBUG=2 node my_app.js 84 | ``` 85 | 86 | ## Resources 87 | 88 | * [Contributing guide](https://github.com/skale-me/skale/blob/master/CONTRIBUTING.md) 89 | * [Gitter](https://gitter.im/skale-me/skale-engine) for support and 90 | discussion 91 | * [Mailing list](https://groups.google.com/forum/#!forum/skale) 92 | for discussion about use and development 93 | 94 | ## Authors 95 | 96 | The original authors of skale are [Cedric Artigue](https://github.com/CedricArtigue) and [Marc Vertes](https://github.com/mvertes).
97 | 98 | [List of all 99 | contributors](https://github.com/skale-me/skale/graphs/contributors) 100 | 101 | ## License 102 | 103 | [Apache-2.0](https://github.com/skale-me/skale/blob/master/LICENSE) 104 | 105 | ## Credits 106 | 107 |
Logo Icon made by Smashicons from www.flaticon.com is licensed by CC 3.0 BY
108 | -------------------------------------------------------------------------------- /test/data/iris.csv: -------------------------------------------------------------------------------- 1 | Sepal length,Sepal Width,Petal length,Petal width,Species 2 | 5.1,3.5,1.4,0.2,Iris setosa 3 | 4.9,3.0,1.4,0.2,Iris setosa 4 | 4.7,3.2,1.3,0.2,Iris setosa 5 | 4.6,3.1,1.5,0.2,Iris setosa 6 | 5.0,3.6,1.4,0.2,Iris setosa 7 | 5.4,3.9,1.7,0.4,Iris setosa 8 | 4.6,3.4,1.4,0.3,Iris setosa 9 | 5.0,3.4,1.5,0.2,Iris setosa 10 | 4.4,2.9,1.4,0.2,Iris setosa 11 | 4.9,3.1,1.5,0.1,Iris setosa 12 | 5.4,3.7,1.5,0.2,Iris setosa 13 | 4.8,3.4,1.6,0.2,Iris setosa 14 | 4.8,3.0,1.4,0.1,Iris setosa 15 | 4.3,3.0,1.1,0.1,Iris setosa 16 | 5.8,4.0,1.2,0.2,Iris setosa 17 | 5.7,4.4,1.5,0.4,Iris setosa 18 | 5.4,3.9,1.3,0.4,Iris setosa 19 | 5.1,3.5,1.4,0.3,Iris setosa 20 | 5.7,3.8,1.7,0.3,Iris setosa 21 | 5.1,3.8,1.5,0.3,Iris setosa 22 | 5.4,3.4,1.7,0.2,Iris setosa 23 | 5.1,3.7,1.5,0.4,Iris setosa 24 | 4.6,3.6,1.0,0.2,Iris setosa 25 | 5.1,3.3,1.7,0.5,Iris setosa 26 | 4.8,3.4,1.9,0.2,Iris setosa 27 | 5.0,3.0,1.6,0.2,Iris setosa 28 | 5.0,3.4,1.6,0.4,Iris setosa 29 | 5.2,3.5,1.5,0.2,Iris setosa 30 | 5.2,3.4,1.4,0.2,Iris setosa 31 | 4.7,3.2,1.6,0.2,Iris setosa 32 | 4.8,3.1,1.6,0.2,Iris setosa 33 | 5.4,3.4,1.5,0.4,Iris setosa 34 | 5.2,4.1,1.5,0.1,Iris setosa 35 | 5.5,4.2,1.4,0.2,Iris setosa 36 | 4.9,3.1,1.5,0.1,Iris setosa 37 | 5.0,3.2,1.2,0.2,Iris setosa 38 | 5.5,3.5,1.3,0.2,Iris setosa 39 | 4.9,3.1,1.5,0.1,Iris setosa 40 | 4.4,3.0,1.3,0.2,Iris setosa 41 | 5.1,3.4,1.5,0.2,Iris setosa 42 | 5.0,3.5,1.3,0.3,Iris setosa 43 | 4.5,2.3,1.3,0.3,Iris setosa 44 | 4.4,3.2,1.3,0.2,Iris setosa 45 | 5.0,3.5,1.6,0.6,Iris setosa 46 | 5.1,3.8,1.9,0.4,Iris setosa 47 | 4.8,3.0,1.4,0.3,Iris setosa 48 | 5.1,3.8,1.6,0.2,Iris setosa 49 | 4.6,3.2,1.4,0.2,Iris setosa 50 | 5.3,3.7,1.5,0.2,Iris setosa 51 | 5.0,3.3,1.4,0.2,Iris setosa 52 | 7.0,3.2,4.7,1.4,Iris versicolor 53 | 6.4,3.2,4.5,1.5,Iris versicolor 54 | 6.9,3.1,4.9,1.5,Iris versicolor 55 | 5.5,2.3,4.0,1.3,Iris versicolor 56 | 6.5,2.8,4.6,1.5,Iris versicolor 57 | 5.7,2.8,4.5,1.3,Iris versicolor 58 | 6.3,3.3,4.7,1.6,Iris versicolor 59 | 4.9,2.4,3.3,1.0,Iris versicolor 60 | 6.6,2.9,4.6,1.3,Iris versicolor 61 | 5.2,2.7,3.9,1.4,Iris versicolor 62 | 5.0,2.0,3.5,1.0,Iris versicolor 63 | 5.9,3.0,4.2,1.5,Iris versicolor 64 | 6.0,2.2,4.0,1.0,Iris versicolor 65 | 6.1,2.9,4.7,1.4,Iris versicolor 66 | 5.6,2.9,3.6,1.3,Iris versicolor 67 | 6.7,3.1,4.4,1.4,Iris versicolor 68 | 5.6,3.0,4.5,1.5,Iris versicolor 69 | 5.8,2.7,4.1,1.0,Iris versicolor 70 | 6.2,2.2,4.5,1.5,Iris versicolor 71 | 5.6,2.5,3.9,1.1,Iris versicolor 72 | 5.9,3.2,4.8,1.8,Iris versicolor 73 | 6.1,2.8,4.0,1.3,Iris versicolor 74 | 6.3,2.5,4.9,1.5,Iris versicolor 75 | 6.1,2.8,4.7,1.2,Iris versicolor 76 | 6.4,2.9,4.3,1.3,Iris versicolor 77 | 6.6,3.0,4.4,1.4,Iris versicolor 78 | 6.8,2.8,4.8,1.4,Iris versicolor 79 | 6.7,3.0,5.0,1.7,Iris versicolor 80 | 6.0,2.9,4.5,1.5,Iris versicolor 81 | 5.7,2.6,3.5,1.0,Iris versicolor 82 | 5.5,2.4,3.8,1.1,Iris versicolor 83 | 5.5,2.4,3.7,1.0,Iris versicolor 84 | 5.8,2.7,3.9,1.2,Iris versicolor 85 | 6.0,2.7,5.1,1.6,Iris versicolor 86 | 5.4,3.0,4.5,1.5,Iris versicolor 87 | 6.0,3.4,4.5,1.6,Iris versicolor 88 | 6.7,3.1,4.7,1.5,Iris versicolor 89 | 6.3,2.3,4.4,1.3,Iris versicolor 90 | 5.6,3.0,4.1,1.3,Iris versicolor 91 | 5.5,2.5,4.0,1.3,Iris versicolor 92 | 5.5,2.6,4.4,1.2,Iris versicolor 93 | 6.1,3.0,4.6,1.4,Iris versicolor 94 | 5.8,2.6,4.0,1.2,Iris versicolor 95 | 5.0,2.3,3.3,1.0,Iris versicolor 96 | 5.6,2.7,4.2,1.3,Iris 
versicolor 97 | 5.7,3.0,4.2,1.2,Iris versicolor 98 | 5.7,2.9,4.2,1.3,Iris versicolor 99 | 6.2,2.9,4.3,1.3,Iris versicolor 100 | 5.1,2.5,3.0,1.1,Iris versicolor 101 | 5.7,2.8,4.1,1.3,Iris versicolor 102 | 6.3,3.3,6.0,2.5,Iris virginica 103 | 5.8,2.7,5.1,1.9,Iris virginica 104 | 7.1,3.0,5.9,2.1,Iris virginica 105 | 6.3,2.9,5.6,1.8,Iris virginica 106 | 6.5,3.0,5.8,2.2,Iris virginica 107 | 7.6,3.0,6.6,2.1,Iris virginica 108 | 4.9,2.5,4.5,1.7,Iris virginica 109 | 7.3,2.9,6.3,1.8,Iris virginica 110 | 6.7,2.5,5.8,1.8,Iris virginica 111 | 7.2,3.6,6.1,2.5,Iris virginica 112 | 6.5,3.2,5.1,2.0,Iris virginica 113 | 6.4,2.7,5.3,1.9,Iris virginica 114 | 6.8,3.0,5.5,2.1,Iris virginica 115 | 5.7,2.5,5.0,2.0,Iris virginica 116 | 5.8,2.8,5.1,2.4,Iris virginica 117 | 6.4,3.2,5.3,2.3,Iris virginica 118 | 6.5,3.0,5.5,1.8,Iris virginica 119 | 7.7,3.8,6.7,2.2,Iris virginica 120 | 7.7,2.6,6.9,2.3,Iris virginica 121 | 6.0,2.2,5.0,1.5,Iris virginica 122 | 6.9,3.2,5.7,2.3,Iris virginica 123 | 5.6,2.8,4.9,2.0,Iris virginica 124 | 7.7,2.8,6.7,2.0,Iris virginica 125 | 6.3,2.7,4.9,1.8,Iris virginica 126 | 6.7,3.3,5.7,2.1,Iris virginica 127 | 7.2,3.2,6.0,1.8,Iris virginica 128 | 6.2,2.8,4.8,1.8,Iris virginica 129 | 6.1,3.0,4.9,1.8,Iris virginica 130 | 6.4,2.8,5.6,2.1,Iris virginica 131 | 7.2,3.0,5.8,1.6,Iris virginica 132 | 7.4,2.8,6.1,1.9,Iris virginica 133 | 7.9,3.8,6.4,2.0,Iris virginica 134 | 6.4,2.8,5.6,2.2,Iris virginica 135 | 6.3,2.8,5.1,1.5,Iris virginica 136 | 6.1,2.6,5.6,1.4,Iris virginica 137 | 7.7,3.0,6.1,2.3,Iris virginica 138 | 6.3,3.4,5.6,2.4,Iris virginica 139 | 6.4,3.1,5.5,1.8,Iris virginica 140 | 6.0,3.0,4.8,1.8,Iris virginica 141 | 6.9,3.1,5.4,2.1,Iris virginica 142 | 6.7,3.1,5.6,2.4,Iris virginica 143 | 6.9,3.1,5.1,2.3,Iris virginica 144 | 5.8,2.7,5.1,1.9,Iris virginica 145 | 6.8,3.2,5.9,2.3,Iris virginica 146 | 6.7,3.3,5.7,2.5,Iris virginica 147 | 6.7,3.0,5.2,2.3,Iris virginica 148 | 6.3,2.5,5.0,1.9,Iris virginica 149 | 6.5,3.0,5.2,2.0,Iris virginica 150 | 6.2,3.4,5.4,2.3,Iris virginica 151 | 5.9,3.0,5.1,1.8,Iris virginica 152 | -------------------------------------------------------------------------------- /examples/ml/clustering/iris.csv: -------------------------------------------------------------------------------- 1 | Sepal length,Sepal Width,Petal length,Petal width,Species 2 | 5.1,3.5,1.4,0.2,Iris setosa 3 | 4.9,3.0,1.4,0.2,Iris setosa 4 | 4.7,3.2,1.3,0.2,Iris setosa 5 | 4.6,3.1,1.5,0.2,Iris setosa 6 | 5.0,3.6,1.4,0.2,Iris setosa 7 | 5.4,3.9,1.7,0.4,Iris setosa 8 | 4.6,3.4,1.4,0.3,Iris setosa 9 | 5.0,3.4,1.5,0.2,Iris setosa 10 | 4.4,2.9,1.4,0.2,Iris setosa 11 | 4.9,3.1,1.5,0.1,Iris setosa 12 | 5.4,3.7,1.5,0.2,Iris setosa 13 | 4.8,3.4,1.6,0.2,Iris setosa 14 | 4.8,3.0,1.4,0.1,Iris setosa 15 | 4.3,3.0,1.1,0.1,Iris setosa 16 | 5.8,4.0,1.2,0.2,Iris setosa 17 | 5.7,4.4,1.5,0.4,Iris setosa 18 | 5.4,3.9,1.3,0.4,Iris setosa 19 | 5.1,3.5,1.4,0.3,Iris setosa 20 | 5.7,3.8,1.7,0.3,Iris setosa 21 | 5.1,3.8,1.5,0.3,Iris setosa 22 | 5.4,3.4,1.7,0.2,Iris setosa 23 | 5.1,3.7,1.5,0.4,Iris setosa 24 | 4.6,3.6,1.0,0.2,Iris setosa 25 | 5.1,3.3,1.7,0.5,Iris setosa 26 | 4.8,3.4,1.9,0.2,Iris setosa 27 | 5.0,3.0,1.6,0.2,Iris setosa 28 | 5.0,3.4,1.6,0.4,Iris setosa 29 | 5.2,3.5,1.5,0.2,Iris setosa 30 | 5.2,3.4,1.4,0.2,Iris setosa 31 | 4.7,3.2,1.6,0.2,Iris setosa 32 | 4.8,3.1,1.6,0.2,Iris setosa 33 | 5.4,3.4,1.5,0.4,Iris setosa 34 | 5.2,4.1,1.5,0.1,Iris setosa 35 | 5.5,4.2,1.4,0.2,Iris setosa 36 | 4.9,3.1,1.5,0.1,Iris setosa 37 | 5.0,3.2,1.2,0.2,Iris setosa 38 | 5.5,3.5,1.3,0.2,Iris setosa 39 | 4.9,3.1,1.5,0.1,Iris 
setosa 40 | 4.4,3.0,1.3,0.2,Iris setosa 41 | 5.1,3.4,1.5,0.2,Iris setosa 42 | 5.0,3.5,1.3,0.3,Iris setosa 43 | 4.5,2.3,1.3,0.3,Iris setosa 44 | 4.4,3.2,1.3,0.2,Iris setosa 45 | 5.0,3.5,1.6,0.6,Iris setosa 46 | 5.1,3.8,1.9,0.4,Iris setosa 47 | 4.8,3.0,1.4,0.3,Iris setosa 48 | 5.1,3.8,1.6,0.2,Iris setosa 49 | 4.6,3.2,1.4,0.2,Iris setosa 50 | 5.3,3.7,1.5,0.2,Iris setosa 51 | 5.0,3.3,1.4,0.2,Iris setosa 52 | 7.0,3.2,4.7,1.4,Iris versicolor 53 | 6.4,3.2,4.5,1.5,Iris versicolor 54 | 6.9,3.1,4.9,1.5,Iris versicolor 55 | 5.5,2.3,4.0,1.3,Iris versicolor 56 | 6.5,2.8,4.6,1.5,Iris versicolor 57 | 5.7,2.8,4.5,1.3,Iris versicolor 58 | 6.3,3.3,4.7,1.6,Iris versicolor 59 | 4.9,2.4,3.3,1.0,Iris versicolor 60 | 6.6,2.9,4.6,1.3,Iris versicolor 61 | 5.2,2.7,3.9,1.4,Iris versicolor 62 | 5.0,2.0,3.5,1.0,Iris versicolor 63 | 5.9,3.0,4.2,1.5,Iris versicolor 64 | 6.0,2.2,4.0,1.0,Iris versicolor 65 | 6.1,2.9,4.7,1.4,Iris versicolor 66 | 5.6,2.9,3.6,1.3,Iris versicolor 67 | 6.7,3.1,4.4,1.4,Iris versicolor 68 | 5.6,3.0,4.5,1.5,Iris versicolor 69 | 5.8,2.7,4.1,1.0,Iris versicolor 70 | 6.2,2.2,4.5,1.5,Iris versicolor 71 | 5.6,2.5,3.9,1.1,Iris versicolor 72 | 5.9,3.2,4.8,1.8,Iris versicolor 73 | 6.1,2.8,4.0,1.3,Iris versicolor 74 | 6.3,2.5,4.9,1.5,Iris versicolor 75 | 6.1,2.8,4.7,1.2,Iris versicolor 76 | 6.4,2.9,4.3,1.3,Iris versicolor 77 | 6.6,3.0,4.4,1.4,Iris versicolor 78 | 6.8,2.8,4.8,1.4,Iris versicolor 79 | 6.7,3.0,5.0,1.7,Iris versicolor 80 | 6.0,2.9,4.5,1.5,Iris versicolor 81 | 5.7,2.6,3.5,1.0,Iris versicolor 82 | 5.5,2.4,3.8,1.1,Iris versicolor 83 | 5.5,2.4,3.7,1.0,Iris versicolor 84 | 5.8,2.7,3.9,1.2,Iris versicolor 85 | 6.0,2.7,5.1,1.6,Iris versicolor 86 | 5.4,3.0,4.5,1.5,Iris versicolor 87 | 6.0,3.4,4.5,1.6,Iris versicolor 88 | 6.7,3.1,4.7,1.5,Iris versicolor 89 | 6.3,2.3,4.4,1.3,Iris versicolor 90 | 5.6,3.0,4.1,1.3,Iris versicolor 91 | 5.5,2.5,4.0,1.3,Iris versicolor 92 | 5.5,2.6,4.4,1.2,Iris versicolor 93 | 6.1,3.0,4.6,1.4,Iris versicolor 94 | 5.8,2.6,4.0,1.2,Iris versicolor 95 | 5.0,2.3,3.3,1.0,Iris versicolor 96 | 5.6,2.7,4.2,1.3,Iris versicolor 97 | 5.7,3.0,4.2,1.2,Iris versicolor 98 | 5.7,2.9,4.2,1.3,Iris versicolor 99 | 6.2,2.9,4.3,1.3,Iris versicolor 100 | 5.1,2.5,3.0,1.1,Iris versicolor 101 | 5.7,2.8,4.1,1.3,Iris versicolor 102 | 6.3,3.3,6.0,2.5,Iris virginica 103 | 5.8,2.7,5.1,1.9,Iris virginica 104 | 7.1,3.0,5.9,2.1,Iris virginica 105 | 6.3,2.9,5.6,1.8,Iris virginica 106 | 6.5,3.0,5.8,2.2,Iris virginica 107 | 7.6,3.0,6.6,2.1,Iris virginica 108 | 4.9,2.5,4.5,1.7,Iris virginica 109 | 7.3,2.9,6.3,1.8,Iris virginica 110 | 6.7,2.5,5.8,1.8,Iris virginica 111 | 7.2,3.6,6.1,2.5,Iris virginica 112 | 6.5,3.2,5.1,2.0,Iris virginica 113 | 6.4,2.7,5.3,1.9,Iris virginica 114 | 6.8,3.0,5.5,2.1,Iris virginica 115 | 5.7,2.5,5.0,2.0,Iris virginica 116 | 5.8,2.8,5.1,2.4,Iris virginica 117 | 6.4,3.2,5.3,2.3,Iris virginica 118 | 6.5,3.0,5.5,1.8,Iris virginica 119 | 7.7,3.8,6.7,2.2,Iris virginica 120 | 7.7,2.6,6.9,2.3,Iris virginica 121 | 6.0,2.2,5.0,1.5,Iris virginica 122 | 6.9,3.2,5.7,2.3,Iris virginica 123 | 5.6,2.8,4.9,2.0,Iris virginica 124 | 7.7,2.8,6.7,2.0,Iris virginica 125 | 6.3,2.7,4.9,1.8,Iris virginica 126 | 6.7,3.3,5.7,2.1,Iris virginica 127 | 7.2,3.2,6.0,1.8,Iris virginica 128 | 6.2,2.8,4.8,1.8,Iris virginica 129 | 6.1,3.0,4.9,1.8,Iris virginica 130 | 6.4,2.8,5.6,2.1,Iris virginica 131 | 7.2,3.0,5.8,1.6,Iris virginica 132 | 7.4,2.8,6.1,1.9,Iris virginica 133 | 7.9,3.8,6.4,2.0,Iris virginica 134 | 6.4,2.8,5.6,2.2,Iris virginica 135 | 6.3,2.8,5.1,1.5,Iris virginica 136 | 6.1,2.6,5.6,1.4,Iris virginica 137 
| 7.7,3.0,6.1,2.3,Iris virginica 138 | 6.3,3.4,5.6,2.4,Iris virginica 139 | 6.4,3.1,5.5,1.8,Iris virginica 140 | 6.0,3.0,4.8,1.8,Iris virginica 141 | 6.9,3.1,5.4,2.1,Iris virginica 142 | 6.7,3.1,5.6,2.4,Iris virginica 143 | 6.9,3.1,5.1,2.3,Iris virginica 144 | 5.8,2.7,5.1,1.9,Iris virginica 145 | 6.8,3.2,5.9,2.3,Iris virginica 146 | 6.7,3.3,5.7,2.5,Iris virginica 147 | 6.7,3.0,5.2,2.3,Iris virginica 148 | 6.3,2.5,5.0,1.9,Iris virginica 149 | 6.5,3.0,5.2,2.0,Iris virginica 150 | 6.2,3.4,5.4,2.3,Iris virginica 151 | 5.9,3.0,5.1,1.8,Iris virginica 152 | -------------------------------------------------------------------------------- /lib/task.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | const fs = require('fs'); 4 | const http = require('http'); 5 | 6 | const mkdirp = require('mkdirp'); 7 | 8 | const uuidPattern = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-4][0-9a-f]{3}-[0-9a-f]{4}-[0-9a-f]{12}$/i; 9 | 10 | /* global dlog */ 11 | 12 | module.exports = Task; 13 | 14 | // function Task(basedir, jobId, nodes, datasetId, pid, action) { 15 | function Task(init) { 16 | this.basedir = init.basedir; 17 | this.bundle = init.bundle; 18 | this.datasetId = init.datasetId; 19 | this.pid = init.pid; 20 | this.nodes = init.nodes; 21 | this.action = init.action; 22 | this.outputStreamOk = true; 23 | this.files = {}; // object storing shuffle file information to be sent back to the master 24 | // this.lib; // handle to libraries required on worker side (which cannot be serialized) 25 | // this.mm; // handle to the worker-side memory manager instance 26 | // this.grid; // handle to the socket object instance 27 | } 28 | 29 | Task.prototype.run = function(done) { 30 | const pipeline = []; 31 | const self = this; 32 | const mm = this.mm; 33 | const action = this.action; 34 | const p = this.pid; 35 | const blocksToRegister = []; 36 | let tmpPart = action ?
this.nodes[this.datasetId].partitions[p] : this.nodes[this.datasetId].shufflePartitions[p]; 37 | let tmpDataset = this.nodes[tmpPart.datasetId]; 38 | 39 | mkdirp.sync(this.basedir + 'export'); 40 | mkdirp.sync(this.basedir + 'import'); 41 | mkdirp.sync(this.basedir + 'shuffle'); 42 | 43 | // Propagate environment settings from master 44 | if (this.env) { 45 | //log('env:', this.env); 46 | for (let e in this.env) { 47 | if (this.env[e]) process.env[e] = this.env[e]; 48 | else delete process.env[e]; 49 | } 50 | } 51 | 52 | // Inject user dependencies 53 | for (let m in this.modules) { 54 | this.lib[m] = this.modules[m]; 55 | } 56 | 57 | if (action) { 58 | if (action.opt._foreach) { 59 | pipeline.push({transform: function foreach(data) { 60 | for (let i = 0; i < data.length; i++) action.src(data[i], action.opt, self); 61 | }}); 62 | } else { 63 | pipeline.push({transform: function aggregate(data) { 64 | for (let i = 0; i < data.length; i++) 65 | action.init = action.src(action.init, data[i], action.opt, self); 66 | }}); 67 | } 68 | } 69 | 70 | let tmpPartAvailable; 71 | for (;;) { 72 | tmpPartAvailable = mm.isAvailable(tmpPart); // is partition available in memory 73 | if (!tmpPartAvailable && tmpDataset.persistent) { // if data must be stored in memory 74 | if ((action !== undefined) || (tmpDataset.id !== this.datasetId)) { 75 | // no persist if no action and shuffleRDD 76 | blocksToRegister.push(tmpPart); // register block inside memory manager 77 | pipeline.unshift(tmpPart); // add it to pipeline 78 | tmpPart.mm = this.mm; 79 | } 80 | } 81 | if (tmpPartAvailable || (tmpPart.parentDatasetId === undefined)) break; // source partition found 82 | pipeline.unshift(tmpDataset); // else add current dataset transform to pipeline 83 | tmpPart = this.nodes[tmpPart.parentDatasetId].partitions[tmpPart.parentPartitionIndex]; 84 | tmpDataset = this.nodes[tmpPart.datasetId]; 85 | } 86 | 87 | // Pre-iterate actions 88 | if (action) { 89 | if (action.opt._preIterate) { 90 | action.opt._preIterate(action.opt, this, tmpPart.partitionIndex); 91 | } 92 | } 93 | 94 | // Iterate actions 95 | const start = Date.now(); 96 | if (tmpPartAvailable) mm.partitions[tmpPart.datasetId + '.' + tmpPart.partitionIndex].iterate(this, tmpPart.partitionIndex, pipeline, iterateDone); 97 | else this.nodes[tmpPart.datasetId].iterate(this, tmpPart.partitionIndex, pipeline, iterateDone); 98 | 99 | // Post-iterate actions 100 | function iterateDone() { 101 | dlog(start, 'iterate'); 102 | blocksToRegister.map(function(block) {mm.register(block);}); 103 | if (action) { 104 | if (action.opt._postIterate) { 105 | action.opt._postIterate(action.init, action.opt, self, tmpPart.partitionIndex, function () { 106 | done({data: {host: self.grid.host.uuid, path: self.exportFile}}); 107 | }); 108 | } else done({data: action.init}); 109 | } else { 110 | const start1 = Date.now(); 111 | self.nodes[self.datasetId].spillToDisk(self, function() { 112 | done({pid: self.pid, files: self.files}); 113 | dlog(start1, 'spillToDisk'); 114 | }); 115 | } 116 | } 117 | }; 118 | 119 | // Get a readable stream for shuffle or source file. 
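// fileObj is of the form {host, path}; opt carries fs.createReadStream-style
// options such as {start, end} to select a byte range within the file.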
120 | // First, attempt to read from the local filesystem. 121 | // Otherwise, if the file host is a node UUID (the master by default), use the skale transport through the skale server. 122 | // Otherwise, fetch the file from an HTTP server. 123 | Task.prototype.getReadStream = function (fileObj, opt, done) { 124 | if (fs.existsSync(fileObj.path)) return done(null, fs.createReadStream(fileObj.path, opt)); 125 | // Default host is master 126 | if (!fileObj.host) fileObj.host = this.grid.muuid; 127 | if (uuidPattern.test(fileObj.host)) 128 | return done(null, this.grid.createStreamFrom(fileObj.host, {cmd: 'sendFile', path: fileObj.path, opt: opt})); 129 | const url = 'http://' + fileObj.host + fileObj.path; 130 | http.get(url, function (res) { 131 | done(null, res); 132 | }); 133 | }; 134 | 135 | // Same as getReadStream above, but returns a stream synchronously. 136 | // This may be more expensive, as it requires an additional pass-through stream 137 | Task.prototype.getReadStreamSync = function (fileObj, opt) { 138 | const fs = this.lib.fs; 139 | if (fs.existsSync(fileObj.path)) return fs.createReadStream(fileObj.path, opt); 140 | if (!fileObj.host) fileObj.host = this.grid.muuid; 141 | return this.grid.createStreamFrom(fileObj.host, {cmd: 'sendFile', path: fileObj.path, opt: opt}); 142 | }; 143 | -------------------------------------------------------------------------------- /ml/sgd-linear-model.js: -------------------------------------------------------------------------------- 1 | // Regularized linear models trained by Stochastic Gradient Descent (SGD) 2 | // Authors: M. Vertes (current), C. Artigue (preliminary) 3 | // License: Apache License 2.0 4 | 5 | 'use strict'; 6 | 7 | const thenify = require('thenify'); 8 | 9 | module.exports = SGDLinearModel; 10 | 11 | function SGDLinearModel(options) { 12 | if (!(this instanceof SGDLinearModel)) 13 | return new SGDLinearModel(options); 14 | options = options || {}; 15 | this.weights = options.weights || []; 16 | this.stepSize = options.stepSize || 1; 17 | this.regParam = options.regParam || 0.001; 18 | this.fitIntercept = options.fitIntercept === undefined ? true : options.fitIntercept; 19 | this.proba = options.proba || false; 20 | this.intercept = 0; 21 | 22 | if (!options.penalty) this.regularize = regularizeL2; 23 | else if (options.penalty === 'l2') this.regularize = regularizeL2; 24 | else if (options.penalty === 'l1') this.regularize = regularizeL1; 25 | else if (options.penalty === 'none') this.regularize = regularizeNone; 26 | else throw new Error('Invalid penalty parameter: ' + options.penalty); 27 | 28 | if (!options.loss) this.loss = hingeLoss; 29 | else if (options.loss === 'hinge') this.loss = hingeLoss; 30 | else if (options.loss === 'log') this.loss = logisticLoss; 31 | else if (options.loss === 'square') this.loss = squaredLoss; 32 | else throw new Error('Invalid loss parameter: ' + options.loss); 33 | 34 | // For now prediction returns a soft output. TODO: include threshold and hard output 35 | this.predict = function (point) { 36 | let margin = this.intercept; 37 | for (let i = 0; i < this.weights.length; i++) 38 | margin += (this.weights[i] || 0) * (point[i] || 0); 39 | if (this.proba) 40 | return 1 / (1 + Math.exp(-margin)); 41 | return margin; 42 | }; 43 | } 44 | 45 | // A training iteration for a stochastic gradient descent classifier consists of: 46 | // - computing the loss (price of inaccuracy) for each label/features pair of the training set 47 | // - finalizing the gradient (sum and average loss per feature) 48 | // - regularizing the loss weights using a penalty function of the gradient 49 | 50 | SGDLinearModel.prototype.fit = thenify(function
(trainingSet, nIterations, callback) { 51 | const self = this; 52 | let iter = 0; 53 | 54 | if (this.fitIntercept) 55 | trainingSet = trainingSet.map(a => {a[1].unshift(1); return a;}); 56 | 57 | iterate(); 58 | 59 | function iterate() { 60 | trainingSet 61 | .map(self.loss, self.weights) 62 | .aggregate( 63 | // Compute total loss per feature and number of samples 64 | (a, b) => { 65 | for (let i = 0; i < b.length; i++) 66 | a[0][i] = (a[0][i] || 0) + (b[i] || 0); 67 | a[1]++; 68 | return a; 69 | }, 70 | (a, b) => { 71 | for (let i = 0; i < b[0].length; i++) 72 | a[0][i] = (a[0][i] || 0) + (b[0][i] || 0); 73 | a[1] += b[1]; 74 | return a; 75 | }, 76 | [[], 0], 77 | function (err, result) { 78 | const iterStepSize = self.stepSize / Math.sqrt(iter + 1); 79 | self.regularize(self.weights, result, iterStepSize, self.regParam); 80 | if (++iter === nIterations) { 81 | if (self.fitIntercept) 82 | self.intercept = self.weights.shift(); 83 | callback(); 84 | } else iterate(); 85 | } 86 | ); 87 | } 88 | }); 89 | 90 | // None, a.k.a. ordinary least squares 91 | function regularizeNone(weights, gradientCount) { 92 | const [gradient, count] = gradientCount; 93 | 94 | for (let i = 0; i < gradient.length; i++) { 95 | let grad = (gradient[i] || 0) / count; 96 | weights[i] = (weights[i] || 0) - grad; 97 | } 98 | } 99 | 100 | // L1, a.k.a. Lasso 101 | function regularizeL1(weights, gradientCount, stepSize, regParam) { 102 | const [gradient, count] = gradientCount; 103 | 104 | for (let i = 0; i < gradient.length; i++) { 105 | let grad = (gradient[i] || 0) / count; 106 | weights[i] = weights[i] || 0; 107 | weights[i] -= stepSize * (grad + regParam * (weights[i] > 0 ? 1 : -1)); 108 | } 109 | } 110 | 111 | // L2, a.k.a. ridge regression 112 | function regularizeL2(weights, gradientCount, stepSize, regParam) { 113 | const [gradient, count] = gradientCount; 114 | 115 | for (let i = 0; i < gradient.length; i++) { 116 | let grad = (gradient[i] || 0) / count; 117 | weights[i] = weights[i] || 0; 118 | weights[i] -= stepSize * (grad + regParam * weights[i]); 119 | } 120 | } 121 | 122 | // TODO #1: elastic-net regularizer: combine L1 and L2 with an 123 | // alpha parameter in range [0, 1] where 1 => L1, 0 => L2, 124 | // in between: (alpha * L1) + ((1-alpha) * L2) 125 | // Maybe merge the L1 and L2 functions 126 | 127 | // TODO #2: for each regularizer: set weight to 0 if regularization 128 | // crosses 0 (sign change), to achieve feature selection (sparse models) 129 | 130 | function hingeLoss(p, weights) { 131 | const [label, features] = p; 132 | const grad = []; 133 | let dotProd = 0; 134 | 135 | for (let i = 0; i < features.length; i++) 136 | dotProd += (features[i] || 0) * (weights[i] || 0); 137 | 138 | if (label * dotProd < 1) 139 | for (let i = 0; i < features.length; i++) 140 | grad[i] = -label * (features[i] || 0); 141 | else 142 | for (let i = 0; i < features.length; i++) 143 | grad[i] = 0; 144 | 145 | return grad; 146 | } 147 | 148 | // valid for labels in [-1, 1] 149 | function logisticLoss(p, weights) { 150 | const [label, features] = p; 151 | const grad = []; 152 | let dotProd = 0; 153 | 154 | for (let i = 0; i < features.length; i++) 155 | dotProd += (features[i] || 0) * (weights[i] || 0); 156 | 157 | const tmp = 1 / (1 + Math.exp(-dotProd)) - label; 158 | 159 | for (let i = 0; i < features.length; i++) 160 | grad[i] = (features[i] || 0) * tmp; 161 | 162 | return grad; 163 | } 164 | 165 | function squaredLoss(p, weights) { 166 | const [label, features] = p; 167 | const grad = []; 168 | let dotProd = 0; 169 | 170 | for (let i = 0; i
< features.length; i++) 171 | dotProd += (features[i] || 0) * (weights[i] || 0); 172 | 173 | for (let i = 0; i < features.length; i++) 174 | grad[i] = (dotProd - label) * (features[i] || 0); 175 | 176 | return grad; 177 | } 178 | -------------------------------------------------------------------------------- /lib/readsplit.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | const fs = require('fs'); 4 | //var Lines = require('../lib/lines.js'); 5 | 6 | /** 7 | * Compute an N-way logical split of a given file 8 | */ 9 | function splitLocalFile(file, N, callback) { 10 | const split = []; 11 | const size = fs.statSync(file).size; 12 | const maxBlockSize = Math.ceil(size / (N || 1)); 13 | let start = 0; 14 | 15 | // var str = fs.readFileSync(file, {encoding: 'utf8'}).replace(/\n/g, '*'); 16 | // console.log(str) 17 | while (start < size) { 18 | // console.log('Split n° %d = %s', split.length, str.substr(start, maxBlockSize + 1)) 19 | split.push({index: split.length, chunk: [{path: file, opt: {start: start, end: start + maxBlockSize}}]}); 20 | start += maxBlockSize + 1; 21 | } 22 | 23 | callback(split); 24 | } 25 | 26 | function splitDistributedFile(file, N, callback) { // emulates a distributed file for now 27 | callback([ 28 | { 29 | index: 0, chunk: [ 30 | {path: './test.dat', opt: {start: 0, end: 9}}, 31 | {path: './test.dat', opt: {start: 10, end: 19}} 32 | ] 33 | }, { 34 | index: 1, 35 | chunk: [ 36 | {path: './test.dat', opt: {start: 20, end: 29}}, 37 | {path: './test.dat', opt: {start: 30}} 38 | ] 39 | } 40 | ]); 41 | } 42 | 43 | function getFirstLine(split, chunk_buffer, s, getStream, done) { 44 | // console.log('Split n° ' + (s - 1) + ' seeks end of line starting with : ' + chunk_buffer.replace(/\n/g, '*')) 45 | const isLastSplit = (split[s].index === (split.length - 1)); 46 | let p = 0; 47 | let firstLineFound = false; 48 | let firstLine; 49 | 50 | function readPart(part, partDone) { 51 | const isLastPart = (p === split[s].chunk.length - 1); 52 | //const rs = fs.createReadStream(part.path, part.opt); 53 | const rs = getStream(part, part.opt); 54 | 55 | function processChunk(chunk) { 56 | const lines = (chunk_buffer + chunk).split(/\r\n|\r|\n/); 57 | chunk_buffer = lines.pop(); 58 | if (lines.length > 0) { 59 | firstLine = lines[0]; 60 | firstLineFound = true; 61 | //rs.destroy(); 62 | } else rs.once('data', processChunk); 63 | } 64 | 65 | rs.once('data', processChunk); 66 | 67 | rs.on('end', function () { 68 | if (firstLineFound) done(firstLine); 69 | else if (!isLastPart) partDone(); 70 | else if (isLastSplit) done(chunk_buffer); 71 | else { 72 | getFirstLine(split, chunk_buffer, s + 1, getStream, done); 73 | } 74 | }); 75 | } 76 | 77 | function end() { 78 | if (++p < split[s].chunk.length) readPart(split[s].chunk[p], end); 79 | } 80 | 81 | readPart(split[s].chunk[p], end); 82 | } 83 | 84 | function readSplit(split, s, processLine, splitDone, getStream) { 85 | if (split.length === 0) return splitDone(); 86 | const isFirstSplit = (split[s].index === 0); 87 | const isLastSplit = (split[s].index === (split.length - 1)); 88 | let chunk_buffer = ''; 89 | let p = 0; 90 | let hasToSkipFirstLine = isFirstSplit ? false : undefined; 91 | let firstLineFound = isFirstSplit ? true : false; 92 | 93 | function readPart(part, partDone) { 94 | const isFirstPart = (p === 0); 95 | const isLastPart = (p === split[s].chunk.length - 1); 96 | const opt = (!isFirstSplit && isFirstPart) ?
{start: part.opt.start - 1, end: part.opt.end} : part.opt; 97 | //const rs = fs.createReadStream(part.path, opt); 98 | const rs = getStream(part, opt); 99 | let chunkLastChar = ''; 100 | 101 | function processChunkOnce(chunk) { 102 | // console.log('Split n° %d found chunk = %s', s, String(chunk).replace(/\n/g, '*')) 103 | if (hasToSkipFirstLine === undefined) { 104 | chunk = String(chunk); 105 | hasToSkipFirstLine = (chunk.charAt(0) !== '\n'); 106 | // console.log('Has to skip first line = ' + hasToSkipFirstLine) 107 | chunk = chunk.substr(1); 108 | // console.log('Chunk after first byte test = ' + chunk) 109 | if (!hasToSkipFirstLine) firstLineFound = true; 110 | } 111 | const str = (chunk_buffer + chunk); 112 | chunkLastChar = str.charAt(str.length - 1); 113 | const lines = str.split(/\r\n|\r|\n/); 114 | chunk_buffer = lines.pop(); 115 | if (lines.length) { 116 | firstLineFound = true; 117 | const start = hasToSkipFirstLine ? 1 : 0; 118 | for (let i = start; i < lines.length; i++) processLine(lines[i]); 119 | if (lines.length === 1) chunkLastChar = ''; 120 | rs.on('data', processChunk); 121 | // console.log('Found first line') 122 | } else rs.once('data', processChunkOnce); 123 | } 124 | 125 | const processChunk = function(chunk) { 126 | const str = chunk_buffer + chunk; 127 | chunkLastChar = str.charAt(str.length - 1); 128 | const lines = str.split(/\r\n|\r|\n/); 129 | chunk_buffer = lines.pop(); 130 | for (let i = 0; i < lines.length; ++i) processLine(lines[i]); 131 | }; 132 | 133 | rs.on('end', function () { 134 | // console.log(chunk_buffer) 135 | if (!isLastPart) return partDone(); 136 | if (isLastSplit) { 137 | if (!firstLineFound) { 138 | firstLineFound = true; 139 | if (!hasToSkipFirstLine) processLine(chunk_buffer); 140 | } else processLine(chunk_buffer); 141 | splitDone(); 142 | } else { 143 | if (!firstLineFound) { 144 | if (chunkLastChar === '\n') { 145 | firstLineFound = true; 146 | if (!hasToSkipFirstLine) processLine(chunk_buffer); 147 | } 148 | splitDone(); 149 | } else { 150 | if (chunkLastChar === '\n') { 151 | processLine(chunk_buffer); 152 | splitDone(); 153 | } else { 154 | if (chunk_buffer === '') { 155 | splitDone(); 156 | } else { 157 | getFirstLine(split, chunk_buffer, s + 1, getStream, function(firstline) { 158 | processLine(firstline); 159 | splitDone(); 160 | }); 161 | } 162 | } 163 | } 164 | } 165 | }); 166 | 167 | rs.once('data', processChunkOnce); 168 | } 169 | 170 | function end() { 171 | if (++p < split[s].chunk.length) 172 | readPart(split[s].chunk[p], end); 173 | } 174 | 175 | readPart(split[s].chunk[p], end); 176 | } 177 | 178 | module.exports.splitLocalFile = splitLocalFile; 179 | module.exports.splitDistributedFile = splitDistributedFile; 180 | module.exports.readSplit = readSplit; 181 | -------------------------------------------------------------------------------- /lib/worker-local.js: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 2 | 3 | // worker module 4 | 5 | 'use strict'; 6 | 7 | const fs = require('fs'); 8 | const zlib = require('zlib'); 9 | const url = require('url'); 10 | const stream = require('stream'); 11 | 12 | const mkdirp = require('mkdirp'); 13 | const uuid = require('uuid'); 14 | const S3 = require('aws-sdk/clients/s3'); 15 | const azure = require('azure-storage'); 16 | const parquet = require('./stub-parquet.js'); 17 | 18 | const Dataset = require('./dataset.js'); 19 | const Task = require('./task.js'); 20 | const sizeOf = 
require('./rough-sizeof.js'); 21 | const Lines = require('./lines.js'); 22 | const readSplit = require('./readsplit.js').readSplit; 23 | 24 | const workerId = process.argv[2]; 25 | let memory = process.argv[3]; 26 | 27 | const mm = new MemoryManager(memory); 28 | 29 | const start = Date.now(); 30 | 31 | if (process.env.SKALE_RANDOM_SEED) 32 | Dataset.setRandomSeed(process.env.SKALE_RANDOM_SEED); 33 | 34 | process.title = 'skale-worker-' + workerId; 35 | 36 | process.on('disconnect', function () { 37 | log('disconnected, exit'); 38 | process.exit(); 39 | }); 40 | 41 | process.on('message', function (msg) { 42 | if (typeof msg === 'object' && msg.req) { 43 | switch (msg.req.cmd) { 44 | case 'runTask': 45 | runTask(msg); 46 | break; 47 | case 'runztask': 48 | runztask(msg); 49 | break; 50 | } 51 | } 52 | }); 53 | 54 | function runztask(msg) { 55 | const file = msg.req.args; 56 | fs.readFile(file, function (err, data) { 57 | fs.unlink(file, function () {}); 58 | if (err) throw new Error(err); 59 | zlib.gunzip(data, {chunkSize: 65536}, function (err, data) { 60 | if (err) throw new Error(err); 61 | msg.req.args = data; 62 | runTask(msg); 63 | }); 64 | }); 65 | } 66 | 67 | function runTask(msg) { 68 | const task = parseTask(msg.req.args); 69 | task.workerId = workerId; 70 | task.grid = {host: {}}; 71 | task.mm = mm; 72 | // Expose core system dependencies explicitly for user-evaluated code in workers. 73 | // Those dependencies do not need to be serialized 74 | global.azure = azure; 75 | global.S3 = S3; 76 | global.log = log; 77 | global.dlog = dlog; 78 | global.Lines = Lines; 79 | global.mkdirp = mkdirp; 80 | global.mm = mm; 81 | global.parquet = parquet; 82 | global.readSplit = readSplit; 83 | global.uuid = uuid; 84 | 85 | global.fs = fs; 86 | global.stream = stream; 87 | global.url = url; 88 | global.zlib = zlib; 89 | 90 | // Indirect eval to set the user dependencies bundle in the worker global context 91 | (0, eval)(task.bundle); 92 | task.run(function (result) { 93 | delete msg.req.args; 94 | msg.result = result; 95 | msg.result.workerId = workerId; 96 | process.send(msg); 97 | if (global.gc) { 98 | setImmediate(function () { 99 | const gcs = Date.now(); 100 | global.gc(); 101 | dlog(gcs, 'gc'); 102 | }); 103 | } 104 | else log('no global.gc'); 105 | }); 106 | } 107 | 108 | function parseTask(str) { 109 | //var i, j, n, ref; 110 | const task = JSON.parse(str, function (key, value) { 111 | if (typeof value === 'string') { 112 | // String value can be a regular function or an ES6 arrow function 113 | if (value.substring(0, 8) == 'function') { 114 | const args = value.match(/\(([^)]*)/)[1]; 115 | const body = value.replace(/^function\s*[^)]*\)\s*{/, '').replace(/}$/, ''); 116 | value = new Function(args, body); 117 | } else if (value.match(/^\s*\(\s*[^(][^)]*\)\s*=>/) || value.match(/^\s*\w+\s*=>/)) 118 | value = ('indirect', eval)(value); 119 | } 120 | return value; 121 | }); 122 | 123 | for (let i in task.nodes) { 124 | const n = task.nodes[i]; 125 | for (let j in n.dependencies) { 126 | const ref = n.dependencies[j]; 127 | n.dependencies[j] = task.nodes[ref]; 128 | } 129 | for (let j in n.partitions) { 130 | Object.setPrototypeOf(task.nodes[i].partitions[j], Dataset.Partition.prototype); 131 | task.nodes[i].partitions[j].count = 0; 132 | task.nodes[i].partitions[j].bsize = 0; 133 | task.nodes[i].partitions[j].tsize = 0; 134 | task.nodes[i].partitions[j].skip = false; 135 | } 136 | if (n.type) { 137 | Object.setPrototypeOf(task.nodes[i], Dataset[n.type].prototype); 138 | } 139 | if
(n.partitioner && n.partitioner.type) { 140 | Object.setPrototypeOf(n.partitioner, Dataset[n.partitioner.type].prototype); 141 | } 142 | } 143 | Object.setPrototypeOf(task, Task.prototype); 144 | //log('task:', JSON.stringify(task, null, 2)); 145 | return task; 146 | } 147 | 148 | function MemoryManager(memory = 1024) { 149 | const Mb = 1024 * 1024; 150 | const MAX_MEMORY = (memory - 100) * Mb; 151 | const maxStorageMemory = MAX_MEMORY * 0.4; 152 | const maxShuffleMemory = MAX_MEMORY * 0.2; 153 | const maxCollectMemory = MAX_MEMORY * 0.2; 154 | 155 | this.storageMemory = 0; 156 | this.shuffleMemory = 0; 157 | this.collectMemory = 0; 158 | this.sizeOf = sizeOf; 159 | 160 | this.storageFull = function () {return (this.storageMemory > maxStorageMemory);}; 161 | this.shuffleFull = function () {return (this.shuffleMemory > maxShuffleMemory);}; 162 | this.collectFull = function () {return (this.collectMemory > maxCollectMemory);}; 163 | 164 | this.partitions = {}; 165 | this.register = function (partition) { 166 | const key = partition.datasetId + '.' + partition.partitionIndex; 167 | if (!(key in this.partitions)) this.partitions[key] = partition; 168 | }; 169 | 170 | this.unregister = function (partition) { 171 | this.partitions[partition.datasetId + '.' + partition.partitionIndex] = undefined; 172 | }; 173 | 174 | this.isAvailable = function (partition) { 175 | return (this.partitions[partition.datasetId + '.' + partition.partitionIndex] !== undefined); 176 | }; 177 | } 178 | 179 | let log; 180 | let dlog; 181 | if (process.env.SKALE_DEBUG > 1) { 182 | log = function log() { 183 | const args = Array.prototype.slice.call(arguments); 184 | args.unshift('[worker-' + process.argv[2] + ' ' + (Date.now() - start) / 1000 + 's]'); 185 | console.error.apply(null, args); 186 | }; 187 | dlog = function dlog() { 188 | const args = Array.prototype.slice.call(arguments); 189 | const now = Date.now(); 190 | const lstart = args.shift(); 191 | args.unshift('[worker-' + process.argv[2] + ' ' + (now - start) / 1000 + 's]'); 192 | args.push('in ' + (now - lstart) / 1000 + 's'); 193 | console.error.apply(null, args); 194 | }; 195 | } else { 196 | dlog = log = function noop() {}; 197 | } 198 | -------------------------------------------------------------------------------- /examples/ml/binary-classification/adult.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | 'use strict'; 4 | 5 | // Wrap code in an async function (ES2017), to limit callback nesting 6 | (async function main() { 7 | 8 | // Adult dataset processing as per http://scg.sdsu.edu/dataset-adult_r/ 9 | // In this example we: 10 | // - train a logistic regression on the adult training set.
11 | // - evaluate the model on the adult test set 12 | // - generate ROC curves as png images 13 | 14 | const sc = require('skale').context(); 15 | const ml = require('skale/ml'); 16 | const plot = require('plotter').plot; // Todo: should be replaced by D3 17 | 18 | // Todo: features should be automatically extracted from dataset + type schema 19 | const metadata = { 20 | workclass: [ 21 | 'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 22 | 'State-gov', 'Without-pay', 'Never-worked' 23 | ], 24 | education: [ 25 | 'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 26 | 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate', 27 | '5th-6th', 'Preschool' 28 | ], 29 | maritalstatus: [ 30 | 'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 31 | 'Married-spouse-absent', 'Married-AF-spouse' 32 | ], 33 | occupation: [ 34 | 'Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 35 | 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 36 | 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 37 | 'Armed-Forces' 38 | ], 39 | relationship: [ 40 | 'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried' 41 | ], 42 | race: [ 43 | 'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black' 44 | ], 45 | sex: [ 46 | 'Female', 'Male' 47 | ], 48 | nativecountry: [ 49 | 'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 50 | 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 51 | 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 52 | 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 53 | 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 54 | 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 55 | 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands' 56 | ] 57 | }; 58 | 59 | function featurize(data, metadata) { 60 | const label = (data[14] === '>50K' || data[14] === '>50K.') ? 
1 : -1; 61 | const features = [ 62 | Number(data[0]), // 1 age 63 | metadata.workclass.indexOf(data[1]), // 2 workclass 64 | Number(data[2]), // 3 fnlwgt 65 | // metadata.education.indexOf(data[3]), // education (redundant with education-num) 66 | Number(data[4]), // 4 education-num 67 | metadata.maritalstatus.indexOf(data[5]), // 5 marital-status 68 | metadata.occupation.indexOf(data[6]), // 6 occupation 69 | metadata.relationship.indexOf(data[7]), // 7 relationship 70 | metadata.race.indexOf(data[8]), // 8 race 71 | metadata.sex.indexOf(data[9]), // 9 sex 72 | Number(data[10]), // 10 capital-gain 73 | Number(data[11]), // 11 capital-loss 74 | Number(data[12]), // 12 hours-per-week 75 | metadata.nativecountry.indexOf(data[13]) // 13 native-country 76 | ]; 77 | return [label, features]; 78 | } 79 | 80 | const trainingSet = sc.textFile(__dirname + '/dataset/adult-0*.csv') 81 | .filter(a => a[0] !== 'a') // filter out header 82 | .map(line => line.split(',').map(str => str.trim())) // split csv lines 83 | .filter(data => data.length === 15 && data.indexOf('?') === -1) // remove incomplete data 84 | .map(featurize, metadata) // transform string data to number 85 | .persist(); 86 | 87 | const testSet = sc.textFile(__dirname + '/dataset/adult-1*.csv') 88 | .filter(a => a[0] !== 'a') // filter out header 89 | .map(line => line.split(',').map(str => str.trim())) // split csv lines 90 | .filter(data => data.length === 15 && data.indexOf('?') === -1) // remove incomplete data 91 | .map(featurize, metadata); // transform string data to number 92 | 93 | // Standardize features to zero mean and unit variance 94 | const scaler = new ml.StandardScaler(); 95 | 96 | await scaler.fit(trainingSet.map(point => point[1])); 97 | 98 | // Use scaler to standardize training and test datasets 99 | const trainingSetStd = trainingSet 100 | .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler); 101 | 102 | const testSetStd = testSet 103 | .map((p, scaler) => [p[0], scaler.transform(p[1])], scaler); 104 | 105 | // Train logistic regression with SGD on standardized training set 106 | const nIterations = 10; 107 | const parameters = {loss: 'log', penalty: 'l2', regParam: 0.001, stepSize: 1, proba: true}; 108 | const model = new ml.SGDLinearModel(parameters); 109 | 110 | await model.fit(trainingSetStd, nIterations); 111 | 112 | const predictionAndLabels = testSetStd.map((p, model) => [model.predict(p[1]), p[0]], model); 113 | const metrics = await ml.classificationMetrics(predictionAndLabels, {steps: 100}); 114 | 115 | console.log('model weights:', model.weights); 116 | console.log('intercept:', model.intercept); 117 | console.log('PR AUC:', metrics.auPR); 118 | console.log('ROC AUC:', metrics.auROC); 119 | console.log('ROC curve: roc.png'); 120 | console.log('Best threshold (F1 max):', metrics.threshold); 121 | sc.end(); 122 | 123 | // Plot ROC curve 124 | const xy = {}; 125 | for (let i = 0; i < metrics.rates.length; i++) 126 | xy[metrics.rates[i].fpr || '0.00000000000'] = metrics.rates[i].recall; 127 | const data = {}; 128 | data['regParam: ' + parameters.regParam + ', stepSize: ' + parameters.stepSize] = xy; 129 | data['Random'] = {0: 0, 1: 1}; 130 | plot({ 131 | title: 'Logistic Regression ROC Curve', 132 | data: data, 133 | filename: 'roc.png', 134 | }); 135 | 136 | // Plot PR curve 137 | const xy0 = {}; 138 | for (let i = 0; i < metrics.rates.length; i++) 139 | xy0[metrics.rates[i].recall || '0.00000000000'] = metrics.rates[i].precision; 140 | const data0 = {}; 141 | data0['regParam: ' + parameters.regParam 
+ ', stepSize: ' + parameters.stepSize] = xy0; 142 | plot({ 143 | title: 'Logistic Regression PR Curve', 144 | data: data0, 145 | filename: 'pr.png', 146 | }); 147 | 148 | })(); // main 149 | -------------------------------------------------------------------------------- /docs/machine-learning.md: -------------------------------------------------------------------------------- 1 | # Machine Learning module 2 | 3 | The Machine Learning (ML) module provides scalable functions for 4 | supervised (classification, regression) and unsupervised (clustering) 5 | statistical learning on top of skale datasets and the distributed 6 | map-reduce engine. 7 | 8 | The module can be loaded using: 9 | ```js 10 | var ml = require('skale/ml') 11 | ``` 12 | 13 | ## classificationMetrics(measures[, options][, done]) 14 | 15 | This [action] computes various metrics to measure classification performance. 16 | 17 | - *measures*: a dataset where entries are in the form 18 | `[prediction, label]` with *prediction* and *label* being numbers of which 19 | only the sign is used: positive means true, negative means false. 20 | - *options*: an optional *Object* with the following fields: 21 | - *steps*: integer *Number* defining the number of points in the Receiver 22 | Operating Characteristic (ROC) curve. Default: 10 23 | - *done*: an optional callback of the form `function(error, result)` 24 | which is called at completion. *result* is an object with the following fields: 25 | - *rates*: an array of *steps* confusion matrices with raw counts and derived rates 26 | - *auROC*: area under the ROC curve, using the trapezoidal rule 27 | - *auPR*: area under the Precision Recall curve, using the trapezoidal rule 28 | - *threshold*: the decision threshold which maximizes the F1 measure 29 | Example: 30 | ```js 31 | var model = new ml.SGDLinearModel(); 32 | await model.fit(trainingSet, 10); 33 | var predictionAndLabels = testSet.map((p, model) => [model.predict(p[1]), p[0]], model); 34 | var metrics = await ml.classificationMetrics(predictionAndLabels); 35 | console.log('ROC AUC:', metrics.auROC); 36 | // 0.869 37 | ``` 38 | 39 | ## KMeans(nbClusters[, options]) 40 | 41 | Creates a clustering model fitted via the [K-Means] algorithm. 42 | 43 | - *nbClusters*: *Number*, specifying the number of clusters in the model 44 | - *options*: an optional *Object* with the following fields: 45 | - *maxMse*: *Number* defining the maximum mean square error between cluster 46 | centers since the previous iteration. Used to stop iterations. Defaults to 1e-7. 47 | - *maxIterations*: *Number* defining the maximum number of iterations. Default: 100. 48 | - *means*: an initial array of vectors (arrays) of numbers, default undefined. 49 | 50 | Example: 51 | ```js 52 | const dataset = sc.parallelize([ 53 | [1, 2], [1, 4], [1, 0], 54 | [4, 2], [4, 4], [4, 0] 55 | ]); 56 | const kmeans = ml.KMeans(2); 57 | await kmeans.fit(dataset); 58 | kmeans.means 59 | // [ [ 2.5, 1 ], [ 2.5, 4 ] ] 60 | kmeans.predict([0, 0]) 61 | // 0 62 | kmeans.predict([4, 4]) 63 | // 1 64 | ``` 65 | 66 | ### kmeans.fit(trainingSet[, done]) 67 | 68 | This [action] updates the *kmeans* model by fitting it to the input 69 | dataset *trainingSet*. The *done()* callback is called at completion 70 | if provided, otherwise an [ES6 promise] is returned. 71 | 72 | - *trainingSet*: a dataset where entries are in the following format: 73 | `[feature0, feature1, ...]` with *featureN* being a float number. 74 | - *done*: an optional callback of the form `function(error)` 75 | which is called at completion. 76 | 77 | ### kmeans.predict(sample) 78 | 79 | Returns the closest cluster index for the *sample*.
80 | 81 | - *sample*: an *Array* with the format `[feature0, feature1, ...]` 82 | with *featureN* being a floating point number. 83 | 84 | ## SGDLinearModel([options]) 85 | 86 | Creates a regularized linear model fitted via [stochastic 87 | gradient descent] learning. Such a model can be used either for 88 | regression or classification, as the training method is identical; 89 | only the prediction step differs. SGD is sensitive to the scaling 90 | of the features. For best results, the data should have zero mean and 91 | unit variance, which can be achieved with [ml.StandardScaler]. 92 | 93 | The model it fits can be controlled with the *loss* option; by default, 94 | it fits a linear [support vector machine] (SVM). A regularization term 95 | can be added to the loss, by default the squared Euclidean norm L2. 96 | 97 | - *options*: an *Object* with the following fields: 98 | - *fitIntercept*: *Boolean* indicating whether to include an intercept. Default: *true* 99 | - *loss*: *String* specifying the [loss function] to be used. Possible values are: 100 | - `hinge`: (default), gives a linear SVM 101 | - `log`: gives logistic loss, a probabilistic classifier 102 | - `square`: gives a square loss fit 103 | - *penalty*: *String* specifying the [regularization] term. Possible values are: 104 | - `l2`: (default) squared Euclidean norm L2, standard regularizer for linear SVM models 105 | - `l1`: absolute norm L1, might bring sparsity to the model, not achievable with `l2` 106 | - `none`: zero penalty 107 | - *proba*: *Boolean* (default *false*). If *true*, predict returns a probability rather than a raw number. Only applicable when logistic loss is selected. 108 | - *regParam*: *Number* >= 0, defaults to 0.001, defines the trade-off between the 109 | two goals of minimizing the loss (i.e. training error) and minimizing model complexity 110 | (i.e. to avoid overfitting) 111 | - *stepSize*: *Number* >= 0, defaults to 1, defines the initial step size of the gradient 112 | descent 113 | 114 | Example: 115 | ```js 116 | const trainingSet = sc.parallelize([ 117 | [1, [0.5, -0.7]], 118 | [-1, [-0.5, 0.7]] 119 | ]); 120 | const sgd = new ml.SGDLinearModel() 121 | await sgd.fit(trainingSet, 2) 122 | sgd.weights 123 | // [ 0.8531998372026804, -1.1944797720837526 ] 124 | sgd.predict([2, -2]) 125 | // 0.9836229103782058 126 | ``` 127 | 128 | ### sgd.fit(trainingSet, iterations[, done]) 129 | 130 | This [action] updates the *sgd* model by fitting it to the 131 | input dataset *trainingSet*. The *done()* callback is called at 132 | completion if provided, otherwise an [ES6 promise] is returned. 133 | 134 | - *trainingSet*: a dataset where entries are in the following format: 135 | `[label, [feature0, feature1, ...]]` with *label* being either 1 or -1, 136 | and *featureN* being a floating point number, preferably with zero mean and 137 | unit variance (in range [-1, 1]). Sparse vectors with undefined features 138 | are supported. 139 | - *done*: an optional callback of the form `function(error)` 140 | which is called at completion. 141 | 142 | ### sgd.predict(sample) 143 | 144 | Predicts a label for a given *sample*, returning a numerical value 145 | which can be converted to a label: -1 if negative, or 1 if positive. 146 | 147 | If the selected loss is `log`, the returned value can be interpreted as 148 | the probability of the corresponding label. 149 | 150 | ## StandardScaler() 151 | 152 | Creates a standard scaler which standardizes features by removing 153 | the mean and scaling to unit variance.
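Concretely, given the per-feature *mean* and *std* computed by `fit()`, `transform()` maps each feature value to `(value - mean) / std`. A minimal sketch of that scaling step (illustrative only, not the actual `ml/standard-scaler.js` code):

```js
// Illustrative sketch: standardize one sample, given the mean and std
// arrays computed by fit(). E.g. with mean [0.5, 0.5] and std [0.5, 0.5],
// transform([2, 2]) yields [3, 3], as in the example below.
function transform(sample, mean, std) {
  return sample.map((x, i) => (x - mean[i]) / std[i]);
}
```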
154 | 155 | Centering and scaling happen independently on each feature by 156 | computing the relevant statistics on the samples in the training 157 | set. 158 | 159 | Standardization of datasets is a common requirement for many machine 160 | learning estimators. Estimators might behave badly if the individual 161 | features do not look more or less like standard normally distributed 162 | data: Gaussian with zero mean and unit variance. 163 | 164 | Example: 165 | ```js 166 | var data = sc.parallelize([[0, 0], [0, 0], [1, 1], [1, 1]]); 167 | var scaler = new ml.StandardScaler(); 168 | await scaler.fit(data); 169 | scaler 170 | //StandardScaler { 171 | // transform: [Function], 172 | // count: 4, 173 | // mean: [ 0.5, 0.5 ], 174 | // std: [ 0.5, 0.5 ] } 175 | var scaled = data.map((p, scaler) => scaler.transform(p), scaler) 176 | console.log(await scaled.collect()); 177 | // [ [ -1, -1 ], [ -1, -1 ], [ 1, 1 ], [ 1, 1 ] ] 178 | scaler.transform([2, 2]) 179 | // [ 3, 3 ] 180 | ``` 181 | 182 | ### scaler.fit(trainingSet[, done]) 183 | 184 | This [action] updates *scaler* by computing the mean and std of 185 | *trainingSet* to be used for later scaling. The *done()* callback 186 | is called at completion if provided, otherwise an [ES6 promise] is 187 | returned. 188 | 189 | - *trainingSet*: a dataset where entries are in the format 190 | `[feature0, feature1, ...]` with *featureN* being a *Number* 191 | - *done*: an optional callback of the form `function (error)` which 192 | is called at completion. 193 | 194 | ### scaler.transform(sample) 195 | 196 | Returns the standardized value of *sample*. 197 | 198 | - *sample*: an *Array* with the format `[feature0, feature1, ...]` 199 | with *featureN* being a floating point number. 200 | 201 | [readable stream]: https://nodejs.org/api/stream.html#stream_class_stream_readable 202 | [ES6 promise]: https://promisesaplus.com 203 | [action]: concepts#actions 204 | [K-Means]: https://en.wikipedia.org/wiki/K-means_clustering 205 | [loss function]: https://en.wikipedia.org/wiki/Loss_functions_for_classification 206 | [logistic regression]: https://en.wikipedia.org/wiki/Logistic_regression 207 | [ml.StandardScaler]: #mlstandardscaler 208 | [parquet]: https://parquet.apache.org 209 | [regularization]: https://en.wikipedia.org/wiki/Regularization_(mathematics) 210 | [stochastic gradient descent]: https://en.wikipedia.org/wiki/Stochastic_gradient_descent 211 | [support vector machine]: https://en.wikipedia.org/wiki/Support_vector_machine 212 | -------------------------------------------------------------------------------- /docs/skale-hackers-guide.md: -------------------------------------------------------------------------------- 1 | # Skale Hacker's Guide 2 | 3 | ## Introduction 4 | 5 | Skale is a fast and general purpose distributed data processing system. It provides a high-level API in Javascript and an optimized parallel execution engine. 6 | 7 | This document gives an overview of its design and architecture, then some details on internals and code organisation, and finally presents how to extend various parts of the engine. 8 | 9 | It is assumed that the reader is already familiar with using skale and with the [reference guide], at least the [core concepts]. 10 | 11 | ## Architecture 12 | 13 | This section describes the core architecture of skale. At a high level, a skale application consists of a *master* program which launches various parallel tasks on *worker* nodes. The tasks read and write *datasets*, i.e.
arrays of data of arbitrary size, split into *partitions* distributed on workers. 14 | 15 | ### Master 16 | 17 | The corresponding code is in [context-local.js] for the standalone mode, or [context.js] for the distributed mode, the only difference between the two being the way workers are created and connected to the master. 18 | 19 | In a nutshell, the master performs the following: 20 | 21 | 1. Creates a new skale [context] object to hold the state of the cluster, datasets and tasks, then in this context: 22 | 2. Allocates a new cluster, i.e. an array of [workers]: connected slave processes on each worker host (1 process per CPU). 23 | 3. [Compiles then runs] an execution graph derived from the user code, the *job*, consisting of a sequence of *stages*. This compilation is only triggered when an *action* is met, thus in *lazy* mode. 24 | 4. For each stage, [runs the next task]: serialize and send stage code and metadata about input dataset partitions to the next free worker, trigger execution, wait for the result, repeat until all the stage's tasks are completed. 25 | 26 | *Stage explanation here* 27 | 28 | ### Worker 29 | 30 | The corresponding code is in [worker-local.js] for the standalone mode and [worker.js] for the distributed mode. The common part is implemented in [task.js]. 31 | 32 | A worker performs the following: 33 | 34 | 1. Connects to the master and waits for the next task to execute, then for each task: 35 | 2. Selects input partition(s); possible cases are: 36 | - in-memory local partition computed from a previous stage, already loaded 37 | - on-disk local partition computed from a previous stage, spilled to disk 38 | - remote partition stored on a separate worker (post-shuffle) 39 | - external data source, through a source connector 40 | 3. Iterates on partition(s), applying to each record a *pipeline* of functions as defined by the user for the current stage (for example a filter function, followed by a mapper function, followed by a reducer function) 41 | 4. The last function of the pipeline is either an *action* (a function returning data to the master), or a pre-shuffle function (saving data on disk for remote access at the start of the next stage, i.e. post-shuffle) 42 | 5. At the end of a task, a result is sent to the master, usually metadata for output files, used for the next stage or for a final combiner action 43 | 44 | *Explain communication model here, for data transfers and remote procedure calls* 45 | 46 | ### Datasets 47 | 48 | The main abstraction provided by skale is a *dataset* which is similar to a Javascript array, but partitioned across the workers so that it can be operated on in parallel. 49 | 50 | A dataset object is always created first on the master side, either by a *source* function, which returns a dataset from an external input or from scratch, or by a *transformation* function, which takes a dataset as input and outputs a new dataset. 51 | 52 | The same code, in [dataset.js], is loaded both in master and workers. A dataset object instantiated on the master will be replicated on each worker through the task [serialization] and [deserialization] process. 53 | 54 | From an object oriented perspective, all *sources* and *transformations*, as dataset constructors, are classes which derive from the *Dataset* class, whereas *actions*, which operate on a dataset object, are simply methods of the *Dataset* class.
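To make this structure concrete, here is a deliberately simplified, hypothetical sketch (names and constructor arguments are illustrative, not the actual dataset.js code):

```js
const util = require('util');

// A source or transformation is a class deriving from Dataset (illustrative)
function MyTransform(parent, mapper) {
  Dataset.call(this, parent.sc, [parent]); // hypothetical constructor arguments
  this.mapper = mapper;
}
util.inherits(MyTransform, Dataset);

// An action is simply a method of the Dataset class
Dataset.prototype.myAction = function (done) {
  // run a job over all partitions, then combine partial results on the master
};
```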
55 | 56 | Dataset objects have methods that can be run either on the master side or on the worker side (never on both); the following table provides a summary of these: 57 | 58 | |Dataset method | on master | on worker | type | description | 59 | |-------------------|-----------|-----------|------|-------------| 60 | |getPartitions | ✓ | | source, post-shuffle transform| Allocate output dataset partitions | 61 | |getPreferedLocation| ✓ | | source | return the preferred worker for a given partition | 62 | |iterate | | ✓ | source, post-shuffle transform| iterate stage pipeline on partition entries| 63 | |transform | | ✓ | transform | Apply a custom operation on each input dataset entry, pre-shuffle| 64 | |spillToDisk | | ✓ | pre-shuffle transform | dump partition data to disk during pre-shuffle, for next stage| 65 | 66 | ### Local standalone mode 67 | 68 | The standalone mode is the default operating mode. All the processes, master and workers, run on the local host, using the [cluster] core NodeJS module. This mode is the simplest to operate: no dependencies, and no server or cluster setup and management required. It is used like any standard NodeJS package: simply `require('skale')`, and that's it. 69 | 70 | This mode is perfect for development, fast prototyping and tests on a single machine (i.e. a laptop). For unlimited scalability, see the distributed mode below. 71 | 72 | ### Distributed mode 73 | 74 | The distributed mode allows running the exact same code as in standalone mode over a network of multiple machines, thus achieving horizontal scalability. 75 | 76 | The distributed mode involves two executables, which must be running prior to launching application programs: 77 | 78 | - a `skale-server` process, which is the access point to which the `master` (user application) and `workers` (running slaves) connect, either by direct TCP connections, or by websockets. 79 | - A `skale-worker` process, which is a worker controller, running on each machine of the computing cluster, and connecting to the `skale-server`. The worker controller will spawn worker processes on demand (typically one per CPU), each time a new job is submitted. 80 | 81 | To run in distributed mode, the environment variable `SKALE_HOST` must be set to the `skale-server` hostname or IP address. If unset, the application will run in standalone mode. Multiple applications, each with its own set of workers and master processes, can run simultaneously using the same server and worker controllers. 82 | 83 | Although not mandatory, running an external HTTP server on worker hosts, exposing skale temporary files, allows efficient peer-to-peer shuffle data transfer between workers. If not available, this traffic will go through the centralized `skale-server`. Any external HTTP server such as nginx, apache or busybox httpd, or even NodeJS (although not the most efficient for static file serving) will do. 84 | 85 | For further details, see the command line help for `skale-worker` and `skale-server`. 86 | 87 | ## Adding a new source 88 | 89 | A source returns a dataset from an external input or from scratch. For example, to be able to process data from Kafka in a parallel manner, i.e. one topic partition per worker, one has to implement a Kafka source in skale.
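As a rough illustration of the steps listed just below, a hypothetical Kafka source could have the following shape (all names, signatures and internals here are illustrative sketches, not the actual skale internals):

```js
const util = require('util');

// Hypothetical sketch of a custom source; the real prototypes live in lib/dataset.js
function KafkaSource(sc, topic, nPartitions) {
  Dataset.call(this, sc);
  this.topic = topic;
  this.nPartitions = nPartitions; // e.g. one skale partition per Kafka topic partition
}
util.inherits(KafkaSource, Dataset);

// Runs on the master: allocate a fixed number of output partitions
KafkaSource.prototype.getPartitions = function (done) {
  this.partitions = [];
  for (let i = 0; i < this.nPartitions; i++)
    this.partitions.push({partitionIndex: i}); // placeholder partition objects
  done();
};

// Runs on a worker: feed each record of one partition into the stage pipeline
KafkaSource.prototype.iterate = function (task, partitionIndex, pipeline, done) {
  // openKafkaPartition() is a hypothetical helper returning a readable stream;
  // when a source maps to a readable stream, iterateStream can be reused instead
  const stream = openKafkaPartition(this.topic, partitionIndex);
  stream.on('data', record => pipeline(record));
  stream.on('end', done);
};
```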
90 | 91 | Adding a new source is a matter of: 92 | 93 | - Deriving a new class from the Dataset class, see for example [TextLocal], which implements a textFile source from the local filesystem 94 | - Providing a `getPartitions` method prototype, which allocates a fixed number of partitions, see [TextLocal.getPartitions] as an example of allocating one partition per file. This method will be run on the master, when triggered by the action, and prior to dispatching tasks to workers 95 | - Optionally providing a `getPreferedLocation` method prototype, to select a given worker according to your source semantics. If not provided, the master will dispatch the partition by default to the next free worker at execution time. 96 | - Providing an `iterate` method prototype, which operates this time on the worker to execute the stage pipeline on each partition entry. See for example [TextLocal.iterate] and [iterateStream] which processes each line of a [readable stream]. If the partition can be mapped to a readable stream, as is the case for many NodeJS connectors, one can just reuse `iterateStream` as is. 97 | - Exposing the source in the API, either by extending [textFile] to process a new URL protocol, or by adding a new source method in the context, see for example [parallelize]. 98 | 99 | ## Adding a new transform 100 | 101 | A new transform can be implemented either by deriving a new class from the Dataset class then providing dataset methods as in the previous table of dataset methods, or by composing existing transform methods to produce a new one, see for example [distinct]. 102 | 103 | *Here give details on narrow vs wide transforms and impact on implementation* 104 | 105 | ## Adding a new action 106 | 107 | [reference guide]: https://github.com/skale-me/skale/blob/0.7.0/doc/skale-API.md 108 | [core concepts]: https://github.com/skale-me/skale/blob/0.7.0/doc/skale-API.md#core-concepts 109 | [context-local.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/context-local.js 110 | [context.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js 111 | [context]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L22 112 | [workers]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L51-L53 113 | [Compiles then runs]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L223 114 | [runs the next task]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L129 115 | [worker-local.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/worker-local.js 116 | [worker.js]: https://github.com/skale-me/skale/blob/0.7.0/bin/worker.js 117 | [task.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/task.js 118 | [dataset.js]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js 119 | [serialization]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L141 120 | [deserialization]: https://github.com/skale-me/skale/blob/0.7.0/bin/worker.js#L275 121 | [cluster]: https://nodejs.org/dist/latest-v8.x/docs/api/cluster.html 122 | [TextLocal]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L911-L919 123 | [TextLocal.getPartitions]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L921-L941 124 | [TextLocal.iterate]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L943 125 | [iterateStream]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L800 126 | [readable stream]: https://nodejs.org/api/stream.html#stream_class_stream_readable 127 | [textFile]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L112-121
128 | [parallelize]: https://github.com/skale-me/skale/blob/0.7.0/lib/context.js#L107 129 | [distinct]: https://github.com/skale-me/skale/blob/0.7.0/lib/dataset.js#L121-L125 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | -------------------------------------------------------------------------------- /bin/worker.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | // Copyright 2016 Luca-SAS, licensed under the Apache License 2.0 4 | 5 | 'use strict'; 6 | 7 | const child_process = require('child_process'); 8 | const fs = require('fs'); 9 | const os = require('os'); 10 | const cluster = require('cluster'); 11 | const url = require('url'); 12 | const zlib = require('zlib'); 13 | const stream = require('stream'); 14 | 15 | const mkdirp = require('mkdirp'); 16 | const uuid = require('uuid'); 17 | const S3 = require('aws-sdk/clients/s3'); 18 | const azure = require('azure-storage'); 19 | const parquet = require('../lib/stub-parquet.js'); 20 | 21 | const SkaleClient = require('../lib/client.js'); 22 | const Dataset = require('../lib/dataset.js'); 23 | const Task = require('../lib/task.js'); 24 | const Lines = require('../lib/lines.js'); 25 | const sizeOf = require('../lib/rough-sizeof.js'); 26 | const readSplit = require('../lib/readsplit.js').readSplit; 27 | 28 | const opt = require('node-getopt').create([ 29 | ['h', 'help', 'print this help text'], 30 | ['d', 'debug', 'print debug traces'], 31 | ['m', 'memory=ARG', 'set max memory in MB for workers'], 32 | ['M', 'MyHost=ARG', 'advertised hostname (peer-to-peer)'], 33 | ['n', 'nworker=ARG', 'number of workers (default: number of cpus)'], 34 | ['r', 'retry=ARG', 'number of connection retries (default 0)'], 35 | ['s', 'slow', 'disable peer-to-peer file transfers through HTTP'], 36 | ['G', 'forcegc', 'workers force garbage collect at end of task'], 37 | ['H', 'Host=ARG', 'server hostname (default localhost)'], 38 | ['P', 'Port=ARG', 'server port (default 12346)'], 39 | ['V', 'version', 'print version'] 40 | ]).bindHelp().parseSystem(); 41 | 42 | if (opt.options.version) { 43 | const pkg = require('../package'); 44 | return console.log(pkg.name + '-' + pkg.version); 45 | } 46 | 47 | const debug = opt.options.debug || false; 48 | const forceGc = opt.options.forcegc || false; 49 | const nworkers = +opt.options.nworker || +(process.env.SKALE_WORKER_PER_HOST ?
process.env.SKALE_WORKER_PER_HOST : os.cpus().length); 50 | const memory = +(opt.options.memory || process.env.SKALE_MEMORY); 51 | const mm = new MemoryManager(memory); 52 | const start = Date.now(); 53 | let cgrid; 54 | let hostname; 55 | let log; 56 | let dlog; 57 | 58 | if (!opt.options.slow) 59 | hostname = opt.options.MyHost || os.hostname(); 60 | 61 | if (process.env.SKALE_DEBUG > 1) { 62 | log = function () { 63 | const args = Array.prototype.slice.call(arguments); 64 | args.unshift('[worker-controller ' + (Date.now() - start) / 1000 + 's]'); 65 | console.error.apply(null, args); 66 | }; 67 | } else { 68 | log = function () {}; 69 | } 70 | 71 | if (cluster.isMaster) { 72 | process.title = 'skale-worker-controller'; 73 | if (memory) 74 | cluster.setupMaster({execArgv: ['--expose-gc', '--max_old_space_size=' + memory]}); 75 | cluster.on('exit', handleExit); 76 | const cpus = os.cpus(); 77 | cgrid = new SkaleClient({ 78 | debug: debug, 79 | retry: opt.options.retry, 80 | host: opt.options.Host, 81 | port: opt.options.Port, 82 | data: { 83 | type: 'worker-controller', 84 | hostname: hostname, 85 | nworkers: nworkers, 86 | ncpus: cpus.length, 87 | memory: os.totalmem(), 88 | platform: os.platform(), 89 | arch: os.arch(), 90 | cpumodel: cpus[0].model, 91 | cpuspeed: cpus[0].speed 92 | } 93 | }); 94 | cgrid.on('connect', startWorkers); 95 | cgrid.on('getWorker', startWorkers); 96 | cgrid.on('close', process.exit); 97 | cgrid.on('sendFile', function (msg) { 98 | fs.createReadStream(msg.path, msg.opt).pipe(cgrid.createStreamTo(msg)); 99 | }); 100 | // Periodic stats 101 | fs.mkdir('/tmp/skale', function () {}); 102 | setInterval(function () { 103 | const stats = { nworkers: Object.keys(cluster.workers).length }; 104 | fs.writeFile('/tmp/skale/worker-controller-stats', JSON.stringify(stats), function () {}); 105 | }, 3000); 106 | log('worker controller ready'); 107 | } else { 108 | runWorker(opt.options.Host, opt.options.Port); 109 | } 110 | 111 | function startWorkers(msg) { 112 | const worker = []; 113 | const removed = {}; 114 | const n = msg.n || nworkers; 115 | 116 | log('worker-controller host', cgrid.uuid); 117 | for (let i = 0; i < n; i++) { 118 | worker[i] = cluster.fork({wsid: msg.wsid, rank: i, puuid: cgrid.uuid}); 119 | } 120 | worker.forEach(function (w) { 121 | w.on('message', function (msg) { 122 | switch (msg.cmd) { 123 | case 'rm': 124 | if (msg.dir && !removed[msg.dir]) { 125 | removed[msg.dir] = true; 126 | child_process.execFile('/bin/rm', ['-rf', msg.dir]); 127 | } 128 | break; 129 | default: 130 | console.error('unexpected msg', msg); 131 | } 132 | }); 133 | }); 134 | } 135 | 136 | function handleExit(worker, code, signal) { 137 | log('worker pid', worker.process.pid, ', exited:', signal || code); 138 | } 139 | 140 | function runWorker(host, port) { 141 | const start = Date.now(); 142 | let wid = process.env.rank; 143 | let basedir; 144 | let log; 145 | 146 | if (process.env.SKALE_DEBUG > 1) { 147 | log = function () { 148 | const args = Array.prototype.slice.call(arguments); 149 | args.unshift('[worker-' + wid + ' ' + (Date.now() - start) / 1000 + 's]'); 150 | console.error.apply(null, args); 151 | }; 152 | dlog = function() { 153 | const args = Array.prototype.slice.call(arguments); 154 | const now = Date.now(); 155 | const lstart = args.shift(); 156 | args.unshift('[worker-' + wid + ' ' + (now - start) / 1000 + 's]'); 157 | args.push('in ' + (now - lstart) / 1000 + 's'); 158 | console.error.apply(null, args); 159 | }; 160 | } else { 161 | dlog = log = function () {}; 
162 | } 163 | if (process.env.SKALE_RANDOM_SEED) 164 | Dataset.setRandomSeed(process.env.SKALE_RANDOM_SEED); 165 | process.on('uncaughtException', function (err) { 166 | grid.send(grid.muuid, {cmd: 'workerError', args: err.stack}); 167 | process.exit(2); 168 | }); 169 | 170 | const grid = new SkaleClient({ 171 | debug: debug, 172 | host: host, 173 | port: port, 174 | data: { 175 | ncpu: os.cpus().length, 176 | os: os.type(), 177 | arch: os.arch(), 178 | usedmem: process.memoryUsage().rss, 179 | totalmem: os.totalmem(), 180 | hostname: hostname || process.env.puuid, 181 | type: 'worker', 182 | wsid: Number(process.env.wsid), 183 | jobId: '' 184 | } 185 | }, function (err, res) { 186 | log('id:', res.id, ', uuid:', res.uuid); 187 | wid = 'w' + res.id; 188 | grid.host = {uuid: res.uuid, id: res.id}; 189 | process.title = 'skale-worker_w' + res.id; 190 | }); 191 | 192 | grid.on('error', function (err) { 193 | console.error('grid error', err); 194 | process.exit(2); 195 | }); 196 | 197 | function runTask(msg) { 198 | grid.muuid = msg.data.master_uuid; 199 | const task = parseTask(msg.data.args); 200 | basedir = task.basedir; 201 | // set worker side dependencies 202 | task.workerId = 'w' + grid.id; 203 | task.mm = mm; 204 | task.grid = grid; 205 | // Set dependencies in global scope for user evaluated code in workers 206 | global.azure = azure; 207 | global.S3 = S3; 208 | global.dlog = dlog; 209 | global.log = log; 210 | global.Lines = Lines; 211 | global.mkdirp = mkdirp; 212 | global.mm = mm; 213 | global.parquet = parquet; 214 | global.readSplit = readSplit; 215 | global.uuid = uuid; 216 | 217 | global.fs = fs; 218 | global.stream = stream; 219 | global.url = url; 220 | global.zlib = zlib; 221 | 222 | // Indirect Eval to set user dependencies bundle in the worker global context 223 | (0, eval)(task.bundle); 224 | task.run(function(result) { 225 | result.workerId = task.workerId; 226 | grid.reply(msg, null, result); 227 | if (global.gc && forceGc) { 228 | setImmediate(function () { 229 | const gcs = Date.now(); 230 | global.gc(); 231 | dlog(gcs, 'gc'); 232 | }); 233 | } 234 | else log('no global.gc'); 235 | }); 236 | } 237 | 238 | function runztask(msg) { 239 | //log('runztask msg', msg); 240 | const file = msg.data.args; 241 | grid.muuid = msg.data.master_uuid; 242 | 243 | const s = getReadStreamSync({path: file}); 244 | let data = Buffer.concat([]); 245 | 246 | s.on('data', function (chunk) { 247 | data = Buffer.concat([data, chunk]); 248 | }); 249 | s.on('end', function () { 250 | //log('end stream ztask'); 251 | zlib.gunzip(data, {chunkSize: 65536}, function (err, data) { 252 | if (err) throw new Error(err); 253 | msg.data.args = data; 254 | runTask(msg); 255 | }); 256 | }); 257 | 258 | function getReadStreamSync(fileObj, opt) { 259 | if (fs.existsSync(fileObj.path)) return fs.createReadStream(fileObj.path, opt); 260 | if (!fileObj.host) fileObj.host = grid.muuid; 261 | return grid.createStreamFrom(fileObj.host, {cmd: 'sendFile', path: fileObj.path, opt: opt}); 262 | } 263 | } 264 | 265 | const request = { runTask: runTask, runztask: runztask }; 266 | 267 | grid.on('remoteClose', function () { 268 | process.send({cmd: 'rm', dir: basedir}); 269 | process.exit(); 270 | }); 271 | 272 | grid.on('request', function (msg) { 273 | try { 274 | request[msg.data.cmd](msg); 275 | } catch (error) { 276 | console.error(error.stack); 277 | grid.reply(msg, error, null); 278 | } 279 | }); 280 | 281 | grid.on('sendFile', function (msg) { 282 | fs.createReadStream(msg.path,
msg.opt).pipe(grid.createStreamTo(msg)); 283 | }); 284 | } 285 | 286 | function MemoryManager(memory = 1024) { 287 | const Mb = 1024 * 1024; 288 | const MAX_MEMORY = (memory - 100) * Mb; 289 | const maxStorageMemory = MAX_MEMORY * 0.4; 290 | const maxShuffleMemory = MAX_MEMORY * 0.2; 291 | const maxCollectMemory = MAX_MEMORY * 0.2; 292 | 293 | this.storageMemory = 0; 294 | this.shuffleMemory = 0; 295 | this.collectMemory = 0; 296 | this.sizeOf = sizeOf; 297 | 298 | this.storageFull = function () {return (this.storageMemory > maxStorageMemory);}; 299 | this.shuffleFull = function () {return (this.shuffleMemory > maxShuffleMemory);}; 300 | this.collectFull = function () {return (this.collectMemory > maxCollectMemory);}; 301 | 302 | this.partitions = {}; 303 | this.register = function (partition) { 304 | const key = partition.datasetId + '.' + partition.partitionIndex; 305 | if (!(key in this.partitions)) this.partitions[key] = partition; 306 | }; 307 | 308 | this.unregister = function (partition) { 309 | this.partitions[partition.datasetId + '.' + partition.partitionIndex] = undefined; 310 | }; 311 | 312 | this.isAvailable = function (partition) { 313 | return (this.partitions[partition.datasetId + '.' + partition.partitionIndex] !== undefined); 314 | }; 315 | } 316 | 317 | function parseTask(str) { 318 | const task = JSON.parse(str, function (key, value) { 319 | if (typeof value === 'string') { 320 | // String value can be a regular function or an ES6 arrow function 321 | if (value.substring(0, 8) === 'function') { 322 | const args = value.match(/\(([^)]*)/)[1]; 323 | const body = value.replace(/^function\s*[^)]*\)\s*{/, '').replace(/}$/, ''); 324 | value = new Function(args, body); 325 | } else if (value.match(/^\s*\(\s*[^(][^)]*\)\s*=>/) || value.match(/^\s*\w+\s*=>/)) 326 | value = ('indirect', eval)(value); 327 | } 328 | return value; 329 | }); 330 | 331 | for (let i in task.nodes) { 332 | const n = task.nodes[i]; 333 | for (let j in n.dependencies) { 334 | const ref = n.dependencies[j]; 335 | n.dependencies[j] = task.nodes[ref]; 336 | } 337 | for (let j in n.partitions) { 338 | Object.setPrototypeOf(task.nodes[i].partitions[j], Dataset.Partition.prototype); 339 | task.nodes[i].partitions[j].count = 0; 340 | task.nodes[i].partitions[j].bsize = 0; 341 | task.nodes[i].partitions[j].tsize = 0; 342 | task.nodes[i].partitions[j].skip = false; 343 | } 344 | if (n.type) { 345 | Object.setPrototypeOf(task.nodes[i], Dataset[n.type].prototype); 346 | } 347 | if (n.partitioner && n.partitioner.type) { 348 | Object.setPrototypeOf(n.partitioner, Dataset[n.partitioner.type].prototype); 349 | } 350 | } 351 | Object.setPrototypeOf(task, Task.prototype); 352 | //log('task:', JSON.stringify(task, null, 2)); 353 | return task; 354 | } 355 | -------------------------------------------------------------------------------- /docs/concepts.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | Skale is a fast and general purpose distributed data processing 4 | system. It provides a high-level API in Javascript and an optimized 5 | parallel execution engine. 6 | 7 | A Skale application consists of a *master* program that runs the 8 | user code and executes various *parallel operations* on a cluster 9 | of *workers*. 10 | 11 | The main abstraction Skale provides is a *dataset* which is similar 12 | to a Javascript *array*, but partitioned across the workers so that 13 | it can be operated on in parallel.
14 | 15 | There are several ways to create a dataset: *parallelizing* an existing 16 | array in the master program, or referencing a dataset in a distributed 17 | storage system (such as HDFS), or *streaming* the content of any 18 | source that can be processed through Node.js *Streams*. We call a 19 | function which initializes a dataset a *source*. 20 | 21 | Datasets support two kinds of operations: *transformations*, which create 22 | a new dataset from an existing one, and *actions*, which 23 | return a value to the *master* program after running a computation 24 | on the dataset. 25 | 26 | For example, `map` is a transformation that applies a function to 27 | each element of a dataset, returning a new dataset. On the other 28 | hand, `reduce` is an action that aggregates all elements of a dataset 29 | using some function, and returns the final result to the master. 30 | 31 | *Sources* and *transformations* in Skale are *lazy*. They do not 32 | start right away, but are triggered by *actions*, thus allowing 33 | efficient pipelined execution and optimized data transfers. 34 | 35 | A first example: 36 | 37 | ```javascript 38 | var sc = require('skale').context(); // create a new context 39 | sc.parallelize([1, 2, 3, 4]). // source 40 | map(function (x) {return x+1}). // transform 41 | reduce(function (a, b) {return a+b}, 0). // action 42 | then(console.log); // process result: 14 43 | ``` 44 | 45 | ## Core concepts 46 | 47 | As stated above, a program can be considered a workflow of steps, 48 | each step consisting of a transformation which inputs from one or 49 | more datasets (parents), and outputs to a new dataset (child). 50 | 51 | ### Partitioning 52 | 53 | Datasets are divided into several partitions, so each partition can 54 | be assigned to a separate worker, and processing can occur concurrently 55 | in a distributed and parallel system. 56 | 57 | The consequence of this partitioning is that two types of transformations 58 | exist: 59 | 60 | - *Narrow* transformations, where each partition of the parent dataset 61 | is used by at most one partition of the child dataset. This is the 62 | case for example for `map()` or `filter()`, where each dataset entry 63 | is processed independently of the others. 64 | Partitions are decoupled, no synchronization 65 | between workers is required, and narrow transformations can be 66 | pipelined on each worker. 67 | 68 | - *Wide* transformations, where multiple child partitions may depend 69 | on one parent partition. This is the case for example for `sortBy()` 70 | or `groupByKey()`. Data needs to be exchanged between workers, or 71 | *shuffled*, in order to complete the transformation. This introduces 72 | synchronization points which prevent pipelining. 73 | 74 | ### Pipeline stages and shuffles 75 | 76 | Internally, each wide transformation consists of a pre-shuffle and 77 | a post-shuffle part. All sequences of steps from source to pre-shuffle, 78 | or from post-shuffle to the next pre-shuffle or action, are thus only 79 | narrow transformations, or pipelined stages (the most efficient 80 | pattern). A skale program is therefore simply a sequence of stages 81 | and shuffles, shuffles being global serialization points. 82 | 83 | It's important to grasp this concept, as it sets the limit on the 84 | level of parallelism which can be achieved by a given code. 85 | 86 | The synoptic table of [transformations](#transformations) indicates 87 | for each transformation if it is narrow or wide (shuffle).
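For example, in the following word count (with a hypothetical `words.txt` input), the source and the two narrow transformations are pipelined into a single stage; the wide `reduceByKey()` introduces a shuffle and starts a second stage, which the `collect()` action finally triggers and terminates:

```javascript
sc.textFile('words.txt')            // source:            stage 1
  .flatMap(line => line.split(' ')) // narrow, pipelined: stage 1
  .map(word => [word, 1])           // narrow, pipelined: stage 1
  .reduceByKey((a, b) => a + b, 0)  // wide, shuffle:     stage 2
  .collect()                        // action: triggers the whole job
  .then(console.log);
```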
88 | 89 | ## Working with datasets 90 | 91 | ### Sources 92 | 93 | After having initialized a cluster context using [skale.context()], 94 | one can create a dataset using the following sources: 95 | 96 | | Source Name | Description | 97 | | ----------------------------| ------------------------------------------------------ | 98 | |[lineStream(stream)] | Create a dataset from a text stream | 99 | |[objectStream(stream)] | Create a dataset from an object stream | 100 | |[parallelize(array)] | Create a dataset from an array | 101 | |[range(start,end,step)] | Create a dataset containing integers from start to end | 102 | |[source(size,callback,args)] | Create a dataset from a custom source function | 103 | |[textFile(path, options)] | Create a dataset from a text file | 104 | 105 | ### Transformations 106 | 107 | Transformations operate on a dataset and return a new dataset. Note that some 108 | transformations operate only on datasets where each element is a 109 | 2-element array of key and value (a `[k,v]` dataset): 110 | 111 | [[Ki,Vi], ..., [Kj, Vj]] 112 | 113 | A special transformation `persist()` enables one to *persist* a dataset 114 | in memory, allowing efficient reuse across parallel operations. 115 | 116 | |Transformation Name |Description |In |Out |Shuffle| 117 | |----------------------------------|-----------------------------------------------------------------------|-----------|-------------|-------| 118 | |[aggregateByKey(func, func, init)]|Reduce and combine by key using functions |[k,v] |[k,v] |yes | 119 | |[cartesian(other)] |Perform a cartesian product with the other dataset |v w |[v,w] |yes | 120 | |[coGroup(other)] |Group data from both datasets sharing the same key |[k,v] [k,w]|[k,[[v],[w]]]|yes | 121 | |[distinct()] |Return a dataset where duplicates are removed |v |w |yes | 122 | |[filter(func)] |Return a dataset of elements on which the function returns true |v |w |no | 123 | |[flatMap(func)] |Pass the dataset elements to a function which returns a sequence |v |w |no | 124 | |[flatMapValues(func)] |Pass the dataset [k,v] elements to a function without changing the keys|[k,v] |[k,w] |no | 125 | |[groupByKey()] |Group values with the same key |[k,v] |[k,[v]] |yes | 126 | |[intersection(other)] |Return a dataset containing only elements found in both datasets |v w |v |yes | 127 | |[join(other)] |Perform an inner join between 2 datasets |[k,v] |[k,[v,w]] |yes | 128 | |[leftOuterJoin(other)] |Join 2 datasets where the key must be present in the first |[k,v] |[k,[v,w]] |yes | 129 | |[rightOuterJoin(other)] |Join 2 datasets where the key must be present in the other |[k,v] |[k,[v,w]] |yes | 130 | |[keys()] |Return a dataset of just the keys |[k,v] |k |no | 131 | |[map(func)] |Return a dataset where elements are passed through a function |v |w |no | 132 | |[mapValues(func)] |Map a function to the value field of a key-value dataset |[k,v] |[k,w] |no | 133 | |[reduceByKey(func, init)] |Combine values with the same key |[k,v] |[k,w] |yes | 134 | |[partitionBy(partitioner)] |Partition using the partitioner |v |v |yes | 135 | |[persist()] |Idempotent, keep content of dataset in cache for further reuse |v |v |no | 136 | |[sample(rep, frac)] |Sample a dataset, with or without replacement |v |w |no | 137 | |[sortBy(func)] |Sort a dataset |v |v |yes | 138 | |[sortByKey()] |Sort a [k,v] dataset |[k,v] |[k,v] |yes | 139 | |[subtract(other)] |Remove the content of the other dataset from the first |v w |v |yes | 140 | |[union(other)] |Return a dataset containing elements from both datasets |v |v w |no |
141 | |[values()] |Return a dataset of just the values |[k,v] |v |no | 142 | 143 | ### Actions 144 | 145 | Actions operate on a dataset and send back results to the *master*. Results 146 | are always produced asynchronously and sent to an optional callback function, 147 | or alternatively through a returned [ES6 promise]. 148 | 149 | | Action Name |Description |out | 150 | |----------------------------------|------------------------------------------------------------------|-------------------| 151 | |[aggregate(func, func, init)] |Similar to reduce() but may return a different type |value | 152 | |[collect()] |Return the content of the dataset |array of elements | 153 | |[count()] |Return the number of elements in the dataset |number | 154 | |[countByKey()] |Return the number of occurrences for each key in a `[k,v]` dataset|array of [k,number]| 155 | |[countByValue()] |Return the number of occurrences of elements in the dataset |array of [v,number]| 156 | |[first()] |Return the first element in the dataset |value | 157 | |[forEach(func)] |Apply the provided function to each element of the dataset |empty | 158 | |[lookup(k)] |Return the list of values `v` for key `k` in a `[k,v]` dataset |array of v | 159 | |[reduce(func, init)] |Aggregate dataset elements into one value using a function |value | 160 | |[save(url)] |Save the content of a dataset to a URL |empty | 161 | |[stream()] |Stream out a dataset |stream | 162 | |[take(num)] |Return the first `num` elements of the dataset |array of value | 163 | |[takeSample(withReplacement, num)]|Return a sample of `num` elements of the dataset |array of value | 164 | |[top(num)] |Return the top `num` elements of the dataset |array of value | 165 | 166 | [ES6 promise]: https://promisesaplus.com 167 | [skale.context()]: skale-API.md#skalecontextconfig 168 | 169 | [lineStream(stream)]: skale-API#sclinestreaminput_stream 170 | [objectStream(stream)]: skale-API#scobjectstreaminput_stream 171 | [parallelize(array)]: skale-API#scparallelizearray 172 | [range(start,end,step)]: skale-API#scrangestart-end-step 173 | [source(size,callback,args)]: skale-API#scsourcesize-callback-args 174 | [textFile(path, options)]: skale-API#sctextfilepath-options 175 | 176 | [aggregateByKey(func, func, init)]: skale-API#dsaggregatebykeyreducer-combiner-init-obj 177 | [cartesian(other)]: skale-API#dscartesianother 178 | [coGroup(other)]: skale-API#dscogroupother 179 | [distinct()]: skale-API#dsdistinct 180 | [filter(func)]: skale-API#dsfilterfilter-obj 181 | [flatMap(func)]: skale-API#dsflatmapflatmapper-obj 182 | [flatMapValues(func)]: skale-API#dsflatmapvaluesflatmapper-obj 183 | [groupByKey()]: skale-API#dsgroupbykey 184 | [intersection(other)]: skale-API#dsintersectionother 185 | [join(other)]: skale-API#dsjoinother 186 | [leftOuterJoin(other)]: skale-API#dsleftouterjoinother 187 | [rightOuterJoin(other)]: skale-API#dsrightouterjoinother 188 | [keys()]: skale-API#dskeys 189 | [map(func)]: skale-API#dsmapmapper-obj 190 | [mapValues(func)]: skale-API#dsmapvaluesmapper-obj 191 | [reduceByKey(func, init)]: skale-API#dsreducebykeyreducer-init-obj 192 | [partitionBy(partitioner)]: skale-API#dspartitionbypartitioner 193 | [persist()]: skale-API#dspersist 194 | [sample(rep, frac)]: skale-API#dssamplewithreplacement-frac 195 | [sortBy(func)]: skale-API#dssortbykeyfunc-ascending 196 | [sortByKey()]: skale-API#dssortbykeyascending 197 | [subtract(other)]: skale-API#dssubtractother 198 | [union(other)]: skale-API#dsunionother 199 | [values()]: skale-API#dsvalues 200 | 201 | [aggregate(func, func, init)]:
skale-API#dsaggregatereducer-combiner-init-obj-done 202 | [collect()]: skale-API#dscollectdone 203 | [count()]: skale-API#dscountdone 204 | [countByKey()]: skale-API#dscountbykeydone 205 | [countByValue()]: skale-API#dscountbyvaluedone 206 | [first()]: skale-API#dsfirstdone 207 | [forEach(func)]: skale-API#dsforeachcallback-obj-done 208 | [lookup(k)]: skale-API#dslookupk-done 209 | [reduce(func, init)]: skale-API#dsreducereducer-init-obj-done 210 | [save(url)]: skale-API#dssaveurl-options-done 211 | [stream()]: skale-API#dsstreamopt 212 | [take(num)]: skale-API#dstakenum-done 213 | [takeSample(withReplacement, num)]: skale-API#dstakesamplewithreplacement-num-done 214 | [top(num)]: skale-API#dstopnum-done 215 | --------------------------------------------------------------------------------