├── src ├── index.ts ├── loader.ts ├── demo.ts └── normalizer.ts ├── .gitignore ├── dist ├── index.js ├── loader.js ├── demo.js └── normalizer.js ├── tsconfig.json ├── package.json └── README.md /src/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Romain Bruckert 3 | */ 4 | export * from './loader' 5 | export * from './normalizer' 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | /typings 3 | /node_modules 4 | /test/artillery*.json 5 | !/dist/.gitkeep 6 | 7 | # Elastic Beanstalk Files 8 | .ebignore 9 | .elasticbeanstalk/* 10 | !.elasticbeanstalk/*.cfg.yml 11 | !.elasticbeanstalk/*.global.yml 12 | 13 | /models/ 14 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | function __export(m) { 3 | for (var p in m) if (!exports.hasOwnProperty(p)) exports[p] = m[p]; 4 | } 5 | Object.defineProperty(exports, "__esModule", { value: true }); 6 | __export(require("./loader")); 7 | __export(require("./normalizer")); 8 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compileOnSave": true, 3 | "compilerOptions": { 4 | "outDir": "dist", 5 | "module": "commonjs", 6 | "target": "es5", 7 | "lib": ["es6", "es2015.promise"], 8 | "sourceMap": false, 9 | "moduleResolution": "node", 10 | "experimentalDecorators": true, 11 | "emitDecoratorMetadata": true, 12 | "removeComments": true, 13 | "noImplicitAny": false, 14 | "rootDir": "src" 15 | }, 16 | "exclude": [ 17 | "dist", 18 | "node_modules" 19 | ], 20 | "include": [ 21 | "src/**/*.ts" 22 | ] 23 | } 24 | 
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "neural-data-normalizer", 3 | "version": "1.1.0", 4 | "description": "Data to bits normalization helpers script to use with neural network.", 5 | "repository": "https://github.com/adadgio/neural-data-normalizer", 6 | "main": "dist/index.js", 7 | "engines": { 8 | "node": ">=6.0.0" 9 | }, 10 | "scripts": { 11 | "start": "node ./dist/index.js", 12 | "test": "node dist/demo.js", 13 | "watch": "npm run build:live", 14 | "build:live": "tsc --watch", 15 | "build": "tsc" 16 | }, 17 | "keywords": [ 18 | "node", 19 | "typescript", 20 | "Neural networks", 21 | "Synaptic.js", 22 | "Data manipulation" 23 | ], 24 | "author": "Romain Bruckert", 25 | "license": "ISC", 26 | "devDependencies": { 27 | "@types/node": "^7.0.0", 28 | "nodemon": "^1.11.0", 29 | "ts-node": "^3.1.0", 30 | "typescript": "^2.4.1", 31 | "typescript-watcher": "^0.0.4" 32 | }, 33 | "dependencies": { 34 | "synaptic": "^1.1.4" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/loader.ts: -------------------------------------------------------------------------------- 1 | import * as fs from 'fs' 2 | 3 | /** 4 | * Load data from local file(s) 5 | * 6 | * @author Romain Bruckert 7 | */ 8 | export class Loader 9 | { 10 | dir: string; 11 | filename: string 12 | 13 | setDataDir(dir: string) 14 | { 15 | this.dir = dir 16 | 17 | return this 18 | } 19 | 20 | fromJsonFile(filename: string) 21 | { 22 | this.filename = filename 23 | 24 | let path = `${this.dir}/${filename}`.replace('//', '/') 25 | let data = fs.readFileSync(path, 'utf8') 26 | 27 | return JSON.parse(data) 28 | } 29 | 30 | fromCsvFile(filepath: string) 31 | { 32 | // @TODO 33 | } 34 | 35 | getTrainedData() 36 | { 37 | const filename = `trained_${this.filename}` 38 | const filepath = 
`${this.dir}/${filename}`.replace('//', '/') 39 | 40 | const data: string = fs.readFileSync(filepath, 'utf8') 41 | 42 | return JSON.parse(data) 43 | } 44 | 45 | saveTrainedData(data: any) 46 | { 47 | const filename = `trained_${this.filename}` 48 | const filepath = `${this.dir}/${filename}`.replace('//', '/') 49 | 50 | if (typeof data !== 'string') { 51 | data = JSON.stringify(data) 52 | } 53 | 54 | fs.writeFileSync(filepath, data) 55 | } 56 | 57 | dataIsTrained() 58 | { 59 | let filename = `trained_${this.filename}` 60 | let filepath = `${this.dir}/${filename}`.replace('//', '/') 61 | 62 | return fs.existsSync(filepath) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /dist/loader.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | var fs = require("fs"); 4 | var Loader = (function () { 5 | function Loader() { 6 | } 7 | Loader.prototype.setDataDir = function (dir) { 8 | this.dir = dir; 9 | return this; 10 | }; 11 | Loader.prototype.fromJsonFile = function (filename) { 12 | this.filename = filename; 13 | var path = (this.dir + "/" + filename).replace('//', '/'); 14 | var data = fs.readFileSync(path, 'utf8'); 15 | return JSON.parse(data); 16 | }; 17 | Loader.prototype.fromCsvFile = function (filepath) { 18 | }; 19 | Loader.prototype.getTrainedData = function () { 20 | var filename = "trained_" + this.filename; 21 | var filepath = (this.dir + "/" + filename).replace('//', '/'); 22 | var data = fs.readFileSync(filepath, 'utf8'); 23 | return JSON.parse(data); 24 | }; 25 | Loader.prototype.saveTrainedData = function (data) { 26 | var filename = "trained_" + this.filename; 27 | var filepath = (this.dir + "/" + filename).replace('//', '/'); 28 | if (typeof data !== 'string') { 29 | data = JSON.stringify(data); 30 | } 31 | fs.writeFileSync(filepath, data); 32 | }; 33 | Loader.prototype.dataIsTrained = 
function () { 34 | var filename = "trained_" + this.filename; 35 | var filepath = (this.dir + "/" + filename).replace('//', '/'); 36 | return fs.existsSync(filepath); 37 | }; 38 | return Loader; 39 | }()); 40 | exports.Loader = Loader; 41 | -------------------------------------------------------------------------------- /dist/demo.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | var normalizer_1 = require("./normalizer"); 4 | var sampleData = [ 5 | { "soilhum": 500, "airtemp": 32, "airhum": 18, "water": true, "plants": ["tomatoes", "potatoes"], "tempSpan": [34, 54] }, 6 | { "soilhum": 1050, "airtemp": 40, "airhum": 21, "water": true, "plants": ["potatoes", "asparagus"], "tempSpan": [24, 14] }, 7 | { "soilhum": 300, "airtemp": 100, "airhum": 90, "water": false, "plants": ["asparagus", "tomatoes"], "tempSpan": [56, 4] }, 8 | { "soilhum": 950, "airtemp": 103, "airhum": 26, "water": true, "plants": ["asparagus", "asparagus"], "tempSpan": [123, 2] }, 9 | { "soilhum": 1050, "airtemp": 8, "airhum": 26, "water": true, "plants": ["tomatoes", "tomatoes"], "tempSpan": [67, 12] }, 10 | { "soilhum": 1050, "airtemp": 56, "airhum": 26, "water": true, "plants": ["potatoes", "french fries"], "tempSpan": [8, 45.8] }, 11 | ]; 12 | var normalizer = new normalizer_1.Normalizer(sampleData); 13 | normalizer.setOutputProperties(['isExpert']); 14 | normalizer.normalize(); 15 | var nbrInputs = normalizer.getInputLength(); 16 | var nbrOutputs = normalizer.getOutputLength(); 17 | var metadata = normalizer.getDatasetMetaData(); 18 | var inputs = normalizer.getBinaryInputDataset(); 19 | var outputs = normalizer.getBinaryOutputDataset(); 20 | console.log('\n', '\x1b[37m\x1b[46m', 'METADATA:', '\x1b[0m'); 21 | console.log(metadata); 22 | console.log('\n', '\x1b[37m\x1b[42m', 'INPUT:', '\x1b[0m'); 23 | console.log(inputs); 24 | console.log('\n', '\x1b[37m\x1b[44m', 
'OUTPUT:', '\x1b[0m'); 25 | console.log(outputs); 26 | -------------------------------------------------------------------------------- /src/demo.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Test and examples. 3 | * Notes: does only work if inputs and outputs rows are 4 | * always of the same length. You cant ommit one field in sampleData[2] for instance. 5 | * NULL and empty values are not really handled and can cause bugs, but i guess 6 | * its probably your job to normalize that data before ? 7 | * 8 | * @author Romain Bruckert 9 | */ 10 | import { RowInput, Normalizer } from './normalizer'; 11 | 12 | const sampleData: Array = [ 13 | { "soilhum": 500, "airtemp": 32, "airhum": 18, "water": true, "plants": ["tomatoes", "potatoes"], "tempSpan": [34, 54] }, 14 | { "soilhum": 1050, "airtemp": 40, "airhum": 21, "water": true, "plants": ["potatoes", "asparagus"], "tempSpan": [24, 14] }, 15 | { "soilhum": 300, "airtemp": 100, "airhum": 90, "water": false, "plants": ["asparagus", "tomatoes"], "tempSpan": [56, 4] }, 16 | { "soilhum": 950, "airtemp": 103, "airhum": 26, "water": true, "plants": ["asparagus", "asparagus"], "tempSpan": [123, 2] }, 17 | { "soilhum": 1050, "airtemp": 8, "airhum": 26, "water": true, "plants": ["tomatoes", "tomatoes"], "tempSpan": [67, 12] }, 18 | { "soilhum": 1050, "airtemp": 56, "airhum": 26, "water": true, "plants": ["potatoes", "french fries"], "tempSpan": [8, 45.8] }, 19 | ]; 20 | 21 | const normalizer = new Normalizer(sampleData) 22 | 23 | // setting required options and normalize the data 24 | normalizer.setOutputProperties(['isExpert']) 25 | normalizer.normalize() 26 | 27 | // find useful information about your data 28 | // to pass to your neural network 29 | // check input and output lenghtes 30 | const nbrInputs = normalizer.getInputLength() 31 | const nbrOutputs = normalizer.getOutputLength() 32 | 33 | const metadata = normalizer.getDatasetMetaData() 34 | const inputs = 
normalizer.getBinaryInputDataset() 35 | const outputs = normalizer.getBinaryOutputDataset() 36 | 37 | console.log('\n', '\x1b[37m\x1b[46m', 'METADATA:', '\x1b[0m') 38 | console.log(metadata) 39 | console.log('\n', '\x1b[37m\x1b[42m', 'INPUT:', '\x1b[0m') 40 | console.log(inputs) 41 | console.log('\n', '\x1b[37m\x1b[44m', 'OUTPUT:', '\x1b[0m') 42 | console.log(outputs) 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A simple data normalizer to be used with neural networks 2 | 3 | As my grandfather used to say (probably), **neural networks** are dumb. When they're born and when you need to train them just to see how all the magic works, it's a pain in the... neck. 4 | 5 | This library **converts datasets of human data** into **arrays of bits** understandable for neurons (duh). 6 | 7 | *Disclaimer*: 8 | This script was made when I tested the awesome [synaptic.js](https://github.com/cazala/synaptic) neural network library and might not suit all sorts of inputs. It's mainly meant to be able to quickly have test data from examples given around the web for neural networks input. 9 | 10 | ## Cut the crap, show me how to 11 | 12 | Consider this. I'm trying to plug a neural network into my Arduino Connected Garden and I've got the following data. I want my network to know when or when not to water my plants on its own (whatever the units are for now). 
13 | 14 | ```json 15 | { "soilhumidity": 500, "airtemp": 32, "airhum": 18, "water": true, "plants": ["tomatoes", "potatoes"] }, 16 | { "soilhumidity": 1050, "airtemp": 40, "airhum": 21, "water": true, "plants": ["potatoes", "asparagus"] }, 17 | { "soilhumidity": 300, "airtemp": 100, "airhum": 90, "water": false, "plants": ["asparagus", "tomatoes"] }, 18 | { "soilhumidity": 950, "airtemp": 103, "airhum": 26, "water": true, "plants": ["asparagus", "asparagus"] }, 19 | { "soilhumidity": 1050, "airtemp": 8, "airhum": 26, "water": true, "plants": ["tomatoes", "tomatoes"] }, 20 | { "soilhumidity": 1050, "airtemp": 56, "airhum": 26, "water": true, "plants": ["potatoes", "french fries"] }, 21 | ``` 22 | 23 | In the end, my output is "should I water the plants?": `water: true` and the rest are my inputs. Let's do this. 24 | 25 | ```ts 26 | const normalizer = new Normalizer(sampleData); 27 | 28 | // setting required options and normalize the data 29 | normalizer.setOutputProperties(['water']); 30 | normalizer.normalize(); 31 | 32 | // find useful information about your data 33 | // to pass to your neural network 34 | const nbrInputs = normalizer.getInputLength(); 35 | const nbrOutputs = normalizer.getOutputLength(); 36 | 37 | const metadata = normalizer.getDatasetMetaData(); 38 | const inputs = normalizer.getBinaryInputDataset(); 39 | const outputs = normalizer.getBinaryOutputDataset(); 40 | 41 | console.log(metadata); 42 | console.log(inputs); 43 | console.log(outputs); 44 | ``` 45 | 46 | There you should have all useful information to give to your network. You know the **number of inputs** and **outputs**, you get a **binarized dataset suitable for neural networks**, and even some *metadata* about your data. 
47 | 48 | ``` 49 | $ (console output) 50 | { soilhum: { type: 'number', min: 300, max: 1050, distinctValues: null }, 51 | airtemp: { type: 'number', min: 8, max: 103, distinctValues: null }, 52 | airhum: { type: 'number', min: 18, max: 90, distinctValues: null }, 53 | water: { type: 'boolean', min: 0, max: 1, distinctValues: null }, 54 | plants: 55 | { type: 'array', 56 | min: null, 57 | max: null, 58 | distinctValues: [ 'tomatoes', 'potatoes', 'asparagus', 'french fries' ] } } 59 | 60 | [ [ 0.266667, 0.252632, 0, 1, 1, 0, 0 ], 61 | [ 1, 0.336842, 0.041667, 0, 1, 1, 0 ], 62 | [ 0, 0.968421, 1, 1, 0, 1, 0 ], 63 | [ 0.866667, 1, 0.111111, 0, 0, 1, 0 ], 64 | [ 1, 0, 0.111111, 1, 0, 0, 0 ], 65 | [ 1, 0.505263, 0.111111, 0, 1, 0, 1 ] ] 66 | 67 | [ [ 1 ], [ 1 ], [ 0 ], [ 1 ], [ 1 ], [ 1 ] ] 68 | ``` 69 | 70 | ## Why metadata ? 71 | 72 | Consider a real example where you actually started to understand what are neural networks and start implementing it. You realize the biggest challenge is data formatting. When you **activate Alfred** with your data (I always call my network Alfred) 73 | you realize you also need to **normalize the new data input** as well. 74 | 75 | So you need to save metadata information that you got earlier (mins, maxes, etc.) so that our data normalizer here converts the new inputs to the same scales! (this implies training data MUST contain min and max values at some point). 76 | 77 | Then on new unknown input you just have to tell the normalizer one thing: *metadata of known values* range. 78 | 79 | ``` 80 | const normalizer = new Normalizer(newData); 81 | 82 | normalizer 83 | .setDatasetMetaData(networkObject.metadata) 84 | .setOutputProperties(['water']); 85 | 86 | const input = normalizer.getBinaryInputDataset()[0]; 87 | 88 | // and activate your neural network with data 
(see index.ts for an example using synaptic) 89 | ``` 90 | -------------------------------------------------------------------------------- /dist/normalizer.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | Object.defineProperty(exports, "__esModule", { value: true }); 3 | function isArray(input) { 4 | return (Object.prototype.toString.call(input) === '[object Array]') ? true : false; 5 | } 6 | var Normalizer = (function () { 7 | function Normalizer(data) { 8 | if (data === void 0) { data = []; } 9 | this.dataset = []; 10 | this.datasetMeta = null; 11 | this.binaryInput = []; 12 | this.binaryOutput = []; 13 | this.outputProperties = []; 14 | this.dataset = data; 15 | if (true !== Array.isArray(data)) { 16 | throw new Error('\x1b[37m\x1b[44mNormalizer input data should be an array of rows: [{...}, {...}]\x1b[0m'); 17 | } 18 | if (this.dataset.length <= 0) { 19 | throw new Error("\u001B[37m\u001B[44mNormalizer input data shouldn't be empty\u001B[0m"); 20 | } 21 | if (Object.keys(this.dataset[0]).length <= 0) { 22 | throw new Error("\u001B[37m\u001B[44mNormalizer input data rows has to contain some properties (only 1st row is checked)\u001B[0m"); 23 | } 24 | } 25 | Normalizer.prototype.getOutputLength = function () { 26 | return this.outputProperties.length; 27 | }; 28 | Normalizer.prototype.getOutputProperties = function () { 29 | return this.outputProperties; 30 | }; 31 | Normalizer.prototype.getInputLength = function () { 32 | return this.binaryInput[0].length; 33 | }; 34 | Normalizer.prototype.getBinaryInputDataset = function () { 35 | return this.binaryInput; 36 | }; 37 | Normalizer.prototype.getBinaryOutputDataset = function () { 38 | return this.binaryOutput; 39 | }; 40 | Normalizer.prototype.getDatasetMetaData = function () { 41 | return this.datasetMeta; 42 | }; 43 | Normalizer.prototype.setDatasetMetaData = function (metadata) { 44 | this.datasetMeta = metadata; 45 | return this; 46 | }; 47 | 
Normalizer.prototype.convertOutput = function () { 48 | var metadata = this.datasetMeta; 49 | }; 50 | Normalizer.prototype.normalize = function () { 51 | this.datasetMeta = (this.datasetMeta === null) ? this.analyzeMetaData() : this.datasetMeta; 52 | var binaryInput = []; 53 | var binaryOutput = []; 54 | for (var i in this.dataset) { 55 | var row = this.dataset[i]; 56 | var index = 0; 57 | var inputBits = []; 58 | var outputBits = []; 59 | for (var prop in row) { 60 | var bitsArr = void 0; 61 | var value = row[prop]; 62 | var meta = this.datasetMeta[prop]; 63 | switch (meta.type) { 64 | case 'number': 65 | bitsArr = [this.numToBit(meta.min, meta.max, value)]; 66 | break; 67 | case 'boolean': 68 | bitsArr = [this.boolToBit(value)]; 69 | break; 70 | case 'string': 71 | bitsArr = this.strToBitsArr(meta.distinctValues, value); 72 | break; 73 | case 'array': 74 | bitsArr = this.arrToBitsArr(meta.distinctValues, value); 75 | break; 76 | default: 77 | break; 78 | } 79 | if (this.outputProperties.indexOf(prop) > -1) { 80 | outputBits = outputBits.concat(bitsArr); 81 | } 82 | else { 83 | inputBits = inputBits.concat(bitsArr); 84 | } 85 | index++; 86 | } 87 | if (inputBits.length > 0) { 88 | this.binaryInput.push(inputBits); 89 | } 90 | if (outputBits.length > 0) { 91 | this.binaryOutput.push(outputBits); 92 | } 93 | } 94 | }; 95 | Normalizer.prototype.analyzeMetaData = function () { 96 | var firstRow = this.dataset[0]; 97 | var distinctProps = this.distinctProps(firstRow); 98 | var distinctTypes = this.distinctTypes(firstRow); 99 | var metadata = {}; 100 | var bitDataset = []; 101 | for (var _i = 0, distinctProps_1 = distinctProps; _i < distinctProps_1.length; _i++) { 102 | var prop = distinctProps_1[_i]; 103 | var type = distinctTypes[prop]; 104 | metadata[prop] = { 105 | type: type, 106 | min: null, 107 | max: null, 108 | distinctValues: null, 109 | }; 110 | switch (type) { 111 | case 'number': 112 | var minMax = this.getMinMax(prop, this.dataset); 113 | 
metadata[prop].min = minMax[0]; 114 | metadata[prop].max = minMax[1]; 115 | break; 116 | case 'boolean': 117 | metadata[prop].min = 0; 118 | metadata[prop].max = 1; 119 | break; 120 | case 'string': 121 | var distinctStrVals = this.getDistinctVals(prop, this.dataset); 122 | metadata[prop].distinctValues = distinctStrVals; 123 | break; 124 | case 'array': 125 | var arrMinMax = this.get2DimArrayMinMax(prop, this.dataset); 126 | var distinctArrVals = this.getDistinctArrayVals(prop, this.dataset); 127 | metadata[prop].min = arrMinMax[0]; 128 | metadata[prop].max = arrMinMax[1]; 129 | metadata[prop].distinctValues = distinctArrVals; 130 | break; 131 | } 132 | } 133 | return metadata; 134 | }; 135 | Normalizer.prototype.setOutputProperties = function (props) { 136 | this.outputProperties = props; 137 | return this; 138 | }; 139 | Normalizer.prototype.getMinMax = function (prop, data) { 140 | var min = null; 141 | var max = null; 142 | for (var i in data) { 143 | var val = data[i][prop]; 144 | if (min === null || val < min) { 145 | min = val; 146 | } 147 | if (max === null || val > max) { 148 | max = val; 149 | } 150 | } 151 | return [min, max]; 152 | }; 153 | Normalizer.prototype.getFlatArrMinMax = function (arr) { 154 | var min = null; 155 | var max = null; 156 | if (typeof arr[0] === 'string') { 157 | return [min, max]; 158 | } 159 | for (var i in arr) { 160 | if (typeof arr[i] !== 'number') { 161 | continue; 162 | } 163 | var val = parseFloat(arr[i]); 164 | if (min === null || val < min) { 165 | min = val; 166 | } 167 | if (max === null || val > max) { 168 | max = val; 169 | } 170 | } 171 | return [min, max]; 172 | }; 173 | Normalizer.prototype.get2DimArrayMinMax = function (prop, data) { 174 | var min = null; 175 | var max = null; 176 | var mins = []; 177 | var maxs = []; 178 | for (var _i = 0, data_1 = data; _i < data_1.length; _i++) { 179 | var row = data_1[_i]; 180 | var arr = row[prop]; 181 | var minMax = this.getFlatArrMinMax(arr); 182 | mins.push(minMax[0]); 
183 | maxs.push(minMax[1]); 184 | } 185 | min = this.getFlatArrMinMax(mins)[0]; 186 | max = this.getFlatArrMinMax(maxs)[1]; 187 | return [min, max]; 188 | }; 189 | Normalizer.prototype.getDistinctVals = function (property, data) { 190 | var count = 0; 191 | var distinctValues = []; 192 | for (var _i = 0, data_2 = data; _i < data_2.length; _i++) { 193 | var row = data_2[_i]; 194 | var val = row[property]; 195 | if (distinctValues.indexOf(val) === -1) { 196 | distinctValues.push(val); 197 | } 198 | } 199 | return distinctValues; 200 | }; 201 | Normalizer.prototype.getDistinctArrayVals = function (property, data) { 202 | var count = 0; 203 | var distinctValues = []; 204 | for (var _i = 0, data_3 = data; _i < data_3.length; _i++) { 205 | var row = data_3[_i]; 206 | var arrVal = row[property]; 207 | for (var _a = 0, arrVal_1 = arrVal; _a < arrVal_1.length; _a++) { 208 | var val = arrVal_1[_a]; 209 | if (distinctValues.indexOf(val) === -1) { 210 | distinctValues.push(val); 211 | } 212 | } 213 | } 214 | return distinctValues; 215 | }; 216 | Normalizer.prototype.numToBit = function (min, max, value) { 217 | var num = (value - min) / (max - min); 218 | return Number((num).toFixed(6)); 219 | }; 220 | Normalizer.prototype.boolToBit = function (val) { 221 | return +val; 222 | }; 223 | Normalizer.prototype.strToBitsArr = function (distinctValues, val) { 224 | var bitArr = new Array(distinctValues.length); 225 | bitArr.fill(0); 226 | for (var i in distinctValues) { 227 | if (val === distinctValues[i]) { 228 | bitArr[i] = 1; 229 | } 230 | } 231 | return bitArr; 232 | }; 233 | Normalizer.prototype.arrToBitsArr = function (distinctValues, vals) { 234 | var bitArr = new Array(distinctValues.length); 235 | bitArr.fill(0); 236 | for (var j in vals) { 237 | var val = vals[j]; 238 | var idx = distinctValues.indexOf(val); 239 | bitArr[idx] = 1; 240 | } 241 | return bitArr; 242 | }; 243 | Normalizer.prototype.distinctProps = function (row) { 244 | return Object.keys(row); 245 | }; 246 | 
Normalizer.prototype.distinctTypes = function (row) { 247 | var distinctTypes = {}; 248 | for (var prop in row) { 249 | var value = row[prop]; 250 | if (typeof value === 'object' && isArray(value)) { 251 | distinctTypes[prop] = 'array'; 252 | } 253 | else if (typeof value === 'object') { 254 | distinctTypes[prop] = 'object'; 255 | } 256 | else { 257 | distinctTypes[prop] = typeof (value); 258 | } 259 | } 260 | return distinctTypes; 261 | }; 262 | Normalizer.prototype.getRow1stValue = function (row) { 263 | return row[Object.keys(row)[0]]; 264 | }; 265 | return Normalizer; 266 | }()); 267 | exports.Normalizer = Normalizer; 268 | -------------------------------------------------------------------------------- /src/normalizer.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Convert any input data to bits representation. 3 | * 4 | * @author Romain Bruckert 5 | */ 6 | export type RowInput = { 7 | [prop: string]: string|number|Array|boolean; 8 | }; 9 | 10 | function isArray(input: any) 11 | { 12 | return ( Object.prototype.toString.call(input) === '[object Array]' ) ? 
true : false; 13 | } 14 | 15 | export class Normalizer 16 | { 17 | private dataset: Array = []; 18 | private datasetMeta: any = null; // training meta data (ranges, min, max, etc) 19 | private binaryInput: Array> = []; 20 | private binaryOutput: Array = []; 21 | private outputProperties: Array = []; 22 | 23 | constructor(data: Array = []) 24 | { 25 | this.dataset = data; 26 | 27 | // prevent empty data input 28 | if (true !== Array.isArray(data)) { 29 | throw new Error('\x1b[37m\x1b[44mNormalizer input data should be an array of rows: [{...}, {...}]\x1b[0m') 30 | } 31 | 32 | // prevent empty data input 33 | if (this.dataset.length <= 0 ) { 34 | throw new Error(`\x1b[37m\x1b[44mNormalizer input data shouldn't be empty\x1b[0m`); 35 | } 36 | 37 | // prevent data rows to contain no properties 38 | if (Object.keys(this.dataset[0]).length <= 0) { 39 | throw new Error(`\x1b[37m\x1b[44mNormalizer input data rows has to contain some properties (only 1st row is checked)\x1b[0m`); 40 | } 41 | } 42 | 43 | getOutputLength() 44 | { 45 | return this.outputProperties.length; 46 | } 47 | 48 | getOutputProperties() 49 | { 50 | return this.outputProperties; 51 | } 52 | 53 | getInputLength() 54 | { 55 | return this.binaryInput[0].length; 56 | } 57 | 58 | getBinaryInputDataset() 59 | { 60 | return this.binaryInput; 61 | } 62 | 63 | getBinaryOutputDataset() 64 | { 65 | return this.binaryOutput; 66 | } 67 | 68 | getDatasetMetaData() 69 | { 70 | return this.datasetMeta; 71 | } 72 | 73 | setDatasetMetaData(metadata: any) 74 | { 75 | this.datasetMeta = metadata; 76 | return this; 77 | } 78 | 79 | convertOutput() 80 | { 81 | const metadata = this.datasetMeta; 82 | } 83 | 84 | normalize() 85 | { 86 | this.datasetMeta = (this.datasetMeta === null) ? 
this.analyzeMetaData() : this.datasetMeta; 87 | 88 | // now loop through data and convert any data to bits 89 | // depending on data type and known settings of metadata 90 | let binaryInput: Array = []; 91 | let binaryOutput: Array = []; 92 | 93 | for (let i in this.dataset) { 94 | const row = this.dataset[i]; 95 | 96 | let index: number = 0; 97 | let inputBits: any = []; 98 | let outputBits: any = []; 99 | 100 | for (let prop in row) { 101 | // skip output properties, they are not in the input dataset 102 | // start turning all data into bits! 103 | let bitsArr: any; 104 | 105 | const value: any = row[prop]; 106 | const meta = this.datasetMeta[prop]; 107 | 108 | switch (meta.type) { 109 | case 'number': 110 | bitsArr = [this.numToBit(meta.min, meta.max, value)]; // scalar to array of 1 length 111 | break; 112 | case 'boolean': 113 | bitsArr = [this.boolToBit(value)]; // scalar to array of 1 length 114 | break; 115 | case 'string': 116 | bitsArr = this.strToBitsArr(meta.distinctValues, value); 117 | break; 118 | case 'array': 119 | bitsArr = this.arrToBitsArr(meta.distinctValues, value); 120 | break; 121 | default: 122 | 123 | break; 124 | } 125 | 126 | if (this.outputProperties.indexOf(prop) > -1) { 127 | outputBits = outputBits.concat(bitsArr); 128 | } else { 129 | inputBits = inputBits.concat(bitsArr); 130 | } 131 | 132 | index++; 133 | } 134 | 135 | if (inputBits.length > 0) { this.binaryInput.push(inputBits) } 136 | if (outputBits.length > 0) { this.binaryOutput.push(outputBits) } 137 | } 138 | 139 | } 140 | 141 | analyzeMetaData(): any 142 | { 143 | // at this point we know that data is not an empty array and 144 | // that the first row contains at least one property (the others should as well) 145 | 146 | // depending on each data row property, find the values data type using only the first row 147 | const firstRow = this.dataset[0]; 148 | const distinctProps = this.distinctProps(firstRow); 149 | const distinctTypes = this.distinctTypes(firstRow); 150 | 151 
| let metadata = {}; 152 | let bitDataset = []; 153 | 154 | for (let prop of distinctProps) { 155 | const type = distinctTypes[prop]; 156 | 157 | metadata[prop] = { 158 | type: type, 159 | min: null, 160 | max: null, 161 | distinctValues: null, 162 | }; 163 | 164 | switch (type) { 165 | case 'number': 166 | // data will be normalize with a number between 0 and 1 167 | const minMax = this.getMinMax(prop, this.dataset); 168 | metadata[prop].min = minMax[0]; 169 | metadata[prop].max = minMax[1]; 170 | break; 171 | case 'boolean': 172 | // data is a simple 0 or 1 bit 173 | metadata[prop].min = 0; 174 | metadata[prop].max = 1; 175 | break; 176 | case 'string': 177 | // data will be normalize in an array of bits which length is equivalent 178 | // to the total number of distinct string values of the whole dataset 179 | const distinctStrVals = this.getDistinctVals(prop, this.dataset); 180 | metadata[prop].distinctValues = distinctStrVals; 181 | break; 182 | case 'array': 183 | const arrMinMax: any = this.get2DimArrayMinMax(prop, this.dataset); 184 | const distinctArrVals = this.getDistinctArrayVals(prop, this.dataset); 185 | 186 | metadata[prop].min = arrMinMax[0] 187 | metadata[prop].max = arrMinMax[1] 188 | metadata[prop].distinctValues = distinctArrVals; 189 | break; 190 | } 191 | } 192 | 193 | return metadata; 194 | } 195 | 196 | setOutputProperties(props: Array) 197 | { 198 | this.outputProperties = props; 199 | return this; 200 | } 201 | 202 | getMinMax(prop: string, data: Array) 203 | { 204 | let min: number = null 205 | let max: number = null 206 | 207 | for (let i in data) { 208 | let val: any = data[i][prop] 209 | 210 | if (min === null || val < min) { min = val } 211 | if (max === null || val > max) { max = val } 212 | } 213 | 214 | return [min, max] 215 | } 216 | 217 | getFlatArrMinMax(arr: Array) 218 | { 219 | let min: number = null 220 | let max: number = null 221 | 222 | if (typeof arr[0] === 'string') { 223 | return [min, max] 224 | } 225 | 226 | for (let 
i in arr) { 227 | if (typeof arr[i] !== 'number') { continue } 228 | let val: number = parseFloat(arr[i]) 229 | 230 | if (min === null || val < min) { min = val } 231 | if (max === null || val > max) { max = val } 232 | } 233 | 234 | return [min, max] 235 | } 236 | 237 | get2DimArrayMinMax(prop: string, data: any) 238 | { 239 | let min: number = null 240 | let max: number = null 241 | 242 | let mins: Array = [] 243 | let maxs: Array = [] 244 | 245 | for(let row of data) { 246 | const arr = row[prop] // this is itself a 1 dim array 247 | 248 | let minMax = this.getFlatArrMinMax(arr) 249 | 250 | mins.push(minMax[0]) 251 | maxs.push(minMax[1]) 252 | } 253 | 254 | min = this.getFlatArrMinMax(mins)[0] 255 | max = this.getFlatArrMinMax(maxs)[1] 256 | 257 | return [min, max] 258 | } 259 | 260 | getDistinctVals(property: string, data: Array) 261 | { 262 | let count = 0; 263 | let distinctValues = []; 264 | 265 | for(let row of data) { 266 | const val = row[property]; 267 | 268 | if (distinctValues.indexOf(val) === -1) { 269 | distinctValues.push(val) 270 | } 271 | } 272 | 273 | return distinctValues 274 | } 275 | 276 | getDistinctArrayVals(property: string, data: Array) 277 | { 278 | let count = 0; 279 | let distinctValues = []; 280 | 281 | for(let row of data) { 282 | const arrVal: any = row[property]; 283 | 284 | for (let val of arrVal) { 285 | if (distinctValues.indexOf(val) === -1) { 286 | distinctValues.push(val); 287 | } 288 | } 289 | } 290 | 291 | return distinctValues; 292 | } 293 | 294 | numToBit(min: number, max: number, value: number): number 295 | { 296 | const num = (value - min) / (max - min); 297 | return Number((num).toFixed(6)); 298 | } 299 | 300 | boolToBit(val: boolean) 301 | { 302 | return + val; 303 | } 304 | 305 | /** 306 | * Turns discint values into unique array of bits to represent them all. 307 | * For example if we have distinct data values of [ 500, 1050, 300, 950 ] 308 | * will will need a 4 length array of bits to represent them all. 
309 | * The 1st value will be [0,0,0,1], the second [0,0,1,0]... and so on. 310 | * The methor 311 | */ 312 | strToBitsArr(distinctValues: any, val: string) 313 | { 314 | let bitArr = new Array(distinctValues.length); 315 | bitArr.fill(0); 316 | 317 | for (let i in distinctValues) { 318 | if (val === distinctValues[i]) { 319 | bitArr[i] = 1; 320 | } 321 | } 322 | 323 | return bitArr; 324 | } 325 | 326 | arrToBitsArr(distinctValues: any, vals: any) 327 | { 328 | let bitArr = new Array(distinctValues.length); 329 | bitArr.fill(0); 330 | 331 | for (let j in vals) { 332 | const val = vals[j]; 333 | let idx = distinctValues.indexOf(val); 334 | bitArr[idx] = 1; 335 | } 336 | 337 | return bitArr; 338 | } 339 | 340 | distinctProps(row: RowInput) 341 | { 342 | return Object.keys(row); 343 | } 344 | 345 | distinctTypes(row: RowInput) 346 | { 347 | let distinctTypes = {}; 348 | 349 | for (let prop in row) { 350 | const value = row[prop]; 351 | 352 | // also check for "real" array or object type 353 | if (typeof value === 'object' && isArray(value)) { 354 | distinctTypes[prop] = 'array'; 355 | } else if (typeof value === 'object') { 356 | distinctTypes[prop] = 'object'; 357 | } else { 358 | distinctTypes[prop] = typeof(value); 359 | } 360 | } 361 | 362 | return distinctTypes; 363 | } 364 | 365 | getRow1stValue(row: RowInput) 366 | { 367 | return row[Object.keys(row)[0]]; 368 | } 369 | } 370 | --------------------------------------------------------------------------------