├── .babelrc ├── qr-code.png ├── .gitignore ├── spec ├── LimitedAccessTest.js ├── assets │ ├── SampleData.js │ ├── csvplugin.js │ └── ExpectedAscii.js ├── QueryLimitedAnalysisTest.js ├── PersistenceTest.js ├── utils │ ├── MongoShell.js │ ├── JsonValidator.js │ └── Tester.js ├── BasicAnalysisTest.js ├── PluginTest.js ├── ContinuousLoggingTest.js ├── MaxDepthAnalysisTest.js ├── DatatypeRecognitionTest.js ├── LimitResultsAnalysisTest.js ├── SortedAnalysisTest.js ├── UnnamedObjectsAnalysisTest.js ├── ExcludeSubkeysTest.js └── ParametersParsingTest.js ├── .travis.yml ├── docker ├── start-script.sh └── Dockerfile.template ├── test.sh ├── .eslintrc.js ├── CHANGELOG ├── package.json ├── variety.js └── README.markdown /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["es2015", "stage-0"] 3 | } 4 | -------------------------------------------------------------------------------- /qr-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xzya/variety/master/qr-code.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | node_modules/ 3 | # intellij idea project files 4 | *.iml 5 | *.ipr 6 | .idea/ 7 | -------------------------------------------------------------------------------- /spec/LimitedAccessTest.js: -------------------------------------------------------------------------------- 1 | describe('Limited access test', () => { 2 | it('TODO: should handle authenticated and unknown users'); 3 | // How to implement? Start another db with auth and configure users? 4 | }); 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - '5.0' 4 | sudo: required 5 | services: docker 6 | env: 7 | matrix: 8 | - MONGODB_VERSION=2.4 9 | - MONGODB_VERSION=2.6 10 | - MONGODB_VERSION=2.8 11 | - MONGODB_VERSION=3.0 12 | - MONGODB_VERSION=3.2 13 | script: 14 | - npm run travis-ci 15 | -------------------------------------------------------------------------------- /spec/assets/SampleData.js: -------------------------------------------------------------------------------- 1 | import { Binary } from 'mongodb'; 2 | 3 | export default [{ 4 | 'name': 'Tom', 5 | 'bio': 'A nice guy.', 6 | 'pets': ['monkey', 'fish'], 7 | 'someWeirdLegacyKey': 'I like Ike!' 8 | }, { 9 | 'name': 'Dick', 10 | 'bio': 'I swordfight.', 11 | 'birthday': new Date(1974,2,14) 12 | }, { 13 | 'name': 'Harry', 14 | 'pets': 'egret', 15 | 'birthday': new Date(1984,2,14) 16 | }, { 17 | 'name': 'Geneviève', 18 | 'bio': 'Ça va?' 19 | }, { 20 | 'name': 'Jim', 21 | 'someBinData': new Binary('1234') //Binary.SUBTYPE_BYTE_ARRAY 22 | }]; 23 | -------------------------------------------------------------------------------- /docker/start-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Start script for testing container 4 | 5 | # start MongoDB with disabled logs and journal 6 | mongod --nojournal --logpath /dev/null & 7 | 8 | # switch to linked sources volume 9 | cd /opt/variety 10 | 11 | # install all dependencies 12 | npm install 13 | 14 | # Wait until the DB is started and responds on selected port 15 | while ! 
curl --silent http://localhost:27017 > /dev/null 2>&1 16 | do 17 | echo "waiting for MongoDB connection" 18 | sleep 1 19 | done 20 | 21 | echo "MongoDB ready on port 27017" 22 | 23 | # start actual tests 24 | npm test 25 | -------------------------------------------------------------------------------- /spec/assets/csvplugin.js: -------------------------------------------------------------------------------- 1 | var getCsv = function(varietyResults) { 2 | var delimiter = this.delimiter || '|'; 3 | var headers = ['key', 'types', 'occurrences', 'percents']; 4 | var table = [headers.join(delimiter)]; 5 | var rows = varietyResults.map(function(key) { 6 | return [key._id.key, Object.keys(key.value.types).sort(), key.totalOccurrences, key.percentContaining].join(delimiter); 7 | }, this); 8 | return table.concat(rows).join('\n'); 9 | }; 10 | 11 | var setConfig = function(pluginConfig) { 12 | this.delimiter = pluginConfig.delimiter; 13 | }; 14 | 15 | module.exports = { 16 | init: setConfig, 17 | formatResults: getCsv 18 | }; 19 | -------------------------------------------------------------------------------- /spec/QueryLimitedAnalysisTest.js: -------------------------------------------------------------------------------- 1 | import Tester from './utils/Tester.js'; 2 | import sampleData from './assets/SampleData'; 3 | const test = new Tester('test', 'users'); 4 | 5 | describe('Query-limited analysis', () => { 6 | 7 | beforeEach(() => test.init(sampleData)); 8 | afterEach(() => test.cleanUp()); 9 | 10 | it('should return only filtered values', async () => { 11 | const results = await test.runJsonAnalysis({collection:'users', query:{birthday:{$exists: true}}}); 12 | results.validateResultsCount(5); 13 | results.validate('_id', 2, 100.0, {ObjectId: 2}); 14 | results.validate('birthday', 2, 100.0, {Date: 2}); 15 | results.validate('name', 2, 100.0, {String: 2}); 16 | results.validate('bio', 1, 50.0, {String: 1}); 17 | results.validate('pets', 1, 50.0, {String: 1}); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /spec/PersistenceTest.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import Tester from './utils/Tester.js'; 3 | import sampleData from './assets/SampleData'; 4 | 5 | const test = new Tester('test', 'users'); 6 | 7 | describe('Persistence of results', () => { 8 | 9 | beforeEach(() => test.init(sampleData)); 10 | afterEach(() => test.cleanUp()); 11 | 12 | it('should persist results into varietyResults DB', async () => { 13 | await test.runAnalysis({collection:'users', persistResults: true}, true); 14 | const db = await test.getDb('varietyResults'); 15 | const arr = await db.collection('usersKeys').find().toArray(); 16 | assert.equal(arr.length, 7); 17 | const keys = arr.map(it => it._id.key); 18 | assert.deepEqual(keys, ['_id', 'name', 'bio', 'birthday', 'pets', 'someBinData', 'someWeirdLegacyKey']); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /spec/assets/ExpectedAscii.js: -------------------------------------------------------------------------------- 1 | export default ` 2 | 3 | +--------------------------------------------------------------------+ 4 | | key | types | occurrences | percents | 5 | | ------------------ | -------------------- | ----------- | -------- | 6 | | _id | ObjectId | 5 | 100.0 | 7 | | name | String | 5 | 100.0 | 8 | | bio | String | 3 | 60.0 | 9 | | birthday | Date | 2 | 40.0 | 10 
| | pets | String (1),Array (1) | 2 | 40.0 | 11 | | someBinData | BinData-generic | 1 | 20.0 | 12 | | someWeirdLegacyKey | String | 1 | 20.0 | 13 | +--------------------------------------------------------------------+ 14 | 15 | `.trim(); 16 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # location of this script 5 | DIR=$(readlink -f $(dirname $0)) 6 | 7 | # Read version info from env property MONGODB_VERSION or use 2.6 as default 8 | VERSION=${MONGODB_VERSION:=2.6} 9 | 10 | # Read Variety.js version from package.json 11 | PACKAGE_VERSION=$(node -p -e "require('./package.json').version") 12 | 13 | echo 14 | echo "****************************************" 15 | echo "* " 16 | echo "* Variety.js version $PACKAGE_VERSION" 17 | echo "* MongoDB version $VERSION" 18 | echo "* $(docker --version)" 19 | echo "* " 20 | echo "****************************************" 21 | echo 22 | 23 | sed -e "s/{MONGODB_VERSION}/$VERSION/g" docker/Dockerfile.template > Dockerfile_$VERSION 24 | 25 | echo "Building docker image for Variety tests..." 26 | 27 | docker build -t variety-$VERSION -f Dockerfile_$VERSION . 28 | docker run -t -v $DIR:/opt/variety variety-$VERSION 29 | 30 | rm Dockerfile_$VERSION 31 | -------------------------------------------------------------------------------- /spec/utils/MongoShell.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | import { exec } from 'child-process-promise'; 4 | 5 | export default async (database, credentials, args, script, quiet, port) => { 6 | const commands = ['mongo']; 7 | 8 | commands.push('--port'); 9 | commands.push(port); 10 | 11 | if (database) { 12 | commands.push(database); 13 | } 14 | if (quiet) { 15 | commands.push('--quiet'); 16 | } 17 | 18 | if (credentials) { 19 | commands.push('--username'); 20 | commands.push(credentials.username); 21 | commands.push('--password'); 22 | commands.push(credentials.password); 23 | commands.push('--authenticationDatabase'); 24 | commands.push(credentials.authDatabase); 25 | } 26 | 27 | if (args) { 28 | commands.push('--eval'); 29 | commands.push(args); 30 | } 31 | 32 | if (script) { 33 | commands.push(script); 34 | } 35 | 36 | const result = await exec(commands.join(' ')); 37 | return result.stdout.trim(); 38 | }; 39 | -------------------------------------------------------------------------------- /docker/Dockerfile.template: -------------------------------------------------------------------------------- 1 | FROM mongo:{MONGODB_VERSION} 2 | 3 | RUN apt-get -qq update 4 | RUN apt-get install -y --force-yes --no-install-recommends curl 5 | 6 | # This is the recommended installation of node 7 | # See: https://nodejs.org/en/download/package-manager/#debian-and-ubuntu-based-linux-distributions 8 | # RUN curl -sL https://deb.nodesource.com/setup_5.x | bash - 9 | # RUN apt-get install -y --force-yes --no-install-recommends nodejs 10 | 11 | # To speed up the installation, we skip packages and download directly node archive 12 | # Version of node is determinded by Heroku's API https://semver.io/node/stable 13 | RUN NODE_VERSION=$(curl -sk https://semver.io/node/stable) \ 14 | && curl -SLO "http://nodejs.org/dist/v$NODE_VERSION/node-v$NODE_VERSION-linux-x64.tar.gz" \ 15 | && tar -xzf "node-v$NODE_VERSION-linux-x64.tar.gz" -C /usr/local --strip-components=1 \ 16 | && rm "node-v$NODE_VERSION-linux-x64.tar.gz" 17 
| 18 | ENTRYPOINT ["/opt/variety/docker/start-script.sh"] 19 | -------------------------------------------------------------------------------- /spec/utils/JsonValidator.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | import { equal, deepEqual } from 'assert'; 4 | 5 | export default class JsonValidator { 6 | constructor(results) { 7 | this.results = results; 8 | } 9 | 10 | validate(key, totalOccurrences, percentContaining, types) { 11 | const row = this.results.filter(item => item._id.key === key)[0]; 12 | if(typeof row === 'undefined') { 13 | throw new Error(`Key '${key}' not present in results. Known keys are: [${this.results.map(item => item._id.key).join(',')}].`); 14 | } 15 | equal(row.totalOccurrences, totalOccurrences, `TotalOccurrences of key ${key} does not match`); 16 | equal(row.percentContaining, percentContaining, `PercentContaining of key ${key} does not match`); 17 | deepEqual(row.value.types, types, `Types of key ${key} do not match`); 18 | } 19 | 20 | validateResultsCount(count) { 21 | equal(this.results.length, count, `Total count of results does not match expected count. Known keys are: [${this.results.map(item => item._id.key).join(',')}].`); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "parser": "babel-eslint", 3 | "env": { 4 | "mongo": true, 5 | "node": true, 6 | "es6": true, 7 | "mocha": true 8 | }, 9 | "extends": "eslint:recommended", 10 | "rules": { 11 | "indent": [ 12 | 2, 13 | 2 14 | ], 15 | "linebreak-style": [ 16 | "error", 17 | "unix" 18 | ], 19 | "quotes": [ 20 | "error", 21 | "single" 22 | ], 23 | "semi": [ 24 | "error", 25 | "always" 26 | ], 27 | "brace-style": [ 28 | 2, 29 | "1tbs", 30 | { "allowSingleLine": true } 31 | ] 32 | }, 33 | "globals": { 34 | "__quiet": false, 35 | "slaveOk": false, 36 | "collection": false, 37 | "DBQuery": false, 38 | "BinData": false, 39 | "NumberLong": false, 40 | "tojson": false 41 | 42 | }, 43 | "parserOptions": { 44 | "sourceType": "module" 45 | } 46 | }; 47 | -------------------------------------------------------------------------------- /spec/BasicAnalysisTest.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import Tester from './utils/Tester.js'; 3 | const test = new Tester('test', 'users'); 4 | 5 | import sampleData from './assets/SampleData'; 6 | import expectedAscii from './assets/ExpectedAscii'; 7 | 8 | describe('Basic Analysis', () => { 9 | 10 | beforeEach(() => test.init(sampleData)); 11 | afterEach(() => test.cleanUp()); 12 | 13 | it('should return ASCII results', async () => { 14 | const output = await test.runAnalysis({collection:'users'}, true); 15 | assert.equal(output, expectedAscii); 16 | }); 17 | 18 | it('should return JSON results', async () => { 19 | const results = await test.runJsonAnalysis({collection:'users'}, true); 20 | results.validateResultsCount(7); 21 | results.validate('_id', 5, 100.0, {ObjectId: 5}); 22 | results.validate('name', 5, 100.0, {String: 5}); 23 | results.validate('bio', 3, 60.0, {String: 3}); 24 | results.validate('birthday', 2, 40.0, {Date: 2}); 25 | results.validate('pets', 2, 40.0, {String: 1, Array: 1}); 26 | results.validate('someBinData', 1, 20.0, {'BinData-generic': 1}); 27 | results.validate('someWeirdLegacyKey', 1, 20.0, {String: 1}); 28 | }); 29 | }); 30 | 
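The plugin test that follows (spec/PluginTest.js) exercises Variety's plugin contract: a plugin is a plain module exporting optional `init` and `formatResults` functions, loaded through the `plugins` parameter in the form `path/to/plugin.js|key=value&other=value`. For illustration only, here is a minimal sketch of another output plugin following the same contract as spec/assets/csvplugin.js; the Markdown formatting and the `digits` option are hypothetical, not part of Variety.

```
// Hypothetical Markdown-table formatter, mirroring the contract of spec/assets/csvplugin.js.
// formatResults() receives the varietyResults array produced by variety.js:
// [{_id: {key}, value: {types}, totalOccurrences, percentContaining}, ...]
var config = {};

var formatMarkdown = function(varietyResults) {
  var rows = varietyResults.map(function(row) {
    var types = Object.keys(row.value.types).sort().join(', ');
    var digits = config.digits ? Number(config.digits) : 1;
    return '| ' + [row._id.key, types, row.totalOccurrences, row.percentContaining.toFixed(digits)].join(' | ') + ' |';
  });
  return ['| key | types | occurrences | percents |', '| --- | --- | --- | --- |'].concat(rows).join('\n');
};

module.exports = {
  init: function(pluginConfig) { config = pluginConfig; }, // receives the parsed |key=value options
  formatResults: formatMarkdown                            // replaces the default ASCII/JSON output
};
```

Such a plugin would be passed the same way the CSV plugin is passed in the test below, e.g. `plugins: '/path/to/markdownplugin.js|digits=2'` (the path is hypothetical).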
-------------------------------------------------------------------------------- /spec/PluginTest.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import Tester from './utils/Tester.js'; 3 | import { resolve, join } from 'path'; 4 | import sampleData from './assets/SampleData'; 5 | 6 | const test = new Tester('test', 'users'); 7 | 8 | const expectedOutput = ` 9 | key|types|occurrences|percents 10 | _id|ObjectId|5|100 11 | name|String|5|100 12 | bio|String|3|60 13 | birthday|Date|2|40 14 | pets|Array,String|2|40 15 | someBinData|BinData-generic|1|20 16 | someWeirdLegacyKey|String|1|20 17 | `.trim(); 18 | 19 | const getPluginPath = () => resolve(join(__dirname , 'assets', 'csvplugin.js')); 20 | 21 | describe('Plugins', () => { 22 | 23 | beforeEach(() => test.init(sampleData)); 24 | afterEach(() => test.cleanUp()); 25 | 26 | it('should load plugin and modify output', async () => { 27 | const output = await test.runAnalysis({collection:'users', plugins: getPluginPath()}, true); 28 | assert.equal(output, expectedOutput); 29 | }); 30 | 31 | it('should read additional plugin params', async () => { 32 | const output = await test.runAnalysis({collection:'users', plugins: getPluginPath() + '|delimiter=;'}, true); 33 | const expectedWithSeparator = expectedOutput.replace(/\|/g, ';'); 34 | assert.equal(output, expectedWithSeparator); 35 | }); 36 | 37 | }); 38 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | (14 May 2015) Version 1.5.0: Introduced basic plugin infrastructure, 'onConfig' and 'formatResults' hooks 2 | 3 | (14 Oct 2014) Version 1.4.1: @todvora's fix for maxDepth (matches readme), and fixes for incorrect counts while using query/limit 4 | 5 | (13 Oct 2014) Version 1.4.0: @todvora's fix for nested objects 6 | 7 | (30 May 2014) Version 1.3.0: @Jacob111's refactoring and tests from @todvora. Thanks! 8 | 9 | (28 March 2014) Version 1.2.6: @jamescropcho has made variety.js pass JSHint analysis. 10 | 11 | (21 February 2014) Version 1.2.5: Thanks to @nitindhar7: Adds a sort option, so you can be more choosy with your limit. 12 | 13 | (05 December 2013) Version 1.2.4: Thanks to @jmargeta: Bugfix for newer mongo versions that don't support the JSON object. 14 | 15 | (01 September 2013) Version 1.2.3: Thanks to @rugbyhead: Now handles query-based filtering. 16 | 17 | (03 November 2012) Version 1.2.2: Thanks to @kmcgrath: Added support for db_name, allowing people using admin authentication to specify a db name to use, since they must connect to the admin db. 18 | 19 | (29 July 2012) Version 1.2.1: Fixed bug with occurrence calculation. 20 | 21 | (28 July 2012) Version 1.2: Removed map/reduce to avoid certain bugs, which also simplified some of the calculations. 22 | 23 | (03 June 2012) Thanks to @wfreeman, we now handle MaxDepth, and other goodies. 24 | 25 | (25 May 2012) Thanks to @wfreeman, we now recognize the Date, ObjectID and BinData types. 
26 | -------------------------------------------------------------------------------- /spec/ContinuousLoggingTest.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import Tester from './utils/Tester.js'; 3 | import sampleData from './assets/SampleData'; 4 | 5 | const test = new Tester('test', 'users'); 6 | 7 | const pattern = /^Found new key type "(.{1,})" type "(.{1,})"$/g; 8 | const expectedLines = [ 9 | 'Found new key type "_id" type "ObjectId"', 10 | 'Found new key type "name" type "String"', 11 | 'Found new key type "someBinData" type "BinData-generic"', 12 | 'Found new key type "bio" type "String"', 13 | 'Found new key type "pets" type "String"', 14 | 'Found new key type "birthday" type "Date"', 15 | 'Found new key type "pets" type "Array"', 16 | 'Found new key type "pets.XX" type "String"', 17 | 'Found new key type "someWeirdLegacyKey" type "String"' 18 | ]; 19 | 20 | 21 | 22 | describe('Continuous logging', async () => { 23 | 24 | beforeEach(() => test.init(sampleData)); 25 | afterEach(() => test.cleanUp()); 26 | 27 | it('should log every new key', async () => { 28 | const output = await test.runAnalysis({collection:'users', logKeysContinuously:true}); 29 | var filteredOutput = output 30 | .split('\n') 31 | .filter(line => line.match(pattern)); 32 | assert.equal(filteredOutput.length, expectedLines.length); 33 | expectedLines.forEach(expectedLine => { 34 | const found = filteredOutput.indexOf(expectedLine) > -1; 35 | assert.ok(found, `Expected line '${expectedLine}' not found in Variety output`); 36 | }); 37 | }); 38 | }); 39 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "variety", 3 | "version": "1.5.0", 4 | "description": "A schema analyzer for MongoDB", 5 | "main": "variety.js", 6 | "directories": { 7 | "test": "test" 8 | }, 9 | "scripts": { 10 | "lint": "node_modules/.bin/eslint variety.js spec", 11 | "lint:fix": "node_modules/.bin/eslint variety.js spec --fix", 12 | "test": "node_modules/.bin/mocha --compilers js:babel-core/register --require babel-polyfill --recursive --reporter spec --timeout 15000 spec", 13 | "test:docker": "./test.sh", 14 | "travis-ci": "npm run lint && npm run test:docker" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/variety/variety.git" 19 | }, 20 | "author": "James Cropcho (https://twitter.com/Cropcho)", 21 | "contributors": [ 22 | "James Cropcho (https://twitter.com/Cropcho)", 23 | "Eve Freeman (https://twitter.com/wefreema)", 24 | "Tomas Dvorak (http://www.tomas-dvorak.cz/)" 25 | ], 26 | "license": "MIT", 27 | "bugs": { 28 | "url": "https://github.com/variety/variety/issues" 29 | }, 30 | "homepage": "https://github.com/variety/variety#readme", 31 | "devDependencies": { 32 | "babel-core": "^6.7.2", 33 | "babel-eslint": "^6.0.2", 34 | "babel-polyfill": "^6.7.4", 35 | "babel-preset-es2015": "^6.6.0", 36 | "babel-preset-stage-0": "^6.5.0", 37 | "child-process-promise": "^1.1.0", 38 | "eslint": "^2.4.0", 39 | "mocha": "^2.4.5", 40 | "mongodb": "^2.1.7" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spec/MaxDepthAnalysisTest.js: -------------------------------------------------------------------------------- 1 | import Tester from './utils/Tester.js'; 2 | const test = new Tester('test', 'users'); 3 | 4 | describe('Max-depth-limited analysis', () 
=> { 5 | 6 | beforeEach(() => test.init([{name:'Walter', someNestedObject:{a:{b:{c:{d:{e:1}}}}}}])); 7 | afterEach(() => test.cleanUp()); 8 | 9 | it('should return all keys', async () => { 10 | const results = await test.runJsonAnalysis({collection:'users'}); 11 | 12 | results.validateResultsCount(8); 13 | 14 | results.validate('_id', 1, 100.0, {ObjectId:1}); 15 | results.validate('name', 1, 100.0, {String:1}); 16 | results.validate('someNestedObject', 1, 100.0, {Object:1}); 17 | results.validate('someNestedObject.a', 1, 100.0, {Object:1}); 18 | results.validate('someNestedObject.a.b', 1, 100.0, {Object:1}); 19 | results.validate('someNestedObject.a.b.c', 1, 100.0, {Object:1}); 20 | results.validate('someNestedObject.a.b.c.d', 1, 100.0, {Object:1}); 21 | results.validate('someNestedObject.a.b.c.d.e', 1, 100.0, {Number:1}); 22 | }); 23 | 24 | it('should return only first 3 levels', async () => { 25 | const results = await test.runJsonAnalysis({collection:'users', maxDepth:3}); 26 | 27 | results.validateResultsCount(5); 28 | 29 | results.validate('_id', 1, 100.0, {ObjectId:1}); 30 | results.validate('name', 1, 100.0, {String:1}); 31 | results.validate('someNestedObject', 1, 100.0, {Object:1}); 32 | results.validate('someNestedObject.a', 1, 100.0, {Object:1}); 33 | results.validate('someNestedObject.a.b', 1, 100.0, {Object:1}); 34 | }); 35 | }); 36 | -------------------------------------------------------------------------------- /spec/DatatypeRecognitionTest.js: -------------------------------------------------------------------------------- 1 | import { Binary } from 'mongodb'; 2 | import { Long } from 'mongodb'; 3 | import Tester from './utils/Tester.js'; 4 | const test = new Tester('test', 'users'); 5 | 6 | const crazyObject = { 7 | key_string: 'Just plain String', 8 | key_boolean: true, 9 | key_number: 1, 10 | key_date: new Date(), 11 | 'key_binData-generic': new Binary('1234'), // TODO: how to create other bin-data types? 12 | key_array: [], 13 | key_object: {}, 14 | key_null: null, 15 | key_long: Long.fromString('4611686018427387904') 16 | }; 17 | 18 | describe('Data type recognition', () => { 19 | 20 | beforeEach(() => test.init([crazyObject])); 21 | afterEach(() => test.cleanUp()); 22 | 23 | it('should recognize all supported data types', async () => { 24 | const results = await test.runJsonAnalysis({collection:'users'}, true); 25 | results.validateResultsCount(10); 26 | results.validate('_id', 1, 100.0, {ObjectId: 1}); 27 | results.validate('key_string', 1, 100.0, {String: 1}); 28 | results.validate('key_boolean', 1, 100.0, {Boolean: 1}); 29 | results.validate('key_number', 1, 100.0, {Number: 1}); 30 | results.validate('key_date', 1, 100.0, {Date: 1}); 31 | results.validate('key_binData-generic', 1, 100.0, {'BinData-generic': 1}); 32 | results.validate('key_array', 1, 100.0, {Array: 1}); 33 | results.validate('key_object', 1, 100.0, {Object: 1}); 34 | results.validate('key_null', 1, 100.0, {null: 1}); // TODO: why has 'null' first letter lowercase, unlike all other types? 
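    // Note: unlike the primitive types above, whose names varietyTypeOf() builds by capitalizing
    // the result of `typeof`, null is matched explicitly in the object branch of varietyTypeOf()
    // and returned as the literal string 'null', which is why this one type name stays lowercase.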
35 | results.validate('key_long', 1, 100.0, {NumberLong: 1}); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /spec/LimitResultsAnalysisTest.js: -------------------------------------------------------------------------------- 1 | import Tester from './utils/Tester.js'; 2 | import sampleData from './assets/SampleData'; 3 | 4 | const test = new Tester('test', 'users'); 5 | 6 | describe('Limited results count analysis', () => { 7 | 8 | beforeEach(() => test.init(sampleData)); 9 | afterEach(() => test.cleanUp()); 10 | 11 | it('should analyze only first item', async () => { 12 | // limit=1 without other params selects the last inserted document (see sampleData) 13 | // it should equals {name: "Jim", someBinData: new BinData(2,"1234")} 14 | const results = await test.runJsonAnalysis({collection:'users', limit:1}); 15 | results.validate('_id', 1, 100.0, {ObjectId:1}); 16 | results.validate('name', 1, 100.0, {String:1}); 17 | results.validate('someBinData', 1, 100.0, {'BinData-generic':1}); 18 | }); 19 | 20 | it('should analyze all and compute real percentages', async () => { 21 | const results = await test.runJsonAnalysis({collection:'users', limit:10}); 22 | // limit is set to higher number, that the actual number of documents in collection 23 | // analysis should compute percentages based on the real number of documents, not on the 24 | // number provided in the limit var. 25 | results.validateResultsCount(7); 26 | results.validate('_id', 5, 100.0, {ObjectId: 5}); 27 | results.validate('name', 5, 100.0, {String: 5}); 28 | results.validate('bio', 3, 60.0, {String: 3}); 29 | results.validate('birthday', 2, 40.0, {Date: 2}); 30 | results.validate('pets', 2, 40.0, {String: 1, Array: 1}); 31 | results.validate('someBinData', 1, 20.0, {'BinData-generic': 1}); 32 | results.validate('someWeirdLegacyKey', 1, 20.0, {String: 1}); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /spec/SortedAnalysisTest.js: -------------------------------------------------------------------------------- 1 | import Tester from './utils/Tester.js'; 2 | import sampleData from './assets/SampleData'; 3 | 4 | const test = new Tester('test', 'users'); 5 | 6 | describe('Sorted-data analysis', () => { 7 | 8 | beforeEach(() => test.init(sampleData)); 9 | afterEach(() => test.cleanUp()); 10 | 11 | it('should not exclude any results', async () => { 12 | const results = await test.runJsonAnalysis({collection:'users', sort:{name:-1}}); 13 | results.validateResultsCount(7); 14 | results.validate('_id', 5, 100.0, {ObjectId: 5}); 15 | results.validate('name', 5, 100.0, {String: 5}); 16 | results.validate('bio', 3, 60.0, {String: 3}); 17 | results.validate('birthday', 2, 40.0, {Date: 2}); 18 | results.validate('pets', 2, 40.0, {String: 1, Array: 1}); 19 | results.validate('someBinData', 1, 20.0, {'BinData-generic': 1}); 20 | results.validate('someWeirdLegacyKey', 1, 20.0, {String: 1}); 21 | }); 22 | 23 | it('should sort and apply limit', async () => { 24 | const criteria = { 25 | collection:'users', 26 | sort:{name:-1}, 27 | limit:1 28 | }; 29 | 30 | // when sorting default SampleData by name desc, first entry becomes Tom. He is only with key 'someWeirdLegacyKey' 31 | // Together with applying limit 1, Tom is the only result in analysis. That gives us chance to assume keys and verify 32 | // that ordering is correct. 
33 | // {name: "Tom", bio: "A nice guy.", pets: ["monkey", "fish"], someWeirdLegacyKey: "I like Ike!"} 34 | const results = await test.runJsonAnalysis(criteria); 35 | results.validateResultsCount(5); 36 | results.validate('_id', 1, 100.0, {ObjectId: 1}); 37 | results.validate('name', 1, 100.0, {String: 1}); 38 | results.validate('bio', 1, 100.0, {String: 1}); 39 | results.validate('pets', 1, 100.0, {Array: 1}); 40 | results.validate('someWeirdLegacyKey', 1, 100.0, {String: 1}); 41 | }); 42 | }); 43 | -------------------------------------------------------------------------------- /spec/utils/Tester.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | import { resolve, join } from 'path'; 4 | import { MongoClient } from 'mongodb'; 5 | import execute from './MongoShell'; 6 | import JsonValidator from './JsonValidator'; 7 | 8 | const mongodb_port = process.env.MONGODB_PORT || 27017; 9 | const default_url = `mongodb://localhost:${mongodb_port}/test?autoReconnect=true`; 10 | 11 | export default class Tester { 12 | constructor(databaseName, collectionName) { 13 | this.databaseName = databaseName; 14 | this.collectionName = collectionName; 15 | } 16 | 17 | async connect() { 18 | const connection = await MongoClient.connect(default_url); 19 | this.connection = connection; 20 | this.coll = connection.db(this.databaseName).collection(this.collectionName); 21 | return connection; 22 | } 23 | 24 | async init(initialData) { 25 | var connection = await this.connect(); 26 | await this.coll.deleteMany(); 27 | await this.coll.insertMany(initialData); 28 | return connection; 29 | } 30 | 31 | async cleanUp() { 32 | await this.coll.deleteMany(); 33 | await this.connection.close(); 34 | } 35 | 36 | getDb(dbName) { 37 | return this.connection.db(dbName); 38 | } 39 | 40 | getVarietyPath() { 41 | return resolve(join(__dirname , '..', '..', 'variety.js')); 42 | } 43 | 44 | async runJsonAnalysis(options) { 45 | options.outputFormat = 'json'; 46 | const result = await this.runAnalysis(options, true); 47 | return new JsonValidator(JSON.parse(result)); 48 | } 49 | 50 | 51 | runAnalysis(options, quiet) { 52 | let str = []; 53 | if(options) { 54 | for(let key in options) { 55 | let value = JSON.stringify(options[key]).replace(/"/g, '\'').replace(/\$/g, '\\$'); 56 | str.push(`var ${key}=${value}`); 57 | } 58 | } 59 | return execute(this.database, null, '"' + str.join(';') + '"', this.getVarietyPath(), quiet, mongodb_port); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spec/UnnamedObjectsAnalysisTest.js: -------------------------------------------------------------------------------- 1 | import Tester from './utils/Tester.js'; 2 | const test = new Tester('test', 'users'); 3 | 4 | const sampleData = [ 5 | {title:'Article 1', comments:[{author:'John', body:'it works', visible:true, '123key': '123value' }]}, 6 | {title:'Article 2', comments:[{author:'Tom', body:'thanks'}, {author:'Mark', body:1}]} 7 | ]; 8 | 9 | // Test, how variety handles objects, that are not named (for example objects inside array). 
10 | // It addresses behavior described in issue https://github.com/variety/variety/issues/29 11 | 12 | describe('Unnamed object analysis', () => { 13 | 14 | beforeEach(() => test.init(sampleData)); 15 | afterEach(() => test.cleanUp()); 16 | 17 | it('should handle keys of unnamed object', async () => { 18 | const results = await test.runJsonAnalysis({collection:'users'}, true); 19 | results.validateResultsCount(7); 20 | results.validate('_id', 2, 100.0, {ObjectId: 2}); 21 | results.validate('title', 2, 100.0, {String: 2}); 22 | results.validate('comments', 2, 100.0, {Array: 2}); 23 | 24 | // unnamed objects are prefixed with .XX key 25 | results.validate('comments.XX.author', 2, 100.0, {String: 2}); 26 | results.validate('comments.XX.body', 2, 100.0, {String: 2, Number:1}); 27 | results.validate('comments.XX.visible', 1, 50.0, {Boolean: 1}); 28 | results.validate('comments.XX.123key', 1, 50.0, {String: 1}); 29 | }); 30 | 31 | it('should use different array escape key', async () => { 32 | const results = await test.runJsonAnalysis({collection:'users', arrayEscape:'YY'}, true); 33 | results.validateResultsCount(7); 34 | // unnamed objects are prefixed with .YY key 35 | results.validate('comments.YY.author', 2, 100.0, {String: 2}); 36 | results.validate('comments.YY.body', 2, 100.0, {String: 2, Number:1}); 37 | results.validate('comments.YY.visible', 1, 50.0, {Boolean: 1}); 38 | results.validate('comments.YY.123key', 1, 50.0, {String: 1}); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /spec/ExcludeSubkeysTest.js: -------------------------------------------------------------------------------- 1 | import Tester from './utils/Tester.js'; 2 | const test = new Tester('test', 'users'); 3 | 4 | const sampleData = [ 5 | {name:'Walter', someNestedObject:{a:{b:{c:{d:{e:1}}}}}, otherNestedObject:{a:{b:{c:{d:{e:1}}}}}} 6 | ]; 7 | 8 | describe('Exclude subkeys', () => { 9 | 10 | beforeEach(() => test.init(sampleData)); 11 | afterEach(() => test.cleanUp()); 12 | 13 | it('should exclude some subkeys', async () => { 14 | const results = await test.runJsonAnalysis({collection:'users',excludeSubkeys:['someNestedObject.a.b']}, true); 15 | 16 | results.validateResultsCount(11); 17 | results.validate('_id', 1, 100.0, {ObjectId: 1}); 18 | results.validate('name', 1, 100.0, {String: 1}); 19 | results.validate('someNestedObject', 1, 100.0, {Object: 1}); 20 | results.validate('someNestedObject.a', 1, 100.0, {Object: 1}); 21 | results.validate('someNestedObject.a.b', 1, 100.0, {Object: 1}); 22 | // no more descendants of someNestedObject.a.b, they are excluded 23 | 24 | results.validate('otherNestedObject', 1, 100.0, {Object: 1}); 25 | results.validate('otherNestedObject.a', 1, 100.0, {Object: 1}); 26 | results.validate('otherNestedObject.a.b', 1, 100.0, {Object: 1}); 27 | results.validate('otherNestedObject.a.b.c', 1, 100.0, {Object: 1}); 28 | results.validate('otherNestedObject.a.b.c.d', 1, 100.0, {Object: 1}); 29 | results.validate('otherNestedObject.a.b.c.d.e', 1, 100.0, {Number: 1}); 30 | }); 31 | 32 | it('should exclude some subkeys excluding root', async () => { 33 | const results = await test.runJsonAnalysis({collection:'users',excludeSubkeys:['someNestedObject']}, true); 34 | 35 | results.validateResultsCount(9); 36 | results.validate('_id', 1, 100.0, {ObjectId: 1}); 37 | results.validate('name', 1, 100.0, {String: 1}); 38 | results.validate('someNestedObject', 1, 100.0, {Object: 1}); 39 | // no more descendants of someNestedObject, they are excluded 
40 | 41 | results.validate('otherNestedObject', 1, 100.0, {Object: 1}); 42 | results.validate('otherNestedObject.a', 1, 100.0, {Object: 1}); 43 | results.validate('otherNestedObject.a.b', 1, 100.0, {Object: 1}); 44 | results.validate('otherNestedObject.a.b.c', 1, 100.0, {Object: 1}); 45 | results.validate('otherNestedObject.a.b.c.d', 1, 100.0, {Object: 1}); 46 | results.validate('otherNestedObject.a.b.c.d.e', 1, 100.0, {Number: 1}); 47 | }); 48 | }); 49 | -------------------------------------------------------------------------------- /spec/ParametersParsingTest.js: -------------------------------------------------------------------------------- 1 | import assert from 'assert'; 2 | import Tester from './utils/Tester.js'; 3 | import sampleData from './assets/SampleData'; 4 | 5 | const test = new Tester('test', 'users'); 6 | 7 | const parseParams = (output) => { 8 | return output 9 | .split('\n') // split by new line 10 | .filter(line => line.indexOf('Using') === 0) // take only lines starting with Using 11 | .map(line => /^Using\s{1}(\w+)\s{1}of\s{1}(.*)$/.exec(line)) // parse with regular expression 12 | .reduce((acc, match) => {acc[match[1]] = JSON.parse(match[2]); return acc;}, {}); // reduce to params object 13 | }; 14 | 15 | describe('Parameters parsing', () => { 16 | 17 | beforeEach(() => test.init(sampleData)); 18 | afterEach(() => test.cleanUp()); 19 | 20 | it('should parse default params', async () => { 21 | const results = await test.runAnalysis({collection:'users'}); 22 | const params = parseParams(results); 23 | assert.equal(params.collection, 'users'); 24 | assert.deepEqual(params.query, {}); 25 | assert.equal(params.limit, 5); 26 | assert.equal(params.maxDepth, 99); 27 | assert.deepEqual(params.sort, {'_id':-1}); 28 | assert.equal(params.outputFormat, 'ascii'); 29 | assert.equal(params.persistResults, false); 30 | assert.equal(params.resultsDatabase, 'varietyResults'); 31 | assert.equal(params.resultsCollection, 'usersKeys'); 32 | assert.equal(params.resultsUser, null); 33 | assert.equal(params.resultsPass, null); 34 | assert.deepEqual(params.plugins, []); 35 | }); 36 | 37 | it('should parse restricted results', async () => { 38 | 39 | const criteria = { 40 | collection:'users', 41 | query: {name:'Harry'}, 42 | sort: {name:1}, 43 | maxDepth: 5, 44 | limit: 2 45 | }; 46 | 47 | const results = await test.runAnalysis(criteria); 48 | const params = parseParams(results); 49 | assert.equal(params.limit, 2); 50 | assert.equal(params.maxDepth, 5); 51 | assert.deepEqual(params.sort, {name:1}); 52 | assert.deepEqual(params.query, {name:'Harry'}); 53 | }); 54 | 55 | it('should recognize unknown collection', async (done) => { 56 | try { 57 | await test.runAnalysis({collection:'--unknown--'}); 58 | done(new Error('Should throw an exception!')); 59 | } catch(err) { 60 | assert.ok(err.code > 0); 61 | assert.ok(err.stdout.indexOf('The collection specified (--unknown--) in the database specified (test) does not exist or is empty.') > -1); 62 | done(); 63 | } 64 | }); 65 | 66 | }); 67 | -------------------------------------------------------------------------------- /variety.js: -------------------------------------------------------------------------------- 1 | /* Variety: A MongoDB Schema Analyzer 2 | 3 | This tool helps you get a sense of your application's schema, as well as any 4 | outliers to that schema. Particularly useful when you inherit a codebase with 5 | data dump and want to quickly learn how the data's structured. Also useful for 6 | finding rare keys. 
7 | 8 | Please see https://github.com/variety/variety for details. 9 | 10 | Released by Maypop Inc, © 2012-2016, under the MIT License. */ 11 | 12 | (function () { 13 | 'use strict'; // wraps everything for which we can use strict mode -JC 14 | 15 | var log = function(message) { 16 | if(!__quiet) { // mongo shell param, coming from https://github.com/mongodb/mongo/blob/5fc306543cd3ba2637e5cb0662cc375f36868b28/src/mongo/shell/dbshell.cpp#L624 17 | print(message); 18 | } 19 | }; 20 | 21 | log('Variety: A MongoDB Schema Analyzer'); 22 | log('Version 1.5.0, released 14 May 2015'); 23 | 24 | var dbs = []; 25 | var emptyDbs = []; 26 | 27 | if (typeof slaveOk !== 'undefined') { 28 | if (slaveOk === true) { 29 | db.getMongo().setSlaveOk(); 30 | } 31 | } 32 | 33 | var knownDatabases = db.adminCommand('listDatabases').databases; 34 | if(typeof knownDatabases !== 'undefined') { // not authorized user receives error response (json) without databases key 35 | knownDatabases.forEach(function(d){ 36 | if(db.getSisterDB(d.name).getCollectionNames().length > 0) { 37 | dbs.push(d.name); 38 | } 39 | if(db.getSisterDB(d.name).getCollectionNames().length === 0) { 40 | emptyDbs.push(d.name); 41 | } 42 | }); 43 | 44 | if (emptyDbs.indexOf(db.getName()) !== -1) { 45 | throw 'The database specified ('+ db +') is empty.\n'+ 46 | 'Possible database options are: ' + dbs.join(', ') + '.'; 47 | } 48 | 49 | if (dbs.indexOf(db.getName()) === -1) { 50 | throw 'The database specified ('+ db +') does not exist.\n'+ 51 | 'Possible database options are: ' + dbs.join(', ') + '.'; 52 | } 53 | } 54 | 55 | var collNames = db.getCollectionNames().join(', '); 56 | if (typeof collection === 'undefined') { 57 | throw 'You have to supply a \'collection\' variable, à la --eval \'var collection = "animals"\'.\n'+ 58 | 'Possible collection options for database specified: ' + collNames + '.\n'+ 59 | 'Please see https://github.com/variety/variety for details.'; 60 | } 61 | 62 | if (db.getCollection(collection).count() === 0) { 63 | throw 'The collection specified (' + collection + ') in the database specified ('+ db +') does not exist or is empty.\n'+ 64 | 'Possible collection options for database specified: ' + collNames + '.'; 65 | } 66 | 67 | var readConfig = function(configProvider) { 68 | var config = {}; 69 | var read = function(name, defaultValue) { 70 | var value = typeof configProvider[name] !== 'undefined' ? configProvider[name] : defaultValue; 71 | config[name] = value; 72 | log('Using '+name+' of ' + tojson(value)); 73 | }; 74 | read('collection', null); 75 | read('query', {}); 76 | read('limit', db.getCollection(config.collection).find(config.query).count()); 77 | read('maxDepth', 99); 78 | read('sort', {_id: -1}); 79 | read('outputFormat', 'ascii'); 80 | read('persistResults', false); 81 | read('resultsDatabase', 'varietyResults'); 82 | read('resultsCollection', collection + 'Keys'); 83 | read('resultsUser', null); 84 | read('resultsPass', null); 85 | read('logKeysContinuously', false); 86 | read('excludeSubkeys', []); 87 | read('arrayEscape', 'XX'); 88 | 89 | //Translate excludeSubkeys to set like object... using an object for compatibility... 90 | config.excludeSubkeys = config.excludeSubkeys.reduce(function (result, item) { result[item+'.'] = true; return result; }, {}); 91 | 92 | return config; 93 | }; 94 | 95 | var config = readConfig(this); 96 | 97 | var PluginsClass = function(context) { 98 | var parsePath = function(val) { return val.slice(-3) !== '.js' ? 
val + '.js' : val;}; 99 | var parseConfig = function(val) { 100 | var config = {}; 101 | val.split('&').reduce(function(acc, val) { 102 | var parts = val.split('='); 103 | acc[parts[0]] = parts[1]; 104 | return acc; 105 | }, config); 106 | return config; 107 | }; 108 | 109 | if(typeof context.plugins !== 'undefined') { 110 | this.plugins = context.plugins.split(',') 111 | .map(function(path){return path.trim();}) 112 | .map(function(definition){ 113 | var path = parsePath(definition.split('|')[0]); 114 | var config = parseConfig(definition.split('|')[1] || ''); 115 | context.module = context.module || {}; 116 | load(path); 117 | var plugin = context.module.exports; 118 | plugin.path = path; 119 | if(typeof plugin.init === 'function') { 120 | plugin.init(config); 121 | } 122 | return plugin; 123 | }, this); 124 | } else { 125 | this.plugins = []; 126 | } 127 | 128 | this.execute = function(methodName) { 129 | var args = Array.prototype.slice.call(arguments, 1); 130 | var applicablePlugins = this.plugins.filter(function(plugin){return typeof plugin[methodName] === 'function';}); 131 | return applicablePlugins.map(function(plugin) { 132 | return plugin[methodName].apply(plugin, args); 133 | }); 134 | }; 135 | 136 | log('Using plugins of ' + tojson(this.plugins.map(function(plugin){return plugin.path;}))); 137 | }; 138 | 139 | var $plugins = new PluginsClass(this); 140 | $plugins.execute('onConfig', config); 141 | 142 | var varietyTypeOf = function(thing) { 143 | if (typeof thing === 'undefined') { throw 'varietyTypeOf() requires an argument'; } 144 | 145 | if (typeof thing !== 'object') { 146 | // the messiness below capitalizes the first letter, so the output matches 147 | // the other return values below. -JC 148 | var typeofThing = typeof thing; // edgecase of JSHint's "singleGroups" 149 | return typeofThing[0].toUpperCase() + typeofThing.slice(1); 150 | } else { 151 | if (thing && thing.constructor === Array) { 152 | return 'Array'; 153 | } else if (thing === null) { 154 | return 'null'; 155 | } else if (thing instanceof Date) { 156 | return 'Date'; 157 | } else if(thing instanceof NumberLong) { 158 | return 'NumberLong'; 159 | } else if (thing instanceof ObjectId) { 160 | return 'ObjectId'; 161 | } else if (thing instanceof BinData) { 162 | var binDataTypes = {}; 163 | binDataTypes[0x00] = 'generic'; 164 | binDataTypes[0x01] = 'function'; 165 | binDataTypes[0x02] = 'old'; 166 | binDataTypes[0x03] = 'UUID'; 167 | binDataTypes[0x05] = 'MD5'; 168 | binDataTypes[0x80] = 'user'; 169 | return 'BinData-' + binDataTypes[thing.subtype()]; 170 | } else { 171 | return 'Object'; 172 | } 173 | } 174 | }; 175 | 176 | //flattens object keys to 1D. i.e. {'key1':1,{'key2':{'key3':2}}} becomes {'key1':1,'key2.key3':2} 177 | //we assume no '.' characters in the keys, which is an OK assumption for MongoDB 178 | var serializeDoc = function(doc, maxDepth, excludeSubkeys) { 179 | var result = {}; 180 | 181 | //determining if an object is a Hash vs Array vs something else is hard 182 | //returns true, if object in argument may have nested objects and makes sense to analyse its content 183 | function isHash(v) { 184 | var isArray = Array.isArray(v); 185 | var isObject = typeof v === 'object'; 186 | var specialObject = v instanceof Date || 187 | v instanceof ObjectId || 188 | v instanceof BinData || 189 | v instanceof NumberLong; 190 | return !specialObject && (isArray || isObject); 191 | } 192 | 193 | var arrayRegex = new RegExp('\\.' 
+ config.arrayEscape + '\\d+' + config.arrayEscape + '\\.', 'g'); 194 | 195 | function serialize(document, parentKey, maxDepth) { 196 | if(Object.prototype.hasOwnProperty.call(excludeSubkeys, parentKey.replace(arrayRegex, '.'))) 197 | return; 198 | for(var key in document) { 199 | //skip over inherited properties such as string, length, etch 200 | if(!document.hasOwnProperty(key)) { 201 | continue; 202 | } 203 | var value = document[key]; 204 | if(Array.isArray(document)) 205 | key = config.arrayEscape + key + config.arrayEscape; //translate unnamed object key from {_parent_name_}.{_index_} to {_parent_name_}.arrayEscape{_index_}arrayEscape. 206 | result[parentKey+key] = value; 207 | //it's an object, recurse...only if we haven't reached max depth 208 | if(isHash(value) && maxDepth > 1) { 209 | serialize(value, parentKey+key+'.', maxDepth-1); 210 | } 211 | } 212 | } 213 | serialize(doc, '', maxDepth); 214 | return result; 215 | }; 216 | 217 | // convert document to key-value map, where value is always an array with types as plain strings 218 | var analyseDocument = function(document) { 219 | var result = {}; 220 | var arrayRegex = new RegExp('\\.' + config.arrayEscape + '\\d+' + config.arrayEscape, 'g'); 221 | for (var key in document) { 222 | var value = document[key]; 223 | key = key.replace(arrayRegex, '.' + config.arrayEscape); 224 | if(typeof result[key] === 'undefined') { 225 | result[key] = {}; 226 | } 227 | var type = varietyTypeOf(value); 228 | result[key][type] = true; 229 | } 230 | return result; 231 | }; 232 | 233 | var mergeDocument = function(docResult, interimResults) { 234 | for (var key in docResult) { 235 | if(key in interimResults) { 236 | var existing = interimResults[key]; 237 | 238 | for(var type in docResult[key]) { 239 | if (type in existing.types) { 240 | existing.types[type] = existing.types[type] + 1; 241 | } else { 242 | existing.types[type] = 1; 243 | if (config.logKeysContinuously) { 244 | log('Found new key type "' + key + '" type "' + type + '"'); 245 | } 246 | } 247 | } 248 | existing.totalOccurrences = existing.totalOccurrences + 1; 249 | } else { 250 | var types = {}; 251 | for (var newType in docResult[key]) { 252 | types[newType] = 1; 253 | if (config.logKeysContinuously) { 254 | log('Found new key type "' + key + '" type "' + newType + '"'); 255 | } 256 | } 257 | interimResults[key] = {'types': types,'totalOccurrences':1}; 258 | } 259 | } 260 | }; 261 | 262 | var convertResults = function(interimResults, documentsCount) { 263 | var getKeys = function(obj) { 264 | var keys = {}; 265 | for(var key in obj) { 266 | keys[key] = obj[key]; 267 | } 268 | return keys; 269 | //return keys.sort(); 270 | }; 271 | var varietyResults = []; 272 | //now convert the interimResults into the proper format 273 | for(var key in interimResults) { 274 | var entry = interimResults[key]; 275 | varietyResults.push({ 276 | '_id': {'key':key}, 277 | 'value': {'types':getKeys(entry.types)}, 278 | 'totalOccurrences': entry.totalOccurrences, 279 | 'percentContaining': entry.totalOccurrences * 100 / documentsCount 280 | }); 281 | } 282 | return varietyResults; 283 | }; 284 | 285 | // Merge the keys and types of current object into accumulator object 286 | var reduceDocuments = function(accumulator, object) { 287 | var docResult = analyseDocument(serializeDoc(object, config.maxDepth, config.excludeSubkeys)); 288 | mergeDocument(docResult, accumulator); 289 | return accumulator; 290 | }; 291 | 292 | // We throw away keys which end in an array index, since they are not useful 293 | // 
for our analysis. (We still keep the key of their parent array, though.) -JC 294 | var arrayRegex = new RegExp('\\.' + config.arrayEscape + '$', 'g'); 295 | var filter = function(item) { 296 | return !item._id.key.match(arrayRegex); 297 | }; 298 | 299 | // sort desc by totalOccurrences or by key asc if occurrences equal 300 | var comparator = function(a, b) { 301 | var countsDiff = b.totalOccurrences - a.totalOccurrences; 302 | return countsDiff !== 0 ? countsDiff : a._id.key.localeCompare(b._id.key); 303 | }; 304 | 305 | // extend standard MongoDB cursor of reduce method - call forEach and combine the results 306 | DBQuery.prototype.reduce = function(callback, initialValue) { 307 | var result = initialValue; 308 | this.forEach(function(obj){ 309 | result = callback(result, obj); 310 | }); 311 | return result; 312 | }; 313 | 314 | var cursor = db.getCollection(config.collection).find(config.query).sort(config.sort).limit(config.limit); 315 | var interimResults = cursor.reduce(reduceDocuments, {}); 316 | var varietyResults = convertResults(interimResults, cursor.size()) 317 | .filter(filter) 318 | .sort(comparator); 319 | 320 | if(config.persistResults) { 321 | var resultsDB; 322 | var resultsCollectionName = config.resultsCollection; 323 | 324 | if (config.resultsDatabase.indexOf('/') === -1) { 325 | // Local database; don't reconnect 326 | resultsDB = db.getMongo().getDB(config.resultsDatabase); 327 | } else { 328 | // Remote database, establish new connection 329 | resultsDB = connect(config.resultsDatabase); 330 | } 331 | 332 | if (config.resultsUser !== null && config.resultsPass !== null) { 333 | resultsDB.auth(config.resultsUser, config.resultsPass); 334 | } 335 | 336 | // replace results collection 337 | log('replacing results collection: '+ resultsCollectionName); 338 | resultsDB.getCollection(resultsCollectionName).drop(); 339 | resultsDB.getCollection(resultsCollectionName).insert(varietyResults); 340 | } 341 | 342 | var createAsciiTable = function(results) { 343 | var headers = ['key', 'types', 'occurrences', 'percents']; 344 | // return the number of decimal places or 1, if the number is int (1.23=>2, 100=>1, 0.1415=>4) 345 | var significantDigits = function(value) { 346 | var res = value.toString().match(/^[0-9]+\.([0-9]+)$/); 347 | return res !== null ? res[1].length : 1; 348 | }; 349 | 350 | var maxDigits = varietyResults.map(function(value){return significantDigits(value.percentContaining);}).reduce(function(acc,val){return acc>val?acc:val;}); 351 | 352 | var rows = results.map(function(row) { 353 | var types = []; 354 | var typeKeys = Object.keys(row.value.types); 355 | if (typeKeys.length > 1) { 356 | for (var type in row.value.types) { 357 | var typestring = type + ' (' + row.value.types[type] + ')'; 358 | types.push(typestring); 359 | } 360 | } else { 361 | types = typeKeys; 362 | } 363 | 364 | return [row._id.key, types, row.totalOccurrences, row.percentContaining.toFixed(Math.min(maxDigits, 20))]; 365 | }); 366 | var table = [headers, headers.map(function(){return '';})].concat(rows); 367 | var colMaxWidth = function(arr, index) {return Math.max.apply(null, arr.map(function(row){return row[index].toString().length;}));}; 368 | var pad = function(width, string, symbol) { return width <= string.length ? string : pad(width, isNaN(string) ? string + symbol : symbol + string, symbol); }; 369 | table = table.map(function(row, ri){ 370 | return '| ' + row.map(function(cell, i) {return pad(colMaxWidth(table, i), cell.toString(), ri === 1 ? 
'-' : ' ');}).join(' | ') + ' |'; 371 | }); 372 | var border = '+' + pad(table[0].length - 2, '', '-') + '+'; 373 | return [border].concat(table).concat(border).join('\n'); 374 | }; 375 | 376 | var pluginsOutput = $plugins.execute('formatResults', varietyResults); 377 | if (pluginsOutput.length > 0) { 378 | pluginsOutput.forEach(function(i){print(i);}); 379 | } else if(config.outputFormat === 'json') { 380 | printjson(varietyResults); // valid formatted json output, compressed variant is printjsononeline() 381 | } else { 382 | print(createAsciiTable(varietyResults)); // output nice ascii table with results 383 | } 384 | 385 | }.bind(this)()); // end strict mode 386 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Meet Variety, a Schema Analyzer for MongoDB ### 2 | This lightweight tool helps you get a sense of your application's schema, as well as any outliers to that schema. Particularly useful when you inherit a codebase with data dump and want to quickly learn how the data's structured. Also useful for finding rare keys. 3 | 4 | [![Build Status](https://travis-ci.org/variety/variety.svg?branch=master)](https://travis-ci.org/variety/variety) 5 | 6 | *** 7 | 8 | _“I happen to slowly be falling in love with Variety! It is actually one of the most useful tools to get a sense for a messy/unknown data set, and I have put it in a few of our exercises at Zipfian Academy.”_ 9 | 10 | Jon Dinu 11 | _Co-founder of [Zipfian Academy](http://www.zipfianacademy.com/)_ 12 | 13 | *** 14 | 15 | Also featured on the [official MongoDB blog](http://blog.mongodb.org/post/21923016898/meet-variety-a-schema-analyzer-for-mongodb). 16 | 17 | ### An Easy Example ### 18 | 19 | We'll make a collection: 20 | 21 | db.users.insert({name: "Tom", bio: "A nice guy.", pets: ["monkey", "fish"], someWeirdLegacyKey: "I like Ike!"}); 22 | db.users.insert({name: "Dick", bio: "I swordfight.", birthday: new Date("1974/03/14")}); 23 | db.users.insert({name: "Harry", pets: "egret", birthday: new Date("1984/03/14")}); 24 | db.users.insert({name: "Geneviève", bio: "Ça va?"}); 25 | db.users.insert({name: "Jim", someBinData: new BinData(2,"1234")}); 26 | 27 | So, let's see what we've got here: 28 | 29 | $ mongo test --eval "var collection = 'users'" variety.js 30 | 31 | +------------------------------------------------------------------+ 32 | | key | types | occurrences | percents | 33 | | ------------------ | ------------ | ----------- | -------- | 34 | | _id | ObjectId | 5 | 100.0 | 35 | | name | String | 5 | 100.0 | 36 | | bio | String | 3 | 60.0 | 37 | | birthday | Date | 2 | 40.0 | 38 | | pets | Array(1),String(1) | 2 | 40.0 | 39 | | someBinData | BinData-old | 1 | 20.0 | 40 | | someWeirdLegacyKey | String | 1 | 20.0 | 41 | +------------------------------------------------------------------+ 42 | 43 | _("test" is the database containing the collection we are analyzing.)_ 44 | 45 | Hmm. Looks like everybody has a "name" and "_id". Most, but not all have a "bio". 46 | 47 | Interestingly, it looks like "pets" can be either an array or a string, but there are more arrays than strings. Will this cause any problems in the application, I wonder? 48 | 49 | Seems like the first document created has a weird legacy key—those damn fools who built the prototype didn't clean up after themselves. 
If there were a thousand such early documents, I might cross-reference the codebase to confirm they are no longer used, and then delete them all. That way they'll not confuse any future developers. 50 | 51 | Results are stored for future use in a varietyResults database. 52 | 53 | ### See Progress When Analysis Takes a Long Time ### 54 | 55 | Tailing the log is great for this. Mongo provides a "percent complete" measurement for you. These operations can take a long time on huge collections. 56 | 57 | ### Analyze Only Recent Documents ### 58 | 59 | Perhaps you have a really large collection, and you can't wait a whole day for Variety's results. 60 | 61 | Perhaps you want to ignore a collection's oldest documents, and only see what the collection's documents' structures have been looking like, as of late. 62 | 63 | One can apply a "limit" constraint, which analyzes only the newest documents in a collection ([unless sorting](https://github.com/variety/variety#analyze-documents-sorted-in-a-particular-order)), like so: 64 | 65 | $ mongo test --eval "var collection = 'users', limit = 1" variety.js 66 | 67 | Let's examine the results closely: 68 | 69 | +----------------------------------------------------+ 70 | | key | types | occurrences | percents | 71 | | ----------- | ----------- | ----------- | -------- | 72 | | _id | ObjectId | 1 | 100.0 | 73 | | name | String | 1 | 100.0 | 74 | | someBinData | BinData-old | 1 | 100.0 | 75 | +----------------------------------------------------+ 76 | 77 | We are only examining the last document here ("limit = 1"). It belongs to Geneviève, and only contains the _id, name and bio fields. So it makes sense these are the only three keys. 78 | 79 | ### Analyze Documents to a Maximum Depth 80 | 81 | Perhaps you have a potentially very deep nested object structure, and you don't want to see more than a few levels deep in the analysis. 82 | 83 | One can apply a "maxDepth" constraint, which limits the depth Variety will recursively search to find new objects. 84 | 85 | db.users.insert({name:"Walter", someNestedObject:{a:{b:{c:{d:{e:1}}}}}}); 86 | 87 | The default will traverse all the way to the bottom of that structure: 88 | 89 | $ mongo test --eval "var collection = 'users'" variety.js 90 | 91 | +----------------------------------------------------------------+ 92 | | key | types | occurrences | percents | 93 | | -------------------------- | -------- | ----------- | -------- | 94 | | _id | ObjectId | 1 | 100.0 | 95 | | name | String | 1 | 100.0 | 96 | | someNestedObject | Object | 1 | 100.0 | 97 | | someNestedObject.a | Object | 1 | 100.0 | 98 | | someNestedObject.a.b | Object | 1 | 100.0 | 99 | | someNestedObject.a.b.c | Object | 1 | 100.0 | 100 | | someNestedObject.a.b.c.d | Object | 1 | 100.0 | 101 | | someNestedObject.a.b.c.d.e | Number | 1 | 100.0 | 102 | +----------------------------------------------------------------+ 103 | 104 | $ mongo test --eval "var collection = 'users', maxDepth = 3" variety.js 105 | 106 | +----------------------------------------------------------+ 107 | | key | types | occurrences | percents | 108 | | -------------------- | -------- | ----------- | -------- | 109 | | _id | ObjectId | 1 | 100.0 | 110 | | name | String | 1 | 100.0 | 111 | | someNestedObject | Object | 1 | 100.0 | 112 | | someNestedObject.a | Object | 1 | 100.0 | 113 | | someNestedObject.a.b | Object | 1 | 100.0 | 114 | +----------------------------------------------------------+ 115 | 116 | As you can see, Variety only traversed three levels deep. 
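Under the hood, Variety flattens every document into dotted key paths and simply stops recursing once `maxDepth` levels have been visited (see `serializeDoc` in `variety.js`). A simplified, standalone sketch of that flattening, for illustration only; it ignores arrays and the special BSON types (Date, ObjectId, BinData, NumberLong) that the real implementation treats as leaf values:

```
// Simplified sketch of the key flattening behind maxDepth (cf. serializeDoc in variety.js).
var flattenKeys = function(doc, maxDepth) {
  var result = {};
  var serialize = function(obj, parentKey, depth) {
    for (var key in obj) {
      if (!obj.hasOwnProperty(key)) { continue; }
      var value = obj[key];
      result[parentKey + key] = value;
      if (value !== null && typeof value === 'object' && depth > 1) {
        serialize(value, parentKey + key + '.', depth - 1); // recurse only while depth remains
      }
    }
  };
  serialize(doc, '', maxDepth);
  return result;
};

// With maxDepth = 3, the nested example from above yields keys down to someNestedObject.a.b only:
// Object.keys(flattenKeys({name: 'Walter', someNestedObject: {a: {b: {c: {d: {e: 1}}}}}}, 3))
//   -> ['name', 'someNestedObject', 'someNestedObject.a', 'someNestedObject.a.b']
```
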
117 | 
118 | ### Analyze a Subset of Documents ###
119 | 
120 | Perhaps you have a large collection, or you only care about some subset of the documents.
121 | 
122 | One can apply a "query" constraint, which takes a standard Mongo query object, to filter the set of documents before analysis.
123 | 
124 |     $ mongo test --eval "var collection = 'users', query = {'caredAbout':true}" variety.js
125 | 
126 | ### Analyze Documents Sorted In a Particular Order ###
127 | 
128 | Perhaps you want to analyze a subset of documents sorted in an order other than creation order, say, sorted by when documents were updated.
129 | 
130 | One can apply a "sort" constraint, which analyzes documents in the specified order, like so:
131 | 
132 |     $ mongo test --eval "var collection = 'users', sort = { updated_at : -1 }" variety.js
133 | 
134 | ### Render Output As JSON For Easy Ingestion and Parsing ###
135 | 
136 | Variety supports two different output formats:
137 | 
138 | - ASCII: nicely formatted tables (as in this README)
139 | - JSON: valid JSON results for subsequent processing in other tools (see also [quiet option](#quiet-option))
140 | 
141 | The default format is ```ascii```. You can select the format with the ```outputFormat``` property provided to Variety. Valid values are ```ascii``` and ```json```.
142 | 
143 |     $ mongo test --quiet --eval "var collection = 'users', outputFormat='json'" variety.js
144 | 
145 | #### Quiet Option ####
146 | Both MongoDB and Variety output some additional information to standard output. If you want to remove this info, you can use the ```--quiet``` option of the ```mongo``` executable.
147 | Variety can also read that option and mute its own unnecessary output. This is useful in connection with ```outputFormat=json```: you would then receive only JSON, without any other characters around it.
148 | 
149 |     $ mongo test --quiet --eval "var collection = 'users', outputFormat='json'" variety.js
150 | 
151 | #### Log Keys and Types As They Arrive Option ####
152 | Sometimes you want to see the keys and types as they arrive. Maybe you have a large dataset and want accurate results, but you are also impatient and want to see something now. Or maybe you have a large, mangled dataset with crazy keys (that probably shouldn't be keys) and Variety is running out of memory. This option shows you the keys and types as they come in, and helps you identify problems with your dataset without waiting for the Variety script to finish.
153 | 
154 |     $ mongo test --eval "var collection = 'users', sort = { updated_at : -1 }, logKeysContinuously = true" variety.js
155 | 
156 | #### Exclude Subkeys ####
157 | Sometimes you inherit a database full of junk. Maybe the previous developer put data in the database keys, which causes Variety to run out of memory. After you've used the `logKeysContinuously` option to figure out which subkeys may be a problem, you can use this option to run Variety without those subkeys.
158 | 
159 |     db.users.insert({name:"Walter", someNestedObject:{a:{b:{c:{d:{e:1}}}}}, otherNestedObject:{a:{b:{c:{d:{e:1}}}}}});
160 | 
161 |     $ mongo test --eval "var collection = 'users', sort = { updated_at : -1 }, excludeSubkeys = [ 'someNestedObject.a.b' ]" variety.js
162 | 
163 |     +-----------------------------------------------------------------+
164 |     | key                         | types    | occurrences | percents |
165 |     | --------------------------- | -------- | ----------- | -------- |
166 |     | _id                         | ObjectId | 1           | 100.0    |
167 |     | name                        | String   | 1           | 100.0    |
168 |     | someNestedObject            | Object   | 1           | 100.0    |
169 |     | someNestedObject.a          | Object   | 1           | 100.0    |
170 |     | someNestedObject.a.b        | Object   | 1           | 100.0    |
171 |     | otherNestedObject           | Object   | 1           | 100.0    |
172 |     | otherNestedObject.a         | Object   | 1           | 100.0    |
173 |     | otherNestedObject.a.b       | Object   | 1           | 100.0    |
174 |     | otherNestedObject.a.b.c     | Object   | 1           | 100.0    |
175 |     | otherNestedObject.a.b.c.d   | Object   | 1           | 100.0    |
176 |     | otherNestedObject.a.b.c.d.e | Number   | 1           | 100.0    |
177 |     +-----------------------------------------------------------------+
178 | 
179 | #### Secondary Reads ####
180 | Analyzing a large collection on a busy replica set primary could take a lot longer than reading from a secondary. To do so, we have to tell MongoDB it's okay to perform secondary reads
181 | by setting the ```slaveOk``` property to ```true```:
182 | 
183 |     $ mongo secondary.replicaset.member:31337/somedb --eval "var collection = 'users', slaveOk = true" variety.js
184 | 
185 | ### Save Results in MongoDB For Future Use ###
186 | By default, Variety prints results only to standard output and does not store them in MongoDB itself. If you want to persist them automatically in MongoDB for later usage, you can set the parameter ```persistResults```.
187 | Variety then stores the result documents in the ```varietyResults``` database, in a collection whose name is derived from the source collection's name.
188 | If the source collection's name is ```users```, Variety will store results in the collection ```usersKeys``` in the ```varietyResults``` database.
189 | 
190 |     $ mongo test --quiet --eval "var collection = 'users', persistResults=true" variety.js
191 | 
192 | To persist to an alternate MongoDB database, you may specify the following parameters:
193 | 
194 | * `resultsDatabase` - The database to store Variety results in. Accepts either a database name or a `host[:port]/database` URL.
195 | * `resultsCollection` - Collection to store Variety results in. **WARNING:** This collection is dropped before results are inserted.
196 | * `resultsUser` - MongoDB username for the results database
197 | * `resultsPass` - MongoDB password for the results database
198 | 
199 | ```
200 | $ mongo test --quiet --eval "var collection = 'users', persistResults=true, resultsDatabase='db.example.com/variety'" variety.js
201 | ```
202 | 
203 | ### Reserved Keys ###
204 | Variety expects keys to be well formed, not having any '.'s in them (mongo 2.4 allows dots in certain cases). Variety also uses the pseudo keys 'XX' and keys corresponding to the regex 'XX\d+XX.*' for use with arrays. You can change the string XX in these patterns to whatever you like, using the `arrayEscape` parameter, if there is a conflict in your database.
205 | 
206 |     $ mongo test --quiet --eval "var collection = 'users', arrayEscape = 'YY'" variety.js
207 | 
208 | ### Command Line Interface ###
209 | Variety itself is command line friendly, as shown in the examples above.
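Several of the parameters documented above can be combined in a single ```--eval``` string. A hypothetical example (the specific option values here are only illustrative):

```
$ mongo test --quiet --eval "var collection = 'users', query = {'caredAbout':true}, maxDepth = 3, outputFormat = 'json', persistResults = true" variety.js
```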
210 | But if you are an NPM and Node.js user, you may prefer the
211 | [variety-cli](https://github.com/variety/variety-cli) project. It simplifies the usage of
212 | Variety and removes the complexity of passing variables in the ```--eval``` argument and
213 | providing a path to the variety.js library.
214 | 
215 | Example of simplified command-line usage:
216 | ```
217 | variety test/users --outputFormat='json' --quiet
218 | ```
219 | For more details, see the [documentation of the variety-cli project](https://github.com/variety/variety-cli).
220 | 
221 | ##### "But my dad told me MongoDB is a schemaless database!" #####
222 | 
223 | First of all, your father is a great guy. Moving on...
224 | 
225 | A Mongo collection does not enforce a predefined schema like a relational database table. Still, documents in real-world collections nearly always have large sections for which the format of the data is the same. In other words, there is a schema to the majority of collections; it's just enforced by the _application_, rather than by the database system. And this schema is allowed to be a bit fuzzy, in the same way that a given table column might not be required in all rows, but to a much greater degree of flexibility. So we examine what percent of documents in the collection contain a key, and we get a feel for, among other things, how crucial that key is to the proper functioning of the application.
226 | 
227 | ##### Dependencies #####
228 | 
229 | Absolutely none, except MongoDB. Written in 100% JavaScript. _(mongod's "noscripting" must not be set to true, and 'strict mode' must be disabled.)_
230 | 
231 | ##### Development, Hacking #####
232 | This project is NPM based and provides standard NPM functionality. As an additional (not required) dependency, [Docker](https://www.docker.com/) can be installed to test against different MongoDB versions.
233 | 
234 | To install all dev dependencies, run the usual:
235 | ```
236 | npm install
237 | ```
238 | 
239 | By default, tests expect MongoDB to be available on ```localhost:27017``` and can be executed by calling:
240 | 
241 | ```
242 | npm test
243 | ```
244 | 
245 | If you have Docker installed and don't want to test against your own MongoDB instance,
246 | you can execute the tests against a dockerized MongoDB:
247 | 
248 | ```
249 | MONGODB_VERSION=3.2 npm run test:docker
250 | ```
251 | The script downloads one of the [official MongoDB images](https://hub.docker.com/_/mongo/) (based on the version you provide),
252 | starts the database, executes the test suite against it (inside the container) and stops the DB.
253 | 
254 | #### Reporting Issues / Contributing ####
255 | 
256 | Please report any bugs and feature requests on the GitHub issue tracker. I will read all reports!
257 | 
258 | I accept pull requests from forks and am very grateful for contributions.
259 | 
260 | #### Core Maintainers ####
261 | 
262 | * Tomáš Dvořák ([personal website](http://www.tomas-dvorak.cz/))
263 | * Eve Freeman ([Twitter](https://twitter.com/wefreema))
264 | * James Cropcho (original creator of Variety) ([Twitter](https://twitter.com/Cropcho))
265 | 
266 | #### Special Thanks ####
267 | 
268 | Additional special thanks to Gaëtan Voyer-Perraul ([@gatesvp](https://twitter.com/#!/@gatesvp)) and Kristina Chodorow ([@kchodorow](https://twitter.com/#!/kchodorow)) for answering other people's questions about how to do this on Stack Overflow, thereby providing me with the initial seed of code which grew into this tool.
269 | 
270 | Much thanks also to Kyle Banker ([@Hwaet](https://twitter.com/#!/hwaet)) for writing an unusually good book on MongoDB, which has taught me everything I know about it so far.
271 | 
272 | #### Tools Which Use Variety (Open Source) ####
273 | 
274 | Know of one? Built one? Let us know!
275 | 
276 | ##### Stay Safe #####
277 | 
278 | I have every reason to believe this tool will **not** corrupt your data or harm your computer. But if I were you, I would not use it in a production environment.
279 | 
280 | 
281 | Released by Maypop Inc, © 2012–2016, under the [MIT License](http://www.opensource.org/licenses/MIT).
282 | 
--------------------------------------------------------------------------------