├── vagrant
│   ├── m202-ubuntu1404
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── provisioners
│   │   │   └── setup.sh
│   │   └── Vagrantfile
│   └── MongoDBU
│       ├── README.md
│       ├── .vagrant
│       │   └── machines
│       │       └── default
│       │           └── virtualbox
│       │               ├── action_set_name
│       │               ├── id
│       │               ├── index_uuid
│       │               ├── action_provision
│       │               └── synced_folders
│       ├── provisioners
│       │   └── setup.sh
│       └── Vagrantfile
├── README.md
├── ChunkInfo.js
├── LICENSE.md
├── compactness.js
├── mongostat-demangler.sh
├── AllChunkInfo.js
├── two_shards_1m_docs.js
├── pre_alloc.bash
└── crud.js

/vagrant/m202-ubuntu1404/.gitignore:
--------------------------------------------------------------------------------
/.vagrant
--------------------------------------------------------------------------------
/vagrant/MongoDBU/README.md:
--------------------------------------------------------------------------------
MongoDBU_vm
===========
--------------------------------------------------------------------------------
/vagrant/MongoDBU/.vagrant/machines/default/virtualbox/action_set_name:
--------------------------------------------------------------------------------
1405100085
--------------------------------------------------------------------------------
/vagrant/MongoDBU/.vagrant/machines/default/virtualbox/id:
--------------------------------------------------------------------------------
336fe158-7832-4574-bb55-fd1ba31860f4
--------------------------------------------------------------------------------
/vagrant/MongoDBU/.vagrant/machines/default/virtualbox/index_uuid:
--------------------------------------------------------------------------------
beba46c3c19740a8aca03c5f1244fd9d
--------------------------------------------------------------------------------
/vagrant/MongoDBU/.vagrant/machines/default/virtualbox/action_provision:
--------------------------------------------------------------------------------
1.5:336fe158-7832-4574-bb55-fd1ba31860f4
--------------------------------------------------------------------------------
/vagrant/MongoDBU/.vagrant/machines/default/virtualbox/synced_folders:
--------------------------------------------------------------------------------
{"virtualbox":{"/vagrant":{"guestpath":"/vagrant","hostpath":"/Users/adam/git/mongodb-scripts/vagrant/MongoDBU"}}}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
mongodb-scripts
===============

These are just some scripts and recipes I find useful when administering MongoDB.

They are not intended to be fully fledged tools, and will occasionally need some tweaking to run in particular environments, but I have found them quite useful in the past.
--------------------------------------------------------------------------------
/vagrant/m202-ubuntu1404/README.md:
--------------------------------------------------------------------------------
M202 Virtual Machine (Ubuntu 14.04.1 LTS)
=========================================

This is the vagrant folder for the M202 Ubuntu 14.04 virtual machine.
It will provision a machine with the following characteristics:

* 2GB RAM
* 8GB Disk (dynamically allocated)
* 1 VCPU
* NAT Networking
* Standard Vagrant port forwarding
* Default login/pass of m202/m202 (sudo is available without a password)
--------------------------------------------------------------------------------
/vagrant/MongoDBU/provisioners/setup.sh:
--------------------------------------------------------------------------------
#!/bin/sh

set -e

HOME="/home/vagrant"
COURSE="m202"
MONGOPROC="$HOME/$COURSE/mongoProc"

echo 'Setting up VM...'

echo 'Updating system... this may take a while'
apt-get -y update > /dev/null 2>&1
DEBIAN_FRONTEND=noninteractive apt-get -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" upgrade > /dev/null 2>&1

echo 'Done! Rebooting...'

reboot
--------------------------------------------------------------------------------
/vagrant/m202-ubuntu1404/provisioners/setup.sh:
--------------------------------------------------------------------------------
#!/bin/sh

#set -e

#HOME="/home/m202"
#COURSE="m202"
#MONGOPROC="$HOME/mongoProc"

#echo 'Setting up VM...'

#echo 'Updating system... this may take a while'
#apt-get -y update > /dev/null 2>&1
#DEBIAN_FRONTEND=noninteractive apt-get -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" upgrade > /dev/null 2>&1

#echo 'Done! Rebooting...'

#reboot
--------------------------------------------------------------------------------
/ChunkInfo.js:
--------------------------------------------------------------------------------
// This is a very simple function, which takes three arguments:
// ns: a string representing the sharded namespace to be examined
// id: the chunk ID (the _id field from config.chunks) for the chunk you want information on
// est: a boolean to determine whether or not to use the estimate option (recommended generally)

// It is called from the mongos like so:

// ChunkInfo("database.collection", "database.collection-_id_value", true);
// Output is printed to the shell; will add options for other output later

ChunkInfo = function(ns, id, est) {
    var configDB = db.getSiblingDB("config");
    var db1 = db.getSiblingDB(ns.split(".")[0]);
    var key = configDB.collections.findOne({_id : ns}).key;
    var chunk = configDB.chunks.find({"_id" : id}).limit(1).next();
    var dataSizeResult = db1.runCommand({datasize : chunk.ns, keyPattern : key, min : chunk.min, max : chunk.max, estimate : est});
    print("***********Chunk Information***********");
    printjson(chunk);
    print("Chunk Size: " + dataSizeResult.size);
    print("Objects in chunk: " + dataSizeResult.numObjects);
}
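
// A minimal usage sketch (the namespace "chunkTest.foo" is illustrative; pick
// a real chunk _id from config.chunks on your cluster):
//
//   load("ChunkInfo.js");
//   var someChunk = db.getSiblingDB("config").chunks.findOne({ns : "chunkTest.foo"});
//   ChunkInfo("chunkTest.foo", someChunk._id, true);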
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Adam Comerford

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/vagrant/MongoDBU/Vagrantfile:
--------------------------------------------------------------------------------
# -*- mode: ruby -*-
# vi: set ft=ruby :

# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
VAGRANTFILE_API_VERSION = "2"

Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|

  config.vm.box = "MongoDBU_vm"
  config.vm.box_url = "http://mekhar/mongodbu.box"

  config.vm.provision "shell", path: "provisioners/setup.sh"

  config.vm.provider "virtualbox" do |vb|
    # Set the next line to false to run headless (no VM window). User/pass = vagrant/vagrant
    vb.gui = true

    vb.name = "MongoDBU_vm"
    vb.customize ["modifyvm", :id, "--memory", "2048"]
    vb.customize ["modifyvm", :id, "--cpus", "1"]
  end

  config.vm.provider "vmware_fusion" do |v|
    # Set the next line to false to run headless (no VM window). User/pass = vagrant/vagrant
    v.gui = true

    v.vmx["displayname"] = "MongoDBU"
    v.vmx["memsize"] = "2048"
    v.vmx["numvcpus"] = "1"
  end

  config.vm.provider "vmware_workstation" do |v|
    # Set the next line to false to run headless (no VM window). User/pass = vagrant/vagrant
    v.gui = true

    v.vmx["displayname"] = "MongoDBU"
    v.vmx["memsize"] = "2048"
    v.vmx["numvcpus"] = "1"
  end

end
--------------------------------------------------------------------------------
/vagrant/m202-ubuntu1404/Vagrantfile:
--------------------------------------------------------------------------------
# -*- mode: ruby -*-
# vi: set ft=ruby :

# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
VAGRANTFILE_API_VERSION = "2"

Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|

  config.vm.box = "m202-ubuntu1404"
  config.vm.box_url = "http://127.0.0.1/~adam/m202-ubuntu1404.box"

  config.vm.provision "shell", path: "provisioners/setup.sh"

  config.vm.provider "virtualbox" do |vb|
    # Set the next line to false to run headless (no VM window). User/pass = m202/m202
    vb.gui = true

    vb.name = "m202-ubuntu1404"
    vb.customize ["modifyvm", :id, "--memory", "2048"]
    vb.customize ["modifyvm", :id, "--cpus", "1"]
  end

  config.vm.provider "vmware_fusion" do |v|
    # Set the next line to false to run headless (no VM window). User/pass = m202/m202
    v.gui = true

    v.vmx["displayname"] = "m202-ubuntu1404"
    v.vmx["memsize"] = "2048"
    v.vmx["numvcpus"] = "1"
  end

  config.vm.provider "vmware_workstation" do |v|
    # Set the next line to false to run headless (no VM window). User/pass = m202/m202
    v.gui = true

    v.vmx["displayname"] = "m202-ubuntu1404"
    v.vmx["memsize"] = "2048"
    v.vmx["numvcpus"] = "1"
  end

  config.ssh.username = "m202"
end
--------------------------------------------------------------------------------
/compactness.js:
--------------------------------------------------------------------------------
// original credit for this goes to https://github.com/achille
// compactness() calculates how closely the resulting documents are located together
// It compares the size of the documents vs the size of the unique 4KB pages they reside on

function compactness(collection, query, limit) {
    "use strict";
    Object.size = function(o) {
        var size = 0, key;
        for (key in o) { if (o.hasOwnProperty(key)) size++; }
        return size;
    };

    var count = 0,
        size = 0;
    var disklocs = {}; // will store each disk loc, format: file-loc%4kb, ie file-0, file-4096, etc

    db.getCollection(collection).find(query).limit(limit).showDiskLoc().forEach(
        function(doc) {
            var file = doc.$diskLoc.file,
                offset = doc.$diskLoc.offset,
                offsetPage = offset - offset % 4096;
            count++;
            size += Object.bsonsize(doc) - 45; // $diskLoc info adds 45 bytes
            disklocs[file + "-" + offsetPage] = 1;
        }
    );
    var numpages = Object.size(disklocs);
    var numbytespages = 1024 * 4 * numpages;
    print("Size of returned data in bytes: " + size);
    print("Size of pages touched by data : " + numbytespages);
    print("Compactness: " + Math.floor(100 * size / numbytespages) + "%");
}
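
// A minimal usage sketch (collection name, query and limit are illustrative;
// run after selecting the relevant database with "use"):
//
//   load("compactness.js");
//   compactness("data", {}, 1000);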
--------------------------------------------------------------------------------
/mongostat-demangler.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Normalizes mongostat output for easier processing: pads the third (update)
# column with "|0" where it is missing, strips noise lines, splits the "|"
# separated and "db:percent" style fields into their own columns, expands
# k/m/g suffixes into plain numbers, moves the time column to the front, and
# aligns everything into columns.

cat "$@" \
| awk '
    {
        if (! /^insert/ && $3 !~ /\|/) {
            $3 = $3 "|0";
        }
        print;
    }
' \
| sed \
    -e '/^connected to/d' \
    -e '/can.t get data/d' \
    -e '/reconnect/d' \
    -e '/DBClient/d' \
    -e 's/\*//g' \
    -e 's/|/ /g' \
    -e 's/:\([0-9.]\+%\)/ \1/' \
| awk '
    BEGIN {
        print "#time insert query update update_r delete getmore command command_r flushes mapped vsize res non-mapped faults locked_db lock% idx_miss_% qr qw ar aw netIn netOut conn set repl";
        header = 0;
    }

    {
        if (/^insert/) {
            if (!header) {
                $0 = gensub("^", "#time ", "", $0);
                $0 = gensub("time *$", "", "", $0);
                $0 = gensub("update", "update update_r", "", $0);
                $0 = gensub("command", "command command_r", "", $0);
                $0 = gensub("locked db", "locked_db lock%", "", $0);
                $0 = gensub("idx miss %", "idx_miss_%", "", $0);
                for (i = 1; i <= NF; i++) {
                    printf("%s(%d)%s", (i==1)?"#":"", i, (i==NF)?"\n":" ");
                }
                print;
                #header = 1;
            }

        } else {

            for (i = 1; i <= NF; i++) {
                if ($i ~ /k$/) {
                    $i = 1000 * gensub("k$", "", "", $i);
                }
                if ($i ~ /m$/) {
                    $i = 1000000 * gensub("m$", "", "", $i);
                }
                if ($i ~ /g$/) {
                    $i = 1000000000 * gensub("g$", "", "", $i);
                }
            }
            if ($1 >= 0) {
                # move the trailing time field to the front
                time = $NF;
                for (i = NF; i > 1; i--) {
                    $i = $(i-1);
                }
                $1 = time;
                print;
            }
        }
    }
' \
| column -tn
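
# A usage sketch (host and interval are illustrative): capture some mongostat
# output to a file, then run it through the demangler to get clean columns:
#
#   mongostat --host localhost:27017 5 > mongostat.raw
#   ./mongostat-demangler.sh mongostat.raw > mongostat.clean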
--------------------------------------------------------------------------------
/AllChunkInfo.js:
--------------------------------------------------------------------------------
// This is a simple function, which takes just two arguments:
// ns: a string representing the sharded namespace to be examined
// est: a boolean to determine whether or not to use the estimate option (recommended generally)

// It is called from the mongos like so:

// sh.printAllChunkInfo("database.collection", true);
// Currently the per-chunk output is CSV, will add options for other output later

sh.printAllChunkInfo = function(ns, est) {
    var configDB = db.getSiblingDB("config");
    var chunks = configDB.chunks.find({ns : ns}).sort({min : 1});
    var key = configDB.collections.findOne({_id : ns}).key;
    var total = { chunks : 0, objs : 0, size : 0, empty : 0 };
    var shards = {};
    configDB.shards.find().toArray().forEach( function (shard) {
        shards[shard._id] = { chunks : 0, objs : 0, size : 0, empty : 0 };
    } );
    print("ChunkID,Shard,ChunkSize,ObjectsInChunk");
    chunks.forEach( function printChunkInfo(chunk) {
        var res = db.getSiblingDB(chunk.ns.split(".")[0]).runCommand({ datasize : chunk.ns, keyPattern : key, min : chunk.min, max : chunk.max, estimate : est });
        print(chunk._id + "," + chunk.shard + "," + res.size + "," + res.numObjects);
        (function(stats) {
            for (stat in stats) {
                stats[stat].chunks++;
                stats[stat].objs += res.numObjects;
                stats[stat].size += res.size;
                if (res.size == 0) stats[stat].empty++;
            }
        })( [ total, shards[chunk.shard] ] );
    } );

    function printStats(s, indent) {
        print(indent + "Total Chunks: " + s.chunks + " (" + s.empty + " empty)");
        print(indent + "Total Size: " + s.size + " bytes");
        print(indent + "Average Chunk Size: " + (s.size/s.chunks) + " bytes");
        print(indent + "Average Non-empty Chunk Size: " + (s.size/(s.chunks-s.empty)) + " bytes");
    }

    print("");
    print("*********** Summary Information ***********");
    printStats(total, "");

    print("");
    print("*********** Per-Shard Information ***********");
    for (shard in shards) {
        print("Shard " + shard + ":");
        printStats(shards[shard], "  ");
    }
}
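
// A minimal usage sketch (the namespace is illustrative; run from a mongos):
//
//   load("AllChunkInfo.js");
//   sh.printAllChunkInfo("chunkTest.foo", true);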
--------------------------------------------------------------------------------
/two_shards_1m_docs.js:
--------------------------------------------------------------------------------
// NOTE: This is not pure JavaScript and cannot be run as such
// It is a series of instructions, some of which need to be run from the command line, others must be run from a MongoDB shell
// The comments before the instructions will describe where to run each piece
// The command line instructions assume that the MongoDB binaries are in your current working directory on a Unix-like system

// Start a mongo shell from the command line, this will be used to create your test cluster

./mongo --nodb

// Once you have a mongo shell, create a test cluster
// We'll just start with 2 shards, and a small chunk size (handy for getting lots of chunks and testing the balancer)
cluster = new ShardingTest({shards : 2, chunksize : 1});
// You will now see a lot of logging on this terminal, and I find it useful to keep it this way when testing

// So, start a new mongo shell - this time connecting to the mongos you just created
./mongo --port 30999

// Now that we again have a mongo shell, this time connected to the mongos, let's call the DB chunkTest
use chunkTest;
// enable sharding on the DB
sh.enableSharding("chunkTest");
// Then create a sharded collection, we'll just call the collection "foo"
// Since this is just a test, we will shard based on _id - this would generally be a bad idea for production
sh.shardCollection("chunkTest.foo", {"_id" : 1});
// Optional - uncomment the following to disable the balancer before inserting - this will put all data on one shard initially
// sh.stopBalancer();
// Check the balancer state
sh.getBalancerState();
// Now insert 1,000,000 docs (increase as necessary) - this may take a while
// In this case, I've put in a couple of random fields, and overridden the _id with the integer counter in the for loop
for(var i = 0; i <= 1000000; i++){ db.foo.insert({"_id" : i, "date" : new Date(), "otherID" : new ObjectId()}); };
// and that's it - you now have a 2 shard cluster with 1,000,000 docs to play with
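
// (Optional) A quick sketch for checking how the chunks ended up distributed
// across the two shards - run this from the mongos shell (the namespace
// matches the collection created above):
db.getSiblingDB("config").chunks.aggregate([
    {$match : {ns : "chunkTest.foo"}},
    {$group : {_id : "$shard", chunks : {$sum : 1}}}
]);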
--------------------------------------------------------------------------------
/pre_alloc.bash:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Tool to pre-allocate MongoDB data files for the MMAPv0/1 storage engine
#
# Requirements: fallocate command from Google ftools - https://code.google.com/p/linux-ftools/#fallocate
#
# Author: Adam Comerford (adam@comerford.cc)
#
# Options:
#
# -s - size of data files to allocate in MB, not including namespace file (default: 192)
# -f - path to fallocate binary (default is to just call the fallocate command, look in PATH)
# -n - name of database (default: data)
# -d - where to place the files (default: /data/db)

# Error/safety checks
#
# TODO - check for existing files first, if they exist, bail out and carp
# TODO - check for sufficient free space on target device
# TODO - add smallfiles option (divide by 4)

# set the defaults
SIZE=192
NAME="data"
FBINARY="fallocate"
DBPATH="/data/db"


# Parse arguments, overwrite defaults when necessary, error if invalid arg passed
while getopts ":s:n:f:d:" opt; do
    case $opt in
        s) SIZE="$OPTARG"
            ;;
        n) NAME="$OPTARG"
            ;;
        f) FBINARY="$OPTARG"
            ;;
        d) DBPATH="$OPTARG"
            ;;
        \?) echo "Invalid option -$OPTARG" >&2
            ;;
    esac
done

command -v $FBINARY >/dev/null || { echo "fallocate command not found in PATH, cannot continue, please install util-linux package or similar, or provide the full path to the command."; exit 1; }

# Create namespace file first - always needed

$FBINARY -l $((1024 * 1024 * 16)) $DBPATH/$NAME.ns

# calculate the number of files that will be required
# 4032 is the magic number (64+128+256+512+1024+2048), anything beyond that will have multiples of 2048
NUMFILES=0

if [ $SIZE -le 4032 ] ;
then
    # only a few cases to deal with here
    if [ $SIZE -le 192 ] ;
    then
        NUMFILES=2
    elif [ $SIZE -le 448 ] ;
    then
        NUMFILES=3
    elif [ $SIZE -le 960 ] ;
    then
        NUMFILES=4
    elif [ $SIZE -le 1984 ] ;
    then
        NUMFILES=5
    else
        NUMFILES=6
    fi
else
    # for larger than 4032, will always be 7 plus however many 2048MB files are needed additionally
    NUMFILES=$(( (($SIZE - 4032)/2048) + 7 ))
fi

ALLOCATED=0
while [ $ALLOCATED -lt $NUMFILES ]; do
    case $ALLOCATED in
        0)
            $FBINARY -l $((1024 * 1024 * 64)) $DBPATH/$NAME.$ALLOCATED
            ((ALLOCATED++))
            ;;
        1)
            $FBINARY -l $((1024 * 1024 * 128)) $DBPATH/$NAME.$ALLOCATED
            ((ALLOCATED++))
            ;;
        2)
            $FBINARY -l $((1024 * 1024 * 256)) $DBPATH/$NAME.$ALLOCATED
            ((ALLOCATED++))
            ;;
        3)
            $FBINARY -l $((1024 * 1024 * 512)) $DBPATH/$NAME.$ALLOCATED
            ((ALLOCATED++))
            ;;
        4)
            $FBINARY -l $((1024 * 1024 * 1024)) $DBPATH/$NAME.$ALLOCATED
            ((ALLOCATED++))
            ;;
        *)
            $FBINARY -l $((1024 * 1024 * 2048)) $DBPATH/$NAME.$ALLOCATED
            ((ALLOCATED++))
            ;;
    esac
done
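
# A usage sketch (values are illustrative): pre-allocate ~4GB of data files
# for a database named "test" under /data/db:
#
#   ./pre_alloc.bash -s 4096 -n test -d /data/db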
--------------------------------------------------------------------------------
/crud.js:
--------------------------------------------------------------------------------
// Some functions for inserting quickly (unack'ed writes) (Create)
// Preheating data to get it into memory (Read)
// Changing the data, with/without growth (Update)
// Removing data (Delete)

// First function is used to create a random set of data, the C in CRUD
// Defaults/Assumptions: _id index only, collection always called "data"
// Takes 4 arguments, the last one optional:
// numGB is the approximate data size to create in GiB (integer usually, but any number should work)
// dbName is the database to use (the collection is always called data)
// usePowerOf2 is a boolean to allow you to select the storage strategy
// delay is optional - it will introduce a sleep into the loop to slow down the operations, defaults to 0

createData = function(numGB, dbName, usePowerOf2, delay) {
    var db1 = db.getSiblingDB(dbName);
    // set powerOf2 per the boolean, but need to handle it differently depending on whether the collection already exists
    // NOTE that the second option will turn it off for all new collections
    if(db1.data.findOne()){
        db1.runCommand({ collMod : "data", usePowerOf2Sizes : usePowerOf2 });
    } else {
        db1.adminCommand({ setParameter : 1, newCollectionsUsePowerOf2Sizes : usePowerOf2 });
    };
    // set the delay as passed in, with the default of 0 as a fallback
    delay = typeof delay !== 'undefined' ? delay : 0;
    // check the shell version, if 2.5+ set legacy mode for unacked writes (for speed)
    var shellVersion = version().split('.').map(Number);
    if ( shellVersion[0] > 2 ) {
        db1.getMongo().forceWriteMode("legacy");
    } else if (shellVersion[0] == 2) {
        if (shellVersion[1] > 4) {
            db1.getMongo().forceWriteMode("legacy");
        }
    };

    // with the document we are using, 68 iterations of this loop will get you ~1033MiB of data (not including indexes), so use that as a multiplier
    var startTime = new Date();

    for(var j = 0; j < (numGB * 68); j++){
        // going to create a big array of docs, then insert them
        var bigDoc = [];
        for(var i = 0; i < 66400; i++){ // 132800 gets pretty close to the max doc size but takes a bit too long to generate on my machine, leaving gaps, so divide by 2
            var randomNum = Math.random(); // generate a single random number per loop iteration
            var ranDate = new Date(Math.floor(1500000000000 * randomNum));
            // let's construct a random ObjectId based on the number, basically construct a string with the info we need and some randomness
            // first piece is 4 bytes (8 hex digits), need to pad with zeroes for low values (same with random end piece)
            // next pieces per the spec are 6 hex digits for the machine, 4 digits for the PID
            // instead we will insert 10 placeholder characters for expedience
            // then, round things out with 3 bytes of randomness per the spec, and use the increment on the loop to avoid collisions
            var ranString = (Math.floor(randomNum * 1500000000).toString(16)).pad(8, false, 0) + "adacefd123" + ((Math.floor(randomNum * 16710815) + i).toString(16)).pad(6, false, 0);
            // this one would be better, but too slow to generate:
            // var ranString = ((Math.floor(1500000000 * randomNum)).toString(16)).pad(8, false, "0") + db1.version().replace(/\./g, "") + "adacefd" + ((Math.floor(randomNum * 9920329) + i).toString(16)).pad(6, false, "0");
            var ranId = new ObjectId(ranString);
            // To explain the document:
            // _id and ranDate are both based on the same randomly generated date, but ranDate has millis and is a bit easier to parse
            // After that we add an integer, boolean and a small array with a string and a number (array is useful for growth later)
            bigDoc.push({_id : ranId, ranDate : ranDate, ranInt : NumberInt(randomNum * 1000000), ranBool : (randomNum < 0.5 ? true : false), smallArray : [randomNum, randomNum.toString()]});
            // if there is a non-default delay specified, use it
            if(delay > 0){
                sleep(delay);
            }
        };
        db1.data.insert(bigDoc);

        if(j == (numGB * 34)){
            print("Approximately 50% done: " + (j * 66400) + " docs inserted in " + (new Date() - startTime)/1000 + " seconds");
        };

    };
    var timeTaken = ((new Date() - startTime)/1000);
    print("Run complete: " + (numGB * 68 * 66400) + " docs inserted in " + timeTaken + " seconds. Average insertion rate: " + ((numGB * 68 * 66400)/timeTaken) + " docs per second");
    // clean up the write mode if altered at the top
    if(db1.getMongo().writeMode() == "legacy"){
        db1.getMongo().forceWriteMode("commands");
    }
};

// Sample runs of the createData script on a standalone mongod, both run on same 8 core Linux host, not IO bound
// Single thread CPU for shell was close to max, as was database lock on mongod - results within margin of error for the versions:

// 2.6.3 - Run complete: 4515200 docs inserted in 148.452 seconds. Average insertion rate: 30415.218387088084 docs per second
// 2.4.10 - Run complete: 4515200 docs inserted in 146.916 seconds. Average insertion rate: 30733.20809169866 docs per second
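
// A minimal usage sketch (the database name is illustrative): create ~2GiB of
// data in a database called "crudTest" with powerOf2Sizes on, no insert delay:
//
//   load("crud.js");
//   createData(2, "crudTest", true);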

// Next, the reads - we'll do this randomly across the set
// Takes 2 arguments:
// numGB is the approximate amount of data to touch in GiB (integer usually, but any number should work)
// dbName is the database to use (the collection is always called data)

preHeatRandomData = function(numGB, dbName) {

    // We will brute force this basically, the _id is indexed and we know how it was constructed
    // The first 8 hex digits of the pseudo ObjectID we created have a maximum of 16^8 values, but likely far fewer docs
    // A bit of experimentation tells me that using all 8 digits is too slow (low hit rate)
    // Even 6 digits is still only 256 second ranges and yielded an average of less than 2 docs per range in limited tests
    // Hence, we will do a range query using the first 5 digits plus fixed strings to create the start/end of the range
    // That's 16^3 or 4096 secs, so not an unreasonable range to query in general (1GiB set tests yielded ~12 docs per range)
    // Every time we do the range query, we will call explain, and then increment the results by the nscanned count
    //
    // Note: decent chance there will be collisions, so may need to "oversubscribe" the amount of data to be touched

    var docHits = 0;
    var noHits = 0;
    var iterations = 0; // not really needed other than for stats

    var db1 = db.getSiblingDB(dbName);

    var startTime = new Date(); // time the loop
    while(docHits < (5000000 * numGB)) {
        // the creation of the string is identical to the creation code
        var randomNum = Math.random();
        var ranString = (Math.floor(randomNum * 1500000000).toString(16)).pad(8, false, 0);
        // we just strip the last 3 characters to allow us to create ranges - 3 characters is only 4096 seconds
        ranString = ranString.substring(0, ranString.length - 3);
        var beginId = new ObjectId(ranString + "000adacefd123000000");
        var endId = new ObjectId(ranString + "fffadacefd123ffffff");
        // simple ranged query on _id with an explicit hint and an explain so we exhaust the cursor and get useful stats back
        var result = db1.data.find({_id : {$gte : beginId, $lte : endId}}).hint({_id : 1}).explain();
        if(result.nscanned > 0) {
            docHits += result.nscanned; // increment by number scanned if not empty
        } else {
            noHits++; // record the lack of hits
        };
        iterations++; // total iterations
        // warn about low hit rates at each 250k no hit occurrences
        if((noHits % 250000) == 0 && noHits > 0){
            print("Warning: hit rate is poor - just passed " + noHits + " iterations with no hits (current doc hits are: " + docHits + " out of " + (5000000 * numGB) + " or " + docHits/(50000 * numGB) + "%).");
        };
    };
    var endTime = new Date();
    // some info on the time taken, hit rate etc.
    print(numGB + "GiB of data loaded (" + (numGB * 5000000) + " docs), took " + (endTime - startTime)/1000 + " seconds to complete (average: " + (numGB * 5000000)/((endTime - startTime)/1000) + " docs/sec)");
    print(noHits + " queries hit 0 documents (" + (noHits*100)/iterations + "%) and there were " + iterations + " total iterations.");
    print("Average number of docs scanned per iteration (hits only): " + (numGB * 5000000)/(iterations - noHits));
};

// update docs, optionally making them grow (creates free list)

updateRandomData = function(numGB, dbName, growDocs){

    // quick test shows that with powerOf2Sizes, need to add 9 ObjectIds to the smallArray to trigger a move
    // so growing the docs will take a lot more updates in order to complete the run
    // testing for a move is a little clunky until we get better write command stats, so pushing that to its own function for now
    var db1 = db.getSiblingDB(dbName);
    var updateHits = 0;
    var growthOverhead = 0;
    var startTime = new Date(); // time the loop
    while(updateHits < (5000000 * numGB)){
        var result = null;
        // loop until we have a valid result, regenerating the range on each miss, and we will use the ranInt to not hit docs twice
        while(result == null){
            // we'll re-use the logic from the finds, create a range to look for a candidate document
            var randomNum = Math.random();
            var ranString = (Math.floor(randomNum * 1500000000).toString(16)).pad(8, false, 0);
            // we just strip the last 3 characters to allow us to create ranges - 3 characters is only 4096 seconds
            // this is looking pretty inefficient at finding data in a 2GB data set for testing, may need to increase the ranges
            ranString = ranString.substring(0, ranString.length - 3);
            var beginId = new ObjectId(ranString + "000adacefd123000000");
            var endId = new ObjectId(ranString + "fffadacefd123ffffff");
            // simple find on _id with a hint, checking hasNext() before taking the first doc off the cursor
            // (calling next() on an empty cursor would throw, so a miss just means we try a new range)
            var cursor = db1.data.find({_id : {$gte : beginId, $lte : endId}, ranInt : {$lte : 1000000}}).hint({_id : 1});
            if(cursor.hasNext()){
                result = cursor.next();
            }
        }
        if(growDocs){
            growthOverhead += pushUntilMoved(dbName, result._id, false);
            db1.data.update({_id : result._id}, {$inc : {ranInt : 1000000}});
            updateHits++;
        } else {
            db1.data.update({_id : result._id}, {$inc : {ranInt : 1000000}});
            updateHits++;
        }
    }
    var endTime = new Date();

    if(growDocs){
        print("Updated " + updateHits + " docs in " + (endTime - startTime)/1000 + " seconds (avg: " + (5000000 * numGB)/((endTime - startTime)/1000) + " docs/sec). Growth required an average of " + (growthOverhead/updateHits) + " pushes to the array.");
    } else {
        print("Updated " + updateHits + " docs in " + (endTime - startTime)/1000 + " seconds (avg: " + (5000000 * numGB)/((endTime - startTime)/1000) + " docs/sec).");
    };

}
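
// A usage sketch for the read and update helpers above (the database name is
// illustrative, and assumes createData(2, "crudTest", true) was run first):
//
//   preHeatRandomData(2, "crudTest");
//   updateRandomData(2, "crudTest", true); // true = grow docs to force moves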

// this little function will take an ObjectID, then push new IDs to the smallArray until the document moves on disk
// verbose toggles information about old/new location and number of pushes required (will be more for powerOf2 docs)
// it's needed to provide the move functionality in the update function
pushUntilMoved = function(dbName, docID, verbose){
    var db1 = db.getSiblingDB(dbName);
    var currentLoc = db1.data.find({_id : docID}).showDiskLoc().next().$diskLoc;
    var newLoc = currentLoc;
    var pushes = 0;
    while((currentLoc.file == newLoc.file) && (currentLoc.offset == newLoc.offset)){
        db1.data.update({_id : docID}, {$push : {smallArray : new ObjectId()}});
        newLoc = db1.data.find({_id : docID}).showDiskLoc().next().$diskLoc;
        pushes++;
    }
    if(verbose){
        print("Old location: file: " + currentLoc.file + " offset: " + currentLoc.offset);
        print("New location: file: " + newLoc.file + " offset: " + newLoc.offset);
        print("Pushes required: " + pushes);
    }
    return pushes;
}


// delete docs, create holes and a free list

deleteRandomData = function(numGB, dbName){
    var db1 = db.getSiblingDB(dbName);
    var delHits = 0;

    // this one is actually far more simple in 2.6 with the write results, so writing that first, may not bother with 2.4
    var startTime = new Date(); // time the loop
    while(delHits < (5000000 * numGB)){
        // we'll re-use the logic from the finds/updates, create a range to look for a candidate document
        var randomNum = Math.random();
        var ranString = (Math.floor(randomNum * 1500000000).toString(16)).pad(8, false, 0);
        ranString = ranString.substring(0, ranString.length - 3);
        var beginId = new ObjectId(ranString + "000adacefd123000000");
        var endId = new ObjectId(ranString + "fffadacefd123ffffff");
        var result = db1.data.remove({_id : {$gte : beginId, $lte : endId}}, 1); // just remove one doc at a time
        delHits += result.nRemoved;
    }
    var endTime = new Date();
    print("Removed " + delHits + " docs in " + (endTime - startTime)/1000 + " seconds (avg: " + (5000000 * numGB)/((endTime - startTime)/1000) + " docs/sec).");

}
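
// A usage sketch (illustrative; requires a 2.6+ shell, since the remove
// result's nRemoved field is used above):
//
//   deleteRandomData(2, "crudTest");
--------------------------------------------------------------------------------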