├── .gitignore ├── README.md ├── TODO.md ├── exps ├── .gitignore ├── README.md ├── __init__.py ├── api │ ├── __init__.py │ ├── abstractcoordinator.py │ ├── abstractworker.py │ ├── directchannel.py │ ├── message.py │ ├── messageprocessor.py │ ├── mongostat.py │ └── results.py ├── benchmark.py ├── benchmarks │ ├── __init__.py │ ├── blog │ │ ├── __init__.py │ │ ├── blogcoordinator.py │ │ ├── blogworker.py │ │ ├── constants.py │ │ ├── maxnumofcomments.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── rand.py │ │ │ └── zipf.py │ ├── replay │ │ ├── __init__.py │ │ ├── dbcombiner.py │ │ ├── dbdenormalizer.py │ │ ├── dbmigrator.py │ │ ├── denormalizer.py │ │ ├── replaycoordinator.py │ │ ├── replayworker.py │ │ └── unittest │ │ │ └── test_combiner.py │ └── tpcc │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── drivers │ │ ├── __init__.py │ │ ├── abstractdriver.py │ │ └── mongodbdriver.py │ │ ├── runtime │ │ ├── __init__.py │ │ ├── executor.py │ │ ├── loader.py │ │ ├── nurand.py │ │ ├── rand.py │ │ └── scaleparameters.py │ │ ├── tpcc.sql │ │ ├── tpcccoordinator.py │ │ └── tpccworker.py └── tools │ ├── __init__.py │ ├── dba-export.py │ ├── design_deserializer.py │ ├── dump-csv.py │ ├── duplicator.py │ ├── load-csv.py │ └── spencerdesign2json.py ├── libs ├── argparse │ ├── __init__.py │ └── argparse.py ├── mongokit │ ├── __init__.py │ ├── auth.py │ ├── collection.py │ ├── connection.py │ ├── cursor.py │ ├── database.py │ ├── document.py │ ├── grid.py │ ├── helpers.py │ ├── master_slave_connection.py │ ├── migration.py │ ├── mongo_exceptions.py │ ├── operators.py │ ├── schema_document.py │ └── versioned_document.py └── sqlparse │ ├── __init__.py │ ├── engine │ ├── __init__.py │ ├── filter.py │ └── grouping.py │ ├── exceptions.py │ ├── filters.py │ ├── formatter.py │ ├── functions.py │ ├── keywords.py │ ├── lexer.py │ ├── pipeline.py │ ├── sql.py │ ├── tokens.py │ └── utils.py ├── src ├── OVERVIEW.md ├── README.md ├── catalog │ ├── __init__.py │ ├── collection.py │ └── utilmethods.py ├── costmodel │ ├── __init__.py │ ├── abstractcostcomponent.py │ ├── costmodel.py │ ├── disk │ │ ├── __init__.py │ │ ├── diskcostcomponent.py │ │ ├── fastlrubuffer.py │ │ ├── fastlrubufferusingwindow.py │ │ └── lrubuffer.py │ ├── network │ │ ├── __init__.py │ │ └── networkcostcomponent.py │ ├── nodeestimator.py │ ├── skew │ │ ├── __init__.py │ │ └── skewcostcomponent.py │ └── state.py ├── d4.py ├── inputs │ ├── __init__.py │ ├── abstractconverter.py │ ├── mongodb │ │ ├── README │ │ ├── __init__.py │ │ ├── dependencyfinder.py │ │ ├── mongosniffconverter.py │ │ ├── normalizer.py │ │ ├── parser.py │ │ ├── reconstructor.py │ │ ├── salt_crack.py │ │ ├── sample.txt │ │ ├── samplecreator.py │ │ ├── sessionizer.py │ │ └── workload_info.py │ └── mysql │ │ ├── __init__.py │ │ ├── mysqlconverter.py │ │ ├── sql2mongo.py │ │ └── utilmethods.py ├── multithreaded │ ├── __init__.py │ ├── message.py │ ├── messageprocessor.py │ ├── multi_search.py │ ├── multi_search_coordinator.py │ └── multi_search_worker.py ├── sanitizer │ ├── __init__.py │ ├── anonymize.py │ ├── anonymized-sample.txt │ ├── out.txt │ ├── sample-anonymize.txt │ └── sample.dat ├── search │ ├── __init__.py │ ├── abstractdesigner.py │ ├── bbsearch.py │ ├── design.py │ ├── designcandidates.py │ ├── designer.py │ ├── initialdesigner.py │ ├── lnsdesigner.py │ ├── randomdesigner.py │ └── utilmethods.py ├── util │ ├── __init__.py │ ├── configutil.py │ ├── constants.py │ ├── histogram.py │ ├── mathutil.py │ ├── termcolor.py │ └── utilmethods.py └── workload │ ├── __init__.py │ ├── 
ophasher.py │ ├── session.py │ ├── utilmethods.py │ └── workloadcombiner.py └── tests ├── README ├── __init__.py ├── api └── unittest_results.py ├── catalog └── unittest_utilmethods.py ├── costmodel ├── costmodeltestcase.py ├── costmodeltestcase_guessIndex.py ├── costmodeltestcase_index.py ├── costmodeltestcase_index_withprojection.py ├── disk │ ├── unittest_diskcostcomponent_guessIndex.py │ ├── unittest_diskcostcomponent_indexinsertionpenalty.py │ ├── unittest_diskcostcomponentindexes.py │ ├── unittest_diskcostcomponentindexes_withprojection.py │ └── unittest_fastlrubuffer.py ├── network │ ├── unittest_networkcostcomponent.py │ └── unittest_networkcostcomponenttpcc.py ├── skew │ └── unittest_skewcostcomponent.py ├── unittest_costmodel.py ├── unittest_costmodel_denormalization.py ├── unittest_lrubuffer.py └── unittest_nodeestimator.py ├── exps ├── replay │ ├── unittest_denormalizer.py │ └── workloadgenerator.py └── tools │ └── unittest_design_deserializer.py ├── inputs ├── mongodb │ └── unittest_reconstructor.py ├── mysql │ └── unittest_sql2mongo.py └── unittest_abstractconverter.py ├── mongodbtestcase.py ├── runTests.sh ├── sanitizer ├── trace-anon.out ├── trace-clean.out └── unittest_sanitizer.py ├── search ├── bbsearch-test.py ├── unittest_bbsearch.py ├── unittest_bbsearch_CompoundKeyIterator.py ├── unittest_bbsearch_ShardKeyIterator.py ├── unittest_design.py ├── unittest_findExpectedDesign.py ├── unittest_initialdesigner.py ├── unittest_lnsdesigner.py └── unittest_utilmethods.py ├── tpcctestcase.py ├── util ├── unittest_configutil.py ├── unittest_histogram.py ├── unittest_mathutil.py └── unittest_utilmethods.py └── workload ├── unittest_ophasher.py ├── unittest_utilmethods.py ├── unittest_workloadcombiner.py ├── unittest_workloadcombinerwithtpcc.py └── workloadcombinersetup.py
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.pyo
.#*
*.kate-swp
*.config
.idea
nosetests.xml
src/*.png
*~
*.log
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# mongodb-d4

**D4** is an automated tool for generating **d**istributed **d**ocument **d**atabase **d**esigns for applications
running on MongoDB. This tool specifically targets applications running highly concurrent workloads, and thus its
designs are tailored to the unique properties of large-scale, Web-based applications. It can also be used to assist
in porting MySQL-based applications to MongoDB.

Using a sample workload trace from either a document-oriented or relational database application, **D4** will compute
the best database design that optimizes the throughput and latency of a document DBMS.
The three design elements that D4 can select for an application are:

+ Sharding Keys
+ Indexes
+ Collection (De)normalization

For More Information:

## Dependencies
+ python-pymongo
+ python-yaml
+ python-MySQLdb (optional)

## Authors
+ [Andy Pavlo](http://www.cs.brown.edu/~pavlo)
+ [Yang Zou](http://www.cs.brown.edu/~yang)
+ [Michail Michailidis](http://www.cs.brown.edu/~mmichail)
+ [Stan Zdonik](http://www.cs.brown.edu/~sbz)

## Past Contributors
+ [Christopher Keith](http://www.linkedin.com/pub/christopher-keith/38/882/81a)
+ [Emanuel Buzek](http://www.linkedin.com/pub/emanuel-buzek/2/655/b04)

## Acknowledgements
This work is supported (in part) by an [Amazon AWS Research Grant](http://aws.amazon.com/education/).
Additional assistance is also provided by [10gen, Inc.](http://10gen.com)
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
## What Has Jian Done?

* Computation of touched nodes for range shard keys

First, partition every field into ranges during the workload analysis stage according to its distinct values. For example, if a field has distinct values [1,2,3,4,5,6,7,8] and we have 4 shards, the ranges array will be [1,3,5,7]. Each number in the ranges array indicates the minimum value of its range.

Then we use the ranges information generated in the workload analysis stage to compute the touched nodes for range shard keys. If a query contains an equality on key A with value 6 in the above example, the query will access shard number 2 (counting from 0); a code sketch of this lookup appears below, after the benchmark framework README.

* Candidate generation for shard keys

We only choose shard keys with high cardinality and high reference counts. Based on cardinality and reference count, d4 generates a score for each key and then sorts all keys by those scores. We set a threshold to filter out keys with low scores.

When iterating over combinations of shard keys, compound keys with more fields are evaluated first.

* Estimation of the number of shards

Although the number of shards is set by the user, not all collections can use all of them. For example, collections sharded on low-cardinality keys, or collections with small documents, will only be spread across a subset of the shards. So we need to estimate the number of shards each collection will occupy under each design, and then use that number to calculate the cost.

* Latencies report for the replay framework

Added a latency report to the replay framework, which also outputs the top slowest queries for debugging.
* Lots of bug fixes

Fixed bugs in the input module, the search algorithms, the cost models, and the benchmark modules.

## Future Work:

* [Issue 37](https://github.com/cmu-db/mongodb-d4/issues/37)
* [Issue 38](https://github.com/cmu-db/mongodb-d4/issues/38)
* [Issue 39](https://github.com/cmu-db/mongodb-d4/issues/39)
--------------------------------------------------------------------------------
/exps/.gitignore:
--------------------------------------------------------------------------------
mongostat
--------------------------------------------------------------------------------
/exps/README.md:
--------------------------------------------------------------------------------
# MongoDB Benchmark Framework

This framework is able to run different benchmarks using MongoDB. It was originally based
on my TPC-C benchmark framework that I used in my NoSQL course in spring 2011. It was then
forked by one of my students in the summer of 2011. I then grabbed his changes and modified
it further to support the different types of experiments that we will need for this work.

**TLDR:**
This code is based on https://github.com/yanglu/BigBenchmark
which was originally based on: https://github.com/apavlo/py-tpcc


## Dependencies:
+ python-execnet

## Example Usage

1. Create a configuration file for the benchmark that you are going to run.
   For this example, we will use the `blog` benchmark.

       ./benchmark.py --print-config blog > blog.config

   Modify the configuration file to change the parameters according to your environment setup.

2. Load the benchmark database into MongoDB. The `--no-execute` option will prevent
   the framework from executing the workload portion of the benchmark, while the `--reset` option
   will clear out the contents of the database if it already exists.

       ./benchmark.py --config=blog.config --no-execute --reset blog

3. Now execute the workload driver to perform the experiment. The final throughput results
   will be printed at the end. Note here that the `--no-load` option will prevent the framework
   from repeating the loading step.

       ./benchmark.py --config=blog.config --no-load blog


## Configuration

+ **logfile**:
  This controls where the worker threads will write their log messages out to.
  It will not be overwritten on each invocation.
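The range lookup described in TODO.md above is small enough to sketch directly. This snippet is illustrative rather than taken from the d4 sources (the function name and the `ranges` argument are assumptions), and it reproduces the worked example: with range minimums [1, 3, 5, 7], an equality on value 6 maps to shard 2, counting from 0.

```python
import bisect

def shard_for_value(ranges, value):
    """Return the 0-based index of the shard whose range contains `value`.

    `ranges` holds each shard's minimum value in ascending order, as produced
    by the workload analysis stage (e.g. [1, 3, 5, 7] for the distinct values
    1..8 spread over 4 shards).
    """
    # bisect_right counts how many range minimums are <= value; the
    # containing range is the one starting just before that point.
    return bisect.bisect_right(ranges, value) - 1

assert shard_for_value([1, 3, 5, 7], 6) == 2   # the TODO.md example
assert shard_for_value([1, 3, 5, 7], 1) == 0   # smallest value -> first shard
assert shard_for_value([1, 3, 5, 7], 8) == 3   # largest value -> last shard
```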
--------------------------------------------------------------------------------
/exps/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__all__ = ["benchmark"]
--------------------------------------------------------------------------------
/exps/api/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__all__ = [ "messageprocessor", "message", "results" ]
--------------------------------------------------------------------------------
/exps/api/directchannel.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------
# Copyright (C) 2012
# Andy Pavlo - http://www.cs.brown.edu/~pavlo/
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# -----------------------------------------------------------------------
import logging
import pickle  # used to serialize Message objects below

from messageprocessor import *
from message import *

LOG = logging.getLogger(__name__)

class DirectChannel:

    def __init__(self):
        self.gateway = None # Needed by message.py
        self.queue = [ ]
        self.processor = MessageProcessor(self)

        m = Message(MSG_NOOP, True)
        self.defaultResponse = pickle.dumps(m, -1)
        self.response = None
    ## DEF

    def __iter__(self):
        return self

    def next(self):
        if len(self.queue) == 0:
            raise StopIteration
        return self.queue.pop(0)

    def send(self, msg):
        m = getMessage(msg)
        if m.header in [ MSG_INIT_COMPLETED, MSG_LOAD_COMPLETED, MSG_EXECUTE_COMPLETED ]:
            self.response = msg
        else:
            self.queue.append(msg)
            self.processor.processMessage()
    ## DEF

    def receive(self):
        r = None
        if self.response is not None:
            r = self.response
            self.response = None
        else:
            r = self.defaultResponse
        return r
## CLASS

--------------------------------------------------------------------------------
/exps/api/mongostat.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------
# Copyright (C) 2012
# Andy Pavlo - http://www.cs.brown.edu/~pavlo/
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# -----------------------------------------------------------------------
import os
import threading
import subprocess
import shlex
import logging

LOG = logging.getLogger(__name__)

class MongoStatCollector(threading.Thread):

    def __init__(self, host, outputFile, outputInterval=10, showAll=True):
        threading.Thread.__init__(self)
        self.host = host
        self.outputFile = outputFile
        self.outputInterval = outputInterval
        self.showAll = showAll
        self.daemon = True
        self.process = None
        self.record = False
        self.stopThread = False
    ## DEF

    def startRecording(self):
        LOG.info("Starting stat data collection [%s]", self.outputFile)
        self.record = True

    def stopRecording(self):
        LOG.info("Stopping stat data collection [%s]", self.outputFile)
        self.record = False

    def run(self):
        command = "mongostat --host %s" % self.host
        if self.showAll: command += " --all"
        command += " %d" % self.outputInterval

        args = shlex.split(command)
        LOG.info("Forking command: %s" % args)
        self.process = subprocess.Popen(args,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT,
                                        shell=False,
        )
        LOG.info("Writing MongoStat output to '%s'" % self.outputFile)
        header = None
        headerHash = None
        writeHeader = True
        with open(self.outputFile, "w") as fd:
            while not self.stopThread:
                self.process.stdout.flush()
                line = self.process.stdout.readline()
                if header is None and line.find("flushes") != -1:
                    header = line
                    headerHash = hash(header)
                if self.record:
                    if writeHeader and header is not None:
                        fd.write(header)
                        writeHeader = False
                    if hash(line) != headerHash:
                        fd.write(line)
                        fd.flush()
            # WHILE
        LOG.debug("MongoStatCollection thread is stopping")
    ## DEF

    def stop(self):
        if self.process is not None:
            LOG.debug("Killing MongoStatCollection process %d [%s]", self.process.pid, self.outputFile)
            self.stopThread = True
            self.process.kill()
    ## DEF

## CLASS
--------------------------------------------------------------------------------
/exps/benchmarks/__init__.py:
--------------------------------------------------------------------------------
#
--------------------------------------------------------------------------------
/exps/benchmarks/blog/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__all__ = ["blogcoordinator", "blogworker"]
--------------------------------------------------------------------------------
/exps/benchmarks/blog/constants.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------
# Copyright (C) 2012
# Andy Pavlo - http://www.cs.brown.edu/~pavlo/
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice 
and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 20 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 21 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 22 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | # OTHER DEALINGS IN THE SOFTWARE. 24 | # ----------------------------------------------------------------------- 25 | 26 | from datetime import datetime 27 | 28 | #DB_NAME = 'microblog' 29 | ARTICLE_COLL = 'articles' 30 | COMMENT_COLL = 'comments' 31 | 32 | NUM_AUTHORS = 1024 33 | NUM_TAGS = 6000 34 | NUM_TAGS_PER_ARTICLE = 40 35 | 36 | ARTICLE_TITLE_SIZE = 200 37 | ARTICLE_CONTENT_SIZE = 4096 38 | COMMENT_CONTENT_SIZE = 512 39 | MAX_COMMENT_RATING = 100 40 | NUM_ARTICLES = 10000 # this is multiplied by the scale factor 41 | NUMBER_OF_DATE_SUBRANGES = 8 # this breaks the interval between START_DATE and STOP_DATE in X segments 42 | 43 | # Special atomic counter 44 | NEXT_ARTICLE_CTR_ID = -9999 45 | NEXT_ARTICLE_CTR_KEY = "nextArticleId" 46 | 47 | #deprecated 48 | #AUTHOR_NAME_SIZE = 20 49 | #MAX_AUTHOR_SIZE = 20 50 | #MAX_TITLE_SIZE = 200 51 | #MAX_CONTENT_SIZE = 102400 52 | #MAX_COMMENT_SIZE = 1024 53 | #MAX_NUM_COMMENTS = 100 54 | 55 | 56 | 57 | 58 | WORKLOAD_READ_PERCENT = 90 59 | WORKLOAD_WRITE_PERCENT = 10 60 | assert (WORKLOAD_READ_PERCENT+WORKLOAD_WRITE_PERCENT) == 100 61 | 62 | START_DATE = datetime.strptime('11/1/2011 1:30 PM', '%m/%d/%Y %I:%M %p') 63 | STOP_DATE = datetime.strptime('1/1/2012 1:30 PM', '%m/%d/%Y %I:%M %p') 64 | 65 | # Experiment Type Codes 66 | EXP_SHARDING = "sharding" 67 | EXP_DENORMALIZATION = "denormalization" 68 | EXP_INDEXING = "indexing" 69 | EXP_ALL = [ EXP_SHARDING, EXP_DENORMALIZATION, EXP_INDEXING ] 70 | 71 | # Sharding Config Types 72 | SHARDEXP_RANGE = 0 73 | SHARDEXP_HASH = 1 74 | SHARDEXP_ALL = [SHARDEXP_RANGE, SHARDEXP_HASH] 75 | 76 | # Indexing Config Types 77 | 78 | INDEXEXP_8020 = 0 # 80% reads / 20% writes 79 | INDEXEXP_9010 = 1 # 90% reads / 10% writes 80 | INDEXEXP_ALL = [INDEXEXP_8020, INDEXEXP_9010] 81 | -------------------------------------------------------------------------------- /exps/benchmarks/blog/maxnumofcomments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | import os 4 | import string 5 | import re 6 | import logging 7 | import traceback 8 | import pymongo 9 | import constants 10 | from util import * 11 | from pprint import pprint, pformat 12 | 13 | 14 | # quick and dirty 15 | 16 | def test(): 17 | LOG = logging.getLogger(__name__) 18 | conn = None 19 | targetHost = "bronze.cs.brown.edu" 20 | targetPort = 27017 21 | try: 22 | conn = pymongo.Connection(targetHost, targetPort) 23 | except: 24 | LOG.error("Failed to connect to target MongoDB at %s:%s" % (targetHost, targetPort)) 25 | raise 26 | #assert conn 27 | db = conn["test"] 28 | titleSize = 150 29 | contentSize = 6000 30 | numComments = 100000000 31 | articleId = 1 32 | articleDate = randomDate(constants.START_DATE, constants.STOP_DATE) 33 | title = randomString(titleSize) 34 | slug = list(title.replace(" ", "")) 35 | if len(slug) > 64: slug = slug[:64] 36 | for idx in xrange(0, len(slug)): 37 | if random.randint(0, 10) == 0: 38 | slug[idx] = 
"-" 39 | ## FOR 40 | slug = "".join(slug) 41 | article = { 42 | "id": articleId, 43 | "title": title, 44 | "date": articleDate, 45 | "author": 1, 46 | "slug": slug, 47 | "content": randomString(contentSize), 48 | "numComments": numComments, 49 | } 50 | db[constants.ARTICLE_COLL].insert(article) 51 | print("perasa"); 52 | commentCtr=0 53 | lastDate = articleDate 54 | for ii in xrange(0, numComments): 55 | lastDate = randomDate(lastDate, constants.STOP_DATE) 56 | commentAuthor = randomString(15) 57 | commentSize = 1024 58 | commentContent = randomString(commentSize) 59 | 60 | comment = { 61 | "id": commentCtr, 62 | "article": articleId, 63 | "date": lastDate, 64 | "author": commentAuthor, 65 | "comment": commentContent, 66 | "rating": 100 67 | } 68 | commentCtr += 1 69 | db[constants.ARTICLE_COLL].update({"id": articleId},{"$push":{"comments":comment}},safe=True) 70 | if commentCtr==1 or commentCtr%1000==0: 71 | print(commentCtr) 72 | # def 73 | if __name__ == '__main__': 74 | #executed as script 75 | # do something 76 | test() 77 | -------------------------------------------------------------------------------- /exps/benchmarks/blog/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from rand import * 4 | from zipf import * -------------------------------------------------------------------------------- /exps/benchmarks/blog/util/rand.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2012 4 | # Andy Pavlo - http://www.cs.brown.edu/~pavlo/ 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 20 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 21 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 22 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | # OTHER DEALINGS IN THE SOFTWARE. 
24 | # ----------------------------------------------------------------------- 25 | 26 | import random 27 | import string 28 | from datetime import timedelta 29 | 30 | def randomString(size, chars=string.ascii_uppercase + string.digits): 31 | return ''.join(random.choice(chars) for x in range(size)) 32 | ## DEF 33 | 34 | #Discrete Date Generator 35 | def randomDate(start, end): 36 | """This returns a random datetime between two datetime objects but the time is the same.""" 37 | delta = end - start 38 | #int_delta = (delta.days * 24 * 60 * 60) + delta.seconds 39 | #random_second = random.randrange(int_delta) 40 | random_days = random.randrange(delta.days) 41 | return (start + timedelta(days=random_days)) 42 | ## DEF -------------------------------------------------------------------------------- /exps/benchmarks/blog/util/zipf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | import random 5 | import bisect 6 | import numpy as np 7 | 8 | 9 | ## ----------------------------------------------------- 10 | ## Zipfian Distribution Generator 11 | ## ----------------------------------------------------- 12 | class ZipfGenerator: 13 | 14 | def __init__(self, n, skewin = 0.8): 15 | #if alpha <= 1.000: 16 | # self.alph = 1.001 17 | #else: 18 | # self.alph = alpha 19 | self.skew = skewin 20 | self.num = n #expected returned numbers 0...31 (e.g for n=32 authors) 21 | 22 | def next(self): 23 | #while 1: 24 | # tobereturned = np.random.zipf(self.alph) 25 | # if tobereturned <= self.num: 26 | # break 27 | #return tobereturned - 1; 28 | randomnum = random.random() 29 | if self.skew == 1.0: 30 | return 0 31 | if randomnum >= (1-self.skew): #80% of 32 | selected = random.randrange(1, int((1-self.skew)*self.num)) 33 | returnnum = selected * (1/(1-self.skew)) 34 | #print("80%=>"+str(int((1-self.skew)*self.num))) 35 | elif randomnum < (1-self.skew): #20% of times 36 | selected = random.randrange(1,int(self.skew*self.num)) 37 | returnnum = selected * (1/self.skew) 38 | #print("20%=>"+str(int(self.skew*self.num))) 39 | return int(returnnum-1) 40 | ## CLASS 41 | 42 | -------------------------------------------------------------------------------- /exps/benchmarks/replay/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __all__ = ["replaycoordinator", "replayworker"] 4 | -------------------------------------------------------------------------------- /exps/benchmarks/replay/dbmigrator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from pprint import pformat 5 | import time 6 | import copy 7 | 8 | # Third-Party Dependencies 9 | basedir = os.getcwd() 10 | sys.path.append(os.path.join(basedir, "../../../libs")) 11 | 12 | # mongodb-d4 13 | sys.path.append(os.path.join(basedir, "../../../src")) 14 | sys.path.append(os.path.join(basedir, "../../tools")) 15 | 16 | from util import Histogram 17 | from util import constants 18 | import copy 19 | 20 | LOG = logging.getLogger(__name__) 21 | 22 | class DBMigrator: 23 | def __init__(self, ori_db, new_db): 24 | self.debug = LOG.isEnabledFor(logging.DEBUG) 25 | 26 | self.ori_db = ori_db 27 | self.new_db = new_db 28 | ## DEF 29 | 30 | ## DEF 31 | def copyData(self, doc, cur_name, parent_keys, docs=[]): 32 | ''' 33 | doc is a dict 34 | ''' 35 | #self.new_db[cur_name].insert(doc) 36 | #docs = 
self.new_db[cur_name].find().sort('_id',-1).limit(1) 37 | #for tmp in docs: 38 | # doc = tmp 39 | 40 | for key in doc.keys(): 41 | # Insert into new collection and add the parent's id 42 | if isinstance(doc[key], dict) and not parent_keys[key] is None and not parent_keys[key][cur_name] is None: 43 | 44 | ## For 45 | # set the foreign key of the child doc 46 | for f_id in parent_keys[key][cur_name]: 47 | doc[key][f_id] = doc[parent_keys[key][cur_name][f_id]] 48 | ## END FOR 49 | 50 | self.copyData(doc[key], str(key), parent_keys, docs) 51 | del doc[key] 52 | elif isinstance(doc[key], list): 53 | for obj in doc[key]: 54 | if isinstance(obj, dict) and not parent_keys[key] is None and not parent_keys[key][cur_name] is None: 55 | ## FOR 56 | # set the foreign key of the child doc 57 | for f_id in parent_keys[key][cur_name]: 58 | obj[f_id] = doc[parent_keys[key][cur_name][f_id]] 59 | self.copyData(obj, str(key), parent_keys, docs) 60 | ## END FOR 61 | 62 | newlist = [x for x in doc[key] if not isinstance(x, dict)] 63 | doc[key] = newlist 64 | if len(doc[key]) == 0: 65 | del doc[key] 66 | 67 | docs.append(doc) 68 | ## DEF 69 | 70 | ## DEF 71 | def migrate(self, parent_keys): 72 | # Normalization 73 | LOG.info("Migrating data from old db to new db") 74 | # TOFIX: collection_names(False):cannot take two arguments? 75 | for col_name in self.ori_db.collection_names(): 76 | if col_name == 'system.indexes': 77 | continue 78 | col = self.ori_db[col_name] 79 | cnt = 1 80 | docs = [] 81 | for doc in col.find({},{'_id':False}, timeout=False): 82 | #if cnt == 1000: 83 | # break 84 | self.copyData(doc, col_name, parent_keys, docs) 85 | if cnt % 1000 == 0: 86 | self.new_db[col_name].insert(docs) 87 | docs = [] 88 | cnt += 1 89 | if len(docs) != 0: 90 | self.new_db[col_name].insert(docs) 91 | 92 | -------------------------------------------------------------------------------- /exps/benchmarks/replay/unittest/test_combiner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pymongo 4 | 5 | basedir = os.getcwd() 6 | 7 | # Third-party Dependencies 8 | sys.path.append(os.path.join(basedir, "../../../../libs")) 9 | sys.path.append(os.path.join(basedir, "../../../../src")) 10 | sys.path.append(os.path.join(basedir, "../../../tools")) 11 | sys.path.append(os.path.join(basedir, "../../../../src/search")) 12 | 13 | # mongo-d4-benchmark-replay 14 | sys.path.append(os.path.join(basedir, "..")) 15 | 16 | from dbcombiner import DBCombiner 17 | from dbdenormalizer import DBDenormalizer 18 | from design_deserializer import Deserializer 19 | from design import Design 20 | 21 | def test_combine_deletes(combiner, operations): 22 | return combiner.combineDeletes(operations) 23 | 24 | if __name__=="__main__": 25 | design_path = r"/home/ruiz1/mongodb-d4/exps/tpcc_design" 26 | print design_path 27 | deserializer = Deserializer() 28 | deserializer.loadDesignFile(design_path) 29 | design = Design() 30 | design.data = deserializer.json_doc 31 | print design.data 32 | 33 | dm = DBDenormalizer(None, None, None, None, design) 34 | graph = dm.constructGraph() 35 | dm.metadata_db = pymongo.Connection('localhost:27017')['tpcc_meta'] 36 | parent_keys = dm.readSchema('schema') 37 | 38 | combiner = DBCombiner(None, design, graph, parent_keys) 39 | 40 | operations = [] 41 | for i in range(5): 42 | op = dict() 43 | op['query_content'] = [] 44 | op['query_fields'] = None 45 | op['collection'] = 'order_line' 46 | op['query_content'].append({'ol_o_id':i,'ol_id':i+1}) 47 | 
op['predicates'] = {'ol_o_id':'eq','ol_id':'eq'} 48 | operations.append(op) 49 | 50 | for i in range(3): 51 | op = dict() 52 | op['query_content'] = [] 53 | op['query_fields'] = None 54 | op['collection'] = 'oorder' 55 | op['query_content'].append({'o_id':i}) 56 | op['predicates'] = {'o_id':'eq'} 57 | operations.append(op) 58 | 59 | 60 | print "---Test combining deletes---" 61 | print "----------------------------" 62 | ret, error, updates = test_combine_deletes(combiner, operations) 63 | print ret 64 | print "----------------------------" 65 | print updates 66 | print "----------------------------" 67 | -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __all__ = ["tpcccoordinator", "tpccworker"] 4 | 5 | import runtime -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/drivers/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/drivers/abstractdriver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2011 4 | # Andy Pavlo 5 | # http://www.cs.brown.edu/~pavlo/ 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining 8 | # a copy of this software and associated documentation files (the 9 | # "Software"), to deal in the Software without restriction, including 10 | # without limitation the rights to use, copy, modify, merge, publish, 11 | # distribute, sublicense, and/or sell copies of the Software, and to 12 | # permit persons to whom the Software is furnished to do so, subject to 13 | # the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 21 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 22 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 23 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | # OTHER DEALINGS IN THE SOFTWARE. 
25 | # ----------------------------------------------------------------------- 26 | 27 | from datetime import datetime 28 | 29 | import constants 30 | 31 | ## ============================================== 32 | ## AbstractDriver 33 | ## ============================================== 34 | class AbstractDriver(object): 35 | def __init__(self, name, ddl): 36 | self.name = name 37 | self.driver_name = "%sDriver" % self.name.title() 38 | self.ddl = ddl 39 | 40 | def __str__(self): 41 | return self.driver_name 42 | 43 | def loadStart(self): 44 | """Optional callback to indicate to the driver that the data loading phase is about to begin.""" 45 | return None 46 | 47 | def loadFinish(self): 48 | """Optional callback to indicate to the driver that the data loading phase is finished.""" 49 | return None 50 | 51 | def loadFinishItem(self): 52 | """Optional callback to indicate to the driver that the ITEM data has been passed to the driver.""" 53 | return None 54 | 55 | def loadFinishWarehouse(self, w_id): 56 | """Optional callback to indicate to the driver that the data for the given warehouse is finished.""" 57 | return None 58 | 59 | def loadFinishDistrict(self, w_id, d_id): 60 | """Optional callback to indicate to the driver that the data for the given district is finished.""" 61 | return None 62 | 63 | def loadTuples(self, tableName, tuples): 64 | """Load a list of tuples into the target table""" 65 | raise NotImplementedError("%s does not implement loadTuples" % (self.driver_name)) 66 | 67 | def executeStart(self): 68 | """Optional callback before the execution phase starts""" 69 | return None 70 | 71 | def executeFinish(self): 72 | """Callback after the execution phase finishes""" 73 | return None 74 | ## CLASS -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __all__ = ["executor", "loader"] 4 | -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/runtime/nurand.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2011 4 | # Andy Pavlo 5 | # http://www.cs.brown.edu/~pavlo/ 6 | # 7 | # Original Java Version: 8 | # Copyright (C) 2008 9 | # Evan Jones 10 | # Massachusetts Institute of Technology 11 | # 12 | # Permission is hereby granted, free of charge, to any person obtaining 13 | # a copy of this software and associated documentation files (the 14 | # "Software"), to deal in the Software without restriction, including 15 | # without limitation the rights to use, copy, modify, merge, publish, 16 | # distribute, sublicense, and/or sell copies of the Software, and to 17 | # permit persons to whom the Software is furnished to do so, subject to 18 | # the following conditions: 19 | # 20 | # The above copyright notice and this permission notice shall be 21 | # included in all copies or substantial portions of the Software. 
22 | # 23 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 26 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 27 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 28 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 29 | # OTHER DEALINGS IN THE SOFTWARE. 30 | # ----------------------------------------------------------------------- 31 | 32 | import rand 33 | 34 | def makeForLoad(): 35 | """Create random NURand constants, appropriate for loading the database.""" 36 | cLast = rand.number(0, 255) 37 | cId = rand.number(0, 1023) 38 | orderLineItemId = rand.number(0, 8191) 39 | return NURandC(cLast, cId, orderLineItemId) 40 | 41 | def validCRun(cRun, cLoad): 42 | """Returns true if the cRun value is valid for running. See TPC-C 2.1.6.1 (page 20)""" 43 | cDelta = abs(cRun - cLoad) 44 | return 65 <= cDelta and cDelta <= 119 and cDelta != 96 and cDelta != 112 45 | 46 | def makeForRun(loadC): 47 | """Create random NURand constants for running TPC-C. TPC-C 2.1.6.1. (page 20) specifies the valid range for these constants.""" 48 | cRun = rand.number(0, 255) 49 | while validCRun(cRun, loadC.cLast) == False: 50 | cRun = rand.number(0, 255) 51 | assert validCRun(cRun, loadC.cLast) 52 | 53 | cId = rand.number(0, 1023) 54 | orderLineItemId = rand.number(0, 8191) 55 | return NURandC(cRun, cId, orderLineItemId) 56 | 57 | class NURandC: 58 | def __init__(self, cLast, cId, orderLineItemId): 59 | self.cLast = cLast 60 | self.cId = cId 61 | self.orderLineItemId = orderLineItemId 62 | -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/runtime/scaleparameters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # ----------------------------------------------------------------------- 4 | # Copyright (C) 2011 5 | # Andy Pavlo 6 | # http://www.cs.brown.edu/~pavlo/ 7 | # 8 | # Original Java Version: 9 | # Copyright (C) 2008 10 | # Evan Jones 11 | # Massachusetts Institute of Technology 12 | # 13 | # Permission is hereby granted, free of charge, to any person obtaining 14 | # a copy of this software and associated documentation files (the 15 | # "Software"), to deal in the Software without restriction, including 16 | # without limitation the rights to use, copy, modify, merge, publish, 17 | # distribute, sublicense, and/or sell copies of the Software, and to 18 | # permit persons to whom the Software is furnished to do so, subject to 19 | # the following conditions: 20 | # 21 | # The above copyright notice and this permission notice shall be 22 | # included in all copies or substantial portions of the Software. 23 | # 24 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 27 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 28 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 29 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 30 | # OTHER DEALINGS IN THE SOFTWARE. 
31 | # ----------------------------------------------------------------------- 32 | 33 | import constants 34 | 35 | def makeDefault(warehouses): 36 | return ScaleParameters(constants.NUM_ITEMS, \ 37 | warehouses, \ 38 | constants.DISTRICTS_PER_WAREHOUSE, \ 39 | constants.CUSTOMERS_PER_DISTRICT, \ 40 | constants.INITIAL_NEW_ORDERS_PER_DISTRICT) 41 | ## DEF 42 | 43 | def makeWithScaleFactor(warehouses, scaleFactor): 44 | items = int(constants.NUM_ITEMS*scaleFactor) 45 | if items <= 0: items = 1 46 | districts = int(max(constants.DISTRICTS_PER_WAREHOUSE, 1)) 47 | customers = int(max(constants.CUSTOMERS_PER_DISTRICT*scaleFactor, 1)) 48 | newOrders = int(max(constants.INITIAL_NEW_ORDERS_PER_DISTRICT*scaleFactor, 0)) 49 | 50 | return ScaleParameters(items, warehouses, districts, customers, newOrders) 51 | ## DEF 52 | 53 | class ScaleParameters: 54 | 55 | def __init__(self, items, warehouses, districtsPerWarehouse, customersPerDistrict, newOrdersPerDistrict): 56 | assert 1 <= items and items <= constants.NUM_ITEMS 57 | self.items = items 58 | assert warehouses > 0 59 | self.warehouses = warehouses 60 | self.starting_warehouse = 1 61 | assert 1 <= districtsPerWarehouse and districtsPerWarehouse <= constants.DISTRICTS_PER_WAREHOUSE 62 | self.districtsPerWarehouse = districtsPerWarehouse 63 | assert 1 <= customersPerDistrict and customersPerDistrict <= constants.CUSTOMERS_PER_DISTRICT 64 | self.customersPerDistrict = customersPerDistrict 65 | assert 0 <= newOrdersPerDistrict and newOrdersPerDistrict <= constants.CUSTOMERS_PER_DISTRICT 66 | assert newOrdersPerDistrict <= constants.INITIAL_NEW_ORDERS_PER_DISTRICT 67 | self.newOrdersPerDistrict = newOrdersPerDistrict 68 | self.ending_warehouse = (self.warehouses + self.starting_warehouse - 1) 69 | ## DEF 70 | 71 | def __str__(self): 72 | out = "%d items\n" % self.items 73 | out += "%d warehouses\n" % self.warehouses 74 | out += "%d districts/warehouse\n" % self.districtsPerWarehouse 75 | out += "%d customers/district\n" % self.customersPerDistrict 76 | out += "%d initial new orders/district" % self.newOrdersPerDistrict 77 | return out 78 | ## DEF 79 | 80 | ## CLASS -------------------------------------------------------------------------------- /exps/benchmarks/tpcc/tpcccoordinator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # ----------------------------------------------------------------------- 4 | # Copyright (C) 2011 5 | # Andy Pavlo & Yang Lu 6 | # http://www.cs.brown.edu/~pavlo/ 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining 9 | # a copy of this software and associated documentation files (the 10 | # "Software"), to deal in the Software without restriction, including 11 | # without limitation the rights to use, copy, modify, merge, publish, 12 | # distribute, sublicense, and/or sell copies of the Software, and to 13 | # permit persons to whom the Software is furnished to do so, subject to 14 | # the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be 17 | # included in all copies or substantial portions of the Software. 
18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 22 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 23 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 24 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | # OTHER DEALINGS IN THE SOFTWARE. 26 | # ----------------------------------------------------------------------- 27 | 28 | import sys 29 | import os 30 | import string 31 | import re 32 | import glob 33 | import time 34 | import execnet 35 | import logging 36 | from pprint import pprint, pformat 37 | 38 | from api.abstractcoordinator import AbstractCoordinator 39 | from api.message import * 40 | 41 | import drivers 42 | from runtime import scaleparameters 43 | 44 | LOG = logging.getLogger(__name__) 45 | 46 | class TpccCoordinator(AbstractCoordinator) : 47 | DEFAULT_CONFIG = [ 48 | ("warehouses", "The number of warehouses to use in the benchmark run", 4), 49 | ("denormalize", "If set to true, then the CUSTOMER data will be denormalized into a single document", True), 50 | ] 51 | 52 | def benchmarkConfigImpl(self): 53 | return self.DEFAULT_CONFIG 54 | ## DEF 55 | 56 | def initImpl(self, config, channels): 57 | ## Create our ScaleParameter stuff that we're going to need 58 | num_warehouses = int(config[self.name]['warehouses']) 59 | self.scaleParameters = scaleparameters.makeWithScaleFactor(num_warehouses, config['default']["scalefactor"]) 60 | return dict([(channels[i], None) for i in xrange(len(channels))]) 61 | ## DEF 62 | 63 | def loadImpl(self, config, channels) : 64 | '''divide loading to several clients''' 65 | procs = len(channels) 66 | w_ids = map(lambda x:[], range(procs)) 67 | for w_id in range(self.scaleParameters.starting_warehouse, self.scaleParameters.ending_warehouse+1): 68 | idx = w_id % procs 69 | w_ids[idx].append(w_id) 70 | messages = dict([(channels[i], w_ids[i]) for i in xrange(procs)]) 71 | LOG.debug("TPC-C Load Messages:\n%s", pformat(messages)) 72 | return messages 73 | ## DEF 74 | 75 | def executeImpl(self, config, channels): 76 | return None 77 | 78 | ## CLASS 79 | -------------------------------------------------------------------------------- /exps/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __all__ = ["design_deserializer"] -------------------------------------------------------------------------------- /exps/tools/design_deserializer.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import json 4 | import os 5 | 6 | basedir = os.path.realpath(os.path.dirname(__file__)) 7 | sys.path.append(os.path.join(basedir, "../../src/search")) 8 | 9 | from design import Design 10 | 11 | class Deserializer: 12 | def __init__(self, json_string=None): 13 | if not json_string: 14 | self.json_doc = None 15 | else: 16 | self.json_doc = json.loads(json_string) 17 | ## DEF 18 | 19 | def loadDesignFile(self, file_path): 20 | f = open(file_path, 'r') 21 | content = f.read() 22 | f.close() 23 | 24 | self.json_doc = json.loads(content) 25 | ## DEF 26 | 27 | def Deserialize(self): 28 | d = Design() 29 | self.__deserialize__(self.json_doc, d) 30 | return d 31 | ## DEF 32 | 33 | def __deserialize__(self, doc, design): 34 | """ 35 | Just populate the given data into a design instance 36 | """ 37 
| for key, value in doc.iteritems(): 38 | design.addCollection(key) 39 | for index in value['indexes']: 40 | design.addIndex(key, index) 41 | design.addShardKey(key, value['shardKeys']) 42 | design.setDenormalizationParent(key, value['denorm']) 43 | ## FOR 44 | 45 | ## CLASS 46 | -------------------------------------------------------------------------------- /exps/tools/dump-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # ----------------------------------------------------------------------- 4 | # Copyright (C) 2012 by Brown University 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or sell copies of the Software, and to 11 | # permit persons to whom the Software is furnished to do so, subject to 12 | # the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be 15 | # included in all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 20 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 21 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 22 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | # OTHER DEALINGS IN THE SOFTWARE. 24 | # ----------------------------------------------------------------------- 25 | from __future__ import division 26 | from __future__ import with_statement 27 | 28 | import os, sys 29 | import re 30 | import subprocess 31 | 32 | ## ============================================== 33 | ## main 34 | ## ============================================== 35 | if __name__ == '__main__': 36 | if len(sys.argv) != 2: 37 | raise Exception("ERROR: Missing database name") 38 | 39 | db_name = sys.argv[1] 40 | cmd = "mongo %s --eval 'db.getCollectionNames()'" % db_name 41 | output = subprocess.check_output(cmd, shell=True) 42 | 43 | collections = set() 44 | for line in output.strip().split("\n"): 45 | if line.find("system.indexes") != -1: 46 | map(collections.add, line.split(",")) 47 | collections.remove("system.indexes") 48 | 49 | os.mkdir(db_name) 50 | for c in collections: 51 | output = os.path.join(db_name, "%s.json" % c) 52 | cmd = "mongoexport --db %s --collection %s --out %s" % (db_name, c, output) 53 | subprocess.check_call(cmd, shell=True) 54 | print output 55 | ## IF -------------------------------------------------------------------------------- /exps/tools/duplicator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # ----------------------------------------------------------------------- 4 | # Copyright (C) 2012 by Brown University 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining 7 | # a copy of this software and associated documentation files (the 8 | # "Software"), to deal in the Software without restriction, including 9 | # without limitation the rights to use, copy, modify, merge, publish, 10 | # distribute, sublicense, and/or 
sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# -----------------------------------------------------------------------
from __future__ import division
from __future__ import with_statement

import os, sys
import logging
import random
import re
import string
import json
import glob
import codecs
from pprint import pformat
from ConfigParser import RawConfigParser

# Third-Party Dependencies
basedir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.join(basedir, "../../src"))
sys.path.append(os.path.join(basedir, "../../libs"))
import mongokit
import argparse

# mongodb-d4
import catalog
import workload
from search import Designer
from util import configutil
from util import constants
from util.histogram import Histogram

logging.basicConfig(
    level = logging.INFO,
    format="%(asctime)s [%(filename)s:%(lineno)03d] %(levelname)-5s: %(message)s",
    datefmt="%m-%d-%Y %H:%M:%S",
    stream = sys.stdout
)

LOG = logging.getLogger(__name__)

## ==============================================
## main
## ==============================================
if __name__ == '__main__':
    aparser = argparse.ArgumentParser(description="CSV File Duplicator")
    aparser.add_argument('input', help='CSV Input Data Dump Directory')
    aparser.add_argument('output', help='CSV Output Data Dump Directory')
    aparser.add_argument('multiplier', type=int, help='Data Duplicator Multiplier')
    aparser.add_argument('--debug', action='store_true', help='Enable debug log messages.')
    args = vars(aparser.parse_args())
    if args['debug']: LOG.setLevel(logging.DEBUG)

    if not os.path.exists(args["output"]):
        os.mkdir(args["output"])
    for dataFile in glob.glob(os.path.join(args["input"], "*.json")):
        newDataFile = os.path.join(args["output"], os.path.basename(dataFile))
        with codecs.open(newDataFile, encoding='utf-8', mode='w+') as out:
            with codecs.open(dataFile, encoding='utf-8') as fd:
                new_ctr = 0
                orig_ctr = 0
                for line in fd:
                    try:
                        row = json.loads(line.encode('utf-8'))
                    except:
                        LOG.error(line)  # log the offending input line
                        raise
                    id = row["_id"]["$oid"]
                    orig_ctr += 1
                    for i in xrange(args['multiplier']):
                        # Just update the _id field
                        new_id = '%04x%s' % (i, id[4:])
                        # print id, "->", new_id
                        out.write(line.replace(id, new_id))
                        new_ctr += 1
                    ## FOR
                ## FOR
            ## WITH
            LOG.info("DUPLICATED %s -> ORIG:%d / NEW:%d", newDataFile, orig_ctr, new_ctr)
        ## WITH
    ## FOR


## MAIN
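For reference, a hypothetical invocation of the duplicator above would be `./duplicator.py dump dump-x4 4` (directory names are illustrative). The `_id` rewrite that keeps the copies distinct can be seen in isolation (Python 2, matching the codebase):

```python
oid = "4f8e7d6c5b4a39281706f5e4"   # a 24-hex-digit ObjectId string
copies = ['%04x%s' % (i, oid[4:]) for i in xrange(4)]
# The first four hex digits become the copy counter; the rest is unchanged:
# ['00007d6c5b4a39281706f5e4', '00017d6c5b4a39281706f5e4', ...]
assert all(len(c) == 24 for c in copies)
```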
--------------------------------------------------------------------------------
/exps/tools/load-csv.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -----------------------------------------------------------------------
# Copyright (C) 2012 by Brown University
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# -----------------------------------------------------------------------

import os, sys
import subprocess
import logging
import glob
from ConfigParser import RawConfigParser

# Third-Party Dependencies
basedir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.join(basedir, "../../src"))
sys.path.append(os.path.join(basedir, "../../libs"))
import argparse

from util import constants
from util import configutil

logging.basicConfig(
    level = logging.INFO,
    format="%(asctime)s [%(filename)s:%(lineno)03d] %(levelname)-5s: %(message)s",
    datefmt="%m-%d-%Y %H:%M:%S",
    stream = sys.stdout
)
LOG = logging.getLogger(__name__)

## ==============================================
## main
## ==============================================
if __name__ == '__main__':
    aparser = argparse.ArgumentParser(description="CSV File Loader")
    aparser.add_argument('input', help='CSV Input Data Dump Directory')
    aparser.add_argument('--config', type=file, help='Path to %s configuration file' % constants.PROJECT_NAME)
    aparser.add_argument('--debug', action='store_true', help='Enable debug log messages.')
    args = vars(aparser.parse_args())
    if args['debug']: LOG.setLevel(logging.DEBUG)

    if not args['config']:
        LOG.error("Missing configuration file")
        print
        aparser.print_usage()
        sys.exit(1)
    LOG.debug("Loading configuration file '%s'" % args['config'])
    config = RawConfigParser()
    configutil.setDefaultValues(config)
    config.read(os.path.realpath(args['config'].name))

    db_host = config.get(configutil.SECT_MONGODB, 'host')
    db_name = config.get(configutil.SECT_MONGODB, 'dataset_db')
    for dataFile in glob.glob(os.path.join(args["input"], "*.json")):
        collection = os.path.basename(dataFile).replace(".json", "")  # input files are *.json
        cmd = "mongoimport --host=%s --db %s --collection %s --file %s --type json" % (db_host, db_name,
collection, dataFile) 75 | subprocess.check_call(cmd, shell=True) 76 | LOG.info("Loaded %s.%s", db_name, collection) 77 | ## FOR 78 | ## IF -------------------------------------------------------------------------------- /libs/argparse/__init__.py: -------------------------------------------------------------------------------- 1 | from argparse import * -------------------------------------------------------------------------------- /libs/mongokit/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2009-2011, Nicolas Clairon 4 | # All rights reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of the University of California, Berkeley nor the 14 | # names of its contributors may be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | # DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 21 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | __version__ = "0.8.1" 29 | 30 | from bson.dbref import DBRef 31 | from cursor import Cursor 32 | from operators import * 33 | from schema_document import * 34 | from mongo_exceptions import * 35 | from document import Document, ObjectId 36 | from versioned_document import VersionedDocument 37 | from database import Database 38 | from collection import Collection 39 | from connection import Connection 40 | from master_slave_connection import MasterSlaveConnection 41 | from pymongo import ASCENDING as INDEX_ASCENDING,\ 42 | DESCENDING as INDEX_DESCENDING,\ 43 | ALL as INDEX_ALL,\ 44 | GEO2D as INDEX_GEO2D,\ 45 | OFF as INDEX_OFF 46 | from migration import DocumentMigration 47 | 48 | -------------------------------------------------------------------------------- /libs/mongokit/auth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2009-2011, Nicolas Clairon 4 | # All rights reserved. 
5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of the University of California, Berkeley nor the 14 | # names of its contributors may be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | # DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 21 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | from mongokit import Document 29 | import hashlib, os 30 | 31 | class User(Document): 32 | structure = { 33 | "_id":unicode, 34 | "user":{ 35 | "login":unicode, 36 | "password":unicode, # TODO validator 37 | "email":unicode, 38 | } 39 | } 40 | required_fields = ['user.password', 'user.email'] # what if openid ? 
password is None 41 | 42 | def set_login(self, login): 43 | self['_id'] = login 44 | self['user']['login'] = login 45 | 46 | def get_login(self): 47 | return self['_id'] 48 | 49 | def del_login(self): 50 | self['_id'] = None 51 | self['user']['login'] = None 52 | 53 | login = property(get_login, set_login, del_login) 54 | 55 | def set_password(self, password): 56 | """ Hash password on the fly """ 57 | if isinstance(password, unicode): 58 | password = password.encode('utf-8') 59 | password_salt = hashlib.sha1(os.urandom(60)).hexdigest() 60 | crypt = hashlib.sha1(password + password_salt).hexdigest() 61 | self['user']['password'] = unicode(password_salt + crypt, 'utf-8') 62 | 63 | def get_password(self): 64 | """ Return the password hashed """ 65 | return self['user']['password'] 66 | 67 | def del_password(self): 68 | self['user']['password'] = None 69 | 70 | password = property(get_password, set_password, del_password) 71 | 72 | def verify_password(self, password): 73 | """ Check the password against existing credentials """ 74 | if isinstance(password, unicode): 75 | password = password.encode('utf-8') 76 | password_salt = self['user']['password'][:40] 77 | crypt_pass = hashlib.sha1(password + password_salt).hexdigest() 78 | if crypt_pass == self['user']['password'][40:]: 79 | return True 80 | else: 81 | return False 82 | 83 | def get_email(self): 84 | return self['user']['email'] 85 | 86 | def set_email(self, email): 87 | # TODO check if it's a well formated email 88 | self['user']['email'] = email 89 | 90 | def del_email(self): 91 | self['user']['email'] = None 92 | 93 | email = property(get_email, set_email, del_email) 94 | 95 | def save(self, *args, **kwargs): 96 | assert self['_id'] == self['user']['login'] 97 | super(User, self).save(*args, **kwargs) 98 | -------------------------------------------------------------------------------- /libs/mongokit/cursor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2009-2011, Nicolas Clairon 4 | # All rights reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of the University of California, Berkeley nor the 14 | # names of its contributors may be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | # DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 21 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | from pymongo.cursor import Cursor as PymongoCursor 29 | from collections import deque 30 | 31 | class Cursor(PymongoCursor): 32 | def __init__(self, *args, **kwargs): 33 | self.__wrap = None 34 | if kwargs: 35 | self.__wrap = kwargs.pop('wrap', None) 36 | super(Cursor, self).__init__(*args, **kwargs) 37 | 38 | def next(self): 39 | if self._Cursor__empty: 40 | raise StopIteration 41 | db = self._Cursor__collection.database 42 | if len(self.__data) or self._refresh(): 43 | if isinstance(self._Cursor__data, deque): 44 | item = self._Cursor__data.popleft() 45 | else: 46 | item = self._Cursor__data.pop(0) 47 | if self._Cursor__manipulate: 48 | son = db._fix_outgoing(item, self._Cursor__collection) 49 | else: 50 | son = item 51 | if self.__wrap is not None: 52 | return self.__wrap(son, collection=self._Cursor__collection) 53 | else: 54 | return son 55 | else: 56 | raise StopIteration 57 | 58 | def __getitem__(self, index): 59 | obj = super(Cursor, self).__getitem__(index) 60 | if (self.__wrap is not None) and isinstance(obj, dict): 61 | return self.__wrap(obj) 62 | return obj 63 | -------------------------------------------------------------------------------- /libs/mongokit/database.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2009-2011, Nicolas Clairon 4 | # All rights reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of the University of California, Berkeley nor the 14 | # names of its contributors may be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | # DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 21 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
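# Illustrative usage of the Database subclass below (the document and
# collection names here are hypothetical):
#
#   db.BlogPost   # callable document wrapper, if BlogPost was conn.register()'d
#   db.users      # plain mongokit Collection, created once and then cached
#
# dereference(dbref, model) additionally resolves a DBRef directly into a
# registered Document subclass instead of a plain dict.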
27 | 28 | from pymongo.database import Database as PymongoDatabase 29 | from bson.dbref import DBRef 30 | from mongokit.document import Document 31 | from collection import Collection 32 | 33 | class Database(PymongoDatabase): 34 | 35 | def __init__(self, *args, **kwargs): 36 | self._collections = {} 37 | super(Database, self).__init__(*args, **kwargs) 38 | 39 | def __getattr__(self, key): 40 | if key in self.connection._registered_documents: 41 | document = self.connection._registered_documents[key] 42 | return getattr(self[document.__collection__], key) 43 | else: 44 | if not key in self._collections: 45 | self._collections[key] = Collection(self, key) 46 | return self._collections[key] 47 | 48 | def dereference(self, dbref, model = None): 49 | if model is None: 50 | return super(Database, self).dereference(dbref) 51 | if not isinstance(dbref, DBRef): 52 | raise TypeError("first argument must be a DBRef") 53 | if dbref.database is not None and dbref.database != self.name: 54 | raise ValueError("trying to dereference a DBRef that points to " 55 | "another database (%r not %r)" % (dbref.database, self._Database__name)) 56 | if not issubclass(model, Document): 57 | raise TypeError("second argument must be a Document") 58 | return getattr(self[dbref.collection], model.__name__).one({'_id': dbref.id}) 59 | -------------------------------------------------------------------------------- /libs/mongokit/master_slave_connection.py: -------------------------------------------------------------------------------- 1 | """ 2 | Master-Slave integration for MongoKit 3 | Andreas Jung, info@zopyx.com 4 | (same license as Mongokit) 5 | """ 6 | 7 | from pymongo.master_slave_connection import MasterSlaveConnection as PymongoMasterSlaveConnection 8 | from pymongo import Connection as PyMongoConnection 9 | 10 | from mongokit.database import Database 11 | from mongokit.connection import CallableMixin, _iterables 12 | 13 | class MasterSlaveConnection(PymongoMasterSlaveConnection): 14 | """ Master-Slave support for MongoKit """ 15 | 16 | def __init__(self, master, slaves=[]): 17 | """ The MasterSlaveConnection is a wrapper around the 18 | pymongo.master_slave_connection implementation. The constructor accepts 19 | the connection parameter for the master MongoDB server and a non-empty 20 | list of connection parameters for one or more slaves. The connection 21 | parameters are expressed as a dictionary where the keys match the 22 | signature of the constructor of a standard 23 | pymongo.connection.Connection instance ('host', 'port' etc.). For the 24 | 'slaves' it is not necessary to specify the 'slave_okay' parameter 25 | (will be added internally automatically). 26 | 27 | The purpose of the MasterSlaveConnection is to hide a master-slave 28 | setup with one master and several slave servers. The slave 29 | server(s) will be used for reads, and writes will be made to the 30 | master (and re-synced to the slaves automatically as part of the 31 | master-slave setup).
32 | """ 33 | 34 | self._databases = {} 35 | self._registered_documents = {} 36 | 37 | # I am the master 38 | if not isinstance(master, dict): 39 | raise TypeError('"master" must be a dict containing pymongo.Connection parameters') 40 | master_connection = PyMongoConnection(**master) 41 | 42 | # You are my dirty slaves 43 | if not slaves: 44 | raise ValueError('You must specify at least one slave connection') 45 | 46 | slave_connections = list() 47 | for slave in slaves: 48 | if not isinstance(slave, dict): 49 | raise TypeError('"slaves" must be list of dicts containing pymongo.Connection parameters') 50 | slave['slave_okay'] = True 51 | slave_connections.append(PyMongoConnection(**slave)) 52 | 53 | super(MasterSlaveConnection, self).__init__(master_connection, slave_connections) 54 | 55 | def register(self, obj_list): 56 | decorator = None 57 | if not isinstance(obj_list, _iterables): 58 | # we assume that the user used this as a decorator 59 | # using @register syntax or using conn.register(SomeDoc) 60 | # we stock the class object in order to return it later 61 | decorator = obj_list 62 | obj_list = [obj_list] 63 | # cleanup 64 | for dbname, db in self._databases.items(): 65 | for colname, col in db._collections.items(): 66 | for docname, doc in col._documents.items(): 67 | del col._documents[docname] 68 | for obj_name in [obj.__name__ for obj in obj_list]: 69 | if obj_name in col._registered_documents: 70 | del col._registered_documents[obj_name] 71 | # register 72 | for obj in obj_list: 73 | CallableDocument = type( 74 | "Callable%s" % obj.__name__, 75 | (obj, CallableMixin), 76 | {"_obj_class":obj, "__repr__":object.__repr__} 77 | ) 78 | self._registered_documents[obj.__name__] = CallableDocument 79 | # if the class object is stored, it means the user used a decorator and 80 | # we must return the class object 81 | if decorator is not None: 82 | return decorator 83 | 84 | def __getattr__(self, key): 85 | if key not in self._databases: 86 | self._databases[key] = Database(self, key) 87 | return self._databases[key] 88 | 89 | -------------------------------------------------------------------------------- /libs/mongokit/mongo_exceptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2009-2011, Nicolas Clairon 4 | # All rights reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of the University of California, Berkeley nor the 14 | # names of its contributors may be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | # DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 21 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | from bson import InvalidDocument 29 | from pymongo.errors import OperationFailure 30 | class ConnectionError(Exception):pass 31 | class MongoAuthException(Exception):pass 32 | class MultipleResultsFound(Exception):pass 33 | class BadIndexError(Exception):pass 34 | class AutoReferenceError(Exception):pass 35 | class MaxDocumentSizeError(Exception):pass 36 | class OptionConflictError(Exception):pass 37 | class UpdateQueryError(Exception):pass 38 | -------------------------------------------------------------------------------- /libs/mongokit/operators.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright (c) 2009-2010, Nicolas Clairon 4 | # All rights reserved. 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # * Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # * Neither the name of the University of California, Berkeley nor the 14 | # names of its contributors may be used to endorse or promote products 15 | # derived from this software without specific prior written permission. 16 | # 17 | # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 18 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | # DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 21 | # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
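# The schema operators below let a mongokit schema accept (or reject)
# several types or values for a single field. Illustrative behaviour,
# based on the validate() implementations that follow:
#
#   OR(int, float).validate(1)              # True: type(1) is an operand
#   NOT(unicode).validate(u'x')             # False: unicode is excluded
#   IS(u'spam', u'eggs').validate(u'spam')  # True: exact value (and type) match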
27 | 28 | class SchemaOperator(object): 29 | repr = None 30 | 31 | def __init__(self, *args): 32 | assert self.repr is not None 33 | self._operands = list(args) 34 | 35 | def __repr__(self): 36 | return str(self) 37 | 38 | def __iter__(self): 39 | for operand in self._operands: 40 | yield operand 41 | 42 | def __eq__(self, other): 43 | return type(self) == type(other) and self._operands == other._operands 44 | 45 | def validate(self, value): 46 | raise NotImplementedError 47 | 48 | class OR(SchemaOperator): 49 | repr = 'or' 50 | 51 | def __init__(self, *args): 52 | super(OR, self).__init__(*args) 53 | 54 | def __str__(self): 55 | repr = ' %s ' % self.repr 56 | return '<'+ repr.join([i.__name__ for i in self._operands]) + '>' 57 | 58 | def validate(self, value): 59 | if type(value) in self._operands: 60 | return True 61 | return False 62 | 63 | class NOT(SchemaOperator): 64 | repr = 'not' 65 | 66 | def __init__(self, *args): 67 | super(NOT, self).__init__(*args) 68 | 69 | def __str__(self): 70 | repr = ', %s ' % self.repr 71 | return '<not ' + repr.join([i.__name__ for i in self._operands]) + '>' 72 | 73 | def validate(self, value): 74 | if type(value) in self._operands: 75 | return False 76 | return True 77 | 78 | class IS(SchemaOperator): 79 | repr = 'is' 80 | 81 | def __init__(self, *args): 82 | super(IS, self).__init__(*args) 83 | 84 | def __str__(self): 85 | representation = ' or %s ' % self.repr 86 | return '<is ' + representation.join([repr(op) for op in self._operands]) + '>' 87 | 88 | def validate(self, value): 89 | if value in self._operands: 90 | for op in self._operands: 91 | if value == op and isinstance(value, type(op)): 92 | return True 93 | return False 94 | 95 | -------------------------------------------------------------------------------- /libs/sqlparse/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com 2 | # 3 | # This module is part of python-sqlparse and is released under 4 | # the BSD License: http://www.opensource.org/licenses/bsd-license.php. 5 | 6 | """Parse SQL statements.""" 7 | 8 | 9 | __version__ = '0.1.11' 10 | 11 | 12 | # Setup namespace 13 | from sqlparse import engine 14 | from sqlparse import filters 15 | from sqlparse import formatter 16 | 17 | # Deprecated in 0.1.5. Will be removed in 0.2.0 18 | from sqlparse.exceptions import SQLParseError 19 | 20 | 21 | def parse(sql, encoding=None): 22 | """Parse sql and return a tuple of statements. 23 | 24 | :param sql: A string containing one or more SQL statements. 25 | :param encoding: The encoding of the statement (optional). 26 | :returns: A tuple of :class:`~sqlparse.sql.Statement` instances. 27 | """ 28 | return tuple(parsestream(sql, encoding)) 29 | 30 | 31 | def parsestream(stream, encoding=None): 32 | """Parses sql statements from a file-like object. 33 | 34 | :param stream: A file-like object. 35 | :param encoding: The encoding of the stream contents (optional). 36 | :returns: A generator of :class:`~sqlparse.sql.Statement` instances. 37 | """ 38 | stack = engine.FilterStack() 39 | stack.full_analyze() 40 | return stack.run(stream, encoding) 41 | 42 | 43 | def format(sql, **options): 44 | """Format *sql* according to *options*. 45 | 46 | Available options are documented in :ref:`formatting`. 47 | 48 | In addition to the formatting options this function accepts the 49 | keyword "encoding" which determines the encoding of the statement. 50 | 51 | :returns: The formatted SQL statement as string.
52 | """ 53 | encoding = options.pop('encoding', None) 54 | stack = engine.FilterStack() 55 | options = formatter.validate_options(options) 56 | stack = formatter.build_filter_stack(stack, options) 57 | stack.postprocess.append(filters.SerializerUnicode()) 58 | return ''.join(stack.run(sql, encoding)) 59 | 60 | 61 | def split(sql, encoding=None): 62 | """Split *sql* into single statements. 63 | 64 | :param sql: A string containting one or more SQL statements. 65 | :param encoding: The encoding of the statement (optional). 66 | :returns: A list of strings. 67 | """ 68 | stack = engine.FilterStack() 69 | stack.split_statements = True 70 | return [unicode(stmt).strip() for stmt in stack.run(sql, encoding)] 71 | 72 | 73 | from sqlparse.engine.filter import StatementFilter 74 | 75 | 76 | def split2(stream): 77 | splitter = StatementFilter() 78 | return list(splitter.process(None, stream)) 79 | -------------------------------------------------------------------------------- /libs/sqlparse/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com 2 | # 3 | # This module is part of python-sqlparse and is released under 4 | # the BSD License: http://www.opensource.org/licenses/bsd-license.php. 5 | 6 | """filter""" 7 | 8 | from sqlparse import lexer 9 | from sqlparse.engine import grouping 10 | from sqlparse.engine.filter import StatementFilter 11 | 12 | # XXX remove this when cleanup is complete 13 | Filter = object 14 | 15 | 16 | class FilterStack(object): 17 | 18 | def __init__(self): 19 | self.preprocess = [] 20 | self.stmtprocess = [] 21 | self.postprocess = [] 22 | self.split_statements = False 23 | self._grouping = False 24 | 25 | def _flatten(self, stream): 26 | for token in stream: 27 | if token.is_group(): 28 | for t in self._flatten(token.tokens): 29 | yield t 30 | else: 31 | yield token 32 | 33 | def enable_grouping(self): 34 | self._grouping = True 35 | 36 | def full_analyze(self): 37 | self.enable_grouping() 38 | 39 | def run(self, sql, encoding=None): 40 | stream = lexer.tokenize(sql, encoding) 41 | # Process token stream 42 | if self.preprocess: 43 | for filter_ in self.preprocess: 44 | stream = filter_.process(self, stream) 45 | 46 | if (self.stmtprocess or self.postprocess or self.split_statements 47 | or self._grouping): 48 | splitter = StatementFilter() 49 | stream = splitter.process(self, stream) 50 | 51 | if self._grouping: 52 | 53 | def _group(stream): 54 | for stmt in stream: 55 | grouping.group(stmt) 56 | yield stmt 57 | stream = _group(stream) 58 | 59 | if self.stmtprocess: 60 | 61 | def _run1(stream): 62 | ret = [] 63 | for stmt in stream: 64 | for filter_ in self.stmtprocess: 65 | filter_.process(self, stmt) 66 | ret.append(stmt) 67 | return ret 68 | stream = _run1(stream) 69 | 70 | if self.postprocess: 71 | 72 | def _run2(stream): 73 | for stmt in stream: 74 | stmt.tokens = list(self._flatten(stmt.tokens)) 75 | for filter_ in self.postprocess: 76 | stmt = filter_.process(self, stmt) 77 | yield stmt 78 | stream = _run2(stream) 79 | 80 | return stream 81 | -------------------------------------------------------------------------------- /libs/sqlparse/engine/filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from sqlparse.sql import Statement, Token 4 | from sqlparse import tokens as T 5 | 6 | 7 | class StatementFilter: 8 | "Filter that split stream at individual statements" 9 | 10 | def 
__init__(self): 11 | self._in_declare = False 12 | self._in_dbldollar = False 13 | self._is_create = False 14 | self._begin_depth = 0 15 | 16 | def _reset(self): 17 | "Reset the filter attributes to their default values" 18 | self._in_declare = False 19 | self._in_dbldollar = False 20 | self._is_create = False 21 | self._begin_depth = 0 22 | 23 | def _change_splitlevel(self, ttype, value): 24 | "Get the new split level (increase, decrease or remain equal)" 25 | # PostgreSQL 26 | if (ttype == T.Name.Builtin 27 | and value.startswith('$') and value.endswith('$')): 28 | if self._in_dbldollar: 29 | self._in_dbldollar = False 30 | return -1 31 | else: 32 | self._in_dbldollar = True 33 | return 1 34 | elif self._in_dbldollar: 35 | return 0 36 | 37 | # ANSI 38 | if ttype not in T.Keyword: 39 | return 0 40 | 41 | unified = value.upper() 42 | 43 | if unified == 'DECLARE' and self._is_create: 44 | self._in_declare = True 45 | return 1 46 | 47 | if unified == 'BEGIN': 48 | self._begin_depth += 1 49 | if self._in_declare or self._is_create: 50 | # FIXME(andi): This makes no sense. 51 | return 1 52 | return 0 53 | 54 | if unified == 'END': 55 | # Should this respect a preceding BEGIN? 56 | # In CASE ... WHEN ... END this results in a split level -1. 57 | self._begin_depth = max(0, self._begin_depth - 1) 58 | return -1 59 | 60 | if ttype is T.Keyword.DDL and unified.startswith('CREATE'): 61 | self._is_create = True 62 | return 0 63 | 64 | if (unified in ('IF', 'FOR') 65 | and self._is_create and self._begin_depth > 0): 66 | return 1 67 | 68 | # Default 69 | return 0 70 | 71 | def process(self, stack, stream): 72 | "Process the stream" 73 | consume_ws = False 74 | splitlevel = 0 75 | stmt = None 76 | stmt_tokens = [] 77 | 78 | # Run over all stream tokens 79 | for ttype, value in stream: 80 | # Yield the finished statement once a non-whitespace token arrives 81 | if consume_ws and ttype not in (T.Whitespace, T.Comment.Single): 82 | stmt.tokens = stmt_tokens 83 | yield stmt 84 | 85 | # Reset filter and prepare to process next statement 86 | self._reset() 87 | consume_ws = False 88 | splitlevel = 0 89 | stmt = None 90 | 91 | # Create a new statement if we are not currently inside one 92 | if stmt is None: 93 | stmt = Statement() 94 | stmt_tokens = [] 95 | 96 | # Change current split level (increase, decrease or remain equal) 97 | splitlevel += self._change_splitlevel(ttype, value) 98 | 99 | # Append the token to the current statement 100 | stmt_tokens.append(Token(ttype, value)) 101 | 102 | # Check if we reached the end of a statement 103 | if splitlevel <= 0 and ttype is T.Punctuation and value == ';': 104 | consume_ws = True 105 | 106 | # Yield pending statement (if any) 107 | if stmt is not None: 108 | stmt.tokens = stmt_tokens 109 | yield stmt 110 | -------------------------------------------------------------------------------- /libs/sqlparse/exceptions.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2012 Andi Albrecht, albrecht.andi@gmail.com 2 | # 3 | # This module is part of python-sqlparse and is released under 4 | # the BSD License: http://www.opensource.org/licenses/bsd-license.php.
5 | 6 | """Exceptions used in this package.""" 7 | 8 | 9 | class SQLParseError(Exception): 10 | """Base class for exceptions in this module.""" 11 | -------------------------------------------------------------------------------- /libs/sqlparse/functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 17/05/2012 3 | 4 | @author: piranna 5 | 6 | Several utility functions to extract info from the SQL sentences 7 | ''' 8 | 9 | from sqlparse.filters import ColumnsSelect, Limit 10 | from sqlparse.pipeline import Pipeline 11 | from sqlparse.tokens import Keyword, Whitespace 12 | 13 | 14 | def getlimit(stream): 15 | """Function that return the LIMIT of a input SQL """ 16 | pipe = Pipeline() 17 | 18 | pipe.append(Limit()) 19 | 20 | result = pipe(stream) 21 | try: 22 | return int(result) 23 | except ValueError: 24 | return result 25 | 26 | 27 | def getcolumns(stream): 28 | """Function that return the colums of a SELECT query""" 29 | pipe = Pipeline() 30 | 31 | pipe.append(ColumnsSelect()) 32 | 33 | return pipe(stream) 34 | 35 | 36 | class IsType(object): 37 | """Functor that return is the statement is of a specific type""" 38 | def __init__(self, type): 39 | self.type = type 40 | 41 | def __call__(self, stream): 42 | for token_type, value in stream: 43 | if token_type not in Whitespace: 44 | return token_type in Keyword and value == self.type 45 | -------------------------------------------------------------------------------- /libs/sqlparse/pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2011 Jesus Leganes "piranna", piranna@gmail.com 2 | # 3 | # This module is part of python-sqlparse and is released under 4 | # the BSD License: http://www.opensource.org/licenses/bsd-license.php. 5 | 6 | from types import GeneratorType 7 | 8 | 9 | class Pipeline(list): 10 | """Pipeline to process filters sequentially""" 11 | 12 | def __call__(self, stream): 13 | """Run the pipeline 14 | 15 | Return a static (non generator) version of the result 16 | """ 17 | 18 | # Run the stream over all the filters on the pipeline 19 | for filter in self: 20 | # Functions and callable objects (objects with '__call__' method) 21 | if callable(filter): 22 | stream = filter(stream) 23 | 24 | # Normal filters (objects with 'process' method) 25 | else: 26 | stream = filter.process(None, stream) 27 | 28 | # If last filter return a generator, staticalize it inside a list 29 | if isinstance(stream, GeneratorType): 30 | return list(stream) 31 | return stream 32 | -------------------------------------------------------------------------------- /libs/sqlparse/tokens.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com 2 | # 3 | # This module is part of python-sqlparse and is released under 4 | # the BSD License: http://www.opensource.org/licenses/bsd-license.php. 5 | 6 | # The Token implementation is based on pygment's token system written 7 | # by Georg Brandl. 
8 | # http://pygments.org/ 9 | 10 | """Tokens""" 11 | 12 | 13 | class _TokenType(tuple): 14 | parent = None 15 | 16 | def split(self): 17 | buf = [] 18 | node = self 19 | while node is not None: 20 | buf.append(node) 21 | node = node.parent 22 | buf.reverse() 23 | return buf 24 | 25 | def __contains__(self, val): 26 | return val is not None and (self is val or val[:len(self)] == self) 27 | 28 | def __getattr__(self, val): 29 | if not val or not val[0].isupper(): 30 | return tuple.__getattribute__(self, val) 31 | new = _TokenType(self + (val,)) 32 | setattr(self, val, new) 33 | new.parent = self 34 | return new 35 | 36 | def __hash__(self): 37 | return hash(tuple(self)) 38 | 39 | def __repr__(self): 40 | return 'Token' + (self and '.' or '') + '.'.join(self) 41 | 42 | 43 | Token = _TokenType() 44 | 45 | # Special token types 46 | Text = Token.Text 47 | Whitespace = Text.Whitespace 48 | Newline = Whitespace.Newline 49 | Error = Token.Error 50 | # Text that doesn't belong to this lexer (e.g. HTML in PHP) 51 | Other = Token.Other 52 | 53 | # Common token types for source code 54 | Keyword = Token.Keyword 55 | Name = Token.Name 56 | Literal = Token.Literal 57 | String = Literal.String 58 | Number = Literal.Number 59 | Punctuation = Token.Punctuation 60 | Operator = Token.Operator 61 | Comparison = Operator.Comparison 62 | Wildcard = Token.Wildcard 63 | Comment = Token.Comment 64 | Assignment = Token.Assignement 65 | 66 | # Generic types for non-source code 67 | Generic = Token.Generic 68 | 69 | # String and some others are not direct children of Token. 70 | # alias them: 71 | Token.Token = Token 72 | Token.String = String 73 | Token.Number = Number 74 | 75 | # SQL specific tokens 76 | DML = Keyword.DML 77 | DDL = Keyword.DDL 78 | Command = Keyword.Command 79 | 80 | Group = Token.Group 81 | Group.Parenthesis = Token.Group.Parenthesis 82 | Group.Comment = Token.Group.Comment 83 | Group.Where = Token.Group.Where 84 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | 1. Create a default configuration file that you will use for your application: 4 | 5 | ./d4.py --print-config > application.config 6 | 7 | 2. Edit the settings in this configuration file according to your local environment. 8 | 9 | 10 | ## MongoDB Example 11 | 12 | 1. Execute [mongosniff](http://www.mongodb.org/display/DOCS/mongosniff) on your application server to collect 13 | a workload trace of operations executed on the MongoDB server. You can pipe this into a file for later processing. 14 | 15 | mongosniff --source NET lo | gzip --best > sniff.out.gz 16 | 17 | 2. Load this mongosniff workload trace into **D4**'s internal catalog 18 | 19 | gunzip -c sniff.out.gz | ./d4.py --config=application.config --reset --no-search 20 | 21 | The *--reset* flag will erase all of the metadata that may exist in the catalog database in the target MongoDB. 22 | This does not modify your application's database. 23 | The *--no-search* flag will cause **D4** to halt immediately after processing the workload trace. 24 | 25 | If you are just testing and do not want to process the entire workload trace file, you can use the *--sess-limit* and *--op-limit* options to limit the number of records processed.
For example, the following command will halt loading after processing 1000 new Sessions from the trace: 26 | 27 | gunzip -c sniff.out.gz | ./d4.py --config=application.config --reset --no-search --sess-limit=1000 28 | 29 | 3. Now execute the search algorithm to find the optimal design. Note that we use the *--no-load* option and 30 | exclude the *--reset* option because we will use the workload that was loaded in the previous step: 31 | 32 | ./d4.py --config=application.config --no-load 33 | 34 | TODO: Need to discuss how to use an existing MongoDB design in **D4** to check whether there is a better configuration. 35 | 36 | TODO: Need to discuss how to enable the debug log and where to report issues. 37 | 38 | ## MySQL Example 39 | *To be written* 40 | 41 | -------------------------------------------------------------------------------- /src/catalog/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../libs")) 7 | 8 | from utilmethods import * 9 | del utilmethods 10 | 11 | from collection import Collection 12 | del collection -------------------------------------------------------------------------------- /src/costmodel/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../libs")) 7 | sys.path.append(os.path.join(basedir, "../..")) 8 | 9 | from abstractcostcomponent import AbstractCostComponent 10 | from costmodel import CostModel 11 | from nodeestimator import NodeEstimator -------------------------------------------------------------------------------- /src/costmodel/abstractcostcomponent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2012 by Brown University 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | # OTHER DEALINGS IN THE SOFTWARE.
23 | # ----------------------------------------------------------------------- 24 | 25 | import logging 26 | 27 | LOG = logging.getLogger(__name__) 28 | 29 | ## ============================================== 30 | ## Abstract Cost Model Component 31 | ## ============================================== 32 | class AbstractCostComponent(): 33 | 34 | def __init__(self, state): 35 | self.state = state 36 | self.debug = LOG.isEnabledFor(logging.DEBUG) 37 | self.lastDesign = None 38 | ## DEF 39 | 40 | def getCost(self, design, num_nodes=None): 41 | cost = self.getCostImpl(design, num_nodes) 42 | self.lastDesign = design 43 | return cost 44 | ## DEF 45 | 46 | def getCostImpl(self, design, num_nodes=None): 47 | raise NotImplementedError("Unimplemented %s.getCostImpl()" % self.__init__.im_class) 48 | 49 | def invalidateCache(self, newDesign, col_name): 50 | """Optional callback for when the cost model needs to invalidate a collection's cache""" 51 | pass 52 | 53 | def reset(self): 54 | """Optional callback for when the cost model needs to reset itself""" 55 | pass 56 | 57 | def finish(self): 58 | """Optional callback for when the cost model has finished a round""" 59 | pass 60 | 61 | ## CLASS -------------------------------------------------------------------------------- /src/costmodel/disk/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from diskcostcomponent import DiskCostComponent 3 | -------------------------------------------------------------------------------- /src/costmodel/network/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from networkcostcomponent import NetworkCostComponent -------------------------------------------------------------------------------- /src/costmodel/skew/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from skewcostcomponent import SkewCostComponent 3 | -------------------------------------------------------------------------------- /src/inputs/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'pavlo' 2 | -------------------------------------------------------------------------------- /src/inputs/mongodb/README: -------------------------------------------------------------------------------- 1 | parse.py 2 | ---------------- 3 | parses a mongosniff trace and stores the 'workload' (a list of sessions with their operations) in a mongo db 4 | 5 | recreate.py 6 | ---------------- 7 | recreates the sample database from the 'workload' and stores it in mongo 8 | 9 | schema.py 10 | ---------------- 11 | infers the schema catalog from the 'recreated' database and stores it in mongo 12 | 13 | 14 | 15 | --------------------------------- 16 | 17 | Collecting samples on OSX: 18 | 19 | sudo /Applications/mongodb/bin/mongosniff --source NET lo0 | ../sanitizer/anonymize.py 0 > sample1.txt -------------------------------------------------------------------------------- /src/inputs/mongodb/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'pavlo' 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../../libs")) 7 | sys.path.append(os.path.join(basedir, "..")) 8 | 9 | from abstractconverter import AbstractConverter 10 | from mongosniffconverter import MongoSniffConverter
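A minimal sketch of how the AbstractCostComponent contract from src/costmodel/abstractcostcomponent.py (shown above) can be implemented; the class name and fixed cost below are hypothetical, and only getCostImpl() must be overridden:

    from costmodel import AbstractCostComponent

    class FixedCostComponent(AbstractCostComponent):
        """Hypothetical component that charges the same cost for every design."""
        def __init__(self, state, fixed=0.5):
            AbstractCostComponent.__init__(self, state)
            self.fixed = fixed

        def getCostImpl(self, design, num_nodes=None):
            # getCost() in the base class records the design as self.lastDesign
            return self.fixed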
-------------------------------------------------------------------------------- /src/inputs/mongodb/salt_crack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | sys.path.append("../sanitizer") 4 | import anonymize # just for hash_string() 5 | 6 | 7 | expected_plain = "\"drivers\"" 8 | expected_hash = "c9f685688b90e80b8055ef9f1d72b7ce/9" 9 | salt = 0 10 | while True: 11 | hash = anonymize.hash_string(expected_plain, salt) 12 | print "salt ", salt, ": ", hash 13 | if hash == expected_hash: 14 | print "FOUND SALT: ", salt 15 | break 16 | salt += 1 17 | print "Done." -------------------------------------------------------------------------------- /src/inputs/mongodb/samplecreator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import fileinput 4 | import hashlib 5 | import time 6 | import re 7 | import argparse 8 | import yaml 9 | import json 10 | import logging 11 | from pymongo import Connection 12 | import random 13 | import string 14 | 15 | sys.path.append("../workload") 16 | from traces import * 17 | 18 | logging.basicConfig(level = logging.INFO, 19 | format="%(asctime)s [%(funcName)s:%(lineno)03d] %(levelname)-5s: %(message)s", 20 | datefmt="%m-%d-%Y %H:%M:%S", 21 | stream = sys.stdout) 22 | LOG = logging.getLogger(__name__) 23 | 24 | ### DEFAULT VALUES 25 | ### you can specify these with args 26 | TARGET_DB = "sample_db" 27 | DEFAULT_HOST = "localhost" 28 | DEFAULT_PORT = 27017 # must be an int: argparse does not apply type=int to defaults 29 | 30 | #GLOBAL vars 31 | target_db = None 32 | connection = None 33 | 34 | 35 | 36 | 37 | 38 | def initDB(hostname, port, t_db): 39 | global connection 40 | global target_db 41 | 42 | # Initialize connection to db that stores raw transactions 43 | connection = Connection(hostname, port) 44 | target_db = connection[t_db] 45 | 46 | return 47 | 48 | def getRandomString(l): 49 | return "".join(random.sample(string.letters+string.digits, l)) 50 | 51 | 52 | def getRandomUser(): 53 | return {"first": getRandomString(8), "last": getRandomString(8), "address": {"street": getRandomString(8), "list": [getRandomString(2), getRandomString(2), getRandomString(2)]}} 54 | 55 | def getRandomArticle(): 56 | return {"Title": getRandomString(20), "author": getRandomString(8), "text": getRandomString(30)} 57 | 58 | def populate(): 59 | #sanity check 60 | users = [] 61 | users.append({"first": "Emanuel", "last": "Buzek", "address": {"street": "Wix", "list": ["a", "b", "c"]}}) 62 | users.append({"first": "Andy", "last": "Pavlo", "address": {"street": "Brown", "list": ["1", "2", "3"]}}) 63 | users.append({"first": "Delete_me", "last": "XXX", "address": {"street": "homeless", "list": ["1", "2", "3"]}}) 64 | #add a bunch of other users...
65 | for i in range(20): 66 | users.append(getRandomUser()) 67 | target_db.users.insert(users) 68 | 69 | 70 | articles = [] 71 | articles.append({"Title": "Why We Should Ban Religion And Kill The Pope", "author": "Buzek", "text": "Read online on www.fuckreligion.org"}) 72 | articles.append({"Title": "Blah blah blah", "author": "Pavlo", "text": "Database bullshit"}) 73 | for i in range(5): 74 | articles.append(getRandomArticle()) 75 | target_db.articles.insert(articles) 76 | 77 | print("Done.") 78 | 79 | 80 | 81 | 82 | def clear(): 83 | target_db.users.remove() 84 | target_db.articles.remove() 85 | 86 | 87 | def test(): 88 | populate() 89 | 90 | target_db.users.find_one() 91 | 92 | #get the count of all articles 93 | target_db.articles.find().count() 94 | 95 | #delete one article 96 | target_db.users.remove({'first': 'Delete_me'}) 97 | 98 | #update 99 | target_db.users.update({'last': 'Buzek'}, {'first': 'Ema'}, True, True) 100 | 101 | #retrieve all articles 102 | target_db.articles.find() 103 | 104 | def main(): 105 | aparser = argparse.ArgumentParser(description='Sample Creator') 106 | aparser.add_argument('--host', 107 | help='hostname of machine running mongo server', default=DEFAULT_HOST) 108 | aparser.add_argument('--port', type=int, 109 | help='port to connect to', default=DEFAULT_PORT) 110 | aparser.add_argument('--target_db', help='db for the sample data', default=TARGET_DB) 111 | 112 | args = vars(aparser.parse_args()) 113 | 114 | LOG.info("..:: Sample Creator ::..") 115 | 116 | settings = "host: ", args['host'], " port: ", args['port'], " target_db: ", args['target_db'] 117 | LOG.info(settings) 118 | 119 | initDB(args['host'], args['port'], args['target_db']) 120 | 121 | clear() 122 | 123 | test() 124 | 125 | 126 | return 127 | 128 | if __name__ == '__main__': 129 | main() 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /src/inputs/mysql/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../../libs")) 7 | sys.path.append(os.path.join(basedir, "..")) 8 | 9 | from abstractconverter import AbstractConverter 10 | from mysqlconverter import MySQLConverter 11 | from sql2mongo import Sql2Mongo 12 | -------------------------------------------------------------------------------- /src/inputs/mysql/utilmethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import socket 4 | 5 | ''' 6 | Convert the user host field of the MySQL query log trace to extract the IP 7 | address for addition to the session object 8 | ''' 9 | def stripIPtoUnicode(sql_string) : 10 | l = sql_string.rfind('[') + 1; 11 | r = sql_string.rfind(']'); 12 | ip = sql_string[l:r] 13 | if (ip == '') : 14 | return u'127.0.0.1' 15 | else : 16 | return unicode(ip) 17 | ## ENDIF 18 | ## ENDDEF 19 | 20 | ''' 21 | Detect the host IP address 22 | ''' 23 | def detectHostIP() : 24 | return unicode(socket.gethostbyname(socket.gethostname())) 25 | ## ENDDEF 26 | -------------------------------------------------------------------------------- /src/multithreaded/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from multi_search import * 4 | from messageprocessor import * 
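# Coordinator/worker message flow in this package, as implemented by
# messageprocessor.py and multi_search_worker.py below (replies in
# parentheses; the tuple layouts follow the msg.data indexing used there):
#
#   MSG_CMD_INIT (config, args, worker_id)          -> create Worker (MSG_INIT_COMPLETED)
#   MSG_CMD_LOAD_DB                                 -> Worker.load() (MSG_INITIAL_DESIGN)
#   MSG_CMD_EXECUTE (initialCost, initialDesign)    -> Worker.execute() (MSG_START_SEARCHING)
#   MSG_CMD_UPDATE_BEST_COST (bestCost, bestDesign) -> Worker.update() (MSG_FINISHED_UPDATE)
#   MSG_CMD_STOP / MSG_NOOP                         -> halt (TODO) / no reply expected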
-------------------------------------------------------------------------------- /src/multithreaded/messageprocessor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2011 4 | # Yang Lu 5 | # http://www.cs.brown.edu/~yanglu/ 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining 8 | # a copy of this software and associated documentation files (the 9 | # "Software"), to deal in the Software without restriction, including 10 | # without limitation the rights to use, copy, modify, merge, publish, 11 | # distribute, sublicense, and/or sell copies of the Software, and to 12 | # permit persons to whom the Software is furnished to do so, subject to 13 | # the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 22 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 23 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | # OTHER DEALINGS IN THE SOFTWARE. 25 | # ----------------------------------------------------------------------- 26 | import os 27 | import sys 28 | import logging 29 | 30 | basedir = os.path.realpath(os.path.dirname(__file__)) 31 | sys.path.append(os.path.join(basedir, "..")) 32 | 33 | from message import * 34 | from pprint import pprint, pformat 35 | from multi_search_worker import Worker 36 | from ConfigParser import RawConfigParser 37 | 38 | LOG = logging.getLogger(__name__) 39 | 40 | class MessageProcessor: 41 | ''' Message Processor''' 42 | def __init__(self, channel): 43 | self.channel = channel 44 | self.worker = None 45 | self.config = None 46 | self.benchmark = None 47 | 48 | def processMessage(self): 49 | '''Main loop''' 50 | for item in self.channel: 51 | msg = getMessage(item) 52 | LOG.info("Incoming Message: %s" % getMessageName(msg.header)) 53 | 54 | # MSG_CMD_INIT 55 | if msg.header == MSG_CMD_INIT: 56 | self.worker = Worker(msg.data[0], msg.data[1], self.channel, msg.data[2]) 57 | 58 | elif msg.header == MSG_CMD_LOAD_DB: 59 | self.worker.load() 60 | # MSG_CMD_EXECUTE 61 | # Tells the worker thread to begin the search process 62 | # This will only occur once all of the threads complete the 63 | # EXECUTE_INIT phase. 64 | elif msg.header == MSG_CMD_EXECUTE: 65 | self.worker.execute(msg.data[0], msg.data[1]) 66 | 67 | # MSG_CMD_UPDATE_BEST_COST 68 | # update the best cost of the current client 69 | elif msg.header == MSG_CMD_UPDATE_BEST_COST: 70 | self.worker.update(msg.data) 71 | 72 | # MSG_CMD_STOP 73 | # Tells the worker thread to halt the benchmark 74 | elif msg.header == MSG_CMD_STOP: 75 | # TODO 76 | pass 77 | 78 | # MSG_NOOP 79 | # An empty command that does not require the worker thread to return 80 | # a response. I forget why we have this...
81 | elif msg.header == MSG_NOOP: 82 | pass 83 | else: 84 | assert msg.header in MSG_NAME_MAPPING 85 | LOG.warn("Unexpected message type: %s", MSG_NAME_MAPPING[msg.header]) 86 | return 87 | ## DEF 88 | ## CLASS 89 | -------------------------------------------------------------------------------- /src/multithreaded/multi_search.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import logging 5 | import execnet 6 | 7 | # Third-Party Dependencies 8 | # Remote execnet invocations won't have a __file__ 9 | basedir = os.getcwd() 10 | sys.path.append(os.path.join(basedir, "..")) 11 | sys.path.append(os.path.join(basedir, "../search")) 12 | 13 | from search.designer import Designer 14 | from multi_search_coordinator import Coordinator 15 | from util import configutil 16 | 17 | LOG = logging.getLogger(__name__) 18 | 19 | class MultiClientDesigner: 20 | """ 21 | This is the multithreaded version of LNS search 22 | """ 23 | def __init__(self, config, args): 24 | self.config = config 25 | self.args = args # ONLY USED FOR Designer.setOptionsFromArguments (note: this is a weird method) 26 | self.coordinator = Coordinator() 27 | self.channels = None 28 | ## DEF 29 | 30 | def runSearch(self): 31 | self.channels = self.createChannels() 32 | 33 | # Step 1: Initialize all of the Workers on the client nodes 34 | self.coordinator.init(self.config, self.channels, self.args) 35 | 36 | # Step 2: Execute search 37 | self.coordinator.execute() 38 | ## DEF 39 | 40 | def createChannels(self): 41 | '''Create a list of channels used for communication between the coordinator and the workers''' 42 | num_clients = self.config.getint(configutil.SECT_MULTI_SEARCH, 'num_clients') 43 | LOG.info("Starting LNS search on %d clients" % num_clients) 44 | 45 | import d4 46 | remoteCall = d4 47 | channels=[] 48 | 49 | # create channels to client nodes 50 | for i in xrange(num_clients): 51 | gw = execnet.makegateway("popen//id=sub"+str(i)) 52 | ch = gw.remote_exec(remoteCall) 53 | channels.append(ch) 54 | ## FOR (hosts) 55 | 56 | LOG.debug(channels) 57 | return channels 58 | ## DEF 59 | 60 | ## CLASS -------------------------------------------------------------------------------- /src/multithreaded/multi_search_worker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | basedir = os.path.realpath(os.path.dirname(__file__)) 5 | sys.path.append(os.path.join(basedir, "..")) 6 | sys.path.append(os.path.join(basedir, "../search")) 7 | 8 | from search.designer import Designer 9 | from util import configutil 10 | from message import * 11 | 12 | import catalog 13 | import workload 14 | import mongokit 15 | 16 | import logging 17 | LOG = logging.getLogger(__name__) 18 | 19 | class Worker: 20 | def __init__(self, config, args, channel, worker_id): 21 | self.config = config 22 | self.channel = channel 23 | self.args = args 24 | self.designer = None 25 | self.bestLock = None 26 | self.worker_id = worker_id 27 | 28 | sendMessage(MSG_INIT_COMPLETED, self.worker_id, self.channel) 29 | ## DEF 30 | 31 | def load(self): 32 | """ 33 | Load data from mongodb 34 | """ 35 | self.designer = self.establishConnection(self.config, self.args, self.channel) 36 | initialCost, initialDesign = self.designer.load() 37 | sendMessage(MSG_INITIAL_DESIGN, (initialCost, initialDesign, self.worker_id), self.channel) 38 | ## DEF 39 | 40 | def execute(self, initialCost, initialDesign): 41 | """ 42 | Run LNS/BB search and inform the
coordinator once getting a new best design 43 | """ 44 | sendMessage(MSG_START_SEARCHING, self.worker_id, self.channel) 45 | self.designer.search(initialCost, initialDesign, self.worker_id) 46 | ## DEF 47 | 48 | def update(self, data): 49 | bestCost = data[0] 50 | bestDesign = data[1] 51 | 52 | self.designer.search_method.bbsearch_method.updateBest(bestCost, bestDesign) 53 | sendMessage(MSG_FINISHED_UPDATE, self.worker_id, self.channel) 54 | ## DEF 55 | 56 | def establishConnection(self, config, args, channel): 57 | ## ---------------------------------------------- 58 | ## Connect to MongoDB 59 | ## ---------------------------------------------- 60 | hostname = config.get(configutil.SECT_MONGODB, 'host') 61 | port = config.getint(configutil.SECT_MONGODB, 'port') 62 | assert hostname 63 | assert port 64 | try: 65 | conn = mongokit.Connection(host=hostname, port=port) 66 | except: 67 | LOG.error("Failed to connect to MongoDB at %s:%s" % (hostname, port)) 68 | raise 69 | ## Register our objects with MongoKit 70 | conn.register([ catalog.Collection, workload.Session ]) 71 | 72 | ## Make sure that the databases that we need are there 73 | db_names = conn.database_names() 74 | for key in [ 'dataset_db', ]: # FIXME 'workload_db' ]: 75 | if not config.has_option(configutil.SECT_MONGODB, key): 76 | raise Exception("Missing the configuration option '%s.%s'" % (configutil.SECT_MONGODB, key)) 77 | elif not config.get(configutil.SECT_MONGODB, key): 78 | raise Exception("Empty configuration option '%s.%s'" % (configutil.SECT_MONGODB, key)) 79 | ## FOR 80 | 81 | metadata_db = conn[config.get(configutil.SECT_MONGODB, 'metadata_db')] 82 | dataset_db = conn[config.get(configutil.SECT_MONGODB, 'dataset_db')] 83 | 84 | designer = Designer(config, metadata_db, dataset_db, channel) 85 | designer.setOptionsFromArguments(args) 86 | 87 | return designer 88 | ## DEF 89 | 90 | ## CLASS -------------------------------------------------------------------------------- /src/sanitizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../libs")) 7 | -------------------------------------------------------------------------------- /src/sanitizer/anonymized-sample.txt: -------------------------------------------------------------------------------- 1 | sniffing... 
27017 2 | 1335807341.9623771 - 127.0.0.1:53780 -->> 127.0.0.1:27017 fuck.col 83 bytes id:42ead4c2 1122686146 3 | insert: { _id: ObjectId('4f9ecd6dc4fa803676735bb7'), check: 662174a690c0493f30a33bc344d454c9/22 } 4 | 1335807341.9625001 - 127.0.0.1:53780 -->> 127.0.0.1:27017 fuck.$cmd 76 bytes id:42ead4c3 1122686147 5 | query: { getlasterror: 1.0, w: 1.0 } ntoreturn: -1 ntoskip: 0 6 | 1335807341.9625461 - 127.0.0.1:27017 <<-- 127.0.0.1:53780 94 bytes id:28b5a168 682991976 - 1122686147 7 | reply n:1 cursorId: 0 8 | { n: 0, connectionId: 5, wtime: 0, err: null, ok: 1.0 } 9 | 1335807341.9625919 - 127.0.0.1:53780 -->> 127.0.0.1:27017 admin.$cmd 80 bytes id:42ead4c4 1122686148 10 | query: { replSetGetStatus: 1, forShell: 1 } ntoreturn: 1 ntoskip: 0 11 | 1335807341.9626341 - 127.0.0.1:27017 <<-- 127.0.0.1:53780 92 bytes id:28b5a169 682991977 - 1122686148 12 | reply n:1 cursorId: 0 13 | { errmsg: 2e772a67d7c6cb78973d3eb496d282eb/28, ok: 0.0 } 14 | 1335807352.9636409 - 127.0.0.1:53780 -->> 127.0.0.1:27017 fuck.col 102 bytes id:42ead4c5 1122686149 15 | insert: { _id: ObjectId('4f9ecd78c4fa803676735bb8'), string-key: 610f7a015f985f0120ab21a8450fa162/36 } 16 | 1335807352.9637721 - 127.0.0.1:53780 -->> 127.0.0.1:27017 fuck.$cmd 76 bytes id:42ead4c6 1122686150 17 | query: { getlasterror: 1.0, w: 1.0 } ntoreturn: -1 ntoskip: 0 18 | 1335807352.9638169 - 127.0.0.1:27017 <<-- 127.0.0.1:53780 94 bytes id:28b5a16a 682991978 - 1122686150 19 | reply n:1 cursorId: 0 20 | { n: 0, connectionId: 5, wtime: 0, err: null, ok: 1.0 } 21 | 1335807352.9638691 - 127.0.0.1:53780 -->> 127.0.0.1:27017 admin.$cmd 80 bytes id:42ead4c7 1122686151 22 | query: { replSetGetStatus: 1, forShell: 1 } ntoreturn: 1 ntoskip: 0 23 | 1335807352.9639249 - 127.0.0.1:27017 <<-- 127.0.0.1:53780 92 bytes id:28b5a16b 682991979 - 1122686151 24 | reply n:1 cursorId: 0 25 | { errmsg: 2e772a67d7c6cb78973d3eb496d282eb/28, ok: 0.0 } 26 | 1335807361.9646549 - 127.0.0.1:53780 -->> 127.0.0.1:27017 fuck.col 95 bytes id:42ead4c8 1122686152 27 | insert: { _id: ObjectId('4f9ecd80c4fa803676735bb9'), 123: 610f7a015f985f0120ab21a8450fa162/36 } 28 | 1335807361.9647429 - 127.0.0.1:53780 -->> 127.0.0.1:27017 fuck.$cmd 76 bytes id:42ead4c9 1122686153 29 | query: { getlasterror: 1.0, w: 1.0 } ntoreturn: -1 ntoskip: 0 30 | 1335807361.964772 - 127.0.0.1:27017 <<-- 127.0.0.1:53780 94 bytes id:28b5a16c 682991980 - 1122686153 31 | reply n:1 cursorId: 0 32 | { n: 0, connectionId: 5, wtime: 0, err: null, ok: 1.0 } 33 | 1335807361.964807 - 127.0.0.1:53780 -->> 127.0.0.1:27017 admin.$cmd 80 bytes id:42ead4ca 1122686154 34 | query: { replSetGetStatus: 1, forShell: 1 } ntoreturn: 1 ntoskip: 0 35 | 1335807361.964834 - 127.0.0.1:27017 <<-- 127.0.0.1:53780 92 bytes id:28b5a16d 682991981 - 1122686154 36 | reply n:1 cursorId: 0 37 | { errmsg: 2e772a67d7c6cb78973d3eb496d282eb/28, ok: 0.0 } 38 | -------------------------------------------------------------------------------- /src/sanitizer/out.txt: -------------------------------------------------------------------------------- 1 | sniffing... 
27017 2 | 1316312038.809952 - 127.0.0.1:50923 -->> 127.0.0.1:27017 test.$cmd 79 bytes id:5514ea16 1427434006 3 | query: { create: 626df501270f676cbe6ca9967587ccdb/7, capped: undefined, size: undefined, max: undefined } ntoreturn: -1 ntoskip: 0 4 | 1316312038.810056 - 127.0.0.1:27017 <<-- 127.0.0.1:50923 91 bytes id:52acceff 1387056895 - 1427434006 5 | reply n:1 cursorId: 0 6 | { errmsg: "collection already exists", ok: 0.0 } 7 | 1316312038.8101029 - 127.0.0.1:50923 -->> 127.0.0.1:27017 admin.$cmd 80 bytes id:5514ea17 1427434007 8 | query: { replSetGetStatus: 1, forShell: 1 } ntoreturn: 1 ntoskip: 0 9 | 1316312038.8101411 - 127.0.0.1:27017 <<-- 127.0.0.1:50923 92 bytes id:52accf00 1387056896 - 1427434007 10 | reply n:1 cursorId: 0 11 | { errmsg: "not running with --replSet", ok: 0.0 } 12 | -------------------------------------------------------------------------------- /src/sanitizer/sample-anonymize.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/mongodb-d4/e33cd9a5d5d15d842895909cd0a9f804d4b7a975/src/sanitizer/sample-anonymize.txt -------------------------------------------------------------------------------- /src/sanitizer/sample.dat: -------------------------------------------------------------------------------- 1 | sniffing... 27017 2 | 127.0.0.1:50923 -->> 127.0.0.1:27017 test.$cmd 79 bytes id:5514ea16 1427434006 3 | query: { create: "aaaa", capped: undefined, size: undefined, max: undefined } ntoreturn: -1 ntoskip: 0 4 | 127.0.0.1:27017 <<-- 127.0.0.1:50923 91 bytes id:52acceff 1387056895 - 1427434006 5 | reply n:1 cursorId: 0 6 | { errmsg: "collection already exists", ok: 0.0 } 7 | 127.0.0.1:50923 -->> 127.0.0.1:27017 admin.$cmd 80 bytes id:5514ea17 1427434007 8 | query: { replSetGetStatus: 1, forShell: 1 } ntoreturn: 1 ntoskip: 0 9 | 127.0.0.1:27017 <<-- 127.0.0.1:50923 92 bytes id:52accf00 1387056896 - 1427434007 10 | reply n:1 cursorId: 0 11 | { errmsg: "not running with --replSet", ok: 0.0 } 12 | -------------------------------------------------------------------------------- /src/search/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../libs")) 7 | 8 | from designcandidates import DesignCandidates 9 | from design import Design 10 | #from designer import Designer 11 | from utilmethods import * 12 | 13 | # Designer Algorithms 14 | from initialdesigner import InitialDesigner 15 | from randomdesigner import RandomDesigner 16 | from lnsdesigner import LNSDesigner 17 | -------------------------------------------------------------------------------- /src/search/abstractdesigner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2012 by Brown University 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following 
conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 19 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | # OTHER DEALINGS IN THE SOFTWARE. 23 | # ----------------------------------------------------------------------- 24 | 25 | import logging 26 | from threading import Thread 27 | 28 | LOG = logging.getLogger(__name__) 29 | 30 | ## ============================================== 31 | ## Abstract Designer 32 | ## ============================================== 33 | class AbstractDesigner(Thread): 34 | 35 | def __init__(self, collections, workload, config): 36 | Thread.__init__(self) 37 | assert isinstance(collections, dict) 38 | assert not workload is None 39 | #assert not config is None 40 | 41 | self.collections = collections 42 | self.workload = workload 43 | self.config = config 44 | self.debug = LOG.isEnabledFor(logging.DEBUG) 45 | ## DEF 46 | 47 | def generate(self): 48 | raise NotImplementedError("Unimplemented %s.generate()" % self.__init__.im_class) 49 | 50 | def run(self): 51 | pass 52 | ## CLASS -------------------------------------------------------------------------------- /src/search/designcandidates.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pprint import pformat 4 | 5 | ## ============================================== 6 | ## DesignCandidates 7 | ## ============================================== 8 | ''' 9 | An instance of this class is given to the BBSearch. 10 | It basically defines the search space, i.e. BBSearch enumerates 11 | possible solutions using this object. 
12 | 13 | = Basic structure of this class: = 14 | set of COLLECTIONS mapped to: 15 | a) list of possible shard keys 16 | b) list of collections it can be denormalized to 17 | c) list of possible index keys (this will be very likely the same as a)) 18 | ''' 19 | class DesignCandidates(): 20 | 21 | ''' 22 | class constructor 23 | ''' 24 | def __init__(self): 25 | # collection names 26 | self.collections = set() 27 | # col names mapped to possible index keys 28 | self.indexKeys = {} 29 | # col names mapped to possible shard keys 30 | self.shardKeys = {} 31 | # col names mapped to possible col names the collection can be denormalized to 32 | self.denorm = {} 33 | 34 | 35 | def addCollection(self, collection, indexKeys, shardKeys, denorm) : 36 | if collection not in self.collections : 37 | self.collections.add(collection) 38 | self.indexKeys[collection] = indexKeys 39 | self.shardKeys[collection] = shardKeys 40 | self.denorm[collection] = denorm 41 | 42 | def getCandidates(self, collection_names): 43 | candidates = DesignCandidates() 44 | for coll_name in collection_names: 45 | candidates.addCollection(coll_name, self.indexKeys[coll_name], self.shardKeys[coll_name], self.denorm[coll_name]) 46 | 47 | return candidates 48 | 49 | def __str__(self): 50 | return pformat(self.__dict__) 51 | 52 | ## CLASS -------------------------------------------------------------------------------- /src/search/randomdesigner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2012 by Brown University 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 19 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | # OTHER DEALINGS IN THE SOFTWARE. 
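A minimal usage sketch for the DesignCandidates class above (the collection names and keys here are hypothetical, chosen only for illustration; just the methods defined above are used):

candidates = DesignCandidates()
candidates.addCollection("articles", indexKeys=[["author"], ["author", "date"]], shardKeys=[["author"]], denorm=[])
candidates.addCollection("comments", indexKeys=[["article_id"]], shardKeys=[["article_id"]], denorm=["articles"])

# BBSearch can then be handed just the collections that LNS relaxed
subset = candidates.getCandidates(["comments"])
print subset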
23 | # ----------------------------------------------------------------------- 24 | 25 | import logging 26 | import random 27 | 28 | # mongodb-d4 29 | from design import Design 30 | from abstractdesigner import AbstractDesigner 31 | 32 | LOG = logging.getLogger(__name__) 33 | 34 | ## ============================================== 35 | ## InitialDesigner 36 | ## ============================================== 37 | class RandomDesigner(AbstractDesigner): 38 | 39 | def __init__(self, collections, workload, config): 40 | AbstractDesigner.__init__(self, collections, workload, config) 41 | ## DEF 42 | 43 | def generate(self): 44 | LOG.info("Generating random design") 45 | design = Design() 46 | rng = random.Random() 47 | for col_info in self.collections.itervalues(): 48 | design.addCollection(col_info['name']) 49 | 50 | col_fields = [] 51 | for field, data in col_info['fields'].iteritems(): 52 | col_fields.append(field) 53 | 54 | # Figure out which attribute has the highest value for 55 | # the params that we care about when choosing the best design 56 | attrs = [ ] 57 | chosen_field = None 58 | while chosen_field is None or str(chosen_field).startswith("#") or str(chosen_field).startswith("_"): 59 | chosen_field = random.choice(col_fields) 60 | attrs.append(chosen_field) 61 | print "field: ", chosen_field 62 | 63 | design.addShardKey(col_info['name'], attrs) 64 | design.addIndex(col_info['name'], attrs) 65 | 66 | return design 67 | ## DEF 68 | 69 | ## CLASS -------------------------------------------------------------------------------- /src/search/utilmethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import logging 5 | from pprint import pformat 6 | from design import Design 7 | 8 | import os 9 | import sys 10 | 11 | basedir = os.path.realpath(os.path.dirname(__file__)) 12 | sys.path.append(os.path.join(basedir, "../")) 13 | 14 | from util import constants 15 | 16 | LOG = logging.getLogger(__name__) 17 | 18 | def fromJSON(input) : 19 | ''' 20 | Convert the result of designer.py into a tuple of Design instances (initial, final) 21 | ''' 22 | solutions = json.loads(input) 23 | initial = fromLIST(solutions['initial']) 24 | final = fromLIST(solutions['final']) 25 | return (initial, final) 26 | 27 | def fromLIST(list) : 28 | d = Design() 29 | for col in list : 30 | d.addCollection(col['collection']) 31 | d.addShardKey(col['collection'], col['shardKey']) 32 | for i in col['indexes'] : 33 | d.addIndex(col['collection'], i) 34 | d.denorm[col['collection']] = col['denorm'] 35 | return d 36 | 37 | def getIndexSize(col_info, indexKeys): 38 | """Estimate the amount of memory required by the indexes of a given design""" 39 | # TODO: This should be precomputed ahead of time. No need to do this 40 | # over and over again. 
41 | if not indexKeys: 42 | return 0 43 | ## IF 44 | index_size = 0 45 | for f_name in indexKeys: 46 | f = col_info.getField(f_name) 47 | if f: 48 | index_size += f['avg_size'] 49 | index_size += constants.DEFAULT_ADDRESS_SIZE 50 | 51 | #LOG.debug("%s Index %s Memory: %d bytes", col_info['name'], repr(indexKeys), index_size) 52 | return index_size 53 | 54 | def buildLoadingList(design): 55 | """Generate the ordered list of collections based on the order that we need to load them""" 56 | LOG.debug("Computing collection load order") 57 | 58 | # First split the list of collections between those that are normalized 59 | # and those that are not 60 | loadOrder = [ ] 61 | denormalized = { } 62 | for collection in design.getCollections(): 63 | # Examine the design and see whether this collection 64 | # is denormalized into another collection 65 | if not design.isDenormalized(collection): 66 | loadOrder.append(collection) 67 | else: 68 | # Now for the denormalized guys, get their hierarchy 69 | # so that we can figure out who should get loaded first 70 | denormalized[collection] = design.getDenormalizationHierarchy(collection) 71 | LOG.debug("'%s' Denormalization Hierarchy: %s" % (collection, denormalized[collection])) 72 | ## FOR 73 | 74 | while len(denormalized) > 0: 75 | # Loop through each denormalized collection and remove any collection 76 | # from their hierarchy that is already in the load list 77 | for collection in denormalized.keys(): 78 | denormalized[collection] = filter(lambda x: not x in loadOrder, denormalized[collection]) 79 | ## FOR 80 | 81 | # Now any collection that is not waiting for any other collection 82 | # can be loaded! 83 | newLoads = [ ] 84 | for collection in denormalized.keys(): 85 | if len(denormalized[collection]) == 0: 86 | newLoads.append(collection) 87 | ## FOR 88 | assert len(newLoads) > 0, "Loading deadlock due to denormalization!"
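# Worked example with a hypothetical design: if 'comments' is denormalized
# into 'articles' and 'articles' is not denormalized at all, then 'articles'
# goes straight into loadOrder during the initial split, the first WHILE pass
# filters 'articles' out of comments' hierarchy, leaving it empty, and so
# 'comments' loads next, giving loadOrder = ['articles', 'comments'].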
89 | 90 | for collection in newLoads: 91 | loadOrder.append(collection) 92 | del denormalized[collection] 93 | ## FOR 94 | ## WHILE 95 | 96 | return loadOrder 97 | ## DEF -------------------------------------------------------------------------------- /src/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from constants import * 4 | from utilmethods import * 5 | from histogram import Histogram -------------------------------------------------------------------------------- /src/util/constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | 4 | PROJECT_NAME = "mongodb-d4" 5 | PROJECT_URL = "https://github.com/apavlo/mongodb-d4" 6 | 7 | ## ============================================== 8 | ## METADATA DB 9 | ## ============================================== 10 | 11 | # The default name of the metadata database 12 | METADATA_DB_NAME = "metadata" 13 | 14 | # The schema catalog information about the application 15 | COLLECTION_SCHEMA = "schema" 16 | COLLECTION_WORKLOAD = "sessions" 17 | 18 | CATALOG_COLL = "catalog" 19 | CATALOG_FIELDS = 'fields' 20 | 21 | ## ============================================== 22 | ## DATASET DB 23 | ## ============================================== 24 | 25 | # The default name of the reconstructed database 26 | DATASET_DB_NAME = "dataset" 27 | 28 | ## ============================================== 29 | ## WORKLOAD PROCESSING OPTIONS 30 | ## ============================================== 31 | 32 | SKIP_MONGODB_ID_FIELD = False 33 | 34 | # List of collection name prefixes that we should ignore 35 | # when performing various processing tasks 36 | IGNORED_COLLECTIONS = [ 'system', 'local', 'admin', 'config' ] 37 | 38 | # If a query's collection name is mangled when processing traces, 39 | # we'll use this value to indicate that it is invalid 40 | INVALID_COLLECTION_MARKER = "*INVALID*" 41 | 42 | # The default initial session id. New session ids will 43 | # start at this value 44 | INITIAL_SESSION_ID = 100 45 | 46 | # Special marker that represents a 'virtual' field for the 47 | # inner values of a list type 48 | LIST_INNER_FIELD = "__INNER__" 49 | 50 | # Replace any key that starts with a '$' with this string 51 | REPLACE_KEY_DOLLAR_PREFIX = '#' 52 | 53 | # Replace any '.'
in a key with this string 54 | REPLACE_KEY_PERIOD = '__' 55 | 56 | # This identifies that an operation has to perform a full scan 57 | # on an entire collection rather than retrieving a single document 58 | FULL_SCAN_DOCUMENT_ID = sys.maxint 59 | 60 | ## ============================================== 61 | ## MONGO OPERATION TYPES 62 | ## ============================================== 63 | OP_TYPE_QUERY = '$query' 64 | OP_TYPE_INSERT = '$insert' 65 | OP_TYPE_ISERT = '$isert' 66 | OP_TYPE_DELETE = '$delete' 67 | OP_TYPE_UPDATE = '$update' 68 | OP_TYPE_REPLY = '$reply' 69 | OP_TYPE_GETMORE = '$getMore' 70 | OP_TYPE_KILLCURSORS = '$killCursors' 71 | OP_TYPE_UNKNOWN = 'unknown' 72 | OP_TYPE_ALL = [ ] 73 | for k in locals().keys(): 74 | if k.startswith("OP_TYPE_"): OP_TYPE_ALL.append(locals()[k]) 75 | 76 | ## ============================================== 77 | ## PREDICATE TYPES 78 | ## ============================================== 79 | PRED_TYPE_RANGE = 'range' 80 | PRED_TYPE_EQUALITY = 'eq' 81 | PRED_TYPE_REGEX = 'regex' 82 | 83 | ## ============================================== 84 | ## COSTMODEL DEFAULTS 85 | ## ============================================== 86 | DEFAULT_ADDRESS_SIZE = 8 # bytes 87 | DEFAULT_TIME_INTERVALS = 10 88 | 89 | # Whether to preload documents in the LRUBuffers 90 | DEFAULT_LRU_PRELOAD = True 91 | 92 | # The size of pages on disk for each MongoDB database node 93 | DEFAULT_PAGE_SIZE = 4096 # bytes 94 | 95 | # Window size in lru buffer: how many collections are preloaded into the buffer 96 | WINDOW_SIZE = 1024 97 | 98 | # Slot size upper bound: if the slot size is larger than this value, we will consider it as a 99 | # full page scan 100 | SLOT_SIZE_LIMIT = 10 101 | 102 | ## ============================================== 103 | ## CANDIDATES GENERATOR CONSTRAINTS 104 | ## ============================================== 105 | MIN_SELECTIVITY = 0.01 106 | 107 | MAX_INDEX_SIZE = 10 108 | 109 | EXAUSTED_SEARCH_BAR = 4 110 | 111 | NUMBER_OF_BACKUP_KEYS = 2 112 | 113 | ## ============================================== 114 | ## MONGO DATASET RECONSTRUCTION CONSTRAINTS 115 | ## ============================================== 116 | 117 | # The minimum size of nested fields at which we will extract them from their parent collection 118 | MIN_SIZE_OF_NESTED_FIELDS = 3 119 | 120 | # Split documents with more than K fields 121 | MIN_SPLIT_SIZE = 3 122 | 123 | # We want to SKIP this field since it is a functional field, not a data field 124 | FUNCTIONAL_FIELD = 'parent_col' 125 | ## ============================================== 126 | ## REPLAY BENCHMARK 127 | ## ============================================== 128 | 129 | # how many sessions to handle each time 130 | WORKLOAD_WINDOW_SIZE = 1000 131 | -------------------------------------------------------------------------------- /src/util/mathutil.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------- 3 | # Copyright (C) 2012 by Brown University 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining 6 | # a copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | #
the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT 19 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | # OTHER DEALINGS IN THE SOFTWARE. 23 | # ----------------------------------------------------------------------- 24 | 25 | import math 26 | import functools 27 | import logging 28 | from pprint import pformat 29 | 30 | LOG = logging.getLogger(__name__) 31 | 32 | def quartiles(N): 33 | debug = LOG.isEnabledFor(logging.DEBUG) 34 | 35 | # Calculate the median 36 | median = percentile(N, 0.50) 37 | if debug: LOG.debug("Median: %s" % median) 38 | 39 | # Split into two halves 40 | # Do not include the median into the halves, or the minimum and maximum 41 | lower = [] 42 | upper = [] 43 | isUpper = False 44 | for i in xrange(1, len(N)-1): 45 | if not isUpper and N[i] >= median: 46 | isUpper = True 47 | if isUpper: 48 | upper.append(N[i]) 49 | else: 50 | lower.append(N[i]) 51 | ## FOR 52 | 53 | if debug: LOG.debug("Lower Portion: %d [%s-%s]" % (len(lower), lower[0], lower[-1])) 54 | if debug: LOG.debug("Upper Portion: %d [%s-%s]" % (len(upper), upper[0], upper[-1])) 55 | 56 | # Return (lowerQuartile, upperQuartile) 57 | return (percentile(lower, 0.50), percentile(upper, 0.50)) 58 | ## DEF 59 | 60 | ## Original: http://code.activestate.com/recipes/511478-finding-the-percentile-of-the-values/ 61 | def percentile(N, percent, key=lambda x:x): 62 | """ 63 | Find the percentile of a list of values. 64 | 65 | @parameter N - is a list of values. Note N MUST BE already sorted. 66 | @parameter percent - a float value from 0.0 to 1.0. 67 | @parameter key - optional key function to compute value from each element of N. 68 | 69 | @return - the percentile of the values 70 | """ 71 | if not N: 72 | return None 73 | k = (len(N)-1) * percent 74 | f = math.floor(k) 75 | c = math.ceil(k) 76 | if f == c: 77 | return key(N[int(k)]) 78 | d0 = key(N[int(f)]) * (c-k) 79 | d1 = key(N[int(c)]) * (k-f) 80 | return d0+d1 81 | ## DEF 82 | 83 | ## Original: FROM: http://www.physics.rutgers.edu/~masud/computing/WPark_recipes_in_python.html 84 | def stddev(x): 85 | n, mean, std = len(x), 0, 0 86 | for a in x: 87 | mean = mean + a 88 | mean /= float(n) 89 | for a in x: 90 | std = std + (a - mean)**2 91 | std = math.sqrt(std / float(n-1)) 92 | return std -------------------------------------------------------------------------------- /src/util/termcolor.py: -------------------------------------------------------------------------------- 1 | # Copyright: 2008 Nadia Alramli 2 | # http://nadiana.com/python-curses-terminal-controller 3 | # License: BSD 4 | 5 | """Terminal controller module 6 | Example of usage: 7 | print BG_BLUE + 'Text on blue background' + NORMAL 8 | print BLUE + UNDERLINE + 'Blue underlined text' + NORMAL 9 | print BLUE + BG_YELLOW + BOLD + 'text' + NORMAL 10 | """ 11 | 12 | import sys 13 | 14 | # The current module 15 | MODULE = sys.modules[__name__] 16 | 17 | COLORS = "BLUE GREEN CYAN RED MAGENTA YELLOW WHITE BLACK".split() 18 | # List of terminal controls, you can add more to the list. 
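# For example, italics support could be added as 'ITALIC':'sitm' on
# terminals whose terminfo entry defines that capability (an illustrative,
# untested addition).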
19 | CONTROLS = { 20 | 'BOL':'cr', 'UP':'cuu1', 'DOWN':'cud1', 'LEFT':'cub1', 'RIGHT':'cuf1', 21 | 'CLEAR_SCREEN':'clear', 'CLEAR_EOL':'el', 'CLEAR_BOL':'el1', 22 | 'CLEAR_EOS':'ed', 'BOLD':'bold', 'BLINK':'blink', 'DIM':'dim', 23 | 'REVERSE':'rev', 'UNDERLINE':'smul', 'NORMAL':'sgr0', 24 | 'HIDE_CURSOR':'cinvis', 'SHOW_CURSOR':'cnorm' 25 | } 26 | 27 | # List of numeric capabilities 28 | VALUES = { 29 | 'COLUMNS':'cols', # Width of the terminal (None for unknown) 30 | 'LINES':'lines', # Height of the terminal (None for unknown) 31 | 'MAX_COLORS': 'colors', 32 | } 33 | 34 | def default(): 35 | """Set the default attribute values""" 36 | for color in COLORS: 37 | setattr(MODULE, color, '') 38 | setattr(MODULE, 'BG_%s' % color, '') 39 | for control in CONTROLS: 40 | setattr(MODULE, control, '') 41 | for value in VALUES: 42 | setattr(MODULE, value, None) 43 | 44 | def setup(): 45 | """Set the terminal control strings""" 46 | # Initializing the terminal 47 | curses.setupterm() 48 | # Get the color escape sequence template or '' if not supported 49 | # setab and setaf are for ANSI escape sequences 50 | bgColorSeq = curses.tigetstr('setab') or curses.tigetstr('setb') or '' 51 | fgColorSeq = curses.tigetstr('setaf') or curses.tigetstr('setf') or '' 52 | 53 | for color in COLORS: 54 | # Get the color index from curses 55 | colorIndex = getattr(curses, 'COLOR_%s' % color) 56 | # Set the color escape sequence after filling the template with index 57 | setattr(MODULE, color, curses.tparm(fgColorSeq, colorIndex)) 58 | # Set background escape sequence 59 | setattr( 60 | MODULE, 'BG_%s' % color, curses.tparm(bgColorSeq, colorIndex) 61 | ) 62 | for control in CONTROLS: 63 | # Set the control escape sequence 64 | setattr(MODULE, control, curses.tigetstr(CONTROLS[control]) or '') 65 | for value in VALUES: 66 | # Set terminal related values 67 | setattr(MODULE, value, curses.tigetnum(VALUES[value])) 68 | 69 | def bold(text): 70 | return render('%(BOLD)s' + text + '%(NORMAL)s') 71 | 72 | def render(text): 73 | """Helper function to render text easily 74 | Example: 75 | render("%(GREEN)s%(BOLD)stext%(NORMAL)s") -> a bold green text 76 | """ 77 | return text % MODULE.__dict__ 78 | 79 | try: 80 | import curses 81 | setup() 82 | except Exception, e: 83 | # There is a failure; set all attributes to default 84 | print 'Warning: %s' % e 85 | default() -------------------------------------------------------------------------------- /src/util/utilmethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from util import constants 6 | 7 | LOG = logging.getLogger(__name__) 8 | 9 | def escapeFieldNames(content): 10 | """Fix key names so that they can be stored in MongoDB""" 11 | copy = dict(content.items()) 12 | toFix = [ ] 13 | for k, v in copy.iteritems(): 14 | # Keys can't start with '$' and they can't contain '.' 
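# e.g. with the defaults from util/constants.py ('#' and '__'):
#   {"$gt": 5, "user.name": "x"}  becomes  {"#gt": 5, "user__name": "x"}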
15 | if k.startswith('$') or k.find(".") != -1: 16 | toFix.append(k) 17 | if type(v) == dict: 18 | v = escapeFieldNames(v) 19 | elif type(v) == list: 20 | for i in xrange(0, len(v)): 21 | if type(v[i]) == dict: 22 | v[i] = escapeFieldNames(v[i]) 23 | ## FOR 24 | copy[k] = v 25 | ## FOR 26 | 27 | for k in toFix: 28 | v = copy[k] 29 | del copy[k] 30 | 31 | if k.startswith('$'): 32 | k = constants.REPLACE_KEY_DOLLAR_PREFIX + k[1:] 33 | k = k.replace(".", constants.REPLACE_KEY_PERIOD) 34 | copy[k] = v 35 | ## FOR 36 | 37 | return copy 38 | ## DEF 39 | -------------------------------------------------------------------------------- /src/workload/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../libs")) 7 | 8 | # Mongokit Objects 9 | from session import Session 10 | 11 | # workload combiner 12 | from workloadcombiner import WorkloadCombiner 13 | # Regular Classes 14 | from ophasher import OpHasher 15 | 16 | from utilmethods import * 17 | del utilmethods -------------------------------------------------------------------------------- /src/workload/utilmethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | from util import constants 5 | from pprint import pformat 6 | 7 | LOG = logging.getLogger(__name__) 8 | 9 | def isOpRegex(op, field=None): 10 | """Returns true if this operation contains a regex query""" 11 | 12 | # if "predicates" in op: 13 | # return constants.PRED_TYPE_REGEX in op["predicates"].itervalues() 14 | 15 | regex_flag = constants.REPLACE_KEY_DOLLAR_PREFIX + "regex" 16 | for contents in getOpContents(op): 17 | if field is None: 18 | for k, v in contents.iteritems(): 19 | if isinstance(v, dict) and regex_flag in v: 20 | return True 21 | elif field in contents: 22 | if isinstance(contents[field], dict) and regex_flag in contents[field]: 23 | return True 24 | ## FOR 25 | return False 26 | ## FOR 27 | 28 | def getOpContents(op): 29 | """Return a list of all of the query contents for the given operation""" 30 | # QUERY 31 | if op['type'] == constants.OP_TYPE_QUERY: 32 | # TODO: Why are we not examining the resp_content here? 
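# For reference, a query op's content is assumed to look roughly like
#   op['query_content'] = [ {"#query": {"field00": 123}} ]
# (the same '#query' convention the unit tests build), in which case this
# branch returns [ {"field00": 123} ].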
33 | contents = [ ] 34 | for opContent in op['query_content']: 35 | try: 36 | if '#query' in opContent and opContent['#query']: 37 | contents.append(opContent['#query']) 38 | except: 39 | LOG.error("Invalid query content:\n%s", pformat(opContent)) 40 | raise 41 | 42 | # INSERT + UPDATE + DELETE 43 | elif op['type'] in [constants.OP_TYPE_INSERT, \ 44 | constants.OP_TYPE_ISERT, \ 45 | constants.OP_TYPE_UPDATE, \ 46 | constants.OP_TYPE_DELETE]: 47 | contents = op['query_content'] 48 | else: 49 | raise Exception("Unexpected type '%s' for %s" % (op['type'], op)) 50 | 51 | return contents 52 | ## DEF 53 | 54 | 55 | def getReferencedFields(op): 56 | """ 57 | Return a tuple of all the fields referenced in the fields dict 58 | The fields will be sorted lexicographically so that two documents with 59 | the same fields always come back with the same tuple 60 | """ 61 | fields = set() 62 | for contents in getOpContents(op): 63 | for key in contents.iterkeys(): 64 | if not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX): 65 | fields.add(key) 66 | return tuple(sorted(list(fields))) 67 | ## DEF 68 | 69 | ## ============================================== 70 | ## OLD STUFF 71 | ## ============================================== 72 | 73 | # TODO: This is just for testing that our Sessions object 74 | # validates correctly. The parser/sanitizer should be fixed 75 | # to use the Sessions object directly 76 | @DeprecationWarning 77 | def convertWorkload(conn): 78 | old_workload = conn['designer']['mongo_comm'] 79 | new_workload = ['workload'] 80 | 81 | new_sess = conn['designer'].Session() 82 | new_sess['ip1'] = u'127.0.0.1:59829' 83 | new_sess['ip2'] = u'127.0.0.1:27017' 84 | 85 | for trace in old_workload.find({'IP1': new_sess['ip1'], 'IP2': new_sess['ip2']}): 86 | new_sess['uid'] = trace['uid'] 87 | if not trace['content']: continue 88 | 89 | assert len(trace['content']) == 1, pformat(trace['content']) 90 | #print "CONTENT:", pformat(trace['content']) 91 | op = { 92 | 'collection': trace['collection'], 93 | 'content': trace['content'][0], 94 | 'timestamp': float(trace['timestamp']), 95 | 'type': trace['type'], 96 | 'size': int(trace['size'].replace("bytes", "")), 97 | } 98 | new_sess['operations'].append(op) 99 | ## FOR 100 | 101 | print new_sess 102 | new_sess.save() 103 | ## DEF -------------------------------------------------------------------------------- /tests/README: -------------------------------------------------------------------------------- 1 | Testing code for MongoDB-Designer 2 | The script "runTests.sh" will execute all Python scripts 3 | 4 | Dependencies: 5 | python-nose 6 | 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Third-Party Dependencies 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../libs")) 7 | sys.path.append(os.path.join(basedir, "../src")) 8 | 9 | from mongodbtestcase import MongoDBTestCase -------------------------------------------------------------------------------- /tests/api/unittest_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | import string 6 | import random 7 | import unittest 8 | from pprint import pprint, pformat 9 | 10 | basedir = os.path.realpath(os.path.dirname(__file__)) 11 |
sys.path.append(os.path.join(basedir, "../../src")) 12 | sys.path.append(os.path.join(basedir, "../../exps")) 13 | from api.results import Results 14 | 15 | class TestResults(unittest.TestCase): 16 | 17 | def setUp(self): 18 | self.txnNames = [ ] 19 | for i in xrange(0, 6): 20 | self.txnNames.append("txn-%02d" % i) 21 | pass 22 | 23 | def compareResults(self, r1, r2): 24 | self.assertEquals(r1.start, r2.start) 25 | self.assertEquals(r1.stop, r2.stop) 26 | for txn in self.txnNames: 27 | self.assertEquals(r1.txn_counters[txn], r2.txn_counters[txn]) 28 | self.assertEquals(r1.txn_times[txn], r2.txn_times[txn]) 29 | ## FOR 30 | self.assertEquals(len(r1.completed), len(r2.completed)) 31 | ## DEF 32 | 33 | def testOpCount(self): 34 | totalOpCount = 0 35 | results = [ Results() for i in xrange(10) ] 36 | map(Results.startBenchmark, results) 37 | for r in results: 38 | for i in xrange(0, 5000): 39 | txn = random.choice(self.txnNames) 40 | id = r.startTransaction(txn) 41 | assert id != None 42 | ops = random.randint(1, 10) 43 | r.stopTransaction(id, ops) 44 | totalOpCount += ops 45 | ## FOR 46 | ## FOR 47 | map(Results.stopBenchmark, results) 48 | 49 | r = Results() 50 | map(r.append, results) 51 | self.assertEquals(totalOpCount, r.opCount) 52 | ## DEF 53 | 54 | 55 | def testAppend(self): 56 | r1 = Results() 57 | r1.startBenchmark() 58 | for i in xrange(0, 5000): 59 | txn = random.choice(self.txnNames) 60 | id = r1.startTransaction(txn) 61 | assert id != None 62 | r1.stopTransaction(id, 1) 63 | ## FOR 64 | r1.stopBenchmark() 65 | print r1.show() 66 | 67 | # Append the time and then make sure they're the same 68 | r2 = Results() 69 | r2.append(r1) 70 | self.compareResults(r1, r2) 71 | 72 | ## DEF 73 | 74 | def testPickle(self): 75 | r = Results() 76 | r.startBenchmark() 77 | for i in xrange(0, 1000): 78 | txn = random.choice(self.txnNames) 79 | id = r.startTransaction(txn) 80 | assert id != None 81 | r.stopTransaction(id, 1) 82 | ## FOR 83 | 84 | # Serialize 85 | import pickle 86 | p = pickle.dumps(r, -1) 87 | assert p 88 | 89 | # Deserialize 90 | clone = pickle.loads(p) 91 | assert clone 92 | 93 | # Make sure the txn counts are equal 94 | self.compareResults(r, clone) 95 | ## DEF 96 | 97 | ## CLASS 98 | 99 | if __name__ == '__main__': 100 | unittest.main() 101 | ## MAIN -------------------------------------------------------------------------------- /tests/catalog/unittest_utilmethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, sys 4 | import itertools 5 | 6 | basedir = os.path.realpath(os.path.dirname(__file__)) 7 | sys.path.append(os.path.join(basedir, "../../src")) 8 | 9 | import unittest 10 | 11 | import catalog 12 | 13 | class TestUtilMethods(unittest.TestCase): 14 | TEST_FIELDS = { 15 | "scalarKey": 1234, 16 | "listKey": range(10), 17 | "nestedKey": { 18 | "innerKey1": 5678, 19 | "innerKey2": 5678, 20 | } 21 | } 22 | 23 | def testGetAllValues(self): 24 | values = catalog.getAllValues(TestUtilMethods.TEST_FIELDS) 25 | self.assertIsNotNone(values) 26 | self.assertIsInstance(values, tuple) 27 | 28 | # Make sure we can hash it 29 | hash_v = hash(values) 30 | # print "hash_v:", hash_v 31 | self.assertIsNotNone(hash_v) 32 | 33 | for v in TestUtilMethods.TEST_FIELDS.itervalues(): 34 | if isinstance(v, dict): 35 | expected = tuple(v.values()) 36 | elif isinstance(v, list): 37 | expected = tuple(v) 38 | else: 39 | expected = v 40 | self.assertIn(expected, values) 41 | ## FOR 42 | ## DEF 43 | 44 | def 
testGetFieldValue(self): 45 | fields = TestUtilMethods.TEST_FIELDS 46 | for shardKey in fields.keys(): 47 | expected = fields[shardKey] 48 | if shardKey == "nestedKey": 49 | expected = fields[shardKey]["innerKey2"] 50 | shardKey += ".innerKey2" 51 | 52 | actual = catalog.getFieldValue(shardKey, fields) 53 | # print shardKey, "->", actual 54 | self.assertIsNotNone(actual, shardKey) 55 | self.assertEqual(expected, actual, shardKey) 56 | ## FOR 57 | 58 | ## Make sure that if we give it an invalid key that we get back None 59 | actual = catalog.getFieldValue("LiptonSoup", fields) 60 | self.assertIsNone(actual) 61 | ## DEF 62 | 63 | def testFieldTypeSerialization(self): 64 | for t in [ int, str, unicode, float ]: 65 | t_bson = catalog.fieldTypeToString(t) 66 | self.assertFalse(t_bson == None) 67 | #print "BSON:", t_bson 68 | t_python = catalog.fieldTypeToPython(t_bson) 69 | self.assertFalse(t_python == None) 70 | #print "PYTHON:", t_python 71 | self.assertEquals(t, t_python) 72 | ## FOR 73 | ## DEF 74 | 75 | ## CLASS 76 | 77 | if __name__ == '__main__': 78 | unittest.main() 79 | ## MAIN -------------------------------------------------------------------------------- /tests/costmodel/disk/unittest_diskcostcomponent_indexinsertionpenalty.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | from pprint import pformat 6 | import unittest 7 | 8 | basedir = os.path.realpath(os.path.dirname(__file__)) 9 | sys.path.append(os.path.join(basedir, "..")) 10 | 11 | # mongodb-d4 12 | from costmodeltestcase import CostModelTestCase 13 | from search import Design 14 | from workload import Session 15 | from util import constants 16 | from costmodel.disk import DiskCostComponent 17 | 18 | class TestDiskCost_IndexInsertionPenalty(CostModelTestCase): 19 | 20 | def setUp(self): 21 | CostModelTestCase.setUp(self) 22 | self.cm = DiskCostComponent(self.state) 23 | ## DEF 24 | 25 | def testDiskCost_IndexInsertionPenalty(self): 26 | """ 27 | IndexInsertionPenalty should be high if we build bad indexes 28 | """ 29 | # 1 30 | d = Design() 31 | for col_name in CostModelTestCase.COLLECTION_NAMES: 32 | d.addCollection(col_name) 33 | d.addIndex(col_name, ["field00"]) 34 | ## FOR 35 | 36 | self.cm.reset() 37 | self.cm.state.reset() 38 | self.cm.getCost(d) 39 | p0 = self.cm.total_index_insertion_penalty 40 | 41 | # 2 42 | d = Design() 43 | for col_name in CostModelTestCase.COLLECTION_NAMES: 44 | d.addCollection(col_name) 45 | d.addIndex(col_name, ["field01"]) 46 | ## FOR 47 | 48 | self.cm.reset() 49 | self.cm.state.reset() 50 | self.cm.getCost(d) 51 | p1 = self.cm.total_index_insertion_penalty 52 | 53 | self.assertEqual(p0, p1) 54 | 55 | # 3 56 | d = Design() 57 | for col_name in CostModelTestCase.COLLECTION_NAMES: 58 | d.addCollection(col_name) 59 | d.addIndex(col_name, ["field00", "field01"]) 60 | ## FOR 61 | 62 | self.cm.reset() 63 | self.cm.state.reset() 64 | self.cm.getCost(d) 65 | p2 = self.cm.total_index_insertion_penalty 66 | 67 | self.assertEqual(p0, p2) 68 | 69 | # 4 70 | d = Design() 71 | for col_name in CostModelTestCase.COLLECTION_NAMES: 72 | d.addCollection(col_name) 73 | d.addIndex(col_name, ["field00", "field02"]) 74 | ## FOR 75 | 76 | self.cm.reset() 77 | self.cm.state.reset() 78 | self.cm.getCost(d) 79 | p3 = self.cm.total_index_insertion_penalty 80 | 81 | self.assertGreater(p3, p0) 82 | 83 | # 5 84 | d = Design() 85 | for col_name in CostModelTestCase.COLLECTION_NAMES: 86 | d.addCollection(col_name) 87
| d.addIndex(col_name, ["field01", "field02"]) 88 | ## FOR 89 | 90 | self.cm.reset() 91 | self.cm.state.reset() 92 | self.cm.getCost(d) 93 | p4 = self.cm.total_index_insertion_penalty 94 | 95 | self.assertGreater(p4, p0) 96 | 97 | # 6 98 | d = Design() 99 | for col_name in CostModelTestCase.COLLECTION_NAMES: 100 | d.addCollection(col_name) 101 | d.addIndex(col_name, ["field00", "field01", "field02"]) 102 | ## FOR 103 | 104 | self.cm.reset() 105 | self.cm.state.reset() 106 | self.cm.getCost(d) 107 | p5 = self.cm.total_index_insertion_penalty 108 | 109 | self.assertGreater(p5, p0) 110 | ## DEF 111 | 112 | def testDiskCost_IndexInsertionPenalty_integrated_to_cost_component(self): 113 | """ 114 | Check if index insertion penalty contributes to the total disk cost 115 | """ 116 | 117 | ## DEF 118 | 119 | ## CLASS 120 | 121 | if __name__ == '__main__': 122 | unittest.main() 123 | ## MAIN 124 | -------------------------------------------------------------------------------- /tests/costmodel/disk/unittest_diskcostcomponentindexes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | from pprint import pformat 6 | import unittest 7 | 8 | basedir = os.path.realpath(os.path.dirname(__file__)) 9 | sys.path.append(os.path.join(basedir, "..")) 10 | 11 | # mongodb-d4 12 | from costmodeltestcase_index import CostModelTestCase 13 | from search import Design 14 | from workload import Session 15 | from util import constants 16 | from costmodel.disk import DiskCostComponent 17 | 18 | class TestDiskCostIndexes(CostModelTestCase): 19 | 20 | def setUp(self): 21 | CostModelTestCase.setUp(self) 22 | self.cm = DiskCostComponent(self.state) 23 | self.cm.no_index_insertion_penalty = True 24 | ## DEF 25 | def testDiskCostIndexes(self): 26 | """Check whether disk cost calculations work correctly""" 27 | # First get the disk cost when there are no indexes 28 | d = Design() 29 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 30 | d.addCollection(col_info['name']) 31 | 32 | cost0 = self.cm.getCost(d) 33 | print "diskCost0:", cost0 34 | # The cost should be exactly equal to one, which means that every operation 35 | # has to perform a full sequential scan on the collection 36 | self.assertEqual(cost0, 1.0) 37 | 38 | # Now add all of the indexes. The disk cost should be lower 39 | d = Design() 40 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 41 | d.addCollection(col_info['name']) 42 | d.addIndex(col_info['name'], col_info['interesting']) 43 | self.state.invalidateCache(col_info['name']) 44 | 45 | self.cm.reset() 46 | self.cm.state.reset() 47 | cost1 = self.cm.getCost(d) 48 | print "diskCost1:", cost1 49 | self.assertGreater(cost0, cost1) 50 | 51 | def testDiskCostOnDifferentIndexes(self): 52 | """Check how indexes will affect the disk cost""" 53 | # 1. Put an index on each of the two fields separately 54 | d = Design() 55 | d.addCollection(CostModelTestCase.COLLECTION_NAME) 56 | d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field00"]) 57 | d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01"]) 58 | 59 | self.cm.reset() 60 | self.cm.state.reset() 61 | cost0 = self.cm.getCost(d) 62 | print "diskCost0:", cost0 63 | 64 | # 2.
Put an index on both fields together 65 | d = Design() 66 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 67 | d.addCollection(CostModelTestCase.COLLECTION_NAME) 68 | d.addIndex(CostModelTestCase.COLLECTION_NAME, ["field01", "field00"]) 69 | self.state.invalidateCache(col_info['name']) 70 | 71 | self.cm.reset() 72 | self.cm.state.reset() 73 | cost1 = self.cm.getCost(d) 74 | print "diskCost1:", cost1 75 | 76 | self.assertGreater(cost0, cost1) 77 | 78 | def testDiskCostCaching(self): 79 | """Check whether disk cost calculations work correctly with caching enabled""" 80 | self.cm.cache_enable = True 81 | 82 | # Give the cost model a full Design with indexes 83 | d = Design() 84 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 85 | d.addCollection(col_info['name']) 86 | d.addIndex(col_info['name'], col_info['interesting']) 87 | ## FOR 88 | cost0 = self.cm.getCost(d) 89 | print "diskCost0:", cost0 90 | # FIXME self.assertGreater(cost0, 0.0) 91 | 92 | # We should get the same cost back after we execute it a second time 93 | cost1 = self.cm.getCost(d) 94 | print "diskCost1:", cost1 95 | # FIXME self.assertEqual(cost0, cost1) 96 | ## DEF 97 | 98 | ## CLASS 99 | 100 | if __name__ == '__main__': 101 | unittest.main() 102 | ## MAIN 103 | -------------------------------------------------------------------------------- /tests/costmodel/disk/unittest_diskcostcomponentindexes_withprojection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | from pprint import pformat 6 | import unittest 7 | 8 | basedir = os.path.realpath(os.path.dirname(__file__)) 9 | sys.path.append(os.path.join(basedir, "..")) 10 | 11 | # mongodb-d4 12 | from costmodeltestcase_index_withprojection import CostModelTestCase 13 | from search import Design 14 | from workload import Session 15 | from util import constants 16 | from costmodel.disk import DiskCostComponent 17 | 18 | class TestDiskCostIndexesWithProjection(CostModelTestCase): 19 | 20 | def setUp(self): 21 | CostModelTestCase.setUp(self) 22 | self.cm = DiskCostComponent(self.state) 23 | self.cm.no_index_insertion_penalty = True 24 | 25 | ## DEF 26 | def testDiskCostIndexes(self): 27 | """Check whether disk cost calculations work correctly""" 28 | # First get the disk cost when there are no indexes 29 | d = Design() 30 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 31 | d.addCollection(col_info['name']) 32 | 33 | cost0 = self.cm.getCost(d) 34 | print "diskCost0:", cost0 35 | # The cost should be exactly equal to one, which means that every operation 36 | # has to perform a full sequential scan on the collection 37 | self.assertEqual(cost0, 1.0) 38 | 39 | # Now add one index. The disk cost should be lower 40 | d = Design() 41 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 42 | d.addCollection(col_info['name']) 43 | d.addIndex(col_info['name'], ["field01"]) 44 | self.state.invalidateCache(col_info['name']) 45 | 46 | self.cm.reset() 47 | self.cm.state.reset() 48 | cost1 = self.cm.getCost(d) 49 | print "diskCost1:", cost1 50 | self.assertGreater(cost0, cost1) 51 | 52 | # Now add one more index.
The disk cost should be lower again 53 | d = Design() 54 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 55 | d.addCollection(col_info['name']) 56 | d.addIndex(col_info['name'], ["field01", "field00"]) 57 | self.state.invalidateCache(col_info['name']) 58 | 59 | self.cm.reset() 60 | self.cm.state.reset() 61 | cost2 = self.cm.getCost(d) 62 | print "diskCost2:", cost2 63 | 64 | # Now add a third field to the index. The disk cost should be much lower 65 | d = Design() 66 | col_info = self.collections[CostModelTestCase.COLLECTION_NAME] 67 | d.addCollection(col_info['name']) 68 | d.addIndex(col_info['name'], ["field01", "field00", "field02"]) 69 | self.state.invalidateCache(col_info['name']) 70 | 71 | self.cm.reset() 72 | self.cm.state.reset() 73 | cost3 = self.cm.getCost(d) 74 | print "diskCost3:", cost3 75 | self.assertGreater(cost2, cost3) 76 | 77 | ## CLASS 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | ## MAIN 82 | -------------------------------------------------------------------------------- /tests/costmodel/disk/unittest_fastlrubuffer.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import unittest 3 | 4 | basedir = os.path.realpath(os.path.dirname(__file__)) 5 | sys.path.append(os.path.join(basedir, "../../../src")) 6 | 7 | from costmodel.disk.fastlrubufferusingwindow import FastLRUBufferWithWindow 8 | 9 | class TestFastLRUbufferWithWindow(unittest.TestCase): 10 | 11 | def setUp(self): 12 | pass 13 | 14 | def testAllBufferOperations_push(self): 15 | self.lru = FastLRUBufferWithWindow(1) 16 | slot_size = 1 17 | for i in xrange(100): 18 | tup = (i) 19 | self.lru.__push__(tup, slot_size) 20 | 21 | self.assertEqual(len(self.lru.buffer), self.lru.window_size) 22 | 23 | def testAllBufferOperations_push_slotsize_0(self): 24 | self.lru = FastLRUBufferWithWindow(10) 25 | slot_size = 1 26 | for i in xrange(9): 27 | tup = (i) 28 | self.lru.__push__(tup, slot_size) 29 | 30 | tup = (9) 31 | slot_size = 9 32 | self.lru.__push__(9, slot_size) 33 | self.assertEqual(len(self.lru.buffer), 2) 34 | 35 | def testAllBufferOperations_push_slotsize_1(self): 36 | self.lru = FastLRUBufferWithWindow(10) 37 | slot_size = 1 38 | for i in xrange(9): 39 | tup = (i) 40 | self.lru.__push__(tup, slot_size) 41 | 42 | tup = (9) 43 | slot_size = 10 44 | self.lru.__push__(9, slot_size) 45 | self.assertEqual(len(self.lru.buffer), 1) 46 | ## DEF 47 | 48 | def testAllBufferOperations_push_slotsize_2(self): 49 | self.lru = FastLRUBufferWithWindow(10) 50 | slot_size = 1 51 | for i in xrange(9): 52 | tup = (i) 53 | self.lru.__push__(tup, slot_size) 54 | 55 | tup = (9) 56 | slot_size = 10 57 | self.lru.__push__(tup, slot_size) 58 | self.assertEqual(len(self.lru.buffer), 1) 59 | 60 | slot_size = 1 61 | for i in xrange(9): 62 | tup = (i) 63 | self.lru.__push__(tup, slot_size) 64 | 65 | self.assertEqual(len(self.lru.buffer), 9) 66 | ## DEF 67 | 68 | def testAllBufferOperations_push_slotsize_3(self): 69 | self.lru = FastLRUBufferWithWindow(10) 70 | slot_size = 1 71 | for i in xrange(9): 72 | tup = (i) 73 | self.lru.__push__(tup, slot_size) 74 | 75 | tup = (9) 76 | slot_size = 10 77 | self.lru.__push__(tup, slot_size) 78 | self.assertEqual(len(self.lru.buffer), 1) 79 | 80 | tup = (11) 81 | slot_size = 1 82 | self.lru.__push__(tup, slot_size) 83 | 84 | self.assertEqual(len(self.lru.buffer), 1) 85 | ## DEF 86 | 87 | def testAllBufferOperations_update(self): 88 | self.lru = FastLRUBufferWithWindow(100) 89 | slot_size = 1 90 | for i in xrange(100): 91 | tup =
(i) 92 | self.lru.__push__(tup, slot_size) 93 | 94 | for i in xrange(100): 95 | tup = (i) 96 | self.lru.__update__(tup) 97 | self.assertEqual(self.lru.tail[2], i) 98 | 99 | def testAllBufferOperations_pop(self): 100 | self.lru = FastLRUBufferWithWindow(100) 101 | slot_size = 1 102 | for i in xrange(100): 103 | tup = (i) 104 | self.lru.__push__(tup, slot_size) 105 | for i in xrange(100): 106 | self.lru.__pop__() 107 | self.assertEqual(len(self.lru.buffer), self.lru.window_size - i - 1) 108 | 109 | 110 | if __name__ == '__main__': 111 | unittest.main() 112 | 113 | 114 | -------------------------------------------------------------------------------- /tests/costmodel/network/unittest_networkcostcomponenttpcc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | from pprint import pformat 6 | import unittest 7 | import copy 8 | 9 | basedir = os.path.realpath(os.path.dirname(__file__)) 10 | sys.path.append(os.path.join(basedir, "../..")) 11 | 12 | # mongodb-d4 13 | from tpcctestcase import TPCCTestCase as CostModelTestCase 14 | from search import Design 15 | from workload import Session 16 | from util import constants 17 | from costmodel.network import NetworkCostComponent 18 | from workload.workloadcombiner import WorkloadCombiner 19 | from tpcc import constants as tpccConstants 20 | 21 | class TestNetworkCostTPCC(CostModelTestCase): 22 | 23 | def setUp(self): 24 | CostModelTestCase.setUp(self) 25 | self.cm = NetworkCostComponent(self.state) 26 | ## DEF 27 | 28 | def testNetworkCostDenormalization(self): 29 | """Check network cost for queries that reference denormalized collections""" 30 | # Get the "base" design cost when all of the collections 31 | # are sharded on their "interesting" fields 32 | d = Design() 33 | i = 0 34 | for col_info in self.collections.itervalues(): 35 | d.addCollection(col_info['name']) 36 | if i == 0: 37 | d.addShardKey(col_info['name'], col_info['interesting']) 38 | else: 39 | d.addShardKey(col_info['name'], ["_id"]) 40 | 41 | self.cm.invalidateCache(d, col_info['name']) 42 | i += 1 43 | ## FOR 44 | self.cm.reset() 45 | self.state.reset() 46 | cost0 = self.cm.getCost(d) 47 | 48 | print "cost0:", cost0 49 | 50 | # Now get the network cost for when we denormalize the 51 | # second collection inside of the first one 52 | # We should have a lower cost because there should now be fewer queries 53 | d = Design() 54 | i = 0 55 | for col_info in self.collections.itervalues(): 56 | self.assertTrue(col_info['interesting']) 57 | d.addCollection(col_info['name']) 58 | if i == 0: 59 | d.addShardKey(col_info['name'], col_info['interesting']) 60 | else: 61 | d.addShardKey(col_info['name'], ["_id"]) 62 | self.cm.invalidateCache(d, col_info['name']) 63 | i += 1 64 | 65 | d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS) 66 | 67 | combiner = WorkloadCombiner(self.collections, self.workload) 68 | combinedWorkload = combiner.process(d) 69 | self.state.updateWorkload(combinedWorkload) 70 | 71 | self.cm.reset() 72 | self.state.reset() 73 | cost1 = self.cm.getCost(d) 74 | print "cost1:", cost1 75 | 76 | self.assertLess(cost1, cost0) 77 | # DEF 78 | 79 | ## CLASS 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | ## MAIN -------------------------------------------------------------------------------- /tests/costmodel/skew/unittest_skewcostcomponent.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os, sys
5 | from pprint import pformat
6 | import unittest
7 | 
8 | basedir = os.path.realpath(os.path.dirname(__file__))
9 | sys.path.append(os.path.join(basedir, "../"))
10 | 
11 | # mongodb-d4
12 | from costmodeltestcase import CostModelTestCase
13 | from search import Design
14 | from workload import Session
15 | from util import constants
16 | from costmodel.skew import SkewCostComponent
17 | 
18 | class TestSkewCost(CostModelTestCase):
19 | 
20 |     def setUp(self):
21 |         CostModelTestCase.setUp(self)
22 |         self.cm = SkewCostComponent(self.state)
23 |     ## DEF
24 | 
25 |     def testSkewCost(self):
26 |         """Check whether skew cost calculations work correctly"""
27 |         col_info = self.collections[CostModelTestCase.COLLECTION_NAMES[0]]
28 |         shard_key = col_info['interesting'][0]
29 | 
30 |         d = Design()
31 |         d.addCollection(col_info['name'])
32 |         d.addShardKey(col_info['name'], [shard_key])
33 | 
34 |         # First get the skew cost when the queries hit each node uniformly
35 |         # This is the best-case scenario
36 |         op_ctr = 0
37 |         for sess in self.workload:
38 |             for op in sess['operations']:
39 |                 query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
40 |                                        {shard_key: op_ctr % CostModelTestCase.NUM_NODES }\
41 |                 } ]
42 |                 op['collection'] = col_info['name']
43 |                 op['query_content'] = query_content
44 |                 op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
45 |                 op_ctr += 1
46 |             ## FOR (op)
47 |         ## FOR (session)
48 | 
49 |         col_info["fields"][shard_key]["ranges"] = range(CostModelTestCase.NUM_NODES)
50 | 
51 |         cost0 = self.cm.getCost(d)
52 |         self.assertLessEqual(cost0, 1.0)
53 |         # print "skewCost0:", cost0
54 | 
55 |         # Then make all of the operations go to a single node
56 |         # This is the worst-case scenario
57 |         query_content = [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query":\
58 |                                {shard_key: 1000L }\
59 |         } ]
60 |         for sess in self.workload:
61 |             for op in sess['operations']:
62 |                 op['collection'] = col_info['name']
63 |                 op['query_content'] = query_content
64 |                 op['predicates'] = { shard_key: constants.PRED_TYPE_EQUALITY }
65 |         ## FOR
66 |         self.state.reset()
67 |         self.cm.reset()
68 |         cost1 = self.cm.getCost(d)
69 |         self.assertLessEqual(cost1, 1.0)
70 |         # print "skewCost1:", cost1
71 | 
72 |         self.assertGreater(cost1, cost0)
73 | 
74 |     ## DEF
75 | 
76 |     def testGetSplitWorkload(self):
77 |         """Check that the workload is split into intervals"""
78 | 
79 |         self.assertEqual(CostModelTestCase.NUM_SESSIONS, sum(map(len, self.cm.workload_segments)))
80 |         for i in xrange(0, CostModelTestCase.NUM_INTERVALS):
81 |             # print "[%02d]: %d" % (i, len(self.cm.workload_segments[i]))
82 |             self.assertGreater(len(self.cm.workload_segments[i]), 0)
83 |         ## FOR
84 |         self.assertEqual(CostModelTestCase.NUM_INTERVALS, len(self.cm.workload_segments))
85 |     ## DEF
86 | 
87 | 
88 | ## CLASS
89 | 
90 | if __name__ == '__main__':
91 |     unittest.main()
92 | ## MAIN
-------------------------------------------------------------------------------- /tests/costmodel/unittest_costmodel.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os, sys
5 | import unittest
6 | 
7 | basedir = os.path.realpath(os.path.dirname(__file__))
8 | sys.path.append(os.path.join(basedir, "../../"))
9 | 
10 | # mongodb-d4
11 | from costmodeltestcase import CostModelTestCase
12 | import costmodel
13 | from search import Design
14 | 
15 | class TestCostModel(CostModelTestCase):
16 | 
17 |     def setUp(self):
18 |         CostModelTestCase.setUp(self)
19 |         self.cm = costmodel.CostModel(self.collections, self.workload, self.costModelConfig)
20 |     ## DEF
21 | 
22 |     def testSameDesignExecutedTwice_withemptydesign(self):
23 |         """
24 |         If the same design is executed twice, it should produce the same result
25 |         """
26 |         d = Design()
27 |         for col_name in CostModelTestCase.COLLECTION_NAMES:
28 |             d.addCollection(col_name)
29 |         ## FOR
30 | 
31 |         cost0 = self.cm.overallCost(d)
32 |         cost1 = self.cm.overallCost(d)
33 | 
34 |         self.assertEqual(cost0, cost1)
35 | 
36 |     ## DEF
37 | 
38 |     def testSameDesignExecutedTwice_withfulldesign(self):
39 |         """
40 |         If the same design is executed twice, it should produce the same result
41 |         """
42 |         d = Design()
43 |         for col_name in CostModelTestCase.COLLECTION_NAMES:
44 |             d.addCollection(col_name)
45 |             col_info = self.collections[col_name]
46 |             d.addIndex(col_name, col_info['interesting'])
47 |         ## FOR
48 | 
49 |         cost0 = self.cm.overallCost(d)
50 |         cost1 = self.cm.overallCost(d)
51 | 
52 |         self.assertEqual(cost0, cost1)
53 |     ## DEF
54 | 
55 | ## CLASS
56 | 
57 | if __name__ == '__main__':
58 |     unittest.main()
59 | ## MAIN
-------------------------------------------------------------------------------- /tests/costmodel/unittest_costmodel_denormalization.py: --------------------------------------------------------------------------------
1 | 
2 | import unittest
3 | import os
4 | import sys
5 | 
6 | basedir = os.path.realpath(os.path.dirname(__file__))
7 | sys.path.append(os.path.join(basedir, "../../src"))
8 | sys.path.append(os.path.join(basedir, "../../src/search"))
9 | sys.path.append(os.path.join(basedir, "../"))
10 | 
11 | from util import constants
12 | from tpcctestcase import TPCCTestCase
13 | from search import Design
14 | from costmodel import CostModel
15 | from tpcc import constants as tpccConstants
16 | 
17 | class FindExpectedDesign(TPCCTestCase):
18 |     """
19 |     Try to see if the existing cost model can generate the best design we
20 |     expect
21 |     """
22 |     def setUp(self):
23 |         TPCCTestCase.setUp(self)
24 |     ## DEF
25 | 
26 |     def testfindExpectedDesign(self):
27 |         """Denormalizing ORDER_LINE into ORDERS should lower the overall cost"""
28 |         # Generate all the design candidates
29 |         # Instantiate cost model
30 |         cmConfig = {
31 |             'weight_network': 4,
32 |             'weight_disk': 1,
33 |             'weight_skew': 1,
34 |             'nodes': 10,
35 |             'max_memory': 1024,
36 |             'skew_intervals': 10,
37 |             'address_size': 64,
38 |             'window_size': 500
39 |         }
40 |         cm = CostModel(self.collections, self.workload, cmConfig)
41 |         d0 = self.getManMadeDesign()
42 |         cost0 = cm.overallCost(d0)
43 | 
44 |         d1 = d0.copy()
45 |         d1.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
46 |         cost1 = cm.overallCost(d1)
47 | 
48 |         self.assertLess(cost1, cost0)
49 |     ## DEF
50 | 
51 |     def getManMadeDesign(self, denorm=True):
52 |         # create a best design manually
53 | 
54 |         d = Design()
55 |         d.addCollection(tpccConstants.TABLENAME_ITEM)
56 |         d.addCollection(tpccConstants.TABLENAME_WAREHOUSE)
57 |         d.addCollection(tpccConstants.TABLENAME_DISTRICT)
58 |         d.addCollection(tpccConstants.TABLENAME_CUSTOMER)
59 |         d.addCollection(tpccConstants.TABLENAME_STOCK)
60 |         d.addCollection(tpccConstants.TABLENAME_ORDERS)
61 |         d.addCollection(tpccConstants.TABLENAME_NEW_ORDER)
62 |         d.addCollection(tpccConstants.TABLENAME_ORDER_LINE)
63 | 
64 |         d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"])
65 |         d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
66 | 
d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"]) 67 | d.addIndex(tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID","C_ID"]) 68 | d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"]) 69 | d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"]) 70 | d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"]) 71 | d.addIndex(tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"]) 72 | d.addIndex(tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"]) 73 | 74 | d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"]) 75 | d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"]) 76 | d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"]) 77 | d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"]) 78 | d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"]) 79 | d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"]) 80 | d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"]) 81 | d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"]) 82 | 83 | return d 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | ## MAIN 88 | -------------------------------------------------------------------------------- /tests/exps/replay/unittest_denormalizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os, sys 4 | basedir = os.path.realpath(os.path.dirname(__file__)) 5 | sys.path.append(os.path.join(basedir, "../../../src")) 6 | sys.path.append(os.path.join(basedir, "../../../src/search")) 7 | sys.path.append(os.path.join(basedir, "../../../exps/benchmarks/replay")) 8 | 9 | import unittest 10 | from workloadgenerator import CostModelTestCase 11 | from search import Design 12 | from denormalizer import Denormalizer 13 | 14 | class TestDenormalizer(CostModelTestCase): 15 | 16 | def setUp(self): 17 | CostModelTestCase.setUp(self) 18 | self.col_names = [ x for x in self.collections.iterkeys()] 19 | ## DEF 20 | 21 | def testDenormalizer(self): 22 | d = Design() 23 | for col_name in self.col_names: 24 | d.addCollection(col_name) 25 | ## FOR 26 | op_list = self.printOperations() 27 | col_list = self.printAllCollections() 28 | d.setDenormalizationParent("koalas", "apples") 29 | 30 | dn = Denormalizer(self.metadata_db, self.dataset_db, d) 31 | dn.process() 32 | 33 | new_op_list = self.printOperations() 34 | new_col_list = self.printAllCollections() 35 | 36 | self.assertTrue("koalas" not in new_op_list) 37 | self.assertTrue("koalas" not in new_col_list) 38 | ## DEF 39 | 40 | def printOperations(self): 41 | op_list = [] 42 | for sess in self.metadata_db.Session.fetch(): 43 | for op in sess['operations']: 44 | op_list.append(op['collection']) 45 | ## FOR 46 | ## FOR 47 | return op_list 48 | ## DEF 49 | 50 | def printAllCollections(self): 51 | col_list = [ ] 52 | for col_name in self.dataset_db.collection_names(): 53 | col_list.append(col_name) 54 | ## FOR 55 | return col_list 56 | ## DEF 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | ## MAIN -------------------------------------------------------------------------------- /tests/exps/replay/workloadgenerator.py: -------------------------------------------------------------------------------- 1 | 2 | import os, sys 3 | import random 4 | import time 5 | 6 | basedir = os.path.realpath(os.path.dirname(__file__)) 7 | sys.path.append(os.path.join(basedir, "../../../")) 8 | sys.path.append(os.path.join(basedir, "../")) 9 | # mongodb-d4 10 | try: 11 | 
from mongodbtestcase import MongoDBTestCase 12 | except ImportError: 13 | from tests import MongoDBTestCase 14 | 15 | from costmodel.state import State 16 | from search import Design 17 | from workload import Session 18 | from util import constants 19 | from inputs.mongodb import MongoSniffConverter 20 | 21 | class CostModelTestCase(MongoDBTestCase): 22 | """ 23 | Base test case for cost model components 24 | """ 25 | 26 | COLLECTION_NAMES = ["apples", "unexpected", "koalas"] 27 | NUM_DOCUMENTS = 10000 28 | NUM_SESSIONS = 2 29 | NUM_NODES = 8 30 | NUM_INTERVALS = 10 31 | 32 | def setUp(self): 33 | MongoDBTestCase.setUp(self) 34 | 35 | # WORKLOAD 36 | timestamp = time.time() 37 | for i in xrange(CostModelTestCase.NUM_SESSIONS): 38 | sess = self.metadata_db.Session() 39 | sess['session_id'] = i 40 | sess['ip_client'] = "client:%d" % (1234+i) 41 | sess['ip_server'] = "server:5678" 42 | sess['start_time'] = timestamp 43 | 44 | for j in xrange(0, len(CostModelTestCase.COLLECTION_NAMES)): 45 | _id = str(random.random()) 46 | queryId = long((i<<16) + j) 47 | queryContent = { } 48 | queryPredicates = { } 49 | 50 | responseContent = {"_id": _id} 51 | responseId = (queryId<<8) 52 | 53 | f_name = "field" + str(random.randint(0, 10)) 54 | responseContent[f_name] = random.randint(0, 100) 55 | queryContent[f_name] = responseContent[f_name] 56 | queryPredicates[f_name] = constants.PRED_TYPE_EQUALITY 57 | 58 | queryContent = { constants.REPLACE_KEY_DOLLAR_PREFIX + "query": queryContent } 59 | op = Session.operationFactory() 60 | op['collection'] = CostModelTestCase.COLLECTION_NAMES[j] 61 | op['type'] = constants.OP_TYPE_QUERY 62 | op['query_id'] = queryId 63 | op['query_content'] = [ queryContent ] 64 | op['resp_content'] = [ responseContent ] 65 | op['resp_id'] = responseId 66 | op['predicates'] = queryPredicates 67 | op['query_time'] = timestamp 68 | timestamp += 1 69 | op['resp_time'] = timestamp 70 | sess['operations'].append(op) 71 | ## FOR (ops) 72 | sess['end_time'] = timestamp 73 | timestamp += 2 74 | sess.save() 75 | ## FOR (sess) 76 | 77 | # Use the MongoSniffConverter to populate our metadata 78 | converter = MongoSniffConverter(self.metadata_db, self.dataset_db) 79 | converter.no_mongo_parse = True 80 | converter.no_mongo_sessionizer = True 81 | converter.process() 82 | self.assertEqual(CostModelTestCase.NUM_SESSIONS, self.metadata_db.Session.find().count()) 83 | 84 | self.collections = dict([ (c['name'], c) for c in self.metadata_db.Collection.fetch()]) 85 | self.assertEqual(len(CostModelTestCase.COLLECTION_NAMES), len(self.collections)) 86 | 87 | populated_workload = [c for c in self.metadata_db.Session.fetch()] 88 | self.workload = populated_workload 89 | 90 | # Increase the database size beyond what the converter derived from the workload 91 | for col_name, col_info in self.collections.iteritems(): 92 | col_info['doc_count'] = CostModelTestCase.NUM_DOCUMENTS 93 | col_info['avg_doc_size'] = 1024 # bytes 94 | col_info['max_pages'] = col_info['doc_count'] * col_info['avg_doc_size'] / (4 * 1024) 95 | col_info.save() 96 | # print pformat(col_info) 97 | 98 | self.costModelConfig = { 99 | 'max_memory': 1024, # MB 100 | 'skew_intervals': CostModelTestCase.NUM_INTERVALS, 101 | 'address_size': 64, 102 | 'nodes': CostModelTestCase.NUM_NODES, 103 | 'window_size': 3 104 | } 105 | 106 | self.state = State(self.collections, populated_workload, self.costModelConfig) 107 | ## DEF 108 | ## CLASS -------------------------------------------------------------------------------- 
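A note on the sizing constants in workloadgenerator.py above: 10,000 documents of 1 KiB each on 4 KiB pages works out to max_pages = 10000 * 1024 / 4096 = 2500 per collection. A minimal usage sketch for the generator follows; this is a hypothetical test module, assuming the same sys.path setup as the other tests in this directory:

    import unittest
    from workloadgenerator import CostModelTestCase

    class TestSyntheticWorkloadShape(CostModelTestCase):
        """Sanity-check the synthetic workload built by CostModelTestCase.setUp()."""

        def testWorkloadShape(self):
            # Two sessions, every operation aimed at one of the three collections
            self.assertEqual(CostModelTestCase.NUM_SESSIONS, len(self.workload))
            for sess in self.workload:
                for op in sess['operations']:
                    self.assertIn(op['collection'], CostModelTestCase.COLLECTION_NAMES)

    if __name__ == '__main__':
        unittest.main()
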
/tests/exps/tools/unittest_design_deserializer.py: --------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import sys
4 | 
5 | basedir = os.path.realpath(os.path.dirname(__file__))
6 | sys.path.append(os.path.join(basedir, "../../../src"))
7 | sys.path.append(os.path.join(basedir, "../../../src/search"))
8 | sys.path.append(os.path.join(basedir, "../.."))
9 | sys.path.append(os.path.join(basedir, "../../../exps/tools"))
10 | 
11 | from util import constants
12 | from tpcctestcase import TPCCTestCase
13 | from search import Design
14 | from design_deserializer import Deserializer
15 | from costmodel import CostModel
16 | from tpcc import constants as tpccConstants
17 | 
18 | class TestDesignDeserializer(TPCCTestCase):
19 |     """
20 |     Check that a design serialized with toJSON() and rebuilt with the
21 |     Deserializer yields the same overall cost
22 |     """
23 |     def setUp(self):
24 |         TPCCTestCase.setUp(self)
25 |     ## DEF
26 | 
27 |     def testDesignRoundTrip(self):
28 |         """Serialize a design to JSON, deserialize it, and compare the costs"""
29 |         # Instantiate cost model
30 |         cmConfig = {
31 |             'weight_network': 4,
32 |             'weight_disk': 1,
33 |             'weight_skew': 1,
34 |             'nodes': 10,
35 |             'max_memory': 1024,
36 |             'skew_intervals': 10,
37 |             'address_size': 64,
38 |             'window_size': 500
39 |         }
40 |         cm = CostModel(self.collections, self.workload, cmConfig)
41 |         d0 = self.getManMadeDesign()
42 |         print d0
43 |         output_design = d0.toJSON()
44 |         cost0 = cm.overallCost(d0)
45 |         ds = Deserializer(output_design)
46 |         d1 = ds.Deserialize()
47 |         print d1
48 |         cost1 = cm.overallCost(d1)
49 | 
50 |         self.assertEqual(cost1, cost0)
51 |     ## DEF
52 | 
53 |     def getManMadeDesign(self, denorm=True):
54 |         # create a best design manually
55 | 
56 |         d = Design()
57 |         d.addCollection(tpccConstants.TABLENAME_ITEM)
58 |         d.addCollection(tpccConstants.TABLENAME_WAREHOUSE)
59 |         d.addCollection(tpccConstants.TABLENAME_DISTRICT)
60 |         d.addCollection(tpccConstants.TABLENAME_CUSTOMER)
61 |         d.addCollection(tpccConstants.TABLENAME_STOCK)
62 |         d.addCollection(tpccConstants.TABLENAME_ORDERS)
63 |         d.addCollection(tpccConstants.TABLENAME_NEW_ORDER)
64 |         d.addCollection(tpccConstants.TABLENAME_ORDER_LINE)
65 | 
66 |         d.addIndex(tpccConstants.TABLENAME_ITEM, ["I_ID"])
67 |         d.addIndex(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
68 |         d.addIndex(tpccConstants.TABLENAME_DISTRICT, ["D_W_ID", "D_ID"])
69 |         d.addIndex(tpccConstants.TABLENAME_CUSTOMER, ["C_W_ID", "C_D_ID","C_ID"])
70 |         d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_C_ID"])
71 |         d.addIndex(tpccConstants.TABLENAME_ORDERS, ["O_W_ID", "O_D_ID", "O_ID"])
72 |         d.addIndex(tpccConstants.TABLENAME_STOCK, ["S_W_ID", "S_I_ID"])
73 |         d.addIndex(tpccConstants.TABLENAME_NEW_ORDER, ["NO_W_ID", "NO_D_ID", "NO_O_ID"])
74 |         d.addIndex(tpccConstants.TABLENAME_ORDER_LINE, ["OL_W_ID", "OL_D_ID", "OL_O_ID"])
75 | 
76 |         d.addShardKey(tpccConstants.TABLENAME_ITEM, ["I_ID"])
77 |         d.addShardKey(tpccConstants.TABLENAME_WAREHOUSE, ["W_ID"])
78 |         d.addShardKey(tpccConstants.TABLENAME_DISTRICT, ["W_ID"])
79 |         d.addShardKey(tpccConstants.TABLENAME_CUSTOMER, ["W_ID"])
80 |         d.addShardKey(tpccConstants.TABLENAME_ORDERS, ["W_ID"])
81 |         d.addShardKey(tpccConstants.TABLENAME_STOCK, ["W_ID"])
82 |         d.addShardKey(tpccConstants.TABLENAME_NEW_ORDER, ["W_ID"])
83 |         d.addShardKey(tpccConstants.TABLENAME_ORDER_LINE, ["W_ID"])
84 | 
85 |         return d
86 | 
87 | if __name__ == '__main__':
88 |     unittest.main()
89 | ## MAIN
90 | 
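The round trip under test can be exercised without the TPC-C fixture too. A minimal sketch, under the same sys.path assumptions as the module above; "ITEM" is just a placeholder collection name:

    from search import Design
    from design_deserializer import Deserializer

    d0 = Design()
    d0.addCollection("ITEM")
    d0.addIndex("ITEM", ["I_ID"])
    d0.addShardKey("ITEM", ["I_ID"])

    # Serialize to JSON and rebuild an equivalent design
    clone = Deserializer(d0.toJSON()).Deserialize()
    print d0
    print clone  # should list the same collection, index, and shard key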
-------------------------------------------------------------------------------- /tests/mongodbtestcase.py: --------------------------------------------------------------------------------
1 | 
2 | import os, sys
3 | import unittest
4 | 
5 | import logging
6 | logging.basicConfig(level = logging.INFO,
7 |                     format="%(asctime)s [%(filename)s:%(lineno)03d] %(levelname)-5s: %(message)s",
8 |                     datefmt="%m-%d-%Y %H:%M:%S",
9 |                     stream = sys.stdout)
10 | 
11 | basedir = os.path.realpath(os.path.dirname(__file__))
12 | sys.path.append(os.path.realpath(os.path.join(basedir, "../libs")))
13 | sys.path.append(os.path.realpath(os.path.join(basedir, "../src")))
14 | 
15 | # Third-Party Dependencies
16 | import mongokit
17 | 
18 | # mongodb-d4
19 | from catalog import Collection
20 | from workload import Session
21 | from util import constants
22 | 
23 | class MongoDBTestCase(unittest.TestCase):
24 |     """
25 |     Special test case that will automatically set up our connections
26 |     for the metadata and workload databases
27 |     """
28 | 
29 |     def setUp(self):
30 |         conn = mongokit.Connection()
31 |         conn.register([ Collection, Session ])
32 | 
33 |         # Drop the databases first
34 |         # Note that we prepend "test_" in front of the db names
35 |         db_prefix = "test_"
36 |         for dbName in [constants.METADATA_DB_NAME, constants.DATASET_DB_NAME]:
37 |             conn.drop_database(db_prefix + dbName)
38 |         self.metadata_db = conn[db_prefix + constants.METADATA_DB_NAME]
39 |         self.dataset_db = conn[db_prefix + constants.DATASET_DB_NAME]
40 | 
41 |     ## DEF
-------------------------------------------------------------------------------- /tests/runTests.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh -x
2 | 
3 | nosetests --verbose --nocapture $(find . -name "unittest*.py" -type f)
-------------------------------------------------------------------------------- /tests/sanitizer/trace-anon.out: --------------------------------------------------------------------------------
1 | sniffing... 
27017 2 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 admin.$cmd 60 bytes id:0 0 3 | query: { "whatsmyuri" : 1 } ntoreturn: 1 ntoskip: 0 4 | 000.000 - 127.0.0.1:27017 <<-- 127.0.0.1:33082 78 bytes id:10 16 - 0 5 | reply n:1 cursorId: 0 6 | { "you" : XXX_HASH_XXX/15, "ok" : 1 } 7 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 41 bytes id:1 1 8 | delete flags: 0 q: {} 9 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 109 bytes id:2 2 10 | insert: { "_id" : { "$oid" : XXX_HASH_XXX/24 }, "num" : 1, "key" : XXX_HASH_XXX/36 } 11 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 111 bytes id:3 3 12 | insert: { "_id" : { "$oid" : XXX_HASH_XXX/24 }, "num" : 2, "key" : XXX_HASH_XXX/38 } 13 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 114 bytes id:4 4 14 | insert: { "_id" : { "$oid" : XXX_HASH_XXX/24 }, "num" : 3, "key" : XXX_HASH_XXX/39 } 15 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 103 bytes id:5 5 16 | insert: { "_id" : { "$oid" : XXX_HASH_XXX/24 }, "num" : 4, "key" : XXX_HASH_XXX/36 } 17 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 108 bytes id:6 6 18 | update flags:0 q:{ "num" : 3 } o:{ "num" : 3, "key" : XXX_HASH_XXX/32 } 19 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 83 bytes id:7 7 20 | update flags:0 q:{ "num" : 1 } o:{ "num" : 1, "key" : XXX_HASH_XXX/2 } 21 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 88 bytes id:8 8 22 | delete flags: 0 q: { "key" : XXX_HASH_XXX/39 } 23 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 52 bytes id:9 9 24 | delete flags: 0 q: { "key" : XXX_HASH_XXX/2 } 25 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 65 bytes id:a 10 26 | delete flags: 0 q: { "key" : XXX_HASH_XXX/28 } 27 | 000.000 - 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.$cmd 86 bytes id:b 11 28 | query: { "count" : XXX_HASH_XXX/3, "query" : {}, "fields" : {} } ntoreturn: -1 ntoskip: 0 29 | 000.000 - 127.0.0.1:27017 <<-- 127.0.0.1:33082 64 bytes id:11 17 - 11 30 | reply n:1 cursorId: 0 31 | { "n" : 3, "ok" : 1 } 32 | -------------------------------------------------------------------------------- /tests/sanitizer/trace-clean.out: -------------------------------------------------------------------------------- 1 | sniffing... 27017 2 | 127.0.0.1:33082 -->> 127.0.0.1:27017 admin.$cmd 60 bytes id:0 0 3 | query: { "whatsmyuri" : 1 } ntoreturn: 1 ntoskip: 0 4 | 127.0.0.1:27017 <<-- 127.0.0.1:33082 78 bytes id:10 16 - 0 5 | reply n:1 cursorId: 0 6 | { "you" : "127.0.0.1:33082", "ok" : 1 } 7 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 41 bytes id:1 1 8 | delete flags: 0 q: {} 9 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 109 bytes id:2 2 10 | insert: { "_id" : { "$oid" : "4fbe85545df2ef2def485677" }, "num" : 1, "key" : " \t \tsome string \twith spaces \t" } 11 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 111 bytes id:3 3 12 | insert: { "_id" : { "$oid" : "4fbe85545df2ef2def485678" }, "num" : 2, "key" : "These \"quotes\" should be \"escaped\"" } 13 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 114 bytes id:4 4 14 | insert: { "_id" : { "$oid" : "4fbe85545df2ef2def485679" }, "num" : 3, "key" : "These \"quotes\" should be escaped too." 
} 
15 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 103 bytes id:5 5
16 | insert: { "_id" : { "$oid" : "4fbe85545df2ef2def48567a" }, "num" : 4, "key" : "\n\n newlines \\ \\ \" \" \t \n \"\"" }
17 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 108 bytes id:6 6
18 | update flags:0 q:{ "num" : 3 } o:{ "num" : 3, "key" : "This is \" \" \n \n \t \t a TEST" }
19 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 83 bytes id:7 7
20 | update flags:0 q:{ "num" : 1 } o:{ "num" : 1, "key" : "\"" }
21 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 88 bytes id:8 8
22 | delete flags: 0 q: { "key" : "These \"quotes\" should be escaped too." }
23 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 52 bytes id:9 9
24 | delete flags: 0 q: { "key" : "\"" }
25 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.col 65 bytes id:a 10
26 | delete flags: 0 q: { "key" : "\n\n\n\n\n\t\t\t\t\t\"\"\"\"" }
27 | 127.0.0.1:33082 -->> 127.0.0.1:27017 test_db.$cmd 86 bytes id:b 11
28 | query: { "count" : "col", "query" : {}, "fields" : {} } ntoreturn: -1 ntoskip: 0
29 | 127.0.0.1:27017 <<-- 127.0.0.1:33082 64 bytes id:11 17 - 11
30 | reply n:1 cursorId: 0
31 | { "n" : 3, "ok" : 1 }
32 | 
-------------------------------------------------------------------------------- /tests/search/unittest_bbsearch_ShardKeyIterator.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import logging
4 | import unittest
5 | 
6 | basedir = os.path.realpath(os.path.dirname(__file__))
7 | sys.path.append(os.path.join(basedir, "../../src"))
8 | 
9 | from search import bbsearch
10 | LOG = logging.getLogger(__name__)
11 | 
12 | class TestShardKeyIterator(unittest.TestCase):
13 |     def setUp(self):
14 |         pass
15 | 
16 |     def testIfGeneratedAllCombination(self):
17 |         expected = [["3", "2", "1"], ["3", "2"], ["3", "1"], ["2", "1"], ["3"], ["2"], ["1"]]
18 |         iterator = bbsearch.ShardKeyIterator(["3", "2", "1"], -1)
19 |         for combinations in expected:
20 |             result = iterator.next()
21 |             self.assertEqual(tuple(combinations), tuple(result))
22 |             if len(result) == 1 and result[0] == "1":
23 |                 break
24 | 
25 |     def testIfGeneratedLimitedCombination(self):
26 |         expected = [["3", "2"], ["3"], ["2"]]
27 |         iterator = bbsearch.ShardKeyIterator(["3", "2", "1"], 2)
28 |         for combinations in expected:
29 |             result = iterator.next()
30 |             self.assertEqual(tuple(combinations), tuple(result))
31 |             if len(result) == 1 and result[0] == "2":
32 |                 break
33 | 
34 | if __name__ == '__main__':
35 |     unittest.main()
36 | 
-------------------------------------------------------------------------------- /tests/search/unittest_findExpectedDesign.py: --------------------------------------------------------------------------------
1 | 
2 | import unittest
3 | import os
4 | import sys
5 | 
6 | basedir = os.path.realpath(os.path.dirname(__file__))
7 | sys.path.append(os.path.join(basedir, "../../src"))
8 | sys.path.append(os.path.join(basedir, "../../src/search"))
9 | sys.path.append(os.path.join(basedir, "../"))
10 | 
11 | from util import configutil
12 | from util import constants
13 | 
14 | from tpcctestcase import TPCCTestCase
15 | from ConfigParser import RawConfigParser
16 | from search.designer import Designer
17 | from search import Design
18 | from designcandidates import DesignCandidates
19 | from initialdesigner import InitialDesigner
20 | from lnsdesigner import LNSDesigner
21 | from costmodel import CostModel
22 | from tpcc import constants as tpccConstants
23 | from search import bbsearch
24 | 
25 | LNS_RUN_TIME = 2 * 60 * 60 # seconds
26 | 
27 | class FindExpectedDesign(TPCCTestCase):
28 |     """
29 |     Try to see if the existing cost model can generate the best design we
30 |     expect
31 |     """
32 |     def setUp(self):
33 |         TPCCTestCase.setUp(self)
34 | 
35 |         config = RawConfigParser()
36 |         configutil.setDefaultValues(config)
37 | 
38 |         self.designer = Designer(config, self.metadata_db, self.dataset_db)
39 |         self.dc = self.designer.generateDesignCandidates(self.collections, self.workload)
40 |         self.assertIsNotNone(self.dc)
41 | 
42 |         # Make sure that we don't have any invalid candidate keys
43 |         for col_name in self.collections.iterkeys():
44 |             for index_keys in self.dc.indexKeys[col_name]:
45 |                 for key in index_keys:
46 |                     assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
47 |                         "Unexpected candidate key '%s.%s'" % (col_name, key)
48 |         ## FOR
49 | 
50 |     ## DEF
51 | 
52 |     def outtestfindExpectedDesign(self):
53 |         """Perform the actual search for a design (prefixed with 'out' so the two-hour LNS run is not picked up by the test runner)"""
54 |         # Generate all the design candidates
55 |         # Instantiate cost model
56 |         cmConfig = {
57 |             'weight_network': 4,
58 |             'weight_disk': 1,
59 |             'weight_skew': 1,
60 |             'nodes': 10,
61 |             'max_memory': 1024,
62 |             'skew_intervals': 10,
63 |             'address_size': 64,
64 |             'window_size': 500
65 |         }
66 |         cm = CostModel(self.collections, self.workload, cmConfig)
67 | 
68 |         initialDesign = InitialDesigner(self.collections, self.workload, None).generate()
69 |         upper_bound = cm.overallCost(initialDesign)
70 |         print "init solution: ", initialDesign
71 |         print "init solution cost: ", upper_bound
72 |         collectionNames = [c for c in self.collections]
73 | 
74 |         dc = self.dc.getCandidates(collectionNames)
75 |         print "candidates: ", dc
76 |         ln = LNSDesigner(self.collections, \
77 |                          self.dc, \
78 |                          self.workload, \
79 |                          None, \
80 |                          cm, \
81 |                          initialDesign, \
82 |                          upper_bound, \
83 |                          LNS_RUN_TIME)
84 |         solution = ln.solve()
85 |         print "Best cost: ", ln.bestCost
86 |         print "solution: ", solution
87 |     ## DEF
88 | 
89 | if __name__ == '__main__':
90 |     unittest.main()
91 | ## MAIN
-------------------------------------------------------------------------------- /tests/search/unittest_initialdesigner.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os, sys
5 | import random
6 | import unittest
7 | import logging
8 | from pprint import pprint
9 | 
10 | basedir = os.path.realpath(os.path.dirname(__file__))
11 | sys.path.append(os.path.join(basedir, "../"))
12 | 
13 | # mongodb-d4
14 | from tpcctestcase import TPCCTestCase
15 | from search import Design
16 | from workload import Session
17 | import catalog
18 | from search import InitialDesigner
19 | from util import constants, configutil
20 | 
21 | class TestInitialDesigner(TPCCTestCase):
22 | 
23 |     def setUp(self):
24 |         TPCCTestCase.setUp(self)
25 |         self.config = configutil.makeDefaultConfig()
26 |         self.designer = InitialDesigner(self.collections, self.workload, self.config)
27 |         self.col_keys = self.designer.generateCollectionHistograms()
28 |         self.design = Design()
29 |         map(self.design.addCollection, self.col_keys.iterkeys())
30 |     ## DEF
31 | 
32 |     def testCheckForInvalidKeys(self):
33 |         d = self.designer.generate()
34 |         self.assertIsNotNone(d)
35 | 
36 |         # Make sure that we don't have any invalid keys
37 |         for col_name in d.getCollections():
38 |             for index_keys in d.getIndexes(col_name):
39 |                 for key in index_keys:
40 |                     assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
41 |                         "Invalid index key '%s.%s'" % (col_name, key)
42 |             ## FOR
43 |             for key in d.getShardKeys(col_name):
44 |                 assert not key.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX), \
45 |                     "Invalid shard key '%s.%s'" % (col_name, key)
46 |         ## FOR
47 |     ## DEF
48 | 
49 |     def testSelectShardingKeys(self):
50 |         # Select one set of keys at random and increase its occurrence
51 |         # in the histogram so that we will pick it
52 |         expected = { }
53 |         for col_name, h in self.col_keys.iteritems():
54 |             keys = random.choice(h.keys())
55 |             h.put(keys, 999999)
56 |             expected[col_name] = keys
57 | 
58 |         self.designer.__selectShardingKeys__(self.design, self.col_keys)
59 | 
60 |         # Then check to make sure it picked what we expected it to
61 |         for col_name in self.col_keys.iterkeys():
62 |             shard_keys = self.design.getShardKeys(col_name)
63 |             self.assertIsNotNone(shard_keys)
64 |             self.assertIsInstance(shard_keys, tuple)
65 |             self.assertEqual(expected[col_name], shard_keys)
66 |         #print self.design
67 |     ## DEF
68 | 
69 |     def testSelectIndexKeys(self):
70 |         # Select one set of keys at random and increase its occurrence
71 |         # in the histogram so that we will pick it
72 |         expected = { }
73 |         for col_name, h in self.col_keys.iteritems():
74 |             keys = random.choice(h.keys())
75 |             h.put(keys, 999999)
76 |             expected[col_name] = keys
77 | 
78 |         node_memory = self.config.get(configutil.SECT_CLUSTER, "node_memory")
79 |         self.designer.__selectIndexKeys__(self.design, self.col_keys, node_memory)
80 |         #print self.design
81 | 
82 |         # Then check to make sure it picked what we expected it to
83 |         for col_name in self.col_keys.iterkeys():
84 |             index_keys = self.design.getIndexKeys(col_name)
85 |             self.assertIsNotNone(index_keys)
86 |             self.assertIsInstance(index_keys, list)
87 |             # FIXME self.assertEquals(expected[col_name], shard_keys)
88 |     ## DEF
89 | 
90 | ## CLASS
91 | 
92 | if __name__ == '__main__':
93 |     unittest.main()
94 | ## MAIN
-------------------------------------------------------------------------------- /tests/search/unittest_lnsdesigner.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os, sys
5 | import unittest
6 | 
7 | basedir = os.path.realpath(os.path.dirname(__file__))
8 | sys.path.append(os.path.join(basedir, "../../src"))
9 | 
10 | from search.lnsdesigner import LNSDesigner
11 | 
12 | class TestSearchSpace(unittest.TestCase):
13 | 
14 |     def setUp(self):
15 |         self.collections = { }
16 |         for i in xrange(100):
17 |             self.collections["key" + str(i)] = i
18 | 
19 |     ## DEF
20 | 
21 |     def testRandomCollectionGenerator(self):
22 |         """
23 |         Check whether RandomCollectionGenerator can generate random collections
24 |         """
25 |         rcg = LNSDesigner.RandomCollectionGenerator(self.collections)
26 |         map_round_to_set = { }
27 |         for j in xrange(3):
28 |             map_round_to_set[j] = rcg.getRandomCollections(3)
29 |         ## FOR
30 | 
31 |         value_list = [val for val in map_round_to_set.itervalues()]
32 |         # three random 3-of-100 draws should virtually never repeat
33 |         self.assertNotEqual(sorted(value_list[0]), sorted(value_list[1]))
34 |         self.assertNotEqual(sorted(value_list[0]), sorted(value_list[2]))
35 |         self.assertNotEqual(sorted(value_list[1]), sorted(value_list[2]))
36 |     ## DEF
37 | ## CLASS
38 | 
39 | if __name__ == '__main__':
40 |     unittest.main()
41 | ## MAIN
-------------------------------------------------------------------------------- /tests/search/unittest_utilmethods.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import unittest
5 | import itertools
6 | 
from search import * 7 | 8 | class TestUtilMethods (unittest.TestCase): 9 | 10 | def setUp(self): 11 | pass 12 | 13 | def testBuildLoadingList(self) : 14 | # Denormalization Tree 15 | # A 16 | # / \ 17 | # B C 18 | # | 19 | # D 20 | expected = [ 21 | ['A'], ['B', 'C'], ['D'] 22 | ] 23 | 24 | d = design.Design() 25 | d.addCollections(itertools.chain(*expected)) 26 | d.setDenormalizationParent('B', 'A') 27 | d.setDenormalizationParent('C', 'A') 28 | d.setDenormalizationParent('D', 'B') 29 | print d 30 | 31 | loadOrder = utilmethods.buildLoadingList(d) 32 | print loadOrder 33 | self.assertNotEqual(loadOrder, None) 34 | 35 | # Go through each round and pop out collections 36 | # as we simulate them being loaded 37 | for loadRound in expected: 38 | while len(loadRound) > 0: 39 | collection = loadOrder.pop(0) 40 | self.assertNotEqual(collection, None) 41 | self.assertTrue(collection in loadRound) 42 | loadRound.remove(collection) 43 | ## WHILE 44 | ## FOR 45 | 46 | # Make sure that we processed all of our collections 47 | self.assertEqual(0, len(loadOrder)) 48 | 49 | ## DEF 50 | 51 | ## CLASS 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | ## MAIN -------------------------------------------------------------------------------- /tests/util/unittest_configutil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../src")) 7 | 8 | import unittest 9 | from pprint import pprint, pformat 10 | 11 | from util import configutil 12 | 13 | class TestConfigUtil(unittest.TestCase): 14 | 15 | def setUp(self): 16 | pass 17 | 18 | def testMakeDefaultConfig(self): 19 | c = configutil.makeDefaultConfig() 20 | self.assertIsNotNone(c) 21 | for sect in configutil.ALL_SECTIONS: 22 | self.assertIn(sect, c.sections()) 23 | for key, desc, default in configutil.DEFAULT_CONFIG[sect]: 24 | self.assertIn(key, c.options(sect)) 25 | self.assertEqual(default, c.get(sect, key)) 26 | ## DEF 27 | 28 | ## CLASS 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | ## MAIN -------------------------------------------------------------------------------- /tests/util/unittest_histogram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | import string 6 | import random 7 | import unittest 8 | from pprint import pprint, pformat 9 | 10 | basedir = os.path.realpath(os.path.dirname(__file__)) 11 | sys.path.append(os.path.join(basedir, "../../src")) 12 | from util.histogram import Histogram 13 | 14 | class TestHistogram(unittest.TestCase): 15 | 16 | def setUp(self): 17 | pass 18 | 19 | def testPickle(self): 20 | h = Histogram() 21 | letters = [ x for x in string.letters ] + ["-"] 22 | 23 | for i in xrange(0, 100): 24 | key = "" 25 | for x in xrange(0, 10): 26 | key += random.choice(letters) 27 | assert len(key) > 0 28 | 29 | h.put(key, delta=random.randint(1, 10)) 30 | assert h[key] > 0 31 | ## FOR 32 | 33 | # Serialize 34 | import pickle 35 | p = pickle.dumps(h, -1) 36 | assert p 37 | 38 | # Deserialize 39 | clone = pickle.loads(p) 40 | assert clone 41 | 42 | for key in h.keys(): 43 | self.assertEquals(h[key], clone[key]) 44 | ## FOR 45 | self.assertEquals(h.getSampleCount(), clone.getSampleCount()) 46 | self.assertEquals(sorted(h.getMinCountKeys()), sorted(clone.getMinCountKeys())) 47 | ## DEF 
48 | 49 | ## CLASS 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | ## MAIN -------------------------------------------------------------------------------- /tests/util/unittest_mathutil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../src")) 7 | 8 | import unittest 9 | from pprint import pprint, pformat 10 | 11 | from util import mathutil 12 | 13 | class TestMathUtil(unittest.TestCase): 14 | 15 | def setUp(self): 16 | pass 17 | 18 | def testPercentile(self): 19 | data = [ 20 | (range(10), 0.25, 2.25), 21 | (range(10), 0.75, 6.75), 22 | (range(10), 0.50, 4.5), 23 | (range(11), 0.50, 5) 24 | ] 25 | for values, p, expected in data: 26 | actual = mathutil.percentile(values, p) 27 | self.assertEqual(expected, actual) 28 | ## DEF 29 | 30 | ## CLASS 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | ## MAIN -------------------------------------------------------------------------------- /tests/util/unittest_utilmethods.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../src")) 7 | 8 | import unittest 9 | from pprint import pprint, pformat 10 | 11 | import util 12 | 13 | class TestUtilMethods (unittest.TestCase): 14 | 15 | def setUp(self): 16 | pass 17 | 18 | def getAllKeys(self, d, keys=None): 19 | if keys == None: keys = [] 20 | for k, v in d.iteritems(): 21 | if not k in keys: keys.append(k) 22 | if type(v) == dict: 23 | self.getAllKeys(v, keys) 24 | return keys 25 | ## DEF 26 | 27 | def testEscapeFieldNames(self): 28 | content = [ 29 | {'$query': {'_id': '1cba73b8a555ba442a3630ccf735dffd/14'}}, 30 | {'$query': {'_id': {'$in': []}}}, 31 | {'count': '107f3bf172abf9dae6458f1dbb0d4ad6/11', 32 | 'query': {'md5': {'$in': ['c3117f341b734d3ce6e71608480de82d/34']}}}, 33 | {'$query': {'foo.bar': 1234}}, 34 | ] 35 | 36 | for i in xrange(0, len(content)): 37 | orig = content[i] 38 | 39 | escaped = util.escapeFieldNames(content[i]) 40 | self.assertNotEqual(escaped, None) 41 | keys = self.getAllKeys(escaped) 42 | for k in keys: 43 | self.assertFalse(k.startswith('$'), pformat(escaped)) 44 | self.assertEqual(-1, k.find(".")) 45 | print pformat(escaped) 46 | ## FOR 47 | ## DEF 48 | 49 | ## CLASS 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | ## MAIN -------------------------------------------------------------------------------- /tests/workload/unittest_ophasher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os, sys 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../src")) 7 | 8 | import unittest 9 | 10 | from util import constants 11 | from workload.ophasher import OpHasher 12 | 13 | class TestOpHasher (unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.hasher = OpHasher() 17 | pass 18 | 19 | def genQuery(self, query): 20 | return [ {constants.REPLACE_KEY_DOLLAR_PREFIX + "query": query} ] 21 | 22 | def genUpdate(self, query, update): 23 | return [ query, update ] 24 | 25 | def testHashQuery(self): 26 | op = { 27 | "collection": u'ABC', 28 | "query_content": 
self.genQuery({"a": 2}), 29 | "type": "$query", 30 | } 31 | h0 = self.hasher.hash(op) 32 | self.assertNotEqual(h0, None) 33 | 34 | op["query_content"] = self.genQuery({"a": 3}) 35 | h1 = self.hasher.hash(op) 36 | self.assertEqual(h0, h1) 37 | 38 | op["query_content"] = self.genQuery({"a": {"$all": [2, 3]}}) 39 | h2 = self.hasher.hash(op) 40 | self.assertNotEqual(h0, h2) 41 | ## DEF 42 | 43 | def testComplexQuery(self): 44 | content = {u'_id': u'7df2cdb0268fe84ad602e228d75f4812/108', 45 | u'cid': {u'#oid': u'310794ef49b9b02c7f29b1ff64c6f7b3/26'}, 46 | u'd': u'b34918b94d030d5b288053f08258f1c9/10', 47 | u'g': u'5e3f1e67d663a535fe0ceeab07dd0e12/12', 48 | u'hid': u'd259f04f68e37fdebff7c55b67a04fb7/34', 49 | u'hy': {u'0': {u'n': 0, u't': 0}, 50 | u'1': {u'n': 0, u't': 0}, 51 | u'10': {u'n': 0, u't': 0}, 52 | u'11': {u'n': 0, u't': 0}, 53 | u'12': {u'n': 0, u't': 0}, 54 | u'13': {u'n': 0, u't': 0}, 55 | u'14': {u'n': 0, u't': 0}, 56 | u'15': {u'n': 0, u't': 0}, 57 | u'16': {u'n': 0, u't': 0}, 58 | u'17': {u'n': 0, u't': 0}, 59 | u'18': {u'n': 0, u't': 0}, 60 | u'19': {u'n': 0, u't': 0}, 61 | u'2': {u'n': 0, u't': 0}, 62 | u'20': {u'n': 0, u't': 0}, 63 | u'21': {u'n': 0, u't': 0}, 64 | u'22': {u'n': 0, u't': 0}, 65 | u'23': {u'n': 0, u't': 0}, 66 | u'3': {u'n': 0, u't': 0}, 67 | u'4': {u'n': 0, u't': 0}, 68 | u'5': {u'n': 0, u't': 0}, 69 | u'6': {u'n': 0, u't': 0}, 70 | u'7': {u'n': 0, u't': 0}, 71 | u'8': {u'n': 0, u't': 0}, 72 | u'9': {u'n': 0, u't': 0}}, 73 | u'i': u'22922d9f495e1502e3af3dac1a8a4a8b/22'} 74 | op = { 75 | "collection": u'ABC', 76 | "query_content": self.genQuery(content), 77 | "type": "$query", 78 | } 79 | h0 = self.hasher.hash(op) 80 | self.assertNotEqual(h0, None) 81 | ## DEF 82 | 83 | def testHashUpdate(self): 84 | whereClause = {"u_id": 123, "i_id": 456} 85 | updateClause = {"rating": 999} 86 | 87 | op = { 88 | "collection": u'ABC', 89 | "query_content": self.genUpdate(whereClause, updateClause), 90 | "type": "$update", 91 | } 92 | h0 = self.hasher.hash(op) 93 | self.assertNotEqual(h0, None) 94 | 95 | newWhere = dict(whereClause.items() + [("XXX", 123)]) 96 | op["query_content"] = self.genUpdate(newWhere, updateClause) 97 | h1 = self.hasher.hash(op) 98 | self.assertNotEqual(h0, h1) 99 | 100 | newUpdate = dict(updateClause.items() + [("XXX", 123)]) 101 | op["query_content"] = self.genUpdate(whereClause, newUpdate) 102 | h2 = self.hasher.hash(op) 103 | self.assertNotEqual(h0, h2) 104 | ## DEF 105 | 106 | ## CLASS 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | ## MAIN -------------------------------------------------------------------------------- /tests/workload/unittest_utilmethods.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, sys 4 | 5 | basedir = os.path.realpath(os.path.dirname(__file__)) 6 | sys.path.append(os.path.join(basedir, "../../src")) 7 | 8 | import unittest 9 | 10 | import workload 11 | from util import constants 12 | 13 | class TestUtilMethods(unittest.TestCase): 14 | 15 | def testGetReferencedFields(self): 16 | op = { 17 | 'collection': 'blah', 18 | 'predicates': { }, 19 | 'query_aggregate': True, 20 | 'query_content': [ ], 21 | 'resp_content': [{'n': 16, 'ok': 1}], 22 | 'type': constants.OP_TYPE_QUERY, 23 | } 24 | expected = set() 25 | for i in xrange(4): 26 | keyName = 'key%02d' % i 27 | for ii in xrange(10): 28 | op['query_content'].append({"#query": {keyName: {"#gt": i*ii}}}) 29 | expected.add(keyName) 30 | op['predicates'][keyName] = 
constants.PRED_TYPE_RANGE
31 |         expected = sorted(expected)
32 |         #print "EXPECTED:", expected
33 | 
34 |         fields = workload.getReferencedFields(op)
35 |         #print "FIELDS:", fields
36 |         self.assertIsNotNone(fields)
37 |         self.assertIsInstance(fields, tuple)
38 |         self.assertEqual(len(expected), len(fields))
39 | 
40 |         for i in xrange(len(expected)):
41 |             self.assertEqual(expected[i], fields[i])
42 |         ## FOR
43 |     ## DEF
44 | 
45 |     def testIsOpRegex(self):
46 |         op = {
47 |             'collection': 'blah',
48 |             'predicates': {'_id': constants.PRED_TYPE_REGEX},
49 |             'query_aggregate': True,
50 |             'query_content': [
51 |                 {'#query': {'_id': {'#options': 'XXXXXXX',
52 |                                     '#regex': 'YYYYY'}},
53 |                  'count': 'site.songs',
54 |                  'fields': None}],
55 |             'query_group': None,
56 |             'query_hash': 3563430808431869716L,
57 |             'query_id': 579750519L,
58 |             'query_limit': -1,
59 |             'query_offset': 0,
60 |             'query_size': 125,
61 |             'query_time': 1338410992.894204,
62 |             'resp_content': [{'n': 16, 'ok': 1}],
63 |             'resp_id': 108641633L,
64 |             'resp_size': 64,
65 |             'resp_time': 1338410992.911907,
66 |             'type': constants.OP_TYPE_QUERY,
67 |             'update_multi': None,
68 |             'update_upsert': None
69 |         }
70 | 
71 |         ret = workload.isOpRegex(op)
72 |         self.assertTrue(ret)
73 | 
74 |     ## DEF
75 | 
76 | 
77 | ## CLASS
78 | 
79 | if __name__ == '__main__':
80 |     unittest.main()
81 | ## MAIN
-------------------------------------------------------------------------------- /tests/workload/unittest_workloadcombinerwithtpcc.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os, sys
4 | basedir = os.path.realpath(os.path.dirname(__file__))
5 | sys.path.append(os.path.join(basedir, "../search"))
6 | sys.path.append(os.path.join(basedir, ".."))
7 | sys.path.append(os.path.join(basedir, "../../src"))
8 | 
9 | import unittest
10 | from workload.workloadcombiner import WorkloadCombiner
11 | from tpcctestcase import TPCCTestCase as CostModelTestCase
12 | from costmodel.disk import DiskCostComponent
13 | from search import Design
14 | from tpcc import constants as tpccConstants
15 | 
16 | class TestWorkloadCombiner(CostModelTestCase):
17 | 
18 |     def setUp(self):
19 |         CostModelTestCase.setUp(self)
20 |         self.cm = DiskCostComponent(self.state)
21 |         self.col_names = [ x for x in self.collections.iterkeys()]
22 |     ## DEF
23 | 
24 |     def testQueriesCombination(self):
25 |         """Test if the total number of queries is reduced"""
26 |         original_number_of_queries = 0
27 |         for sess in self.workload:
28 |             for op in sess["operations"]:
29 |                 original_number_of_queries += 1
30 | 
31 |         print "original number of queries: " + str(original_number_of_queries)
32 | 
33 |         # Initialize a combiner
34 |         combiner = WorkloadCombiner(self.col_names, self.workload)
35 | 
36 |         # initialize a design with denormalization
37 |         d = Design()
38 |         for col_name in self.collections.iterkeys():
39 |             d.addCollection(col_name)
40 | 
41 |         d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
42 | 
43 |         combinedWorkload = combiner.process(d)
44 | 
45 |         number_of_queries_from_combined_workload = 0
46 |         for sess in combinedWorkload:
47 |             for op in sess["operations"]:
48 |                 number_of_queries_from_combined_workload += 1
49 | 
50 |         print "number of queries after query combination: " + str(number_of_queries_from_combined_workload)
51 | 
52 |         self.assertGreater(original_number_of_queries, number_of_queries_from_combined_workload)
53 | 
54 |     def testDiskCostNotChangedAfterQueryCombination(self):
55 |         """Disk cost should not be changed after query combination"""
56 |         # baseline design: plain collections, no denormalization
57 |         d = Design()
58 |         for col_name in self.collections.iterkeys():
59 |             d.addCollection(col_name)
60 | 
61 |         cost0 = self.cm.getCost(d)
62 |         print "cost0 " + str(cost0)
63 | 
64 |         # Initialize a combiner
65 |         combiner = WorkloadCombiner(self.col_names, self.workload)
66 | 
67 |         # initialize a design with denormalization
68 |         d = Design()
69 | 
70 |         for col_name in self.collections.iterkeys():
71 |             d.addCollection(col_name)
72 |             self.state.invalidateCache(col_name)
73 | 
74 |         d.setDenormalizationParent(tpccConstants.TABLENAME_ORDER_LINE, tpccConstants.TABLENAME_ORDERS)
75 | 
76 |         combinedWorkload = combiner.process(d)
77 |         self.state.updateWorkload(combinedWorkload)
78 | 
79 |         self.cm.reset()
80 |         self.cm.state.reset()
81 |         cost1 = self.cm.getCost(d)
82 | 
83 |         print "cost1 " + str(cost1)
84 | 
85 |         self.assertEqual(cost0, cost1)
86 | ## CLASS
87 | 
88 | if __name__ == '__main__':
89 |     unittest.main()
90 | ## MAIN
91 | 
--------------------------------------------------------------------------------
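The equality assertion in testDiskCostNotChangedAfterQueryCombination captures the intuition behind this pairing of tests: combining a parent query with its embedded child changes how many messages the workload sends, not how many document bytes come off disk. A toy model of that split, with hypothetical document sizes and deliberately simplified cost definitions (not the project's actual cost formulas):

    # Toy cost split: network cost tracks message count, disk cost tracks
    # bytes fetched.
    ORDERS_DOC = 512        # hypothetical document sizes, in bytes
    ORDER_LINE_DOC = 256

    # Before denormalization: one query per collection, two fetches
    before_msgs, before_bytes = 2, ORDERS_DOC + ORDER_LINE_DOC

    # After embedding ORDER_LINE in ORDERS: one query fetches the merged
    # document, whose size is the sum of its parts
    after_msgs, after_bytes = 1, ORDERS_DOC + ORDER_LINE_DOC

    print "messages:", before_msgs, "->", after_msgs       # network cost drops
    print "bytes unchanged:", before_bytes == after_bytes  # disk cost holds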