├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ └── index.rst ├── iceis2012.pdf ├── images ├── TPC-H_Datamodel.png └── tpch_process.png ├── query_root ├── perf_query_template │ ├── 1.sql │ ├── 10.sql │ ├── 11.sql │ ├── 12.sql │ ├── 13.sql │ ├── 14.sql │ ├── 15.sql │ ├── 16.sql │ ├── 17.sql │ ├── 18.sql │ ├── 19.sql │ ├── 2.sql │ ├── 20.sql │ ├── 21.sql │ ├── 22.sql │ ├── 3.sql │ ├── 4.sql │ ├── 5.sql │ ├── 6.sql │ ├── 7.sql │ ├── 8.sql │ └── 9.sql └── prep_query │ ├── create_idx.sql │ └── create_tbl.sql ├── requirements.txt ├── tests ├── test_common.py ├── test_load_after.py ├── test_prepare_after.py ├── test_prepare_before.py ├── test_query_after.py └── test_tpch_pgsql.py ├── tpch4pgsql ├── load.py ├── postgresqldb.py ├── prepare.py ├── query.py └── result.py └── tpch_pgsql.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | #Ignore data files 55 | *.tbl 56 | *.tbl.csv 57 | *.csv 58 | 59 | perf_query_gen/ 60 | tpch-dbgen/ 61 | tags 62 | results/ 63 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Science-Platform/tpch-pgsql/4a052f13476367f6890b3b59af9611c9253c9cb7/.gitmodules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.4" 5 | 6 | services: 7 | - postgresql 8 | 9 | addons: 10 | postgresql: "9.6" 11 | 12 | env: 13 | - TPCH_PASSWORD=dbf8pXCrzZ5cDeqv 14 | 15 | before_install: 16 | - python --version 17 | - pip --version 18 | - pg_config --version 19 | - psql --version 20 | 21 | - wget -q https://github.com/electrum/tpch-dbgen/archive/32f1c1b92d1664dba542e927d23d86ffa57aa253.zip -O tpch-dbgen.zip 22 | - unzip -q tpch-dbgen.zip 23 | - mv tpch-dbgen-32f1c1b92d1664dba542e927d23d86ffa57aa253 tpch-dbgen 24 | - rm tpch-dbgen.zip 25 | 26 | install: 27 | - pip install -r requirements.txt 28 | 29 | before_script: 30 | - psql --command="CREATE DATABASE tpchdb;" -U postgres 31 | - psql --command="CREATE USER tpch WITH PASSWORD '${TPCH_PASSWORD}';" -U postgres 32 | - psql --command="GRANT ALL PRIVILEGES ON DATABASE tpchdb TO tpch;" -U postgres 33 | 34 | script: 35 | - ./tpch_pgsql.py || export EXIT_CODE=$? && echo $EXIT_CODE && if [ "$EXIT_CODE" == "2" ]; then true; else false; fi 36 | 37 | - ./tpch_pgsql.py --help 38 | 39 | - export PYTHONPATH=$(pwd) 40 | - echo $PYTHONPATH 41 | - $(cd tests/ && python -m unittest test_tpch_pgsql.py --verbose && cd ..) 42 | 43 | - $(cd tests/ && python -m unittest test_prepare_before.py --verbose && cd ..) 
44 | - ./tpch_pgsql.py --scale 0.01 prepare 45 | - $(cd tests/ && python -m unittest test_prepare_after.py --verbose && cd ..) 46 | 47 | - ./tpch_pgsql.py --dbname tpchdb --username tpch --password $TPCH_PASSWORD load 48 | - $(cd tests/ && python -m unittest test_load_after.py --verbose && cd ..) 49 | 50 | - ./tpch_pgsql.py --dbname tpchdb --username tpch --password $TPCH_PASSWORD query 51 | - $(cd tests/ && python -m unittest test_query_after.py --verbose && cd ..) 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # $Id: makefile.suite,v 1.25 2009/10/22 19:10:21 jms Exp $ 3 | # 4 | # Revision History 5 | # =================== 6 | # $Log: makefile.suite,v $ 7 | # Revision 1.25 2009/10/22 19:10:21 jms 8 | # update revision to 2.9.0, disable bug55 fix 9 | # 10 | # Revision 1.24 2009/10/22 19:06:10 jms 11 | # update revision to 2.9.0, disable bug55 fix 12 | # 13 | # Revision 1.23 2009/06/28 14:01:08 jms 14 | # bug fix for DOP 15 | # 16 | # Revision 1.22 2008/09/15 16:37:46 jms 17 | # release 2.8.0 makefile.suite 18 | # 19 | # Revision 1.21 2008/03/21 18:26:54 jms 20 | # recursive zip for reference data, chmod for update_release 21 | # 22 | # Revision 1.20 2008/03/21 17:38:39 jms 23 | # changes for 2.6.3 24 | # 25 | # Revision 1.19 2007/03/08 20:36:03 jms 26 | # update release number to 2.6.1 27 | # 28 | # Revision 1.18 2007/02/06 18:15:56 jms 29 | # remove update release from general target 30 | # 31 | # Revision 1.17 2007/01/25 19:35:50 jms 32 | # add sln file used by VS2005 33 | # 34 | # Revision 1.16 2007/01/05 20:05:41 jms 35 | # update release number 36 | # 37 | # Revision 1.15 2006/09/07 17:25:57 jms 38 | # correct dss.ddl 39 | # 40 | # Revision 1.14 2006/08/01 17:21:22 jms 41 | # fix bad merge 42 | # 43 | # Revision 1.13 2006/08/01 16:55:44 jms 44 | # move to 2.4.1 45 | # 46 | # Revision 1.12 2006/06/29 20:46:17 jms 47 | # 2.4.0 changes from Meikel 48 | # 49 | # Revision 1.10 2006/05/25 22:30:44 jms 50 | # qgen porting for 32b/64b 51 | # 52 | # Revision 1.9 2006/04/26 23:17:09 jms 53 | # checking release.h prior to release build 54 | # 55 | # Revision 1.8 2006/04/26 23:03:00 jms 56 | # release 2.3.4-1 57 | # 58 | # Revision 1.7 2006/04/12 18:13:58 jms 59 | # release 2.3.3 60 | # 61 | # Revision 1.6 2006/03/09 18:59:19 jms 62 | # move to version 2.3.2 63 | # 64 | # Revision 1.5 2006/01/28 23:54:32 jms 65 | # add reference data to release 66 | # 67 | # Revision 1.4 2005/10/28 03:00:32 jms 68 | # fix release target 69 | # 70 | # Revision 1.3 2005/10/28 02:54:14 jms 71 | # increment build count with each release creation 72 | # 73 | # Revision 1.2 2005/01/03 20:08:58 jms 74 | # change line terminations 75 | # 76 | # Revision 1.1.1.1 2004/11/24 23:31:47 jms 77 | # re-establish external server 78 | # 79 | # Revision 1.5 2004/03/26 20:39:23 jms 80 | # add tpch tag to release files 81 | # 82 | # Revision 1.4 2004/03/16 14:45:57 jms 83 | # correct release target in makefile 84 | # 85 | # Revision 1.3 2004/03/02 20:49:01 jms 86 | # simplify distributions, add Windows IDE files 87 | # releases should use make release from now on 88 | # 89 | # Revision 1.2 2004/02/18 14:05:53 jms 90 | # porting changes for LINUX and 64 bit RNG 91 | # 92 | # Revision 1.1.1.1 2003/04/03 18:54:21 jms 93 | # recreation after CVS crash 94 | # 95 | # Revision 1.1.1.1 2003/04/03 18:54:21 jms 96 | # initial checkin 97 | # 98 | # 99 | # 100 | ################ 101 | ## CHANGE NAME OF ANSI COMPILER HERE 102 | ################ 103 | CC=gcc 104 | # Current values for DATABASE are: INFORMIX, DB2, TDAT (Teradata) 105 | # SQLSERVER, SYBASE, ORACLE 106 | # Current values for MACHINE are: ATT, DOS, HP, IBM, ICL, MVS, 107 | # SGI, SUN, U2200, VMS, LINUX, WIN32 108 | # Current values for WORKLOAD are: TPCH 109 | DATABASE=ORACLE 110 | MACHINE=LINUX 111 | WORKLOAD=TPCH 112 | # 113 | CFLAGS = -g -DDBNAME=\"dss\" -D$(MACHINE) -D$(DATABASE) -D$(WORKLOAD) 
-DRNG_TEST -D_FILE_OFFSET_BITS=64 114 | LDFLAGS = -O 115 | # The OBJ,EXE and LIB macros will need to be changed for compilation under 116 | # Windows NT 117 | OBJ = .o 118 | EXE = 119 | LIBS = -lm 120 | # 121 | # NO CHANGES SHOULD BE NECESSARY BELOW THIS LINE 122 | ############### 123 | VERSION=2 124 | RELEASE=13 125 | PATCH=0 126 | BUILD=`grep BUILD release.h | cut -f3 -d' '` 127 | NEW_BUILD=`expr ${BUILD} + 1` 128 | TREE_ROOT=/tmp/tree 129 | # 130 | PROG1 = dbgen$(EXE) 131 | PROG2 = qgen$(EXE) 132 | PROGS = $(PROG1) $(PROG2) 133 | # 134 | HDR1 = dss.h rnd.h config.h dsstypes.h shared.h bcd2.h rng64.h release.h 135 | HDR2 = tpcd.h permute.h 136 | HDR = $(HDR1) $(HDR2) 137 | # 138 | SRC1 = build.c driver.c bm_utils.c rnd.c print.c load_stub.c bcd2.c \ 139 | speed_seed.c text.c permute.c rng64.c 140 | SRC2 = qgen.c varsub.c 141 | SRC = $(SRC1) $(SRC2) 142 | # 143 | OBJ1 = build$(OBJ) driver$(OBJ) bm_utils$(OBJ) rnd$(OBJ) print$(OBJ) \ 144 | load_stub$(OBJ) bcd2$(OBJ) speed_seed$(OBJ) text$(OBJ) permute$(OBJ) \ 145 | rng64$(OBJ) 146 | OBJ2 = build$(OBJ) bm_utils$(OBJ) qgen$(OBJ) rnd$(OBJ) varsub$(OBJ) \ 147 | text$(OBJ) bcd2$(OBJ) permute$(OBJ) speed_seed$(OBJ) rng64$(OBJ) 148 | OBJS = $(OBJ1) $(OBJ2) 149 | # 150 | SETS = dists.dss 151 | DOC=README HISTORY PORTING.NOTES BUGS 152 | DDL = dss.ddl dss.ri 153 | WINDOWS_IDE = tpch.dsw dbgen.dsp tpch.sln tpch.vcproj qgen.vcproj 154 | OTHER=makefile.suite $(SETS) $(DDL) $(WINDOWS_IDE) 155 | # case is *important* in TEST_RES 156 | TEST_RES = O.res L.res c.res s.res P.res S.res n.res r.res 157 | # 158 | DBGENSRC=$(SRC1) $(HDR1) $(OTHER) $(DOC) $(SRC2) $(HDR2) $(SRC3) 159 | FQD=queries/1.sql queries/2.sql queries/3.sql queries/4.sql queries/5.sql queries/6.sql queries/7.sql \ 160 | queries/8.sql queries/9.sql queries/10.sql queries/11.sql queries/12.sql queries/13.sql \ 161 | queries/14.sql queries/15.sql queries/16.sql queries/17.sql queries/18.sql queries/19.sql queries/20.sql \ 162 | queries/21.sql queries/22.sql 163 | VARIANTS= variants/8a.sql variants/12a.sql variants/13a.sql variants/14a.sql variants/15a.sql 164 | ANS = answers/q1.out answers/q2.out answers/q3.out answers/q4.out answers/q5.out answers/q6.out answers/q7.out answers/q8.out \ 165 | answers/q9.out answers/q10.out answers/q11.out answers/q12.out answers/q13.out answers/q14.out answers/q15.out \ 166 | answers/q16.out answers/q17.out answers/q18.out answers/q19.out answers/q20.out answers/q21.out answers/q22.out 167 | QSRC = $(FQD) $(VARIANTS) $(ANS) 168 | TREE_DOC=tree.readme tree.changes appendix.readme appendix.version answers.readme queries.readme variants.readme 169 | REFERENCE=reference/[tcR]* 170 | REFERENCE_DATA=referenceData/[13]* 171 | SCRIPTS= check55.sh column_split.sh dop.sh gen_tasks.sh last_row.sh load_balance.sh new55.sh check_dirs.sh 172 | ALLSRC=$(DBGENSRC) $(REFERENCE) $(QSRC) $(SCRIPTS) 173 | JUNK = 174 | # 175 | all: $(PROGS) 176 | $(PROG1): $(OBJ1) $(SETS) 177 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ1) $(LIBS) 178 | $(PROG2): permute.h $(OBJ2) 179 | $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJ2) $(LIBS) 180 | clean: 181 | rm -f $(PROGS) $(OBJS) $(JUNK) 182 | lint: 183 | lint $(CFLAGS) -u -x -wO -Ma -p $(SRC1) 184 | lint $(CFLAGS) -u -x -wO -Ma -p $(SRC2) 185 | 186 | tar: $(ALLSRC) 187 | tar cvhf - $(ALLSRC) --exclude .svn\*/\* |gzip - > tpch_${VERSION}_${RELEASE}_${PATCH}.tar.gz 188 | tar cvhf - $(REFERENCE_DATA) --exclude .svn\*/\* |gzip - > reference_${VERSION}_${RELEASE}_${PATCH}.tar.gz 189 | zip: $(ALLSRC) 190 | zip -r tpch_${VERSION}_${RELEASE}_${PATCH}.zip 
$(ALLSRC) -x *.svn* 191 | zip -r reference_${VERSION}_${RELEASE}_${PATCH}.zip $(REFERENCE_DATA) -x *.svn* 192 | release: 193 | make -f makefile.suite tar 194 | make -f makefile.suite zip 195 | ( cd tests; sh test_list.sh `date '+%Y%m%d'` ) 196 | rnd$(OBJ): rnd.h 197 | $(OBJ1): $(HDR1) 198 | $(OBJ2): dss.h tpcd.h config.h rng64.h release.h 199 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tpch-pgsql 2 | [![Build status](https://travis-ci.org/Data-Science-Platform/tpch-pgsql.svg?branch=master)](https://travis-ci.org/Data-Science-Platform/tpch-pgsql) 3 | 4 | Implements the [TPC-H benchmark](http://www.tpc.org/tpch/) for Postgres 5 | 6 | ### Requirements 7 | * The benchmark requires TPC-H dbgen: 8 | ``` 9 | wget -q https://github.com/electrum/tpch-dbgen/archive/32f1c1b92d1664dba542e927d23d86ffa57aa253.zip -O tpch-dbgen.zip 10 | unzip -q tpch-dbgen.zip && mv tpch-dbgen-32f1c1b92d1664dba542e927d23d86ffa57aa253 tpch-dbgen && rm tpch-dbgen.zip 11 | ``` 12 | * gcc 13 | 14 | ``` 15 | gcc --version 16 | ``` 17 | 18 | * python3 19 | 20 | ``` 21 | python3 --version 22 | ``` 23 | 24 | * python requirements 25 | 26 | ``` 27 | pip3 install -r requirements.txt 28 | ``` 29 | 30 | * a running instance of Postgres, e.g. if running locally, the following command should not fail 31 | 32 | ``` 33 | pg_config --version 34 | ``` 35 | 36 | * if you want to run the database locally, please find below the commands for Ubuntu 14.04 37 | ``` 38 | sudo apt-get install -y postgresql postgresql-contrib 39 | 40 | sudo -u postgres createuser tpch 41 | sudo -u postgres createdb tpchdb 42 | 43 | sudo -u postgres psql << PSQL 44 | ALTER USER tpch WITH ENCRYPTED PASSWORD '********'; 45 | GRANT ALL PRIVILEGES ON DATABASE tpchdb TO tpch; 46 | \l 47 | \q 48 | PSQL 49 | ``` 50 | These can be adjusted for your OS easily. 51 | 52 | In case you are using a remote PostgreSQL database, make sure your connection is working and 53 | you have a valid username and password. 54 | 55 | ``` 56 | $ psql -h <hostname> -p 5432 -d <dbname> -U <username> -W 57 | Password for user tpch: 58 | psql (9.3.23) 59 | SSL connection (cipher: DHE-RSA-AES256-GCM-SHA384, bits: 256) 60 | Type "help" for help. 61 | 62 | tpchdb=> \q 63 | ``` 64 | Also make sure that you have full rights on the target database (GRANT ALL PRIVILEGES). 65 | 66 | ### Usage 67 | A single Python script, `tpch_pgsql.py`, implements all phases of the benchmark. 68 | 69 | ``` 70 | usage: tpch_pgsql.py [-h] [-H HOST] [-p PORT] [-U USERNAME] [-W [PASSWORD]] 71 | [-d DBNAME] [-i DATA_DIR] [-q QUERY_ROOT] [-g DBGEN_DIR] 72 | [-s SCALE] [-n NUM_STREAMS] [-b] [-r] 73 | {prepare,load,query} 74 | 75 | tpch_pgsql 76 | 77 | positional arguments: 78 | {prepare,load,query} Phase of TPC-H benchmark to run.
79 | 80 | optional arguments: 81 | -h, --help show this help message and exit 82 | -H HOST, --host HOST Address of host on which PostgreSQL instance runs; 83 | default is localhost 84 | -p PORT, --port PORT Port on which PostgreSQL instance runs; default is 85 | 5432 86 | -U USERNAME, --username USERNAME 87 | User for the PostgreSQL instance; default is postgres 88 | -W [PASSWORD], --password [PASSWORD] 89 | Password for the PostgreSQL instance; default is 90 | test123 91 | -d DBNAME, --dbname DBNAME 92 | Name of the database; default is tpch 93 | -i DATA_DIR, --data-dir DATA_DIR 94 | Directory for generated data; default is ./data 95 | -q QUERY_ROOT, --query-root QUERY_ROOT 96 | Directory for query files; default is ./query_root 97 | -g DBGEN_DIR, --dbgen-dir DBGEN_DIR 98 | Directory containing tpch dbgen source; default is 99 | ./tpch-dbgen 100 | -s SCALE, --scale SCALE 101 | Size of the data generated, scale factor; default is 102 | 1.0 = 1GB 103 | -n NUM_STREAMS, --num-streams NUM_STREAMS 104 | Number of streams to run the throughput test with; 105 | default is 0, i.e. based on scale factor SF 106 | -b, --verbose Print more information to standard output 107 | -r, --read-only Do not execute refresh functions during the query 108 | phase, which allows for running it repeatedly 109 | ``` 110 | 111 | ### Phases 112 | * `prepare` 113 | The prepare phase builds TPC-H dbgen and querygen and creates the load and refresh (update/delete) files. 114 | 115 | * `load` 116 | The load phase cleans the database (if required), loads the tables into the database and 117 | creates indexes for querying. The results for this phase consist of the following metrics: 118 | * Schema creation time 119 | * Data loading time 120 | * Foreign key constraint and index creation time 121 | 122 | * `query` 123 | The query phase is the actual performance test. It runs twice, with a reboot in between. 124 | Each run consists of two parts: 125 | * Power test: This consists of sequential execution of the refresh functions and the query streams. It reports the execution times for: 126 | * refresh function 1 127 | * query execution time for the 22 TPC-H queries 128 | * refresh function 2 129 | * Throughput test: This consists of parallel execution of the query streams and the pairs of refresh functions. 130 | 131 | ### TPC-H Process 132 | The complete process for executing TPC-H tests is illustrated in the following figure: 133 | ![tpch-process](images/tpch_process.png "TPC-H Benchmark Process") 134 | 135 | ### Database Schema 136 | ![db-schema](images/TPC-H_Datamodel.png "TPC-H Database Schema") 137 | 138 | ### Known Issues 139 | * Sometimes the data generation phase fails due to file permission issues. In such a scenario, delete the data directory and all generated `.tbl` files inside your `tpch-dbgen` directory. 140 | 141 | ### References 142 | 143 | * For notes on how the TPC-H benchmark works, see the paper [iceis2012](https://github.com/Data-Science-Platform/tpch-pgsql/blob/master/iceis2012.pdf). 144 | * For the TPC-H benchmark specification see [this document](http://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v2.17.3.pdf).
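### Example run
A minimal end-to-end run might look as follows. This is a sketch that mirrors the command sequence used in `.travis.yml` and assumes the local setup from the Requirements section (database `tpchdb`, user `tpch`); the scale factor and the `<password>` placeholder are illustrative and should be adapted to your environment.

```
# build dbgen/qgen and generate a small data set (scale factor 0.01)
./tpch_pgsql.py --scale 0.01 prepare

# create the schema, load the generated tables, and add indexes and foreign keys
./tpch_pgsql.py --dbname tpchdb --username tpch --password <password> load

# run the power and throughput tests
./tpch_pgsql.py --dbname tpchdb --username tpch --password <password> query
```

The three commands correspond to the phases described above; passing `--read-only` to the `query` phase skips the refresh functions, which allows it to be repeated on the same loaded data.
145 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line.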
5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = TPC-HforPostgreSQL 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=TPC-HforPostgreSQL 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'TPC-H for PostgreSQL' 23 | copyright = '2018, Sohaib Iftakhar, Slavo Nagy' 24 | author = 'Sohaib Iftakhar, Slavo Nagy' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '0.1' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 41 | extensions = [ 42 | 'sphinx.ext.todo', 43 | 'sphinx.ext.coverage', 44 | ] 45 | 46 | # Add any paths that contain templates here, relative to this directory. 
47 | templates_path = ['_templates'] 48 | 49 | # The suffix(es) of source filenames. 50 | # You can specify multiple suffix as a list of string: 51 | # 52 | # source_suffix = ['.rst', '.md'] 53 | source_suffix = '.rst' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | # 61 | # This is also used if you do content translation via gettext catalogs. 62 | # Usually you set "language" from the command line for these cases. 63 | language = None 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | # This pattern also affects html_static_path and html_extra_path . 68 | exclude_patterns = [] 69 | 70 | # The name of the Pygments (syntax highlighting) style to use. 71 | pygments_style = 'sphinx' 72 | 73 | 74 | # -- Options for HTML output ------------------------------------------------- 75 | 76 | # The theme to use for HTML and HTML Help pages. See the documentation for 77 | # a list of builtin themes. 78 | # 79 | html_theme = 'alabaster' 80 | 81 | # Theme options are theme-specific and customize the look and feel of a theme 82 | # further. For a list of options available for each theme, see the 83 | # documentation. 84 | # 85 | # html_theme_options = {} 86 | 87 | # Add any paths that contain custom static files (such as style sheets) here, 88 | # relative to this directory. They are copied after the builtin static files, 89 | # so a file named "default.css" will overwrite the builtin "default.css". 90 | html_static_path = ['_static'] 91 | 92 | # Custom sidebar templates, must be a dictionary that maps document names 93 | # to template names. 94 | # 95 | # The default sidebars (for documents that don't match any pattern) are 96 | # defined by theme itself. Builtin themes are using these templates by 97 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 98 | # 'searchbox.html']``. 99 | # 100 | # html_sidebars = {} 101 | 102 | 103 | # -- Options for HTMLHelp output --------------------------------------------- 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = 'TPC-HforPostgreSQLdoc' 107 | 108 | 109 | # -- Options for LaTeX output ------------------------------------------------ 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'TPC-HforPostgreSQL.tex', 'TPC-H for PostgreSQL Documentation', 134 | 'Sohaib Iftakhar, Slavo Nagy', 'manual'), 135 | ] 136 | 137 | 138 | # -- Options for manual page output ------------------------------------------ 139 | 140 | # One entry per manual page. List of tuples 141 | # (source start file, name, description, authors, manual section). 
142 | man_pages = [ 143 | (master_doc, 'tpc-hforpostgresql', 'TPC-H for PostgreSQL Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ---------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'TPC-HforPostgreSQL', 'TPC-H for PostgreSQL Documentation', 155 | author, 'TPC-HforPostgreSQL', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | 160 | # -- Extension configuration ------------------------------------------------- 161 | 162 | # -- Options for todo extension ---------------------------------------------- 163 | 164 | # If true, `todo` and `todoList` produce output, else they produce nothing. 165 | todo_include_todos = True -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. TPC-H for PostgreSQL documentation master file, created by 2 | sphinx-quickstart on Mon Jul 30 14:09:56 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to TPC-H for PostgreSQL's documentation! 7 | ================================================ 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /iceis2012.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Science-Platform/tpch-pgsql/4a052f13476367f6890b3b59af9611c9253c9cb7/iceis2012.pdf -------------------------------------------------------------------------------- /images/TPC-H_Datamodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Science-Platform/tpch-pgsql/4a052f13476367f6890b3b59af9611c9253c9cb7/images/TPC-H_Datamodel.png -------------------------------------------------------------------------------- /images/tpch_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Data-Science-Platform/tpch-pgsql/4a052f13476367f6890b3b59af9611c9253c9cb7/images/tpch_process.png -------------------------------------------------------------------------------- /query_root/perf_query_template/1.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Pricing Summary Report Query (Q1) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | l_returnflag, 9 | l_linestatus, 10 | sum(l_quantity) as sum_qty, 11 | sum(l_extendedprice) as sum_base_price, 12 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 13 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 14 | avg(l_quantity) as avg_qty, 15 | avg(l_extendedprice) as avg_price, 16 | avg(l_discount) as avg_disc, 17 | count(*) as count_order 18 | from 19 | lineitem 20 | where 21 | l_shipdate <= date '1998-12-01' - interval ':1' day 22 | group by 23 | l_returnflag, 24 | l_linestatus 25 | order by 26 | l_returnflag, 27 | l_linestatus 28 | LIMIT 
1; 29 | -------------------------------------------------------------------------------- /query_root/perf_query_template/10.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Returned Item Reporting Query (Q10) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | c_custkey, 9 | c_name, 10 | sum(l_extendedprice * (1 - l_discount)) as revenue, 11 | c_acctbal, 12 | n_name, 13 | c_address, 14 | c_phone, 15 | c_comment 16 | from 17 | customer, 18 | orders, 19 | lineitem, 20 | nation 21 | where 22 | c_custkey = o_custkey 23 | and l_orderkey = o_orderkey 24 | and o_orderdate >= date ':1' 25 | and o_orderdate < date ':1' + interval '3' month 26 | and l_returnflag = 'R' 27 | and c_nationkey = n_nationkey 28 | group by 29 | c_custkey, 30 | c_name, 31 | c_acctbal, 32 | c_phone, 33 | n_name, 34 | c_address, 35 | c_comment 36 | order by 37 | revenue desc 38 | LIMIT 20; -------------------------------------------------------------------------------- /query_root/perf_query_template/11.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Important Stock Identification Query (Q11) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | ps_partkey, 9 | sum(ps_supplycost * ps_availqty) as value 10 | from 11 | partsupp, 12 | supplier, 13 | nation 14 | where 15 | ps_suppkey = s_suppkey 16 | and s_nationkey = n_nationkey 17 | and n_name = ':1' 18 | group by 19 | ps_partkey having 20 | sum(ps_supplycost * ps_availqty) > ( 21 | select 22 | sum(ps_supplycost * ps_availqty) * :2 23 | from 24 | partsupp, 25 | supplier, 26 | nation 27 | where 28 | ps_suppkey = s_suppkey 29 | and s_nationkey = n_nationkey 30 | and n_name = ':1' 31 | ) 32 | order by 33 | value desc 34 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/12.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | l_shipmode, 9 | sum(case 10 | when o_orderpriority = '1-URGENT' 11 | or o_orderpriority = '2-HIGH' 12 | then 1 13 | else 0 14 | end) as high_line_count, 15 | sum(case 16 | when o_orderpriority <> '1-URGENT' 17 | and o_orderpriority <> '2-HIGH' 18 | then 1 19 | else 0 20 | end) as low_line_count 21 | from 22 | orders, 23 | lineitem 24 | where 25 | o_orderkey = l_orderkey 26 | and l_shipmode in (':1', ':2') 27 | and l_commitdate < l_receiptdate 28 | and l_shipdate < l_commitdate 29 | and l_receiptdate >= date ':3' 30 | and l_receiptdate < date ':3' + interval '1' year 31 | group by 32 | l_shipmode 33 | order by 34 | l_shipmode 35 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/13.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Customer Distribution Query (Q13) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | c_count, 9 | count(*) as custdist 10 | from 11 | ( 12 | select 13 | c_custkey, 14 | count(o_orderkey) 15 | from 16 | customer left outer join orders on 17 | c_custkey = o_custkey 18 | and o_comment not like '%:1%:2%' 19 | group by 20 | c_custkey 21 | ) as c_orders (c_custkey, c_count) 
22 | group by 23 | c_count 24 | order by 25 | custdist desc, 26 | c_count desc 27 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/14.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Promotion Effect Query (Q14) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | 100.00 * sum(case 9 | when p_type like 'PROMO%' 10 | then l_extendedprice * (1 - l_discount) 11 | else 0 12 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 13 | from 14 | lineitem, 15 | part 16 | where 17 | l_partkey = p_partkey 18 | and l_shipdate >= date ':1' 19 | and l_shipdate < date ':1' + interval '1' month 20 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/15.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Top Supplier Query (Q15) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | create view revenue:s (supplier_no, total_revenue) as 7 | select 8 | l_suppkey, 9 | sum(l_extendedprice * (1 - l_discount)) 10 | from 11 | lineitem 12 | where 13 | l_shipdate >= date ':1' 14 | and l_shipdate < date ':1' + interval '3' month 15 | group by 16 | l_suppkey; 17 | 18 | :o 19 | select 20 | s_suppkey, 21 | s_name, 22 | s_address, 23 | s_phone, 24 | total_revenue 25 | from 26 | supplier, 27 | revenue:s 28 | where 29 | s_suppkey = supplier_no 30 | and total_revenue = ( 31 | select 32 | max(total_revenue) 33 | from 34 | revenue:s 35 | ) 36 | order by 37 | s_suppkey 38 | LIMIT 1; 39 | 40 | drop view revenue:s; -------------------------------------------------------------------------------- /query_root/perf_query_template/16.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | p_brand, 9 | p_type, 10 | p_size, 11 | count(distinct ps_suppkey) as supplier_cnt 12 | from 13 | partsupp, 14 | part 15 | where 16 | p_partkey = ps_partkey 17 | and p_brand <> ':1' 18 | and p_type not like ':2%' 19 | and p_size in (:3, :4, :5, :6, :7, :8, :9, :10) 20 | and ps_suppkey not in ( 21 | select 22 | s_suppkey 23 | from 24 | supplier 25 | where 26 | s_comment like '%Customer%Complaints%' 27 | ) 28 | group by 29 | p_brand, 30 | p_type, 31 | p_size 32 | order by 33 | supplier_cnt desc, 34 | p_brand, 35 | p_type, 36 | p_size 37 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/17.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | sum(l_extendedprice) / 7.0 as avg_yearly 9 | from 10 | lineitem, 11 | part, 12 | (SELECT l_partkey AS agg_partkey, 0.2 * avg(l_quantity) AS avg_quantity FROM lineitem GROUP BY l_partkey) part_agg 13 | where 14 | p_partkey = l_partkey 15 | and agg_partkey = l_partkey 16 | and p_brand = ':1' 17 | and p_container = ':2' 18 | and l_quantity < avg_quantity 19 | LIMIT 1; 20 | -------------------------------------------------------------------------------- /query_root/perf_query_template/18.sql: 
-------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Large Volume Customer Query (Q18) 3 | -- Function Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | c_name, 9 | c_custkey, 10 | o_orderkey, 11 | o_orderdate, 12 | o_totalprice, 13 | sum(l_quantity) 14 | from 15 | customer, 16 | orders, 17 | lineitem 18 | where 19 | o_orderkey in ( 20 | select 21 | l_orderkey 22 | from 23 | lineitem 24 | group by 25 | l_orderkey having 26 | sum(l_quantity) > :1 27 | ) 28 | and c_custkey = o_custkey 29 | and o_orderkey = l_orderkey 30 | group by 31 | c_name, 32 | c_custkey, 33 | o_orderkey, 34 | o_orderdate, 35 | o_totalprice 36 | order by 37 | o_totalprice desc, 38 | o_orderdate 39 | LIMIT 100; -------------------------------------------------------------------------------- /query_root/perf_query_template/19.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Discounted Revenue Query (Q19) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | sum(l_extendedprice* (1 - l_discount)) as revenue 9 | from 10 | lineitem, 11 | part 12 | where 13 | ( 14 | p_partkey = l_partkey 15 | and p_brand = ':1' 16 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 17 | and l_quantity >= :4 and l_quantity <= :4 + 10 18 | and p_size between 1 and 5 19 | and l_shipmode in ('AIR', 'AIR REG') 20 | and l_shipinstruct = 'DELIVER IN PERSON' 21 | ) 22 | or 23 | ( 24 | p_partkey = l_partkey 25 | and p_brand = ':2' 26 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') 27 | and l_quantity >= :5 and l_quantity <= :5 + 10 28 | and p_size between 1 and 10 29 | and l_shipmode in ('AIR', 'AIR REG') 30 | and l_shipinstruct = 'DELIVER IN PERSON' 31 | ) 32 | or 33 | ( 34 | p_partkey = l_partkey 35 | and p_brand = ':3' 36 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 37 | and l_quantity >= :6 and l_quantity <= :6 + 10 38 | and p_size between 1 and 15 39 | and l_shipmode in ('AIR', 'AIR REG') 40 | and l_shipinstruct = 'DELIVER IN PERSON' 41 | ) 42 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/2.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | s_acctbal, 9 | s_name, 10 | n_name, 11 | p_partkey, 12 | p_mfgr, 13 | s_address, 14 | s_phone, 15 | s_comment 16 | from 17 | part, 18 | supplier, 19 | partsupp, 20 | nation, 21 | region 22 | where 23 | p_partkey = ps_partkey 24 | and s_suppkey = ps_suppkey 25 | and p_size = :1 26 | and p_type like '%:2' 27 | and s_nationkey = n_nationkey 28 | and n_regionkey = r_regionkey 29 | and r_name = ':3' 30 | and ps_supplycost = ( 31 | select 32 | min(ps_supplycost) 33 | from 34 | partsupp, 35 | supplier, 36 | nation, 37 | region 38 | where 39 | p_partkey = ps_partkey 40 | and s_suppkey = ps_suppkey 41 | and s_nationkey = n_nationkey 42 | and n_regionkey = r_regionkey 43 | and r_name = ':3' 44 | ) 45 | order by 46 | s_acctbal desc, 47 | n_name, 48 | s_name, 49 | p_partkey 50 | LIMIT 100; 51 | -------------------------------------------------------------------------------- /query_root/perf_query_template/20.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- 
TPC-H/TPC-R Potential Part Promotion Query (Q20) 3 | -- Function Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | s_name, 9 | s_address 10 | from 11 | supplier, 12 | nation 13 | where 14 | s_suppkey in ( 15 | select 16 | ps_suppkey 17 | from 18 | partsupp, 19 | ( 20 | select 21 | l_partkey agg_partkey, 22 | l_suppkey agg_suppkey, 23 | 0.5 * sum(l_quantity) AS agg_quantity 24 | from 25 | lineitem 26 | where 27 | l_shipdate >= date ':2' 28 | and l_shipdate < date ':2' + interval '1' year 29 | group by 30 | l_partkey, 31 | l_suppkey 32 | ) agg_lineitem 33 | where 34 | agg_partkey = ps_partkey 35 | and agg_suppkey = ps_suppkey 36 | and ps_partkey in ( 37 | select 38 | p_partkey 39 | from 40 | part 41 | where 42 | p_name like ':1%' 43 | ) 44 | and ps_availqty > agg_quantity 45 | ) 46 | and s_nationkey = n_nationkey 47 | and n_name = ':3' 48 | order by 49 | s_name 50 | LIMIT 1; 51 | -------------------------------------------------------------------------------- /query_root/perf_query_template/21.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | s_name, 9 | count(*) as numwait 10 | from 11 | supplier, 12 | lineitem l1, 13 | orders, 14 | nation 15 | where 16 | s_suppkey = l1.l_suppkey 17 | and o_orderkey = l1.l_orderkey 18 | and o_orderstatus = 'F' 19 | and l1.l_receiptdate > l1.l_commitdate 20 | and exists ( 21 | select 22 | * 23 | from 24 | lineitem l2 25 | where 26 | l2.l_orderkey = l1.l_orderkey 27 | and l2.l_suppkey <> l1.l_suppkey 28 | ) 29 | and not exists ( 30 | select 31 | * 32 | from 33 | lineitem l3 34 | where 35 | l3.l_orderkey = l1.l_orderkey 36 | and l3.l_suppkey <> l1.l_suppkey 37 | and l3.l_receiptdate > l3.l_commitdate 38 | ) 39 | and s_nationkey = n_nationkey 40 | and n_name = ':1' 41 | group by 42 | s_name 43 | order by 44 | numwait desc, 45 | s_name 46 | LIMIT 100; -------------------------------------------------------------------------------- /query_root/perf_query_template/22.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Global Sales Opportunity Query (Q22) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | cntrycode, 9 | count(*) as numcust, 10 | sum(c_acctbal) as totacctbal 11 | from 12 | ( 13 | select 14 | substring(c_phone from 1 for 2) as cntrycode, 15 | c_acctbal 16 | from 17 | customer 18 | where 19 | substring(c_phone from 1 for 2) in 20 | (':1', ':2', ':3', ':4', ':5', ':6', ':7') 21 | and c_acctbal > ( 22 | select 23 | avg(c_acctbal) 24 | from 25 | customer 26 | where 27 | c_acctbal > 0.00 28 | and substring(c_phone from 1 for 2) in 29 | (':1', ':2', ':3', ':4', ':5', ':6', ':7') 30 | ) 31 | and not exists ( 32 | select 33 | * 34 | from 35 | orders 36 | where 37 | o_custkey = c_custkey 38 | ) 39 | ) as custsale 40 | group by 41 | cntrycode 42 | order by 43 | cntrycode 44 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/3.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Shipping Priority Query (Q3) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | l_orderkey, 9 | sum(l_extendedprice * (1 - l_discount)) as revenue, 10 | o_orderdate, 11 | 
o_shippriority 12 | from 13 | customer, 14 | orders, 15 | lineitem 16 | where 17 | c_mktsegment = ':1' 18 | and c_custkey = o_custkey 19 | and l_orderkey = o_orderkey 20 | and o_orderdate < date ':2' 21 | and l_shipdate > date ':2' 22 | group by 23 | l_orderkey, 24 | o_orderdate, 25 | o_shippriority 26 | order by 27 | revenue desc, 28 | o_orderdate 29 | LIMIT 10; -------------------------------------------------------------------------------- /query_root/perf_query_template/4.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Order Priority Checking Query (Q4) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | o_orderpriority, 9 | count(*) as order_count 10 | from 11 | orders 12 | where 13 | o_orderdate >= date ':1' 14 | and o_orderdate < date ':1' + interval '3' month 15 | and exists ( 16 | select 17 | * 18 | from 19 | lineitem 20 | where 21 | l_orderkey = o_orderkey 22 | and l_commitdate < l_receiptdate 23 | ) 24 | group by 25 | o_orderpriority 26 | order by 27 | o_orderpriority 28 | LIMIT 1; 29 | -------------------------------------------------------------------------------- /query_root/perf_query_template/5.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Local Supplier Volume Query (Q5) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | n_name, 9 | sum(l_extendedprice * (1 - l_discount)) as revenue 10 | from 11 | customer, 12 | orders, 13 | lineitem, 14 | supplier, 15 | nation, 16 | region 17 | where 18 | c_custkey = o_custkey 19 | and l_orderkey = o_orderkey 20 | and l_suppkey = s_suppkey 21 | and c_nationkey = s_nationkey 22 | and s_nationkey = n_nationkey 23 | and n_regionkey = r_regionkey 24 | and r_name = ':1' 25 | and o_orderdate >= date ':2' 26 | and o_orderdate < date ':2' + interval '1' year 27 | group by 28 | n_name 29 | order by 30 | revenue desc 31 | LIMIT 1; 32 | -------------------------------------------------------------------------------- /query_root/perf_query_template/6.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | sum(l_extendedprice * l_discount) as revenue 9 | from 10 | lineitem 11 | where 12 | l_shipdate >= date ':1' 13 | and l_shipdate < date ':1' + interval '1' year 14 | and l_discount between :2 - 0.01 and :2 + 0.01 15 | and l_quantity < :3 16 | LIMIT 1; 17 | -------------------------------------------------------------------------------- /query_root/perf_query_template/7.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Volume Shipping Query (Q7) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | supp_nation, 9 | cust_nation, 10 | l_year, 11 | sum(volume) as revenue 12 | from 13 | ( 14 | select 15 | n1.n_name as supp_nation, 16 | n2.n_name as cust_nation, 17 | extract(year from l_shipdate) as l_year, 18 | l_extendedprice * (1 - l_discount) as volume 19 | from 20 | supplier, 21 | lineitem, 22 | orders, 23 | customer, 24 | nation n1, 25 | nation n2 26 | where 27 | s_suppkey = l_suppkey 28 | and o_orderkey = l_orderkey 29 | and c_custkey = o_custkey 30 | and s_nationkey = n1.n_nationkey 31 | and c_nationkey = n2.n_nationkey 32 | and ( 33 
| (n1.n_name = ':1' and n2.n_name = ':2') 34 | or (n1.n_name = ':2' and n2.n_name = ':1') 35 | ) 36 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 37 | ) as shipping 38 | group by 39 | supp_nation, 40 | cust_nation, 41 | l_year 42 | order by 43 | supp_nation, 44 | cust_nation, 45 | l_year 46 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/8.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R National Market Share Query (Q8) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | o_year, 9 | sum(case 10 | when nation = ':1' then volume 11 | else 0 12 | end) / sum(volume) as mkt_share 13 | from 14 | ( 15 | select 16 | extract(year from o_orderdate) as o_year, 17 | l_extendedprice * (1 - l_discount) as volume, 18 | n2.n_name as nation 19 | from 20 | part, 21 | supplier, 22 | lineitem, 23 | orders, 24 | customer, 25 | nation n1, 26 | nation n2, 27 | region 28 | where 29 | p_partkey = l_partkey 30 | and s_suppkey = l_suppkey 31 | and l_orderkey = o_orderkey 32 | and o_custkey = c_custkey 33 | and c_nationkey = n1.n_nationkey 34 | and n1.n_regionkey = r_regionkey 35 | and r_name = ':2' 36 | and s_nationkey = n2.n_nationkey 37 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 38 | and p_type = ':3' 39 | ) as all_nations 40 | group by 41 | o_year 42 | order by 43 | o_year 44 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/perf_query_template/9.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Product Type Profit Measure Query (Q9) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | nation, 9 | o_year, 10 | sum(amount) as sum_profit 11 | from 12 | ( 13 | select 14 | n_name as nation, 15 | extract(year from o_orderdate) as o_year, 16 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 17 | from 18 | part, 19 | supplier, 20 | lineitem, 21 | partsupp, 22 | orders, 23 | nation 24 | where 25 | s_suppkey = l_suppkey 26 | and ps_suppkey = l_suppkey 27 | and ps_partkey = l_partkey 28 | and p_partkey = l_partkey 29 | and o_orderkey = l_orderkey 30 | and s_nationkey = n_nationkey 31 | and p_name like '%:1%' 32 | ) as profit 33 | group by 34 | nation, 35 | o_year 36 | order by 37 | nation, 38 | o_year desc 39 | LIMIT 1; -------------------------------------------------------------------------------- /query_root/prep_query/create_idx.sql: -------------------------------------------------------------------------------- 1 | -- REFERENCE: https://github.com/tvondra/pg_tpch 2 | 3 | 4 | -- Primary Keys 5 | 6 | ALTER TABLE PART ADD PRIMARY KEY (P_PARTKEY); 7 | ALTER TABLE SUPPLIER ADD PRIMARY KEY (S_SUPPKEY); 8 | ALTER TABLE PARTSUPP ADD PRIMARY KEY (PS_PARTKEY, PS_SUPPKEY); 9 | ALTER TABLE CUSTOMER ADD PRIMARY KEY (C_CUSTKEY); 10 | ALTER TABLE ORDERS ADD PRIMARY KEY (O_ORDERKEY); 11 | ALTER TABLE LINEITEM ADD PRIMARY KEY (L_ORDERKEY, L_LINENUMBER); 12 | ALTER TABLE NATION ADD PRIMARY KEY (N_NATIONKEY); 13 | ALTER TABLE REGION ADD PRIMARY KEY (R_REGIONKEY); 14 | 15 | 16 | -- Foreign Keys 17 | 18 | ALTER TABLE SUPPLIER ADD FOREIGN KEY (S_NATIONKEY) REFERENCES NATION(N_NATIONKEY); 19 | 20 | ALTER TABLE PARTSUPP ADD FOREIGN KEY (PS_PARTKEY) REFERENCES PART(P_PARTKEY); 21 | ALTER TABLE PARTSUPP ADD FOREIGN KEY (PS_SUPPKEY) 
REFERENCES SUPPLIER(S_SUPPKEY); 22 | 23 | ALTER TABLE CUSTOMER ADD FOREIGN KEY (C_NATIONKEY) REFERENCES NATION(N_NATIONKEY); 24 | 25 | ALTER TABLE ORDERS ADD FOREIGN KEY (O_CUSTKEY) REFERENCES CUSTOMER(C_CUSTKEY); 26 | 27 | ALTER TABLE LINEITEM ADD FOREIGN KEY (L_ORDERKEY) REFERENCES ORDERS(O_ORDERKEY) ON DELETE CASCADE; 28 | ALTER TABLE LINEITEM ADD FOREIGN KEY (L_PARTKEY,L_SUPPKEY) REFERENCES PARTSUPP(PS_PARTKEY,PS_SUPPKEY); 29 | 30 | ALTER TABLE NATION ADD FOREIGN KEY (N_REGIONKEY) REFERENCES REGION(R_REGIONKEY); 31 | 32 | 33 | -- Indexes on Foreign Keys 34 | 35 | CREATE INDEX IDX_SUPPLIER_NATION_KEY ON SUPPLIER (S_NATIONKEY); 36 | 37 | CREATE INDEX IDX_PARTSUPP_PARTKEY ON PARTSUPP (PS_PARTKEY); 38 | CREATE INDEX IDX_PARTSUPP_SUPPKEY ON PARTSUPP (PS_SUPPKEY); 39 | 40 | CREATE INDEX IDX_CUSTOMER_NATIONKEY ON CUSTOMER (C_NATIONKEY); 41 | 42 | CREATE INDEX IDX_ORDERS_CUSTKEY ON ORDERS (O_CUSTKEY); 43 | 44 | CREATE INDEX IDX_LINEITEM_ORDERKEY ON LINEITEM (L_ORDERKEY); 45 | CREATE INDEX IDX_LINEITEM_PART_SUPP ON LINEITEM (L_PARTKEY,L_SUPPKEY); 46 | 47 | CREATE INDEX IDX_NATION_REGIONKEY ON NATION (N_REGIONKEY); 48 | -------------------------------------------------------------------------------- /query_root/prep_query/create_tbl.sql: -------------------------------------------------------------------------------- 1 | -- REFERENCE: https://github.com/tvondra/pg_tpch 2 | 3 | CREATE TABLE PART ( 4 | 5 | P_PARTKEY SERIAL, 6 | P_NAME VARCHAR(55), 7 | P_MFGR CHAR(25), 8 | P_BRAND CHAR(10), 9 | P_TYPE VARCHAR(25), 10 | P_SIZE INTEGER, 11 | P_CONTAINER CHAR(10), 12 | P_RETAILPRICE DECIMAL, 13 | P_COMMENT VARCHAR(23) 14 | ); 15 | 16 | CREATE TABLE SUPPLIER ( 17 | S_SUPPKEY SERIAL, 18 | S_NAME CHAR(25), 19 | S_ADDRESS VARCHAR(40), 20 | S_NATIONKEY INTEGER NOT NULL, -- references N_NATIONKEY 21 | S_PHONE CHAR(15), 22 | S_ACCTBAL DECIMAL, 23 | S_COMMENT VARCHAR(101) 24 | ); 25 | 26 | CREATE TABLE PARTSUPP ( 27 | PS_PARTKEY INTEGER NOT NULL, -- references P_PARTKEY 28 | PS_SUPPKEY INTEGER NOT NULL, -- references S_SUPPKEY 29 | PS_AVAILQTY INTEGER, 30 | PS_SUPPLYCOST DECIMAL, 31 | PS_COMMENT VARCHAR(199) 32 | ); 33 | 34 | CREATE TABLE CUSTOMER ( 35 | C_CUSTKEY SERIAL, 36 | C_NAME VARCHAR(25), 37 | C_ADDRESS VARCHAR(40), 38 | C_NATIONKEY INTEGER NOT NULL, -- references N_NATIONKEY 39 | C_PHONE CHAR(15), 40 | C_ACCTBAL DECIMAL, 41 | C_MKTSEGMENT CHAR(10), 42 | C_COMMENT VARCHAR(117) 43 | ); 44 | 45 | CREATE TABLE ORDERS ( 46 | O_ORDERKEY SERIAL, 47 | O_CUSTKEY INTEGER NOT NULL, -- references C_CUSTKEY 48 | O_ORDERSTATUS CHAR(1), 49 | O_TOTALPRICE DECIMAL, 50 | O_ORDERDATE DATE, 51 | O_ORDERPRIORITY CHAR(15), 52 | O_CLERK CHAR(15), 53 | O_SHIPPRIORITY INTEGER, 54 | O_COMMENT VARCHAR(79) 55 | ); 56 | 57 | CREATE TABLE LINEITEM ( 58 | L_ORDERKEY INTEGER NOT NULL, -- references O_ORDERKEY 59 | L_PARTKEY INTEGER NOT NULL, -- references P_PARTKEY (compound fk to PARTSUPP) 60 | L_SUPPKEY INTEGER NOT NULL, -- references S_SUPPKEY (compound fk to PARTSUPP) 61 | L_LINENUMBER INTEGER, 62 | L_QUANTITY DECIMAL, 63 | L_EXTENDEDPRICE DECIMAL, 64 | L_DISCOUNT DECIMAL, 65 | L_TAX DECIMAL, 66 | L_RETURNFLAG CHAR(1), 67 | L_LINESTATUS CHAR(1), 68 | L_SHIPDATE DATE, 69 | L_COMMITDATE DATE, 70 | L_RECEIPTDATE DATE, 71 | L_SHIPINSTRUCT CHAR(25), 72 | L_SHIPMODE CHAR(10), 73 | L_COMMENT VARCHAR(44) 74 | ); 75 | 76 | CREATE TABLE NATION ( 77 | N_NATIONKEY SERIAL, 78 | N_NAME CHAR(25), 79 | N_REGIONKEY INTEGER NOT NULL, -- references R_REGIONKEY 80 | N_COMMENT VARCHAR(152) 81 | ); 82 | 83 | CREATE TABLE REGION ( 84 | R_REGIONKEY 
SERIAL, 85 | R_NAME CHAR(25), 86 | R_COMMENT VARCHAR(152) 87 | ); 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary 2 | mock -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import psycopg2 5 | 6 | 7 | class TestCommon(object): 8 | CONNECT_STRING = "host='%s' port='%s' dbname='%s' user='%s' password='%s'" 9 | HOSTNAME = "localhost" 10 | PORT = 5432 11 | DBNAME = "tpchdb" 12 | USERNAME = "tpch" 13 | PASSWORD = "hello123" 14 | NUM_STREAMS = 2 # because we use scale factor 0.01 15 | QUERY_NR_RANGE = range(1, NUM_STREAMS + 1) 16 | ROOT_DIR = ".." # parent of tests/ 17 | 18 | TABLES = ["customer", "lineitem", "nation", "orders", 19 | "part", "partsupp", "region", "supplier"] 20 | ROW_COUNTS = {"customer": 1500, "lineitem": 60175, # will be changed after query 21 | "nation": 25, "orders": 15000, 22 | "part": 2000, "partsupp": 8000, "region": 5, "supplier": 100} 23 | 24 | def __init__(self): 25 | self.assertEqual(self.TABLES, list(self.ROW_COUNTS.keys()).sort()) 26 | 27 | def set_table_count(self, table, count): 28 | self.ROW_COUNTS[table] = count 29 | 30 | def pgconnect(self): 31 | conn = psycopg2.connect(self.CONNECT_STRING % (self.HOSTNAME, self.PORT, self.DBNAME, self.USERNAME, self.PASSWORD)) 32 | return conn 33 | 34 | def check_dir(self, path): 35 | self.assertTrue(os.path.exists(path), "Folder %s does not exist!" % path) 36 | self.assertTrue(os.path.isdir(path), "Path %s is not a directory!" % path) 37 | 38 | def check_dir_not_exist(self, path): 39 | self.assertFalse(os.path.exists(path), "Folder %s already exists!" % path) 40 | 41 | def check_file(self, filename, check_if_not_empty=False): 42 | self.assertTrue(os.path.exists(filename), "File %s does not exist!" % filename) 43 | self.assertTrue(os.path.isfile(filename), "Path %s is not a file!" % filename) 44 | if check_if_not_empty: 45 | self.assertTrue(os.stat(filename).st_size > 0, "Path %s is empty!" % filename) 46 | 47 | def check_table(self, conn, table): 48 | cursor = conn.cursor() 49 | sql = "select relname from pg_class where relkind='r' and relname = '%s';" % table 50 | cursor.execute(sql) 51 | found = cursor.fetchall() 52 | self.assertTrue(len(found) == 1 and found[0][0] == table, "Table %s does not exist!" % table) 53 | 54 | def check_table_count(self, conn, table): 55 | cursor = conn.cursor() 56 | sql = "select count(1) from %s;" % table 57 | cursor.execute(sql) 58 | count = cursor.fetchall() 59 | expected_rows = self.ROW_COUNTS[table] 60 | actual_rows = count[0][0] 61 | self.assertTrue(len(count) == 1 and actual_rows != 0, "Table %s is empty!" % table) 62 | self.assertTrue(len(count) == 1 and actual_rows == expected_rows, 63 | "Table %s does not contain expected number of rows! 
" 64 | "(expected=%s vs actual=%s)" % (table, expected_rows, actual_rows)) 65 | -------------------------------------------------------------------------------- /tests/test_load_after.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_common import TestCommon 5 | 6 | 7 | class TestLoadAfter(unittest.TestCase, TestCommon): 8 | 9 | def test_tables_created(self): 10 | conn = self.pgconnect() 11 | # 12 | for table in self.TABLES: 13 | self.check_table(conn, table) 14 | # 15 | self.set_table_count("lineitem", 60175) 16 | self.check_table_count(conn, table) 17 | 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /tests/test_prepare_after.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_common import TestCommon 5 | import os 6 | 7 | 8 | class TestPrepareAfter(unittest.TestCase, TestCommon): 9 | 10 | def test_folders_do_exist_now(self): 11 | folders = ["data", os.path.join("query_root", "perf_query_gen")] 12 | for folder in folders: 13 | folder_path = os.path.join(self.ROOT_DIR, folder) 14 | self.check_dir(folder_path) 15 | # 16 | if folder == "data": 17 | subfolders = ["delete", "load", "update"] 18 | for subfolder in subfolders: 19 | subfolder_path = os.path.join(folder_path, subfolder) 20 | self.check_dir(subfolder_path) 21 | # 22 | if subfolder == "delete": 23 | for i in self.QUERY_NR_RANGE: 24 | filename = os.path.join(subfolder_path, "delete.%s.csv" % i) 25 | self.check_file(filename) 26 | elif subfolder == "load": 27 | tables = ["customer", "lineitem", "nation", 28 | "orders", "part", "partsupp", "region", 29 | "supplier"] 30 | for table in tables: 31 | filename = os.path.join(subfolder_path, "%s.tbl.csv" % table) 32 | self.check_file(filename) 33 | elif subfolder == "update": 34 | tables = ["lineitem", "orders"] 35 | for table in tables: 36 | for i in self.QUERY_NR_RANGE: 37 | filename = os.path.join(subfolder_path, "%s.tbl.u%s.csv" % (table,i)) 38 | self.check_file(filename) 39 | elif folder == "query_root": 40 | subfolder = "perf_query_gen" 41 | subfolder_path = os.path.join(folder_path, subfolder) 42 | self.check_dir(subfolder_path) 43 | for i in self.QUERY_NR_RANGE: 44 | filename = os.path.join(subfolder_path, "%s.sql" % i) 45 | self.check_file(filename) 46 | 47 | 48 | if __name__ == '__main__': 49 | unittest.main() 50 | -------------------------------------------------------------------------------- /tests/test_prepare_before.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_common import TestCommon 5 | import os 6 | 7 | 8 | class TestPrepareBefore(unittest.TestCase, TestCommon): 9 | 10 | def test_folders_do_not_exist(self): 11 | folders = ["data", "results", os.path.join("query_root", "perf_query_gen")] 12 | for folder in folders: 13 | path = os.path.join(self.ROOT_DIR, folder) 14 | self.check_dir_not_exist(path) 15 | 16 | 17 | if __name__ == '__main__': 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /tests/test_query_after.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | from test_common import TestCommon 5 | import os 6 | import glob 7 | 8 | 9 | 
class TestQueryAfter(unittest.TestCase, TestCommon): 10 | 11 | def test_tables(self): 12 | conn = self.pgconnect() 13 | for table in self.TABLES: 14 | self.check_table(conn, table) 15 | # 16 | self.set_table_count("lineitem", 60176) 17 | self.check_table_count(conn, table) 18 | 19 | def test_results(self): 20 | root_dir = ".." # parent of results/ 21 | results_dir = os.path.join(root_dir, "results") 22 | self.check_dir(results_dir) 23 | for run_dir in glob.glob(os.path.join(results_dir, '*')): 24 | self.assertRegex(run_dir, r'run_\d{8}_\d{6}', "Run folders should be named run_YYYYMMDD_HHMMSS") 25 | load_dir = os.path.join(run_dir, "load") 26 | if os.path.exists(load_dir): 27 | load_file = os.path.join(load_dir, "Load.json") 28 | self.check_file(load_file) 29 | else: 30 | power_dir = os.path.join(run_dir, "power") 31 | self.check_dir(power_dir) 32 | power_file = os.path.join(power_dir, "Power.json") 33 | self.check_file(power_file, check_if_not_empty=True) 34 | throughput_dir = os.path.join(run_dir, "throughput") 35 | self.check_dir(throughput_dir) 36 | files = [] 37 | for i in range(1, self.NUM_STREAMS + 1): 38 | throughput_file = os.path.join(throughput_dir, "Throughput%s%i.json" % ('QueryStream', i)) 39 | files.append(throughput_file) 40 | for f in ['RefreshStream', 'Total']: 41 | throughput_file = os.path.join(throughput_dir, "Throughput%s.json" % f) 42 | files.append(throughput_file) 43 | for f in files: 44 | self.check_file(f, check_if_not_empty=True) 45 | 46 | 47 | if __name__ == '__main__': 48 | unittest.main() 49 | -------------------------------------------------------------------------------- /tests/test_tpch_pgsql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | 5 | import os 6 | import mock 7 | 8 | import tpch_pgsql as bm 9 | from tpch4pgsql import query 10 | 11 | 12 | class TestBenchmark(unittest.TestCase): 13 | 14 | def test_get_timedelta_in_seconds(self): 15 | testdata = [ 16 | {"00:00:00.123450": .12345}, 17 | {"00:00:00.012345": .012345}, 18 | {"00:00:01.345678": 1.345678}, 19 | {"00:02:31.678912": 151.678912}, 20 | {"10:25:59.741852": 37559.741852}, 21 | {"10:25:59.741853": (10*60*60 + 25*60 + 59) + 0.741853}, 22 | ] 23 | for td in testdata: 24 | for input, expected in td.items(): 25 | self.assertEqual(query.get_timedelta_in_seconds(input), expected) 26 | 27 | 28 | def test_get_qphh_size(self): 29 | testdata = [ 30 | {"input": (1, 1), "expected": 1}, 31 | {"input": (2, 2), "expected": 2}, 32 | {"input": (2, 3), "expected": 2.449489742783178} 33 | ] 34 | for td in testdata: 35 | self.assertEqual(query.get_qphh_size(td["input"][0], td["input"][1]), td["expected"]) 36 | 37 | 38 | def test_scale_to_num_streams(self): 39 | testdata = [ 40 | {"input": 0, "expected": 2}, 41 | {"input": 0.42, "expected": 2}, 42 | {"input": 1, "expected": 2}, 43 | {"input": 3.14, "expected": 3}, 44 | {"input": 10, "expected": 3}, 45 | {"input": 30, "expected": 4}, 46 | {"input": 100, "expected": 5}, 47 | {"input": 300, "expected": 6}, 48 | {"input": 1000, "expected": 7}, 49 | {"input": 3000, "expected": 8}, 50 | {"input": 10000, "expected": 9}, 51 | {"input": 30000, "expected": 10}, 52 | {"input": 30000.01, "expected": 11}, 53 | {"input": 100000, "expected": 11} 54 | ] 55 | for td in testdata: 56 | self.assertEqual(bm.scale_to_num_streams(td["input"]), td["expected"]) 57 | 58 | def get_json_files_from(path): 59 | json_files = [pos_json for pos_json in os.listdir(path) if pos_json.endswith('.json')] 60 | 
json_files = [os.path.join(path, s) for s in json_files] 61 | return json_files 62 | 63 | @mock.patch('tpch_pgsql.os.listdir') 64 | def test_get_json_files_from(self, mock_listdir): 65 | mock_listdir.return_value = ['a.json', 'b.txt', 'C.json'] 66 | root_dir = 'dummy' 67 | expected = [os.path.join(root_dir, x) for x in ['a.json', 'C.json']] 68 | files = query.get_json_files_from(root_dir) 69 | self.assertEqual(expected, files, 70 | "Some json files were not found, others were included, but are not json files!") 71 | 72 | @staticmethod 73 | def mock_path_isdir_side_effect(arg): 74 | basename = os.path.basename(arg) 75 | if basename in ('power', 'throughput'): 76 | return True 77 | else: 78 | return False 79 | 80 | @staticmethod 81 | def mock_path_exists_side_effect(arg): 82 | return True 83 | 84 | @mock.patch('tpch_pgsql.os.listdir') 85 | def test_get_json_files(self, mock_listdir): 86 | mock_listdir.side_effect = [['run1', 'run2', 'run3', 'run4'], 87 | ['power1.json'], ['throughput1a.json', 'throughput1b.json'], 88 | ['power2.json', 'power2.txt'], ['throughput2.json'], 89 | ['power3a.txt'], ['throughput.txt'], 90 | [], []] 91 | mock_isdir = mock.patch('os.path.isdir').start() 92 | mock_isdir.side_effect = self.mock_path_isdir_side_effect 93 | mock_exists = mock.patch('os.path.exists').start() 94 | mock_exists.side_effect = self.mock_path_exists_side_effect 95 | root_dir = 'dummy' 96 | expected = [os.path.join('dummy', 'run1', 'power', 'power1.json'), 97 | os.path.join('dummy', 'run1', 'throughput', 'throughput1a.json'), 98 | os.path.join('dummy', 'run1', 'throughput', 'throughput1b.json'), 99 | os.path.join('dummy', 'run2', 'power', 'power2.json'), 100 | os.path.join('dummy', 'run2', 'throughput', 'throughput2.json')] 101 | files = query.get_json_files(root_dir) 102 | self.assertEqual(expected, files, 103 | "Some json files were not found, others were included, but are not json files!") 104 | 105 | 106 | if __name__ == '__main__': 107 | unittest.main() 108 | -------------------------------------------------------------------------------- /tpch4pgsql/load.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tpch4pgsql import postgresqldb as pgdb 3 | 4 | 5 | def clean_database(query_root, host, port, db_name, user, password, tables): 6 | """Drops the tables if they exist 7 | 8 | Args: 9 | query_root (str): Directory in which generated queries directory exists 10 | host (str): IP/hostname of the PG instance 11 | port (int): port for the PG instance 12 | db_name (str): name of the tpch database 13 | user (str): user for the PG instance 14 | password (str): password for the PG instance 15 | tables (str): list of tables 16 | 17 | Return: 18 | 0 if successful 19 | non zero otherwise 20 | """ 21 | try: 22 | conn = pgdb.PGDB(host, port, db_name, user, password) 23 | try: 24 | for table in tables: 25 | conn.executeQuery("DROP TABLE IF EXISTS %s " % table) 26 | except Exception as e: 27 | print("unable to remove existing tables. %s" % e) 28 | return 1 29 | print("dropped existing tables") 30 | conn.commit() 31 | conn.close() 32 | return 0 33 | except Exception as e: 34 | print("unable to connect to the database. %s" % e) 35 | return 1 36 | 37 | 38 | def create_schema(query_root, host, port, db_name, user, password, prep_query_dir): 39 | """Creates the schema for the tests. 
Drops the tables if they exist 40 | 41 | Args: 42 | query_root (str): Directory in which generated queries directory exists 43 | host (str): IP/hostname of the PG instance 44 | port (int): port for the PG instance 45 | db_name (str): name of the tpch database 46 | user (str): user for the PG instance 47 | password (str): password for the PG instance 48 | prep_query_dir (str): directory with queries for schema creation 49 | 50 | Return: 51 | 0 if successful 52 | non zero otherwise 53 | """ 54 | try: 55 | conn = pgdb.PGDB(host, port, db_name, user, password) 56 | try: 57 | conn.executeQueryFromFile(os.path.join(query_root, prep_query_dir, "create_tbl.sql")) 58 | except Exception as e: 59 | print("unable to run create tables. %s" % e) 60 | return 1 61 | conn.commit() 62 | conn.close() 63 | except Exception as e: 64 | print("unable to connect to the database. %s" % e) 65 | return 1 66 | 67 | 68 | def load_tables(data_dir, host, port, db_name, user, password, tables, load_dir): 69 | """Loads data into tables. Expects that tables are already empty. 70 | 71 | Args: 72 | data_dir (str): Directory in which load data exists 73 | host (str): IP/hostname of the PG instance 74 | port (int): port for the PG instance 75 | db_name (str): name of the tpch database 76 | user (str): user for the PG instance 77 | password (str): password for the PG instance 78 | tables (str): list of tables 79 | load_dir (str): directory with data files to be loaded 80 | 81 | Return: 82 | 0 if successful 83 | non zero otherwise 84 | """ 85 | try: 86 | conn = pgdb.PGDB(host, port, db_name, user, password) 87 | try: 88 | for table in tables: 89 | filepath = os.path.join(data_dir, load_dir, table.lower() + ".tbl.csv") 90 | conn.copyFrom(filepath, separator="|", table=table) 91 | conn.commit() 92 | except Exception as e: 93 | print("unable to run load tables. %s" %e) 94 | return 1 95 | conn.close() 96 | return 0 97 | except Exception as e: 98 | print("unable to connect to the database. %s" % e) 99 | return 1 100 | 101 | 102 | def index_tables(query_root, host, port, db_name, user, password, prep_query_dir): 103 | """Creates indexes and foreign keys for loaded tables. 104 | 105 | Args: 106 | query_root (str): Directory in which preparation queries directory exists 107 | host (str): IP/hostname of the PG instance 108 | port (int): port for the PG instance 109 | db_name (str): name of the tpch database 110 | user (str): user for the PG instance 111 | password (str): password for the PG instance 112 | prep_query_dir (str): directory with create index script 113 | 114 | Return: 115 | 0 if successful 116 | non zero otherwise 117 | """ 118 | try: 119 | conn = pgdb.PGDB(host, port, db_name, user, password) 120 | try: 121 | conn.executeQueryFromFile(os.path.join(query_root, prep_query_dir, "create_idx.sql")) 122 | conn.commit() 123 | except Exception as e: 124 | print("unable to run index tables. %s" % e) 125 | return 1 126 | conn.close() 127 | return 0 128 | except Exception as e: 129 | print("unable to connect to the database. %s" % e) 130 | return 1 131 | -------------------------------------------------------------------------------- /tpch4pgsql/postgresqldb.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | 3 | 4 | class PGDB: 5 | """Class for connections to PostgreSQL database 6 | """ 7 | __connection__ = None 8 | __cursor__ = None 9 | 10 | def __init__(self, host, port, db_name, user, password): 11 | # Exception handling is done by the method using this. 
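        # The DSN below is assembled with plain %-formatting, so credentials that
        # contain single quotes or spaces would need escaping by the caller.  A
        # quoting-safe alternative (sketch only, not what this class currently uses)
        # is the keyword form of psycopg2.connect:
        #     psycopg2.connect(host=host, port=port, dbname=db_name,
        #                      user=user, password=password)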
12 | self.__connection__ = psycopg2.connect("host='%s' port='%s' dbname='%s' user='%s' password='%s'" % 13 | (host, port, db_name, user, password)) 14 | self.__cursor__ = self.__connection__.cursor() 15 | 16 | def close(self): 17 | if self.__cursor__ is not None: 18 | self.__cursor__.close() 19 | self.__cursor__ = None 20 | if self.__connection__ is not None: 21 | self.__connection__.close() 22 | self.__connection__ = None 23 | 24 | def executeQueryFromFile(self, filepath, function=None): 25 | if function is None: 26 | function = lambda x: x 27 | with open(filepath) as query_file: 28 | query = query_file.read() 29 | query = function(query) 30 | return self.executeQuery(query) 31 | 32 | def executeQuery(self, query): 33 | if self.__cursor__ is not None: 34 | self.__cursor__.execute(query) 35 | return 0 36 | else: 37 | print("database has been closed") 38 | return 1 39 | 40 | def copyFrom(self, filepath, separator, table): 41 | if self.__cursor__ is not None: 42 | with open(filepath, 'r') as in_file: 43 | self.__cursor__.copy_from(in_file, table=table, sep=separator) 44 | return 0 45 | else: 46 | print("database has been closed") 47 | return 1 48 | 49 | def commit(self): 50 | if self.__connection__ is not None: 51 | self.__connection__.commit() 52 | return 0 53 | else: 54 | print("cursor not initialized") 55 | return 1 56 | -------------------------------------------------------------------------------- /tpch4pgsql/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import re 4 | import subprocess 5 | 6 | 7 | def build_dbgen(dbgen_dir): 8 | """Compiles the dbgen from source. 9 | 10 | The Makefile must be present in the same directory as this script. 11 | 12 | Args: 13 | dbgen_dir (str): Directory in which the source code is placed. 14 | 15 | Return: 16 | 0 if successful 17 | non zero otherwise 18 | """ 19 | cur_dir = os.getcwd() 20 | p = subprocess.Popen(["make", "-f", os.path.join(cur_dir, "Makefile")], cwd=dbgen_dir) 21 | p.communicate() 22 | return p.returncode 23 | 24 | 25 | def inner_generate_data(data_dir, dbgen_dir, file_pattern, out_ext): 26 | """Generate data for load/update/delete operations on the tables. 27 | 28 | This function is used by different stages of function generate_data(): load / update / delete 29 | 30 | Args: 31 | data_dir (str): Root directory for storing generated data and scripts. 32 | dbgen_dir (str): Directory in which the source code is placed. 33 | file_pattern (str): file pattern 34 | out_ext (str): output file extension 35 | 36 | Return: 37 | 0 if successful 38 | non zero otherwise 39 | """ 40 | try: 41 | os.makedirs(data_dir, exist_ok=True) 42 | for in_fname in glob.glob(os.path.join(dbgen_dir, file_pattern)): 43 | fname = os.path.basename(in_fname) 44 | out_fname = os.path.join(data_dir, fname + out_ext) 45 | try: 46 | with open(in_fname) as in_file, open(out_fname, "w") as out_file: 47 | for inline in in_file: 48 | outline = re.sub("\|$", "", inline) 49 | out_file.write(outline) 50 | os.remove(in_fname) 51 | except IOError as e: 52 | print("something bad happened while transforming data files. (%s)" % e) 53 | return 1 54 | except IOError as e: 55 | print("unable to create data directory %s. (%s)" % (data_dir, e)) 56 | return 1 57 | # All files written successfully. Return success code. 58 | return 0 59 | 60 | 61 | def generate_data(dbgen_dir, data_dir, load_dir, update_dir, delete_dir, scale, num_streams): 62 | """Generates data for the loading into tables. 
63 | 64 | Args: 65 | dbgen_dir (str): Directory in which the source code is placed. 66 | data_dir (str): Directory where generated data is to be placed. 67 | load_dir (str): Subdirectory where data to be loaded is to be placed. 68 | update_dir (str): Subdirectory where scripts with data update operations are to be placed. 69 | delete_dir (str): Subdirectory where scripts with data delete operations are to be placed. 70 | scale (float): Amount of data to be generated. 1 = 1GB. 71 | num_streams (int): Number of streams on which the throughput tests are going to be performed. 72 | 73 | Return: 74 | 0 if successful 75 | non zero otherwise 76 | """ 77 | p = subprocess.Popen([os.path.join(".", "dbgen"), "-vf", "-s", str(scale)], cwd=dbgen_dir) 78 | p.communicate() 79 | if not p.returncode: 80 | load_path = os.path.join(data_dir, load_dir) 81 | if inner_generate_data(load_path, dbgen_dir, "*.tbl", ".csv"): 82 | print("unable to generate data for load phase") 83 | return 1 84 | print("generated data for the load phase") 85 | else: 86 | return p.returncode 87 | 88 | # Update/Delete phase data 89 | # we generate num_streams + 1 number of updates because 1 is used by the power tests 90 | p = subprocess.Popen([os.path.join(".", "dbgen"), "-vf", "-s", str(scale), "-U", str(num_streams + 1)], 91 | cwd=dbgen_dir) 92 | p.communicate() 93 | if not p.returncode: 94 | update_path = os.path.join(data_dir, update_dir) 95 | delete_path = os.path.join(data_dir, delete_dir) 96 | if inner_generate_data(update_path, dbgen_dir, "*.tbl.u*", ".csv"): 97 | print("unable to generate data for the update phase") 98 | return 1 99 | print("generated data for the update phase") 100 | if inner_generate_data(delete_path, dbgen_dir, "delete.*", ".csv"): 101 | print("unable to generate data for the delete phase") 102 | return 1 103 | print("generated data for the delete phase") 104 | # All files written successfully. Return success code. 105 | return 0 106 | else: 107 | return p.returncode 108 | 109 | 110 | def generate_queries(dbgen_dir, query_root, template_query_dir, generated_query_dir): 111 | """Generates queries for performance tests. 112 | 113 | Args: 114 | dbgen_dir (str): Directory in which the source code is placed. 115 | query_root (str): Directory in which query templates directory exists. 116 | Also the place where the generated queries are going to be placed. 117 | template_query_dir (str): Subdirectory where template SQL queries are to be placed. 118 | generated_query_dir (str): Subdirectory where generated SQL queries are to be placed. 
119 | 120 | Return: 121 | 0 if successful 122 | non zero otherwise 123 | """ 124 | query_root = os.path.abspath(query_root) 125 | dss_query_path = os.path.join(query_root, template_query_dir) 126 | query_env = os.environ.copy() 127 | query_env['DSS_QUERY'] = dss_query_path 128 | query_gen_path = os.path.join(query_root, generated_query_dir) 129 | os.makedirs(query_gen_path, exist_ok=True) 130 | for i in range(1, 23): 131 | try: 132 | with open(os.path.join(query_gen_path, str(i) + ".sql"), "w") as out_file: 133 | p = subprocess.Popen([os.path.join(".", "qgen"), str(i)], 134 | cwd=dbgen_dir, env=query_env, stdout=out_file) 135 | p.communicate() 136 | if p.returncode: 137 | print("Process returned non zero when generating query number %s" % i) 138 | return p.returncode 139 | except IOError as e: 140 | print("IO Error during query generation %s" % e) 141 | return 1 142 | return p.returncode 143 | 144 | -------------------------------------------------------------------------------- /tpch4pgsql/query.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | from itertools import zip_longest 5 | from multiprocessing import Process, Queue 6 | 7 | from tpch4pgsql import postgresqldb as pgdb, result as r 8 | 9 | POWER = "power" 10 | THROUGHPUT = "throughput" 11 | QUERY_METRIC = "query_stream_%s_query_%s" 12 | REFRESH_METRIC = "refresh_stream_%s_func_%s" 13 | THROUGHPUT_TOTAL_METRIC = "throughput_test_total" 14 | 15 | QUERY_ORDER = [ # As given in appendix A of the TPCH-specification 16 | [14, 2, 9, 20, 6, 17, 18, 8, 21, 13, 3, 22, 16, 4, 11, 15, 1, 10, 19, 5, 7, 12], 17 | [21, 3, 18, 5, 11, 7, 6, 20, 17, 12, 16, 15, 13, 10, 2, 8, 14, 19, 9, 22, 1, 4], 18 | [6, 17, 14, 16, 19, 10, 9, 2, 15, 8, 5, 22, 12, 7, 13, 18, 1, 4, 20, 3, 11, 21], 19 | [8, 5, 4, 6, 17, 7, 1, 18, 22, 14, 9, 10, 15, 11, 20, 2, 21, 19, 13, 16, 12, 3], 20 | [5, 21, 14, 19, 15, 17, 12, 6, 4, 9, 8, 16, 11, 2, 10, 18, 1, 13, 7, 22, 3, 20], 21 | [21, 15, 4, 6, 7, 16, 19, 18, 14, 22, 11, 13, 3, 1, 2, 5, 8, 20, 12, 17, 10, 9], 22 | [10, 3, 15, 13, 6, 8, 9, 7, 4, 11, 22, 18, 12, 1, 5, 16, 2, 14, 19, 20, 17, 21], 23 | [18, 8, 20, 21, 2, 4, 22, 17, 1, 11, 9, 19, 3, 13, 5, 7, 10, 16, 6, 14, 15, 12], 24 | [19, 1, 15, 17, 5, 8, 9, 12, 14, 7, 4, 3, 20, 16, 6, 22, 10, 13, 2, 21, 18, 11], 25 | [8, 13, 2, 20, 17, 3, 6, 21, 18, 11, 19, 10, 15, 4, 22, 1, 7, 12, 9, 14, 5, 16], 26 | [6, 15, 18, 17, 12, 1, 7, 2, 22, 13, 21, 10, 14, 9, 3, 16, 20, 19, 11, 4, 8, 5], 27 | [15, 14, 18, 17, 10, 20, 16, 11, 1, 8, 4, 22, 5, 12, 3, 9, 21, 2, 13, 6, 19, 7], 28 | [1, 7, 16, 17, 18, 22, 12, 6, 8, 9, 11, 4, 2, 5, 20, 21, 13, 10, 19, 3, 14, 15], 29 | [21, 17, 7, 3, 1, 10, 12, 22, 9, 16, 6, 11, 2, 4, 5, 14, 8, 20, 13, 18, 15, 19], 30 | [2, 9, 5, 4, 18, 1, 20, 15, 16, 17, 7, 21, 13, 14, 19, 8, 22, 11, 10, 3, 12, 6], 31 | [16, 9, 17, 8, 14, 11, 10, 12, 6, 21, 7, 3, 15, 5, 22, 20, 1, 13, 19, 2, 4, 18], 32 | [1, 3, 6, 5, 2, 16, 14, 22, 17, 20, 4, 9, 10, 11, 15, 8, 12, 19, 18, 13, 7, 21], 33 | [3, 16, 5, 11, 21, 9, 2, 15, 10, 18, 17, 7, 8, 19, 14, 13, 1, 4, 22, 20, 6, 12], 34 | [14, 4, 13, 5, 21, 11, 8, 6, 3, 17, 2, 20, 1, 19, 10, 9, 12, 18, 15, 7, 22, 16], 35 | [4, 12, 22, 14, 5, 15, 16, 2, 8, 10, 17, 9, 21, 7, 3, 6, 13, 18, 11, 20, 19, 1], 36 | [16, 15, 14, 13, 4, 22, 18, 19, 7, 1, 12, 17, 5, 10, 20, 3, 9, 21, 11, 2, 6, 8], 37 | [20, 14, 21, 12, 15, 17, 4, 19, 13, 10, 11, 1, 16, 5, 18, 7, 8, 22, 9, 6, 3, 2], 38 | [16, 14, 13, 2, 21, 10, 11, 4, 1, 22, 18, 12, 19, 5, 7, 8, 6, 3, 15, 20, 9, 17], 
39 | [18, 15, 9, 14, 12, 2, 8, 11, 22, 21, 16, 1, 6, 17, 5, 10, 19, 4, 20, 13, 3, 7], 40 | [7, 3, 10, 14, 13, 21, 18, 6, 20, 4, 9, 8, 22, 15, 2, 1, 5, 12, 19, 17, 11, 16], 41 | [18, 1, 13, 7, 16, 10, 14, 2, 19, 5, 21, 11, 22, 15, 8, 17, 20, 3, 4, 12, 6, 9], 42 | [13, 2, 22, 5, 11, 21, 20, 14, 7, 10, 4, 9, 19, 18, 6, 3, 1, 8, 15, 12, 17, 16], 43 | [14, 17, 21, 8, 2, 9, 6, 4, 5, 13, 22, 7, 15, 3, 1, 18, 16, 11, 10, 12, 20, 19], 44 | [10, 22, 1, 12, 13, 18, 21, 20, 2, 14, 16, 7, 15, 3, 4, 17, 5, 19, 6, 8, 9, 11], 45 | [10, 8, 9, 18, 12, 6, 1, 5, 20, 11, 17, 22, 16, 3, 13, 2, 15, 21, 14, 19, 7, 4], 46 | [7, 17, 22, 5, 3, 10, 13, 18, 9, 1, 14, 15, 21, 19, 16, 12, 8, 6, 11, 20, 4, 2], 47 | [2, 9, 21, 3, 4, 7, 1, 11, 16, 5, 20, 19, 18, 8, 17, 13, 10, 12, 15, 6, 14, 22], 48 | [15, 12, 8, 4, 22, 13, 16, 17, 18, 3, 7, 5, 6, 1, 9, 11, 21, 10, 14, 20, 19, 2], 49 | [15, 16, 2, 11, 17, 7, 5, 14, 20, 4, 21, 3, 10, 9, 12, 8, 13, 6, 18, 19, 22, 1], 50 | [1, 13, 11, 3, 4, 21, 6, 14, 15, 22, 18, 9, 7, 5, 10, 20, 12, 16, 17, 8, 19, 2], 51 | [14, 17, 22, 20, 8, 16, 5, 10, 1, 13, 2, 21, 12, 9, 4, 18, 3, 7, 6, 19, 15, 11], 52 | [9, 17, 7, 4, 5, 13, 21, 18, 11, 3, 22, 1, 6, 16, 20, 14, 15, 10, 8, 2, 12, 19], 53 | [13, 14, 5, 22, 19, 11, 9, 6, 18, 15, 8, 10, 7, 4, 17, 16, 3, 1, 12, 2, 21, 20], 54 | [20, 5, 4, 14, 11, 1, 6, 16, 8, 22, 7, 3, 2, 12, 21, 19, 17, 13, 10, 15, 18, 9], 55 | [3, 7, 14, 15, 6, 5, 21, 20, 18, 10, 4, 16, 19, 1, 13, 9, 8, 17, 11, 12, 22, 2], 56 | [13, 15, 17, 1, 22, 11, 3, 4, 7, 20, 14, 21, 9, 8, 2, 18, 16, 6, 10, 12, 5, 19] 57 | ] 58 | NUM_QUERIES = len(QUERY_ORDER[0]) # 22 59 | 60 | 61 | def grouper(iterable, n, fillvalue=None): 62 | """Fill iterable up to N values by using fillvalue 63 | 64 | :param iterable: iterable 65 | :param n: number of values needed 66 | :param fillvalue: value to be used to fill missing values 67 | :return: list of values filled up to n elements by using fillvalue 68 | """ 69 | args = [iter(iterable)] * n 70 | return zip_longest(*args, fillvalue=fillvalue) 71 | 72 | 73 | def insert_lineitem(cols, conn): 74 | """Insert a row into table LINEITEM 75 | 76 | :param cols: tuple with values to be inserted, 77 | order of the values must be the same as order of the columns in the target table 78 | :param conn: open connection to the database 79 | :return: 0 if successful, 1 otherwise 80 | """ 81 | li_insert_stmt = """INSERT INTO lineitem VALUES (%s, %s, %s, %s, %s, %s, %s, %s, '%s', 82 | '%s', '%s', '%s', '%s', '%s', '%s', '%s')""" % cols 83 | conn.executeQuery(li_insert_stmt) 84 | 85 | 86 | def refresh_func1(conn, data_dir, update_dir, stream, num_streams, verbose): 87 | """Run refresh function #1 (update) 88 | 89 | :param conn: open connection to the database 90 | :param data_dir: subdirectory with data to be loaded 91 | :param update_dir: subdirectory with data to be updated 92 | :param stream: stream number 93 | :param num_streams: total number of streams 94 | :param verbose: True if more verbose output is required 95 | :return: 0 if successful, 1 otherwise 96 | """ 97 | try: 98 | if verbose: 99 | print("Running refresh function #1 in stream #%s" % stream) 100 | file_nr = stream + 1 # generated files are named 1,2,3,... while streams are indexed 0,1,2,... 
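        # RF1 (the "new sales" refresh): dbgen writes paired orders.tbl.uN and
        # lineitem.tbl.uN files in which every ORDERS row is followed by its one
        # to seven LINEITEM rows sharing the same order key.  The loop below
        # therefore inserts an order, then keeps consuming lineitem rows while
        # their L_ORDERKEY still matches; a read-ahead row that already belongs
        # to the next order is parked in todo_licols until that order has been
        # inserted.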
101 | filepath_o = os.path.join(data_dir, update_dir, "orders.tbl.u" + str(file_nr) + ".csv") 102 | filepath_l = os.path.join(data_dir, update_dir, "lineitem.tbl.u" + str(file_nr) + ".csv") 103 | with open(filepath_o) as orders_file, open(filepath_l) as lineitem_file: 104 | todo_licols = None 105 | for orders_lines in grouper(orders_file, 100, ''): 106 | orders_gen = [x.strip() for x in orders_lines if x.strip()] 107 | for order_line in orders_gen: 108 | o_cols = tuple(order_line.split('|')) 109 | o_insert_stmt = "INSERT INTO ORDERS VALUES (%s, %s, '%s', %s, '%s', '%s', '%s', %s, '%s')" % o_cols 110 | conn.executeQuery(o_insert_stmt) 111 | # As per specification for every ORDERS row we add one to seven LINEITEM rows. 112 | if todo_licols: 113 | if todo_licols[0] != o_cols[0]: 114 | print("bad data file for lineitem. Does not match orders key") 115 | return 1 116 | else: 117 | insert_lineitem(todo_licols, conn) 118 | todo_licols = None 119 | lineitem_line = lineitem_file.readline() 120 | if lineitem_line: 121 | li_cols = tuple(lineitem_line.strip().split("|")) 122 | while li_cols and o_cols[0] == li_cols[0]: 123 | insert_lineitem(li_cols, conn) 124 | lineitem_line = lineitem_file.readline() 125 | if lineitem_line: 126 | li_cols = tuple(lineitem_line.strip().split("|")) 127 | else: 128 | li_cols = None 129 | if li_cols is not None: 130 | todo_licols = li_cols 131 | 132 | conn.commit() 133 | return 0 134 | except Exception as e: 135 | print("refresh function #1 failed. %s" % e) 136 | return 1 137 | 138 | 139 | def refresh_func2(conn, data_dir, delete_dir, stream, num_streams, verbose): 140 | """Run refresh function #2 (delete) 141 | 142 | :param conn: open connection to the database 143 | :param data_dir: subdirectory with data to be loaded 144 | :param delete_dir: subdirectory with data to be deleted 145 | :param stream: stream number 146 | :param num_streams: total number of streams 147 | :param verbose: True if more verbose output is required 148 | :return: 0 if successful, 1 otherwise 149 | """ 150 | try: 151 | if verbose: 152 | print("Running refresh function #2 in stream #%s" % stream) 153 | file_nr = stream + 1 154 | filepath = os.path.join(data_dir, delete_dir, "delete." + str(file_nr) + ".csv") 155 | with open(filepath, 'r') as in_file: 156 | for ids in grouper(in_file, 100, ''): 157 | query = "DELETE FROM orders WHERE O_ORDERKEY IN (%s)" % ", ".join([x.strip() for x in ids if x.strip()]) 158 | conn.executeQuery(query) 159 | conn.commit() 160 | return 0 161 | except Exception as e: 162 | print("refresh function #2 failed. %s" % e) 163 | return 1 164 | 165 | 166 | def run_query_stream(conn, query_root, generated_query_dir, stream, num_streams, result, verbose): 167 | """ 168 | 169 | :param conn: open connection to the database 170 | :param query_root: directory where generated SQL statements are stored 171 | :param generated_query_dir: subdirectory with generated queries 172 | :param stream: stream number 173 | :param num_streams: total number of streams 174 | :param result: result object for string start and stop times 175 | :param verbose: True if more verbose output is required 176 | :return: 0 if successful, 1 otherwise 177 | """ 178 | index = stream % len(QUERY_ORDER) 179 | order = QUERY_ORDER[index] 180 | for i in range(0, 22): 181 | try: 182 | if verbose: 183 | print("Running query #%s in stream #%s ..." 
% (order[i], stream)) 184 | filepath = os.path.join(query_root, generated_query_dir, str(order[i]) + ".sql") 185 | result.startTimer() 186 | conn.executeQueryFromFile(filepath) 187 | result.setMetric(QUERY_METRIC % (stream, order[i]), result.stopTimer()) 188 | except Exception as e: 189 | print("unable to execute query %s in stream %s: %s" % (order[i], stream, e)) 190 | return 1 191 | return 0 192 | 193 | 194 | def run_power_test(query_root, data_dir, update_dir, delete_dir, generated_query_dir, results_dir, 195 | host, port, database, user, password, 196 | run_timestamp, num_streams, verbose, read_only): 197 | """ 198 | 199 | :param query_root: directory where generated SQL statements are stored 200 | :param data_dir: subdirectory with data to be loaded 201 | :param update_dir: subdirectory with data to be updated 202 | :param delete_dir: subdirectory with data to be deleted 203 | :param generated_query_dir: subdirectory with generated queries 204 | :param results_dir: path to the results folder 205 | :param host: hostname where the Postgres database is running 206 | :param port: port number where the Postgres database is listening 207 | :param database: database name, where the benchmark will be run 208 | :param user: username of the Postgres user with full access to the benchmark DB 209 | :param password: password for the Postgres user 210 | :param run_timestamp: name of the run folder, format run_YYYYMMDD_HHMMSS 211 | :param num_streams: number of streams 212 | :param verbose: True if more verbose output is required 213 | :param read_only: True if no inserts/updates/deletes are to be run; can be used to run the same test multiple times 214 | without (re)loading the data, e.g. while developing 215 | :return: 0 if successful, 1 otherwise 216 | """ 217 | try: 218 | print("Power tests started ...") 219 | conn = pgdb.PGDB(host, port, database, user, password) 220 | result = r.Result("Power") 221 | result.startTimer() 222 | stream = 0 # constant for power tests 223 | # 224 | if not read_only: 225 | if refresh_func1(conn, data_dir, update_dir, stream, num_streams, verbose): 226 | return 1 227 | result.setMetric(REFRESH_METRIC % (stream, 1), result.stopTimer()) 228 | # 229 | if run_query_stream(conn, query_root, generated_query_dir, stream, num_streams, result, verbose): 230 | return 1 231 | # 232 | result.startTimer() 233 | if not read_only: 234 | if refresh_func2(conn, data_dir, delete_dir, stream, num_streams, verbose): 235 | return 1 236 | result.setMetric(REFRESH_METRIC % (stream, 2), result.stopTimer()) 237 | # 238 | print("Power tests finished.") 239 | if verbose: 240 | result.printMetrics() 241 | result.saveMetrics(results_dir, run_timestamp, "power") 242 | except Exception as e: 243 | print("unable to run power tests. 
DB connection failed: %s" % e) 244 | return 1 245 | return 0 246 | 247 | 248 | def run_throughput_inner(query_root, data_dir, generated_query_dir, 249 | host, port, database, user, password, 250 | stream, num_streams, queue, verbose): 251 | """ 252 | 253 | :param query_root: 254 | :param data_dir: subdirectory with data to be loaded 255 | :param generated_query_dir: subdirectory with generated queries 256 | :param host: hostname where the Postgres database is running 257 | :param port: port number where the Postgres database is listening 258 | :param database: database name, where the benchmark will be run 259 | :param user: username of the Postgres user with full access to the benchmark DB 260 | :param password: password for the Postgres user 261 | :param stream: stream number 262 | :param num_streams: number of streams 263 | :param queue: process queue 264 | :param verbose: True if more verbose output is required 265 | :return: none, uses exit(1) to abort on errors 266 | """ 267 | try: 268 | conn = pgdb.PGDB(host, port, database, user, password) 269 | result = r.Result("ThroughputQueryStream%s" % stream) 270 | if run_query_stream(conn, query_root, generated_query_dir, stream, num_streams, result, verbose): 271 | print("unable to finish query in stream #%s" % stream) 272 | exit(1) 273 | queue.put(result) 274 | except Exception as e: 275 | print("unable to connect to DB for query in stream #%s: %s" % (stream, e)) 276 | exit(1) 277 | 278 | 279 | def run_throughput_test(query_root, data_dir, update_dir, delete_dir, generated_query_dir, results_dir, 280 | host, port, database, user, password, 281 | run_timestamp, num_streams, verbose, read_only): 282 | """ 283 | 284 | :param query_root: 285 | :param data_dir: subdirectory with data to be loaded 286 | :param update_dir: subdirectory with data to be updated 287 | :param delete_dir: subdirectory with data to be deleted 288 | :param generated_query_dir: subdirectory with generated queries 289 | :param results_dir: path to the results folder 290 | :param host: hostname where the Postgres database is running 291 | :param port: port number where the Postgres database is listening 292 | :param database: database name, where the benchmark will be run 293 | :param user: username of the Postgres user with full access to the benchmark DB 294 | :param password: password for the Postgres user 295 | :param run_timestamp: name of the run folder, format run_YYYYMMDD_HHMMSS 296 | :param num_streams: number of streams 297 | :param verbose: True if more verbose output is required 298 | :param read_only: True if no inserts/updates/deletes are to be run; can be used to run the same test multiple times 299 | without (re)loading the data, e.g. while developing 300 | :return: 0 if successful, 1 otherwise 301 | """ 302 | try: 303 | print("Throughput tests started ...") 304 | conn = pgdb.PGDB(host, port, database, user, password) 305 | total = r.Result("ThroughputTotal") 306 | total.startTimer() 307 | processes = [] 308 | queue = Queue() 309 | for i in range(num_streams): 310 | stream = i + 1 311 | # queries 312 | print("Throughput tests in stream #%s started ..." 
% stream) 313 | p = Process(target=run_throughput_inner, 314 | args=(query_root, data_dir, generated_query_dir, 315 | host, port, database, user, password, 316 | stream, num_streams, queue, verbose)) 317 | processes.append(p) 318 | p.start() 319 | result = r.Result("ThroughputRefreshStream") 320 | for i in range(num_streams): 321 | stream = i + 1 322 | # refresh functions 323 | result.startTimer() 324 | if not read_only: 325 | if refresh_func1(conn, data_dir, update_dir, stream, num_streams, verbose): 326 | return 1 327 | result.setMetric(REFRESH_METRIC % (stream, 1), result.stopTimer()) 328 | # 329 | result.startTimer() 330 | if not read_only: 331 | if refresh_func2(conn, data_dir, delete_dir, stream, num_streams, verbose): 332 | return 1 333 | result.setMetric(REFRESH_METRIC % (stream, 2), result.stopTimer()) 334 | # 335 | queue.put(result) 336 | for p in processes: 337 | p.join() 338 | print("Throughput tests finished.") 339 | for i in range(queue.qsize()): 340 | res = queue.get(False) 341 | if verbose: 342 | res.printMetrics() 343 | res.saveMetrics(results_dir, run_timestamp, THROUGHPUT) 344 | # 345 | total.setMetric(THROUGHPUT_TOTAL_METRIC, total.stopTimer()) 346 | if verbose: 347 | total.printMetrics() 348 | total.saveMetrics(results_dir, run_timestamp, THROUGHPUT) 349 | # 350 | except Exception as e: 351 | print("unable to execute throughput tests: %s" % e) 352 | return 1 353 | return 0 354 | 355 | 356 | def get_json_files_from(path): 357 | """Get list of all JSON file names in path 358 | 359 | :param path: path to a folder 360 | :return: list of all JSON files, identified by file extension .json, not by content 361 | """ 362 | json_files = [pos_json for pos_json in os.listdir(path) if pos_json.endswith('.json')] 363 | json_files = [os.path.join(path, s) for s in json_files] 364 | return json_files 365 | 366 | 367 | def get_json_files(path): 368 | """Gather list of all JSON files in path, incl. subfolders 369 | It is expected, that the folder structure is as follows 370 | - path 371 | - run_YYYYMMDD_HHMMSS 372 | - power 373 | - ... JSON files ... 374 | - throughput 375 | - ... JSON files ... 
376 | 377 | :param path: path to be scanned (only "power" and "throughput" subfolders will be considered on level 2) 378 | :return: list of JSON file names from all subfolders with expected folder structure 379 | """ 380 | json_files = [] 381 | for run_timestamp in os.listdir(os.path.join(path)): 382 | for mode in [POWER, THROUGHPUT]: 383 | sub_dir = os.path.join(path, run_timestamp, mode) 384 | if os.path.exists(sub_dir) and os.path.isdir(sub_dir): 385 | json_files += get_json_files_from(sub_dir) 386 | return json_files 387 | 388 | 389 | def load_results(results_dir): 390 | """Load all results into a list 391 | 392 | :param results_dir: path to results directory 393 | :return: list of dictionary pairs with metric name as key and value as value 394 | """ 395 | results = [] 396 | for json_filename in get_json_files(results_dir): 397 | with open(json_filename, 'r') as json_file: 398 | raw = json_file.read() 399 | js = json.loads(raw) 400 | for key, value in js.items(): 401 | results.append({"key": key, "value": value}) 402 | return results 403 | 404 | 405 | def get_timedelta_in_seconds(time_interval): 406 | """Convert time delta as string into numeric value in seconds 407 | 408 | :param time_interval: time interval as string in format HH:MM:SS.FFFFFF 409 | :return: time interval in seconds 410 | """ 411 | if ":" not in time_interval: 412 | return 0 413 | (hours, minutes, sf) = time_interval.split(":") 414 | (seconds, fraction) = sf.split(".") if "." in sf else (0, 0) 415 | secs = int(hours) * 60 * 60 + \ 416 | int(minutes) * 60 + \ 417 | int(seconds) + \ 418 | int(fraction) / 1000000 419 | return secs 420 | 421 | 422 | def get_average(results, metric_name): 423 | """Calculate average value for the metric 424 | 425 | :param results: list of results 426 | :param metric_name: metric name 427 | :return: average value for value from results with specified metric name 428 | """ 429 | values = [js["value"] for js in results if js["key"] == metric_name] 430 | seconds = [get_timedelta_in_seconds(value) for value in values] 431 | avg = sum(seconds) / len(values) 432 | return avg 433 | 434 | 435 | def qi(results, i, s): 436 | """Calculate execution time for query Qi within the query stream s 437 | 438 | :param results: list of results 439 | :param i: the ordering number of the query ranging from 1 to 22 440 | :param s: either 0 for the power function or the position of the query stream for the throughput tests 441 | :return: execution time for query Qi within the query stream s 442 | """ 443 | assert(1 <= i <= 22) 444 | assert(0 <= s) 445 | metric_name = QUERY_METRIC % (s, i) 446 | ret = get_average(results, metric_name) 447 | return ret 448 | 449 | 450 | def ri(results, j, s): 451 | """Calculate execution time for the refresh function RFi within a refresh stream s 452 | 453 | :param results: list of results 454 | :param j: ordering function of the refresh function ranging from 1 to 2 455 | :param s: either 0 for the power function 456 | or the position of the pair of refresh functions in the stream for the throughput tests 457 | :return: execution time for the refresh function RFi within a refresh stream s 458 | """ 459 | assert(j == 1 or j == 2) 460 | assert(0 <= s) 461 | metric_name = REFRESH_METRIC % (s, j) 462 | ret = get_average(results, metric_name) 463 | return ret 464 | 465 | 466 | def ts(results): 467 | """Calculate average total time needed to execute the throughput tests 468 | 469 | :param results: list of results 470 | :return: total time needed to execute the throughput tests 471 | """ 
472 | metric_name = THROUGHPUT_TOTAL_METRIC 473 | ret = get_average(results, metric_name) 474 | return ret 475 | 476 | 477 | def get_power_size(results, scale_factor): 478 | """Calculate the Power@Size 479 | 480 | :param results: list of results 481 | :param scale_factor: scale factor 482 | :return: Power@Size 483 | """ 484 | qi_product = 1 485 | for i in range(1, NUM_QUERIES + 1): 486 | qi_product *= qi(results, i, 0) 487 | ri_product = 1 488 | for j in [1, 2]: # two refresh functions 489 | ri_product *= ri(results, j, 0) 490 | denominator = math.pow(qi_product * ri_product, 1/24) 491 | power_size = (3600 / denominator) * scale_factor 492 | return power_size 493 | 494 | 495 | def get_throughput_size(results, scale_factor, num_streams): 496 | """Calculate the Troughput@Size 497 | 498 | :param results: list of results 499 | :param scale_factor: scale factor 500 | :param num_streams: number of streams 501 | :return: Troughput@Size 502 | """ 503 | throughput_size = ((num_streams * NUM_QUERIES) / ts(results)) * 3600 * scale_factor 504 | return throughput_size 505 | 506 | 507 | def get_qphh_size(power_size, throughput_size): 508 | """Calculate QphH@Size 509 | 510 | :param power_size: Power@Size 511 | :param throughput_size: Throughput@Size 512 | :return: QphH@Size 513 | """ 514 | qphh_size = math.sqrt(power_size * throughput_size) 515 | return qphh_size 516 | 517 | 518 | def calc_metrics(results_dir, run_timestamp, scale_factor, num_streams): 519 | """Calculate metrics and save them in an output JSON file 520 | 521 | :param results_dir: path to the results folder 522 | :param run_timestamp: name of the run folder, format run_YYYYMMDD_HHMMSS 523 | :param scale_factor: scale factor 524 | :param num_streams: number of streams 525 | :return: none 526 | """ 527 | results = load_results(results_dir) 528 | res = r.Result("Metric") 529 | # 530 | power_size = get_power_size(results, scale_factor) 531 | res.setMetric("power_size", power_size) 532 | print("Power@Size = %s" % power_size) 533 | # 534 | throughput_size = get_throughput_size(results, scale_factor, num_streams) 535 | res.setMetric("throughput_size", throughput_size) 536 | print("Throughput@Size = %s" % throughput_size) 537 | # 538 | qphh_size = get_qphh_size(power_size, throughput_size) 539 | res.setMetric("qphh_size", qphh_size) 540 | print("QphH@Size = %s" % qphh_size) 541 | # 542 | res.printMetrics("Metrics") 543 | res.saveMetrics(results_dir, run_timestamp, "metrics") 544 | -------------------------------------------------------------------------------- /tpch4pgsql/result.py: -------------------------------------------------------------------------------- 1 | import json 2 | import datetime as dt 3 | import os 4 | 5 | 6 | class Result: 7 | """Class for storing result for metrics, with start/stop times, used for calculation of benchmark metrics 8 | 9 | """ 10 | def __init__(self, title = None): 11 | self.__title__ = "Result" 12 | if title: 13 | self.__title__ = title 14 | # Stuff for time tracking 15 | self.__start__ = None 16 | # Metrics stored in dict 17 | self.__metrics__ = dict() 18 | 19 | def startTimer(self): 20 | self.__start__ = dt.datetime.now() 21 | 22 | def stopTimer(self): 23 | if self.__start__ is not None: 24 | delta = dt.datetime.now() - self.__start__ 25 | self.__start__ = None 26 | return delta 27 | else: 28 | print("timer not started") 29 | return None 30 | 31 | def setMetric(self, name, value): 32 | self.__metrics__[name] = value 33 | 34 | def printPadded(self, txt, width, fill='='): 35 | space = ' ' 36 | w = int((width - 
len(txt) - 2 * len(space)) / 2) 37 | x = len(txt) % 2 # extra fill char if needed 38 | print(fill * w + space + txt + space + fill * x + fill * w) 39 | 40 | def printResultHeader(self, title): 41 | title = self.__title__ if not title else title 42 | width = 60 43 | print("="*width) 44 | self.printPadded(title, width) 45 | print("="*width) 46 | 47 | def printResultFooter(self): 48 | self.printResultHeader("End Results") 49 | 50 | def printMetrics(self, title=None): 51 | self.printResultHeader(title) 52 | for key, value in self.__metrics__.items(): 53 | print("%s: %s" % (key, value)) 54 | self.printResultFooter() 55 | 56 | def saveMetrics(self, results_dir, run_timestamp, folder): 57 | path = os.path.join(results_dir, run_timestamp, folder) 58 | os.makedirs(path, exist_ok=True) 59 | metrics = dict() 60 | for key, value in self.__metrics__.items(): 61 | metrics[key] = str(value) 62 | with open(os.path.join(path, self.__title__ + '.json'), 'w') as fp: 63 | json.dump(metrics, fp, indent=4, sort_keys=True) 64 | -------------------------------------------------------------------------------- /tpch_pgsql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | import time 6 | import argparse 7 | import getpass 8 | 9 | from tpch4pgsql import postgresqldb as pgdb, load, query, prepare as prep, result as r 10 | 11 | # Constants 12 | 13 | # default values for command line arguments: 14 | DEFAULT_HOST = "localhost" 15 | DEFAULT_PORT = 5432 16 | DEFAULT_USERNAME = "postgres" 17 | DEFAULT_PASSWORD = "test123" 18 | DEFAULT_DBNAME = "tpch" 19 | DEFAULT_DATA_DIR = os.path.join(".", "data") 20 | DEFAULT_QUERY_ROOT = os.path.join(".", "query_root") 21 | DEFAULT_DBGEN_DIR = os.path.join(".", "tpch-dbgen") 22 | DEFAULT_SCALE = 1.0 23 | DEFAULT_NUM_STREAMS = 0 24 | 25 | # other constants 26 | LOAD_DIR = "load" 27 | UPDATE_DIR = "update" 28 | DELETE_DIR = "delete" 29 | TEMPLATE_QUERY_DIR = "perf_query_template" 30 | GENERATED_QUERY_DIR = "perf_query_gen" 31 | PREP_QUERY_DIR = "prep_query" 32 | RESULTS_DIR = "results" 33 | TABLES = ['LINEITEM', 'PARTSUPP', 'ORDERS', 'CUSTOMER', 'SUPPLIER', 'NATION', 'REGION', 'PART'] 34 | # End Constants 35 | 36 | 37 | class Password(argparse.Action): 38 | """Class for handling Passwords in command line arguments 39 | 40 | """ 41 | def __call__(self, parser, namespace, values, option_string): 42 | if values is None: 43 | values = getpass.getpass() 44 | setattr(namespace, self.dest, values) 45 | 46 | 47 | def scale_to_num_streams(scale): 48 | """Converts scale factor to number of streams as defined in 49 | https://github.com/slavong/tpch-pgsql/blob/master/iceis2012.pdf 50 | on page 6 in section 3.3.4 Throughput Tests in table 2 51 | 52 | :param scale: scale factor, 1.0 = 1GB 53 | :return: number of streams 54 | """ 55 | num_streams = 2 56 | if scale <= 1: 57 | num_streams = 2 58 | elif scale <= 10: 59 | num_streams = 3 60 | elif scale <= 30: 61 | num_streams = 4 62 | elif scale <= 100: 63 | num_streams = 5 64 | elif scale <= 300: 65 | num_streams = 6 66 | elif scale <= 1000: 67 | num_streams = 7 68 | elif scale <= 3000: 69 | num_streams = 8 70 | elif scale <= 10000: 71 | num_streams = 9 72 | elif scale <= 30000: 73 | num_streams = 10 74 | else: 75 | num_streams = 11 76 | return num_streams 77 | 78 | 79 | def main(phase, host, port, user, password, database, 80 | dbgen_dir, data_dir, query_root, 81 | scale, num_streams, verbose, read_only): 82 | # TODO: unify doctsring, some is in 
83 |     # TODO: finish sphinx integration
84 |     """Runs main code for three different phases.
85 |     It expects parsed command line arguments, with defaults already applied.
86 | 
87 |     :param phase: prepare, load or query
88 |     :param host: hostname where the Postgres database is running
89 |     :param port: port number where the Postgres database is listening
90 |     :param user: username of the Postgres user with full access to the benchmark DB
91 |     :param password: password for the Postgres user
92 |     :param database: database name, where the benchmark will be run
93 |     :param dbgen_dir: directory where dbgen is to be run
94 |     :param data_dir: subdirectory with data to be loaded
95 |     :param query_root: subdirectory with SQL statements
96 |     :param scale: scale factor, 1.0 = 1GB
97 |     :param num_streams: number of streams
98 |     :param verbose: True if more verbose output is required
99 |     :param read_only: True if no update/delete statements are to be executed during throughput test (query phase)
100 |     :return: no return value, uses exit(1) if something goes wrong
101 |     """
102 |     run_timestamp = "run_%s" % time.strftime("%Y%m%d_%H%M%S", time.gmtime())
103 |     if phase == "prepare":
104 |         # try to build dbgen from source and quit if failed
105 |         if prep.build_dbgen(dbgen_dir):
106 |             print("could not build the dbgen/querygen. Check logs.")
107 |             exit(1)
108 |         print("built dbgen from source")
109 |         # try to generate data files
110 |         if prep.generate_data(dbgen_dir, data_dir,
111 |                               LOAD_DIR, UPDATE_DIR, DELETE_DIR,
112 |                               scale, num_streams):
113 |             print("could not generate data files.")
114 |             exit(1)
115 |         print("created data files in %s" % data_dir)
116 |         if prep.generate_queries(dbgen_dir, query_root, TEMPLATE_QUERY_DIR, GENERATED_QUERY_DIR):
117 |             print("could not generate query files")
118 |             exit(1)
119 |         print("created query files in %s" % query_root)
120 |     elif phase == "load":
121 |         result = r.Result("Load")
122 |         if load.clean_database(query_root, host, port, database, user, password, TABLES):
123 |             print("could not clean the database.")
124 |             exit(1)
125 |         print("cleaned database %s" % database)
126 |         result.startTimer()
127 |         if load.create_schema(query_root, host, port, database, user, password, PREP_QUERY_DIR):
128 |             print("could not create schema.")
129 |             exit(1)
130 |         result.setMetric("create_schema", result.stopTimer())
131 |         print("done creating schemas")
132 |         result.startTimer()
133 |         if load.load_tables(data_dir, host, port, database, user, password, TABLES, LOAD_DIR):
134 |             print("could not load data to tables")
135 |             exit(1)
136 |         result.setMetric("load_data", result.stopTimer())
137 |         print("done loading data to tables")
138 |         result.startTimer()
139 |         if load.index_tables(query_root, host, port, database, user, password, PREP_QUERY_DIR):
140 |             print("could not create indexes for tables")
141 |             exit(1)
142 |         result.setMetric("index_tables", result.stopTimer())
143 |         print("done creating indexes and foreign keys")
144 |         result.printMetrics()
145 |         result.saveMetrics(RESULTS_DIR, run_timestamp, "load")
146 |     elif phase == "query":
147 |         if query.run_power_test(query_root, data_dir, UPDATE_DIR, DELETE_DIR, GENERATED_QUERY_DIR, RESULTS_DIR,
148 |                                 host, port, database, user, password,
149 |                                 run_timestamp, num_streams, verbose, read_only):
150 |             print("running power tests failed")
151 |             exit(1)
152 |         # Throughput tests
153 |         if query.run_throughput_test(query_root, data_dir, UPDATE_DIR, DELETE_DIR, GENERATED_QUERY_DIR, RESULTS_DIR,
154 |                                      host, port, database, user, password,
155 |                                      run_timestamp, num_streams, verbose, read_only):
156 |             print("running throughput tests failed")
157 |             exit(1)
158 |         print("done performance tests")
159 |         query.calc_metrics(RESULTS_DIR, run_timestamp, scale, num_streams)
160 | 
161 | 
162 | if __name__ == "__main__":
163 |     parser = argparse.ArgumentParser(description="tpch_pgsql")
164 | 
165 |     parser.add_argument("phase", choices=["prepare", "load", "query"],
166 |                         help="Phase of TPC-H benchmark to run.")
167 |     parser.add_argument("-H", "--host", default=DEFAULT_HOST,
168 |                         help="Address of host on which PostgreSQL instance runs; default is %s" % DEFAULT_HOST)
169 |     parser.add_argument("-p", "--port", type=int, default=DEFAULT_PORT,
170 |                         help="Port on which PostgreSQL instance runs; default is %s" % str(DEFAULT_PORT))
171 |     parser.add_argument("-U", "--username", default=DEFAULT_USERNAME,
172 |                         help="User for the PostgreSQL instance; default is %s" % DEFAULT_USERNAME)
173 |     parser.add_argument("-W", "--password", nargs='?', default=DEFAULT_PASSWORD, action=Password,
174 |                         help="Password for the PostgreSQL instance; default is %s" % DEFAULT_PASSWORD)
175 |     parser.add_argument("-d", "--dbname", default=DEFAULT_DBNAME,
176 |                         help="Name of the database; default is %s" % DEFAULT_DBNAME)
177 |     parser.add_argument("-i", "--data-dir", default=DEFAULT_DATA_DIR,
178 |                         help="Directory for generated data; default is %s" % DEFAULT_DATA_DIR)
179 |     parser.add_argument("-q", "--query-root", default=DEFAULT_QUERY_ROOT,
180 |                         help="Directory for query files; default is %s" % DEFAULT_QUERY_ROOT)
181 |     parser.add_argument("-g", "--dbgen-dir", default=DEFAULT_DBGEN_DIR,
182 |                         help="Directory containing tpch dbgen source; default is %s" % DEFAULT_DBGEN_DIR)
183 |     parser.add_argument("-s", "--scale", type=float, default=DEFAULT_SCALE,
184 |                         help="Size of the data generated, scale factor; default is %s = 1GB" % DEFAULT_SCALE)
185 |     parser.add_argument("-n", "--num-streams", type=int, default=DEFAULT_NUM_STREAMS,
186 |                         help="Number of streams to run the throughput tests with; default is %s" % DEFAULT_NUM_STREAMS +
187 |                              ", i.e. based on scale factor SF")
188 |     parser.add_argument("-b", "--verbose", action="store_true",
189 |                         help="Print more information to standard output")
190 |     parser.add_argument("-r", "--read-only", action="store_true",
191 |                         help="Do not execute refresh functions during the query phase, " +
192 |                              "which allows for running it repeatedly")
193 |     args = parser.parse_args()
194 | 
195 |     # Extract all arguments into variables
196 |     phase = args.phase
197 |     host = args.host
198 |     port = args.port
199 |     database = args.dbname
200 |     data_dir = args.data_dir
201 |     query_root = args.query_root
202 |     dbgen_dir = args.dbgen_dir
203 |     scale = args.scale
204 |     num_streams = args.num_streams
205 |     user = args.username
206 |     password = args.password
207 |     verbose = args.verbose
208 |     read_only = args.read_only
209 | 
210 |     # if no num_streams was provided, then calculate default based on scale factor
211 |     if num_streams == 0:
212 |         num_streams = scale_to_num_streams(scale)
213 | 
214 |     # main
215 |     main(phase, host, port, user, password, database, dbgen_dir, data_dir, query_root, scale, num_streams, verbose, read_only)
216 | 
--------------------------------------------------------------------------------
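Note on the composite metrics computed in tpch4pgsql/query.py above: Power@Size is the scale factor times 3600 divided by the geometric mean of the 22 power-test query timings and the 2 refresh-function timings (24 values in total), Throughput@Size is the total number of queries run across all throughput streams normalized to one hour and scaled, and QphH@Size is the geometric mean of the two. The sketch below is a minimal, self-contained illustration of that arithmetic with made-up timings; the helper names (power_at_size, throughput_at_size, qphh_at_size) and the literal numbers are assumptions for this example only and are not part of the package.

# Minimal sketch of the TPC-H composite metrics (assumed names and timings).
import math

NUM_QUERIES = 22  # queries per stream, as in tpch4pgsql/query.py


def power_at_size(query_secs, refresh_secs, scale_factor):
    # Geometric mean over the 22 query timings and the 2 refresh timings
    # (24 values), inverted to queries per hour and scaled by the scale factor.
    product = 1.0
    for t in list(query_secs) + list(refresh_secs):
        product *= t
    return (3600 / math.pow(product, 1 / 24)) * scale_factor


def throughput_at_size(total_secs, num_streams, scale_factor):
    # Queries executed across all streams, normalized to one hour and scaled.
    return ((num_streams * NUM_QUERIES) / total_secs) * 3600 * scale_factor


def qphh_at_size(power, throughput):
    # Composite metric: geometric mean of the two components.
    return math.sqrt(power * throughput)


# Hypothetical run at scale factor 1.0 (1 GB) with two throughput streams:
power = power_at_size([1.5] * 22, [4.0, 3.0], 1.0)  # roughly 2238 with these timings
throughput = throughput_at_size(120.0, 2, 1.0)      # 1320.0
print(power, throughput, qphh_at_size(power, throughput))

The same structure is what get_power_size, get_throughput_size and get_qphh_size implement, except that they read the individual query and refresh timings from the saved result files via qi(), ri() and ts() instead of taking them as arguments.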