├── .github └── workflows │ └── tests.yaml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── benchmark ├── __init__.py ├── cli.py ├── constants.py ├── job │ ├── __init__.py │ ├── cli.py │ ├── clone_job_queries.sh │ ├── job_schema.sql │ └── load_info.py ├── tests │ ├── __init__.py │ ├── benchmark_integtest_dbgym_config.yaml │ └── integtest_benchmark.py └── tpch │ ├── __init__.py │ ├── cli.py │ ├── clone_tpch_kit.sh │ ├── constants.py │ ├── load_info.py │ ├── tpch_constraints.sql │ └── tpch_schema.sql ├── dbgym_config.yaml ├── dbms ├── __init__.py ├── cli.py ├── load_info_base_class.py ├── postgres │ ├── __init__.py │ ├── _build_repo.sh │ ├── cli.py │ └── default_boot_config.yaml └── tests │ ├── __init__.py │ ├── dbms_integtest_dbgym_config.yaml │ └── integtest_dbms.py ├── gymlib_package ├── __init__.py ├── gymlib │ ├── __init__.py │ ├── infra_paths.py │ ├── pg.py │ ├── pg_conn.py │ ├── py.typed │ ├── tests │ │ ├── __init__.py │ │ ├── _set_up_gymlib_integtest_workspace.sh │ │ ├── filesystem_unittest_util.py │ │ ├── gymlib_integtest_dbgym_config.yaml │ │ ├── gymlib_integtest_util.py │ │ ├── integtest_pg_conn.py │ │ ├── integtest_tuning_artifacts.py │ │ ├── integtest_workload.py │ │ ├── unittest_filesystem_unittest_util.py │ │ └── unittest_workspace.py │ ├── tuning_artifacts.py │ ├── workload.py │ └── workspace.py └── pyproject.toml ├── orchestrate ├── __init__.py ├── clean.py ├── cli.py ├── replay.py └── tests │ ├── __init__.py │ ├── integtest_replay.py │ └── unittest_clean.py ├── scripts ├── __init__.py ├── _build_conda_env.sh ├── _load_per_machine_envvars.sh ├── _run_tests.py ├── build_agent_conda_env.sh ├── build_dbgym_conda_env.sh ├── check_format.sh ├── configs │ ├── .python_version │ ├── apt_requirements.txt │ ├── e2e_test_dbgym_config.yaml │ ├── mypy.ini │ └── requirements.txt ├── format.sh ├── install_sysdeps.sh ├── mypy.sh ├── pat_test.sh ├── pipfreeze.sh ├── quickstart.sh ├── run_integration_tests.sh └── run_unit_tests.sh ├── task.py └── util ├── __init__.py └── shell.py /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Static, Unit, Integration, and End-to-End Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | 7 | jobs: 8 | tests: 9 | # The code for the self-hosted runners is at https://github.com/wangpatrick57/dbgym-runners. 10 | runs-on: self-hosted 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.10' 19 | 20 | # We could choose to set up dependencies manually in the GHA runner instead of installing them during the GHA. 21 | # 22 | # However, I think it's better to do them in the GHA itself so that we're testing our dependency installation step 23 | # in addition to our actual code. It also removes the need to manually reinstall dependencies on the GHA runners 24 | # every time we add a new dependency. 25 | # 26 | # Note that the GHA runners are stateful. Dependencies installed from previous runs will still be on the runner. 27 | # This means this step will usually be pretty fast as most dependencies will already be cached. However, it also 28 | # means that past runs might interfere with the current run, so you sometimes may need to restart the GHA runners. 29 | 30 | # We need to do `. "$HOME/.cargo/env"` in each step for it to work. 
31 | - name: Install dependencies 32 | run: | 33 | pip install -r ./scripts/configs/requirements.txt 34 | pip install ./gymlib_package 35 | ./scripts/install_sysdeps.sh 36 | 37 | - name: Check formatting 38 | run: | 39 | ./scripts/check_format.sh 40 | 41 | - name: Static type checking 42 | run: | 43 | ./scripts/mypy.sh 44 | 45 | - name: Run unit tests 46 | # Unit tests are defined as tests which don't require any external systems to be running. 47 | run: | 48 | . "$HOME/.cargo/env" 49 | ./scripts/run_unit_tests.sh 50 | 51 | - name: Run integration tests 52 | # Integration tests do require external systems to be running (most commonly a database instance). 53 | # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. 54 | env: 55 | # The CI runs on ssd so we have to set this. 56 | INTENDED_DBDATA_HARDWARE: ssd 57 | run: | 58 | . "$HOME/.cargo/env" 59 | export 60 | ./scripts/run_integration_tests.sh 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .mypy_cache/ 3 | .conda/ 4 | .idea/ 5 | build/ 6 | *_scratchspace/ 7 | workspace/ 8 | default_*_benchbase_config_*.xml 9 | *.egg-info/ 10 | *.code-workspace -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "agents/hello-tune"] 2 | path = agents/hello-tune 3 | url = git@github.com:wangpatrick57/hello-tune.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 CMU Database Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🛢️ Database Gym 🏋️ 2 | [\[Slides\]](http://www.cidrdb.org/cidr2023/slides/p27-lim-slides.pdf) [\[Paper\]](https://www.cidrdb.org/cidr2023/papers/p27-lim.pdf) 3 | 4 | *An end-to-end research vehicle for the field of self-driving DBMSs.* 5 | 6 | ## Quickstart 7 | 8 | These steps were tested on a fresh repository clone, Ubuntu 22.04. 9 | 10 | ``` 11 | # Setup dependencies. 
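# (For reference, the CI workflow in .github/workflows/tests.yaml installs dependencies with:
#   pip install -r ./scripts/configs/requirements.txt
#   pip install ./gymlib_package
#   ./scripts/install_sysdeps.sh)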
12 | # You may want to create a Python 3.10 virtual environment (e.g. with conda) before doing this. 13 | ./dependency/install_dependencies.sh 14 | 15 | # Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune. 16 | ./scripts/quickstart.sh postgres tpch 0.01 protox 17 | ``` 18 | 19 | ## Overview 20 | 21 | Autonomous DBMS research often involves more engineering than research. 22 | As new advances in state-of-the-art technology are made, it is common to find that they have 23 | reimplemented the database tuning pipeline from scratch: workload capture, database setup, 24 | training data collection, model creation, model deployment, and more. 25 | Moreover, these bespoke pipelines make it difficult to combine different techniques even when they 26 | should be independent (e.g., using a different operator latency model in a tuning algorithm). 27 | 28 | The database gym project is our attempt at standardizing the APIs between these disparate tasks, 29 | allowing researchers to mix-and-match the different pipeline components. 30 | It draws inspiration from the Farama Foundation's Gymnasium (formerly OpenAI Gym), which 31 | accelerates the development and comparison of reinforcement learning algorithms by providing a set 32 | of agents, environments, and a standardized API for communicating between them. 33 | Through the database gym, we hope to save other people time and reimplementation effort by 34 | providing an extensible open-source platform for autonomous DBMS research. 35 | 36 | This project is under active development. 37 | Currently, we decompose the database tuning pipeline into the following components: 38 | 39 | 1. Workload: collection, forecasting, synthesis 40 | 2. Database: database loading, instrumentation, orchestrating workload execution 41 | 3. Agent: identifying tuning actions, suggesting an action 42 | 43 | ## Repository Structure 44 | 45 | `task.py` is the entrypoint for all tasks. 46 | The tasks are grouped into categories that correspond to the top-level directories of the repository: 47 | 48 | - `benchmark` - tasks to generate data and queries for different benchmarks (e.g., TPC-H, JOB) 49 | - `dbms` - tasks to build and start DBMSs (e.g., PostgreSQL) 50 | 51 | ## Credits 52 | 53 | The Database Gym project rose from the ashes of the [NoisePage](https://db.cs.cmu.edu/projects/noisepage/) self-driving DBMS project. 54 | 55 | The first prototype was written by [Patrick Wang](https://github.com/wangpatrick57), integrating [Boot (VLDB 2024)](https://github.com/lmwnshn/boot) and [Proto-X (VLDB 2024)](https://github.com/17zhangw/protox) into a cohesive system. 56 | 57 | ## Citing This Repository 58 | 59 | If you use this repository in an academic paper, please cite: 60 | 61 | ``` 62 | @inproceedings{lim23, 63 | author = {Lim, Wan Shen and Butrovich, Matthew and Zhang, William and Crotty, Andrew and Ma, Lin and Xu, Peijing and Gehrke, Johannes and Pavlo, Andrew}, 64 | title = {Database Gyms}, 65 | booktitle = {{CIDR} 2023, Conference on Innovative Data Systems Research}, 66 | year = {2023}, 67 | url = {https://db.cs.cmu.edu/papers/2023/p27-lim.pdf}, 68 | } 69 | ``` 70 | 71 | Additionally, please cite any module-specific paper that is relevant to your use. 72 | 73 | **Accelerating Training Data Generation** 74 | 75 | ``` 76 | (citation pending) 77 | Boot, appearing at VLDB 2024. 
78 | ``` 79 | 80 | **Simultaneously Tuning Multiple Configuration Spaces with Proto Actions** 81 | 82 | ``` 83 | (citation pending) 84 | Proto-X, appearing at VLDB 2024. 85 | ``` 86 | -------------------------------------------------------------------------------- /benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/__init__.py -------------------------------------------------------------------------------- /benchmark/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from gymlib.workspace import DBGymWorkspace 3 | 4 | from benchmark.job.cli import job_group 5 | from benchmark.tpch.cli import tpch_group 6 | 7 | 8 | @click.group(name="benchmark") 9 | @click.pass_obj 10 | def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None: 11 | pass 12 | 13 | 14 | benchmark_group.add_command(tpch_group) 15 | benchmark_group.add_command(job_group) 16 | -------------------------------------------------------------------------------- /benchmark/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_SCALE_FACTOR = 1.0 2 | -------------------------------------------------------------------------------- /benchmark/job/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/job/__init__.py -------------------------------------------------------------------------------- /benchmark/job/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | import click 5 | from gymlib.infra_paths import ( 6 | get_tables_dirname, 7 | get_workload_dirname, 8 | get_workload_suffix, 9 | ) 10 | from gymlib.workspace import DBGymWorkspace, fully_resolve_path, name_to_linkname 11 | 12 | from benchmark.constants import DEFAULT_SCALE_FACTOR 13 | from util.shell import subprocess_run 14 | 15 | JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" 16 | JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" 17 | JOB_QUERY_NAMES = [ 18 | "1a", 19 | "1b", 20 | "1c", 21 | "1d", 22 | "2a", 23 | "2b", 24 | "2c", 25 | "2d", 26 | "3a", 27 | "3b", 28 | "3c", 29 | "4a", 30 | "4b", 31 | "4c", 32 | "5a", 33 | "5b", 34 | "5c", 35 | "6a", 36 | "6b", 37 | "6c", 38 | "6d", 39 | "6e", 40 | "6f", 41 | "7a", 42 | "7b", 43 | "7c", 44 | "8a", 45 | "8b", 46 | "8c", 47 | "8d", 48 | "9a", 49 | "9b", 50 | "9c", 51 | "9d", 52 | "10a", 53 | "10b", 54 | "10c", 55 | "11a", 56 | "11b", 57 | "11c", 58 | "11d", 59 | "12a", 60 | "12b", 61 | "12c", 62 | "13a", 63 | "13b", 64 | "13c", 65 | "13d", 66 | "14a", 67 | "14b", 68 | "14c", 69 | "15a", 70 | "15b", 71 | "15c", 72 | "15d", 73 | "16a", 74 | "16b", 75 | "16c", 76 | "16d", 77 | "17a", 78 | "17b", 79 | "17c", 80 | "17d", 81 | "17e", 82 | "17f", 83 | "18a", 84 | "18b", 85 | "18c", 86 | "19a", 87 | "19b", 88 | "19c", 89 | "19d", 90 | "20a", 91 | "20b", 92 | "20c", 93 | "21a", 94 | "21b", 95 | "21c", 96 | "22a", 97 | "22b", 98 | "22c", 99 | "22d", 100 | "23a", 101 | "23b", 102 | "23c", 103 | "24a", 104 | "24b", 105 | "25a", 106 | "25b", 107 | "25c", 108 | "26a", 109 | "26b", 110 | "26c", 111 | "27a", 112 | "27b", 113 | "27c", 114 | "28a", 115 | "28b", 116 | "28c", 117 | "29a", 118 | "29b", 119 | "29c", 120 | "30a", 121 | "30b", 122 | 
"30c", 123 | "31a", 124 | "31b", 125 | "31c", 126 | "32a", 127 | "32b", 128 | "33a", 129 | "33b", 130 | "33c", 131 | ] 132 | JOB_QUERIES_DNAME = "job-queries" 133 | 134 | 135 | @click.group(name="job") 136 | @click.pass_obj 137 | def job_group(dbgym_workspace: DBGymWorkspace) -> None: 138 | pass 139 | 140 | 141 | @job_group.command(name="tables") 142 | # We expose this option to keep its interface consistent with other workloads, but you should never pass in something other than DEFAULT_SCALE_FACTOR. 143 | @click.argument("scale-factor", type=float) 144 | @click.pass_obj 145 | # The reason generate data is separate from create dbdata is because generate data is generic 146 | # to all DBMSs while create dbdata is specific to a single DBMS. 147 | def job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 148 | _job_tables(dbgym_workspace, scale_factor) 149 | 150 | 151 | def _job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 152 | assert scale_factor == DEFAULT_SCALE_FACTOR 153 | _download_job_tables(dbgym_workspace) 154 | 155 | 156 | @job_group.command(name="workload") 157 | @click.option( 158 | "--query-subset", 159 | type=click.Choice(["all", "a", "demo"]), 160 | default="all", 161 | ) 162 | @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) 163 | @click.pass_obj 164 | def job_workload( 165 | dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float 166 | ) -> None: 167 | _job_workload(dbgym_workspace, query_subset, scale_factor) 168 | 169 | 170 | def _job_workload( 171 | dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float 172 | ) -> None: 173 | assert scale_factor == DEFAULT_SCALE_FACTOR 174 | _download_job_queries(dbgym_workspace) 175 | _generate_job_workload(dbgym_workspace, query_subset) 176 | 177 | 178 | def _download_job_tables(dbgym_workspace: DBGymWorkspace) -> None: 179 | _download_and_untar_dir( 180 | dbgym_workspace, 181 | JOB_TABLES_URL, 182 | "imdb.tgz", 183 | get_tables_dirname("job", DEFAULT_SCALE_FACTOR), 184 | ) 185 | 186 | 187 | def _download_job_queries(dbgym_workspace: DBGymWorkspace) -> None: 188 | _download_and_untar_dir( 189 | dbgym_workspace, 190 | JOB_QUERIES_URL, 191 | "job.tgz", 192 | JOB_QUERIES_DNAME, 193 | untarred_original_dname="job", 194 | ) 195 | 196 | 197 | def _download_and_untar_dir( 198 | dbgym_workspace: DBGymWorkspace, 199 | download_url: str, 200 | download_tarred_fname: str, 201 | untarred_dname: str, 202 | untarred_original_dname: Optional[str] = None, 203 | ) -> None: 204 | """ 205 | Some .tgz files are built from a directory while others are built from the contents of 206 | the directory. If the .tgz file we're untarring is built from a directory, it will have 207 | an "original" directory name. If this is the case, you should set 208 | `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`. 
209 | """ 210 | expected_symlink_path = ( 211 | dbgym_workspace.dbgym_cur_symlinks_path / f"{untarred_dname}.link" 212 | ) 213 | if expected_symlink_path.exists(): 214 | logging.info(f"Skipping download: {expected_symlink_path}") 215 | return 216 | 217 | logging.info(f"Downloading: {expected_symlink_path}") 218 | subprocess_run(f"curl -O {download_url}", cwd=dbgym_workspace.dbgym_this_run_path) 219 | untarred_data_path = dbgym_workspace.dbgym_this_run_path / untarred_dname 220 | 221 | if untarred_original_dname is not None: 222 | assert not untarred_data_path.exists() 223 | subprocess_run( 224 | f"tar -zxvf {download_tarred_fname}", 225 | cwd=dbgym_workspace.dbgym_this_run_path, 226 | ) 227 | assert (dbgym_workspace.dbgym_this_run_path / untarred_original_dname).exists() 228 | subprocess_run( 229 | f"mv {untarred_original_dname} {untarred_dname}", 230 | cwd=dbgym_workspace.dbgym_this_run_path, 231 | ) 232 | else: 233 | untarred_data_path.mkdir(parents=True, exist_ok=False) 234 | subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_path) 235 | 236 | assert untarred_data_path.exists() 237 | subprocess_run( 238 | f"rm {download_tarred_fname}", cwd=dbgym_workspace.dbgym_this_run_path 239 | ) 240 | symlink_path = dbgym_workspace.link_result(untarred_data_path) 241 | assert expected_symlink_path.samefile(symlink_path) 242 | logging.info(f"Downloaded: {expected_symlink_path}") 243 | 244 | 245 | def _generate_job_workload( 246 | dbgym_workspace: DBGymWorkspace, 247 | query_subset: str, 248 | ) -> None: 249 | workload_name = get_workload_dirname( 250 | "job", 251 | DEFAULT_SCALE_FACTOR, 252 | get_workload_suffix("job", query_subset=query_subset), 253 | ) 254 | expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 255 | name_to_linkname(workload_name) 256 | ) 257 | if expected_workload_symlink_path.exists(): 258 | logging.info(f"Skipping generation: {expected_workload_symlink_path}") 259 | return 260 | 261 | logging.info(f"Generating: {expected_workload_symlink_path}") 262 | workload_path = dbgym_workspace.dbgym_this_run_path / workload_name 263 | workload_path.mkdir(parents=False, exist_ok=False) 264 | 265 | query_names = None 266 | if query_subset == "all": 267 | query_names = JOB_QUERY_NAMES 268 | elif query_subset == "a": 269 | query_names = [qname for qname in JOB_QUERY_NAMES if qname[-1] == "a"] 270 | elif query_subset == "demo": 271 | query_names = [f"{i}a" for i in range(1, 6)] 272 | else: 273 | assert False 274 | 275 | with open(workload_path / "order.txt", "w") as f: 276 | queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 277 | name_to_linkname(JOB_QUERIES_DNAME) 278 | ) 279 | 280 | for qname in query_names: 281 | sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql") 282 | f.write(f"Q{qname},{sql_path}\n") 283 | 284 | workload_symlink_path = dbgym_workspace.link_result(workload_path) 285 | assert workload_symlink_path == expected_workload_symlink_path 286 | logging.info(f"Generated: {expected_workload_symlink_path}") 287 | -------------------------------------------------------------------------------- /benchmark/job/clone_job_queries.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | JOB_REPO_ROOT="$1" 6 | 7 | if [ ! 
-d "${JOB_REPO_ROOT}/job-queries" ]; then 8 | mkdir -p "${JOB_REPO_ROOT}" 9 | cd "${JOB_REPO_ROOT}" 10 | git clone https://github.com/wangpatrick57/job-queries.git --single-branch --branch master --depth 1 11 | fi 12 | -------------------------------------------------------------------------------- /benchmark/job/job_schema.sql: -------------------------------------------------------------------------------- 1 | -- Copied over from https://event.cwi.nl/da/job/job.tgz. 2 | -- We copied it over so that we have control over the schema. 3 | CREATE TABLE aka_name ( 4 | id integer NOT NULL PRIMARY KEY, 5 | person_id integer NOT NULL, 6 | name text NOT NULL, 7 | imdb_index character varying(12), 8 | name_pcode_cf character varying(5), 9 | name_pcode_nf character varying(5), 10 | surname_pcode character varying(5), 11 | md5sum character varying(32) 12 | ); 13 | 14 | CREATE TABLE aka_title ( 15 | id integer NOT NULL PRIMARY KEY, 16 | movie_id integer NOT NULL, 17 | title text NOT NULL, 18 | imdb_index character varying(12), 19 | kind_id integer NOT NULL, 20 | production_year integer, 21 | phonetic_code character varying(5), 22 | episode_of_id integer, 23 | season_nr integer, 24 | episode_nr integer, 25 | note text, 26 | md5sum character varying(32) 27 | ); 28 | 29 | CREATE TABLE cast_info ( 30 | id integer NOT NULL PRIMARY KEY, 31 | person_id integer NOT NULL, 32 | movie_id integer NOT NULL, 33 | person_role_id integer, 34 | note text, 35 | nr_order integer, 36 | role_id integer NOT NULL 37 | ); 38 | 39 | CREATE TABLE char_name ( 40 | id integer NOT NULL PRIMARY KEY, 41 | name text NOT NULL, 42 | imdb_index character varying(12), 43 | imdb_id integer, 44 | name_pcode_nf character varying(5), 45 | surname_pcode character varying(5), 46 | md5sum character varying(32) 47 | ); 48 | 49 | CREATE TABLE comp_cast_type ( 50 | id integer NOT NULL PRIMARY KEY, 51 | kind character varying(32) NOT NULL 52 | ); 53 | 54 | CREATE TABLE company_name ( 55 | id integer NOT NULL PRIMARY KEY, 56 | name text NOT NULL, 57 | country_code character varying(255), 58 | imdb_id integer, 59 | name_pcode_nf character varying(5), 60 | name_pcode_sf character varying(5), 61 | md5sum character varying(32) 62 | ); 63 | 64 | CREATE TABLE company_type ( 65 | id integer NOT NULL PRIMARY KEY, 66 | kind character varying(32) NOT NULL 67 | ); 68 | 69 | CREATE TABLE complete_cast ( 70 | id integer NOT NULL PRIMARY KEY, 71 | movie_id integer, 72 | subject_id integer NOT NULL, 73 | status_id integer NOT NULL 74 | ); 75 | 76 | CREATE TABLE info_type ( 77 | id integer NOT NULL PRIMARY KEY, 78 | info character varying(32) NOT NULL 79 | ); 80 | 81 | CREATE TABLE keyword ( 82 | id integer NOT NULL PRIMARY KEY, 83 | keyword text NOT NULL, 84 | phonetic_code character varying(5) 85 | ); 86 | 87 | CREATE TABLE kind_type ( 88 | id integer NOT NULL PRIMARY KEY, 89 | kind character varying(15) NOT NULL 90 | ); 91 | 92 | CREATE TABLE link_type ( 93 | id integer NOT NULL PRIMARY KEY, 94 | link character varying(32) NOT NULL 95 | ); 96 | 97 | CREATE TABLE movie_companies ( 98 | id integer NOT NULL PRIMARY KEY, 99 | movie_id integer NOT NULL, 100 | company_id integer NOT NULL, 101 | company_type_id integer NOT NULL, 102 | note text 103 | ); 104 | 105 | CREATE TABLE movie_info ( 106 | id integer NOT NULL PRIMARY KEY, 107 | movie_id integer NOT NULL, 108 | info_type_id integer NOT NULL, 109 | info text NOT NULL, 110 | note text 111 | ); 112 | 113 | CREATE TABLE movie_info_idx ( 114 | id integer NOT NULL PRIMARY KEY, 115 | movie_id integer NOT NULL, 116 | 
info_type_id integer NOT NULL, 117 | info text NOT NULL, 118 | note text 119 | ); 120 | 121 | CREATE TABLE movie_keyword ( 122 | id integer NOT NULL PRIMARY KEY, 123 | movie_id integer NOT NULL, 124 | keyword_id integer NOT NULL 125 | ); 126 | 127 | CREATE TABLE movie_link ( 128 | id integer NOT NULL PRIMARY KEY, 129 | movie_id integer NOT NULL, 130 | linked_movie_id integer NOT NULL, 131 | link_type_id integer NOT NULL 132 | ); 133 | 134 | CREATE TABLE name ( 135 | id integer NOT NULL PRIMARY KEY, 136 | name text NOT NULL, 137 | imdb_index character varying(12), 138 | imdb_id integer, 139 | gender character varying(1), 140 | name_pcode_cf character varying(5), 141 | name_pcode_nf character varying(5), 142 | surname_pcode character varying(5), 143 | md5sum character varying(32) 144 | ); 145 | 146 | CREATE TABLE person_info ( 147 | id integer NOT NULL PRIMARY KEY, 148 | person_id integer NOT NULL, 149 | info_type_id integer NOT NULL, 150 | info text NOT NULL, 151 | note text 152 | ); 153 | 154 | CREATE TABLE role_type ( 155 | id integer NOT NULL PRIMARY KEY, 156 | role character varying(32) NOT NULL 157 | ); 158 | 159 | CREATE TABLE title ( 160 | id integer NOT NULL PRIMARY KEY, 161 | title text NOT NULL, 162 | imdb_index character varying(12), 163 | kind_id integer NOT NULL, 164 | production_year integer, 165 | imdb_id integer, 166 | phonetic_code character varying(5), 167 | episode_of_id integer, 168 | season_nr integer, 169 | episode_nr integer, 170 | series_years character varying(49), 171 | md5sum character varying(32) 172 | ); -------------------------------------------------------------------------------- /benchmark/job/load_info.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from gymlib.infra_paths import get_tables_symlink_path 5 | from gymlib.workspace import DBGymWorkspace, fully_resolve_path 6 | 7 | from benchmark.constants import DEFAULT_SCALE_FACTOR 8 | from dbms.load_info_base_class import LoadInfoBaseClass 9 | 10 | JOB_SCHEMA_FNAME = "job_schema.sql" 11 | 12 | 13 | class JobLoadInfo(LoadInfoBaseClass): 14 | TABLES = [ 15 | "aka_name", 16 | "aka_title", 17 | "cast_info", 18 | "char_name", 19 | "comp_cast_type", 20 | "company_name", 21 | "company_type", 22 | "complete_cast", 23 | "info_type", 24 | "keyword", 25 | "kind_type", 26 | "link_type", 27 | "movie_companies", 28 | "movie_info", 29 | "movie_info_idx", 30 | "movie_keyword", 31 | "movie_link", 32 | "name", 33 | "person_info", 34 | "role_type", 35 | "title", 36 | ] 37 | 38 | def __init__(self, dbgym_workspace: DBGymWorkspace): 39 | # Schema (directly in the codebase). 
40 | job_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "job" 41 | self._schema_path = job_codebase_path / JOB_SCHEMA_FNAME 42 | assert ( 43 | self._schema_path.exists() 44 | ), f"self._schema_path ({self._schema_path}) does not exist" 45 | 46 | # Tables 47 | tables_path = fully_resolve_path( 48 | get_tables_symlink_path( 49 | dbgym_workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR 50 | ) 51 | ) 52 | self._tables_and_paths = [] 53 | for table in JobLoadInfo.TABLES: 54 | table_path = tables_path / f"{table}.csv" 55 | self._tables_and_paths.append((table, table_path)) 56 | 57 | def get_schema_path(self) -> Path: 58 | return self._schema_path 59 | 60 | def get_tables_and_paths(self) -> list[tuple[str, Path]]: 61 | return self._tables_and_paths 62 | 63 | def get_table_file_delimiter(self) -> str: 64 | return "," 65 | 66 | def get_constraints_path(self) -> Optional[Path]: 67 | # JOB does not have any constraints. It does have indexes, but we don't want to create 68 | # those indexes so that the tuning agent can start from a clean slate. 69 | return None 70 | -------------------------------------------------------------------------------- /benchmark/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/tests/__init__.py -------------------------------------------------------------------------------- /benchmark/tests/benchmark_integtest_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_benchmark_integtest_workspace/ 2 | -------------------------------------------------------------------------------- /benchmark/tests/integtest_benchmark.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from gymlib.infra_paths import ( 6 | get_tables_symlink_path, 7 | get_workload_suffix, 8 | get_workload_symlink_path, 9 | ) 10 | from gymlib.workspace import ( 11 | DBGymWorkspace, 12 | fully_resolve_path, 13 | get_workspace_path_from_config, 14 | ) 15 | 16 | # It's ok to import private functions from the benchmark module because this is an integration test. 17 | from benchmark.constants import DEFAULT_SCALE_FACTOR 18 | from benchmark.job.cli import _job_tables, _job_workload 19 | from benchmark.tpch.cli import _tpch_tables, _tpch_workload 20 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED 21 | 22 | 23 | class BenchmarkTests(unittest.TestCase): 24 | DBGYM_CONFIG_PATH = Path("benchmark/tests/benchmark_integtest_dbgym_config.yaml") 25 | 26 | def setUp(self) -> None: 27 | workspace_path = get_workspace_path_from_config( 28 | BenchmarkTests.DBGYM_CONFIG_PATH 29 | ) 30 | # Get a clean start each time. 31 | if workspace_path.exists(): 32 | shutil.rmtree(workspace_path) 33 | 34 | # Reset this to avoid the error of it being created twice. 35 | # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 
36 | DBGymWorkspace._num_times_created_this_run = 0 37 | self.workspace = DBGymWorkspace(workspace_path) 38 | 39 | def tearDown(self) -> None: 40 | if self.workspace.dbgym_workspace_path.exists(): 41 | shutil.rmtree(self.workspace.dbgym_workspace_path) 42 | 43 | def test_tpch_tables(self) -> None: 44 | scale_factor = 0.01 45 | tables_path = get_tables_symlink_path( 46 | self.workspace.dbgym_workspace_path, "tpch", scale_factor 47 | ) 48 | self.assertFalse(tables_path.exists()) 49 | _tpch_tables(self.workspace, scale_factor) 50 | self.assertTrue(tables_path.exists()) 51 | self.assertTrue(fully_resolve_path(tables_path).exists()) 52 | 53 | def test_job_tables(self) -> None: 54 | tables_path = get_tables_symlink_path( 55 | self.workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR 56 | ) 57 | self.assertFalse(tables_path.exists()) 58 | _job_tables(self.workspace, DEFAULT_SCALE_FACTOR) 59 | self.assertTrue(tables_path.exists()) 60 | self.assertTrue(fully_resolve_path(tables_path).exists()) 61 | 62 | def test_tpch_workload(self) -> None: 63 | scale_factor = 0.01 64 | workload_path = get_workload_symlink_path( 65 | self.workspace.dbgym_workspace_path, 66 | "tpch", 67 | scale_factor, 68 | get_workload_suffix( 69 | "tpch", 70 | seed_start=DEFAULT_TPCH_SEED, 71 | seed_end=DEFAULT_TPCH_SEED, 72 | query_subset="all", 73 | ), 74 | ) 75 | self.assertFalse(workload_path.exists()) 76 | _tpch_workload( 77 | self.workspace, DEFAULT_TPCH_SEED, DEFAULT_TPCH_SEED, "all", scale_factor 78 | ) 79 | self.assertTrue(workload_path.exists()) 80 | self.assertTrue(fully_resolve_path(workload_path).exists()) 81 | 82 | def test_job_workload(self) -> None: 83 | workload_path = get_workload_symlink_path( 84 | self.workspace.dbgym_workspace_path, 85 | "job", 86 | DEFAULT_SCALE_FACTOR, 87 | get_workload_suffix( 88 | "job", 89 | query_subset="all", 90 | ), 91 | ) 92 | self.assertFalse(workload_path.exists()) 93 | _job_workload(self.workspace, "all", DEFAULT_SCALE_FACTOR) 94 | self.assertTrue(workload_path.exists()) 95 | self.assertTrue(fully_resolve_path(workload_path).exists()) 96 | 97 | 98 | if __name__ == "__main__": 99 | unittest.main() 100 | -------------------------------------------------------------------------------- /benchmark/tpch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/tpch/__init__.py -------------------------------------------------------------------------------- /benchmark/tpch/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | from gymlib.infra_paths import ( 5 | get_scale_factor_string, 6 | get_tables_dirname, 7 | get_tables_symlink_path, 8 | get_workload_suffix, 9 | get_workload_symlink_path, 10 | ) 11 | from gymlib.workspace import ( 12 | DBGymWorkspace, 13 | fully_resolve_path, 14 | is_fully_resolved, 15 | linkname_to_name, 16 | name_to_linkname, 17 | ) 18 | 19 | from benchmark.constants import DEFAULT_SCALE_FACTOR 20 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES 21 | from util.shell import subprocess_run 22 | 23 | TPCH_KIT_DIRNAME = "tpch-kit" 24 | 25 | 26 | @click.group(name="tpch") 27 | @click.pass_obj 28 | def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: 29 | pass 30 | 31 | 32 | @tpch_group.command(name="tables") 33 | @click.argument("scale-factor", type=float) 34 | @click.pass_obj 35 | # The reason generate tables is separate 
from create dbdata is because tpch_tables is generic 36 | # to all DBMSs while create dbdata is specific to a single DBMS. 37 | def tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 38 | _tpch_tables(dbgym_workspace, scale_factor) 39 | 40 | 41 | def _tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 42 | """ 43 | This function exists as a hook for integration tests. 44 | """ 45 | _clone_tpch_kit(dbgym_workspace) 46 | _generate_tpch_tables(dbgym_workspace, scale_factor) 47 | 48 | 49 | @tpch_group.command(name="workload") 50 | @click.option( 51 | "--seed-start", 52 | type=int, 53 | default=DEFAULT_TPCH_SEED, 54 | help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", 55 | ) 56 | @click.option( 57 | "--seed-end", 58 | type=int, 59 | default=DEFAULT_TPCH_SEED, 60 | help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", 61 | ) 62 | @click.option( 63 | "--query-subset", 64 | type=click.Choice(["all", "even", "odd"]), 65 | default="all", 66 | ) 67 | @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) 68 | @click.pass_obj 69 | def tpch_workload( 70 | dbgym_workspace: DBGymWorkspace, 71 | seed_start: int, 72 | seed_end: int, 73 | query_subset: str, 74 | scale_factor: float, 75 | ) -> None: 76 | _tpch_workload(dbgym_workspace, seed_start, seed_end, query_subset, scale_factor) 77 | 78 | 79 | def _tpch_workload( 80 | dbgym_workspace: DBGymWorkspace, 81 | seed_start: int, 82 | seed_end: int, 83 | query_subset: str, 84 | scale_factor: float, 85 | ) -> None: 86 | """ 87 | This function exists as a hook for integration tests. 88 | """ 89 | assert ( 90 | seed_start <= seed_end 91 | ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" 92 | _clone_tpch_kit(dbgym_workspace) 93 | _generate_tpch_queries(dbgym_workspace, seed_start, seed_end, scale_factor) 94 | _generate_tpch_workload( 95 | dbgym_workspace, seed_start, seed_end, query_subset, scale_factor 96 | ) 97 | 98 | 99 | def _get_queries_dirname(seed: int, scale_factor: float) -> str: 100 | return f"queries_{seed}_sf{get_scale_factor_string(scale_factor)}" 101 | 102 | 103 | def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: 104 | expected_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 105 | name_to_linkname(TPCH_KIT_DIRNAME) 106 | ) 107 | if expected_symlink_path.exists(): 108 | logging.info(f"Skipping clone: {expected_symlink_path}") 109 | return 110 | 111 | logging.info(f"Cloning: {expected_symlink_path}") 112 | subprocess_run( 113 | f"./clone_tpch_kit.sh {dbgym_workspace.dbgym_this_run_path}", 114 | cwd=dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch", 115 | ) 116 | symlink_path = dbgym_workspace.link_result( 117 | dbgym_workspace.dbgym_this_run_path / TPCH_KIT_DIRNAME 118 | ) 119 | assert expected_symlink_path.samefile(symlink_path) 120 | logging.info(f"Cloned: {expected_symlink_path}") 121 | 122 | 123 | def _generate_tpch_queries( 124 | dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float 125 | ) -> None: 126 | tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 127 | name_to_linkname(TPCH_KIT_DIRNAME) 128 | ) 129 | logging.info(f"Generating queries: [{seed_start}, {seed_end}]") 130 | for seed in range(seed_start, seed_end + 1): 131 | expected_queries_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 132 | name_to_linkname(_get_queries_dirname(seed, scale_factor)) 133 | ) 134 | if 
expected_queries_symlink_path.exists(): 135 | continue 136 | 137 | queries_parent_path = ( 138 | dbgym_workspace.dbgym_this_run_path 139 | / _get_queries_dirname(seed, scale_factor) 140 | ) 141 | queries_parent_path.mkdir(parents=False, exist_ok=False) 142 | for i in range(1, NUM_TPCH_QUERIES + 1): 143 | target_sql = (queries_parent_path / f"{i}.sql").resolve() 144 | subprocess_run( 145 | f"DSS_QUERY=./queries ./qgen {i} -r {seed} -s {scale_factor} > {target_sql}", 146 | cwd=tpch_kit_path / "dbgen", 147 | verbose=False, 148 | ) 149 | queries_symlink_path = dbgym_workspace.link_result(queries_parent_path) 150 | assert queries_symlink_path.samefile(expected_queries_symlink_path) 151 | logging.info(f"Generated queries: [{seed_start}, {seed_end}]") 152 | 153 | 154 | def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 155 | tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 156 | name_to_linkname(TPCH_KIT_DIRNAME) 157 | ) 158 | expected_tables_symlink_path = get_tables_symlink_path( 159 | dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor 160 | ) 161 | if expected_tables_symlink_path.exists(): 162 | logging.info(f"Skipping generation: {expected_tables_symlink_path}") 163 | return 164 | 165 | logging.info(f"Generating: {expected_tables_symlink_path}") 166 | subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_path / "dbgen") 167 | tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname( 168 | "tpch", scale_factor 169 | ) 170 | tables_parent_path.mkdir(parents=False, exist_ok=False) 171 | subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_path / "dbgen") 172 | 173 | tables_symlink_path = dbgym_workspace.link_result(tables_parent_path) 174 | assert tables_symlink_path.samefile(expected_tables_symlink_path) 175 | logging.info(f"Generated: {expected_tables_symlink_path}") 176 | 177 | 178 | def _generate_tpch_workload( 179 | dbgym_workspace: DBGymWorkspace, 180 | seed_start: int, 181 | seed_end: int, 182 | query_subset: str, 183 | scale_factor: float, 184 | ) -> None: 185 | expected_workload_symlink_path = get_workload_symlink_path( 186 | dbgym_workspace.dbgym_workspace_path, 187 | "tpch", 188 | scale_factor, 189 | get_workload_suffix( 190 | "tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset 191 | ), 192 | ) 193 | if expected_workload_symlink_path.exists(): 194 | logging.info(f"Skipping generation: {expected_workload_symlink_path}") 195 | return 196 | 197 | logging.info(f"Generating: {expected_workload_symlink_path}") 198 | workload_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( 199 | expected_workload_symlink_path.name 200 | ) 201 | workload_path.mkdir(parents=False, exist_ok=False) 202 | 203 | query_names = None 204 | if query_subset == "all": 205 | query_names = [f"{i}" for i in range(1, NUM_TPCH_QUERIES + 1)] 206 | elif query_subset == "even": 207 | query_names = [f"{i}" for i in range(1, NUM_TPCH_QUERIES + 1) if i % 2 == 0] 208 | elif query_subset == "odd": 209 | query_names = [f"{i}" for i in range(1, NUM_TPCH_QUERIES + 1) if i % 2 == 1] 210 | else: 211 | assert False 212 | 213 | with open(workload_path / "order.txt", "w") as f: 214 | for seed in range(seed_start, seed_end + 1): 215 | queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 216 | name_to_linkname(_get_queries_dirname(seed, scale_factor)) 217 | ) 218 | 219 | for qname in query_names: 220 | sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql") 221 | assert 
is_fully_resolved( 222 | sql_path 223 | ), "We should only write existent real absolute paths to a file" 224 | f.write(f"S{seed}-Q{qname},{sql_path}\n") 225 | 226 | workload_symlink_path = dbgym_workspace.link_result(workload_path) 227 | assert workload_symlink_path == expected_workload_symlink_path 228 | logging.info(f"Generated: {expected_workload_symlink_path}") 229 | -------------------------------------------------------------------------------- /benchmark/tpch/clone_tpch_kit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | TPCH_REPO_ROOT="$1" 6 | 7 | if [ ! -d "${TPCH_REPO_ROOT}/tpch-kit" ]; then 8 | mkdir -p "${TPCH_REPO_ROOT}" 9 | cd "${TPCH_REPO_ROOT}" 10 | git clone https://github.com/lmwnshn/tpch-kit.git --single-branch --branch master --depth 1 11 | cd ./tpch-kit/dbgen 12 | make MACHINE=LINUX DATABASE=POSTGRESQL 13 | fi 14 | -------------------------------------------------------------------------------- /benchmark/tpch/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_TPCH_SEED = 15721 2 | NUM_TPCH_QUERIES = 22 3 | -------------------------------------------------------------------------------- /benchmark/tpch/load_info.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from gymlib.infra_paths import get_tables_symlink_path 5 | from gymlib.workspace import DBGymWorkspace, fully_resolve_path 6 | 7 | from dbms.load_info_base_class import LoadInfoBaseClass 8 | 9 | TPCH_SCHEMA_FNAME = "tpch_schema.sql" 10 | TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" 11 | 12 | 13 | class TpchLoadInfo(LoadInfoBaseClass): 14 | TABLES = [ 15 | "region", 16 | "nation", 17 | "part", 18 | "supplier", 19 | "partsupp", 20 | "customer", 21 | "orders", 22 | "lineitem", 23 | ] 24 | 25 | def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): 26 | # Schema and constraints (directly in the codebase). 
27 | tpch_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch" 28 | self._schema_path = tpch_codebase_path / TPCH_SCHEMA_FNAME 29 | assert ( 30 | self._schema_path.exists() 31 | ), f"self._schema_path ({self._schema_path}) does not exist" 32 | self._constraints_path = tpch_codebase_path / TPCH_CONSTRAINTS_FNAME 33 | assert ( 34 | self._constraints_path.exists() 35 | ), f"self._constraints_path ({self._constraints_path}) does not exist" 36 | 37 | # Tables 38 | tables_path = fully_resolve_path( 39 | get_tables_symlink_path( 40 | dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor 41 | ) 42 | ) 43 | self._tables_and_paths = [] 44 | for table in TpchLoadInfo.TABLES: 45 | table_path = tables_path / f"{table}.tbl" 46 | self._tables_and_paths.append((table, table_path)) 47 | 48 | def get_schema_path(self) -> Path: 49 | return self._schema_path 50 | 51 | def get_tables_and_paths(self) -> list[tuple[str, Path]]: 52 | return self._tables_and_paths 53 | 54 | def get_table_file_delimiter(self) -> str: 55 | return "|" 56 | 57 | def get_constraints_path(self) -> Optional[Path]: 58 | return self._constraints_path 59 | -------------------------------------------------------------------------------- /benchmark/tpch/tpch_constraints.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE nation ADD CONSTRAINT nation_n_regionkey_fkey FOREIGN KEY (n_regionkey) REFERENCES region (r_regionkey) ON DELETE CASCADE; 2 | ALTER TABLE supplier ADD CONSTRAINT supplier_s_nationkey_fkey FOREIGN KEY (s_nationkey) REFERENCES nation (n_nationkey) ON DELETE CASCADE; 3 | ALTER TABLE partsupp ADD CONSTRAINT partsupp_ps_partkey_fkey FOREIGN KEY (ps_partkey) REFERENCES part (p_partkey) ON DELETE CASCADE; 4 | ALTER TABLE partsupp ADD CONSTRAINT partsupp_ps_suppkey_fkey FOREIGN KEY (ps_suppkey) REFERENCES supplier (s_suppkey) ON DELETE CASCADE; 5 | ALTER TABLE customer ADD CONSTRAINT customer_c_nationkey_fkey FOREIGN KEY (c_nationkey) REFERENCES nation (n_nationkey) ON DELETE CASCADE; 6 | ALTER TABLE orders ADD CONSTRAINT orders_o_custkey_fkey FOREIGN KEY (o_custkey) REFERENCES customer (c_custkey) ON DELETE CASCADE; 7 | ALTER TABLE lineitem ADD CONSTRAINT lineitem_l_orderkey_fkey FOREIGN KEY (l_orderkey) REFERENCES orders (o_orderkey) ON DELETE CASCADE; 8 | ALTER TABLE lineitem ADD CONSTRAINT lineitem_l_partkey_l_suppkey_fkey FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp (ps_partkey, ps_suppkey) ON DELETE CASCADE; 9 | 10 | -- We don't create any indexes so that there's a clean slate for tuning 11 | -- CREATE UNIQUE INDEX r_rk ON region (r_regionkey ASC); 12 | -- CREATE UNIQUE INDEX n_nk ON nation (n_nationkey ASC); 13 | -- CREATE INDEX n_rk ON nation (n_regionkey ASC); 14 | -- CREATE UNIQUE INDEX p_pk ON part (p_partkey ASC); 15 | -- CREATE UNIQUE INDEX s_sk ON supplier (s_suppkey ASC); 16 | -- CREATE INDEX s_nk ON supplier (s_nationkey ASC); 17 | -- CREATE INDEX ps_pk ON partsupp (ps_partkey ASC); 18 | -- CREATE INDEX ps_sk ON partsupp (ps_suppkey ASC); 19 | -- CREATE UNIQUE INDEX ps_pk_sk ON partsupp (ps_partkey ASC, ps_suppkey ASC); 20 | -- CREATE UNIQUE INDEX ps_sk_pk ON partsupp (ps_suppkey ASC, ps_partkey ASC); 21 | -- CREATE UNIQUE INDEX c_ck ON customer (c_custkey ASC); 22 | -- CREATE INDEX c_nk ON customer (c_nationkey ASC); 23 | -- CREATE UNIQUE INDEX o_ok ON orders (o_orderkey ASC); 24 | -- CREATE INDEX o_ck ON orders (o_custkey ASC); 25 | -- CREATE INDEX o_od ON orders (o_orderdate ASC); 26 | -- CREATE INDEX l_ok ON lineitem 
(l_orderkey ASC); 27 | -- CREATE INDEX l_pk ON lineitem (l_partkey ASC); 28 | -- CREATE INDEX l_sk ON lineitem (l_suppkey ASC); 29 | -- CREATE INDEX l_sd ON lineitem (l_shipdate ASC); 30 | -- CREATE INDEX l_cd ON lineitem (l_commitdate ASC); 31 | -- CREATE INDEX l_rd ON lineitem (l_receiptdate ASC); 32 | -- CREATE INDEX l_pk_sk ON lineitem (l_partkey ASC, l_suppkey ASC); 33 | -- CREATE INDEX l_sk_pk ON lineitem (l_suppkey ASC, l_partkey ASC); -------------------------------------------------------------------------------- /benchmark/tpch/tpch_schema.sql: -------------------------------------------------------------------------------- 1 | -- Copied over from https://github.com/cmu-db/benchbase/blob/main/src/main/resources/benchmarks/tpch/ddl-postgres.sql 2 | -- We copied it over so that we have control over the schema, not tpch-kit. 3 | 4 | DROP TABLE IF EXISTS nation CASCADE; 5 | DROP TABLE IF EXISTS region CASCADE; 6 | DROP TABLE IF EXISTS part CASCADE; 7 | DROP TABLE IF EXISTS supplier CASCADE; 8 | DROP TABLE IF EXISTS partsupp CASCADE; 9 | DROP TABLE IF EXISTS orders CASCADE; 10 | DROP TABLE IF EXISTS customer CASCADE; 11 | DROP TABLE IF EXISTS lineitem CASCADE; 12 | 13 | CREATE TABLE region ( 14 | r_regionkey integer NOT NULL, 15 | r_name char(25) NOT NULL, 16 | r_comment varchar(152), 17 | PRIMARY KEY (r_regionkey) 18 | ); 19 | 20 | CREATE TABLE nation ( 21 | n_nationkey integer NOT NULL, 22 | n_name char(25) NOT NULL, 23 | n_regionkey integer NOT NULL, 24 | n_comment varchar(152), 25 | PRIMARY KEY (n_nationkey) 26 | ); 27 | 28 | CREATE TABLE part ( 29 | p_partkey integer NOT NULL, 30 | p_name varchar(55) NOT NULL, 31 | p_mfgr char(25) NOT NULL, 32 | p_brand char(10) NOT NULL, 33 | p_type varchar(25) NOT NULL, 34 | p_size integer NOT NULL, 35 | p_container char(10) NOT NULL, 36 | p_retailprice decimal(15, 2) NOT NULL, 37 | p_comment varchar(23) NOT NULL, 38 | PRIMARY KEY (p_partkey) 39 | ); 40 | 41 | CREATE TABLE supplier ( 42 | s_suppkey integer NOT NULL, 43 | s_name char(25) NOT NULL, 44 | s_address varchar(40) NOT NULL, 45 | s_nationkey integer NOT NULL, 46 | s_phone char(15) NOT NULL, 47 | s_acctbal decimal(15, 2) NOT NULL, 48 | s_comment varchar(101) NOT NULL, 49 | PRIMARY KEY (s_suppkey) 50 | ); 51 | 52 | CREATE TABLE partsupp ( 53 | ps_partkey integer NOT NULL, 54 | ps_suppkey integer NOT NULL, 55 | ps_availqty integer NOT NULL, 56 | ps_supplycost decimal(15, 2) NOT NULL, 57 | ps_comment varchar(199) NOT NULL, 58 | PRIMARY KEY (ps_partkey, ps_suppkey) 59 | ); 60 | 61 | CREATE TABLE customer ( 62 | c_custkey integer NOT NULL, 63 | c_name varchar(25) NOT NULL, 64 | c_address varchar(40) NOT NULL, 65 | c_nationkey integer NOT NULL, 66 | c_phone char(15) NOT NULL, 67 | c_acctbal decimal(15, 2) NOT NULL, 68 | c_mktsegment char(10) NOT NULL, 69 | c_comment varchar(117) NOT NULL, 70 | PRIMARY KEY (c_custkey) 71 | ); 72 | 73 | CREATE TABLE orders ( 74 | o_orderkey integer NOT NULL, 75 | o_custkey integer NOT NULL, 76 | o_orderstatus char(1) NOT NULL, 77 | o_totalprice decimal(15, 2) NOT NULL, 78 | o_orderdate date NOT NULL, 79 | o_orderpriority char(15) NOT NULL, 80 | o_clerk char(15) NOT NULL, 81 | o_shippriority integer NOT NULL, 82 | o_comment varchar(79) NOT NULL, 83 | PRIMARY KEY (o_orderkey) 84 | ); 85 | 86 | CREATE TABLE lineitem ( 87 | l_orderkey integer NOT NULL, 88 | l_partkey integer NOT NULL, 89 | l_suppkey integer NOT NULL, 90 | l_linenumber integer NOT NULL, 91 | l_quantity decimal(15, 2) NOT NULL, 92 | l_extendedprice decimal(15, 2) NOT NULL, 93 | l_discount decimal(15, 
2) NOT NULL, 94 | l_tax decimal(15, 2) NOT NULL, 95 | l_returnflag char(1) NOT NULL, 96 | l_linestatus char(1) NOT NULL, 97 | l_shipdate date NOT NULL, 98 | l_commitdate date NOT NULL, 99 | l_receiptdate date NOT NULL, 100 | l_shipinstruct char(25) NOT NULL, 101 | l_shipmode char(10) NOT NULL, 102 | l_comment varchar(44) NOT NULL, 103 | PRIMARY KEY (l_orderkey, l_linenumber) 104 | ); 105 | -------------------------------------------------------------------------------- /dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_workspace 2 | boot_redis_port: 6379 3 | ray_gcs_port: 6380 -------------------------------------------------------------------------------- /dbms/__init__.py: -------------------------------------------------------------------------------- 1 | # This folder contains code for building DBMSs. 2 | # It should not be confused with code that uses DBMSs (e.g. those in tune/env/). 3 | -------------------------------------------------------------------------------- /dbms/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from gymlib.workspace import DBGymWorkspace 3 | 4 | from dbms.postgres.cli import postgres_group 5 | 6 | 7 | @click.group(name="dbms") 8 | @click.pass_obj 9 | def dbms_group(dbgym_workspace: DBGymWorkspace) -> None: 10 | pass 11 | 12 | 13 | dbms_group.add_command(postgres_group) 14 | -------------------------------------------------------------------------------- /dbms/load_info_base_class.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | 5 | class LoadInfoBaseClass: 6 | """ 7 | A base class for providing info for DBMSs to load the data of a benchmark 8 | When copying these functions to a specific benchmark's load_info.py file, don't 9 | copy the comments or type annotations or else they might become out of sync. 10 | """ 11 | 12 | def get_schema_path(self) -> Path: 13 | raise NotImplementedError 14 | 15 | def get_tables_and_paths(self) -> list[tuple[str, Path]]: 16 | raise NotImplementedError 17 | 18 | # We assume the table file has a "csv-like" format where values are separated by a delimiter. 19 | def get_table_file_delimiter(self) -> str: 20 | raise NotImplementedError 21 | 22 | # If the subclassing benchmark does not have constraints, you can return None here. 23 | # Constraints are also indexes. 24 | def get_constraints_path(self) -> Optional[Path]: 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /dbms/postgres/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/dbms/postgres/__init__.py -------------------------------------------------------------------------------- /dbms/postgres/_build_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | REPO_REAL_PARENT_PATH="$1" 6 | 7 | # Download and make postgres from the boot repository. 
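# (Postgres is installed under "${REPO_REAL_PARENT_PATH}/boot/build/postgres"; the hypopg and
# pg_hint_plan builds further down point at its bin/ and lib/.)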
8 | mkdir -p "${REPO_REAL_PARENT_PATH}" 9 | cd "${REPO_REAL_PARENT_PATH}" 10 | git clone https://github.com/lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1 11 | cd ./boot 12 | ./cmudb/build/configure.sh release "${REPO_REAL_PARENT_PATH}/boot/build/postgres" 13 | make clean 14 | make install-world-bin -j4 15 | 16 | # Download and make boot. 17 | cd ./cmudb/extension/boot_rs/ 18 | cargo build --release 19 | cbindgen . -o target/boot_rs.h --lang c 20 | cd "${REPO_REAL_PARENT_PATH}/boot" 21 | 22 | cd ./cmudb/extension/boot/ 23 | make clean 24 | make install -j 25 | cd "${REPO_REAL_PARENT_PATH}/boot" 26 | 27 | # Download and make hypopg. 28 | git clone https://github.com/HypoPG/hypopg.git 29 | cd ./hypopg 30 | PG_CONFIG="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin/pg_config" make install 31 | cd "${REPO_REAL_PARENT_PATH}/boot" 32 | 33 | # Download and make pg_hint_plan. 34 | # We need -L to follow links. 35 | curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz 36 | tar -xzf REL15_1_5_1.tar.gz 37 | rm REL15_1_5_1.tar.gz 38 | cd ./pg_hint_plan-REL15_1_5_1 39 | PATH="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin:$PATH" make 40 | PATH="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin:$PATH" make install 41 | cp ./pg_hint_plan.so ${REPO_REAL_PARENT_PATH}/boot/build/postgres/lib 42 | -------------------------------------------------------------------------------- /dbms/postgres/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata). 3 | """ 4 | 5 | import logging 6 | import shutil 7 | import subprocess 8 | from pathlib import Path 9 | from typing import Any, Optional 10 | 11 | import click 12 | import sqlalchemy 13 | from gymlib.infra_paths import ( 14 | get_dbdata_tgz_symlink_path, 15 | get_pgbin_symlink_path, 16 | get_repo_symlink_path, 17 | ) 18 | from gymlib.pg import create_sqlalchemy_conn, sql_file_execute 19 | from gymlib.workspace import ( 20 | WORKSPACE_PATH_PLACEHOLDER, 21 | DBGymWorkspace, 22 | fully_resolve_path, 23 | get_tmp_path_from_workspace_path, 24 | is_fully_resolved, 25 | is_ssd, 26 | linkname_to_name, 27 | ) 28 | from sqlalchemy import text 29 | 30 | from benchmark.constants import DEFAULT_SCALE_FACTOR 31 | from benchmark.job.load_info import JobLoadInfo 32 | from benchmark.tpch.load_info import TpchLoadInfo 33 | from dbms.load_info_base_class import LoadInfoBaseClass 34 | from util.shell import subprocess_run 35 | 36 | DBGYM_POSTGRES_USER = "dbgym_user" 37 | DBGYM_POSTGRES_PASS = "dbgym_pass" 38 | DBGYM_POSTGRES_DBNAME = "dbgym" 39 | DEFAULT_POSTGRES_DBNAME = "postgres" 40 | DEFAULT_POSTGRES_PORT = 5432 41 | SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" 42 | 43 | 44 | @click.group(name="postgres") 45 | @click.pass_obj 46 | def postgres_group(dbgym_workspace: DBGymWorkspace) -> None: 47 | pass 48 | 49 | 50 | @postgres_group.command( 51 | name="build", 52 | help="Download and build the Postgres repository and all necessary extensions/shared libraries. 
Does not create dbdata.", 53 | ) 54 | @click.pass_obj 55 | @click.option( 56 | "--rebuild", 57 | is_flag=True, 58 | help="Include this flag to rebuild Postgres even if it already exists.", 59 | ) 60 | def postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: 61 | _postgres_build(dbgym_workspace, rebuild) 62 | 63 | 64 | def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: 65 | """ 66 | This function exists as a hook for integration tests. 67 | """ 68 | expected_repo_symlink_path = get_repo_symlink_path( 69 | dbgym_workspace.dbgym_workspace_path 70 | ) 71 | if not rebuild and expected_repo_symlink_path.exists(): 72 | logging.info(f"Skipping _postgres_build: {expected_repo_symlink_path}") 73 | return 74 | 75 | logging.info(f"Setting up repo in {expected_repo_symlink_path}") 76 | repo_real_path = dbgym_workspace.dbgym_this_run_path / "repo" 77 | repo_real_path.mkdir(parents=False, exist_ok=False) 78 | subprocess_run( 79 | f"./_build_repo.sh {repo_real_path}", 80 | cwd=dbgym_workspace.base_dbgym_repo_path / "dbms" / "postgres", 81 | ) 82 | 83 | # only link at the end so that the link only ever points to a complete repo 84 | repo_symlink_path = dbgym_workspace.link_result(repo_real_path) 85 | assert expected_repo_symlink_path.samefile(repo_symlink_path) 86 | logging.info(f"Set up repo in {expected_repo_symlink_path}") 87 | 88 | 89 | @postgres_group.command( 90 | name="dbdata", 91 | help="Build a .tgz file of dbdata with various specifications for its contents.", 92 | ) 93 | @click.pass_obj 94 | @click.argument("benchmark_name", type=str) 95 | @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) 96 | @click.option( 97 | "--pgbin-path", 98 | type=Path, 99 | default=None, 100 | help=f"The path to the bin containing Postgres executables. The default is {get_pgbin_symlink_path(WORKSPACE_PATH_PLACEHOLDER)}.", 101 | ) 102 | @click.option( 103 | "--intended-dbdata-hardware", 104 | type=click.Choice(["hdd", "ssd"]), 105 | default="hdd", 106 | help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-path.", 107 | ) 108 | @click.option( 109 | "--dbdata-parent-path", 110 | default=None, 111 | type=Path, 112 | help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {get_tmp_path_from_workspace_path(WORKSPACE_PATH_PLACEHOLDER)}.", 113 | ) 114 | def postgres_dbdata( 115 | dbgym_workspace: DBGymWorkspace, 116 | benchmark_name: str, 117 | scale_factor: float, 118 | pgbin_path: Optional[Path], 119 | intended_dbdata_hardware: str, 120 | dbdata_parent_path: Optional[Path], 121 | ) -> None: 122 | _postgres_dbdata( 123 | dbgym_workspace, 124 | benchmark_name, 125 | scale_factor, 126 | pgbin_path, 127 | intended_dbdata_hardware, 128 | dbdata_parent_path, 129 | ) 130 | 131 | 132 | def _postgres_dbdata( 133 | dbgym_workspace: DBGymWorkspace, 134 | benchmark_name: str, 135 | scale_factor: float, 136 | pgbin_path: Optional[Path], 137 | intended_dbdata_hardware: str, 138 | dbdata_parent_path: Optional[Path], 139 | ) -> None: 140 | """ 141 | This function exists as a hook for integration tests. 
142 | """ 143 | # Set args to defaults programmatically (do this before doing anything else in the function) 144 | if pgbin_path is None: 145 | pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path) 146 | if dbdata_parent_path is None: 147 | dbdata_parent_path = get_tmp_path_from_workspace_path( 148 | dbgym_workspace.dbgym_workspace_path 149 | ) 150 | 151 | # Fully resolve all input paths. 152 | pgbin_path = fully_resolve_path(pgbin_path) 153 | dbdata_parent_path = fully_resolve_path(dbdata_parent_path) 154 | 155 | # Check assertions on args 156 | if intended_dbdata_hardware == "hdd": 157 | assert not is_ssd( 158 | dbdata_parent_path 159 | ), f"Intended hardware is HDD but dbdata_parent_path ({dbdata_parent_path}) is an SSD" 160 | elif intended_dbdata_hardware == "ssd": 161 | assert is_ssd( 162 | dbdata_parent_path 163 | ), f"Intended hardware is SSD but dbdata_parent_path ({dbdata_parent_path}) is an HDD" 164 | else: 165 | assert ( 166 | False 167 | ), f'Intended hardware is "{intended_dbdata_hardware}" which is invalid' 168 | 169 | # Create dbdata 170 | _create_dbdata( 171 | dbgym_workspace, benchmark_name, scale_factor, pgbin_path, dbdata_parent_path 172 | ) 173 | 174 | 175 | def _create_dbdata( 176 | dbgym_workspace: DBGymWorkspace, 177 | benchmark_name: str, 178 | scale_factor: float, 179 | pgbin_path: Path, 180 | dbdata_parent_path: Path, 181 | ) -> None: 182 | """ 183 | If you change the code of _create_dbdata(), you should also delete the symlink so that the next time you run 184 | `dbms postgres dbdata` it will re-create the dbdata. 185 | """ 186 | expected_dbdata_tgz_symlink_path = get_dbdata_tgz_symlink_path( 187 | dbgym_workspace.dbgym_workspace_path, 188 | benchmark_name, 189 | scale_factor, 190 | ) 191 | if expected_dbdata_tgz_symlink_path.exists(): 192 | logging.info(f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}") 193 | return 194 | 195 | # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. 196 | dbdata_path = dbdata_parent_path / "dbdata_being_created" 197 | # We might be reusing the same dbdata_parent_path, so delete dbdata_path if it already exists 198 | if dbdata_path.exists(): 199 | shutil.rmtree(dbdata_path) 200 | 201 | # Call initdb. 202 | # Save any script we call from pgbin_symlink_path because they are dependencies generated from another task run. 203 | dbgym_workspace.save_file(pgbin_path / "initdb") 204 | subprocess_run(f'./initdb -D "{dbdata_path}"', cwd=pgbin_path) 205 | 206 | # Start Postgres (all other dbdata setup requires postgres to be started). 207 | # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead. 208 | start_postgres(dbgym_workspace, pgbin_path, dbdata_path) 209 | 210 | # Set up Postgres. 211 | _generic_dbdata_setup(dbgym_workspace) 212 | _load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor) 213 | 214 | # Stop Postgres so that we don't "leak" processes. 215 | stop_postgres(dbgym_workspace, pgbin_path, dbdata_path) 216 | 217 | # Create .tgz file. 218 | dbdata_tgz_real_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( 219 | expected_dbdata_tgz_symlink_path.name 220 | ) 221 | # We need to cd into dbdata_path so that the tar file does not contain folders for the whole path of dbdata_path. 222 | subprocess_run(f"tar -czf {dbdata_tgz_real_path} .", cwd=dbdata_path) 223 | 224 | # Create symlink. 
225 | # Only link at the end so that the link only ever points to a complete dbdata. 226 | dbdata_tgz_symlink_path = dbgym_workspace.link_result(dbdata_tgz_real_path) 227 | assert expected_dbdata_tgz_symlink_path.samefile(dbdata_tgz_symlink_path) 228 | logging.info(f"Created dbdata in {dbdata_tgz_symlink_path}") 229 | 230 | 231 | def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: 232 | # get necessary vars 233 | pgbin_real_path = get_pgbin_symlink_path( 234 | dbgym_workspace.dbgym_workspace_path 235 | ).resolve() 236 | assert pgbin_real_path.exists() 237 | dbgym_pguser = DBGYM_POSTGRES_USER 238 | dbgym_pgpass = DBGYM_POSTGRES_PASS 239 | pgport = DEFAULT_POSTGRES_PORT 240 | 241 | # Create user 242 | dbgym_workspace.save_file(pgbin_real_path / "psql") 243 | subprocess_run( 244 | f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", 245 | cwd=pgbin_real_path, 246 | ) 247 | subprocess_run( 248 | f'./psql -c "grant pg_monitor to {dbgym_pguser}" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', 249 | cwd=pgbin_real_path, 250 | ) 251 | 252 | # Load shared preload libraries 253 | if SHARED_PRELOAD_LIBRARIES: 254 | subprocess_run( 255 | # You have to use TO and you can't put single quotes around the libraries (https://postgrespro.com/list/thread-id/2580120) 256 | # The method I wrote here works for both one library and multiple libraries 257 | f'./psql -c "ALTER SYSTEM SET shared_preload_libraries TO {SHARED_PRELOAD_LIBRARIES};" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', 258 | cwd=pgbin_real_path, 259 | ) 260 | 261 | # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database 262 | # as opposed to using databases named after the benchmark. 263 | subprocess_run( 264 | f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", 265 | cwd=pgbin_real_path, 266 | ) 267 | 268 | 269 | def _load_benchmark_into_dbdata( 270 | dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float 271 | ) -> None: 272 | load_info: LoadInfoBaseClass 273 | 274 | with create_sqlalchemy_conn() as conn: 275 | if benchmark_name == "tpch": 276 | load_info = TpchLoadInfo(dbgym_workspace, scale_factor) 277 | elif benchmark_name == "job": 278 | load_info = JobLoadInfo(dbgym_workspace) 279 | else: 280 | raise AssertionError( 281 | f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented" 282 | ) 283 | 284 | _load_into_dbdata(dbgym_workspace, conn, load_info) 285 | 286 | 287 | def _load_into_dbdata( 288 | dbgym_workspace: DBGymWorkspace, 289 | conn: sqlalchemy.Connection, 290 | load_info: LoadInfoBaseClass, 291 | ) -> None: 292 | sql_file_execute(dbgym_workspace, conn, load_info.get_schema_path()) 293 | 294 | # Truncate all tables first before even loading a single one. 295 | for table, _ in load_info.get_tables_and_paths(): 296 | sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") 297 | # Then, load the tables. 
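    # (For TPC-H's lineitem, for instance, the statement below expands to roughly `COPY lineitem FROM STDIN CSV DELIMITER '|' ESCAPE '\\'`; the real delimiter comes from load_info.get_table_file_delimiter(), so '|' is only illustrative.)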
298 | for table, table_path in load_info.get_tables_and_paths(): 299 | with dbgym_workspace.open_and_save(table_path, "r") as table_csv: 300 | assert conn.connection.dbapi_connection is not None 301 | cur = conn.connection.dbapi_connection.cursor() 302 | try: 303 | with cur.copy( 304 | f"COPY {table} FROM STDIN CSV DELIMITER '{load_info.get_table_file_delimiter()}' ESCAPE '\\'" 305 | ) as copy: 306 | while data := table_csv.read(8192): 307 | copy.write(data) 308 | finally: 309 | cur.close() 310 | 311 | constraints_path = load_info.get_constraints_path() 312 | if constraints_path is not None: 313 | sql_file_execute(dbgym_workspace, conn, constraints_path) 314 | 315 | 316 | # The start and stop functions slightly duplicate functionality from pg_conn.py. However, I chose to do it this way 317 | # because what the `dbms` CLI needs in terms of starting and stopping Postgres is much simpler than what an agent 318 | # that is tuning the database needs. Because these functions are so simple, I think it's okay to leave them here 319 | # even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the 320 | # tuning environment. 321 | def start_postgres( 322 | dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path 323 | ) -> None: 324 | _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, True) 325 | 326 | 327 | def stop_postgres( 328 | dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path 329 | ) -> None: 330 | _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, False) 331 | 332 | 333 | def _start_or_stop_postgres( 334 | dbgym_workspace: DBGymWorkspace, 335 | pgbin_path: Path, 336 | dbdata_path: Path, 337 | is_start: bool, 338 | ) -> None: 339 | # They should be absolute paths and should exist 340 | assert is_fully_resolved(pgbin_path) 341 | assert is_fully_resolved(dbdata_path) 342 | pgport = DEFAULT_POSTGRES_PORT 343 | dbgym_workspace.save_file(pgbin_path / "pg_ctl") 344 | 345 | if is_start: 346 | # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". 347 | # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None. 348 | # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do. 
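    # (With the default port, the spawned command looks like `./pg_ctl -D "<dbdata_path>" -o '-p 5432' start`, mirroring the stop command further below.)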
349 | result = subprocess.run( 350 | f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' start", 351 | cwd=pgbin_path, 352 | shell=True, 353 | ) 354 | result.check_returncode() 355 | else: 356 | subprocess_run( 357 | f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' stop", 358 | cwd=pgbin_path, 359 | ) 360 | 361 | 362 | def sqlalchemy_conn_execute( 363 | conn: sqlalchemy.Connection, sql: str 364 | ) -> sqlalchemy.engine.CursorResult[Any]: 365 | return conn.execute(text(sql)) 366 | -------------------------------------------------------------------------------- /dbms/postgres/default_boot_config.yaml: -------------------------------------------------------------------------------- 1 | # Macro accelerator 2 | intelligent_cache: true 3 | 4 | # Micro accelerator 5 | early_stop: true 6 | seq_sample: true 7 | seq_sample_pct: 50 8 | seq_sample_seed: 15721 9 | mu_hyp_opt: 0.01 10 | mu_hyp_time: 100000 11 | mu_hyp_stdev: 1.0 -------------------------------------------------------------------------------- /dbms/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/dbms/tests/__init__.py -------------------------------------------------------------------------------- /dbms/tests/dbms_integtest_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_dbms_integtest_workspace/ 2 | -------------------------------------------------------------------------------- /dbms/tests/integtest_dbms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | from gymlib.infra_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path 7 | from gymlib.workspace import ( 8 | DBGymWorkspace, 9 | fully_resolve_path, 10 | get_workspace_path_from_config, 11 | ) 12 | 13 | from benchmark.tpch.cli import _tpch_tables 14 | from dbms.postgres.cli import _postgres_build, _postgres_dbdata 15 | 16 | 17 | class DBMSTests(unittest.TestCase): 18 | DBGYM_CONFIG_PATH = Path("dbms/tests/dbms_integtest_dbgym_config.yaml") 19 | 20 | def setUp(self) -> None: 21 | workspace_path = get_workspace_path_from_config(DBMSTests.DBGYM_CONFIG_PATH) 22 | # Get a clean start each time. 23 | if workspace_path.exists(): 24 | shutil.rmtree(workspace_path) 25 | 26 | # Reset this to avoid the error of it being created twice. 27 | # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 28 | DBGymWorkspace._num_times_created_this_run = 0 29 | self.workspace = DBGymWorkspace(workspace_path) 30 | 31 | def tearDown(self) -> None: 32 | if self.workspace.dbgym_workspace_path.exists(): 33 | shutil.rmtree(self.workspace.dbgym_workspace_path) 34 | 35 | def test_postgres_build(self) -> None: 36 | repo_path = get_repo_symlink_path(self.workspace.dbgym_workspace_path) 37 | self.assertFalse(repo_path.exists()) 38 | _postgres_build(self.workspace, False) 39 | self.assertTrue(repo_path.exists()) 40 | self.assertTrue(fully_resolve_path(repo_path).exists()) 41 | 42 | def test_postgres_dbdata(self) -> None: 43 | # Setup 44 | # Make sure to recreate self.workspace so that each function call counts as its own run. 
45 | scale_factor = 0.01 46 | _postgres_build(self.workspace, False) 47 | DBGymWorkspace._num_times_created_this_run = 0 48 | self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) 49 | _tpch_tables(self.workspace, scale_factor) 50 | DBGymWorkspace._num_times_created_this_run = 0 51 | self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) 52 | 53 | # Test 54 | dbdata_tgz_path = get_dbdata_tgz_symlink_path( 55 | self.workspace.dbgym_workspace_path, "tpch", scale_factor 56 | ) 57 | self.assertFalse(dbdata_tgz_path.exists()) 58 | intended_dbdata_hardware = os.environ.get("INTENDED_DBDATA_HARDWARE", "hdd") 59 | _postgres_dbdata( 60 | self.workspace, "tpch", scale_factor, None, intended_dbdata_hardware, None 61 | ) 62 | self.assertTrue(dbdata_tgz_path.exists()) 63 | self.assertTrue(fully_resolve_path(dbdata_tgz_path).exists()) 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /gymlib_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/gymlib_package/__init__.py -------------------------------------------------------------------------------- /gymlib_package/gymlib/__init__.py: -------------------------------------------------------------------------------- 1 | from . import infra_paths, workspace 2 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/infra_paths.py: -------------------------------------------------------------------------------- 1 | """ 2 | "Infra" refers to benchmark/ and dbms/. These are all the paths used to access the files created by benchmark/ and dbms/. 3 | They're inside gymlib because agents will need to access them. 
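As an illustrative example, get_dbdata_tgz_symlink_path(workspace_path, "tpch", 1) points at a link named tpch_sf1_pristine_dbdata.tgz.link inside the workspace's symlink directory for this app; the exact directory names come from the constants imported from gymlib.workspace, and the .link suffix assumes name_to_linkname() follows the same convention as the hard-coded "repo.link" below.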
4 | """ 5 | 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from gymlib.workspace import DBGYM_APP_NAME, SYMLINKS_DNAME, name_to_linkname 10 | 11 | SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" 12 | BENCHMARK_NAME_PLACEHOLDER: str = "[benchmark_name]" 13 | WORKLOAD_NAME_PLACEHOLDER: str = "[workload_name]" 14 | 15 | 16 | def get_scale_factor_string(scale_factor: float | str) -> str: 17 | if type(scale_factor) is str and scale_factor == SCALE_FACTOR_PLACEHOLDER: 18 | return scale_factor 19 | else: 20 | if float(int(scale_factor)) == scale_factor: 21 | return str(int(scale_factor)) 22 | else: 23 | return str(scale_factor).replace(".", "point") 24 | 25 | 26 | def get_tables_dirname(benchmark: str, scale_factor: float | str) -> str: 27 | return f"tables_{benchmark}_sf{get_scale_factor_string(scale_factor)}" 28 | 29 | 30 | def get_workload_suffix(benchmark: str, **kwargs: Any) -> str: 31 | if benchmark == "tpch": 32 | assert kwargs.keys() == {"seed_start", "seed_end", "query_subset"} 33 | return f"{kwargs['seed_start']}_{kwargs['seed_end']}_{kwargs['query_subset']}" 34 | elif benchmark == "job": 35 | assert kwargs.keys() == {"query_subset"} 36 | return f"{kwargs['query_subset']}" 37 | else: 38 | assert False 39 | 40 | 41 | def get_workload_dirname(benchmark: str, scale_factor: float | str, suffix: str) -> str: 42 | return f"workload_{benchmark}_sf{get_scale_factor_string(scale_factor)}_{suffix}" 43 | 44 | 45 | def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> str: 46 | return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" 47 | 48 | 49 | def get_tables_symlink_path( 50 | workspace_path: Path, benchmark: str, scale_factor: float | str 51 | ) -> Path: 52 | return ( 53 | workspace_path 54 | / SYMLINKS_DNAME 55 | / DBGYM_APP_NAME 56 | / name_to_linkname(get_tables_dirname(benchmark, scale_factor)) 57 | ) 58 | 59 | 60 | def get_workload_symlink_path( 61 | workspace_path: Path, benchmark: str, scale_factor: float | str, suffix: str 62 | ) -> Path: 63 | return ( 64 | workspace_path 65 | / SYMLINKS_DNAME 66 | / DBGYM_APP_NAME 67 | / name_to_linkname(get_workload_dirname(benchmark, scale_factor, suffix)) 68 | ) 69 | 70 | 71 | def get_repo_symlink_path(workspace_path: Path) -> Path: 72 | return workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME / "repo.link" 73 | 74 | 75 | def get_pgbin_symlink_path(workspace_path: Path) -> Path: 76 | return get_repo_symlink_path(workspace_path) / "boot" / "build" / "postgres" / "bin" 77 | 78 | 79 | def get_dbdata_tgz_symlink_path( 80 | workspace_path: Path, benchmark_name: str, scale_factor: float | str 81 | ) -> Path: 82 | return ( 83 | workspace_path 84 | / SYMLINKS_DNAME 85 | / DBGYM_APP_NAME 86 | / name_to_linkname(get_dbdata_tgz_filename(benchmark_name, scale_factor)) 87 | ) 88 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/pg.py: -------------------------------------------------------------------------------- 1 | """ 2 | There are multiple parts of the codebase which interact with Postgres. This file contains helpers common to all those parts. 
3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import pglast 9 | import psutil 10 | import psycopg 11 | import sqlalchemy 12 | from gymlib.workspace import DBGymWorkspace 13 | from sqlalchemy import create_engine, text 14 | 15 | DBGYM_POSTGRES_USER = "dbgym_user" 16 | DBGYM_POSTGRES_PASS = "dbgym_pass" 17 | DBGYM_POSTGRES_DBNAME = "dbgym" 18 | DEFAULT_POSTGRES_DBNAME = "postgres" 19 | DEFAULT_POSTGRES_PORT = 5432 20 | SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" 21 | 22 | 23 | def sqlalchemy_conn_execute( 24 | conn: sqlalchemy.Connection, sql: str 25 | ) -> sqlalchemy.engine.CursorResult[Any]: 26 | return conn.execute(text(sql)) 27 | 28 | 29 | def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: 30 | with dbgym_workspace.open_and_save(filepath) as f: 31 | lines: list[str] = [] 32 | for line in f: 33 | if line.startswith("--"): 34 | continue 35 | if len(line.strip()) == 0: 36 | continue 37 | lines.append(line) 38 | queries_str = "".join(lines) 39 | queries: list[str] = pglast.split(queries_str) 40 | return queries 41 | 42 | 43 | def sql_file_execute( 44 | dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path 45 | ) -> None: 46 | for sql in sql_file_queries(dbgym_workspace, filepath): 47 | sqlalchemy_conn_execute(conn, sql) 48 | 49 | 50 | # The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres 51 | # at the same time. In this situation, they need to have different ports 52 | def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg: bool = True) -> str: 53 | connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" 54 | # use_psycopg means whether or not we use the psycopg.connect() function 55 | # counterintuively, you *don't* need psycopg in the connection string if you *are* 56 | # using the psycopg.connect() function 57 | connstr_prefix = "postgresql" if use_psycopg else "postgresql+psycopg" 58 | return connstr_prefix + "://" + connstr_suffix 59 | 60 | 61 | def get_kv_connstr(pgport: int = DEFAULT_POSTGRES_PORT) -> str: 62 | return f"host=localhost port={pgport} user={DBGYM_POSTGRES_USER} password={DBGYM_POSTGRES_PASS} dbname={DBGYM_POSTGRES_DBNAME}" 63 | 64 | 65 | def create_psycopg_conn(pgport: int = DEFAULT_POSTGRES_PORT) -> psycopg.Connection[Any]: 66 | connstr = get_connstr(use_psycopg=True, pgport=pgport) 67 | psycopg_conn = psycopg.connect(connstr, autocommit=True, prepare_threshold=None) 68 | return psycopg_conn 69 | 70 | 71 | def create_sqlalchemy_conn( 72 | pgport: int = DEFAULT_POSTGRES_PORT, 73 | ) -> sqlalchemy.Connection: 74 | connstr = get_connstr(use_psycopg=False, pgport=pgport) 75 | engine: sqlalchemy.Engine = create_engine( 76 | connstr, 77 | execution_options={"isolation_level": "AUTOCOMMIT"}, 78 | ) 79 | return engine.connect() 80 | 81 | 82 | def get_is_postgres_running() -> bool: 83 | """ 84 | This is often used in assertions to ensure that Postgres isn't running before we 85 | execute some code. 86 | 87 | I intentionally do not have a function that forcefully *stops* all Postgres instances. 88 | This is risky because it could accidentally stop instances it wasn't supposed (e.g. 89 | Postgres instances run by other users on the same machine). 90 | 91 | Stopping Postgres instances is thus a responsibility of the human to take care of. 
92 | """ 93 | return len(get_running_postgres_ports()) > 0 94 | 95 | 96 | def get_running_postgres_ports() -> list[int]: 97 | """ 98 | Returns a list of all ports on which Postgres is currently running. 99 | 100 | There are ways to check with psycopg/sqlalchemy. However, I chose to check using 101 | psutil to keep it as simple as possible and orthogonal to how connections work. 102 | """ 103 | running_ports = [] 104 | 105 | for conn in psutil.net_connections(kind="inet"): 106 | if conn.status == "LISTEN": 107 | try: 108 | proc = psutil.Process(conn.pid) 109 | if proc.name() == "postgres": 110 | running_ports.append(conn.laddr.port) 111 | except (psutil.NoSuchProcess, psutil.AccessDenied): 112 | continue 113 | 114 | return running_ports 115 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/pg_conn.py: -------------------------------------------------------------------------------- 1 | """ 2 | At a high level, this file's goal is to provide helpers to manage a Postgres instance during 3 | agent tuning. 4 | 5 | On the other hand, the goal of dbms.postgres.cli is to (1) install+build postgres and (2) 6 | create dbdata. 7 | 8 | util.pg provides helpers used by *both* of the above files (as well as other files). 9 | """ 10 | 11 | import logging 12 | import os 13 | import shutil 14 | import threading 15 | import time 16 | from pathlib import Path 17 | from typing import Any, Optional, Union 18 | 19 | import psutil 20 | import psycopg 21 | import yaml 22 | from gymlib.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr 23 | from gymlib.workspace import DBGymWorkspace, parent_path_of_path 24 | from plumbum import local 25 | from psycopg.errors import ProgramLimitExceeded, QueryCanceled 26 | 27 | CONNECT_TIMEOUT = 300 28 | 29 | 30 | class PostgresConn: 31 | # The reason that PostgresConn takes in all these paths (e.g. `pgbin_path`) instead of inferring them 32 | # automatically from the default workspace paths is so that it's fully decoupled from how the files 33 | # are organized in the workspace. 34 | def __init__( 35 | self, 36 | dbgym_workspace: DBGymWorkspace, 37 | pgport: int, 38 | pristine_dbdata_snapshot_path: Path, 39 | dbdata_parent_path: Path, 40 | pgbin_path: Union[str, Path], 41 | # Whether this is None determines whether Boot is enabled. 42 | boot_config_path: Optional[Path], 43 | ) -> None: 44 | 45 | self.dbgym_workspace = dbgym_workspace 46 | self.pgport = pgport 47 | self.pgbin_path = pgbin_path 48 | self.boot_config_path = boot_config_path 49 | self.log_step = 0 50 | 51 | # All the paths related to dbdata 52 | # pristine_dbdata_snapshot_path is the .tgz snapshot that represents the starting state 53 | # of the database (with the default configuration). It is generated by a call to 54 | # `python tune.py dbms postgres ...` and should not be overwritten. 55 | self.pristine_dbdata_snapshot_path = pristine_dbdata_snapshot_path 56 | # checkpoint_dbdata_snapshot_path is the .tgz snapshot that represents the current 57 | # state of the database as it is being tuned. It is generated while tuning and is 58 | # discarded once tuning is completed. 59 | self.checkpoint_dbdata_snapshot_path = ( 60 | dbgym_workspace.dbgym_tmp_path / "checkpoint_dbdata.tgz" 61 | ) 62 | # dbdata_parent_path is the parent directory of the dbdata that is *actively being tuned*. 63 | # It is *not* the parent directory of pristine_dbdata_snapshot_path. 64 | # Setting this lets us control the hardware device dbdata is built on (e.g. HDD vs. 
SSD). 65 | self.dbdata_parent_path = dbdata_parent_path 66 | # dbdata_path is the dbdata that is *actively being tuned* 67 | self.dbdata_path = self.dbdata_parent_path / f"dbdata{self.pgport}" 68 | 69 | self._conn: Optional[psycopg.Connection[Any]] = None 70 | self.hint_check_failed_with: Optional[str] = None 71 | 72 | def get_kv_connstr(self) -> str: 73 | return get_kv_connstr(self.pgport) 74 | 75 | def conn(self) -> psycopg.Connection[Any]: 76 | if self._conn is None: 77 | self._conn = psycopg.connect( 78 | self.get_kv_connstr(), autocommit=True, prepare_threshold=None 79 | ) 80 | 81 | def hint_check_notice_handler(notice: psycopg.errors.Diagnostic) -> None: 82 | """ 83 | Custom handler for raising errors if hints fail. 84 | """ 85 | if ( 86 | notice.message_detail is not None 87 | and "hint" in notice.message_detail.lower() 88 | ): 89 | self.hint_check_failed_with = notice.message_detail 90 | 91 | # We add the notice handler when the _conn is created instead of before executing a 92 | # query to avoid adding it more than once. 93 | self._conn.add_notice_handler(hint_check_notice_handler) 94 | 95 | return self._conn 96 | 97 | def disconnect(self) -> None: 98 | if self._conn is not None: 99 | self._conn.close() 100 | self._conn = None 101 | 102 | def move_log(self) -> None: 103 | pglog_path = self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log" 104 | pglog_this_step_path = ( 105 | self.dbgym_workspace.dbgym_this_run_path 106 | / f"pg{self.pgport}.log.{self.log_step}" 107 | ) 108 | if pglog_path.exists(): 109 | shutil.move(pglog_path, pglog_this_step_path) 110 | self.log_step += 1 111 | 112 | def force_statement_timeout(self, timeout: float) -> None: 113 | timeout_ms = timeout * 1000 114 | retry = True 115 | while retry: 116 | retry = False 117 | try: 118 | self.conn().execute(f"SET statement_timeout = {timeout_ms}") 119 | except QueryCanceled: 120 | retry = True 121 | 122 | def time_query( 123 | self, 124 | query: str, 125 | query_knobs: list[str] = [], 126 | add_explain: bool = False, 127 | timeout: float = 0, 128 | ) -> tuple[float, bool, Optional[dict[str, Any]]]: 129 | """ 130 | It returns the runtime in microseconds, whether the query timed out, and the explain data if add_explain is True. 131 | 132 | If the query timed out, it won't have any explain data and thus explain_data will be None. Its runtime will be 133 | the timeout value. 134 | 135 | Run a query with a timeout (in seconds). Following Postgres's convention, timeout=0 indicates "disable timeout". 136 | 137 | Use query_knobs to pass query knobs. An example input is query_knobs=["SET (enable_sort on)", "IndexOnlyScan(it)"]. 138 | 139 | If you write explain in the query manually instead of setting add_explain, it won't return explain_data. This 140 | is because it won't know the format of the explain data. 141 | """ 142 | if timeout > 0: 143 | self.force_statement_timeout(timeout) 144 | else: 145 | assert ( 146 | timeout == 0 147 | ), f'Setting timeout to 0 indicates "disable timeout". However, setting timeout ({timeout}) < 0 is a bug.' 148 | 149 | did_time_out = False 150 | explain_data = None 151 | 152 | try: 153 | if query_knobs: 154 | query = f"/*+ {' '.join(query_knobs)} */ {query}" 155 | 156 | if add_explain: 157 | assert ( 158 | "explain" not in query.lower() 159 | ), "If you're using add_explain, don't also write explain manually in the query." 
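                # (Illustration: with query_knobs=["HashJoin(lineitem orders)"] and add_explain=True, the SQL sent below becomes `explain (analyze, format json, timing off) /*+ HashJoin(lineitem orders) */ <query>`; that hint is borrowed from the integration tests and is only an example.)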
160 | query = f"explain (analyze, format json, timing off) {query}" 161 | 162 | # Reset this every time before calling execute() so that hint_check_notice_handler works correctly. 163 | self.hint_check_failed_with = None 164 | 165 | start_time = time.time() 166 | cursor = self.conn().execute(query) 167 | qid_runtime = (time.time() - start_time) * 1e6 168 | 169 | if self.hint_check_failed_with is not None: 170 | raise RuntimeError(f"Query hint failed: {self.hint_check_failed_with}") 171 | 172 | if add_explain: 173 | c = [c for c in cursor][0][0][0] 174 | assert "Execution Time" in c 175 | qid_runtime = float(c["Execution Time"]) * 1e3 176 | explain_data = c 177 | 178 | logging.debug(f"{query} evaluated in {qid_runtime/1e6}") 179 | 180 | except QueryCanceled: 181 | logging.debug(f"{query} exceeded evaluation timeout {timeout}") 182 | qid_runtime = timeout * 1e6 183 | did_time_out = True 184 | except Exception as e: 185 | raise e 186 | finally: 187 | # Wipe the statement timeout. 188 | self.force_statement_timeout(0) 189 | 190 | # qid_runtime is in microseconds. 191 | return qid_runtime, did_time_out, explain_data 192 | 193 | def shutdown_postgres(self) -> None: 194 | """Shuts down postgres.""" 195 | self.disconnect() 196 | if not Path(self.dbdata_path).exists(): 197 | return 198 | 199 | while True: 200 | logging.debug("Shutting down postgres...") 201 | _, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ 202 | "stop", "--wait", "-t", "180", "-D", self.dbdata_path 203 | ].run(retcode=None) 204 | time.sleep(1) 205 | logging.debug("Stop message: (%s, %s)", stdout, stderr) 206 | 207 | # Wait until pg_isready fails. 208 | retcode, _, _ = local[f"{self.pgbin_path}/pg_isready"][ 209 | "--host", 210 | "localhost", 211 | "--port", 212 | str(self.pgport), 213 | "--dbname", 214 | DBGYM_POSTGRES_DBNAME, 215 | ].run(retcode=None) 216 | 217 | exists = (Path(self.dbdata_path) / "postmaster.pid").exists() 218 | if not exists and retcode != 0: 219 | break 220 | 221 | def restart_postgres(self) -> bool: 222 | # TODO: check if we still get the shared preload libraries correctly if we do None 223 | return self.restart_with_changes(conf_changes=None) 224 | 225 | def restart_with_changes( 226 | self, 227 | conf_changes: Optional[dict[str, str]], 228 | dump_page_cache: bool = False, 229 | save_checkpoint: bool = False, 230 | ) -> bool: 231 | """ 232 | This function is called "(re)start" because it also shuts down Postgres before starting it. 233 | This function assumes that some snapshot has already been untarred into self.dbdata_path. 234 | You can do this by calling one of the wrappers around _restore_snapshot(). 235 | 236 | Note that multiple calls are not "additive". Calling this will restart from the latest saved 237 | snapshot. If you want it to be additive without the overhead of saving a snapshot, pass in 238 | multiple changes to `conf_changes`. 239 | """ 240 | # Install the new configuration changes. 241 | if conf_changes is not None: 242 | dbdata_auto_conf_path = self.dbdata_path / "postgresql.auto.conf" 243 | with open(dbdata_auto_conf_path, "w") as f: 244 | f.write( 245 | "\n".join([f"{knob} = {val}" for knob, val in conf_changes.items()]) 246 | + "\n" 247 | ) 248 | 249 | assert ( 250 | "shared_preload_libraries" not in conf_changes 251 | ), f"You should not set shared_preload_libraries manually." 252 | 253 | # Using single quotes around SHARED_PRELOAD_LIBRARIES works for both single or multiple libraries. 
254 | f.write(f"shared_preload_libraries = '{SHARED_PRELOAD_LIBRARIES}'") 255 | 256 | # Start postgres instance. 257 | self.shutdown_postgres() 258 | self.move_log() 259 | 260 | if save_checkpoint: 261 | local["tar"][ 262 | "cf", 263 | # We append .tmp so that if we fail in the *middle* of running tar, we 264 | # still have the previous checkpoint available to us 265 | f"{self.checkpoint_dbdata_snapshot_path}.tmp", 266 | "-C", 267 | parent_path_of_path(self.dbdata_path), 268 | self.dbdata_path, 269 | ].run() 270 | 271 | # Make sure the PID lock file doesn't exist. 272 | pid_lock = Path(f"{self.dbdata_path}/postmaster.pid") 273 | assert not pid_lock.exists() 274 | 275 | if dump_page_cache: 276 | # Dump the OS page cache. 277 | os.system('sudo sh -c "sync; echo 3 > /proc/sys/vm/drop_caches"') 278 | 279 | attempts = 0 280 | while not pid_lock.exists(): 281 | # Try starting up. 282 | retcode, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ 283 | "-D", 284 | self.dbdata_path, 285 | "--wait", 286 | "-t", 287 | "180", 288 | "-l", 289 | # We log to pg{self.pgport}.log instead of pg.log so that different PostgresConn objects 290 | # don't all try to write to the same file. 291 | self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log", 292 | "start", 293 | ].run(retcode=None) 294 | 295 | if retcode == 0 or pid_lock.exists(): 296 | break 297 | 298 | logging.warning("startup encountered: (%s, %s)", stdout, stderr) 299 | attempts += 1 300 | if attempts >= 5: 301 | logging.error( 302 | "Number of attempts to start postgres has exceeded limit." 303 | ) 304 | assert False, "Could not start postgres." 305 | 306 | # Wait until postgres is ready to accept connections. 307 | num_cycles = 0 308 | while True: 309 | if num_cycles >= CONNECT_TIMEOUT: 310 | # In this case, we've failed to start postgres. 311 | logging.error("Failed to start postgres before timeout...") 312 | return False 313 | 314 | retcode, _, _ = local[f"{self.pgbin_path}/pg_isready"][ 315 | "--host", 316 | "localhost", 317 | "--port", 318 | str(self.pgport), 319 | "--dbname", 320 | DBGYM_POSTGRES_DBNAME, 321 | ].run(retcode=None) 322 | if retcode == 0: 323 | break 324 | 325 | time.sleep(1) 326 | num_cycles += 1 327 | logging.debug("Waiting for postgres to bootup but it is not...") 328 | 329 | # Set up Boot if we're told to do so 330 | if self.boot_config_path is not None: 331 | with self.dbgym_workspace.open_and_save(self.boot_config_path) as f: 332 | boot_config = yaml.safe_load(f) 333 | 334 | self._set_up_boot( 335 | boot_config["intelligent_cache"], 336 | boot_config["early_stop"], 337 | boot_config["seq_sample"], 338 | boot_config["seq_sample_pct"], 339 | boot_config["seq_sample_seed"], 340 | boot_config["mu_hyp_opt"], 341 | boot_config["mu_hyp_time"], 342 | boot_config["mu_hyp_stdev"], 343 | ) 344 | 345 | # Move the temporary over since we now know the temporary can load. 346 | if save_checkpoint: 347 | shutil.move(f"{self.dbdata_path}.tgz.tmp", f"{self.dbdata_path}.tgz") 348 | 349 | return True 350 | 351 | def _set_up_boot( 352 | self, 353 | intelligent_cache: bool, 354 | early_stop: bool, 355 | seq_sample: bool, 356 | seq_sample_pct: int, 357 | seq_sample_seed: int, 358 | mu_hyp_opt: float, 359 | mu_hyp_time: int, 360 | mu_hyp_stdev: float, 361 | ) -> None: 362 | """ 363 | Sets up Boot on the currently running Postgres instances. 364 | Uses instance vars of PostgresConn for configuration. 365 | I chose to not encode any "default values" in this function. 
This is so that all values 366 | are explicitly included in the config file. This way, we can know what Boot config 367 | was used in a given experiment by looking only at the config file. If we did encode 368 | "default values" in the function, we would need to know the state of the code at the 369 | time of the experiment, which is very difficult in the general case. 370 | """ 371 | # If any of these commands fail, they'll throw a Python exception 372 | # Thus, if none of them throw an exception, we know they passed 373 | logging.debug("Setting up boot") 374 | self.conn().execute("DROP EXTENSION IF EXISTS boot") 375 | self.conn().execute("CREATE EXTENSION IF NOT EXISTS boot") 376 | self.conn().execute("SELECT boot_connect()") 377 | self.conn().execute("SELECT boot_cache_clear()") 378 | self.conn().execute("SET boot.enable=true") 379 | self.conn().execute("SET boot.intercept_explain_analyze=true") 380 | self.conn().execute(f"SET boot.intelligent_cache={intelligent_cache}") 381 | self.conn().execute(f"SET boot.early_stop={early_stop}") 382 | self.conn().execute(f"SET boot.seq_sample={seq_sample}") 383 | self.conn().execute(f"SET boot.seq_sample_pct={seq_sample_pct}") 384 | self.conn().execute(f"SET boot.seq_sample_seed={seq_sample_seed}") 385 | self.conn().execute(f"SET boot.mu_hyp_opt={mu_hyp_opt}") 386 | self.conn().execute(f"SET boot.mu_hyp_time={mu_hyp_time}") 387 | self.conn().execute(f"SET boot.mu_hyp_stdev={mu_hyp_stdev}") 388 | logging.debug("Set up boot") 389 | 390 | def psql(self, sql: str) -> tuple[int, Optional[str]]: 391 | """ 392 | Execute a SQL command (equivalent to psql -C "[cmd]") and return a status code and its stderr. 393 | 394 | This is meant for commands that modify the database, not those that get information from the database, which 395 | is why it doesn't return a Cursor with the result. I designed it this way because it's difficult to provide 396 | a general-purpose API which returns results for arbitrary SQL queries as those results could be very large. 397 | 398 | A return code of 0 means success while a non-zero return code means failure. The stderr will be None if success 399 | and a string if failure. 400 | """ 401 | 402 | def cancel_fn(conn_str: str) -> None: 403 | with psycopg.connect( 404 | conn_str, autocommit=True, prepare_threshold=None 405 | ) as tconn: 406 | r = [ 407 | r 408 | for r in tconn.execute( 409 | "SELECT pid FROM pg_stat_progress_create_index" 410 | ) 411 | ] 412 | 413 | for row in r: 414 | logging.info(f"Killing process {row[0]}") 415 | try: 416 | psutil.Process(row[0]).kill() 417 | except: 418 | pass 419 | 420 | # Get a fresh connection. 421 | self.disconnect() 422 | conn = self.conn() 423 | conn.execute("SET maintenance_work_mem = '4GB'") 424 | # TODO(wz2): Make this a configuration/runtime option for action timeout. 425 | conn.execute("SET statement_timeout = 300000") 426 | 427 | try: 428 | timer = threading.Timer(300.0, cancel_fn, args=(self.get_kv_connstr(),)) 429 | timer.start() 430 | 431 | conn.execute(sql) 432 | timer.cancel() 433 | except ProgramLimitExceeded as e: 434 | timer.cancel() 435 | self.disconnect() 436 | logging.debug(f"Action error: {e}") 437 | return -1, str(e) 438 | except QueryCanceled as e: 439 | timer.cancel() 440 | self.disconnect() 441 | logging.debug(f"Action error: {e}") 442 | return -1, f"canceling statement: {sql}." 443 | except psycopg.OperationalError as e: 444 | timer.cancel() 445 | self.disconnect() 446 | logging.debug(f"Action error: {e}") 447 | return -1, f"operational error: {sql}." 
448 | except psycopg.errors.UndefinedTable: 449 | timer.cancel() 450 | raise 451 | 452 | self.disconnect() 453 | return 0, None 454 | 455 | def get_system_knobs(self) -> dict[str, str]: 456 | """ 457 | System knobs are those applied across the entire system. They do not include table-specific 458 | knobs, query-specific knobs (aka query hints), or indexes. 459 | """ 460 | conn = self.conn() 461 | result = conn.execute("SHOW ALL").fetchall() 462 | knobs = {} 463 | for row in result: 464 | knobs[row[0]] = row[1] 465 | return knobs 466 | 467 | def restore_pristine_snapshot(self) -> bool: 468 | return self._restore_snapshot(self.pristine_dbdata_snapshot_path) 469 | 470 | def restore_checkpointed_snapshot(self) -> bool: 471 | return self._restore_snapshot(self.checkpoint_dbdata_snapshot_path) 472 | 473 | def _restore_snapshot( 474 | self, 475 | dbdata_snapshot_path: Path, 476 | ) -> bool: 477 | self.shutdown_postgres() 478 | 479 | local["rm"]["-rf", self.dbdata_path].run() 480 | local["mkdir"]["-m", "0700", "-p", self.dbdata_path].run() 481 | 482 | # Strip the "dbdata" so we can implant directly into the target dbdata_path. 483 | assert dbdata_snapshot_path.exists() 484 | local["tar"][ 485 | "xf", 486 | dbdata_snapshot_path, 487 | "-C", 488 | self.dbdata_path, 489 | "--strip-components", 490 | "1", 491 | ].run() 492 | # Imprint the required port. 493 | ( 494 | (local["echo"][f"port={self.pgport}"]) 495 | >> f"{self.dbdata_path}/postgresql.conf" 496 | )() 497 | 498 | return self.restart_postgres() 499 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/gymlib_package/gymlib/py.typed -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/gymlib_package/gymlib/tests/__init__.py -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DO NOT RUN THIS SCRIPT DIRECTLY. 4 | # This script only runs correctly when run by GymlibIntegtestManager.set_up_workspace() as it sets the necessary envvars. 5 | # By allowing GymlibIntegtestManager.set_up_workspace() to set the envvars, we ensure that the envvars are only defined 6 | # in a single location (inside GymlibIntegtestManager). 7 | 8 | # Gymlib integration tests relies on Postgres being built and workloads/dbdata being generated. 9 | # Generating these things is not considered a part of the test which is why it's in its own shell script. 10 | # The reason there's a shell script generating them instead of them just being in the repo is because (a) 11 | # the Postgres repo is very large and (b) the built binary will be different for different machines. 12 | # This script should be run from the base dbgym/ directory. 13 | 14 | set -euxo pipefail 15 | 16 | # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. 
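# The ${VAR:-hdd} expansion below keeps whatever value the caller exported and only falls back to hdd otherwise.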
17 | INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" 18 | 19 | python3 task.py benchmark $BENCHMARK tables $SCALE_FACTOR 20 | python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR 21 | 22 | python3 task.py dbms postgres build 23 | python3 task.py dbms postgres dbdata $BENCHMARK --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE 24 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/filesystem_unittest_util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from pathlib import Path 4 | from typing import Any, NewType, cast 5 | 6 | from gymlib.workspace import RUNS_DNAME, SYMLINKS_DNAME, TMP_DNAME 7 | 8 | FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) 9 | 10 | 11 | def create_structure(root_path: Path, structure: FilesystemStructure) -> None: 12 | """ 13 | Create files and directories according to the structure. 14 | """ 15 | 16 | def create_structure_internal( 17 | root_path: Path, cur_path: Path, structure: FilesystemStructure 18 | ) -> None: 19 | for path, content in structure.items(): 20 | full_path: Path = cur_path / path 21 | 22 | if isinstance(content, dict): # Directory 23 | full_path.mkdir(parents=True, exist_ok=True) 24 | create_structure_internal( 25 | root_path, 26 | full_path, 27 | FilesystemStructure(cast(dict[str, Any], content)), 28 | ) 29 | elif isinstance(content, tuple) and content[0] == "file": 30 | full_path.parent.mkdir(parents=True, exist_ok=True) 31 | if len(content) == 2: 32 | full_path.write_text(content[1]) 33 | else: 34 | assert len(content) == 1 35 | full_path.touch() 36 | elif isinstance(content, tuple) and content[0] == "symlink": 37 | assert len(content) == 2 38 | target_path = root_path / content[1] 39 | os.symlink(target_path, full_path) 40 | else: 41 | raise ValueError(f"Unsupported type for path ({path}): {content}") 42 | 43 | root_path.mkdir(parents=True, exist_ok=True) 44 | create_structure_internal(root_path, root_path, structure) 45 | 46 | 47 | def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: 48 | """ 49 | Verify that the files and directories match the expected structure. 
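    An illustrative structure, in the same shape exercised by unittest_filesystem_unittest_util.py below:

        FilesystemStructure({
            "dir1": {"file1.txt": ("file",)},
            "link_to_dir1": ("symlink", "dir1"),
        })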
50 | """ 51 | 52 | def verify_structure_internal( 53 | root_path: Path, cur_path: Path, structure: FilesystemStructure 54 | ) -> bool: 55 | # Check for the presence of each item specified in the structure 56 | for name, item in structure.items(): 57 | new_cur_path = cur_path / name 58 | if not path_exists_dont_follow_symlinks(new_cur_path): 59 | logging.debug(f"{new_cur_path} does not exist") 60 | return False 61 | elif isinstance(item, dict): 62 | if not new_cur_path.is_dir(): 63 | logging.debug(f"expected {new_cur_path} to be a directory") 64 | return False 65 | if not verify_structure_internal( 66 | root_path, 67 | new_cur_path, 68 | FilesystemStructure(cast(dict[str, Any], item)), 69 | ): 70 | return False 71 | elif isinstance(item, tuple) and item[0] == "file": 72 | if not new_cur_path.is_file(): 73 | logging.debug(f"expected {new_cur_path} to be a regular file") 74 | return False 75 | elif isinstance(item, tuple) and item[0] == "symlink": 76 | if not new_cur_path.is_symlink(): 77 | logging.debug(f"expected {new_cur_path} to be a symlink") 78 | return False 79 | # If item[1] is None, this indicates that we expect the symlink to be broken 80 | if item[1] != None: 81 | expected_target = root_path / item[1] 82 | if not new_cur_path.resolve().samefile(expected_target): 83 | logging.debug( 84 | f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" 85 | ) 86 | return False 87 | else: 88 | assert False, "structure misconfigured" 89 | 90 | # Check for any extra files or directories not described by the structure 91 | expected_names = set(structure.keys()) 92 | actual_names = {entry.name for entry in cur_path.iterdir()} 93 | if not expected_names.issuperset(actual_names): 94 | logging.debug( 95 | f"expected_names={expected_names}, actual_names={actual_names}" 96 | ) 97 | return False 98 | 99 | return True 100 | 101 | if not root_path.exists(): 102 | logging.debug(f"{root_path} does not exist") 103 | return False 104 | return verify_structure_internal(root_path, root_path, structure) 105 | 106 | 107 | def make_workspace_structure( 108 | symlinks_structure: FilesystemStructure, 109 | task_runs_structure: FilesystemStructure, 110 | ) -> FilesystemStructure: 111 | """ 112 | This function exists so that it's easier to refactor the tests in case we ever change 113 | how the workspace is organized. 114 | """ 115 | return FilesystemStructure( 116 | { 117 | "dbgym_workspace": { 118 | SYMLINKS_DNAME: symlinks_structure, 119 | RUNS_DNAME: task_runs_structure, 120 | TMP_DNAME: {}, 121 | } 122 | } 123 | ) 124 | 125 | 126 | def path_exists_dont_follow_symlinks(path: Path) -> bool: 127 | """ 128 | As of writing this comment, ray is currently constraining us to python <3.12. However, the "follow_symlinks" option in 129 | Path.exists() only comes up in python 3.12. Thus, this is the only way to check if a path exists without following symlinks. 
130 | """ 131 | # If the path exists and is a symlink, os.path.islink() will be true (even if the symlink is broken) 132 | if os.path.islink(path): 133 | return True 134 | # Otherwise, we know it's either non-existent or not a symlink, so path.exists() works fine 135 | else: 136 | return path.exists() 137 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_gymlib_integtest_workspace/ 2 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/gymlib_integtest_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | # TODO: remove infra_paths from the import 7 | from gymlib.infra_paths import ( 8 | get_dbdata_tgz_symlink_path, 9 | get_pgbin_symlink_path, 10 | get_workload_suffix, 11 | get_workload_symlink_path, 12 | ) 13 | from gymlib.tuning_artifacts import TuningMetadata 14 | from gymlib.workspace import ( 15 | fully_resolve_path, 16 | get_tmp_path_from_workspace_path, 17 | get_workspace_path_from_config, 18 | ) 19 | 20 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED 21 | 22 | 23 | class GymlibIntegtestManager: 24 | """ 25 | This is essentially a singleton class. This avoids multiple integtest_*.py files creating 26 | the workspace and/or the DBGymWorkspace object redundantly. 27 | 28 | The reason I put all these static methods in a class instead of directly in the module is 29 | that the functions have very generic names (e.g. set_up_workspace()) but having them 30 | inside a class makes it clear that they are related to the gymlib integration tests. 31 | """ 32 | 33 | # These constants are also used by _set_up_gymlib_integtest_workspace.sh. 34 | BENCHMARK = "tpch" 35 | SCALE_FACTOR = 0.01 36 | DBGYM_CONFIG_PATH = Path( 37 | "gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml" 38 | ) 39 | WORKSPACE_PATH: Optional[Path] = None 40 | 41 | @staticmethod 42 | def set_up_workspace() -> None: 43 | """ 44 | Set up the workspace if it has not already been set up. 45 | None of the integtest_*.py files will delete the workspace so that future tests run faster. 46 | """ 47 | GymlibIntegtestManager.WORKSPACE_PATH = get_workspace_path_from_config( 48 | GymlibIntegtestManager.DBGYM_CONFIG_PATH 49 | ) 50 | 51 | # This if statement prevents us from setting up the workspace twice, which saves time. 52 | if not GymlibIntegtestManager.WORKSPACE_PATH.exists(): 53 | subprocess.run( 54 | ["./gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh"], 55 | env={ 56 | "BENCHMARK": GymlibIntegtestManager.BENCHMARK, 57 | "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), 58 | # By setting this envvar, we ensure that when running _set_up_gymlib_integtest_workspace.sh, 59 | # make_standard_dbgym_workspace() will use the correct DBGYM_CONFIG_PATH. 
60 | "DBGYM_CONFIG_PATH": str(GymlibIntegtestManager.DBGYM_CONFIG_PATH), 61 | **os.environ, 62 | }, 63 | check=True, 64 | ) 65 | 66 | @staticmethod 67 | def get_workspace_path() -> Path: 68 | assert GymlibIntegtestManager.WORKSPACE_PATH is not None 69 | return GymlibIntegtestManager.WORKSPACE_PATH 70 | 71 | @staticmethod 72 | def get_default_metadata() -> TuningMetadata: 73 | assert GymlibIntegtestManager.BENCHMARK == "tpch" 74 | suffix = get_workload_suffix( 75 | GymlibIntegtestManager.BENCHMARK, 76 | seed_start=DEFAULT_TPCH_SEED, 77 | seed_end=DEFAULT_TPCH_SEED, 78 | query_subset="all", 79 | ) 80 | return TuningMetadata( 81 | workload_path=fully_resolve_path( 82 | get_workload_symlink_path( 83 | GymlibIntegtestManager.get_workspace_path(), 84 | GymlibIntegtestManager.BENCHMARK, 85 | GymlibIntegtestManager.SCALE_FACTOR, 86 | suffix, 87 | ), 88 | ), 89 | pristine_dbdata_snapshot_path=fully_resolve_path( 90 | get_dbdata_tgz_symlink_path( 91 | GymlibIntegtestManager.get_workspace_path(), 92 | GymlibIntegtestManager.BENCHMARK, 93 | GymlibIntegtestManager.SCALE_FACTOR, 94 | ), 95 | ), 96 | dbdata_parent_path=fully_resolve_path( 97 | get_tmp_path_from_workspace_path( 98 | GymlibIntegtestManager.get_workspace_path() 99 | ), 100 | ), 101 | pgbin_path=fully_resolve_path( 102 | get_pgbin_symlink_path(GymlibIntegtestManager.get_workspace_path()), 103 | ), 104 | ) 105 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/integtest_pg_conn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import unittest 3 | 4 | import psycopg 5 | from gymlib.pg import ( 6 | DEFAULT_POSTGRES_PORT, 7 | get_is_postgres_running, 8 | get_running_postgres_ports, 9 | ) 10 | from gymlib.pg_conn import PostgresConn 11 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 12 | from gymlib.workspace import DBGymWorkspace 13 | 14 | 15 | class PostgresConnTests(unittest.TestCase): 16 | workspace: DBGymWorkspace 17 | 18 | @staticmethod 19 | def setUpClass() -> None: 20 | GymlibIntegtestManager.set_up_workspace() 21 | # Reset _num_times_created_this_run since previous tests may have created a workspace. 22 | DBGymWorkspace._num_times_created_this_run = 0 23 | PostgresConnTests.workspace = DBGymWorkspace( 24 | GymlibIntegtestManager.get_workspace_path() 25 | ) 26 | 27 | def setUp(self) -> None: 28 | self.assertFalse( 29 | get_is_postgres_running(), 30 | "Make sure Postgres isn't running before starting the integration test. `pkill postgres` is one way " 31 | + "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.", 32 | ) 33 | self.metadata = GymlibIntegtestManager.get_default_metadata() 34 | 35 | # The reason we restart Postgres every time is to ensure a "clean" starting point 36 | # so that all tests are independent of each other. 
37 | self.pg_conn = self.create_pg_conn() 38 | self.pg_conn.restore_pristine_snapshot() 39 | self.pg_conn.restart_postgres() 40 | self.assertTrue(get_is_postgres_running()) 41 | 42 | def tearDown(self) -> None: 43 | self.pg_conn.shutdown_postgres() 44 | self.assertFalse(get_is_postgres_running()) 45 | 46 | def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: 47 | return PostgresConn( 48 | PostgresConnTests.workspace, 49 | pgport, 50 | self.metadata.pristine_dbdata_snapshot_path, 51 | self.metadata.dbdata_parent_path, 52 | self.metadata.pgbin_path, 53 | None, 54 | ) 55 | 56 | def test_start_on_multiple_ports(self) -> None: 57 | # The setUp() function should have started Postgres on DEFAULT_POSTGRES_PORT. 58 | self.assertEqual(set(get_running_postgres_ports()), {DEFAULT_POSTGRES_PORT}) 59 | 60 | # Now, we start Postgres on a new port. 61 | pg_conn1 = self.create_pg_conn(DEFAULT_POSTGRES_PORT + 1) 62 | pg_conn1.restore_pristine_snapshot() 63 | pg_conn1.restart_postgres() 64 | self.assertEqual( 65 | set(get_running_postgres_ports()), 66 | {DEFAULT_POSTGRES_PORT, DEFAULT_POSTGRES_PORT + 1}, 67 | ) 68 | 69 | # Clean up 70 | pg_conn1.shutdown_postgres() 71 | 72 | def test_connect_and_disconnect(self) -> None: 73 | self.assertIsNone(self.pg_conn._conn) 74 | conn = self.pg_conn.conn() 75 | self.assertIsNotNone(conn) 76 | self.assertIs( 77 | conn, self.pg_conn._conn 78 | ) # The conn should be cached so these objects should be the same 79 | self.assertIs(conn, self.pg_conn.conn()) # Same thing here 80 | self.pg_conn.disconnect() 81 | self.assertIsNone(self.pg_conn._conn) 82 | 83 | def test_start_with_changes(self) -> None: 84 | initial_sysknobs = self.pg_conn.get_system_knobs() 85 | 86 | # First call 87 | self.assertEqual(initial_sysknobs["wal_buffers"], "4MB") 88 | self.pg_conn.restart_with_changes({"wal_buffers": "8MB"}) 89 | new_sysknobs = self.pg_conn.get_system_knobs() 90 | self.assertEqual(new_sysknobs["wal_buffers"], "8MB") 91 | 92 | # Second call 93 | self.assertEqual(initial_sysknobs["enable_nestloop"], "on") 94 | self.pg_conn.restart_with_changes({"enable_nestloop": "off"}) 95 | new_sysknobs = self.pg_conn.get_system_knobs() 96 | self.assertEqual(new_sysknobs["enable_nestloop"], "off") 97 | # The changes should not be additive. The "wal_buffers" should have "reset" to 4MB. 98 | self.assertEqual(new_sysknobs["wal_buffers"], "4MB") 99 | 100 | def test_start_with_changes_doesnt_modify_input(self) -> None: 101 | conf_changes = {"wal_buffers": "8MB"} 102 | orig_conf_changes = copy.deepcopy(conf_changes) 103 | self.pg_conn.restart_with_changes(conf_changes) 104 | self.assertEqual(conf_changes, orig_conf_changes) 105 | 106 | def test_time_query(self) -> None: 107 | runtime, did_time_out, explain_data = self.pg_conn.time_query( 108 | "select pg_sleep(1)" 109 | ) 110 | # The runtime should be about 1 second. 111 | self.assertTrue(abs(runtime - 1_000_000) < 100_000) 112 | self.assertFalse(did_time_out) 113 | self.assertIsNone(explain_data) 114 | 115 | def test_time_query_with_explain(self) -> None: 116 | _, _, explain_data = self.pg_conn.time_query("select 1", add_explain=True) 117 | self.assertIsNotNone(explain_data) 118 | 119 | def test_time_query_with_timeout(self) -> None: 120 | runtime, did_time_out, _ = self.pg_conn.time_query( 121 | "select pg_sleep(3)", timeout=2 122 | ) 123 | # The runtime should be about what the timeout is. 
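        # (time_query() reports runtimes in microseconds, so 2_000_000 below corresponds to the 2-second timeout.)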
124 | self.assertTrue(abs(runtime - 2_000_000) < 100_000) 125 | self.assertTrue(did_time_out) 126 | 127 | def test_time_query_with_valid_table(self) -> None: 128 | # This just ensures that it doesn't raise any errors. 129 | self.pg_conn.time_query("select * from lineitem limit 10") 130 | 131 | def test_time_query_with_invalid_table(self) -> None: 132 | with self.assertRaises(psycopg.errors.UndefinedTable): 133 | self.pg_conn.time_query("select * from itemline limit 10") 134 | 135 | def test_time_query_with_valid_hints(self) -> None: 136 | join_query = """SELECT * 137 | FROM orders 138 | JOIN lineitem ON o_orderkey = l_orderkey 139 | WHERE o_orderdate BETWEEN '1995-01-01' AND '1995-12-31' 140 | LIMIT 10""" 141 | join_types = [ 142 | ("MergeJoin", "Merge Join"), 143 | ("HashJoin", "Hash Join"), 144 | ("NestLoop", "Nested Loop"), 145 | ] 146 | 147 | for hint_join_type, expected_join_type in join_types: 148 | _, _, explain_data = self.pg_conn.time_query( 149 | join_query, 150 | query_knobs=[f"{hint_join_type}(lineitem orders)"], 151 | add_explain=True, 152 | ) 153 | assert explain_data is not None # This assertion is for mypy. 154 | actual_join_type = explain_data["Plan"]["Plans"][0]["Node Type"] 155 | self.assertEqual(expected_join_type, actual_join_type) 156 | 157 | def test_time_query_with_invalid_hint(self) -> None: 158 | with self.assertRaises(RuntimeError) as context: 159 | self.pg_conn.time_query("select 1", query_knobs=["dbgym"]) 160 | self.assertTrue( 161 | 'Unrecognized hint keyword "dbgym"' in str(context.exception) 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | unittest.main() 167 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/integtest_tuning_artifacts.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 4 | from gymlib.tuning_artifacts import ( 5 | DBMSConfigDelta, 6 | IndexesDelta, 7 | QueryKnobsDelta, 8 | SysKnobsDelta, 9 | TuningArtifactsReader, 10 | TuningArtifactsWriter, 11 | ) 12 | from gymlib.workspace import DBGymWorkspace 13 | 14 | 15 | class PostgresConnTests(unittest.TestCase): 16 | @staticmethod 17 | def setUpClass() -> None: 18 | GymlibIntegtestManager.set_up_workspace() 19 | 20 | def setUp(self) -> None: 21 | # We re-create a workspace for each test because each test will create its own TuningArtifactsWriter. 
22 | DBGymWorkspace._num_times_created_this_run = 0 23 | self.workspace = DBGymWorkspace(GymlibIntegtestManager.get_workspace_path()) 24 | 25 | @staticmethod 26 | def make_config(letter: str) -> DBMSConfigDelta: 27 | return DBMSConfigDelta( 28 | indexes=IndexesDelta([letter]), 29 | sysknobs=SysKnobsDelta({letter: letter}), 30 | qknobs=QueryKnobsDelta({letter: [letter]}), 31 | ) 32 | 33 | def test_get_delta_at_step(self) -> None: 34 | writer = TuningArtifactsWriter( 35 | self.workspace, 36 | GymlibIntegtestManager.get_default_metadata(), 37 | ) 38 | 39 | writer.write_step(PostgresConnTests.make_config("a")) 40 | writer.write_step(PostgresConnTests.make_config("b")) 41 | writer.write_step(PostgresConnTests.make_config("c")) 42 | 43 | reader = TuningArtifactsReader(writer.tuning_artifacts_path) 44 | 45 | self.assertEqual( 46 | reader.get_delta_at_step(1), PostgresConnTests.make_config("b") 47 | ) 48 | self.assertEqual( 49 | reader.get_delta_at_step(0), PostgresConnTests.make_config("a") 50 | ) 51 | self.assertEqual( 52 | reader.get_delta_at_step(1), PostgresConnTests.make_config("b") 53 | ) 54 | self.assertEqual( 55 | reader.get_delta_at_step(2), PostgresConnTests.make_config("c") 56 | ) 57 | 58 | def test_get_all_deltas_in_order(self) -> None: 59 | writer = TuningArtifactsWriter( 60 | self.workspace, 61 | GymlibIntegtestManager.get_default_metadata(), 62 | ) 63 | 64 | writer.write_step(PostgresConnTests.make_config("a")) 65 | writer.write_step(PostgresConnTests.make_config("b")) 66 | writer.write_step(PostgresConnTests.make_config("c")) 67 | 68 | reader = TuningArtifactsReader(writer.tuning_artifacts_path) 69 | 70 | self.assertEqual( 71 | reader.get_all_deltas_in_order(), 72 | [ 73 | PostgresConnTests.make_config("a"), 74 | PostgresConnTests.make_config("b"), 75 | PostgresConnTests.make_config("c"), 76 | ], 77 | ) 78 | 79 | def test_get_metadata(self) -> None: 80 | writer = TuningArtifactsWriter( 81 | self.workspace, 82 | GymlibIntegtestManager.get_default_metadata(), 83 | ) 84 | reader = TuningArtifactsReader(writer.tuning_artifacts_path) 85 | metadata = reader.get_metadata() 86 | expected_metadata = GymlibIntegtestManager.get_default_metadata() 87 | self.assertEqual(metadata, expected_metadata) 88 | 89 | 90 | if __name__ == "__main__": 91 | unittest.main() 92 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/integtest_workload.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 4 | from gymlib.workload import Workload 5 | from gymlib.workspace import DBGymWorkspace 6 | 7 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES 8 | 9 | 10 | class WorkloadTests(unittest.TestCase): 11 | workspace: DBGymWorkspace 12 | 13 | @staticmethod 14 | def setUpClass() -> None: 15 | GymlibIntegtestManager.set_up_workspace() 16 | # Reset _num_times_created_this_run since previous tests may have created a workspace. 17 | DBGymWorkspace._num_times_created_this_run = 0 18 | WorkloadTests.workspace = DBGymWorkspace( 19 | GymlibIntegtestManager.get_workspace_path() 20 | ) 21 | 22 | def test_workload(self) -> None: 23 | workload_path = GymlibIntegtestManager.get_default_metadata().workload_path 24 | workload = Workload(WorkloadTests.workspace, workload_path) 25 | 26 | # Check the order of query IDs. 
27 | self.assertEqual( 28 | workload.get_query_order(), 29 | [f"S{DEFAULT_TPCH_SEED}-Q{i}" for i in range(1, NUM_TPCH_QUERIES + 1)], 30 | ) 31 | 32 | # Sanity check all queries. 33 | for query in workload.get_queries_in_order(): 34 | self.assertTrue("select" in query.lower()) 35 | 36 | 37 | if __name__ == "__main__": 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/unittest_filesystem_unittest_util.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | from gymlib.tests.filesystem_unittest_util import ( 7 | FilesystemStructure, 8 | create_structure, 9 | verify_structure, 10 | ) 11 | 12 | 13 | class FilesystemUnittestUtilTests(unittest.TestCase): 14 | scratchspace_path: Path = Path() 15 | 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | cls.scratchspace_path = ( 19 | Path.cwd() / "util/tests/test_filesystem_unittest_util_scratchspace/" 20 | ) 21 | 22 | def setUp(self) -> None: 23 | if self.scratchspace_path.exists(): 24 | shutil.rmtree(self.scratchspace_path) 25 | 26 | def tearDown(self) -> None: 27 | if self.scratchspace_path.exists(): 28 | shutil.rmtree(self.scratchspace_path) 29 | 30 | def test_filesystem_unittest_util(self) -> None: 31 | structure = FilesystemStructure( 32 | { 33 | "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, 34 | "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, 35 | "link_to_dir1": ("symlink", "dir1"), 36 | "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), 37 | } 38 | ) 39 | create_structure(self.scratchspace_path, structure) 40 | self.assertTrue(verify_structure(self.scratchspace_path, structure)) 41 | 42 | extra_dir_structure = copy.deepcopy(structure) 43 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 44 | self.assertTrue(verify_structure(self.scratchspace_path, extra_dir_structure)) 45 | extra_dir_structure["dir4"] = {} 46 | self.assertFalse(verify_structure(self.scratchspace_path, extra_dir_structure)) 47 | 48 | missing_dir_structure = copy.deepcopy(structure) 49 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 50 | self.assertTrue(verify_structure(self.scratchspace_path, missing_dir_structure)) 51 | del missing_dir_structure["dir1"] 52 | self.assertFalse( 53 | verify_structure(self.scratchspace_path, missing_dir_structure) 54 | ) 55 | 56 | extra_file_structure = copy.deepcopy(structure) 57 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 58 | self.assertTrue(verify_structure(self.scratchspace_path, extra_file_structure)) 59 | extra_file_structure["file3.txt"] = ("file",) 60 | self.assertFalse(verify_structure(self.scratchspace_path, extra_file_structure)) 61 | 62 | missing_file_structure = copy.deepcopy(structure) 63 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 64 | self.assertTrue( 65 | verify_structure(self.scratchspace_path, missing_file_structure) 66 | ) 67 | del missing_file_structure["dir1"]["file1.txt"] 68 | self.assertFalse( 69 | verify_structure(self.scratchspace_path, missing_file_structure) 70 | ) 71 | 72 | extra_link_structure = copy.deepcopy(structure) 73 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 74 | 
self.assertTrue(verify_structure(self.scratchspace_path, extra_link_structure)) 75 | extra_link_structure["link_to_dir3"] = ("symlink", "dir3") 76 | self.assertFalse(verify_structure(self.scratchspace_path, extra_link_structure)) 77 | 78 | missing_link_structure = copy.deepcopy(structure) 79 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 80 | self.assertTrue( 81 | verify_structure(self.scratchspace_path, missing_link_structure) 82 | ) 83 | del missing_link_structure["link_to_dir1"] 84 | self.assertFalse( 85 | verify_structure(self.scratchspace_path, missing_link_structure) 86 | ) 87 | 88 | wrong_link_structure = copy.deepcopy(structure) 89 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 90 | self.assertTrue(verify_structure(self.scratchspace_path, wrong_link_structure)) 91 | wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") 92 | self.assertFalse(verify_structure(self.scratchspace_path, wrong_link_structure)) 93 | 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/unittest_workspace.py: -------------------------------------------------------------------------------- 1 | # TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. 2 | 3 | import os 4 | import shutil 5 | import unittest 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | from gymlib.tests.filesystem_unittest_util import ( 10 | FilesystemStructure, 11 | create_structure, 12 | make_workspace_structure, 13 | verify_structure, 14 | ) 15 | from gymlib.workspace import ( 16 | DBGYM_APP_NAME, 17 | RUNS_DNAME, 18 | SYMLINKS_DNAME, 19 | DBGymWorkspace, 20 | name_to_linkname, 21 | ) 22 | 23 | from gymlib_package.gymlib.workspace import LATEST_RUN_FNAME 24 | 25 | 26 | class WorkspaceTests(unittest.TestCase): 27 | scratchspace_path: Path = Path() 28 | workspace_path: Path = Path() 29 | 30 | @classmethod 31 | def setUpClass(cls) -> None: 32 | cls.scratchspace_path = Path.cwd() / "util/tests/test_workspace_scratchspace/" 33 | cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" 34 | 35 | def setUp(self) -> None: 36 | if self.scratchspace_path.exists(): 37 | shutil.rmtree(self.scratchspace_path) 38 | 39 | self.workspace: Optional[DBGymWorkspace] = None 40 | self.expected_structure: Optional[FilesystemStructure] = None 41 | 42 | def tearDown(self) -> None: 43 | # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). 44 | if self.scratchspace_path.exists(): 45 | shutil.rmtree(self.scratchspace_path) 46 | 47 | # All these helper functions will perform an action, update the expected structure, and then verify the structure. 48 | # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). 49 | def init_workspace_helper(self) -> None: 50 | # Reset this to avoid the error of it being created twice. 51 | # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 
52 | DBGymWorkspace._num_times_created_this_run = 0 53 | self.workspace = DBGymWorkspace(self.workspace_path) 54 | 55 | if self.expected_structure is None: 56 | self.expected_structure = make_workspace_structure( 57 | FilesystemStructure({}), 58 | FilesystemStructure( 59 | { 60 | "latest_run.link": ( 61 | "symlink", 62 | f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", 63 | ), 64 | self.workspace.dbgym_this_run_path.name: {}, 65 | } 66 | ), 67 | ) 68 | else: 69 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 70 | self.workspace.dbgym_this_run_path.name 71 | ] = {} 72 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 73 | name_to_linkname(LATEST_RUN_FNAME) 74 | ] = ( 75 | "symlink", 76 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}", 77 | ) 78 | 79 | self.assertTrue( 80 | verify_structure(self.scratchspace_path, self.expected_structure) 81 | ) 82 | 83 | def make_file_helper( 84 | self, relative_path: str, file_obj: tuple[str, ...] = ("file",) 85 | ) -> Path: 86 | """ 87 | You can override file_obj to make it a symlink instead. 88 | """ 89 | assert self.workspace is not None and self.expected_structure is not None 90 | assert ( 91 | ".." not in relative_path 92 | ), 'relative_path should not contain ".." (it should be inside the scratchspace dir)' 93 | file_path = self.scratchspace_path / relative_path 94 | file_path.parent.mkdir(parents=True, exist_ok=True) 95 | 96 | if file_obj[0] == "file": 97 | assert len(file_obj) in [1, 2] 98 | file_path.touch() 99 | elif file_obj[0] == "symlink": 100 | assert len(file_obj) == 2 101 | target_path = self.scratchspace_path / file_obj[1] 102 | os.symlink(target_path, file_path) 103 | else: 104 | assert False, f"Unsupported file_obj: {file_obj}" 105 | 106 | # Build up the nested dict structure for the expected path 107 | current_dict = self.expected_structure 108 | path_parts = relative_path.split("/") 109 | for part in path_parts[:-1]: 110 | if part not in current_dict: 111 | current_dict[part] = {} 112 | current_dict = current_dict[part] 113 | current_dict[path_parts[-1]] = file_obj 114 | 115 | self.assertTrue( 116 | verify_structure(self.scratchspace_path, self.expected_structure) 117 | ) 118 | return file_path 119 | 120 | def make_result_helper( 121 | self, relative_path: str = "result.txt", file_obj: tuple[str, ...] = ("file",) 122 | ) -> Path: 123 | assert self.workspace is not None and self.expected_structure is not None 124 | assert ( 125 | ".." not in relative_path 126 | ), 'relative_path should not contain ".." 
(it should be inside the run_*/ dir)' 127 | return self.make_file_helper( 128 | f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{relative_path}", 129 | file_obj=file_obj, 130 | ) 131 | 132 | def test_init_fields(self) -> None: 133 | workspace = DBGymWorkspace(self.workspace_path) 134 | self.assertEqual(workspace.app_name, DBGYM_APP_NAME) 135 | 136 | def test_init_from_nonexistent_workspace(self) -> None: 137 | self.init_workspace_helper() 138 | 139 | def test_init_from_empty_workspace(self) -> None: 140 | starting_structure = FilesystemStructure({"dbgym_workspace": {}}) 141 | create_structure(self.scratchspace_path, starting_structure) 142 | self.init_workspace_helper() 143 | 144 | def test_init_from_already_initialized_workspace(self) -> None: 145 | self.init_workspace_helper() 146 | self.init_workspace_helper() 147 | 148 | def test_link_result_basic_functionality(self) -> None: 149 | self.init_workspace_helper() 150 | assert self.workspace is not None and self.expected_structure is not None 151 | result_path = self.make_result_helper() 152 | self.workspace.link_result(result_path) 153 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 154 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 155 | name_to_linkname(result_path.name) 156 | ] = ( 157 | "symlink", 158 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 159 | ) 160 | self.assertTrue( 161 | verify_structure(self.scratchspace_path, self.expected_structure) 162 | ) 163 | 164 | def test_link_result_does_not_copy_directory_structure_to_symlinks_dir( 165 | self, 166 | ) -> None: 167 | """ 168 | We always just want link_result to link to the base symlinks dir. 169 | """ 170 | self.init_workspace_helper() 171 | assert self.workspace is not None and self.expected_structure is not None 172 | result_path = self.make_result_helper(relative_path="dir1/dir2/dir3/result.txt") 173 | self.workspace.link_result(result_path) 174 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 175 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 176 | name_to_linkname(result_path.name) 177 | ] = ( 178 | "symlink", 179 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", 180 | ) 181 | self.assertTrue( 182 | verify_structure(self.scratchspace_path, self.expected_structure) 183 | ) 184 | 185 | def test_link_result_invalid_custom_link_name(self) -> None: 186 | self.init_workspace_helper() 187 | assert self.workspace is not None and self.expected_structure is not None 188 | result_path = self.make_result_helper() 189 | with self.assertRaisesRegex( 190 | AssertionError, 'link_name \\(custom\\) should end with "\\.link"' 191 | ): 192 | self.workspace.link_result(result_path, custom_link_name=f"custom") 193 | 194 | def test_link_result_valid_custom_link_name(self) -> None: 195 | self.init_workspace_helper() 196 | assert self.workspace is not None and self.expected_structure is not None 197 | result_path = self.make_result_helper() 198 | self.workspace.link_result( 199 | result_path, custom_link_name=name_to_linkname("custom") 200 | ) 201 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 202 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 203 | name_to_linkname("custom") 204 | ] = ( 205 | "symlink", 206 | 
f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 207 | ) 208 | self.assertTrue( 209 | verify_structure(self.scratchspace_path, self.expected_structure) 210 | ) 211 | 212 | def test_link_same_result_twice_with_same_link_name(self) -> None: 213 | self.init_workspace_helper() 214 | assert self.workspace is not None and self.expected_structure is not None 215 | result_path = self.make_result_helper() 216 | self.workspace.link_result(result_path) 217 | self.workspace.link_result(result_path) 218 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 219 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 220 | name_to_linkname(result_path.name) 221 | ] = ( 222 | "symlink", 223 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 224 | ) 225 | self.assertTrue( 226 | verify_structure(self.scratchspace_path, self.expected_structure) 227 | ) 228 | 229 | def test_link_same_result_with_different_name(self) -> None: 230 | self.init_workspace_helper() 231 | assert self.workspace is not None and self.expected_structure is not None 232 | result_path = self.make_result_helper() 233 | self.workspace.link_result(result_path) 234 | self.workspace.link_result( 235 | result_path, custom_link_name=name_to_linkname("custom") 236 | ) 237 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 238 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 239 | name_to_linkname(result_path.name) 240 | ] = ( 241 | "symlink", 242 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 243 | ) 244 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 245 | name_to_linkname("custom") 246 | ] = ( 247 | "symlink", 248 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 249 | ) 250 | self.assertTrue( 251 | verify_structure(self.scratchspace_path, self.expected_structure) 252 | ) 253 | 254 | def test_link_result_from_another_run_raises_error(self) -> None: 255 | self.init_workspace_helper() 256 | result_path = self.make_result_helper() 257 | self.init_workspace_helper() 258 | assert self.workspace is not None and self.expected_structure is not None 259 | with self.assertRaisesRegex( 260 | AssertionError, 261 | "The result must have been generated in \*this\* run\_\*/ dir", 262 | ): 263 | self.workspace.link_result(result_path) 264 | 265 | def test_link_result_from_external_dir_raises_error(self) -> None: 266 | self.init_workspace_helper() 267 | assert self.workspace is not None and self.expected_structure is not None 268 | result_path = self.make_file_helper("external/result.txt") 269 | with self.assertRaisesRegex( 270 | AssertionError, 271 | "The result must have been generated in \*this\* run\_\*/ dir", 272 | ): 273 | self.workspace.link_result(result_path) 274 | 275 | def test_link_result_cannot_link_symlink(self) -> None: 276 | self.init_workspace_helper() 277 | assert self.workspace is not None and self.expected_structure is not None 278 | result_path = self.make_result_helper() 279 | symlink_path = self.make_result_helper( 280 | name_to_linkname("symlink"), 281 | file_obj=( 282 | "symlink", 283 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 284 | ), 285 | ) 286 | with self.assertRaisesRegex( 287 | AssertionError, 288 | "result_path \(.*\) should be a fully resolved path", 289 | 
): 290 | self.workspace.link_result(symlink_path) 291 | 292 | def test_save_file_dependency(self) -> None: 293 | """ 294 | See the comments in save_file() for what a "dependency" is. 295 | """ 296 | self.init_workspace_helper() 297 | assert self.workspace is not None and self.expected_structure is not None 298 | prev_run_name = self.workspace.dbgym_this_run_path.name 299 | result_path = self.make_result_helper() 300 | self.init_workspace_helper() 301 | self.workspace.save_file(result_path) 302 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 303 | self.workspace.dbgym_this_run_path.name 304 | ][name_to_linkname(result_path.name)] = ( 305 | "symlink", 306 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{result_path.name}", 307 | ) 308 | self.assertTrue( 309 | verify_structure(self.scratchspace_path, self.expected_structure) 310 | ) 311 | 312 | def test_save_file_same_dependency_twice(self) -> None: 313 | self.init_workspace_helper() 314 | assert self.workspace is not None and self.expected_structure is not None 315 | prev_run_name = self.workspace.dbgym_this_run_path.name 316 | result_path = self.make_result_helper(file_obj=("file",)) 317 | self.init_workspace_helper() 318 | self.workspace.save_file(result_path) 319 | self.workspace.save_file(result_path) 320 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 321 | self.workspace.dbgym_this_run_path.name 322 | ][name_to_linkname(result_path.name)] = ( 323 | "symlink", 324 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{result_path.name}", 325 | ) 326 | self.assertTrue( 327 | verify_structure(self.scratchspace_path, self.expected_structure) 328 | ) 329 | 330 | def test_save_file_two_different_dependencies_with_same_filename_both_directly_inside_run( 331 | self, 332 | ) -> None: 333 | self.init_workspace_helper() 334 | assert self.workspace is not None and self.expected_structure is not None 335 | prev_run_names = [] 336 | prev_run_names.append(self.workspace.dbgym_this_run_path.name) 337 | result1_path = self.make_result_helper(file_obj=("file",)) 338 | self.init_workspace_helper() 339 | prev_run_names.append(self.workspace.dbgym_this_run_path.name) 340 | result2_path = self.make_result_helper(file_obj=("file",)) 341 | filename = result1_path.name 342 | assert filename == result2_path.name 343 | 344 | self.init_workspace_helper() 345 | self.workspace.save_file(result1_path) 346 | self.workspace.save_file(result2_path) 347 | # The second save_file() should have overwritten the first one. 
348 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 349 | self.workspace.dbgym_this_run_path.name 350 | ][name_to_linkname(filename)] = ( 351 | "symlink", 352 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_names[-1]}/{filename}", 353 | ) 354 | self.assertTrue( 355 | verify_structure(self.scratchspace_path, self.expected_structure) 356 | ) 357 | 358 | def test_save_file_two_different_dependencies_with_same_filename_but_different_outermost_dirs( 359 | self, 360 | ) -> None: 361 | self.init_workspace_helper() 362 | assert self.workspace is not None and self.expected_structure is not None 363 | prev_run_name = self.workspace.dbgym_this_run_path.name 364 | result1_path = self.make_result_helper("dir1/result.txt", file_obj=("file",)) 365 | result2_path = self.make_result_helper("result.txt", file_obj=("file",)) 366 | filename = result1_path.name 367 | assert filename == result2_path.name 368 | 369 | self.init_workspace_helper() 370 | self.workspace.save_file(result1_path) 371 | self.workspace.save_file(result2_path) 372 | # The second save_file() should not overwrite the first one because the outermost dirs are different. 373 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 374 | self.workspace.dbgym_this_run_path.name 375 | ][name_to_linkname(filename)] = ( 376 | "symlink", 377 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{filename}", 378 | ) 379 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 380 | self.workspace.dbgym_this_run_path.name 381 | ][name_to_linkname("dir1")] = ( 382 | "symlink", 383 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/dir1", 384 | ) 385 | self.assertTrue( 386 | verify_structure(self.scratchspace_path, self.expected_structure) 387 | ) 388 | 389 | def test_save_file_config(self) -> None: 390 | """ 391 | See the comments in save_file() for what a "config" is. 
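        In short, a "config" is a file that was not generated by any run, so save_file() copies it
        into this run's directory instead of symlinking it.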
392 | """ 393 | self.init_workspace_helper() 394 | assert self.workspace is not None and self.expected_structure is not None 395 | result_path = self.make_file_helper( 396 | "external/result.txt", file_obj=("file", "contents") 397 | ) 398 | self.workspace.save_file(result_path) 399 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 400 | self.workspace.dbgym_this_run_path.name 401 | ][f"{result_path.name}"] = ("file", "contents") 402 | self.assertTrue( 403 | verify_structure(self.scratchspace_path, self.expected_structure) 404 | ) 405 | 406 | def test_save_file_same_config_twice(self) -> None: 407 | self.init_workspace_helper() 408 | assert self.workspace is not None and self.expected_structure is not None 409 | result_path = self.make_file_helper( 410 | "external/result.txt", file_obj=("file", "contents") 411 | ) 412 | self.workspace.save_file(result_path) 413 | self.workspace.save_file(result_path) 414 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 415 | self.workspace.dbgym_this_run_path.name 416 | ][f"{result_path.name}"] = ("file", "contents") 417 | self.assertTrue( 418 | verify_structure(self.scratchspace_path, self.expected_structure) 419 | ) 420 | 421 | def test_save_file_two_different_configs_with_same_filename(self) -> None: 422 | self.init_workspace_helper() 423 | assert self.workspace is not None and self.expected_structure is not None 424 | result1_path = self.make_file_helper( 425 | "external/result.txt", file_obj=("file", "contents1") 426 | ) 427 | result2_path = self.make_file_helper( 428 | "external/dir1/result.txt", file_obj=("file", "contents2") 429 | ) 430 | filename = result1_path.name 431 | assert filename == result2_path.name 432 | 433 | self.workspace.save_file(result1_path) 434 | self.workspace.save_file(result2_path) 435 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 436 | self.workspace.dbgym_this_run_path.name 437 | ][f"{filename}"] = ("file", "contents2") 438 | self.assertTrue( 439 | verify_structure(self.scratchspace_path, self.expected_structure) 440 | ) 441 | 442 | def test_save_file_dependency_inside_directory(self) -> None: 443 | self.init_workspace_helper() 444 | assert self.workspace is not None and self.expected_structure is not None 445 | prev_run_name = self.workspace.dbgym_this_run_path.name 446 | result_path = self.make_result_helper("dir1/dir2/result.txt") 447 | self.make_result_helper("dir1/other1.txt") 448 | self.make_result_helper("dir1/dir3/other2.txt") 449 | self.init_workspace_helper() 450 | self.workspace.save_file(result_path) 451 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 452 | self.workspace.dbgym_this_run_path.name 453 | ][name_to_linkname("dir1")] = ( 454 | "symlink", 455 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/dir1", 456 | ) 457 | self.assertTrue( 458 | verify_structure(self.scratchspace_path, self.expected_structure) 459 | ) 460 | 461 | def test_save_file_generated_this_run_raises_error(self) -> None: 462 | self.init_workspace_helper() 463 | assert self.workspace is not None and self.expected_structure is not None 464 | result_path = self.make_result_helper() 465 | with self.assertRaisesRegex( 466 | AssertionError, 467 | "path \(.*\) was generated in this task run \(.*\)\. 
You do not need to save it", 468 | ): 469 | self.workspace.save_file(result_path) 470 | 471 | 472 | if __name__ == "__main__": 473 | unittest.main() 474 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tuning_artifacts.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import asdict, dataclass 3 | from pathlib import Path 4 | from typing import Any, NewType 5 | 6 | from gymlib.workspace import DBGymWorkspace, is_fully_resolved 7 | 8 | # PostgresConn doesn't use these types because PostgresConn is used internally by tuning agents 9 | # while these types are only used in the interface between the orchestrator and the tuning agents. 10 | IndexesDelta = NewType("IndexesDelta", list[str]) 11 | SysKnobsDelta = NewType("SysKnobsDelta", dict[str, str]) 12 | # TODO: I'm not decided whether these should be deltas or full configs. I'm going to figure this out once I integrate Proto-X and UDO. 13 | QueryKnobsDelta = NewType("QueryKnobsDelta", dict[str, list[str]]) 14 | 15 | 16 | @dataclass 17 | class TuningMetadata: 18 | """Metadata for the tuning process.""" 19 | 20 | workload_path: Path 21 | pristine_dbdata_snapshot_path: Path 22 | dbdata_parent_path: Path 23 | pgbin_path: Path 24 | 25 | def __post_init__(self) -> None: 26 | """ 27 | Since the metadata needs to persist over time, we need to make sure that the paths are 28 | fully resolved. 29 | """ 30 | assert is_fully_resolved( 31 | self.workload_path 32 | ), f"workload_path={self.workload_path}" 33 | assert is_fully_resolved( 34 | self.pristine_dbdata_snapshot_path 35 | ), f"pristine_dbdata_snapshot_path={self.pristine_dbdata_snapshot_path}" 36 | assert is_fully_resolved( 37 | self.dbdata_parent_path 38 | ), f"dbdata_parent_path={self.dbdata_parent_path}" 39 | assert is_fully_resolved(self.pgbin_path), f"pgbin_path={self.pgbin_path}" 40 | 41 | def asdict(self) -> dict[str, Any]: 42 | return { 43 | "workload_path": str(self.workload_path), 44 | "pristine_dbdata_snapshot_path": str(self.pristine_dbdata_snapshot_path), 45 | "dbdata_parent_path": str(self.dbdata_parent_path), 46 | "pgbin_path": str(self.pgbin_path), 47 | } 48 | 49 | 50 | @dataclass 51 | class DBMSConfigDelta: 52 | """ 53 | This class represents a DBMS config delta. A "DBMS config" is the indexes, system knobs, 54 | and query knobs set by the tuning agent. A "delta" is the change from the prior config. 55 | 56 | `indexes` contains a list of SQL statements for creating indexes. Note that since it's a 57 | config delta, it might contain "DROP ..." statements. 58 | 59 | `sysknobs` contains a mapping from knob names to their values. 60 | 61 | `qknobs` contains a mapping from query IDs to a list of knobs. Each list contains knobs 62 | to prepend to the start of the query. The knobs are a list[str] instead of a dict[str, str] 63 | because knobs can be settings ("SET (enable_sort on)") or flags ("IndexOnlyScan(it)"). 
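    Illustrative example (a sketch only; the index statement, knob value, and query ID below are
    hypothetical and not taken from any real tuning run):

        DBMSConfigDelta(
            indexes=IndexesDelta(["CREATE INDEX lineitem_l_orderkey_idx ON lineitem (l_orderkey)"]),
            sysknobs=SysKnobsDelta({"wal_buffers": "8MB"}),
            qknobs=QueryKnobsDelta({"S1-Q1": ["SET (enable_sort on)", "IndexOnlyScan(it)"]}),
        )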
64 | """ 65 | 66 | indexes: IndexesDelta 67 | sysknobs: SysKnobsDelta 68 | qknobs: QueryKnobsDelta 69 | 70 | 71 | def get_delta_at_step_path(tuning_artifacts_path: Path, step_num: int) -> Path: 72 | return tuning_artifacts_path / f"step{step_num}_delta.json" 73 | 74 | 75 | def get_metadata_path(tuning_artifacts_path: Path) -> Path: 76 | return tuning_artifacts_path / "metadata.json" 77 | 78 | 79 | class TuningArtifactsWriter: 80 | def __init__( 81 | self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata 82 | ) -> None: 83 | self.dbgym_workspace = dbgym_workspace 84 | self.tuning_artifacts_path = ( 85 | self.dbgym_workspace.dbgym_this_run_path / "tuning_artifacts" 86 | ) 87 | # exist_ok is False because you should only create one TuningArtifactsWriter per run. 88 | self.tuning_artifacts_path.mkdir(parents=False, exist_ok=False) 89 | assert is_fully_resolved(self.tuning_artifacts_path) 90 | self.next_step_num = 0 91 | 92 | # Write metadata file 93 | with get_metadata_path(self.tuning_artifacts_path).open("w") as f: 94 | json.dump(metadata.asdict(), f) 95 | 96 | def write_step(self, dbms_cfg_delta: DBMSConfigDelta) -> None: 97 | """ 98 | This wraps _step() and saves the cfg to a file so that it can be replayed. 99 | """ 100 | curr_step_num = self.next_step_num 101 | self.next_step_num += 1 102 | with get_delta_at_step_path(self.tuning_artifacts_path, curr_step_num).open( 103 | "w" 104 | ) as f: 105 | json.dump(asdict(dbms_cfg_delta), f) 106 | 107 | 108 | class TuningArtifactsReader: 109 | def __init__(self, tuning_artifacts_path: Path) -> None: 110 | self.tuning_artifacts_path = tuning_artifacts_path 111 | assert is_fully_resolved(self.tuning_artifacts_path) 112 | num_steps = 0 113 | while get_delta_at_step_path(self.tuning_artifacts_path, num_steps).exists(): 114 | num_steps += 1 115 | self.num_steps = num_steps 116 | 117 | def get_metadata(self) -> TuningMetadata: 118 | with get_metadata_path(self.tuning_artifacts_path).open("r") as f: 119 | data = json.load(f) 120 | return TuningMetadata( 121 | workload_path=Path(data["workload_path"]), 122 | pristine_dbdata_snapshot_path=Path( 123 | data["pristine_dbdata_snapshot_path"] 124 | ), 125 | dbdata_parent_path=Path(data["dbdata_parent_path"]), 126 | pgbin_path=Path(data["pgbin_path"]), 127 | ) 128 | 129 | def get_delta_at_step(self, step_num: int) -> DBMSConfigDelta: 130 | assert step_num >= 0 and step_num < self.num_steps 131 | with get_delta_at_step_path(self.tuning_artifacts_path, step_num).open( 132 | "r" 133 | ) as f: 134 | data = json.load(f) 135 | return DBMSConfigDelta( 136 | indexes=data["indexes"], 137 | sysknobs=data["sysknobs"], 138 | qknobs=data["qknobs"], 139 | ) 140 | 141 | def get_all_deltas_in_order(self) -> list[DBMSConfigDelta]: 142 | return [self.get_delta_at_step(step_num) for step_num in range(self.num_steps)] 143 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/workload.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from gymlib.workspace import DBGymWorkspace, is_fully_resolved 4 | 5 | 6 | class Workload: 7 | def __init__(self, dbgym_workspace: DBGymWorkspace, workload_path: Path) -> None: 8 | self.dbgym_workspace = dbgym_workspace 9 | self.workload_path = workload_path 10 | assert is_fully_resolved(self.workload_path) 11 | 12 | self.queries: dict[str, str] = {} 13 | order_path = self.workload_path / "order.txt" 14 | self.query_order: list[str] = [] 15 | 16 | assert 
order_path.exists() 17 | 18 | with self.dbgym_workspace.open_and_save(order_path) as f: 19 | for line in f: 20 | qid, qpath = line.strip().split(",") 21 | qpath = Path(qpath) 22 | assert is_fully_resolved(qpath) 23 | 24 | with self.dbgym_workspace.open_and_save(qpath) as qf: 25 | self.queries[qid] = qf.read() 26 | self.query_order.append(qid) 27 | 28 | def get_query(self, qid: str) -> str: 29 | return self.queries[qid] 30 | 31 | def get_query_order(self) -> list[str]: 32 | return self.query_order 33 | 34 | def get_queries_in_order(self) -> list[str]: 35 | return [self.queries[qid] for qid in self.query_order] 36 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/workspace.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains everything needed to manage the workspace (the dbgym_workspace/ folder). 3 | """ 4 | 5 | import logging 6 | import os 7 | import shutil 8 | import subprocess 9 | import time 10 | from datetime import datetime 11 | from pathlib import Path 12 | from typing import IO, Any, Optional 13 | 14 | import yaml 15 | 16 | WORKSPACE_PATH_PLACEHOLDER = Path("[workspace]") 17 | SYMLINKS_DNAME = "symlinks" 18 | TMP_DNAME = "tmp" 19 | RUNS_DNAME = "task_runs" 20 | DBGYM_APP_NAME = "dbgym" 21 | LATEST_RUN_FNAME = "latest_run" 22 | 23 | 24 | def is_linkname(name: str) -> bool: 25 | assert not name.endswith(".link.link") 26 | return name.endswith(".link") 27 | 28 | 29 | def name_to_linkname(name: str) -> str: 30 | assert not is_linkname(name) 31 | return f"{name}.link" 32 | 33 | 34 | def linkname_to_name(linkname: str) -> str: 35 | assert is_linkname(linkname) 36 | return linkname[: -len(".link")] 37 | 38 | 39 | def get_symlinks_path_from_workspace_path(workspace_path: Path) -> Path: 40 | return workspace_path / SYMLINKS_DNAME 41 | 42 | 43 | def get_tmp_path_from_workspace_path(workspace_path: Path) -> Path: 44 | return workspace_path / TMP_DNAME 45 | 46 | 47 | def get_runs_path_from_workspace_path(workspace_path: Path) -> Path: 48 | return workspace_path / RUNS_DNAME 49 | 50 | 51 | def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: 52 | return get_runs_path_from_workspace_path(workspace_path) / name_to_linkname( 53 | LATEST_RUN_FNAME 54 | ) 55 | 56 | 57 | # Paths of config files in the codebase. These are always relative paths. 58 | # The reason these can be relative paths instead of functions taking in codebase_path as input is because relative paths are relative to the codebase root 59 | DEFAULT_BOOT_CONFIG_PATH = Path("dbms") / "postgres" / "default_boot_config.yaml" 60 | 61 | 62 | class DBGymWorkspace: 63 | """ 64 | Global configurations that apply to all parts of DB-Gym 65 | """ 66 | 67 | _num_times_created_this_run: int = 0 68 | 69 | def __init__(self, dbgym_workspace_path: Path): 70 | # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. 71 | # This is because DBGymWorkspace creates a new run_*/ dir when it's initialized. 72 | DBGymWorkspace._num_times_created_this_run += 1 73 | assert ( 74 | DBGymWorkspace._num_times_created_this_run == 1 75 | ), f"DBGymWorkspace has been created {DBGymWorkspace._num_times_created_this_run} times. It should only be created once per run." 76 | 77 | self.base_dbgym_repo_path = get_base_dbgym_repo_path() 78 | self.app_name = DBGYM_APP_NAME # TODO: discover this dynamically. app means dbgym or an agent 79 | 80 | # Set and create paths. 
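        # A rough sketch of the layout that the code below creates (directory names come from the
        # SYMLINKS_DNAME / RUNS_DNAME / TMP_DNAME constants defined at the top of this file):
        #   [dbgym_workspace_path]/
        #       symlinks/              <- per-app "latest result" links created by link_result()
        #       task_runs/
        #           run_<timestamp>/   <- this run's results (dbgym_this_run_path)
        #           latest_run.link    <- symlink to the most recent run_*/ dir
        #       tmp/                   <- scratch space for this run only, wiped on the next init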
81 | self.dbgym_workspace_path = dbgym_workspace_path 82 | self.dbgym_workspace_path.mkdir(parents=True, exist_ok=True) 83 | 84 | # Now that the workspace is guaranteed to be created, we can check if it's fully resolved. 85 | assert is_fully_resolved(self.dbgym_workspace_path) 86 | 87 | self.dbgym_runs_path = get_runs_path_from_workspace_path( 88 | self.dbgym_workspace_path 89 | ) 90 | self.dbgym_runs_path.mkdir(parents=True, exist_ok=True) 91 | self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path( 92 | self.dbgym_workspace_path 93 | ) 94 | self.dbgym_symlinks_path.mkdir(parents=True, exist_ok=True) 95 | self.dbgym_cur_symlinks_path = self.dbgym_symlinks_path / self.app_name 96 | # tmp/ is a workspace for this run only 97 | # One use for it is to place the unzipped dbdata. 98 | # There's no need to save the actual dbdata dir in run_*/ because we just save a symlink to 99 | # the .tgz file we unzipped. 100 | self.dbgym_tmp_path = get_tmp_path_from_workspace_path( 101 | self.dbgym_workspace_path 102 | ) 103 | # The best place to delete the old dbgym_tmp_path is in DBGymWorkspace.__init__(). 104 | # This is better than deleting the dbgym_tmp_path is in DBGymWorkspace.__del__() because DBGymWorkspace may get deleted before execution has completed. 105 | # Also, by keeping the tmp directory around, you can look at it to debug issues. 106 | if self.dbgym_tmp_path.exists(): 107 | shutil.rmtree(self.dbgym_tmp_path) 108 | self.dbgym_tmp_path.mkdir(parents=True, exist_ok=True) 109 | 110 | # Set the path for this task run's results. 111 | for _ in range(2): 112 | try: 113 | self.dbgym_this_run_path = ( 114 | self.dbgym_runs_path 115 | / f"run_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" 116 | ) 117 | # `exist_ok` is False because we don't want to override a previous task run's data. 118 | self.dbgym_this_run_path.mkdir(parents=True, exist_ok=False) 119 | # Break if it succeeds so we don't do it a second time. 120 | break 121 | except FileExistsError: 122 | # In case we call task.py twice in one second, sleeping here will fix it. 123 | # Waiting one second is enough since we assume there's only one task.py running at a time. 124 | time.sleep(1) 125 | except Exception as e: 126 | raise e 127 | 128 | self.dbgym_latest_run_path = get_latest_run_path_from_workspace_path( 129 | self.dbgym_workspace_path 130 | ) 131 | try_remove_file(self.dbgym_latest_run_path) 132 | try_create_symlink(self.dbgym_this_run_path, self.dbgym_latest_run_path) 133 | 134 | # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead 135 | def link_result( 136 | self, 137 | result_path: Path, 138 | custom_link_name: Optional[str] = None, 139 | ) -> Path: 140 | """ 141 | result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. 142 | Further, result_path must have been generated by this invocation to task.py. This also means that 143 | result_path itself can be a file or a dir but not a symlink. 144 | Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside 145 | symlinks/[codebase]/[org]/. 146 | Will override the old symlink if there is one, so that symlinks/ always contains the latest generated 147 | version of a file. 148 | This function will return the path to the symlink that was created. 
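        Illustrative usage (a sketch; "tuning_results.csv" is a hypothetical result name):

            result_path = dbgym_workspace.dbgym_this_run_path / "tuning_results.csv"
            ...  # write the result file
            link_path = dbgym_workspace.link_result(result_path)
            # link_path is now symlinks/dbgym/tuning_results.csv.link, pointing at result_path.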
149 | """ 150 | assert isinstance(result_path, Path) 151 | assert is_fully_resolved( 152 | result_path 153 | ), f"result_path ({result_path}) should be a fully resolved path" 154 | assert is_child_path( 155 | result_path, self.dbgym_this_run_path 156 | ), "The result must have been generated in *this* run_*/ dir" 157 | assert not os.path.islink(result_path) 158 | 159 | if type(custom_link_name) is str: 160 | link_name = custom_link_name 161 | else: 162 | if os.path.isfile(result_path): 163 | link_name = name_to_linkname(basename_of_path(result_path)) 164 | elif os.path.isdir(result_path): 165 | link_name = name_to_linkname(basename_of_path(result_path)) 166 | else: 167 | raise AssertionError("result_path must be either a file or dir") 168 | 169 | symlink_parent_path = self.dbgym_symlinks_path / self.app_name 170 | symlink_parent_path.mkdir(parents=True, exist_ok=True) 171 | 172 | # Remove the old symlink ("old" meaning created in an earlier run) if there is one 173 | # Note that in a multi-threaded setting, this might remove one created by a process in the same run, 174 | # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink 175 | # file of the current run regardless of the order of threads. 176 | assert is_linkname( 177 | link_name 178 | ), f'link_name ({link_name}) should end with ".link"' 179 | symlink_path = symlink_parent_path / link_name 180 | try_remove_file(symlink_path) 181 | try_create_symlink(result_path, symlink_path) 182 | 183 | return symlink_path 184 | 185 | def get_run_path_from_path(self, path: Path) -> Path: 186 | run_path = path 187 | while not parent_path_of_path(run_path).samefile(self.dbgym_runs_path): 188 | run_path = parent_path_of_path(run_path) 189 | return run_path 190 | 191 | # TODO(phw2): really look at the clean PR to see what it changed 192 | # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save() 193 | def save_file(self, path: Path) -> None: 194 | """ 195 | If an external function takes in a file/directory as input, you will not be able to call open_and_save(). 196 | In these situations, just call save_file(). 197 | Like open_and_save(), this function only works with real absolute paths. 198 | "Saving" can mean either copying the file or creating a symlink to it 199 | We copy the file if it is a "config", meaning it just exists without having been generated 200 | We create a symlink if it is a "dependency", meaning a task.py command was run to generate it 201 | In these cases we create a symlink so we have full provenance for how the dependency was created 202 | 203 | **Notable Behavior** 204 | - When you save a dependency, it actually creates a link to the outermost directory still inside run_*/. 205 | - The second save will overwrite the first. 206 | - If you save the same file twice in the same run, the second save will overwrite the first. 207 | - If you save two configs with the same name, the second save will overwrite the first. 208 | - If you save two dependencies with the same *outermost* directory, or two dependencies with the same filename 209 | both directly inside run_*/, the second save will overwrite the first. 
210 | """ 211 | # validate path 212 | assert isinstance(path, Path) 213 | assert not os.path.islink(path), f"path ({path}) should not be a symlink" 214 | assert os.path.exists(path), f"path ({path}) does not exist" 215 | assert os.path.isfile(path), f"path ({path}) is not a file" 216 | assert not is_child_path( 217 | path, self.dbgym_this_run_path 218 | ), f"path ({path}) was generated in this task run ({self.dbgym_this_run_path}). You do not need to save it" 219 | 220 | # Save _something_ to dbgym_this_run_path. 221 | # Save a symlink if the opened file was generated by a run. This is for two reasons: 222 | # 1. Files or dirs generated by a run are supposed to be immutable so saving a symlink is safe. 223 | # 2. Files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them. 224 | if is_child_path(path, self.dbgym_runs_path): 225 | # If the path file is directly in run_path, we symlink the file directly. 226 | run_path = self.get_run_path_from_path(path) 227 | parent_path = parent_path_of_path(path) 228 | if parent_path.samefile(run_path): 229 | fname = basename_of_path(path) 230 | symlink_path = self.dbgym_this_run_path / name_to_linkname(fname) 231 | try_remove_file(symlink_path) 232 | try_create_symlink(path, symlink_path) 233 | # Otherwise, we know the path file is _not_ directly inside run_path dir. 234 | # We go as far back as we can while still staying in run_path and symlink that "base" dir. 235 | # This is because lots of runs create dirs within run_path and it creates too much clutter to symlink every individual file. 236 | # Further, this avoids an edge case where you both save a file and the dir it's in. 237 | else: 238 | # Set base_path such that its parent is run_path. 239 | base_path = parent_path 240 | while not parent_path_of_path(base_path).samefile(run_path): 241 | base_path = parent_path_of_path(base_path) 242 | 243 | # Create symlink 244 | open_base_dname = basename_of_path(base_path) 245 | symlink_path = self.dbgym_this_run_path / name_to_linkname( 246 | open_base_dname 247 | ) 248 | try_remove_file(symlink_path) 249 | try_create_symlink(base_path, symlink_path) 250 | # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable. 251 | else: 252 | fname = basename_of_path(path) 253 | # In this case, we want to copy instead of symlinking since it might disappear in the future. 254 | copy_path = self.dbgym_this_run_path / fname 255 | shutil.copy(path, copy_path) 256 | 257 | def open_and_save(self, open_path: Path, mode: str = "r") -> IO[Any]: 258 | """ 259 | Open a file and "save" it to [workspace]/task_runs/run_*/. 260 | It takes in a str | Path to match the interface of open(). 261 | This file does not work if open_path is a symlink, to make its interface identical to that of open(). 262 | Make sure to resolve all symlinks with fully_resolve_path(). 263 | To avoid confusion, I'm enforcing this function to only work with absolute paths. 264 | # TODO: maybe make it work on non-fully-resolved paths to better match open() 265 | See the comment of save_file() for what "saving" means 266 | If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). 267 | This shouldn't be too hard to remember because this function crashes if open_path doesn't exist, 268 | and when you write results you're usually opening open_paths which do not exist. 
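        Illustrative usage (a sketch; the config filename is just an example):

            cfg_path = fully_resolve_path("dbgym_config.yaml")
            with dbgym_workspace.open_and_save(cfg_path) as f:
                cfg = yaml.safe_load(f)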
269 | """ 270 | # Validate open_path 271 | assert isinstance(open_path, Path) 272 | assert is_fully_resolved( 273 | open_path 274 | ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path" 275 | assert not os.path.islink( 276 | open_path 277 | ), f"open_path ({open_path}) should not be a symlink" 278 | assert os.path.exists(open_path), f"open_path ({open_path}) does not exist" 279 | # `open_and_save`` *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save 280 | # a file though. we sometimes save a directory (see save_file() for details) 281 | assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file" 282 | 283 | # Save 284 | self.save_file(open_path) 285 | 286 | # Open 287 | return open(open_path, mode=mode) 288 | 289 | 290 | def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: 291 | """ 292 | Returns the workspace path (as a fully resolved path) from the config file. 293 | """ 294 | with open(dbgym_config_path) as f: 295 | # We do *not* call fully_resolve_path() here because the workspace may not exist yet. 296 | return Path(yaml.safe_load(f)["dbgym_workspace_path"]).resolve().absolute() 297 | 298 | 299 | def make_standard_dbgym_workspace() -> DBGymWorkspace: 300 | """ 301 | The "standard" way to make a DBGymWorkspace using the DBGYM_CONFIG_PATH envvar and the 302 | default path of dbgym_config.yaml. 303 | """ 304 | dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) 305 | dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path) 306 | dbgym_workspace = DBGymWorkspace(dbgym_workspace_path) 307 | return dbgym_workspace 308 | 309 | 310 | def fully_resolve_path(inputpath: os.PathLike[str]) -> Path: 311 | """ 312 | Fully resolve any path to a real, absolute path. 313 | 314 | For flexibility, we take in any os.PathLike. However, for consistency, we always output a Path object. 315 | 316 | Whenever a path is required, the user is allowed to enter relative paths, absolute paths, or paths starting with ~. 317 | 318 | Relative paths are relative to the base dbgym repo dir. 319 | 320 | It *does not* check whether the path exists, since the user might be wanting to create a new file/dir. 321 | 322 | Raises RuntimeError for errors. 323 | """ 324 | # For simplicity, we only process Path objects. 325 | realabspath = Path(inputpath) 326 | # `expanduser()` is always "ok" to call first. 327 | realabspath = realabspath.expanduser() 328 | # The reason we don't call Path.absolute() is because the path should be relative to get_base_dbgym_repo_path(), 329 | # which is not necessary where cwd() points at the time of calling this function. 330 | if not realabspath.is_absolute(): 331 | realabspath = get_base_dbgym_repo_path() / realabspath 332 | # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks. 333 | # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does these together this 334 | # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs) 335 | realabspath = realabspath.resolve() 336 | assert is_fully_resolved( 337 | realabspath 338 | ), f"realabspath ({realabspath}) is not fully resolved" 339 | return realabspath 340 | 341 | 342 | def get_base_dbgym_repo_path() -> Path: 343 | path = Path(os.getcwd()) 344 | assert _is_base_dbgym_repo_path( 345 | path 346 | ), "This script should be invoked from the root of the dbgym repo." 
347 | return path 348 | 349 | 350 | def _is_base_dbgym_repo_path(path: Path) -> bool: 351 | """ 352 | Returns whether we are in the base directory of some git repository 353 | """ 354 | try: 355 | git_toplevel = subprocess.check_output( 356 | ["git", "rev-parse", "--show-toplevel"], encoding="utf-8" 357 | ).strip() 358 | return Path(git_toplevel) == path 359 | except subprocess.CalledProcessError: 360 | # This means we are not in _any_ git repo 361 | return False 362 | except Exception as e: 363 | raise e 364 | 365 | 366 | def is_fully_resolved(path: Path) -> bool: 367 | """ 368 | Checks if a path is fully resolved (exists, is absolute, and contains no symlinks in its entire ancestry). 369 | 370 | The reason we check for existence is because that's the only way we know that there are no symlinks in its entire ancestry. 371 | If we didn't check for existence, we could later create a new symlink in the path's ancestry. 372 | 373 | Even if a path exists, is absolute, and is not itself a symlink, it could still contain 374 | symlinks in its parent directories. For example: 375 | /home/user/ # Real directory 376 | /home/user/links/ # Symlink to /data/links 377 | /home/user/links/file.txt # Real file 378 | 379 | In this case, "/home/user/links/file.txt" exists and isn't itself a symlink, 380 | but it's not fully resolved because it contains a symlink in its ancestry. 381 | The fully resolved path would be "/data/links/file.txt". 382 | """ 383 | assert isinstance(path, Path) 384 | resolved_path = path.resolve() 385 | 386 | # Check if the path exists. 387 | if not resolved_path.exists(): 388 | return False 389 | 390 | # Check if the path contains no symlinks in its entire ancestry. 391 | # This also checks if the path is absolute because resolved_path is absolute. 392 | assert ( 393 | resolved_path.is_absolute() 394 | ), "resolved_path should be absolute (see comment above)" 395 | # Converting them to strings is the most unambiguously strict way of checking equality. 396 | # Stuff like Path.__eq__() or Path.samefile() might be more lenient. 397 | return str(resolved_path) == str(path) 398 | 399 | 400 | def parent_path_of_path(path: Path) -> Path: 401 | """ 402 | This function only calls Path.parent, but in a safer way. 403 | """ 404 | assert isinstance(path, Path) 405 | assert is_fully_resolved( 406 | path 407 | ), f"path must be fully resolved because Path.parent has weird behavior on non-resolved paths (see https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parent)" 408 | parent_path = path.parent 409 | assert isinstance(parent_path, Path) 410 | return parent_path 411 | 412 | 413 | def basename_of_path(path: Path) -> str: 414 | """ 415 | This function only calls Path.name, but in a safer way. 416 | """ 417 | assert isinstance(path, Path) 418 | assert is_fully_resolved( 419 | path 420 | ), f'path must be fully resolved because Path.name has weird behavior on non-resolved paths (like giving ".." 
if the path ends with a "..")' 421 | path_dirname, path_basename = os.path.split(path) 422 | # this means the path ended with a '/' so all os.path.split() does is get rid of the slash 423 | if path_basename == "": 424 | return os.path.basename(path_dirname) 425 | else: 426 | return path_basename 427 | 428 | 429 | # TODO(phw2): refactor to use Path 430 | def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) -> bool: 431 | """ 432 | Checks whether child_path refers to a file/dir/link that is a child of the dir referred to by parent_path 433 | If the two paths are equal, this function returns FALSE 434 | """ 435 | assert os.path.isdir(parent_path) 436 | if os.path.samefile(child_path, parent_path): 437 | return False 438 | else: 439 | return os.path.samefile( 440 | os.path.commonpath([parent_path, child_path]), parent_path 441 | ) 442 | 443 | 444 | def extract_from_task_run_path( 445 | dbgym_workspace: DBGymWorkspace, task_run_path: Path 446 | ) -> tuple[Path, str, Path, str]: 447 | """ 448 | The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want. 449 | This function extracts the [codebase] and [org] components 450 | """ 451 | assert isinstance(task_run_path, Path) 452 | assert not task_run_path.is_symlink() 453 | parent_path = task_run_path.parent 454 | # TODO(phw2): make this a common function 455 | assert not parent_path.samefile( 456 | dbgym_workspace.dbgym_runs_path 457 | ), f"task_run_path ({task_run_path}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" 458 | assert not parent_path_of_path(parent_path).samefile( 459 | dbgym_workspace.dbgym_runs_path 460 | ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" 461 | assert not parent_path_of_path(parent_path_of_path(parent_path)).samefile( 462 | dbgym_workspace.dbgym_runs_path 463 | ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" 464 | # org_path is the run_*/[codebase]/[organization]/ dir that task_run_path is in 465 | org_path = parent_path 466 | while not parent_path_of_path( 467 | parent_path_of_path(parent_path_of_path(org_path)) 468 | ).samefile(dbgym_workspace.dbgym_runs_path): 469 | org_path = parent_path_of_path(org_path) 470 | org_dname = basename_of_path(org_path) 471 | codebase_path = parent_path_of_path(org_path) 472 | codebase_dname = basename_of_path(codebase_path) 473 | 474 | return codebase_path, codebase_dname, org_path, org_dname 475 | 476 | 477 | def try_create_symlink(src_path: Path, dst_path: Path) -> None: 478 | """ 479 | Our functions that create symlinks might be called by multiple processes at once 480 | during HPO. Thus, this is a thread-safe way to create a symlink. 481 | """ 482 | assert is_linkname(dst_path.name) 483 | try: 484 | os.symlink(src_path, dst_path) 485 | except FileExistsError: 486 | # it's ok if it exists 487 | pass 488 | 489 | 490 | def try_remove_file(path: Path) -> None: 491 | """ 492 | Our functions that remove files might be called by multiple processes at once 493 | during HPO. Thus, this is a thread-safe way to remove a file. 
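    Illustrative usage (a sketch mirroring how DBGymWorkspace refreshes latest_run.link; the
    variable names here are placeholders):

        try_remove_file(latest_run_link_path)
        try_create_symlink(this_run_path, latest_run_link_path)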
494 | """ 495 | try: 496 | os.remove(path) 497 | except FileNotFoundError: 498 | # it's ok if it doesn't exist 499 | pass 500 | 501 | 502 | def is_ssd(path: Path) -> bool: 503 | try: 504 | device = ( 505 | subprocess.check_output(["df", path]).decode().split("\n")[1].split()[0] 506 | ) 507 | device_basename = os.path.basename(device) 508 | lsblk_output = subprocess.check_output( 509 | ["lsblk", "-d", "-o", "name,rota"] 510 | ).decode() 511 | for line in lsblk_output.split("\n")[1:]: 512 | parts = line.split() 513 | if parts and parts[0] == device_basename: 514 | is_ssd = int(parts[1]) == 0 515 | return is_ssd 516 | return False 517 | except Exception as e: 518 | logging.error(f"An error occurred: {e}") 519 | return False 520 | -------------------------------------------------------------------------------- /gymlib_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gymlib" 7 | version = "0.1.0" 8 | 9 | [tool.setuptools.packages.find] 10 | where = ["."] 11 | include = ["gymlib*"] -------------------------------------------------------------------------------- /orchestrate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/orchestrate/__init__.py -------------------------------------------------------------------------------- /orchestrate/clean.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | from itertools import chain 5 | from pathlib import Path 6 | 7 | from gymlib.workspace import DBGymWorkspace, is_child_path, parent_path_of_path 8 | 9 | 10 | def add_symlinks_in_path( 11 | symlinks_stack: list[Path], root_path: Path, processed_symlinks: set[Path] 12 | ) -> None: 13 | """ 14 | Will modify symlinks_stack and processed_symlinks. 15 | """ 16 | for root_pathstr, dir_names, file_names in os.walk(root_path): 17 | root_path = Path(root_pathstr) 18 | # symlinks can either be files or directories, so we go through both dir_names and file_names 19 | for file_name in chain(dir_names, file_names): 20 | file_path = root_path / file_name 21 | if file_path.is_symlink() and file_path not in processed_symlinks: 22 | symlinks_stack.append(file_path) 23 | processed_symlinks.add(file_path) 24 | 25 | 26 | def count_files_in_workspace(dbgym_workspace: DBGymWorkspace) -> int: 27 | """ 28 | Counts the number of files (regular file or dir or symlink) in the workspace. 29 | """ 30 | total_count = 0 31 | for dirpath, dirnames, filenames in os.walk( 32 | dbgym_workspace.dbgym_workspace_path, followlinks=False 33 | ): 34 | # Check if any of the directories are symbolic links and remove them from dirnames 35 | dirnames[:] = [ 36 | d for d in dirnames if not os.path.islink(os.path.join(dirpath, d)) 37 | ] 38 | 39 | # Count files and directories (non-symlink directories already filtered) 40 | total_count += len(filenames) + len(dirnames) 41 | 42 | return total_count 43 | 44 | 45 | def clean_workspace( 46 | dbgym_workspace: DBGymWorkspace, 47 | mode: str = "safe", 48 | verbose: bool = False, 49 | ) -> None: 50 | """ 51 | Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks". 
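The two try_* helpers above are deliberately race-tolerant. A sketch of how they might be combined to repoint a link (the helper name and paths are hypothetical, and link_path is assumed to satisfy the is_linkname() naming check used by try_create_symlink()):

```
from pathlib import Path

from gymlib.workspace import try_create_symlink, try_remove_file

def repoint_link(new_target: Path, link_path: Path) -> None:
    # Hypothetical helper: both calls swallow the "already gone" / "already exists"
    # errors, so concurrent HPO workers repointing the same link won't crash each other.
    try_remove_file(link_path)
    try_create_symlink(new_target, link_path)
```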
52 | If mode is "aggressive", "active symlinks" means *only* the symlinks directly in [workspace]/symlinks/. 53 | If mode is "safe", "active symlinks" means the symlinks directly in [workspace]/symlinks/ as well as 54 | any symlinks referenced in task_runs/run_*/ directories we have already decided to keep. 55 | """ 56 | # This stack holds the symlinks that are left to be processed 57 | symlink_paths_to_process: list[Path] = [] 58 | # This set holds the symlinks that have already been processed to avoid infinite loops 59 | processed_symlinks: set[Path] = set() 60 | 61 | # 1. Initialize paths to process 62 | if dbgym_workspace.dbgym_symlinks_path.exists(): 63 | add_symlinks_in_path( 64 | symlink_paths_to_process, 65 | dbgym_workspace.dbgym_symlinks_path, 66 | processed_symlinks, 67 | ) 68 | 69 | # 2. Go through symlinks, figuring out which "children of task runs" to keep 70 | # Based on the rules of the framework, "children of task runs" should be run_*/ directories. 71 | # However, the user's workspace might happen to break these rules by putting directories not 72 | # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_paths" 73 | # instead of "run_paths". 74 | task_run_child_paths_to_keep = set() 75 | 76 | if dbgym_workspace.dbgym_runs_path.exists(): 77 | while symlink_paths_to_process: 78 | symlink_path: Path = symlink_paths_to_process.pop() 79 | assert symlink_path.is_symlink() 80 | # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer. 81 | # However, os.readlink() literally reads the string contents of the link. We need to do some 82 | # processing on the result of os.readlink() to convert it to an absolute path 83 | real_path = symlink_path.resolve() 84 | one_layer_resolved_path = os.readlink(symlink_path) 85 | assert str(real_path) == str( 86 | os.readlink(symlink_path) 87 | ), f"symlink_path ({symlink_path}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." 88 | 89 | # If the file doesn't exist, we'll just ignore it. 90 | if not real_path.exists(): 91 | continue 92 | # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't 93 | # even a descendant, we don't care about it. 94 | if not is_child_path(real_path, dbgym_workspace.dbgym_runs_path): 95 | continue 96 | 97 | assert not real_path.samefile(dbgym_workspace.dbgym_runs_path) 98 | 99 | # Figure out the task_run_child_path to put into task_run_child_paths_to_keep 100 | task_run_child_path = None 101 | if parent_path_of_path(real_path).samefile(dbgym_workspace.dbgym_runs_path): 102 | # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, 103 | # we'll just not delete it if the user happens to have one like this. Even if the user messed up 104 | # the structure somehow, it's just a good idea not to delete it. 105 | task_run_child_path = real_path 106 | else: 107 | # Technically, it's not allowed to symlink to any files not in task_runs/run_*/[codebase]/[organization]/. 108 | # However, as with above, we won't just nuke files if the workspace doesn't follow this rule for 109 | # some reason. 
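To make the "safe" vs. "aggressive" distinction concrete, consider a hypothetical workspace (all names invented):

```
symlinks/artifacts.link          -> task_runs/run_2/.../tuning_artifacts/
task_runs/run_2/.../dbdata.link  -> task_runs/run_1/.../dbdata/
task_runs/run_0/                 (referenced by nothing)

aggressive: keeps run_2 only; run_1 and run_0 are deleted.
safe:       keeps run_2 and run_1 (run_1 is reachable via the symlink inside run_2); only run_0 is deleted.
```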
110 |                 task_run_child_path = real_path
111 |                 while not parent_path_of_path(task_run_child_path).samefile(
112 |                     dbgym_workspace.dbgym_runs_path
113 |                 ):
114 |                     task_run_child_path = parent_path_of_path(task_run_child_path)
115 |             assert task_run_child_path is not None
116 |             assert parent_path_of_path(task_run_child_path).samefile(
117 |                 dbgym_workspace.dbgym_runs_path
118 |             ), f"task_run_child_path ({task_run_child_path}) is not a direct child of dbgym_workspace.dbgym_runs_path"
119 |             task_run_child_paths_to_keep.add(task_run_child_path)
120 | 
121 |             # If on safe mode, add symlinks inside the task_run_child_path to be processed
122 |             if mode == "safe":
123 |                 add_symlinks_in_path(
124 |                     symlink_paths_to_process,
125 |                     task_run_child_path,
126 |                     processed_symlinks,
127 |                 )
128 | 
129 |     # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep
130 |     # It's true that symlinks might link outside of task_runs/*. We'll just not care about those
131 |     starting_num_files = count_files_in_workspace(dbgym_workspace)
132 |     if dbgym_workspace.dbgym_runs_path.exists():
133 |         for child_path in dbgym_workspace.dbgym_runs_path.iterdir():
134 |             if child_path not in task_run_child_paths_to_keep:
135 |                 if child_path.is_dir():
136 |                     shutil.rmtree(child_path)
137 |                 else:
138 |                     os.remove(child_path)
139 |     ending_num_files = count_files_in_workspace(dbgym_workspace)
140 | 
141 |     if verbose:
142 |         logging.info(
143 |             f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files"
144 |         )
145 |         logging.info(
146 |             f"Workspace went from {starting_num_files} to {ending_num_files} files"
147 |         )
148 | 
-------------------------------------------------------------------------------- /orchestrate/cli.py: --------------------------------------------------------------------------------
1 | import click
2 | from gymlib.workspace import DBGymWorkspace
3 | 
4 | from orchestrate.clean import clean_workspace, count_files_in_workspace
5 | 
6 | 
7 | @click.group(name="manage")
8 | def manage_group() -> None:
9 |     pass
10 | 
11 | 
12 | @click.command("clean")
13 | @click.pass_obj
14 | @click.option(
15 |     "--mode",
16 |     type=click.Choice(["safe", "aggressive"]),
17 |     default="safe",
18 |     help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."',
19 | )
20 | def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None:
21 |     clean_workspace(dbgym_workspace, mode=mode, verbose=True)
22 | 
23 | 
24 | @click.command("count")
25 | @click.pass_obj
26 | def manage_count(dbgym_workspace: DBGymWorkspace) -> None:
27 |     num_files = count_files_in_workspace(dbgym_workspace)
28 |     print(
29 |         f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks."
30 | ) 31 | 32 | 33 | manage_group.add_command(manage_clean) 34 | manage_group.add_command(manage_count) 35 | -------------------------------------------------------------------------------- /orchestrate/replay.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from pathlib import Path 3 | 4 | from gymlib.pg import DEFAULT_POSTGRES_PORT 5 | from gymlib.pg_conn import PostgresConn 6 | from gymlib.tuning_artifacts import TuningArtifactsReader 7 | from gymlib.workload import Workload 8 | from gymlib.workspace import DBGymWorkspace 9 | 10 | 11 | def replay( 12 | dbgym_workspace: DBGymWorkspace, tuning_artifacts_path: Path 13 | ) -> list[tuple[float, int]]: 14 | """ 15 | Returns the total runtime and the number of timed out queries for each step. 16 | 17 | The first step will use no configuration changes. 18 | """ 19 | replay_data: list[tuple[float, int]] = [] 20 | 21 | reader = TuningArtifactsReader(tuning_artifacts_path) 22 | pg_conn = PostgresConn( 23 | dbgym_workspace, 24 | DEFAULT_POSTGRES_PORT, 25 | reader.get_metadata().pristine_dbdata_snapshot_path, 26 | reader.get_metadata().dbdata_parent_path, 27 | reader.get_metadata().pgbin_path, 28 | None, 29 | ) 30 | workload = Workload( 31 | dbgym_workspace, 32 | reader.get_metadata().workload_path, 33 | ) 34 | 35 | pg_conn.restore_pristine_snapshot() 36 | pg_conn.restart_postgres() 37 | qknobs: defaultdict[str, list[str]] = defaultdict(list) 38 | replay_data.append(time_workload(pg_conn, workload, qknobs)) 39 | 40 | for delta in reader.get_all_deltas_in_order(): 41 | pg_conn.restart_with_changes(delta.sysknobs) 42 | 43 | for index in delta.indexes: 44 | pg_conn.psql(index) 45 | 46 | for query, knobs in delta.qknobs.items(): 47 | # TODO: account for deleting a knob if we are representing knobs as deltas. 48 | qknobs[query].extend(knobs) 49 | 50 | replay_data.append(time_workload(pg_conn, workload, qknobs)) 51 | 52 | pg_conn.shutdown_postgres() 53 | return replay_data 54 | 55 | 56 | def time_workload( 57 | pg_conn: PostgresConn, workload: Workload, qknobs: dict[str, list[str]] 58 | ) -> tuple[float, int]: 59 | """ 60 | Returns the total runtime and the number of timed out queries. 
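A minimal sketch of driving replay() programmatically (the artifacts path is hypothetical; in the tests below it comes from a TuningArtifactsWriter):

```
from pathlib import Path

from gymlib.workspace import make_standard_dbgym_workspace

from orchestrate.replay import replay

workspace = make_standard_dbgym_workspace()
replay_data = replay(workspace, Path("/path/to/tuning_artifacts"))
for step, (runtime, num_timed_out) in enumerate(replay_data):
    # Step 0 is the unmodified configuration; each later step applies one more delta.
    print(f"step {step}: {runtime:.1f}s total, {num_timed_out} queries timed out")
```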
61 | """ 62 | total_runtime: float = 0 63 | num_timed_out_queries: int = 0 64 | 65 | for qid in workload.get_query_order(): 66 | query = workload.get_query(qid) 67 | this_query_knobs = qknobs[qid] 68 | runtime, did_time_out, _ = pg_conn.time_query( 69 | query, query_knobs=this_query_knobs 70 | ) 71 | total_runtime += runtime 72 | if did_time_out: 73 | num_timed_out_queries += 1 74 | 75 | return total_runtime, num_timed_out_queries 76 | -------------------------------------------------------------------------------- /orchestrate/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/orchestrate/tests/__init__.py -------------------------------------------------------------------------------- /orchestrate/tests/integtest_replay.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 4 | from gymlib.tuning_artifacts import ( 5 | DBMSConfigDelta, 6 | IndexesDelta, 7 | QueryKnobsDelta, 8 | SysKnobsDelta, 9 | TuningArtifactsWriter, 10 | ) 11 | from gymlib.workspace import DBGymWorkspace 12 | 13 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED 14 | from orchestrate.replay import replay 15 | 16 | 17 | class ReplayTests(unittest.TestCase): 18 | workspace: DBGymWorkspace 19 | 20 | @staticmethod 21 | def setUpClass() -> None: 22 | GymlibIntegtestManager.set_up_workspace() 23 | # Reset _num_times_created_this_run since previous tests may have created a workspace. 24 | DBGymWorkspace._num_times_created_this_run = 0 25 | ReplayTests.workspace = DBGymWorkspace( 26 | GymlibIntegtestManager.get_workspace_path() 27 | ) 28 | 29 | def test_replay(self) -> None: 30 | writer = TuningArtifactsWriter( 31 | ReplayTests.workspace, 32 | GymlibIntegtestManager.get_default_metadata(), 33 | ) 34 | writer.write_step( 35 | DBMSConfigDelta( 36 | indexes=IndexesDelta( 37 | ["CREATE INDEX idx_orders_custkey ON orders(o_custkey)"] 38 | ), 39 | sysknobs=SysKnobsDelta( 40 | {"shared_buffers": "2GB"}, 41 | ), 42 | qknobs=QueryKnobsDelta( 43 | { 44 | f"S{DEFAULT_TPCH_SEED}-Q1": [ 45 | "set enable_hashagg = off", 46 | "set enable_sort = on", 47 | ], 48 | } 49 | ), 50 | ) 51 | ) 52 | replay_data = replay( 53 | ReplayTests.workspace, 54 | writer.tuning_artifacts_path, 55 | ) 56 | 57 | # We do some very simple sanity checks here due to the inherent randomness of executing a workload. 58 | # We check that there is one data point for the initial config and one for the config change. 59 | self.assertEqual(len(replay_data), 2) 60 | # We check that the second step is faster. 61 | self.assertLess(replay_data[1][0], replay_data[0][0]) 62 | # We check that no queries timed out in either step. 
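Because replay() accumulates query knobs across steps (qknobs[query].extend(knobs)) rather than replacing them, a later step only needs to carry what changed. A hypothetical second step appended to the writer in this test could look like:

```
writer.write_step(
    DBMSConfigDelta(
        indexes=IndexesDelta([]),  # no new indexes in this hypothetical step
        sysknobs=SysKnobsDelta({"work_mem": "64MB"}),
        qknobs=QueryKnobsDelta(
            {f"S{DEFAULT_TPCH_SEED}-Q1": ["set enable_nestloop = off"]}
        ),
    )
)
```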
63 | self.assertEqual(replay_data[0][1], 0) 64 | self.assertEqual(replay_data[1][1], 0) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/_build_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This helper script creates a conda environment. 3 | # You should not run this directly. Instead, use build_agent_conda_env.sh or build_gym_conda_env.sh. 4 | 5 | set -euo pipefail 6 | 7 | # 1. Checks. 8 | # 1.1. Check that conda is installed. 9 | if ! command -v conda &> /dev/null; then 10 | echo "Error: Conda is not installed" 11 | exit 1 12 | fi 13 | 14 | # 1.2. Input validation. 15 | if [ "$#" -lt 3 ]; then 16 | echo "Usage: ./_build_conda_env.sh " 17 | exit 1 18 | fi 19 | 20 | env_name=$1 21 | python_version_path=$2 22 | requirements_path=$3 23 | 24 | # 1.3. Check that the environment doesn't already exist. 25 | if conda info --envs | grep -q "^$env_name "; then 26 | echo "Error: Conda environment '$env_name' already exists" 27 | exit 1 28 | fi 29 | 30 | # 2. Set up the environment. 31 | # Note: I am intentionally not using environment.yml. I am instead using 32 | # requirements.txt and .python_version. This is for two reasons: 33 | # 1. environment.yml sets the conda env name. However, I want to enforce 34 | # that the conda env name is the same as the agent name. 35 | # 2. requirements.txt can be used by pip and only contains packages and 36 | # not any additional conda-specific syntax, making it more modular 37 | # and flexible. 38 | 39 | # 2.1. Set python_version variable. 40 | if [ -f "$python_version_path" ]; then 41 | python_version=$(cat "$python_version_path") 42 | else 43 | echo "Info: .python_version not found in $python_version_path. Using default Python 3.10." 44 | python_version="3.10" 45 | fi 46 | 47 | # 2.2. Create conda environment with specified Python version. 48 | echo "Creating conda environment '$env_name' with Python $python_version..." 49 | eval "$(conda shell.bash hook)" 50 | conda create -y -n "$env_name" python="$python_version" 51 | 52 | # 2.3. Install the packages. 53 | conda activate "$env_name" 54 | 55 | if [ -f "$requirements_path" ]; then 56 | echo "Installing pip requirements from $requirements_path..." 57 | pip install -r "$requirements_path" 58 | else 59 | echo "Info: $requirements_path not found. Skipping pip install." 60 | fi 61 | 62 | # We always install gymlib so that the agent has access to it. 63 | if [ -d "gymlib_package" ]; then 64 | echo "Installing gymlib..." 65 | # Note that I don't use -e here. When I tried -e, the editor wouldn't be able to find gymlib. 66 | pip install ./gymlib_package 67 | else 68 | echo "Error: gymlib_package directory not found in $(pwd). Please ensure you're running this script from the right folder." 69 | exit 1 70 | fi 71 | 72 | conda deactivate 73 | 74 | # 2.4. Success message. 75 | echo "Conda environment '$env_name' created successfully." 76 | echo "It is not currently activated. To activate it, run 'conda activate $env_name'." 
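This helper is normally invoked through the wrapper scripts that follow (build_agent_conda_env.sh and build_dbgym_conda_env.sh). For a hypothetical agent checked out at agents/my_agent, the wrapper call and its direct equivalent are:

```
# From the dbgym repo root:
./scripts/build_agent_conda_env.sh my_agent
# ...which boils down to:
./scripts/_build_conda_env.sh my_agent agents/my_agent/.python_version agents/my_agent/requirements.txt
```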
77 | 
-------------------------------------------------------------------------------- /scripts/_load_per_machine_envvars.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | host=$(hostname)
3 | 
4 | if [ "$host" == "dev4" ]; then
5 |     export DBDATA_PARENT_PATH=/mnt/nvme1n1/phw2/dbgym_tmp/
6 |     export INTENDED_DBDATA_HARDWARE=ssd
7 | elif [ "$host" == "dev6" ]; then
8 |     export DBDATA_PARENT_PATH=/mnt/nvme0n1/phw2/dbgym_tmp/
9 |     export INTENDED_DBDATA_HARDWARE=ssd
10 | elif [ "$host" == "patnuc" ]; then
11 |     export DBDATA_PARENT_PATH=../dbgym_workspace/tmp/
12 |     export INTENDED_DBDATA_HARDWARE=hdd
13 | else
14 |     echo "Did not recognize host \"$host\""
15 |     exit 1
16 | fi
-------------------------------------------------------------------------------- /scripts/_run_tests.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 | 
5 | if __name__ == "__main__":
6 |     loader = unittest.TestLoader()
7 |     suite = loader.discover(".", pattern=sys.argv[1])
8 |     runner = unittest.TextTestRunner()
9 |     result = runner.run(suite)
10 |     if not result.wasSuccessful():
11 |         # This is needed so that the GHA fails if the unit tests fail.
12 |         sys.exit(1)
13 | 
-------------------------------------------------------------------------------- /scripts/build_agent_conda_env.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script creates a conda environment for a specific agent.
3 | # - Name matches the agent name.
4 | # - Python version from .python_version file in the agent's folder (if exists).
5 | # - Dependencies from requirements.txt file in the agent's folder (if exists).
6 | # - gymlib is installed.
7 | #
8 | # Using this script is *optional*. If you have a more complex environment setup
9 | # for your agent, just do that manually.
10 | #
11 | # Run it from the dbgym root folder (e.g. `./scripts/build_agent_conda_env.sh <agent_name>`).
12 | #
13 | # Before running this script, the user must update the folder of the agent
14 | # they want to create a conda environment for (e.g. by calling submodule update).
15 | # There are other things the user must do as well but these are all checked
16 | # automatically by this script.
17 | 
18 | set -euo pipefail
19 | 
20 | if [ -z "${1:-}" ]; then
21 |     echo "Usage: ./build_agent_conda_env.sh <agent_name>"
22 |     exit 1
23 | fi
24 | 
25 | agent_name=$1
26 | 
27 | if [ ! -d "agents/$agent_name" ]; then
28 |     echo "Error: Agent folder '$agent_name' does not exist"
29 |     exit 1
30 | fi
31 | 
32 | ./scripts/_build_conda_env.sh "$agent_name" "agents/$agent_name/.python_version" "agents/$agent_name/requirements.txt"
-------------------------------------------------------------------------------- /scripts/build_dbgym_conda_env.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script builds the conda environment used by the gym itself (i.e. the orchestrator).
3 | # This script is optional.
You don't need to use conda if you don't want to (the CI doesn't use conda, for instance) 4 | 5 | set -euo pipefail 6 | 7 | ./scripts/_build_conda_env.sh "dbgym" "scripts/configs/.python_version" "scripts/configs/requirements.txt" 8 | -------------------------------------------------------------------------------- /scripts/check_format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | # Ignore agents/ because those are all submodules. 5 | black . --check --exclude agents 6 | isort . --profile black -c --skip agents 7 | -------------------------------------------------------------------------------- /scripts/configs/.python_version: -------------------------------------------------------------------------------- 1 | 3.10.13 -------------------------------------------------------------------------------- /scripts/configs/apt_requirements.txt: -------------------------------------------------------------------------------- 1 | bison 2 | build-essential 3 | flex 4 | libreadline-dev 5 | rpm 6 | zlib1g-dev 7 | cbindgen 8 | redis-server 9 | redis-tools -------------------------------------------------------------------------------- /scripts/configs/e2e_test_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_e2etest_workspace 2 | boot_redis_port: 7379 3 | ray_gcs_port: 7380 -------------------------------------------------------------------------------- /scripts/configs/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict = True 3 | ignore_missing_imports = True 4 | -------------------------------------------------------------------------------- /scripts/configs/requirements.txt: -------------------------------------------------------------------------------- 1 | async-timeout==5.0.1 2 | black==24.10.0 3 | cffi==1.17.1 4 | click==8.1.8 5 | cryptography==44.0.0 6 | greenlet==3.1.1 7 | isort==5.13.2 8 | mypy==1.14.0 9 | mypy-extensions==1.0.0 10 | packaging==24.2 11 | pathspec==0.12.1 12 | pglast==7.2 13 | platformdirs==4.3.6 14 | plumbum==1.9.0 15 | psutil==6.1.1 16 | psycopg==3.2.3 17 | pycparser==2.22 18 | PyYAML==6.0.2 19 | redis==5.2.1 20 | SQLAlchemy==2.0.36 21 | tomli==2.2.1 22 | types-cffi==1.16.0.20241221 23 | types-pyOpenSSL==24.1.0.20240722 24 | types-PyYAML==6.0.12.20241221 25 | types-redis==4.6.0.20241004 26 | types-setuptools==75.6.0.20241223 27 | typing_extensions==4.12.2 28 | -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Ignore agents/ because those are all submodules. 5 | black . --exclude agents 6 | isort . --profile black --skip agents 7 | -------------------------------------------------------------------------------- /scripts/install_sysdeps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # "sysdeps" stands for "system dependencies". 3 | # These are dependencies unrelated to Python that the dbgym needs. 
4 | cat scripts/configs/apt_requirements.txt | xargs sudo apt-get install -y 5 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 6 | -------------------------------------------------------------------------------- /scripts/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ignore agents/ because those are all submodules. 3 | # Ignore gymlib_package/build/ to avoid the error of mypy finding two gymlib packages. 4 | mypy --config-file scripts/configs/mypy.ini . --exclude agents/ --exclude gymlib_package/build/ -------------------------------------------------------------------------------- /scripts/pat_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | . ./scripts/_load_per_machine_envvars.sh 6 | 7 | # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) 8 | exit 0 9 | 10 | # benchmark 11 | python3 task.py benchmark job data 12 | python3 task.py benchmark job workload --query-subset demo 13 | 14 | # postgres 15 | python3 task.py dbms postgres build 16 | python3 task.py dbms postgres dbdata job --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-path $DBDATA_PARENT_PATH -------------------------------------------------------------------------------- /scripts/pipfreeze.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ignore gymlib because we install it manually inside _build_conda_env.sh (not from requirements.txt). 3 | pip freeze | grep -v "^gymlib @" >scripts/configs/requirements.txt -------------------------------------------------------------------------------- /scripts/quickstart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | DBMS=$1 6 | BENCHMARK=$2 7 | SCALE_FACTOR=$3 8 | AGENT=$4 9 | 10 | # Benchmark 11 | python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR 12 | python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR 13 | 14 | # DBMS 15 | python3 task.py dbms $DBMS build 16 | python3 task.py dbms $DBMS dbdata tpch --scale-factor $SCALE_FACTOR 17 | 18 | # Tune 19 | python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" # long datagen so that train doesn't crash 20 | python3 task.py tune $AGENT embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 21 | python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 22 | python3 task.py tune $AGENT agent tune tpch --scale-factor $SCALE_FACTOR 23 | -------------------------------------------------------------------------------- /scripts/run_integration_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m scripts._run_tests "integtest_*.py" -------------------------------------------------------------------------------- /scripts/run_unit_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m scripts._run_tests "unittest_*.py" 3 | 
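Both test wrappers above just hand a filename pattern to scripts/_run_tests.py, which uses unittest discovery. To run a single test module instead (module path shown as an example), plain unittest works from the repo root:

```
python -m unittest orchestrate.tests.unittest_clean
```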
-------------------------------------------------------------------------------- /task.py: -------------------------------------------------------------------------------- 1 | import click 2 | from gymlib.workspace import make_standard_dbgym_workspace 3 | 4 | from benchmark.cli import benchmark_group 5 | from dbms.cli import dbms_group 6 | from orchestrate.cli import manage_group 7 | 8 | # TODO(phw2): Save commit, git diff, and run command. 9 | # TODO(phw2): Remove write permissions on old run_*/ dirs to enforce that they are immutable. 10 | # TODO(phw2): Rename run_*/ to the command used (e.g. tune_protox_*/). 11 | 12 | 13 | @click.group() 14 | @click.pass_context 15 | def task(ctx: click.Context) -> None: 16 | """🛢️ CMU-DB Database Gym: github.com/cmu-db/dbgym 🏋️""" 17 | dbgym_workspace = make_standard_dbgym_workspace() 18 | ctx.obj = dbgym_workspace 19 | 20 | 21 | if __name__ == "__main__": 22 | task.add_command(benchmark_group) 23 | task.add_command(manage_group) 24 | task.add_command(dbms_group) 25 | task() 26 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/util/__init__.py -------------------------------------------------------------------------------- /util/shell.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | 8 | def subprocess_run( 9 | c: str, 10 | cwd: Optional[Path] = None, 11 | check_returncode: bool = True, 12 | verbose: bool = True, 13 | ) -> subprocess.Popen[str]: 14 | """ 15 | We use this instead of subprocess.run() because of the cwd option. 16 | """ 17 | cwd_msg = f"(cwd: {cwd if cwd is not None else os.getcwd()})" 18 | 19 | if verbose: 20 | logging.info(f"Running {cwd_msg}: {c}") 21 | 22 | with subprocess.Popen( 23 | c, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.STDOUT, 26 | shell=True, 27 | cwd=cwd, 28 | text=True, 29 | bufsize=0, 30 | ) as proc: 31 | while True: 32 | loop = proc.poll() is None 33 | assert proc.stdout is not None 34 | for line in proc.stdout: 35 | if verbose: 36 | logging.info(line) 37 | if not loop: 38 | break 39 | if check_returncode and proc.returncode != 0: 40 | raise RuntimeError(f"Non-zero returncode {proc.returncode} for: {c}") 41 | 42 | return proc 43 | --------------------------------------------------------------------------------
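A short usage sketch for subprocess_run() (the command and cwd are illustrative):

```
from pathlib import Path

from util.shell import subprocess_run

# Streams the command's output through logging.info and raises RuntimeError
# on a non-zero exit code (because check_returncode defaults to True).
proc = subprocess_run("make clean && make", cwd=Path("/path/to/some/build/dir"))
print(proc.returncode)  # 0 on success
```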