├── .github └── workflows │ └── tests.yaml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── benchmark ├── __init__.py ├── cli.py ├── constants.py ├── job │ ├── __init__.py │ ├── cli.py │ ├── clone_job_queries.sh │ ├── job_schema.sql │ └── load_info.py ├── tests │ ├── __init__.py │ ├── benchmark_integtest_dbgym_config.yaml │ └── integtest_benchmark.py └── tpch │ ├── __init__.py │ ├── cli.py │ ├── clone_tpch_kit.sh │ ├── constants.py │ ├── load_info.py │ ├── tpch_constraints.sql │ └── tpch_schema.sql ├── dbgym_config.yaml ├── dbms ├── __init__.py ├── cli.py ├── load_info_base_class.py ├── postgres │ ├── __init__.py │ ├── _build_repo.sh │ ├── cli.py │ └── default_boot_config.yaml └── tests │ ├── __init__.py │ ├── dbms_integtest_dbgym_config.yaml │ └── integtest_dbms.py ├── gymlib_package ├── __init__.py ├── gymlib │ ├── __init__.py │ ├── infra_paths.py │ ├── pg.py │ ├── pg_conn.py │ ├── py.typed │ ├── tests │ │ ├── __init__.py │ │ ├── _set_up_gymlib_integtest_workspace.sh │ │ ├── filesystem_unittest_util.py │ │ ├── gymlib_integtest_dbgym_config.yaml │ │ ├── gymlib_integtest_util.py │ │ ├── integtest_pg_conn.py │ │ ├── integtest_tuning_artifacts.py │ │ ├── integtest_workload.py │ │ ├── unittest_filesystem_unittest_util.py │ │ └── unittest_workspace.py │ ├── tuning_artifacts.py │ ├── workload.py │ └── workspace.py └── pyproject.toml ├── orchestrate ├── __init__.py ├── clean.py ├── cli.py ├── replay.py └── tests │ ├── __init__.py │ ├── integtest_replay.py │ └── unittest_clean.py ├── scripts ├── __init__.py ├── _build_conda_env.sh ├── _load_per_machine_envvars.sh ├── _run_tests.py ├── build_agent_conda_env.sh ├── build_dbgym_conda_env.sh ├── check_format.sh ├── configs │ ├── .python_version │ ├── apt_requirements.txt │ ├── e2e_test_dbgym_config.yaml │ ├── mypy.ini │ └── requirements.txt ├── format.sh ├── install_sysdeps.sh ├── mypy.sh ├── pat_test.sh ├── pipfreeze.sh ├── quickstart.sh ├── run_integration_tests.sh └── run_unit_tests.sh ├── task.py └── util ├── __init__.py └── shell.py /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Static, Unit, Integration, and End-to-End Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | 7 | jobs: 8 | tests: 9 | # The code for the self-hosted runners is at https://github.com/wangpatrick57/dbgym-runners. 10 | runs-on: self-hosted 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.10' 19 | 20 | # We could choose to set up dependencies manually in the GHA runner instead of installing them during the GHA. 21 | # 22 | # However, I think it's better to do them in the GHA itself so that we're testing our dependency installation step 23 | # in addition to our actual code. It also removes the need to manually reinstall dependencies on the GHA runners 24 | # every time we add a new dependency. 25 | # 26 | # Note that the GHA runners are stateful. Dependencies installed from previous runs will still be on the runner. 27 | # This means this step will usually be pretty fast as most dependencies will already be cached. However, it also 28 | # means that past runs might interfere with the current run, so you sometimes may need to restart the GHA runners. 29 | 30 | # We need to do `. "$HOME/.cargo/env"` in each step for it to work. 
31 | - name: Install dependencies 32 | run: | 33 | pip install -r ./scripts/configs/requirements.txt 34 | pip install ./gymlib_package 35 | ./scripts/install_sysdeps.sh 36 | 37 | - name: Check formatting 38 | run: | 39 | ./scripts/check_format.sh 40 | 41 | - name: Static type checking 42 | run: | 43 | ./scripts/mypy.sh 44 | 45 | - name: Run unit tests 46 | # Unit tests are defined as tests which don't require any external systems to be running. 47 | run: | 48 | . "$HOME/.cargo/env" 49 | ./scripts/run_unit_tests.sh 50 | 51 | - name: Run integration tests 52 | # Integration tests do require external systems to be running (most commonly a database instance). 53 | # Unlike end-to-end tests though, they test a specific module in a detailed manner, much like a unit test does. 54 | env: 55 | # The CI runs on ssd so we have to set this. 56 | INTENDED_DBDATA_HARDWARE: ssd 57 | run: | 58 | . "$HOME/.cargo/env" 59 | export 60 | ./scripts/run_integration_tests.sh 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .mypy_cache/ 3 | .conda/ 4 | .idea/ 5 | build/ 6 | *_scratchspace/ 7 | workspace/ 8 | default_*_benchbase_config_*.xml 9 | *.egg-info/ 10 | *.code-workspace -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "agents/hello-tune"] 2 | path = agents/hello-tune 3 | url = git@github.com:wangpatrick57/hello-tune.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 CMU Database Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🛢️ Database Gym 🏋️ 2 | [\[Slides\]](http://www.cidrdb.org/cidr2023/slides/p27-lim-slides.pdf) [\[Paper\]](https://www.cidrdb.org/cidr2023/papers/p27-lim.pdf) 3 | 4 | *An end-to-end research vehicle for the field of self-driving DBMSs.* 5 | 6 | ## Quickstart 7 | 8 | These steps were tested on a fresh repository clone, Ubuntu 22.04. 9 | 10 | ``` 11 | # Setup dependencies. 
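# (For reference, the CI workflow in .github/workflows/tests.yaml installs dependencies with:
#   pip install -r ./scripts/configs/requirements.txt
#   pip install ./gymlib_package
#   ./scripts/install_sysdeps.sh)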
12 | # You may want to create a Python 3.10 virtual environment (e.g. with conda) before doing this. 13 | ./dependency/install_dependencies.sh 14 | 15 | # Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune. 16 | ./scripts/quickstart.sh postgres tpch 0.01 protox 17 | ``` 18 | 19 | ## Overview 20 | 21 | Autonomous DBMS research often involves more engineering than research. 22 | As new advances in state-of-the-art technology are made, it is common to find that they have 23 | reimplemented the database tuning pipeline from scratch: workload capture, database setup, 24 | training data collection, model creation, model deployment, and more. 25 | Moreover, these bespoke pipelines make it difficult to combine different techniques even when they 26 | should be independent (e.g., using a different operator latency model in a tuning algorithm). 27 | 28 | The database gym project is our attempt at standardizing the APIs between these disparate tasks, 29 | allowing researchers to mix-and-match the different pipeline components. 30 | It draws inspiration from the Farama Foundation's Gymnasium (formerly OpenAI Gym), which 31 | accelerates the development and comparison of reinforcement learning algorithms by providing a set 32 | of agents, environments, and a standardized API for communicating between them. 33 | Through the database gym, we hope to save other people time and reimplementation effort by 34 | providing an extensible open-source platform for autonomous DBMS research. 35 | 36 | This project is under active development. 37 | Currently, we decompose the database tuning pipeline into the following components: 38 | 39 | 1. Workload: collection, forecasting, synthesis 40 | 2. Database: database loading, instrumentation, orchestrating workload execution 41 | 3. Agent: identifying tuning actions, suggesting an action 42 | 43 | ## Repository Structure 44 | 45 | `task.py` is the entrypoint for all tasks. 46 | The tasks are grouped into categories that correspond to the top-level directories of the repository: 47 | 48 | - `benchmark` - tasks to generate data and queries for different benchmarks (e.g., TPC-H, JOB) 49 | - `dbms` - tasks to build and start DBMSs (e.g., PostgreSQL) 50 | 51 | ## Credits 52 | 53 | The Database Gym project rose from the ashes of the [NoisePage](https://db.cs.cmu.edu/projects/noisepage/) self-driving DBMS project. 54 | 55 | The first prototype was written by [Patrick Wang](https://github.com/wangpatrick57), integrating [Boot (VLDB 2024)](https://github.com/lmwnshn/boot) and [Proto-X (VLDB 2024)](https://github.com/17zhangw/protox) into a cohesive system. 56 | 57 | ## Citing This Repository 58 | 59 | If you use this repository in an academic paper, please cite: 60 | 61 | ``` 62 | @inproceedings{lim23, 63 | author = {Lim, Wan Shen and Butrovich, Matthew and Zhang, William and Crotty, Andrew and Ma, Lin and Xu, Peijing and Gehrke, Johannes and Pavlo, Andrew}, 64 | title = {Database Gyms}, 65 | booktitle = {{CIDR} 2023, Conference on Innovative Data Systems Research}, 66 | year = {2023}, 67 | url = {https://db.cs.cmu.edu/papers/2023/p27-lim.pdf}, 68 | } 69 | ``` 70 | 71 | Additionally, please cite any module-specific paper that is relevant to your use. 72 | 73 | **Accelerating Training Data Generation** 74 | 75 | ``` 76 | (citation pending) 77 | Boot, appearing at VLDB 2024. 
78 | ``` 79 | 80 | **Simultaneously Tuning Multiple Configuration Spaces with Proto Actions** 81 | 82 | ``` 83 | (citation pending) 84 | Proto-X, appearing at VLDB 2024. 85 | ``` 86 | -------------------------------------------------------------------------------- /benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/__init__.py -------------------------------------------------------------------------------- /benchmark/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from gymlib.workspace import DBGymWorkspace 3 | 4 | from benchmark.job.cli import job_group 5 | from benchmark.tpch.cli import tpch_group 6 | 7 | 8 | @click.group(name="benchmark") 9 | @click.pass_obj 10 | def benchmark_group(dbgym_workspace: DBGymWorkspace) -> None: 11 | pass 12 | 13 | 14 | benchmark_group.add_command(tpch_group) 15 | benchmark_group.add_command(job_group) 16 | -------------------------------------------------------------------------------- /benchmark/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_SCALE_FACTOR = 1.0 2 | -------------------------------------------------------------------------------- /benchmark/job/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/job/__init__.py -------------------------------------------------------------------------------- /benchmark/job/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | import click 5 | from gymlib.infra_paths import ( 6 | get_tables_dirname, 7 | get_workload_dirname, 8 | get_workload_suffix, 9 | ) 10 | from gymlib.workspace import DBGymWorkspace, fully_resolve_path, name_to_linkname 11 | 12 | from benchmark.constants import DEFAULT_SCALE_FACTOR 13 | from util.shell import subprocess_run 14 | 15 | JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz" 16 | JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz" 17 | JOB_QUERY_NAMES = [ 18 | "1a", 19 | "1b", 20 | "1c", 21 | "1d", 22 | "2a", 23 | "2b", 24 | "2c", 25 | "2d", 26 | "3a", 27 | "3b", 28 | "3c", 29 | "4a", 30 | "4b", 31 | "4c", 32 | "5a", 33 | "5b", 34 | "5c", 35 | "6a", 36 | "6b", 37 | "6c", 38 | "6d", 39 | "6e", 40 | "6f", 41 | "7a", 42 | "7b", 43 | "7c", 44 | "8a", 45 | "8b", 46 | "8c", 47 | "8d", 48 | "9a", 49 | "9b", 50 | "9c", 51 | "9d", 52 | "10a", 53 | "10b", 54 | "10c", 55 | "11a", 56 | "11b", 57 | "11c", 58 | "11d", 59 | "12a", 60 | "12b", 61 | "12c", 62 | "13a", 63 | "13b", 64 | "13c", 65 | "13d", 66 | "14a", 67 | "14b", 68 | "14c", 69 | "15a", 70 | "15b", 71 | "15c", 72 | "15d", 73 | "16a", 74 | "16b", 75 | "16c", 76 | "16d", 77 | "17a", 78 | "17b", 79 | "17c", 80 | "17d", 81 | "17e", 82 | "17f", 83 | "18a", 84 | "18b", 85 | "18c", 86 | "19a", 87 | "19b", 88 | "19c", 89 | "19d", 90 | "20a", 91 | "20b", 92 | "20c", 93 | "21a", 94 | "21b", 95 | "21c", 96 | "22a", 97 | "22b", 98 | "22c", 99 | "22d", 100 | "23a", 101 | "23b", 102 | "23c", 103 | "24a", 104 | "24b", 105 | "25a", 106 | "25b", 107 | "25c", 108 | "26a", 109 | "26b", 110 | "26c", 111 | "27a", 112 | "27b", 113 | "27c", 114 | "28a", 115 | "28b", 116 | "28c", 117 | "29a", 118 | "29b", 119 | "29c", 120 | "30a", 121 | "30b", 122 | 
"30c", 123 | "31a", 124 | "31b", 125 | "31c", 126 | "32a", 127 | "32b", 128 | "33a", 129 | "33b", 130 | "33c", 131 | ] 132 | JOB_QUERIES_DNAME = "job-queries" 133 | 134 | 135 | @click.group(name="job") 136 | @click.pass_obj 137 | def job_group(dbgym_workspace: DBGymWorkspace) -> None: 138 | pass 139 | 140 | 141 | @job_group.command(name="tables") 142 | # We expose this option to keep its interface consistent with other workloads, but you should never pass in something other than DEFAULT_SCALE_FACTOR. 143 | @click.argument("scale-factor", type=float) 144 | @click.pass_obj 145 | # The reason generate data is separate from create dbdata is because generate data is generic 146 | # to all DBMSs while create dbdata is specific to a single DBMS. 147 | def job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 148 | _job_tables(dbgym_workspace, scale_factor) 149 | 150 | 151 | def _job_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 152 | assert scale_factor == DEFAULT_SCALE_FACTOR 153 | _download_job_tables(dbgym_workspace) 154 | 155 | 156 | @job_group.command(name="workload") 157 | @click.option( 158 | "--query-subset", 159 | type=click.Choice(["all", "a", "demo"]), 160 | default="all", 161 | ) 162 | @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) 163 | @click.pass_obj 164 | def job_workload( 165 | dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float 166 | ) -> None: 167 | _job_workload(dbgym_workspace, query_subset, scale_factor) 168 | 169 | 170 | def _job_workload( 171 | dbgym_workspace: DBGymWorkspace, query_subset: str, scale_factor: float 172 | ) -> None: 173 | assert scale_factor == DEFAULT_SCALE_FACTOR 174 | _download_job_queries(dbgym_workspace) 175 | _generate_job_workload(dbgym_workspace, query_subset) 176 | 177 | 178 | def _download_job_tables(dbgym_workspace: DBGymWorkspace) -> None: 179 | _download_and_untar_dir( 180 | dbgym_workspace, 181 | JOB_TABLES_URL, 182 | "imdb.tgz", 183 | get_tables_dirname("job", DEFAULT_SCALE_FACTOR), 184 | ) 185 | 186 | 187 | def _download_job_queries(dbgym_workspace: DBGymWorkspace) -> None: 188 | _download_and_untar_dir( 189 | dbgym_workspace, 190 | JOB_QUERIES_URL, 191 | "job.tgz", 192 | JOB_QUERIES_DNAME, 193 | untarred_original_dname="job", 194 | ) 195 | 196 | 197 | def _download_and_untar_dir( 198 | dbgym_workspace: DBGymWorkspace, 199 | download_url: str, 200 | download_tarred_fname: str, 201 | untarred_dname: str, 202 | untarred_original_dname: Optional[str] = None, 203 | ) -> None: 204 | """ 205 | Some .tgz files are built from a directory while others are built from the contents of 206 | the directory. If the .tgz file we're untarring is built from a directory, it will have 207 | an "original" directory name. If this is the case, you should set 208 | `untarred_original_dname` to ensure that it gets renamed to `untarred_dname`. 
209 | """ 210 | expected_symlink_path = ( 211 | dbgym_workspace.dbgym_cur_symlinks_path / f"{untarred_dname}.link" 212 | ) 213 | if expected_symlink_path.exists(): 214 | logging.info(f"Skipping download: {expected_symlink_path}") 215 | return 216 | 217 | logging.info(f"Downloading: {expected_symlink_path}") 218 | subprocess_run(f"curl -O {download_url}", cwd=dbgym_workspace.dbgym_this_run_path) 219 | untarred_data_path = dbgym_workspace.dbgym_this_run_path / untarred_dname 220 | 221 | if untarred_original_dname is not None: 222 | assert not untarred_data_path.exists() 223 | subprocess_run( 224 | f"tar -zxvf {download_tarred_fname}", 225 | cwd=dbgym_workspace.dbgym_this_run_path, 226 | ) 227 | assert (dbgym_workspace.dbgym_this_run_path / untarred_original_dname).exists() 228 | subprocess_run( 229 | f"mv {untarred_original_dname} {untarred_dname}", 230 | cwd=dbgym_workspace.dbgym_this_run_path, 231 | ) 232 | else: 233 | untarred_data_path.mkdir(parents=True, exist_ok=False) 234 | subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_path) 235 | 236 | assert untarred_data_path.exists() 237 | subprocess_run( 238 | f"rm {download_tarred_fname}", cwd=dbgym_workspace.dbgym_this_run_path 239 | ) 240 | symlink_path = dbgym_workspace.link_result(untarred_data_path) 241 | assert expected_symlink_path.samefile(symlink_path) 242 | logging.info(f"Downloaded: {expected_symlink_path}") 243 | 244 | 245 | def _generate_job_workload( 246 | dbgym_workspace: DBGymWorkspace, 247 | query_subset: str, 248 | ) -> None: 249 | workload_name = get_workload_dirname( 250 | "job", 251 | DEFAULT_SCALE_FACTOR, 252 | get_workload_suffix("job", query_subset=query_subset), 253 | ) 254 | expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 255 | name_to_linkname(workload_name) 256 | ) 257 | if expected_workload_symlink_path.exists(): 258 | logging.info(f"Skipping generation: {expected_workload_symlink_path}") 259 | return 260 | 261 | logging.info(f"Generating: {expected_workload_symlink_path}") 262 | workload_path = dbgym_workspace.dbgym_this_run_path / workload_name 263 | workload_path.mkdir(parents=False, exist_ok=False) 264 | 265 | query_names = None 266 | if query_subset == "all": 267 | query_names = JOB_QUERY_NAMES 268 | elif query_subset == "a": 269 | query_names = [qname for qname in JOB_QUERY_NAMES if qname[-1] == "a"] 270 | elif query_subset == "demo": 271 | query_names = [f"{i}a" for i in range(1, 6)] 272 | else: 273 | assert False 274 | 275 | with open(workload_path / "order.txt", "w") as f: 276 | queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 277 | name_to_linkname(JOB_QUERIES_DNAME) 278 | ) 279 | 280 | for qname in query_names: 281 | sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql") 282 | f.write(f"Q{qname},{sql_path}\n") 283 | 284 | workload_symlink_path = dbgym_workspace.link_result(workload_path) 285 | assert workload_symlink_path == expected_workload_symlink_path 286 | logging.info(f"Generated: {expected_workload_symlink_path}") 287 | -------------------------------------------------------------------------------- /benchmark/job/clone_job_queries.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | JOB_REPO_ROOT="$1" 6 | 7 | if [ ! 
-d "${JOB_REPO_ROOT}/job-queries" ]; then 8 | mkdir -p "${JOB_REPO_ROOT}" 9 | cd "${JOB_REPO_ROOT}" 10 | git clone https://github.com/wangpatrick57/job-queries.git --single-branch --branch master --depth 1 11 | fi 12 | -------------------------------------------------------------------------------- /benchmark/job/job_schema.sql: -------------------------------------------------------------------------------- 1 | -- Copied over from https://event.cwi.nl/da/job/job.tgz. 2 | -- We copied it over so that we have control over the schema. 3 | CREATE TABLE aka_name ( 4 | id integer NOT NULL PRIMARY KEY, 5 | person_id integer NOT NULL, 6 | name text NOT NULL, 7 | imdb_index character varying(12), 8 | name_pcode_cf character varying(5), 9 | name_pcode_nf character varying(5), 10 | surname_pcode character varying(5), 11 | md5sum character varying(32) 12 | ); 13 | 14 | CREATE TABLE aka_title ( 15 | id integer NOT NULL PRIMARY KEY, 16 | movie_id integer NOT NULL, 17 | title text NOT NULL, 18 | imdb_index character varying(12), 19 | kind_id integer NOT NULL, 20 | production_year integer, 21 | phonetic_code character varying(5), 22 | episode_of_id integer, 23 | season_nr integer, 24 | episode_nr integer, 25 | note text, 26 | md5sum character varying(32) 27 | ); 28 | 29 | CREATE TABLE cast_info ( 30 | id integer NOT NULL PRIMARY KEY, 31 | person_id integer NOT NULL, 32 | movie_id integer NOT NULL, 33 | person_role_id integer, 34 | note text, 35 | nr_order integer, 36 | role_id integer NOT NULL 37 | ); 38 | 39 | CREATE TABLE char_name ( 40 | id integer NOT NULL PRIMARY KEY, 41 | name text NOT NULL, 42 | imdb_index character varying(12), 43 | imdb_id integer, 44 | name_pcode_nf character varying(5), 45 | surname_pcode character varying(5), 46 | md5sum character varying(32) 47 | ); 48 | 49 | CREATE TABLE comp_cast_type ( 50 | id integer NOT NULL PRIMARY KEY, 51 | kind character varying(32) NOT NULL 52 | ); 53 | 54 | CREATE TABLE company_name ( 55 | id integer NOT NULL PRIMARY KEY, 56 | name text NOT NULL, 57 | country_code character varying(255), 58 | imdb_id integer, 59 | name_pcode_nf character varying(5), 60 | name_pcode_sf character varying(5), 61 | md5sum character varying(32) 62 | ); 63 | 64 | CREATE TABLE company_type ( 65 | id integer NOT NULL PRIMARY KEY, 66 | kind character varying(32) NOT NULL 67 | ); 68 | 69 | CREATE TABLE complete_cast ( 70 | id integer NOT NULL PRIMARY KEY, 71 | movie_id integer, 72 | subject_id integer NOT NULL, 73 | status_id integer NOT NULL 74 | ); 75 | 76 | CREATE TABLE info_type ( 77 | id integer NOT NULL PRIMARY KEY, 78 | info character varying(32) NOT NULL 79 | ); 80 | 81 | CREATE TABLE keyword ( 82 | id integer NOT NULL PRIMARY KEY, 83 | keyword text NOT NULL, 84 | phonetic_code character varying(5) 85 | ); 86 | 87 | CREATE TABLE kind_type ( 88 | id integer NOT NULL PRIMARY KEY, 89 | kind character varying(15) NOT NULL 90 | ); 91 | 92 | CREATE TABLE link_type ( 93 | id integer NOT NULL PRIMARY KEY, 94 | link character varying(32) NOT NULL 95 | ); 96 | 97 | CREATE TABLE movie_companies ( 98 | id integer NOT NULL PRIMARY KEY, 99 | movie_id integer NOT NULL, 100 | company_id integer NOT NULL, 101 | company_type_id integer NOT NULL, 102 | note text 103 | ); 104 | 105 | CREATE TABLE movie_info ( 106 | id integer NOT NULL PRIMARY KEY, 107 | movie_id integer NOT NULL, 108 | info_type_id integer NOT NULL, 109 | info text NOT NULL, 110 | note text 111 | ); 112 | 113 | CREATE TABLE movie_info_idx ( 114 | id integer NOT NULL PRIMARY KEY, 115 | movie_id integer NOT NULL, 116 | 
info_type_id integer NOT NULL, 117 | info text NOT NULL, 118 | note text 119 | ); 120 | 121 | CREATE TABLE movie_keyword ( 122 | id integer NOT NULL PRIMARY KEY, 123 | movie_id integer NOT NULL, 124 | keyword_id integer NOT NULL 125 | ); 126 | 127 | CREATE TABLE movie_link ( 128 | id integer NOT NULL PRIMARY KEY, 129 | movie_id integer NOT NULL, 130 | linked_movie_id integer NOT NULL, 131 | link_type_id integer NOT NULL 132 | ); 133 | 134 | CREATE TABLE name ( 135 | id integer NOT NULL PRIMARY KEY, 136 | name text NOT NULL, 137 | imdb_index character varying(12), 138 | imdb_id integer, 139 | gender character varying(1), 140 | name_pcode_cf character varying(5), 141 | name_pcode_nf character varying(5), 142 | surname_pcode character varying(5), 143 | md5sum character varying(32) 144 | ); 145 | 146 | CREATE TABLE person_info ( 147 | id integer NOT NULL PRIMARY KEY, 148 | person_id integer NOT NULL, 149 | info_type_id integer NOT NULL, 150 | info text NOT NULL, 151 | note text 152 | ); 153 | 154 | CREATE TABLE role_type ( 155 | id integer NOT NULL PRIMARY KEY, 156 | role character varying(32) NOT NULL 157 | ); 158 | 159 | CREATE TABLE title ( 160 | id integer NOT NULL PRIMARY KEY, 161 | title text NOT NULL, 162 | imdb_index character varying(12), 163 | kind_id integer NOT NULL, 164 | production_year integer, 165 | imdb_id integer, 166 | phonetic_code character varying(5), 167 | episode_of_id integer, 168 | season_nr integer, 169 | episode_nr integer, 170 | series_years character varying(49), 171 | md5sum character varying(32) 172 | ); -------------------------------------------------------------------------------- /benchmark/job/load_info.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from gymlib.infra_paths import get_tables_symlink_path 5 | from gymlib.workspace import DBGymWorkspace, fully_resolve_path 6 | 7 | from benchmark.constants import DEFAULT_SCALE_FACTOR 8 | from dbms.load_info_base_class import LoadInfoBaseClass 9 | 10 | JOB_SCHEMA_FNAME = "job_schema.sql" 11 | 12 | 13 | class JobLoadInfo(LoadInfoBaseClass): 14 | TABLES = [ 15 | "aka_name", 16 | "aka_title", 17 | "cast_info", 18 | "char_name", 19 | "comp_cast_type", 20 | "company_name", 21 | "company_type", 22 | "complete_cast", 23 | "info_type", 24 | "keyword", 25 | "kind_type", 26 | "link_type", 27 | "movie_companies", 28 | "movie_info", 29 | "movie_info_idx", 30 | "movie_keyword", 31 | "movie_link", 32 | "name", 33 | "person_info", 34 | "role_type", 35 | "title", 36 | ] 37 | 38 | def __init__(self, dbgym_workspace: DBGymWorkspace): 39 | # Schema (directly in the codebase). 
40 | job_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "job" 41 | self._schema_path = job_codebase_path / JOB_SCHEMA_FNAME 42 | assert ( 43 | self._schema_path.exists() 44 | ), f"self._schema_path ({self._schema_path}) does not exist" 45 | 46 | # Tables 47 | tables_path = fully_resolve_path( 48 | get_tables_symlink_path( 49 | dbgym_workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR 50 | ) 51 | ) 52 | self._tables_and_paths = [] 53 | for table in JobLoadInfo.TABLES: 54 | table_path = tables_path / f"{table}.csv" 55 | self._tables_and_paths.append((table, table_path)) 56 | 57 | def get_schema_path(self) -> Path: 58 | return self._schema_path 59 | 60 | def get_tables_and_paths(self) -> list[tuple[str, Path]]: 61 | return self._tables_and_paths 62 | 63 | def get_table_file_delimiter(self) -> str: 64 | return "," 65 | 66 | def get_constraints_path(self) -> Optional[Path]: 67 | # JOB does not have any constraints. It does have indexes, but we don't want to create 68 | # those indexes so that the tuning agent can start from a clean slate. 69 | return None 70 | -------------------------------------------------------------------------------- /benchmark/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/tests/__init__.py -------------------------------------------------------------------------------- /benchmark/tests/benchmark_integtest_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_benchmark_integtest_workspace/ 2 | -------------------------------------------------------------------------------- /benchmark/tests/integtest_benchmark.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from gymlib.infra_paths import ( 6 | get_tables_symlink_path, 7 | get_workload_suffix, 8 | get_workload_symlink_path, 9 | ) 10 | from gymlib.workspace import ( 11 | DBGymWorkspace, 12 | fully_resolve_path, 13 | get_workspace_path_from_config, 14 | ) 15 | 16 | # It's ok to import private functions from the benchmark module because this is an integration test. 17 | from benchmark.constants import DEFAULT_SCALE_FACTOR 18 | from benchmark.job.cli import _job_tables, _job_workload 19 | from benchmark.tpch.cli import _tpch_tables, _tpch_workload 20 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED 21 | 22 | 23 | class BenchmarkTests(unittest.TestCase): 24 | DBGYM_CONFIG_PATH = Path("benchmark/tests/benchmark_integtest_dbgym_config.yaml") 25 | 26 | def setUp(self) -> None: 27 | workspace_path = get_workspace_path_from_config( 28 | BenchmarkTests.DBGYM_CONFIG_PATH 29 | ) 30 | # Get a clean start each time. 31 | if workspace_path.exists(): 32 | shutil.rmtree(workspace_path) 33 | 34 | # Reset this to avoid the error of it being created twice. 35 | # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 
36 | DBGymWorkspace._num_times_created_this_run = 0 37 | self.workspace = DBGymWorkspace(workspace_path) 38 | 39 | def tearDown(self) -> None: 40 | if self.workspace.dbgym_workspace_path.exists(): 41 | shutil.rmtree(self.workspace.dbgym_workspace_path) 42 | 43 | def test_tpch_tables(self) -> None: 44 | scale_factor = 0.01 45 | tables_path = get_tables_symlink_path( 46 | self.workspace.dbgym_workspace_path, "tpch", scale_factor 47 | ) 48 | self.assertFalse(tables_path.exists()) 49 | _tpch_tables(self.workspace, scale_factor) 50 | self.assertTrue(tables_path.exists()) 51 | self.assertTrue(fully_resolve_path(tables_path).exists()) 52 | 53 | def test_job_tables(self) -> None: 54 | tables_path = get_tables_symlink_path( 55 | self.workspace.dbgym_workspace_path, "job", DEFAULT_SCALE_FACTOR 56 | ) 57 | self.assertFalse(tables_path.exists()) 58 | _job_tables(self.workspace, DEFAULT_SCALE_FACTOR) 59 | self.assertTrue(tables_path.exists()) 60 | self.assertTrue(fully_resolve_path(tables_path).exists()) 61 | 62 | def test_tpch_workload(self) -> None: 63 | scale_factor = 0.01 64 | workload_path = get_workload_symlink_path( 65 | self.workspace.dbgym_workspace_path, 66 | "tpch", 67 | scale_factor, 68 | get_workload_suffix( 69 | "tpch", 70 | seed_start=DEFAULT_TPCH_SEED, 71 | seed_end=DEFAULT_TPCH_SEED, 72 | query_subset="all", 73 | ), 74 | ) 75 | self.assertFalse(workload_path.exists()) 76 | _tpch_workload( 77 | self.workspace, DEFAULT_TPCH_SEED, DEFAULT_TPCH_SEED, "all", scale_factor 78 | ) 79 | self.assertTrue(workload_path.exists()) 80 | self.assertTrue(fully_resolve_path(workload_path).exists()) 81 | 82 | def test_job_workload(self) -> None: 83 | workload_path = get_workload_symlink_path( 84 | self.workspace.dbgym_workspace_path, 85 | "job", 86 | DEFAULT_SCALE_FACTOR, 87 | get_workload_suffix( 88 | "job", 89 | query_subset="all", 90 | ), 91 | ) 92 | self.assertFalse(workload_path.exists()) 93 | _job_workload(self.workspace, "all", DEFAULT_SCALE_FACTOR) 94 | self.assertTrue(workload_path.exists()) 95 | self.assertTrue(fully_resolve_path(workload_path).exists()) 96 | 97 | 98 | if __name__ == "__main__": 99 | unittest.main() 100 | -------------------------------------------------------------------------------- /benchmark/tpch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/benchmark/tpch/__init__.py -------------------------------------------------------------------------------- /benchmark/tpch/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | from gymlib.infra_paths import ( 5 | get_scale_factor_string, 6 | get_tables_dirname, 7 | get_tables_symlink_path, 8 | get_workload_suffix, 9 | get_workload_symlink_path, 10 | ) 11 | from gymlib.workspace import ( 12 | DBGymWorkspace, 13 | fully_resolve_path, 14 | is_fully_resolved, 15 | linkname_to_name, 16 | name_to_linkname, 17 | ) 18 | 19 | from benchmark.constants import DEFAULT_SCALE_FACTOR 20 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES 21 | from util.shell import subprocess_run 22 | 23 | TPCH_KIT_DIRNAME = "tpch-kit" 24 | 25 | 26 | @click.group(name="tpch") 27 | @click.pass_obj 28 | def tpch_group(dbgym_workspace: DBGymWorkspace) -> None: 29 | pass 30 | 31 | 32 | @tpch_group.command(name="tables") 33 | @click.argument("scale-factor", type=float) 34 | @click.pass_obj 35 | # The reason generate tables is separate 
from create dbdata is because tpch_tables is generic 36 | # to all DBMSs while create dbdata is specific to a single DBMS. 37 | def tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 38 | _tpch_tables(dbgym_workspace, scale_factor) 39 | 40 | 41 | def _tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 42 | """ 43 | This function exists as a hook for integration tests. 44 | """ 45 | _clone_tpch_kit(dbgym_workspace) 46 | _generate_tpch_tables(dbgym_workspace, scale_factor) 47 | 48 | 49 | @tpch_group.command(name="workload") 50 | @click.option( 51 | "--seed-start", 52 | type=int, 53 | default=DEFAULT_TPCH_SEED, 54 | help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).", 55 | ) 56 | @click.option( 57 | "--seed-end", 58 | type=int, 59 | default=DEFAULT_TPCH_SEED, 60 | help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).", 61 | ) 62 | @click.option( 63 | "--query-subset", 64 | type=click.Choice(["all", "even", "odd"]), 65 | default="all", 66 | ) 67 | @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) 68 | @click.pass_obj 69 | def tpch_workload( 70 | dbgym_workspace: DBGymWorkspace, 71 | seed_start: int, 72 | seed_end: int, 73 | query_subset: str, 74 | scale_factor: float, 75 | ) -> None: 76 | _tpch_workload(dbgym_workspace, seed_start, seed_end, query_subset, scale_factor) 77 | 78 | 79 | def _tpch_workload( 80 | dbgym_workspace: DBGymWorkspace, 81 | seed_start: int, 82 | seed_end: int, 83 | query_subset: str, 84 | scale_factor: float, 85 | ) -> None: 86 | """ 87 | This function exists as a hook for integration tests. 88 | """ 89 | assert ( 90 | seed_start <= seed_end 91 | ), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})" 92 | _clone_tpch_kit(dbgym_workspace) 93 | _generate_tpch_queries(dbgym_workspace, seed_start, seed_end, scale_factor) 94 | _generate_tpch_workload( 95 | dbgym_workspace, seed_start, seed_end, query_subset, scale_factor 96 | ) 97 | 98 | 99 | def _get_queries_dirname(seed: int, scale_factor: float) -> str: 100 | return f"queries_{seed}_sf{get_scale_factor_string(scale_factor)}" 101 | 102 | 103 | def _clone_tpch_kit(dbgym_workspace: DBGymWorkspace) -> None: 104 | expected_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 105 | name_to_linkname(TPCH_KIT_DIRNAME) 106 | ) 107 | if expected_symlink_path.exists(): 108 | logging.info(f"Skipping clone: {expected_symlink_path}") 109 | return 110 | 111 | logging.info(f"Cloning: {expected_symlink_path}") 112 | subprocess_run( 113 | f"./clone_tpch_kit.sh {dbgym_workspace.dbgym_this_run_path}", 114 | cwd=dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch", 115 | ) 116 | symlink_path = dbgym_workspace.link_result( 117 | dbgym_workspace.dbgym_this_run_path / TPCH_KIT_DIRNAME 118 | ) 119 | assert expected_symlink_path.samefile(symlink_path) 120 | logging.info(f"Cloned: {expected_symlink_path}") 121 | 122 | 123 | def _generate_tpch_queries( 124 | dbgym_workspace: DBGymWorkspace, seed_start: int, seed_end: int, scale_factor: float 125 | ) -> None: 126 | tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 127 | name_to_linkname(TPCH_KIT_DIRNAME) 128 | ) 129 | logging.info(f"Generating queries: [{seed_start}, {seed_end}]") 130 | for seed in range(seed_start, seed_end + 1): 131 | expected_queries_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 132 | name_to_linkname(_get_queries_dirname(seed, scale_factor)) 133 | ) 134 | if 
expected_queries_symlink_path.exists(): 135 | continue 136 | 137 | queries_parent_path = ( 138 | dbgym_workspace.dbgym_this_run_path 139 | / _get_queries_dirname(seed, scale_factor) 140 | ) 141 | queries_parent_path.mkdir(parents=False, exist_ok=False) 142 | for i in range(1, NUM_TPCH_QUERIES + 1): 143 | target_sql = (queries_parent_path / f"{i}.sql").resolve() 144 | subprocess_run( 145 | f"DSS_QUERY=./queries ./qgen {i} -r {seed} -s {scale_factor} > {target_sql}", 146 | cwd=tpch_kit_path / "dbgen", 147 | verbose=False, 148 | ) 149 | queries_symlink_path = dbgym_workspace.link_result(queries_parent_path) 150 | assert queries_symlink_path.samefile(expected_queries_symlink_path) 151 | logging.info(f"Generated queries: [{seed_start}, {seed_end}]") 152 | 153 | 154 | def _generate_tpch_tables(dbgym_workspace: DBGymWorkspace, scale_factor: float) -> None: 155 | tpch_kit_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 156 | name_to_linkname(TPCH_KIT_DIRNAME) 157 | ) 158 | expected_tables_symlink_path = get_tables_symlink_path( 159 | dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor 160 | ) 161 | if expected_tables_symlink_path.exists(): 162 | logging.info(f"Skipping generation: {expected_tables_symlink_path}") 163 | return 164 | 165 | logging.info(f"Generating: {expected_tables_symlink_path}") 166 | subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_path / "dbgen") 167 | tables_parent_path = dbgym_workspace.dbgym_this_run_path / get_tables_dirname( 168 | "tpch", scale_factor 169 | ) 170 | tables_parent_path.mkdir(parents=False, exist_ok=False) 171 | subprocess_run(f"mv ./*.tbl {tables_parent_path}", cwd=tpch_kit_path / "dbgen") 172 | 173 | tables_symlink_path = dbgym_workspace.link_result(tables_parent_path) 174 | assert tables_symlink_path.samefile(expected_tables_symlink_path) 175 | logging.info(f"Generated: {expected_tables_symlink_path}") 176 | 177 | 178 | def _generate_tpch_workload( 179 | dbgym_workspace: DBGymWorkspace, 180 | seed_start: int, 181 | seed_end: int, 182 | query_subset: str, 183 | scale_factor: float, 184 | ) -> None: 185 | expected_workload_symlink_path = get_workload_symlink_path( 186 | dbgym_workspace.dbgym_workspace_path, 187 | "tpch", 188 | scale_factor, 189 | get_workload_suffix( 190 | "tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset 191 | ), 192 | ) 193 | if expected_workload_symlink_path.exists(): 194 | logging.info(f"Skipping generation: {expected_workload_symlink_path}") 195 | return 196 | 197 | logging.info(f"Generating: {expected_workload_symlink_path}") 198 | workload_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( 199 | expected_workload_symlink_path.name 200 | ) 201 | workload_path.mkdir(parents=False, exist_ok=False) 202 | 203 | query_names = None 204 | if query_subset == "all": 205 | query_names = [f"{i}" for i in range(1, NUM_TPCH_QUERIES + 1)] 206 | elif query_subset == "even": 207 | query_names = [f"{i}" for i in range(1, NUM_TPCH_QUERIES + 1) if i % 2 == 0] 208 | elif query_subset == "odd": 209 | query_names = [f"{i}" for i in range(1, NUM_TPCH_QUERIES + 1) if i % 2 == 1] 210 | else: 211 | assert False 212 | 213 | with open(workload_path / "order.txt", "w") as f: 214 | for seed in range(seed_start, seed_end + 1): 215 | queries_parent_path = dbgym_workspace.dbgym_cur_symlinks_path / ( 216 | name_to_linkname(_get_queries_dirname(seed, scale_factor)) 217 | ) 218 | 219 | for qname in query_names: 220 | sql_path = fully_resolve_path(queries_parent_path / f"{qname}.sql") 221 | assert 
is_fully_resolved( 222 | sql_path 223 | ), "We should only write existent real absolute paths to a file" 224 | f.write(f"S{seed}-Q{qname},{sql_path}\n") 225 | 226 | workload_symlink_path = dbgym_workspace.link_result(workload_path) 227 | assert workload_symlink_path == expected_workload_symlink_path 228 | logging.info(f"Generated: {expected_workload_symlink_path}") 229 | -------------------------------------------------------------------------------- /benchmark/tpch/clone_tpch_kit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | TPCH_REPO_ROOT="$1" 6 | 7 | if [ ! -d "${TPCH_REPO_ROOT}/tpch-kit" ]; then 8 | mkdir -p "${TPCH_REPO_ROOT}" 9 | cd "${TPCH_REPO_ROOT}" 10 | git clone https://github.com/lmwnshn/tpch-kit.git --single-branch --branch master --depth 1 11 | cd ./tpch-kit/dbgen 12 | make MACHINE=LINUX DATABASE=POSTGRESQL 13 | fi 14 | -------------------------------------------------------------------------------- /benchmark/tpch/constants.py: -------------------------------------------------------------------------------- 1 | DEFAULT_TPCH_SEED = 15721 2 | NUM_TPCH_QUERIES = 22 3 | -------------------------------------------------------------------------------- /benchmark/tpch/load_info.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from gymlib.infra_paths import get_tables_symlink_path 5 | from gymlib.workspace import DBGymWorkspace, fully_resolve_path 6 | 7 | from dbms.load_info_base_class import LoadInfoBaseClass 8 | 9 | TPCH_SCHEMA_FNAME = "tpch_schema.sql" 10 | TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql" 11 | 12 | 13 | class TpchLoadInfo(LoadInfoBaseClass): 14 | TABLES = [ 15 | "region", 16 | "nation", 17 | "part", 18 | "supplier", 19 | "partsupp", 20 | "customer", 21 | "orders", 22 | "lineitem", 23 | ] 24 | 25 | def __init__(self, dbgym_workspace: DBGymWorkspace, scale_factor: float): 26 | # Schema and constraints (directly in the codebase). 
27 | tpch_codebase_path = dbgym_workspace.base_dbgym_repo_path / "benchmark" / "tpch" 28 | self._schema_path = tpch_codebase_path / TPCH_SCHEMA_FNAME 29 | assert ( 30 | self._schema_path.exists() 31 | ), f"self._schema_path ({self._schema_path}) does not exist" 32 | self._constraints_path = tpch_codebase_path / TPCH_CONSTRAINTS_FNAME 33 | assert ( 34 | self._constraints_path.exists() 35 | ), f"self._constraints_path ({self._constraints_path}) does not exist" 36 | 37 | # Tables 38 | tables_path = fully_resolve_path( 39 | get_tables_symlink_path( 40 | dbgym_workspace.dbgym_workspace_path, "tpch", scale_factor 41 | ) 42 | ) 43 | self._tables_and_paths = [] 44 | for table in TpchLoadInfo.TABLES: 45 | table_path = tables_path / f"{table}.tbl" 46 | self._tables_and_paths.append((table, table_path)) 47 | 48 | def get_schema_path(self) -> Path: 49 | return self._schema_path 50 | 51 | def get_tables_and_paths(self) -> list[tuple[str, Path]]: 52 | return self._tables_and_paths 53 | 54 | def get_table_file_delimiter(self) -> str: 55 | return "|" 56 | 57 | def get_constraints_path(self) -> Optional[Path]: 58 | return self._constraints_path 59 | -------------------------------------------------------------------------------- /benchmark/tpch/tpch_constraints.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE nation ADD CONSTRAINT nation_n_regionkey_fkey FOREIGN KEY (n_regionkey) REFERENCES region (r_regionkey) ON DELETE CASCADE; 2 | ALTER TABLE supplier ADD CONSTRAINT supplier_s_nationkey_fkey FOREIGN KEY (s_nationkey) REFERENCES nation (n_nationkey) ON DELETE CASCADE; 3 | ALTER TABLE partsupp ADD CONSTRAINT partsupp_ps_partkey_fkey FOREIGN KEY (ps_partkey) REFERENCES part (p_partkey) ON DELETE CASCADE; 4 | ALTER TABLE partsupp ADD CONSTRAINT partsupp_ps_suppkey_fkey FOREIGN KEY (ps_suppkey) REFERENCES supplier (s_suppkey) ON DELETE CASCADE; 5 | ALTER TABLE customer ADD CONSTRAINT customer_c_nationkey_fkey FOREIGN KEY (c_nationkey) REFERENCES nation (n_nationkey) ON DELETE CASCADE; 6 | ALTER TABLE orders ADD CONSTRAINT orders_o_custkey_fkey FOREIGN KEY (o_custkey) REFERENCES customer (c_custkey) ON DELETE CASCADE; 7 | ALTER TABLE lineitem ADD CONSTRAINT lineitem_l_orderkey_fkey FOREIGN KEY (l_orderkey) REFERENCES orders (o_orderkey) ON DELETE CASCADE; 8 | ALTER TABLE lineitem ADD CONSTRAINT lineitem_l_partkey_l_suppkey_fkey FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp (ps_partkey, ps_suppkey) ON DELETE CASCADE; 9 | 10 | -- We don't create any indexes so that there's a clean slate for tuning 11 | -- CREATE UNIQUE INDEX r_rk ON region (r_regionkey ASC); 12 | -- CREATE UNIQUE INDEX n_nk ON nation (n_nationkey ASC); 13 | -- CREATE INDEX n_rk ON nation (n_regionkey ASC); 14 | -- CREATE UNIQUE INDEX p_pk ON part (p_partkey ASC); 15 | -- CREATE UNIQUE INDEX s_sk ON supplier (s_suppkey ASC); 16 | -- CREATE INDEX s_nk ON supplier (s_nationkey ASC); 17 | -- CREATE INDEX ps_pk ON partsupp (ps_partkey ASC); 18 | -- CREATE INDEX ps_sk ON partsupp (ps_suppkey ASC); 19 | -- CREATE UNIQUE INDEX ps_pk_sk ON partsupp (ps_partkey ASC, ps_suppkey ASC); 20 | -- CREATE UNIQUE INDEX ps_sk_pk ON partsupp (ps_suppkey ASC, ps_partkey ASC); 21 | -- CREATE UNIQUE INDEX c_ck ON customer (c_custkey ASC); 22 | -- CREATE INDEX c_nk ON customer (c_nationkey ASC); 23 | -- CREATE UNIQUE INDEX o_ok ON orders (o_orderkey ASC); 24 | -- CREATE INDEX o_ck ON orders (o_custkey ASC); 25 | -- CREATE INDEX o_od ON orders (o_orderdate ASC); 26 | -- CREATE INDEX l_ok ON lineitem 
(l_orderkey ASC); 27 | -- CREATE INDEX l_pk ON lineitem (l_partkey ASC); 28 | -- CREATE INDEX l_sk ON lineitem (l_suppkey ASC); 29 | -- CREATE INDEX l_sd ON lineitem (l_shipdate ASC); 30 | -- CREATE INDEX l_cd ON lineitem (l_commitdate ASC); 31 | -- CREATE INDEX l_rd ON lineitem (l_receiptdate ASC); 32 | -- CREATE INDEX l_pk_sk ON lineitem (l_partkey ASC, l_suppkey ASC); 33 | -- CREATE INDEX l_sk_pk ON lineitem (l_suppkey ASC, l_partkey ASC); -------------------------------------------------------------------------------- /benchmark/tpch/tpch_schema.sql: -------------------------------------------------------------------------------- 1 | -- Copied over from https://github.com/cmu-db/benchbase/blob/main/src/main/resources/benchmarks/tpch/ddl-postgres.sql 2 | -- We copied it over so that we have control over the schema, not tpch-kit. 3 | 4 | DROP TABLE IF EXISTS nation CASCADE; 5 | DROP TABLE IF EXISTS region CASCADE; 6 | DROP TABLE IF EXISTS part CASCADE; 7 | DROP TABLE IF EXISTS supplier CASCADE; 8 | DROP TABLE IF EXISTS partsupp CASCADE; 9 | DROP TABLE IF EXISTS orders CASCADE; 10 | DROP TABLE IF EXISTS customer CASCADE; 11 | DROP TABLE IF EXISTS lineitem CASCADE; 12 | 13 | CREATE TABLE region ( 14 | r_regionkey integer NOT NULL, 15 | r_name char(25) NOT NULL, 16 | r_comment varchar(152), 17 | PRIMARY KEY (r_regionkey) 18 | ); 19 | 20 | CREATE TABLE nation ( 21 | n_nationkey integer NOT NULL, 22 | n_name char(25) NOT NULL, 23 | n_regionkey integer NOT NULL, 24 | n_comment varchar(152), 25 | PRIMARY KEY (n_nationkey) 26 | ); 27 | 28 | CREATE TABLE part ( 29 | p_partkey integer NOT NULL, 30 | p_name varchar(55) NOT NULL, 31 | p_mfgr char(25) NOT NULL, 32 | p_brand char(10) NOT NULL, 33 | p_type varchar(25) NOT NULL, 34 | p_size integer NOT NULL, 35 | p_container char(10) NOT NULL, 36 | p_retailprice decimal(15, 2) NOT NULL, 37 | p_comment varchar(23) NOT NULL, 38 | PRIMARY KEY (p_partkey) 39 | ); 40 | 41 | CREATE TABLE supplier ( 42 | s_suppkey integer NOT NULL, 43 | s_name char(25) NOT NULL, 44 | s_address varchar(40) NOT NULL, 45 | s_nationkey integer NOT NULL, 46 | s_phone char(15) NOT NULL, 47 | s_acctbal decimal(15, 2) NOT NULL, 48 | s_comment varchar(101) NOT NULL, 49 | PRIMARY KEY (s_suppkey) 50 | ); 51 | 52 | CREATE TABLE partsupp ( 53 | ps_partkey integer NOT NULL, 54 | ps_suppkey integer NOT NULL, 55 | ps_availqty integer NOT NULL, 56 | ps_supplycost decimal(15, 2) NOT NULL, 57 | ps_comment varchar(199) NOT NULL, 58 | PRIMARY KEY (ps_partkey, ps_suppkey) 59 | ); 60 | 61 | CREATE TABLE customer ( 62 | c_custkey integer NOT NULL, 63 | c_name varchar(25) NOT NULL, 64 | c_address varchar(40) NOT NULL, 65 | c_nationkey integer NOT NULL, 66 | c_phone char(15) NOT NULL, 67 | c_acctbal decimal(15, 2) NOT NULL, 68 | c_mktsegment char(10) NOT NULL, 69 | c_comment varchar(117) NOT NULL, 70 | PRIMARY KEY (c_custkey) 71 | ); 72 | 73 | CREATE TABLE orders ( 74 | o_orderkey integer NOT NULL, 75 | o_custkey integer NOT NULL, 76 | o_orderstatus char(1) NOT NULL, 77 | o_totalprice decimal(15, 2) NOT NULL, 78 | o_orderdate date NOT NULL, 79 | o_orderpriority char(15) NOT NULL, 80 | o_clerk char(15) NOT NULL, 81 | o_shippriority integer NOT NULL, 82 | o_comment varchar(79) NOT NULL, 83 | PRIMARY KEY (o_orderkey) 84 | ); 85 | 86 | CREATE TABLE lineitem ( 87 | l_orderkey integer NOT NULL, 88 | l_partkey integer NOT NULL, 89 | l_suppkey integer NOT NULL, 90 | l_linenumber integer NOT NULL, 91 | l_quantity decimal(15, 2) NOT NULL, 92 | l_extendedprice decimal(15, 2) NOT NULL, 93 | l_discount decimal(15, 
2) NOT NULL, 94 | l_tax decimal(15, 2) NOT NULL, 95 | l_returnflag char(1) NOT NULL, 96 | l_linestatus char(1) NOT NULL, 97 | l_shipdate date NOT NULL, 98 | l_commitdate date NOT NULL, 99 | l_receiptdate date NOT NULL, 100 | l_shipinstruct char(25) NOT NULL, 101 | l_shipmode char(10) NOT NULL, 102 | l_comment varchar(44) NOT NULL, 103 | PRIMARY KEY (l_orderkey, l_linenumber) 104 | ); 105 | -------------------------------------------------------------------------------- /dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_workspace 2 | boot_redis_port: 6379 3 | ray_gcs_port: 6380 -------------------------------------------------------------------------------- /dbms/__init__.py: -------------------------------------------------------------------------------- 1 | # This folder contains code for building DBMSs. 2 | # It should not be confused with code that uses DBMSs (e.g. those in tune/env/). 3 | -------------------------------------------------------------------------------- /dbms/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from gymlib.workspace import DBGymWorkspace 3 | 4 | from dbms.postgres.cli import postgres_group 5 | 6 | 7 | @click.group(name="dbms") 8 | @click.pass_obj 9 | def dbms_group(dbgym_workspace: DBGymWorkspace) -> None: 10 | pass 11 | 12 | 13 | dbms_group.add_command(postgres_group) 14 | -------------------------------------------------------------------------------- /dbms/load_info_base_class.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | 5 | class LoadInfoBaseClass: 6 | """ 7 | A base class for providing info for DBMSs to load the data of a benchmark 8 | When copying these functions to a specific benchmark's load_info.py file, don't 9 | copy the comments or type annotations or else they might become out of sync. 10 | """ 11 | 12 | def get_schema_path(self) -> Path: 13 | raise NotImplementedError 14 | 15 | def get_tables_and_paths(self) -> list[tuple[str, Path]]: 16 | raise NotImplementedError 17 | 18 | # We assume the table file has a "csv-like" format where values are separated by a delimiter. 19 | def get_table_file_delimiter(self) -> str: 20 | raise NotImplementedError 21 | 22 | # If the subclassing benchmark does not have constraints, you can return None here. 23 | # Constraints are also indexes. 24 | def get_constraints_path(self) -> Optional[Path]: 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /dbms/postgres/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/dbms/postgres/__init__.py -------------------------------------------------------------------------------- /dbms/postgres/_build_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | REPO_REAL_PARENT_PATH="$1" 6 | 7 | # Download and make postgres from the boot repository. 
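# (Postgres is installed under "${REPO_REAL_PARENT_PATH}/boot/build/postgres"; the hypopg and
# pg_hint_plan builds further down point at its bin/ and lib/.)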
8 | mkdir -p "${REPO_REAL_PARENT_PATH}" 9 | cd "${REPO_REAL_PARENT_PATH}" 10 | git clone https://github.com/lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1 11 | cd ./boot 12 | ./cmudb/build/configure.sh release "${REPO_REAL_PARENT_PATH}/boot/build/postgres" 13 | make clean 14 | make install-world-bin -j4 15 | 16 | # Download and make boot. 17 | cd ./cmudb/extension/boot_rs/ 18 | cargo build --release 19 | cbindgen . -o target/boot_rs.h --lang c 20 | cd "${REPO_REAL_PARENT_PATH}/boot" 21 | 22 | cd ./cmudb/extension/boot/ 23 | make clean 24 | make install -j 25 | cd "${REPO_REAL_PARENT_PATH}/boot" 26 | 27 | # Download and make hypopg. 28 | git clone https://github.com/HypoPG/hypopg.git 29 | cd ./hypopg 30 | PG_CONFIG="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin/pg_config" make install 31 | cd "${REPO_REAL_PARENT_PATH}/boot" 32 | 33 | # Download and make pg_hint_plan. 34 | # We need -L to follow links. 35 | curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz 36 | tar -xzf REL15_1_5_1.tar.gz 37 | rm REL15_1_5_1.tar.gz 38 | cd ./pg_hint_plan-REL15_1_5_1 39 | PATH="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin:$PATH" make 40 | PATH="${REPO_REAL_PARENT_PATH}/boot/build/postgres/bin:$PATH" make install 41 | cp ./pg_hint_plan.so ${REPO_REAL_PARENT_PATH}/boot/build/postgres/lib 42 | -------------------------------------------------------------------------------- /dbms/postgres/cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata). 3 | """ 4 | 5 | import logging 6 | import shutil 7 | import subprocess 8 | from pathlib import Path 9 | from typing import Any, Optional 10 | 11 | import click 12 | import sqlalchemy 13 | from gymlib.infra_paths import ( 14 | get_dbdata_tgz_symlink_path, 15 | get_pgbin_symlink_path, 16 | get_repo_symlink_path, 17 | ) 18 | from gymlib.pg import create_sqlalchemy_conn, sql_file_execute 19 | from gymlib.workspace import ( 20 | WORKSPACE_PATH_PLACEHOLDER, 21 | DBGymWorkspace, 22 | fully_resolve_path, 23 | get_tmp_path_from_workspace_path, 24 | is_fully_resolved, 25 | is_ssd, 26 | linkname_to_name, 27 | ) 28 | from sqlalchemy import text 29 | 30 | from benchmark.constants import DEFAULT_SCALE_FACTOR 31 | from benchmark.job.load_info import JobLoadInfo 32 | from benchmark.tpch.load_info import TpchLoadInfo 33 | from dbms.load_info_base_class import LoadInfoBaseClass 34 | from util.shell import subprocess_run 35 | 36 | DBGYM_POSTGRES_USER = "dbgym_user" 37 | DBGYM_POSTGRES_PASS = "dbgym_pass" 38 | DBGYM_POSTGRES_DBNAME = "dbgym" 39 | DEFAULT_POSTGRES_DBNAME = "postgres" 40 | DEFAULT_POSTGRES_PORT = 5432 41 | SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" 42 | 43 | 44 | @click.group(name="postgres") 45 | @click.pass_obj 46 | def postgres_group(dbgym_workspace: DBGymWorkspace) -> None: 47 | pass 48 | 49 | 50 | @postgres_group.command( 51 | name="build", 52 | help="Download and build the Postgres repository and all necessary extensions/shared libraries. 
Does not create dbdata.", 53 | ) 54 | @click.pass_obj 55 | @click.option( 56 | "--rebuild", 57 | is_flag=True, 58 | help="Include this flag to rebuild Postgres even if it already exists.", 59 | ) 60 | def postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: 61 | _postgres_build(dbgym_workspace, rebuild) 62 | 63 | 64 | def _postgres_build(dbgym_workspace: DBGymWorkspace, rebuild: bool) -> None: 65 | """ 66 | This function exists as a hook for integration tests. 67 | """ 68 | expected_repo_symlink_path = get_repo_symlink_path( 69 | dbgym_workspace.dbgym_workspace_path 70 | ) 71 | if not rebuild and expected_repo_symlink_path.exists(): 72 | logging.info(f"Skipping _postgres_build: {expected_repo_symlink_path}") 73 | return 74 | 75 | logging.info(f"Setting up repo in {expected_repo_symlink_path}") 76 | repo_real_path = dbgym_workspace.dbgym_this_run_path / "repo" 77 | repo_real_path.mkdir(parents=False, exist_ok=False) 78 | subprocess_run( 79 | f"./_build_repo.sh {repo_real_path}", 80 | cwd=dbgym_workspace.base_dbgym_repo_path / "dbms" / "postgres", 81 | ) 82 | 83 | # only link at the end so that the link only ever points to a complete repo 84 | repo_symlink_path = dbgym_workspace.link_result(repo_real_path) 85 | assert expected_repo_symlink_path.samefile(repo_symlink_path) 86 | logging.info(f"Set up repo in {expected_repo_symlink_path}") 87 | 88 | 89 | @postgres_group.command( 90 | name="dbdata", 91 | help="Build a .tgz file of dbdata with various specifications for its contents.", 92 | ) 93 | @click.pass_obj 94 | @click.argument("benchmark_name", type=str) 95 | @click.option("--scale-factor", type=float, default=DEFAULT_SCALE_FACTOR) 96 | @click.option( 97 | "--pgbin-path", 98 | type=Path, 99 | default=None, 100 | help=f"The path to the bin containing Postgres executables. The default is {get_pgbin_symlink_path(WORKSPACE_PATH_PLACEHOLDER)}.", 101 | ) 102 | @click.option( 103 | "--intended-dbdata-hardware", 104 | type=click.Choice(["hdd", "ssd"]), 105 | default="hdd", 106 | help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-path.", 107 | ) 108 | @click.option( 109 | "--dbdata-parent-path", 110 | default=None, 111 | type=Path, 112 | help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {get_tmp_path_from_workspace_path(WORKSPACE_PATH_PLACEHOLDER)}.", 113 | ) 114 | def postgres_dbdata( 115 | dbgym_workspace: DBGymWorkspace, 116 | benchmark_name: str, 117 | scale_factor: float, 118 | pgbin_path: Optional[Path], 119 | intended_dbdata_hardware: str, 120 | dbdata_parent_path: Optional[Path], 121 | ) -> None: 122 | _postgres_dbdata( 123 | dbgym_workspace, 124 | benchmark_name, 125 | scale_factor, 126 | pgbin_path, 127 | intended_dbdata_hardware, 128 | dbdata_parent_path, 129 | ) 130 | 131 | 132 | def _postgres_dbdata( 133 | dbgym_workspace: DBGymWorkspace, 134 | benchmark_name: str, 135 | scale_factor: float, 136 | pgbin_path: Optional[Path], 137 | intended_dbdata_hardware: str, 138 | dbdata_parent_path: Optional[Path], 139 | ) -> None: 140 | """ 141 | This function exists as a hook for integration tests. 
142 | """ 143 | # Set args to defaults programmatically (do this before doing anything else in the function) 144 | if pgbin_path is None: 145 | pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path) 146 | if dbdata_parent_path is None: 147 | dbdata_parent_path = get_tmp_path_from_workspace_path( 148 | dbgym_workspace.dbgym_workspace_path 149 | ) 150 | 151 | # Fully resolve all input paths. 152 | pgbin_path = fully_resolve_path(pgbin_path) 153 | dbdata_parent_path = fully_resolve_path(dbdata_parent_path) 154 | 155 | # Check assertions on args 156 | if intended_dbdata_hardware == "hdd": 157 | assert not is_ssd( 158 | dbdata_parent_path 159 | ), f"Intended hardware is HDD but dbdata_parent_path ({dbdata_parent_path}) is an SSD" 160 | elif intended_dbdata_hardware == "ssd": 161 | assert is_ssd( 162 | dbdata_parent_path 163 | ), f"Intended hardware is SSD but dbdata_parent_path ({dbdata_parent_path}) is an HDD" 164 | else: 165 | assert ( 166 | False 167 | ), f'Intended hardware is "{intended_dbdata_hardware}" which is invalid' 168 | 169 | # Create dbdata 170 | _create_dbdata( 171 | dbgym_workspace, benchmark_name, scale_factor, pgbin_path, dbdata_parent_path 172 | ) 173 | 174 | 175 | def _create_dbdata( 176 | dbgym_workspace: DBGymWorkspace, 177 | benchmark_name: str, 178 | scale_factor: float, 179 | pgbin_path: Path, 180 | dbdata_parent_path: Path, 181 | ) -> None: 182 | """ 183 | If you change the code of _create_dbdata(), you should also delete the symlink so that the next time you run 184 | `dbms postgres dbdata` it will re-create the dbdata. 185 | """ 186 | expected_dbdata_tgz_symlink_path = get_dbdata_tgz_symlink_path( 187 | dbgym_workspace.dbgym_workspace_path, 188 | benchmark_name, 189 | scale_factor, 190 | ) 191 | if expected_dbdata_tgz_symlink_path.exists(): 192 | logging.info(f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}") 193 | return 194 | 195 | # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place. 196 | dbdata_path = dbdata_parent_path / "dbdata_being_created" 197 | # We might be reusing the same dbdata_parent_path, so delete dbdata_path if it already exists 198 | if dbdata_path.exists(): 199 | shutil.rmtree(dbdata_path) 200 | 201 | # Call initdb. 202 | # Save any script we call from pgbin_symlink_path because they are dependencies generated from another task run. 203 | dbgym_workspace.save_file(pgbin_path / "initdb") 204 | subprocess_run(f'./initdb -D "{dbdata_path}"', cwd=pgbin_path) 205 | 206 | # Start Postgres (all other dbdata setup requires postgres to be started). 207 | # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead. 208 | start_postgres(dbgym_workspace, pgbin_path, dbdata_path) 209 | 210 | # Set up Postgres. 211 | _generic_dbdata_setup(dbgym_workspace) 212 | _load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor) 213 | 214 | # Stop Postgres so that we don't "leak" processes. 215 | stop_postgres(dbgym_workspace, pgbin_path, dbdata_path) 216 | 217 | # Create .tgz file. 218 | dbdata_tgz_real_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name( 219 | expected_dbdata_tgz_symlink_path.name 220 | ) 221 | # We need to cd into dbdata_path so that the tar file does not contain folders for the whole path of dbdata_path. 222 | subprocess_run(f"tar -czf {dbdata_tgz_real_path} .", cwd=dbdata_path) 223 | 224 | # Create symlink. 
225 | # Only link at the end so that the link only ever points to a complete dbdata. 226 | dbdata_tgz_symlink_path = dbgym_workspace.link_result(dbdata_tgz_real_path) 227 | assert expected_dbdata_tgz_symlink_path.samefile(dbdata_tgz_symlink_path) 228 | logging.info(f"Created dbdata in {dbdata_tgz_symlink_path}") 229 | 230 | 231 | def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None: 232 | # get necessary vars 233 | pgbin_real_path = get_pgbin_symlink_path( 234 | dbgym_workspace.dbgym_workspace_path 235 | ).resolve() 236 | assert pgbin_real_path.exists() 237 | dbgym_pguser = DBGYM_POSTGRES_USER 238 | dbgym_pgpass = DBGYM_POSTGRES_PASS 239 | pgport = DEFAULT_POSTGRES_PORT 240 | 241 | # Create user 242 | dbgym_workspace.save_file(pgbin_real_path / "psql") 243 | subprocess_run( 244 | f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", 245 | cwd=pgbin_real_path, 246 | ) 247 | subprocess_run( 248 | f'./psql -c "grant pg_monitor to {dbgym_pguser}" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', 249 | cwd=pgbin_real_path, 250 | ) 251 | 252 | # Load shared preload libraries 253 | if SHARED_PRELOAD_LIBRARIES: 254 | subprocess_run( 255 | # You have to use TO and you can't put single quotes around the libraries (https://postgrespro.com/list/thread-id/2580120) 256 | # The method I wrote here works for both one library and multiple libraries 257 | f'./psql -c "ALTER SYSTEM SET shared_preload_libraries TO {SHARED_PRELOAD_LIBRARIES};" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost', 258 | cwd=pgbin_real_path, 259 | ) 260 | 261 | # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database 262 | # as opposed to using databases named after the benchmark. 263 | subprocess_run( 264 | f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost", 265 | cwd=pgbin_real_path, 266 | ) 267 | 268 | 269 | def _load_benchmark_into_dbdata( 270 | dbgym_workspace: DBGymWorkspace, benchmark_name: str, scale_factor: float 271 | ) -> None: 272 | load_info: LoadInfoBaseClass 273 | 274 | with create_sqlalchemy_conn() as conn: 275 | if benchmark_name == "tpch": 276 | load_info = TpchLoadInfo(dbgym_workspace, scale_factor) 277 | elif benchmark_name == "job": 278 | load_info = JobLoadInfo(dbgym_workspace) 279 | else: 280 | raise AssertionError( 281 | f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented" 282 | ) 283 | 284 | _load_into_dbdata(dbgym_workspace, conn, load_info) 285 | 286 | 287 | def _load_into_dbdata( 288 | dbgym_workspace: DBGymWorkspace, 289 | conn: sqlalchemy.Connection, 290 | load_info: LoadInfoBaseClass, 291 | ) -> None: 292 | sql_file_execute(dbgym_workspace, conn, load_info.get_schema_path()) 293 | 294 | # Truncate all tables first before even loading a single one. 295 | for table, _ in load_info.get_tables_and_paths(): 296 | sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE") 297 | # Then, load the tables. 
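    # (For TPC-H's lineitem, for instance, the statement below expands to roughly `COPY lineitem FROM STDIN CSV DELIMITER '|' ESCAPE '\\'`; the real delimiter comes from load_info.get_table_file_delimiter(), so '|' is only illustrative.)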
298 | for table, table_path in load_info.get_tables_and_paths(): 299 | with dbgym_workspace.open_and_save(table_path, "r") as table_csv: 300 | assert conn.connection.dbapi_connection is not None 301 | cur = conn.connection.dbapi_connection.cursor() 302 | try: 303 | with cur.copy( 304 | f"COPY {table} FROM STDIN CSV DELIMITER '{load_info.get_table_file_delimiter()}' ESCAPE '\\'" 305 | ) as copy: 306 | while data := table_csv.read(8192): 307 | copy.write(data) 308 | finally: 309 | cur.close() 310 | 311 | constraints_path = load_info.get_constraints_path() 312 | if constraints_path is not None: 313 | sql_file_execute(dbgym_workspace, conn, constraints_path) 314 | 315 | 316 | # The start and stop functions slightly duplicate functionality from pg_conn.py. However, I chose to do it this way 317 | # because what the `dbms` CLI needs in terms of starting and stopping Postgres is much simpler than what an agent 318 | # that is tuning the database needs. Because these functions are so simple, I think it's okay to leave them here 319 | # even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the 320 | # tuning environment. 321 | def start_postgres( 322 | dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path 323 | ) -> None: 324 | _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, True) 325 | 326 | 327 | def stop_postgres( 328 | dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path 329 | ) -> None: 330 | _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, False) 331 | 332 | 333 | def _start_or_stop_postgres( 334 | dbgym_workspace: DBGymWorkspace, 335 | pgbin_path: Path, 336 | dbdata_path: Path, 337 | is_start: bool, 338 | ) -> None: 339 | # They should be absolute paths and should exist 340 | assert is_fully_resolved(pgbin_path) 341 | assert is_fully_resolved(dbdata_path) 342 | pgport = DEFAULT_POSTGRES_PORT 343 | dbgym_workspace.save_file(pgbin_path / "pg_ctl") 344 | 345 | if is_start: 346 | # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start". 347 | # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None. 348 | # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do. 
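    # (With the default port, the spawned command looks like `./pg_ctl -D "<dbdata_path>" -o '-p 5432' start`, mirroring the stop command further below.)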
349 | result = subprocess.run( 350 | f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' start", 351 | cwd=pgbin_path, 352 | shell=True, 353 | ) 354 | result.check_returncode() 355 | else: 356 | subprocess_run( 357 | f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' stop", 358 | cwd=pgbin_path, 359 | ) 360 | 361 | 362 | def sqlalchemy_conn_execute( 363 | conn: sqlalchemy.Connection, sql: str 364 | ) -> sqlalchemy.engine.CursorResult[Any]: 365 | return conn.execute(text(sql)) 366 | -------------------------------------------------------------------------------- /dbms/postgres/default_boot_config.yaml: -------------------------------------------------------------------------------- 1 | # Macro accelerator 2 | intelligent_cache: true 3 | 4 | # Micro accelerator 5 | early_stop: true 6 | seq_sample: true 7 | seq_sample_pct: 50 8 | seq_sample_seed: 15721 9 | mu_hyp_opt: 0.01 10 | mu_hyp_time: 100000 11 | mu_hyp_stdev: 1.0 -------------------------------------------------------------------------------- /dbms/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/dbms/tests/__init__.py -------------------------------------------------------------------------------- /dbms/tests/dbms_integtest_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_dbms_integtest_workspace/ 2 | -------------------------------------------------------------------------------- /dbms/tests/integtest_dbms.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | from gymlib.infra_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path 7 | from gymlib.workspace import ( 8 | DBGymWorkspace, 9 | fully_resolve_path, 10 | get_workspace_path_from_config, 11 | ) 12 | 13 | from benchmark.tpch.cli import _tpch_tables 14 | from dbms.postgres.cli import _postgres_build, _postgres_dbdata 15 | 16 | 17 | class DBMSTests(unittest.TestCase): 18 | DBGYM_CONFIG_PATH = Path("dbms/tests/dbms_integtest_dbgym_config.yaml") 19 | 20 | def setUp(self) -> None: 21 | workspace_path = get_workspace_path_from_config(DBMSTests.DBGYM_CONFIG_PATH) 22 | # Get a clean start each time. 23 | if workspace_path.exists(): 24 | shutil.rmtree(workspace_path) 25 | 26 | # Reset this to avoid the error of it being created twice. 27 | # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 28 | DBGymWorkspace._num_times_created_this_run = 0 29 | self.workspace = DBGymWorkspace(workspace_path) 30 | 31 | def tearDown(self) -> None: 32 | if self.workspace.dbgym_workspace_path.exists(): 33 | shutil.rmtree(self.workspace.dbgym_workspace_path) 34 | 35 | def test_postgres_build(self) -> None: 36 | repo_path = get_repo_symlink_path(self.workspace.dbgym_workspace_path) 37 | self.assertFalse(repo_path.exists()) 38 | _postgres_build(self.workspace, False) 39 | self.assertTrue(repo_path.exists()) 40 | self.assertTrue(fully_resolve_path(repo_path).exists()) 41 | 42 | def test_postgres_dbdata(self) -> None: 43 | # Setup 44 | # Make sure to recreate self.workspace so that each function call counts as its own run. 
45 | scale_factor = 0.01 46 | _postgres_build(self.workspace, False) 47 | DBGymWorkspace._num_times_created_this_run = 0 48 | self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) 49 | _tpch_tables(self.workspace, scale_factor) 50 | DBGymWorkspace._num_times_created_this_run = 0 51 | self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path) 52 | 53 | # Test 54 | dbdata_tgz_path = get_dbdata_tgz_symlink_path( 55 | self.workspace.dbgym_workspace_path, "tpch", scale_factor 56 | ) 57 | self.assertFalse(dbdata_tgz_path.exists()) 58 | intended_dbdata_hardware = os.environ.get("INTENDED_DBDATA_HARDWARE", "hdd") 59 | _postgres_dbdata( 60 | self.workspace, "tpch", scale_factor, None, intended_dbdata_hardware, None 61 | ) 62 | self.assertTrue(dbdata_tgz_path.exists()) 63 | self.assertTrue(fully_resolve_path(dbdata_tgz_path).exists()) 64 | 65 | 66 | if __name__ == "__main__": 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /gymlib_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/gymlib_package/__init__.py -------------------------------------------------------------------------------- /gymlib_package/gymlib/__init__.py: -------------------------------------------------------------------------------- 1 | from . import infra_paths, workspace 2 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/infra_paths.py: -------------------------------------------------------------------------------- 1 | """ 2 | "Infra" refers to benchmark/ and dbms/. These are all the paths used to access the files created by benchmark/ and dbms/. 3 | They're inside gymlib because agents will need to access them. 
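As an illustrative example, get_dbdata_tgz_symlink_path(workspace_path, "tpch", 1) points at a link named tpch_sf1_pristine_dbdata.tgz.link inside the workspace's symlink directory for this app; the exact directory names come from the constants imported from gymlib.workspace, and the .link suffix assumes name_to_linkname() follows the same convention as the hard-coded "repo.link" below.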
4 | """ 5 | 6 | from pathlib import Path 7 | from typing import Any 8 | 9 | from gymlib.workspace import DBGYM_APP_NAME, SYMLINKS_DNAME, name_to_linkname 10 | 11 | SCALE_FACTOR_PLACEHOLDER: str = "[scale_factor]" 12 | BENCHMARK_NAME_PLACEHOLDER: str = "[benchmark_name]" 13 | WORKLOAD_NAME_PLACEHOLDER: str = "[workload_name]" 14 | 15 | 16 | def get_scale_factor_string(scale_factor: float | str) -> str: 17 | if type(scale_factor) is str and scale_factor == SCALE_FACTOR_PLACEHOLDER: 18 | return scale_factor 19 | else: 20 | if float(int(scale_factor)) == scale_factor: 21 | return str(int(scale_factor)) 22 | else: 23 | return str(scale_factor).replace(".", "point") 24 | 25 | 26 | def get_tables_dirname(benchmark: str, scale_factor: float | str) -> str: 27 | return f"tables_{benchmark}_sf{get_scale_factor_string(scale_factor)}" 28 | 29 | 30 | def get_workload_suffix(benchmark: str, **kwargs: Any) -> str: 31 | if benchmark == "tpch": 32 | assert kwargs.keys() == {"seed_start", "seed_end", "query_subset"} 33 | return f"{kwargs['seed_start']}_{kwargs['seed_end']}_{kwargs['query_subset']}" 34 | elif benchmark == "job": 35 | assert kwargs.keys() == {"query_subset"} 36 | return f"{kwargs['query_subset']}" 37 | else: 38 | assert False 39 | 40 | 41 | def get_workload_dirname(benchmark: str, scale_factor: float | str, suffix: str) -> str: 42 | return f"workload_{benchmark}_sf{get_scale_factor_string(scale_factor)}_{suffix}" 43 | 44 | 45 | def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> str: 46 | return f"{benchmark_name}_sf{get_scale_factor_string(scale_factor)}_pristine_dbdata.tgz" 47 | 48 | 49 | def get_tables_symlink_path( 50 | workspace_path: Path, benchmark: str, scale_factor: float | str 51 | ) -> Path: 52 | return ( 53 | workspace_path 54 | / SYMLINKS_DNAME 55 | / DBGYM_APP_NAME 56 | / name_to_linkname(get_tables_dirname(benchmark, scale_factor)) 57 | ) 58 | 59 | 60 | def get_workload_symlink_path( 61 | workspace_path: Path, benchmark: str, scale_factor: float | str, suffix: str 62 | ) -> Path: 63 | return ( 64 | workspace_path 65 | / SYMLINKS_DNAME 66 | / DBGYM_APP_NAME 67 | / name_to_linkname(get_workload_dirname(benchmark, scale_factor, suffix)) 68 | ) 69 | 70 | 71 | def get_repo_symlink_path(workspace_path: Path) -> Path: 72 | return workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME / "repo.link" 73 | 74 | 75 | def get_pgbin_symlink_path(workspace_path: Path) -> Path: 76 | return get_repo_symlink_path(workspace_path) / "boot" / "build" / "postgres" / "bin" 77 | 78 | 79 | def get_dbdata_tgz_symlink_path( 80 | workspace_path: Path, benchmark_name: str, scale_factor: float | str 81 | ) -> Path: 82 | return ( 83 | workspace_path 84 | / SYMLINKS_DNAME 85 | / DBGYM_APP_NAME 86 | / name_to_linkname(get_dbdata_tgz_filename(benchmark_name, scale_factor)) 87 | ) 88 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/pg.py: -------------------------------------------------------------------------------- 1 | """ 2 | There are multiple parts of the codebase which interact with Postgres. This file contains helpers common to all those parts. 
3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import pglast 9 | import psutil 10 | import psycopg 11 | import sqlalchemy 12 | from gymlib.workspace import DBGymWorkspace 13 | from sqlalchemy import create_engine, text 14 | 15 | DBGYM_POSTGRES_USER = "dbgym_user" 16 | DBGYM_POSTGRES_PASS = "dbgym_pass" 17 | DBGYM_POSTGRES_DBNAME = "dbgym" 18 | DEFAULT_POSTGRES_DBNAME = "postgres" 19 | DEFAULT_POSTGRES_PORT = 5432 20 | SHARED_PRELOAD_LIBRARIES = "boot,pg_hint_plan,pg_prewarm" 21 | 22 | 23 | def sqlalchemy_conn_execute( 24 | conn: sqlalchemy.Connection, sql: str 25 | ) -> sqlalchemy.engine.CursorResult[Any]: 26 | return conn.execute(text(sql)) 27 | 28 | 29 | def sql_file_queries(dbgym_workspace: DBGymWorkspace, filepath: Path) -> list[str]: 30 | with dbgym_workspace.open_and_save(filepath) as f: 31 | lines: list[str] = [] 32 | for line in f: 33 | if line.startswith("--"): 34 | continue 35 | if len(line.strip()) == 0: 36 | continue 37 | lines.append(line) 38 | queries_str = "".join(lines) 39 | queries: list[str] = pglast.split(queries_str) 40 | return queries 41 | 42 | 43 | def sql_file_execute( 44 | dbgym_workspace: DBGymWorkspace, conn: sqlalchemy.Connection, filepath: Path 45 | ) -> None: 46 | for sql in sql_file_queries(dbgym_workspace, filepath): 47 | sqlalchemy_conn_execute(conn, sql) 48 | 49 | 50 | # The reason pgport is an argument is because when doing agnet HPO, we want to run multiple instances of Postgres 51 | # at the same time. In this situation, they need to have different ports 52 | def get_connstr(pgport: int = DEFAULT_POSTGRES_PORT, use_psycopg: bool = True) -> str: 53 | connstr_suffix = f"{DBGYM_POSTGRES_USER}:{DBGYM_POSTGRES_PASS}@localhost:{pgport}/{DBGYM_POSTGRES_DBNAME}" 54 | # use_psycopg means whether or not we use the psycopg.connect() function 55 | # counterintuively, you *don't* need psycopg in the connection string if you *are* 56 | # using the psycopg.connect() function 57 | connstr_prefix = "postgresql" if use_psycopg else "postgresql+psycopg" 58 | return connstr_prefix + "://" + connstr_suffix 59 | 60 | 61 | def get_kv_connstr(pgport: int = DEFAULT_POSTGRES_PORT) -> str: 62 | return f"host=localhost port={pgport} user={DBGYM_POSTGRES_USER} password={DBGYM_POSTGRES_PASS} dbname={DBGYM_POSTGRES_DBNAME}" 63 | 64 | 65 | def create_psycopg_conn(pgport: int = DEFAULT_POSTGRES_PORT) -> psycopg.Connection[Any]: 66 | connstr = get_connstr(use_psycopg=True, pgport=pgport) 67 | psycopg_conn = psycopg.connect(connstr, autocommit=True, prepare_threshold=None) 68 | return psycopg_conn 69 | 70 | 71 | def create_sqlalchemy_conn( 72 | pgport: int = DEFAULT_POSTGRES_PORT, 73 | ) -> sqlalchemy.Connection: 74 | connstr = get_connstr(use_psycopg=False, pgport=pgport) 75 | engine: sqlalchemy.Engine = create_engine( 76 | connstr, 77 | execution_options={"isolation_level": "AUTOCOMMIT"}, 78 | ) 79 | return engine.connect() 80 | 81 | 82 | def get_is_postgres_running() -> bool: 83 | """ 84 | This is often used in assertions to ensure that Postgres isn't running before we 85 | execute some code. 86 | 87 | I intentionally do not have a function that forcefully *stops* all Postgres instances. 88 | This is risky because it could accidentally stop instances it wasn't supposed (e.g. 89 | Postgres instances run by other users on the same machine). 90 | 91 | Stopping Postgres instances is thus a responsibility of the human to take care of. 
92 | """ 93 | return len(get_running_postgres_ports()) > 0 94 | 95 | 96 | def get_running_postgres_ports() -> list[int]: 97 | """ 98 | Returns a list of all ports on which Postgres is currently running. 99 | 100 | There are ways to check with psycopg/sqlalchemy. However, I chose to check using 101 | psutil to keep it as simple as possible and orthogonal to how connections work. 102 | """ 103 | running_ports = [] 104 | 105 | for conn in psutil.net_connections(kind="inet"): 106 | if conn.status == "LISTEN": 107 | try: 108 | proc = psutil.Process(conn.pid) 109 | if proc.name() == "postgres": 110 | running_ports.append(conn.laddr.port) 111 | except (psutil.NoSuchProcess, psutil.AccessDenied): 112 | continue 113 | 114 | return running_ports 115 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/pg_conn.py: -------------------------------------------------------------------------------- 1 | """ 2 | At a high level, this file's goal is to provide helpers to manage a Postgres instance during 3 | agent tuning. 4 | 5 | On the other hand, the goal of dbms.postgres.cli is to (1) install+build postgres and (2) 6 | create dbdata. 7 | 8 | util.pg provides helpers used by *both* of the above files (as well as other files). 9 | """ 10 | 11 | import logging 12 | import os 13 | import shutil 14 | import threading 15 | import time 16 | from pathlib import Path 17 | from typing import Any, Optional, Union 18 | 19 | import psutil 20 | import psycopg 21 | import yaml 22 | from gymlib.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr 23 | from gymlib.workspace import DBGymWorkspace, parent_path_of_path 24 | from plumbum import local 25 | from psycopg.errors import ProgramLimitExceeded, QueryCanceled 26 | 27 | CONNECT_TIMEOUT = 300 28 | 29 | 30 | class PostgresConn: 31 | # The reason that PostgresConn takes in all these paths (e.g. `pgbin_path`) instead of inferring them 32 | # automatically from the default workspace paths is so that it's fully decoupled from how the files 33 | # are organized in the workspace. 34 | def __init__( 35 | self, 36 | dbgym_workspace: DBGymWorkspace, 37 | pgport: int, 38 | pristine_dbdata_snapshot_path: Path, 39 | dbdata_parent_path: Path, 40 | pgbin_path: Union[str, Path], 41 | # Whether this is None determines whether Boot is enabled. 42 | boot_config_path: Optional[Path], 43 | ) -> None: 44 | 45 | self.dbgym_workspace = dbgym_workspace 46 | self.pgport = pgport 47 | self.pgbin_path = pgbin_path 48 | self.boot_config_path = boot_config_path 49 | self.log_step = 0 50 | 51 | # All the paths related to dbdata 52 | # pristine_dbdata_snapshot_path is the .tgz snapshot that represents the starting state 53 | # of the database (with the default configuration). It is generated by a call to 54 | # `python tune.py dbms postgres ...` and should not be overwritten. 55 | self.pristine_dbdata_snapshot_path = pristine_dbdata_snapshot_path 56 | # checkpoint_dbdata_snapshot_path is the .tgz snapshot that represents the current 57 | # state of the database as it is being tuned. It is generated while tuning and is 58 | # discarded once tuning is completed. 59 | self.checkpoint_dbdata_snapshot_path = ( 60 | dbgym_workspace.dbgym_tmp_path / "checkpoint_dbdata.tgz" 61 | ) 62 | # dbdata_parent_path is the parent directory of the dbdata that is *actively being tuned*. 63 | # It is *not* the parent directory of pristine_dbdata_snapshot_path. 64 | # Setting this lets us control the hardware device dbdata is built on (e.g. HDD vs. 
SSD). 65 | self.dbdata_parent_path = dbdata_parent_path 66 | # dbdata_path is the dbdata that is *actively being tuned* 67 | self.dbdata_path = self.dbdata_parent_path / f"dbdata{self.pgport}" 68 | 69 | self._conn: Optional[psycopg.Connection[Any]] = None 70 | self.hint_check_failed_with: Optional[str] = None 71 | 72 | def get_kv_connstr(self) -> str: 73 | return get_kv_connstr(self.pgport) 74 | 75 | def conn(self) -> psycopg.Connection[Any]: 76 | if self._conn is None: 77 | self._conn = psycopg.connect( 78 | self.get_kv_connstr(), autocommit=True, prepare_threshold=None 79 | ) 80 | 81 | def hint_check_notice_handler(notice: psycopg.errors.Diagnostic) -> None: 82 | """ 83 | Custom handler for raising errors if hints fail. 84 | """ 85 | if ( 86 | notice.message_detail is not None 87 | and "hint" in notice.message_detail.lower() 88 | ): 89 | self.hint_check_failed_with = notice.message_detail 90 | 91 | # We add the notice handler when the _conn is created instead of before executing a 92 | # query to avoid adding it more than once. 93 | self._conn.add_notice_handler(hint_check_notice_handler) 94 | 95 | return self._conn 96 | 97 | def disconnect(self) -> None: 98 | if self._conn is not None: 99 | self._conn.close() 100 | self._conn = None 101 | 102 | def move_log(self) -> None: 103 | pglog_path = self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log" 104 | pglog_this_step_path = ( 105 | self.dbgym_workspace.dbgym_this_run_path 106 | / f"pg{self.pgport}.log.{self.log_step}" 107 | ) 108 | if pglog_path.exists(): 109 | shutil.move(pglog_path, pglog_this_step_path) 110 | self.log_step += 1 111 | 112 | def force_statement_timeout(self, timeout: float) -> None: 113 | timeout_ms = timeout * 1000 114 | retry = True 115 | while retry: 116 | retry = False 117 | try: 118 | self.conn().execute(f"SET statement_timeout = {timeout_ms}") 119 | except QueryCanceled: 120 | retry = True 121 | 122 | def time_query( 123 | self, 124 | query: str, 125 | query_knobs: list[str] = [], 126 | add_explain: bool = False, 127 | timeout: float = 0, 128 | ) -> tuple[float, bool, Optional[dict[str, Any]]]: 129 | """ 130 | It returns the runtime in microseconds, whether the query timed out, and the explain data if add_explain is True. 131 | 132 | If the query timed out, it won't have any explain data and thus explain_data will be None. Its runtime will be 133 | the timeout value. 134 | 135 | Run a query with a timeout (in seconds). Following Postgres's convention, timeout=0 indicates "disable timeout". 136 | 137 | Use query_knobs to pass query knobs. An example input is query_knobs=["SET (enable_sort on)", "IndexOnlyScan(it)"]. 138 | 139 | If you write explain in the query manually instead of setting add_explain, it won't return explain_data. This 140 | is because it won't know the format of the explain data. 141 | """ 142 | if timeout > 0: 143 | self.force_statement_timeout(timeout) 144 | else: 145 | assert ( 146 | timeout == 0 147 | ), f'Setting timeout to 0 indicates "disable timeout". However, setting timeout ({timeout}) < 0 is a bug.' 148 | 149 | did_time_out = False 150 | explain_data = None 151 | 152 | try: 153 | if query_knobs: 154 | query = f"/*+ {' '.join(query_knobs)} */ {query}" 155 | 156 | if add_explain: 157 | assert ( 158 | "explain" not in query.lower() 159 | ), "If you're using add_explain, don't also write explain manually in the query." 
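                # (Illustration: with query_knobs=["HashJoin(lineitem orders)"] and add_explain=True, the SQL sent below becomes `explain (analyze, format json, timing off) /*+ HashJoin(lineitem orders) */ <query>`; that hint is borrowed from the integration tests and is only an example.)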
160 | query = f"explain (analyze, format json, timing off) {query}" 161 | 162 | # Reset this every time before calling execute() so that hint_check_notice_handler works correctly. 163 | self.hint_check_failed_with = None 164 | 165 | start_time = time.time() 166 | cursor = self.conn().execute(query) 167 | qid_runtime = (time.time() - start_time) * 1e6 168 | 169 | if self.hint_check_failed_with is not None: 170 | raise RuntimeError(f"Query hint failed: {self.hint_check_failed_with}") 171 | 172 | if add_explain: 173 | c = [c for c in cursor][0][0][0] 174 | assert "Execution Time" in c 175 | qid_runtime = float(c["Execution Time"]) * 1e3 176 | explain_data = c 177 | 178 | logging.debug(f"{query} evaluated in {qid_runtime/1e6}") 179 | 180 | except QueryCanceled: 181 | logging.debug(f"{query} exceeded evaluation timeout {timeout}") 182 | qid_runtime = timeout * 1e6 183 | did_time_out = True 184 | except Exception as e: 185 | raise e 186 | finally: 187 | # Wipe the statement timeout. 188 | self.force_statement_timeout(0) 189 | 190 | # qid_runtime is in microseconds. 191 | return qid_runtime, did_time_out, explain_data 192 | 193 | def shutdown_postgres(self) -> None: 194 | """Shuts down postgres.""" 195 | self.disconnect() 196 | if not Path(self.dbdata_path).exists(): 197 | return 198 | 199 | while True: 200 | logging.debug("Shutting down postgres...") 201 | _, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ 202 | "stop", "--wait", "-t", "180", "-D", self.dbdata_path 203 | ].run(retcode=None) 204 | time.sleep(1) 205 | logging.debug("Stop message: (%s, %s)", stdout, stderr) 206 | 207 | # Wait until pg_isready fails. 208 | retcode, _, _ = local[f"{self.pgbin_path}/pg_isready"][ 209 | "--host", 210 | "localhost", 211 | "--port", 212 | str(self.pgport), 213 | "--dbname", 214 | DBGYM_POSTGRES_DBNAME, 215 | ].run(retcode=None) 216 | 217 | exists = (Path(self.dbdata_path) / "postmaster.pid").exists() 218 | if not exists and retcode != 0: 219 | break 220 | 221 | def restart_postgres(self) -> bool: 222 | # TODO: check if we still get the shared preload libraries correctly if we do None 223 | return self.restart_with_changes(conf_changes=None) 224 | 225 | def restart_with_changes( 226 | self, 227 | conf_changes: Optional[dict[str, str]], 228 | dump_page_cache: bool = False, 229 | save_checkpoint: bool = False, 230 | ) -> bool: 231 | """ 232 | This function is called "(re)start" because it also shuts down Postgres before starting it. 233 | This function assumes that some snapshot has already been untarred into self.dbdata_path. 234 | You can do this by calling one of the wrappers around _restore_snapshot(). 235 | 236 | Note that multiple calls are not "additive". Calling this will restart from the latest saved 237 | snapshot. If you want it to be additive without the overhead of saving a snapshot, pass in 238 | multiple changes to `conf_changes`. 239 | """ 240 | # Install the new configuration changes. 241 | if conf_changes is not None: 242 | dbdata_auto_conf_path = self.dbdata_path / "postgresql.auto.conf" 243 | with open(dbdata_auto_conf_path, "w") as f: 244 | f.write( 245 | "\n".join([f"{knob} = {val}" for knob, val in conf_changes.items()]) 246 | + "\n" 247 | ) 248 | 249 | assert ( 250 | "shared_preload_libraries" not in conf_changes 251 | ), f"You should not set shared_preload_libraries manually." 252 | 253 | # Using single quotes around SHARED_PRELOAD_LIBRARIES works for both single or multiple libraries. 
254 | f.write(f"shared_preload_libraries = '{SHARED_PRELOAD_LIBRARIES}'") 255 | 256 | # Start postgres instance. 257 | self.shutdown_postgres() 258 | self.move_log() 259 | 260 | if save_checkpoint: 261 | local["tar"][ 262 | "cf", 263 | # We append .tmp so that if we fail in the *middle* of running tar, we 264 | # still have the previous checkpoint available to us 265 | f"{self.checkpoint_dbdata_snapshot_path}.tmp", 266 | "-C", 267 | parent_path_of_path(self.dbdata_path), 268 | self.dbdata_path, 269 | ].run() 270 | 271 | # Make sure the PID lock file doesn't exist. 272 | pid_lock = Path(f"{self.dbdata_path}/postmaster.pid") 273 | assert not pid_lock.exists() 274 | 275 | if dump_page_cache: 276 | # Dump the OS page cache. 277 | os.system('sudo sh -c "sync; echo 3 > /proc/sys/vm/drop_caches"') 278 | 279 | attempts = 0 280 | while not pid_lock.exists(): 281 | # Try starting up. 282 | retcode, stdout, stderr = local[f"{self.pgbin_path}/pg_ctl"][ 283 | "-D", 284 | self.dbdata_path, 285 | "--wait", 286 | "-t", 287 | "180", 288 | "-l", 289 | # We log to pg{self.pgport}.log instead of pg.log so that different PostgresConn objects 290 | # don't all try to write to the same file. 291 | self.dbgym_workspace.dbgym_this_run_path / f"pg{self.pgport}.log", 292 | "start", 293 | ].run(retcode=None) 294 | 295 | if retcode == 0 or pid_lock.exists(): 296 | break 297 | 298 | logging.warning("startup encountered: (%s, %s)", stdout, stderr) 299 | attempts += 1 300 | if attempts >= 5: 301 | logging.error( 302 | "Number of attempts to start postgres has exceeded limit." 303 | ) 304 | assert False, "Could not start postgres." 305 | 306 | # Wait until postgres is ready to accept connections. 307 | num_cycles = 0 308 | while True: 309 | if num_cycles >= CONNECT_TIMEOUT: 310 | # In this case, we've failed to start postgres. 311 | logging.error("Failed to start postgres before timeout...") 312 | return False 313 | 314 | retcode, _, _ = local[f"{self.pgbin_path}/pg_isready"][ 315 | "--host", 316 | "localhost", 317 | "--port", 318 | str(self.pgport), 319 | "--dbname", 320 | DBGYM_POSTGRES_DBNAME, 321 | ].run(retcode=None) 322 | if retcode == 0: 323 | break 324 | 325 | time.sleep(1) 326 | num_cycles += 1 327 | logging.debug("Waiting for postgres to bootup but it is not...") 328 | 329 | # Set up Boot if we're told to do so 330 | if self.boot_config_path is not None: 331 | with self.dbgym_workspace.open_and_save(self.boot_config_path) as f: 332 | boot_config = yaml.safe_load(f) 333 | 334 | self._set_up_boot( 335 | boot_config["intelligent_cache"], 336 | boot_config["early_stop"], 337 | boot_config["seq_sample"], 338 | boot_config["seq_sample_pct"], 339 | boot_config["seq_sample_seed"], 340 | boot_config["mu_hyp_opt"], 341 | boot_config["mu_hyp_time"], 342 | boot_config["mu_hyp_stdev"], 343 | ) 344 | 345 | # Move the temporary over since we now know the temporary can load. 346 | if save_checkpoint: 347 | shutil.move(f"{self.dbdata_path}.tgz.tmp", f"{self.dbdata_path}.tgz") 348 | 349 | return True 350 | 351 | def _set_up_boot( 352 | self, 353 | intelligent_cache: bool, 354 | early_stop: bool, 355 | seq_sample: bool, 356 | seq_sample_pct: int, 357 | seq_sample_seed: int, 358 | mu_hyp_opt: float, 359 | mu_hyp_time: int, 360 | mu_hyp_stdev: float, 361 | ) -> None: 362 | """ 363 | Sets up Boot on the currently running Postgres instances. 364 | Uses instance vars of PostgresConn for configuration. 365 | I chose to not encode any "default values" in this function. 
This is so that all values 366 | are explicitly included in the config file. This way, we can know what Boot config 367 | was used in a given experiment by looking only at the config file. If we did encode 368 | "default values" in the function, we would need to know the state of the code at the 369 | time of the experiment, which is very difficult in the general case. 370 | """ 371 | # If any of these commands fail, they'll throw a Python exception 372 | # Thus, if none of them throw an exception, we know they passed 373 | logging.debug("Setting up boot") 374 | self.conn().execute("DROP EXTENSION IF EXISTS boot") 375 | self.conn().execute("CREATE EXTENSION IF NOT EXISTS boot") 376 | self.conn().execute("SELECT boot_connect()") 377 | self.conn().execute("SELECT boot_cache_clear()") 378 | self.conn().execute("SET boot.enable=true") 379 | self.conn().execute("SET boot.intercept_explain_analyze=true") 380 | self.conn().execute(f"SET boot.intelligent_cache={intelligent_cache}") 381 | self.conn().execute(f"SET boot.early_stop={early_stop}") 382 | self.conn().execute(f"SET boot.seq_sample={seq_sample}") 383 | self.conn().execute(f"SET boot.seq_sample_pct={seq_sample_pct}") 384 | self.conn().execute(f"SET boot.seq_sample_seed={seq_sample_seed}") 385 | self.conn().execute(f"SET boot.mu_hyp_opt={mu_hyp_opt}") 386 | self.conn().execute(f"SET boot.mu_hyp_time={mu_hyp_time}") 387 | self.conn().execute(f"SET boot.mu_hyp_stdev={mu_hyp_stdev}") 388 | logging.debug("Set up boot") 389 | 390 | def psql(self, sql: str) -> tuple[int, Optional[str]]: 391 | """ 392 | Execute a SQL command (equivalent to psql -C "[cmd]") and return a status code and its stderr. 393 | 394 | This is meant for commands that modify the database, not those that get information from the database, which 395 | is why it doesn't return a Cursor with the result. I designed it this way because it's difficult to provide 396 | a general-purpose API which returns results for arbitrary SQL queries as those results could be very large. 397 | 398 | A return code of 0 means success while a non-zero return code means failure. The stderr will be None if success 399 | and a string if failure. 400 | """ 401 | 402 | def cancel_fn(conn_str: str) -> None: 403 | with psycopg.connect( 404 | conn_str, autocommit=True, prepare_threshold=None 405 | ) as tconn: 406 | r = [ 407 | r 408 | for r in tconn.execute( 409 | "SELECT pid FROM pg_stat_progress_create_index" 410 | ) 411 | ] 412 | 413 | for row in r: 414 | logging.info(f"Killing process {row[0]}") 415 | try: 416 | psutil.Process(row[0]).kill() 417 | except: 418 | pass 419 | 420 | # Get a fresh connection. 421 | self.disconnect() 422 | conn = self.conn() 423 | conn.execute("SET maintenance_work_mem = '4GB'") 424 | # TODO(wz2): Make this a configuration/runtime option for action timeout. 425 | conn.execute("SET statement_timeout = 300000") 426 | 427 | try: 428 | timer = threading.Timer(300.0, cancel_fn, args=(self.get_kv_connstr(),)) 429 | timer.start() 430 | 431 | conn.execute(sql) 432 | timer.cancel() 433 | except ProgramLimitExceeded as e: 434 | timer.cancel() 435 | self.disconnect() 436 | logging.debug(f"Action error: {e}") 437 | return -1, str(e) 438 | except QueryCanceled as e: 439 | timer.cancel() 440 | self.disconnect() 441 | logging.debug(f"Action error: {e}") 442 | return -1, f"canceling statement: {sql}." 443 | except psycopg.OperationalError as e: 444 | timer.cancel() 445 | self.disconnect() 446 | logging.debug(f"Action error: {e}") 447 | return -1, f"operational error: {sql}." 
448 | except psycopg.errors.UndefinedTable: 449 | timer.cancel() 450 | raise 451 | 452 | self.disconnect() 453 | return 0, None 454 | 455 | def get_system_knobs(self) -> dict[str, str]: 456 | """ 457 | System knobs are those applied across the entire system. They do not include table-specific 458 | knobs, query-specific knobs (aka query hints), or indexes. 459 | """ 460 | conn = self.conn() 461 | result = conn.execute("SHOW ALL").fetchall() 462 | knobs = {} 463 | for row in result: 464 | knobs[row[0]] = row[1] 465 | return knobs 466 | 467 | def restore_pristine_snapshot(self) -> bool: 468 | return self._restore_snapshot(self.pristine_dbdata_snapshot_path) 469 | 470 | def restore_checkpointed_snapshot(self) -> bool: 471 | return self._restore_snapshot(self.checkpoint_dbdata_snapshot_path) 472 | 473 | def _restore_snapshot( 474 | self, 475 | dbdata_snapshot_path: Path, 476 | ) -> bool: 477 | self.shutdown_postgres() 478 | 479 | local["rm"]["-rf", self.dbdata_path].run() 480 | local["mkdir"]["-m", "0700", "-p", self.dbdata_path].run() 481 | 482 | # Strip the "dbdata" so we can implant directly into the target dbdata_path. 483 | assert dbdata_snapshot_path.exists() 484 | local["tar"][ 485 | "xf", 486 | dbdata_snapshot_path, 487 | "-C", 488 | self.dbdata_path, 489 | "--strip-components", 490 | "1", 491 | ].run() 492 | # Imprint the required port. 493 | ( 494 | (local["echo"][f"port={self.pgport}"]) 495 | >> f"{self.dbdata_path}/postgresql.conf" 496 | )() 497 | 498 | return self.restart_postgres() 499 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/gymlib_package/gymlib/py.typed -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/gymlib_package/gymlib/tests/__init__.py -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # DO NOT RUN THIS SCRIPT DIRECTLY. 4 | # This script only runs correctly when run by GymlibIntegtestManager.set_up_workspace() as it sets the necessary envvars. 5 | # By allowing GymlibIntegtestManager.set_up_workspace() to set the envvars, we ensure that the envvars are only defined 6 | # in a single location (inside GymlibIntegtestManager). 7 | 8 | # Gymlib integration tests relies on Postgres being built and workloads/dbdata being generated. 9 | # Generating these things is not considered a part of the test which is why it's in its own shell script. 10 | # The reason there's a shell script generating them instead of them just being in the repo is because (a) 11 | # the Postgres repo is very large and (b) the built binary will be different for different machines. 12 | # This script should be run from the base dbgym/ directory. 13 | 14 | set -euxo pipefail 15 | 16 | # INTENDED_DBDATA_HARDWARE can be set elsewhere (e.g. by tests_ci.yaml) but we use hdd by default. 
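# The ${VAR:-hdd} expansion below keeps whatever value the caller exported and only falls back to hdd otherwise.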
17 | INTENDED_DBDATA_HARDWARE="${INTENDED_DBDATA_HARDWARE:-hdd}" 18 | 19 | python3 task.py benchmark $BENCHMARK tables $SCALE_FACTOR 20 | python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR 21 | 22 | python3 task.py dbms postgres build 23 | python3 task.py dbms postgres dbdata $BENCHMARK --scale-factor $SCALE_FACTOR --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE 24 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/filesystem_unittest_util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from pathlib import Path 4 | from typing import Any, NewType, cast 5 | 6 | from gymlib.workspace import RUNS_DNAME, SYMLINKS_DNAME, TMP_DNAME 7 | 8 | FilesystemStructure = NewType("FilesystemStructure", dict[str, Any]) 9 | 10 | 11 | def create_structure(root_path: Path, structure: FilesystemStructure) -> None: 12 | """ 13 | Create files and directories according to the structure. 14 | """ 15 | 16 | def create_structure_internal( 17 | root_path: Path, cur_path: Path, structure: FilesystemStructure 18 | ) -> None: 19 | for path, content in structure.items(): 20 | full_path: Path = cur_path / path 21 | 22 | if isinstance(content, dict): # Directory 23 | full_path.mkdir(parents=True, exist_ok=True) 24 | create_structure_internal( 25 | root_path, 26 | full_path, 27 | FilesystemStructure(cast(dict[str, Any], content)), 28 | ) 29 | elif isinstance(content, tuple) and content[0] == "file": 30 | full_path.parent.mkdir(parents=True, exist_ok=True) 31 | if len(content) == 2: 32 | full_path.write_text(content[1]) 33 | else: 34 | assert len(content) == 1 35 | full_path.touch() 36 | elif isinstance(content, tuple) and content[0] == "symlink": 37 | assert len(content) == 2 38 | target_path = root_path / content[1] 39 | os.symlink(target_path, full_path) 40 | else: 41 | raise ValueError(f"Unsupported type for path ({path}): {content}") 42 | 43 | root_path.mkdir(parents=True, exist_ok=True) 44 | create_structure_internal(root_path, root_path, structure) 45 | 46 | 47 | def verify_structure(root_path: Path, structure: FilesystemStructure) -> bool: 48 | """ 49 | Verify that the files and directories match the expected structure. 
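    An illustrative structure, in the same shape exercised by unittest_filesystem_unittest_util.py below:

        FilesystemStructure({
            "dir1": {"file1.txt": ("file",)},
            "link_to_dir1": ("symlink", "dir1"),
        })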
50 | """ 51 | 52 | def verify_structure_internal( 53 | root_path: Path, cur_path: Path, structure: FilesystemStructure 54 | ) -> bool: 55 | # Check for the presence of each item specified in the structure 56 | for name, item in structure.items(): 57 | new_cur_path = cur_path / name 58 | if not path_exists_dont_follow_symlinks(new_cur_path): 59 | logging.debug(f"{new_cur_path} does not exist") 60 | return False 61 | elif isinstance(item, dict): 62 | if not new_cur_path.is_dir(): 63 | logging.debug(f"expected {new_cur_path} to be a directory") 64 | return False 65 | if not verify_structure_internal( 66 | root_path, 67 | new_cur_path, 68 | FilesystemStructure(cast(dict[str, Any], item)), 69 | ): 70 | return False 71 | elif isinstance(item, tuple) and item[0] == "file": 72 | if not new_cur_path.is_file(): 73 | logging.debug(f"expected {new_cur_path} to be a regular file") 74 | return False 75 | elif isinstance(item, tuple) and item[0] == "symlink": 76 | if not new_cur_path.is_symlink(): 77 | logging.debug(f"expected {new_cur_path} to be a symlink") 78 | return False 79 | # If item[1] is None, this indicates that we expect the symlink to be broken 80 | if item[1] != None: 81 | expected_target = root_path / item[1] 82 | if not new_cur_path.resolve().samefile(expected_target): 83 | logging.debug( 84 | f"expected {new_cur_path} to link to {expected_target}, but it links to {new_cur_path.resolve()}" 85 | ) 86 | return False 87 | else: 88 | assert False, "structure misconfigured" 89 | 90 | # Check for any extra files or directories not described by the structure 91 | expected_names = set(structure.keys()) 92 | actual_names = {entry.name for entry in cur_path.iterdir()} 93 | if not expected_names.issuperset(actual_names): 94 | logging.debug( 95 | f"expected_names={expected_names}, actual_names={actual_names}" 96 | ) 97 | return False 98 | 99 | return True 100 | 101 | if not root_path.exists(): 102 | logging.debug(f"{root_path} does not exist") 103 | return False 104 | return verify_structure_internal(root_path, root_path, structure) 105 | 106 | 107 | def make_workspace_structure( 108 | symlinks_structure: FilesystemStructure, 109 | task_runs_structure: FilesystemStructure, 110 | ) -> FilesystemStructure: 111 | """ 112 | This function exists so that it's easier to refactor the tests in case we ever change 113 | how the workspace is organized. 114 | """ 115 | return FilesystemStructure( 116 | { 117 | "dbgym_workspace": { 118 | SYMLINKS_DNAME: symlinks_structure, 119 | RUNS_DNAME: task_runs_structure, 120 | TMP_DNAME: {}, 121 | } 122 | } 123 | ) 124 | 125 | 126 | def path_exists_dont_follow_symlinks(path: Path) -> bool: 127 | """ 128 | As of writing this comment, ray is currently constraining us to python <3.12. However, the "follow_symlinks" option in 129 | Path.exists() only comes up in python 3.12. Thus, this is the only way to check if a path exists without following symlinks. 
130 | """ 131 | # If the path exists and is a symlink, os.path.islink() will be true (even if the symlink is broken) 132 | if os.path.islink(path): 133 | return True 134 | # Otherwise, we know it's either non-existent or not a symlink, so path.exists() works fine 135 | else: 136 | return path.exists() 137 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_gymlib_integtest_workspace/ 2 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/gymlib_integtest_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | # TODO: remove infra_paths from the import 7 | from gymlib.infra_paths import ( 8 | get_dbdata_tgz_symlink_path, 9 | get_pgbin_symlink_path, 10 | get_workload_suffix, 11 | get_workload_symlink_path, 12 | ) 13 | from gymlib.tuning_artifacts import TuningMetadata 14 | from gymlib.workspace import ( 15 | fully_resolve_path, 16 | get_tmp_path_from_workspace_path, 17 | get_workspace_path_from_config, 18 | ) 19 | 20 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED 21 | 22 | 23 | class GymlibIntegtestManager: 24 | """ 25 | This is essentially a singleton class. This avoids multiple integtest_*.py files creating 26 | the workspace and/or the DBGymWorkspace object redundantly. 27 | 28 | The reason I put all these static methods in a class instead of directly in the module is 29 | that the functions have very generic names (e.g. set_up_workspace()) but having them 30 | inside a class makes it clear that they are related to the gymlib integration tests. 31 | """ 32 | 33 | # These constants are also used by _set_up_gymlib_integtest_workspace.sh. 34 | BENCHMARK = "tpch" 35 | SCALE_FACTOR = 0.01 36 | DBGYM_CONFIG_PATH = Path( 37 | "gymlib_package/gymlib/tests/gymlib_integtest_dbgym_config.yaml" 38 | ) 39 | WORKSPACE_PATH: Optional[Path] = None 40 | 41 | @staticmethod 42 | def set_up_workspace() -> None: 43 | """ 44 | Set up the workspace if it has not already been set up. 45 | None of the integtest_*.py files will delete the workspace so that future tests run faster. 46 | """ 47 | GymlibIntegtestManager.WORKSPACE_PATH = get_workspace_path_from_config( 48 | GymlibIntegtestManager.DBGYM_CONFIG_PATH 49 | ) 50 | 51 | # This if statement prevents us from setting up the workspace twice, which saves time. 52 | if not GymlibIntegtestManager.WORKSPACE_PATH.exists(): 53 | subprocess.run( 54 | ["./gymlib_package/gymlib/tests/_set_up_gymlib_integtest_workspace.sh"], 55 | env={ 56 | "BENCHMARK": GymlibIntegtestManager.BENCHMARK, 57 | "SCALE_FACTOR": str(GymlibIntegtestManager.SCALE_FACTOR), 58 | # By setting this envvar, we ensure that when running _set_up_gymlib_integtest_workspace.sh, 59 | # make_standard_dbgym_workspace() will use the correct DBGYM_CONFIG_PATH. 
60 | "DBGYM_CONFIG_PATH": str(GymlibIntegtestManager.DBGYM_CONFIG_PATH), 61 | **os.environ, 62 | }, 63 | check=True, 64 | ) 65 | 66 | @staticmethod 67 | def get_workspace_path() -> Path: 68 | assert GymlibIntegtestManager.WORKSPACE_PATH is not None 69 | return GymlibIntegtestManager.WORKSPACE_PATH 70 | 71 | @staticmethod 72 | def get_default_metadata() -> TuningMetadata: 73 | assert GymlibIntegtestManager.BENCHMARK == "tpch" 74 | suffix = get_workload_suffix( 75 | GymlibIntegtestManager.BENCHMARK, 76 | seed_start=DEFAULT_TPCH_SEED, 77 | seed_end=DEFAULT_TPCH_SEED, 78 | query_subset="all", 79 | ) 80 | return TuningMetadata( 81 | workload_path=fully_resolve_path( 82 | get_workload_symlink_path( 83 | GymlibIntegtestManager.get_workspace_path(), 84 | GymlibIntegtestManager.BENCHMARK, 85 | GymlibIntegtestManager.SCALE_FACTOR, 86 | suffix, 87 | ), 88 | ), 89 | pristine_dbdata_snapshot_path=fully_resolve_path( 90 | get_dbdata_tgz_symlink_path( 91 | GymlibIntegtestManager.get_workspace_path(), 92 | GymlibIntegtestManager.BENCHMARK, 93 | GymlibIntegtestManager.SCALE_FACTOR, 94 | ), 95 | ), 96 | dbdata_parent_path=fully_resolve_path( 97 | get_tmp_path_from_workspace_path( 98 | GymlibIntegtestManager.get_workspace_path() 99 | ), 100 | ), 101 | pgbin_path=fully_resolve_path( 102 | get_pgbin_symlink_path(GymlibIntegtestManager.get_workspace_path()), 103 | ), 104 | ) 105 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/integtest_pg_conn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import unittest 3 | 4 | import psycopg 5 | from gymlib.pg import ( 6 | DEFAULT_POSTGRES_PORT, 7 | get_is_postgres_running, 8 | get_running_postgres_ports, 9 | ) 10 | from gymlib.pg_conn import PostgresConn 11 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 12 | from gymlib.workspace import DBGymWorkspace 13 | 14 | 15 | class PostgresConnTests(unittest.TestCase): 16 | workspace: DBGymWorkspace 17 | 18 | @staticmethod 19 | def setUpClass() -> None: 20 | GymlibIntegtestManager.set_up_workspace() 21 | # Reset _num_times_created_this_run since previous tests may have created a workspace. 22 | DBGymWorkspace._num_times_created_this_run = 0 23 | PostgresConnTests.workspace = DBGymWorkspace( 24 | GymlibIntegtestManager.get_workspace_path() 25 | ) 26 | 27 | def setUp(self) -> None: 28 | self.assertFalse( 29 | get_is_postgres_running(), 30 | "Make sure Postgres isn't running before starting the integration test. `pkill postgres` is one way " 31 | + "to ensure this. Be careful about accidentally taking down other people's Postgres instances though.", 32 | ) 33 | self.metadata = GymlibIntegtestManager.get_default_metadata() 34 | 35 | # The reason we restart Postgres every time is to ensure a "clean" starting point 36 | # so that all tests are independent of each other. 
37 | self.pg_conn = self.create_pg_conn() 38 | self.pg_conn.restore_pristine_snapshot() 39 | self.pg_conn.restart_postgres() 40 | self.assertTrue(get_is_postgres_running()) 41 | 42 | def tearDown(self) -> None: 43 | self.pg_conn.shutdown_postgres() 44 | self.assertFalse(get_is_postgres_running()) 45 | 46 | def create_pg_conn(self, pgport: int = DEFAULT_POSTGRES_PORT) -> PostgresConn: 47 | return PostgresConn( 48 | PostgresConnTests.workspace, 49 | pgport, 50 | self.metadata.pristine_dbdata_snapshot_path, 51 | self.metadata.dbdata_parent_path, 52 | self.metadata.pgbin_path, 53 | None, 54 | ) 55 | 56 | def test_start_on_multiple_ports(self) -> None: 57 | # The setUp() function should have started Postgres on DEFAULT_POSTGRES_PORT. 58 | self.assertEqual(set(get_running_postgres_ports()), {DEFAULT_POSTGRES_PORT}) 59 | 60 | # Now, we start Postgres on a new port. 61 | pg_conn1 = self.create_pg_conn(DEFAULT_POSTGRES_PORT + 1) 62 | pg_conn1.restore_pristine_snapshot() 63 | pg_conn1.restart_postgres() 64 | self.assertEqual( 65 | set(get_running_postgres_ports()), 66 | {DEFAULT_POSTGRES_PORT, DEFAULT_POSTGRES_PORT + 1}, 67 | ) 68 | 69 | # Clean up 70 | pg_conn1.shutdown_postgres() 71 | 72 | def test_connect_and_disconnect(self) -> None: 73 | self.assertIsNone(self.pg_conn._conn) 74 | conn = self.pg_conn.conn() 75 | self.assertIsNotNone(conn) 76 | self.assertIs( 77 | conn, self.pg_conn._conn 78 | ) # The conn should be cached so these objects should be the same 79 | self.assertIs(conn, self.pg_conn.conn()) # Same thing here 80 | self.pg_conn.disconnect() 81 | self.assertIsNone(self.pg_conn._conn) 82 | 83 | def test_start_with_changes(self) -> None: 84 | initial_sysknobs = self.pg_conn.get_system_knobs() 85 | 86 | # First call 87 | self.assertEqual(initial_sysknobs["wal_buffers"], "4MB") 88 | self.pg_conn.restart_with_changes({"wal_buffers": "8MB"}) 89 | new_sysknobs = self.pg_conn.get_system_knobs() 90 | self.assertEqual(new_sysknobs["wal_buffers"], "8MB") 91 | 92 | # Second call 93 | self.assertEqual(initial_sysknobs["enable_nestloop"], "on") 94 | self.pg_conn.restart_with_changes({"enable_nestloop": "off"}) 95 | new_sysknobs = self.pg_conn.get_system_knobs() 96 | self.assertEqual(new_sysknobs["enable_nestloop"], "off") 97 | # The changes should not be additive. The "wal_buffers" should have "reset" to 4MB. 98 | self.assertEqual(new_sysknobs["wal_buffers"], "4MB") 99 | 100 | def test_start_with_changes_doesnt_modify_input(self) -> None: 101 | conf_changes = {"wal_buffers": "8MB"} 102 | orig_conf_changes = copy.deepcopy(conf_changes) 103 | self.pg_conn.restart_with_changes(conf_changes) 104 | self.assertEqual(conf_changes, orig_conf_changes) 105 | 106 | def test_time_query(self) -> None: 107 | runtime, did_time_out, explain_data = self.pg_conn.time_query( 108 | "select pg_sleep(1)" 109 | ) 110 | # The runtime should be about 1 second. 111 | self.assertTrue(abs(runtime - 1_000_000) < 100_000) 112 | self.assertFalse(did_time_out) 113 | self.assertIsNone(explain_data) 114 | 115 | def test_time_query_with_explain(self) -> None: 116 | _, _, explain_data = self.pg_conn.time_query("select 1", add_explain=True) 117 | self.assertIsNotNone(explain_data) 118 | 119 | def test_time_query_with_timeout(self) -> None: 120 | runtime, did_time_out, _ = self.pg_conn.time_query( 121 | "select pg_sleep(3)", timeout=2 122 | ) 123 | # The runtime should be about what the timeout is. 
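        # (time_query() reports runtimes in microseconds, so 2_000_000 below corresponds to the 2-second timeout.)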
124 | self.assertTrue(abs(runtime - 2_000_000) < 100_000) 125 | self.assertTrue(did_time_out) 126 | 127 | def test_time_query_with_valid_table(self) -> None: 128 | # This just ensures that it doesn't raise any errors. 129 | self.pg_conn.time_query("select * from lineitem limit 10") 130 | 131 | def test_time_query_with_invalid_table(self) -> None: 132 | with self.assertRaises(psycopg.errors.UndefinedTable): 133 | self.pg_conn.time_query("select * from itemline limit 10") 134 | 135 | def test_time_query_with_valid_hints(self) -> None: 136 | join_query = """SELECT * 137 | FROM orders 138 | JOIN lineitem ON o_orderkey = l_orderkey 139 | WHERE o_orderdate BETWEEN '1995-01-01' AND '1995-12-31' 140 | LIMIT 10""" 141 | join_types = [ 142 | ("MergeJoin", "Merge Join"), 143 | ("HashJoin", "Hash Join"), 144 | ("NestLoop", "Nested Loop"), 145 | ] 146 | 147 | for hint_join_type, expected_join_type in join_types: 148 | _, _, explain_data = self.pg_conn.time_query( 149 | join_query, 150 | query_knobs=[f"{hint_join_type}(lineitem orders)"], 151 | add_explain=True, 152 | ) 153 | assert explain_data is not None # This assertion is for mypy. 154 | actual_join_type = explain_data["Plan"]["Plans"][0]["Node Type"] 155 | self.assertEqual(expected_join_type, actual_join_type) 156 | 157 | def test_time_query_with_invalid_hint(self) -> None: 158 | with self.assertRaises(RuntimeError) as context: 159 | self.pg_conn.time_query("select 1", query_knobs=["dbgym"]) 160 | self.assertTrue( 161 | 'Unrecognized hint keyword "dbgym"' in str(context.exception) 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | unittest.main() 167 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/integtest_tuning_artifacts.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 4 | from gymlib.tuning_artifacts import ( 5 | DBMSConfigDelta, 6 | IndexesDelta, 7 | QueryKnobsDelta, 8 | SysKnobsDelta, 9 | TuningArtifactsReader, 10 | TuningArtifactsWriter, 11 | ) 12 | from gymlib.workspace import DBGymWorkspace 13 | 14 | 15 | class PostgresConnTests(unittest.TestCase): 16 | @staticmethod 17 | def setUpClass() -> None: 18 | GymlibIntegtestManager.set_up_workspace() 19 | 20 | def setUp(self) -> None: 21 | # We re-create a workspace for each test because each test will create its own TuningArtifactsWriter. 
22 | DBGymWorkspace._num_times_created_this_run = 0 23 | self.workspace = DBGymWorkspace(GymlibIntegtestManager.get_workspace_path()) 24 | 25 | @staticmethod 26 | def make_config(letter: str) -> DBMSConfigDelta: 27 | return DBMSConfigDelta( 28 | indexes=IndexesDelta([letter]), 29 | sysknobs=SysKnobsDelta({letter: letter}), 30 | qknobs=QueryKnobsDelta({letter: [letter]}), 31 | ) 32 | 33 | def test_get_delta_at_step(self) -> None: 34 | writer = TuningArtifactsWriter( 35 | self.workspace, 36 | GymlibIntegtestManager.get_default_metadata(), 37 | ) 38 | 39 | writer.write_step(PostgresConnTests.make_config("a")) 40 | writer.write_step(PostgresConnTests.make_config("b")) 41 | writer.write_step(PostgresConnTests.make_config("c")) 42 | 43 | reader = TuningArtifactsReader(writer.tuning_artifacts_path) 44 | 45 | self.assertEqual( 46 | reader.get_delta_at_step(1), PostgresConnTests.make_config("b") 47 | ) 48 | self.assertEqual( 49 | reader.get_delta_at_step(0), PostgresConnTests.make_config("a") 50 | ) 51 | self.assertEqual( 52 | reader.get_delta_at_step(1), PostgresConnTests.make_config("b") 53 | ) 54 | self.assertEqual( 55 | reader.get_delta_at_step(2), PostgresConnTests.make_config("c") 56 | ) 57 | 58 | def test_get_all_deltas_in_order(self) -> None: 59 | writer = TuningArtifactsWriter( 60 | self.workspace, 61 | GymlibIntegtestManager.get_default_metadata(), 62 | ) 63 | 64 | writer.write_step(PostgresConnTests.make_config("a")) 65 | writer.write_step(PostgresConnTests.make_config("b")) 66 | writer.write_step(PostgresConnTests.make_config("c")) 67 | 68 | reader = TuningArtifactsReader(writer.tuning_artifacts_path) 69 | 70 | self.assertEqual( 71 | reader.get_all_deltas_in_order(), 72 | [ 73 | PostgresConnTests.make_config("a"), 74 | PostgresConnTests.make_config("b"), 75 | PostgresConnTests.make_config("c"), 76 | ], 77 | ) 78 | 79 | def test_get_metadata(self) -> None: 80 | writer = TuningArtifactsWriter( 81 | self.workspace, 82 | GymlibIntegtestManager.get_default_metadata(), 83 | ) 84 | reader = TuningArtifactsReader(writer.tuning_artifacts_path) 85 | metadata = reader.get_metadata() 86 | expected_metadata = GymlibIntegtestManager.get_default_metadata() 87 | self.assertEqual(metadata, expected_metadata) 88 | 89 | 90 | if __name__ == "__main__": 91 | unittest.main() 92 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/integtest_workload.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 4 | from gymlib.workload import Workload 5 | from gymlib.workspace import DBGymWorkspace 6 | 7 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED, NUM_TPCH_QUERIES 8 | 9 | 10 | class WorkloadTests(unittest.TestCase): 11 | workspace: DBGymWorkspace 12 | 13 | @staticmethod 14 | def setUpClass() -> None: 15 | GymlibIntegtestManager.set_up_workspace() 16 | # Reset _num_times_created_this_run since previous tests may have created a workspace. 17 | DBGymWorkspace._num_times_created_this_run = 0 18 | WorkloadTests.workspace = DBGymWorkspace( 19 | GymlibIntegtestManager.get_workspace_path() 20 | ) 21 | 22 | def test_workload(self) -> None: 23 | workload_path = GymlibIntegtestManager.get_default_metadata().workload_path 24 | workload = Workload(WorkloadTests.workspace, workload_path) 25 | 26 | # Check the order of query IDs. 
27 | self.assertEqual( 28 | workload.get_query_order(), 29 | [f"S{DEFAULT_TPCH_SEED}-Q{i}" for i in range(1, NUM_TPCH_QUERIES + 1)], 30 | ) 31 | 32 | # Sanity check all queries. 33 | for query in workload.get_queries_in_order(): 34 | self.assertTrue("select" in query.lower()) 35 | 36 | 37 | if __name__ == "__main__": 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/unittest_filesystem_unittest_util.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | from gymlib.tests.filesystem_unittest_util import ( 7 | FilesystemStructure, 8 | create_structure, 9 | verify_structure, 10 | ) 11 | 12 | 13 | class FilesystemUnittestUtilTests(unittest.TestCase): 14 | scratchspace_path: Path = Path() 15 | 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | cls.scratchspace_path = ( 19 | Path.cwd() / "util/tests/test_filesystem_unittest_util_scratchspace/" 20 | ) 21 | 22 | def setUp(self) -> None: 23 | if self.scratchspace_path.exists(): 24 | shutil.rmtree(self.scratchspace_path) 25 | 26 | def tearDown(self) -> None: 27 | if self.scratchspace_path.exists(): 28 | shutil.rmtree(self.scratchspace_path) 29 | 30 | def test_filesystem_unittest_util(self) -> None: 31 | structure = FilesystemStructure( 32 | { 33 | "dir1": {"file1.txt": ("file",), "dir2": {"file2.txt": ("file",)}}, 34 | "dir3": {"nested_link_to_dir1": ("symlink", "dir1")}, 35 | "link_to_dir1": ("symlink", "dir1"), 36 | "link_to_file2": ("symlink", "dir1/dir2/file2.txt"), 37 | } 38 | ) 39 | create_structure(self.scratchspace_path, structure) 40 | self.assertTrue(verify_structure(self.scratchspace_path, structure)) 41 | 42 | extra_dir_structure = copy.deepcopy(structure) 43 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 44 | self.assertTrue(verify_structure(self.scratchspace_path, extra_dir_structure)) 45 | extra_dir_structure["dir4"] = {} 46 | self.assertFalse(verify_structure(self.scratchspace_path, extra_dir_structure)) 47 | 48 | missing_dir_structure = copy.deepcopy(structure) 49 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 50 | self.assertTrue(verify_structure(self.scratchspace_path, missing_dir_structure)) 51 | del missing_dir_structure["dir1"] 52 | self.assertFalse( 53 | verify_structure(self.scratchspace_path, missing_dir_structure) 54 | ) 55 | 56 | extra_file_structure = copy.deepcopy(structure) 57 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 58 | self.assertTrue(verify_structure(self.scratchspace_path, extra_file_structure)) 59 | extra_file_structure["file3.txt"] = ("file",) 60 | self.assertFalse(verify_structure(self.scratchspace_path, extra_file_structure)) 61 | 62 | missing_file_structure = copy.deepcopy(structure) 63 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 64 | self.assertTrue( 65 | verify_structure(self.scratchspace_path, missing_file_structure) 66 | ) 67 | del missing_file_structure["dir1"]["file1.txt"] 68 | self.assertFalse( 69 | verify_structure(self.scratchspace_path, missing_file_structure) 70 | ) 71 | 72 | extra_link_structure = copy.deepcopy(structure) 73 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 74 | 
self.assertTrue(verify_structure(self.scratchspace_path, extra_link_structure)) 75 | extra_link_structure["link_to_dir3"] = ("symlink", "dir3") 76 | self.assertFalse(verify_structure(self.scratchspace_path, extra_link_structure)) 77 | 78 | missing_link_structure = copy.deepcopy(structure) 79 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 80 | self.assertTrue( 81 | verify_structure(self.scratchspace_path, missing_link_structure) 82 | ) 83 | del missing_link_structure["link_to_dir1"] 84 | self.assertFalse( 85 | verify_structure(self.scratchspace_path, missing_link_structure) 86 | ) 87 | 88 | wrong_link_structure = copy.deepcopy(structure) 89 | # The "assertTrue, modify, assertFalse" patterns makes sure it was the modification that broke it 90 | self.assertTrue(verify_structure(self.scratchspace_path, wrong_link_structure)) 91 | wrong_link_structure["link_to_dir1"] = ("symlink", "dir3") 92 | self.assertFalse(verify_structure(self.scratchspace_path, wrong_link_structure)) 93 | 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tests/unittest_workspace.py: -------------------------------------------------------------------------------- 1 | # TODO: figure out where to put the filesystem structure helpers. I think I want to put them inside gymlib and make a separate folder just testing the helpers. 2 | 3 | import os 4 | import shutil 5 | import unittest 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | from gymlib.tests.filesystem_unittest_util import ( 10 | FilesystemStructure, 11 | create_structure, 12 | make_workspace_structure, 13 | verify_structure, 14 | ) 15 | from gymlib.workspace import ( 16 | DBGYM_APP_NAME, 17 | RUNS_DNAME, 18 | SYMLINKS_DNAME, 19 | DBGymWorkspace, 20 | name_to_linkname, 21 | ) 22 | 23 | from gymlib_package.gymlib.workspace import LATEST_RUN_FNAME 24 | 25 | 26 | class WorkspaceTests(unittest.TestCase): 27 | scratchspace_path: Path = Path() 28 | workspace_path: Path = Path() 29 | 30 | @classmethod 31 | def setUpClass(cls) -> None: 32 | cls.scratchspace_path = Path.cwd() / "util/tests/test_workspace_scratchspace/" 33 | cls.workspace_path = cls.scratchspace_path / "dbgym_workspace" 34 | 35 | def setUp(self) -> None: 36 | if self.scratchspace_path.exists(): 37 | shutil.rmtree(self.scratchspace_path) 38 | 39 | self.workspace: Optional[DBGymWorkspace] = None 40 | self.expected_structure: Optional[FilesystemStructure] = None 41 | 42 | def tearDown(self) -> None: 43 | # You can comment this out if you want to inspect the scratchspace after a test (often used for debugging). 44 | if self.scratchspace_path.exists(): 45 | shutil.rmtree(self.scratchspace_path) 46 | 47 | # All these helper functions will perform an action, update the expected structure, and then verify the structure. 48 | # Importantly though, I don't have helper functions for the complex functions that I want to test (e.g. link_result and save_file). 49 | def init_workspace_helper(self) -> None: 50 | # Reset this to avoid the error of it being created twice. 51 | # In real usage, the second run would be a different Python process so DBGymWorkspace._num_times_created_this_run would be 0. 
52 | DBGymWorkspace._num_times_created_this_run = 0 53 | self.workspace = DBGymWorkspace(self.workspace_path) 54 | 55 | if self.expected_structure is None: 56 | self.expected_structure = make_workspace_structure( 57 | FilesystemStructure({}), 58 | FilesystemStructure( 59 | { 60 | "latest_run.link": ( 61 | "symlink", 62 | f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}", 63 | ), 64 | self.workspace.dbgym_this_run_path.name: {}, 65 | } 66 | ), 67 | ) 68 | else: 69 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 70 | self.workspace.dbgym_this_run_path.name 71 | ] = {} 72 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 73 | name_to_linkname(LATEST_RUN_FNAME) 74 | ] = ( 75 | "symlink", 76 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}", 77 | ) 78 | 79 | self.assertTrue( 80 | verify_structure(self.scratchspace_path, self.expected_structure) 81 | ) 82 | 83 | def make_file_helper( 84 | self, relative_path: str, file_obj: tuple[str, ...] = ("file",) 85 | ) -> Path: 86 | """ 87 | You can override file_obj to make it a symlink instead. 88 | """ 89 | assert self.workspace is not None and self.expected_structure is not None 90 | assert ( 91 | ".." not in relative_path 92 | ), 'relative_path should not contain ".." (it should be inside the scratchspace dir)' 93 | file_path = self.scratchspace_path / relative_path 94 | file_path.parent.mkdir(parents=True, exist_ok=True) 95 | 96 | if file_obj[0] == "file": 97 | assert len(file_obj) in [1, 2] 98 | file_path.touch() 99 | elif file_obj[0] == "symlink": 100 | assert len(file_obj) == 2 101 | target_path = self.scratchspace_path / file_obj[1] 102 | os.symlink(target_path, file_path) 103 | else: 104 | assert False, f"Unsupported file_obj: {file_obj}" 105 | 106 | # Build up the nested dict structure for the expected path 107 | current_dict = self.expected_structure 108 | path_parts = relative_path.split("/") 109 | for part in path_parts[:-1]: 110 | if part not in current_dict: 111 | current_dict[part] = {} 112 | current_dict = current_dict[part] 113 | current_dict[path_parts[-1]] = file_obj 114 | 115 | self.assertTrue( 116 | verify_structure(self.scratchspace_path, self.expected_structure) 117 | ) 118 | return file_path 119 | 120 | def make_result_helper( 121 | self, relative_path: str = "result.txt", file_obj: tuple[str, ...] = ("file",) 122 | ) -> Path: 123 | assert self.workspace is not None and self.expected_structure is not None 124 | assert ( 125 | ".." not in relative_path 126 | ), 'relative_path should not contain ".." 
(it should be inside the run_*/ dir)' 127 | return self.make_file_helper( 128 | f"dbgym_workspace/task_runs/{self.workspace.dbgym_this_run_path.name}/{relative_path}", 129 | file_obj=file_obj, 130 | ) 131 | 132 | def test_init_fields(self) -> None: 133 | workspace = DBGymWorkspace(self.workspace_path) 134 | self.assertEqual(workspace.app_name, DBGYM_APP_NAME) 135 | 136 | def test_init_from_nonexistent_workspace(self) -> None: 137 | self.init_workspace_helper() 138 | 139 | def test_init_from_empty_workspace(self) -> None: 140 | starting_structure = FilesystemStructure({"dbgym_workspace": {}}) 141 | create_structure(self.scratchspace_path, starting_structure) 142 | self.init_workspace_helper() 143 | 144 | def test_init_from_already_initialized_workspace(self) -> None: 145 | self.init_workspace_helper() 146 | self.init_workspace_helper() 147 | 148 | def test_link_result_basic_functionality(self) -> None: 149 | self.init_workspace_helper() 150 | assert self.workspace is not None and self.expected_structure is not None 151 | result_path = self.make_result_helper() 152 | self.workspace.link_result(result_path) 153 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 154 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 155 | name_to_linkname(result_path.name) 156 | ] = ( 157 | "symlink", 158 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 159 | ) 160 | self.assertTrue( 161 | verify_structure(self.scratchspace_path, self.expected_structure) 162 | ) 163 | 164 | def test_link_result_does_not_copy_directory_structure_to_symlinks_dir( 165 | self, 166 | ) -> None: 167 | """ 168 | We always just want link_result to link to the base symlinks dir. 169 | """ 170 | self.init_workspace_helper() 171 | assert self.workspace is not None and self.expected_structure is not None 172 | result_path = self.make_result_helper(relative_path="dir1/dir2/dir3/result.txt") 173 | self.workspace.link_result(result_path) 174 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 175 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 176 | name_to_linkname(result_path.name) 177 | ] = ( 178 | "symlink", 179 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/dir1/dir2/dir3/{result_path.name}", 180 | ) 181 | self.assertTrue( 182 | verify_structure(self.scratchspace_path, self.expected_structure) 183 | ) 184 | 185 | def test_link_result_invalid_custom_link_name(self) -> None: 186 | self.init_workspace_helper() 187 | assert self.workspace is not None and self.expected_structure is not None 188 | result_path = self.make_result_helper() 189 | with self.assertRaisesRegex( 190 | AssertionError, 'link_name \\(custom\\) should end with "\\.link"' 191 | ): 192 | self.workspace.link_result(result_path, custom_link_name=f"custom") 193 | 194 | def test_link_result_valid_custom_link_name(self) -> None: 195 | self.init_workspace_helper() 196 | assert self.workspace is not None and self.expected_structure is not None 197 | result_path = self.make_result_helper() 198 | self.workspace.link_result( 199 | result_path, custom_link_name=name_to_linkname("custom") 200 | ) 201 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 202 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 203 | name_to_linkname("custom") 204 | ] = ( 205 | "symlink", 206 | 
f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 207 | ) 208 | self.assertTrue( 209 | verify_structure(self.scratchspace_path, self.expected_structure) 210 | ) 211 | 212 | def test_link_same_result_twice_with_same_link_name(self) -> None: 213 | self.init_workspace_helper() 214 | assert self.workspace is not None and self.expected_structure is not None 215 | result_path = self.make_result_helper() 216 | self.workspace.link_result(result_path) 217 | self.workspace.link_result(result_path) 218 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 219 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 220 | name_to_linkname(result_path.name) 221 | ] = ( 222 | "symlink", 223 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 224 | ) 225 | self.assertTrue( 226 | verify_structure(self.scratchspace_path, self.expected_structure) 227 | ) 228 | 229 | def test_link_same_result_with_different_name(self) -> None: 230 | self.init_workspace_helper() 231 | assert self.workspace is not None and self.expected_structure is not None 232 | result_path = self.make_result_helper() 233 | self.workspace.link_result(result_path) 234 | self.workspace.link_result( 235 | result_path, custom_link_name=name_to_linkname("custom") 236 | ) 237 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME] = {} 238 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 239 | name_to_linkname(result_path.name) 240 | ] = ( 241 | "symlink", 242 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 243 | ) 244 | self.expected_structure["dbgym_workspace"][SYMLINKS_DNAME][DBGYM_APP_NAME][ 245 | name_to_linkname("custom") 246 | ] = ( 247 | "symlink", 248 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 249 | ) 250 | self.assertTrue( 251 | verify_structure(self.scratchspace_path, self.expected_structure) 252 | ) 253 | 254 | def test_link_result_from_another_run_raises_error(self) -> None: 255 | self.init_workspace_helper() 256 | result_path = self.make_result_helper() 257 | self.init_workspace_helper() 258 | assert self.workspace is not None and self.expected_structure is not None 259 | with self.assertRaisesRegex( 260 | AssertionError, 261 | "The result must have been generated in \*this\* run\_\*/ dir", 262 | ): 263 | self.workspace.link_result(result_path) 264 | 265 | def test_link_result_from_external_dir_raises_error(self) -> None: 266 | self.init_workspace_helper() 267 | assert self.workspace is not None and self.expected_structure is not None 268 | result_path = self.make_file_helper("external/result.txt") 269 | with self.assertRaisesRegex( 270 | AssertionError, 271 | "The result must have been generated in \*this\* run\_\*/ dir", 272 | ): 273 | self.workspace.link_result(result_path) 274 | 275 | def test_link_result_cannot_link_symlink(self) -> None: 276 | self.init_workspace_helper() 277 | assert self.workspace is not None and self.expected_structure is not None 278 | result_path = self.make_result_helper() 279 | symlink_path = self.make_result_helper( 280 | name_to_linkname("symlink"), 281 | file_obj=( 282 | "symlink", 283 | f"dbgym_workspace/{RUNS_DNAME}/{self.workspace.dbgym_this_run_path.name}/{result_path.name}", 284 | ), 285 | ) 286 | with self.assertRaisesRegex( 287 | AssertionError, 288 | "result_path \(.*\) should be a fully resolved path", 289 | 
): 290 | self.workspace.link_result(symlink_path) 291 | 292 | def test_save_file_dependency(self) -> None: 293 | """ 294 | See the comments in save_file() for what a "dependency" is. 295 | """ 296 | self.init_workspace_helper() 297 | assert self.workspace is not None and self.expected_structure is not None 298 | prev_run_name = self.workspace.dbgym_this_run_path.name 299 | result_path = self.make_result_helper() 300 | self.init_workspace_helper() 301 | self.workspace.save_file(result_path) 302 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 303 | self.workspace.dbgym_this_run_path.name 304 | ][name_to_linkname(result_path.name)] = ( 305 | "symlink", 306 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{result_path.name}", 307 | ) 308 | self.assertTrue( 309 | verify_structure(self.scratchspace_path, self.expected_structure) 310 | ) 311 | 312 | def test_save_file_same_dependency_twice(self) -> None: 313 | self.init_workspace_helper() 314 | assert self.workspace is not None and self.expected_structure is not None 315 | prev_run_name = self.workspace.dbgym_this_run_path.name 316 | result_path = self.make_result_helper(file_obj=("file",)) 317 | self.init_workspace_helper() 318 | self.workspace.save_file(result_path) 319 | self.workspace.save_file(result_path) 320 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 321 | self.workspace.dbgym_this_run_path.name 322 | ][name_to_linkname(result_path.name)] = ( 323 | "symlink", 324 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{result_path.name}", 325 | ) 326 | self.assertTrue( 327 | verify_structure(self.scratchspace_path, self.expected_structure) 328 | ) 329 | 330 | def test_save_file_two_different_dependencies_with_same_filename_both_directly_inside_run( 331 | self, 332 | ) -> None: 333 | self.init_workspace_helper() 334 | assert self.workspace is not None and self.expected_structure is not None 335 | prev_run_names = [] 336 | prev_run_names.append(self.workspace.dbgym_this_run_path.name) 337 | result1_path = self.make_result_helper(file_obj=("file",)) 338 | self.init_workspace_helper() 339 | prev_run_names.append(self.workspace.dbgym_this_run_path.name) 340 | result2_path = self.make_result_helper(file_obj=("file",)) 341 | filename = result1_path.name 342 | assert filename == result2_path.name 343 | 344 | self.init_workspace_helper() 345 | self.workspace.save_file(result1_path) 346 | self.workspace.save_file(result2_path) 347 | # The second save_file() should have overwritten the first one. 
348 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 349 | self.workspace.dbgym_this_run_path.name 350 | ][name_to_linkname(filename)] = ( 351 | "symlink", 352 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_names[-1]}/{filename}", 353 | ) 354 | self.assertTrue( 355 | verify_structure(self.scratchspace_path, self.expected_structure) 356 | ) 357 | 358 | def test_save_file_two_different_dependencies_with_same_filename_but_different_outermost_dirs( 359 | self, 360 | ) -> None: 361 | self.init_workspace_helper() 362 | assert self.workspace is not None and self.expected_structure is not None 363 | prev_run_name = self.workspace.dbgym_this_run_path.name 364 | result1_path = self.make_result_helper("dir1/result.txt", file_obj=("file",)) 365 | result2_path = self.make_result_helper("result.txt", file_obj=("file",)) 366 | filename = result1_path.name 367 | assert filename == result2_path.name 368 | 369 | self.init_workspace_helper() 370 | self.workspace.save_file(result1_path) 371 | self.workspace.save_file(result2_path) 372 | # The second save_file() should not overwrite the first one because the outermost dirs are different. 373 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 374 | self.workspace.dbgym_this_run_path.name 375 | ][name_to_linkname(filename)] = ( 376 | "symlink", 377 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/{filename}", 378 | ) 379 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 380 | self.workspace.dbgym_this_run_path.name 381 | ][name_to_linkname("dir1")] = ( 382 | "symlink", 383 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/dir1", 384 | ) 385 | self.assertTrue( 386 | verify_structure(self.scratchspace_path, self.expected_structure) 387 | ) 388 | 389 | def test_save_file_config(self) -> None: 390 | """ 391 | See the comments in save_file() for what a "config" is. 
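        In short, a "config" is a file that was not generated by any run, so save_file() copies it
        into this run's directory instead of symlinking it.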
392 | """ 393 | self.init_workspace_helper() 394 | assert self.workspace is not None and self.expected_structure is not None 395 | result_path = self.make_file_helper( 396 | "external/result.txt", file_obj=("file", "contents") 397 | ) 398 | self.workspace.save_file(result_path) 399 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 400 | self.workspace.dbgym_this_run_path.name 401 | ][f"{result_path.name}"] = ("file", "contents") 402 | self.assertTrue( 403 | verify_structure(self.scratchspace_path, self.expected_structure) 404 | ) 405 | 406 | def test_save_file_same_config_twice(self) -> None: 407 | self.init_workspace_helper() 408 | assert self.workspace is not None and self.expected_structure is not None 409 | result_path = self.make_file_helper( 410 | "external/result.txt", file_obj=("file", "contents") 411 | ) 412 | self.workspace.save_file(result_path) 413 | self.workspace.save_file(result_path) 414 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 415 | self.workspace.dbgym_this_run_path.name 416 | ][f"{result_path.name}"] = ("file", "contents") 417 | self.assertTrue( 418 | verify_structure(self.scratchspace_path, self.expected_structure) 419 | ) 420 | 421 | def test_save_file_two_different_configs_with_same_filename(self) -> None: 422 | self.init_workspace_helper() 423 | assert self.workspace is not None and self.expected_structure is not None 424 | result1_path = self.make_file_helper( 425 | "external/result.txt", file_obj=("file", "contents1") 426 | ) 427 | result2_path = self.make_file_helper( 428 | "external/dir1/result.txt", file_obj=("file", "contents2") 429 | ) 430 | filename = result1_path.name 431 | assert filename == result2_path.name 432 | 433 | self.workspace.save_file(result1_path) 434 | self.workspace.save_file(result2_path) 435 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 436 | self.workspace.dbgym_this_run_path.name 437 | ][f"{filename}"] = ("file", "contents2") 438 | self.assertTrue( 439 | verify_structure(self.scratchspace_path, self.expected_structure) 440 | ) 441 | 442 | def test_save_file_dependency_inside_directory(self) -> None: 443 | self.init_workspace_helper() 444 | assert self.workspace is not None and self.expected_structure is not None 445 | prev_run_name = self.workspace.dbgym_this_run_path.name 446 | result_path = self.make_result_helper("dir1/dir2/result.txt") 447 | self.make_result_helper("dir1/other1.txt") 448 | self.make_result_helper("dir1/dir3/other2.txt") 449 | self.init_workspace_helper() 450 | self.workspace.save_file(result_path) 451 | self.expected_structure["dbgym_workspace"][RUNS_DNAME][ 452 | self.workspace.dbgym_this_run_path.name 453 | ][name_to_linkname("dir1")] = ( 454 | "symlink", 455 | f"dbgym_workspace/{RUNS_DNAME}/{prev_run_name}/dir1", 456 | ) 457 | self.assertTrue( 458 | verify_structure(self.scratchspace_path, self.expected_structure) 459 | ) 460 | 461 | def test_save_file_generated_this_run_raises_error(self) -> None: 462 | self.init_workspace_helper() 463 | assert self.workspace is not None and self.expected_structure is not None 464 | result_path = self.make_result_helper() 465 | with self.assertRaisesRegex( 466 | AssertionError, 467 | "path \(.*\) was generated in this task run \(.*\)\. 
You do not need to save it", 468 | ): 469 | self.workspace.save_file(result_path) 470 | 471 | 472 | if __name__ == "__main__": 473 | unittest.main() 474 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/tuning_artifacts.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataclasses import asdict, dataclass 3 | from pathlib import Path 4 | from typing import Any, NewType 5 | 6 | from gymlib.workspace import DBGymWorkspace, is_fully_resolved 7 | 8 | # PostgresConn doesn't use these types because PostgresConn is used internally by tuning agents 9 | # while these types are only used in the interface between the orchestrator and the tuning agents. 10 | IndexesDelta = NewType("IndexesDelta", list[str]) 11 | SysKnobsDelta = NewType("SysKnobsDelta", dict[str, str]) 12 | # TODO: I'm not decided whether these should be deltas or full configs. I'm going to figure this out once I integrate Proto-X and UDO. 13 | QueryKnobsDelta = NewType("QueryKnobsDelta", dict[str, list[str]]) 14 | 15 | 16 | @dataclass 17 | class TuningMetadata: 18 | """Metadata for the tuning process.""" 19 | 20 | workload_path: Path 21 | pristine_dbdata_snapshot_path: Path 22 | dbdata_parent_path: Path 23 | pgbin_path: Path 24 | 25 | def __post_init__(self) -> None: 26 | """ 27 | Since the metadata needs to persist over time, we need to make sure that the paths are 28 | fully resolved. 29 | """ 30 | assert is_fully_resolved( 31 | self.workload_path 32 | ), f"workload_path={self.workload_path}" 33 | assert is_fully_resolved( 34 | self.pristine_dbdata_snapshot_path 35 | ), f"pristine_dbdata_snapshot_path={self.pristine_dbdata_snapshot_path}" 36 | assert is_fully_resolved( 37 | self.dbdata_parent_path 38 | ), f"dbdata_parent_path={self.dbdata_parent_path}" 39 | assert is_fully_resolved(self.pgbin_path), f"pgbin_path={self.pgbin_path}" 40 | 41 | def asdict(self) -> dict[str, Any]: 42 | return { 43 | "workload_path": str(self.workload_path), 44 | "pristine_dbdata_snapshot_path": str(self.pristine_dbdata_snapshot_path), 45 | "dbdata_parent_path": str(self.dbdata_parent_path), 46 | "pgbin_path": str(self.pgbin_path), 47 | } 48 | 49 | 50 | @dataclass 51 | class DBMSConfigDelta: 52 | """ 53 | This class represents a DBMS config delta. A "DBMS config" is the indexes, system knobs, 54 | and query knobs set by the tuning agent. A "delta" is the change from the prior config. 55 | 56 | `indexes` contains a list of SQL statements for creating indexes. Note that since it's a 57 | config delta, it might contain "DROP ..." statements. 58 | 59 | `sysknobs` contains a mapping from knob names to their values. 60 | 61 | `qknobs` contains a mapping from query IDs to a list of knobs. Each list contains knobs 62 | to prepend to the start of the query. The knobs are a list[str] instead of a dict[str, str] 63 | because knobs can be settings ("SET (enable_sort on)") or flags ("IndexOnlyScan(it)"). 
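    Illustrative example (a sketch only; the index statement, knob value, and query ID below are
    hypothetical and not taken from any real tuning run):

        DBMSConfigDelta(
            indexes=IndexesDelta(["CREATE INDEX lineitem_l_orderkey_idx ON lineitem (l_orderkey)"]),
            sysknobs=SysKnobsDelta({"wal_buffers": "8MB"}),
            qknobs=QueryKnobsDelta({"S1-Q1": ["SET (enable_sort on)", "IndexOnlyScan(it)"]}),
        )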
64 | """ 65 | 66 | indexes: IndexesDelta 67 | sysknobs: SysKnobsDelta 68 | qknobs: QueryKnobsDelta 69 | 70 | 71 | def get_delta_at_step_path(tuning_artifacts_path: Path, step_num: int) -> Path: 72 | return tuning_artifacts_path / f"step{step_num}_delta.json" 73 | 74 | 75 | def get_metadata_path(tuning_artifacts_path: Path) -> Path: 76 | return tuning_artifacts_path / "metadata.json" 77 | 78 | 79 | class TuningArtifactsWriter: 80 | def __init__( 81 | self, dbgym_workspace: DBGymWorkspace, metadata: TuningMetadata 82 | ) -> None: 83 | self.dbgym_workspace = dbgym_workspace 84 | self.tuning_artifacts_path = ( 85 | self.dbgym_workspace.dbgym_this_run_path / "tuning_artifacts" 86 | ) 87 | # exist_ok is False because you should only create one TuningArtifactsWriter per run. 88 | self.tuning_artifacts_path.mkdir(parents=False, exist_ok=False) 89 | assert is_fully_resolved(self.tuning_artifacts_path) 90 | self.next_step_num = 0 91 | 92 | # Write metadata file 93 | with get_metadata_path(self.tuning_artifacts_path).open("w") as f: 94 | json.dump(metadata.asdict(), f) 95 | 96 | def write_step(self, dbms_cfg_delta: DBMSConfigDelta) -> None: 97 | """ 98 | This wraps _step() and saves the cfg to a file so that it can be replayed. 99 | """ 100 | curr_step_num = self.next_step_num 101 | self.next_step_num += 1 102 | with get_delta_at_step_path(self.tuning_artifacts_path, curr_step_num).open( 103 | "w" 104 | ) as f: 105 | json.dump(asdict(dbms_cfg_delta), f) 106 | 107 | 108 | class TuningArtifactsReader: 109 | def __init__(self, tuning_artifacts_path: Path) -> None: 110 | self.tuning_artifacts_path = tuning_artifacts_path 111 | assert is_fully_resolved(self.tuning_artifacts_path) 112 | num_steps = 0 113 | while get_delta_at_step_path(self.tuning_artifacts_path, num_steps).exists(): 114 | num_steps += 1 115 | self.num_steps = num_steps 116 | 117 | def get_metadata(self) -> TuningMetadata: 118 | with get_metadata_path(self.tuning_artifacts_path).open("r") as f: 119 | data = json.load(f) 120 | return TuningMetadata( 121 | workload_path=Path(data["workload_path"]), 122 | pristine_dbdata_snapshot_path=Path( 123 | data["pristine_dbdata_snapshot_path"] 124 | ), 125 | dbdata_parent_path=Path(data["dbdata_parent_path"]), 126 | pgbin_path=Path(data["pgbin_path"]), 127 | ) 128 | 129 | def get_delta_at_step(self, step_num: int) -> DBMSConfigDelta: 130 | assert step_num >= 0 and step_num < self.num_steps 131 | with get_delta_at_step_path(self.tuning_artifacts_path, step_num).open( 132 | "r" 133 | ) as f: 134 | data = json.load(f) 135 | return DBMSConfigDelta( 136 | indexes=data["indexes"], 137 | sysknobs=data["sysknobs"], 138 | qknobs=data["qknobs"], 139 | ) 140 | 141 | def get_all_deltas_in_order(self) -> list[DBMSConfigDelta]: 142 | return [self.get_delta_at_step(step_num) for step_num in range(self.num_steps)] 143 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/workload.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from gymlib.workspace import DBGymWorkspace, is_fully_resolved 4 | 5 | 6 | class Workload: 7 | def __init__(self, dbgym_workspace: DBGymWorkspace, workload_path: Path) -> None: 8 | self.dbgym_workspace = dbgym_workspace 9 | self.workload_path = workload_path 10 | assert is_fully_resolved(self.workload_path) 11 | 12 | self.queries: dict[str, str] = {} 13 | order_path = self.workload_path / "order.txt" 14 | self.query_order: list[str] = [] 15 | 16 | assert 
order_path.exists() 17 | 18 | with self.dbgym_workspace.open_and_save(order_path) as f: 19 | for line in f: 20 | qid, qpath = line.strip().split(",") 21 | qpath = Path(qpath) 22 | assert is_fully_resolved(qpath) 23 | 24 | with self.dbgym_workspace.open_and_save(qpath) as qf: 25 | self.queries[qid] = qf.read() 26 | self.query_order.append(qid) 27 | 28 | def get_query(self, qid: str) -> str: 29 | return self.queries[qid] 30 | 31 | def get_query_order(self) -> list[str]: 32 | return self.query_order 33 | 34 | def get_queries_in_order(self) -> list[str]: 35 | return [self.queries[qid] for qid in self.query_order] 36 | -------------------------------------------------------------------------------- /gymlib_package/gymlib/workspace.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains everything needed to manage the workspace (the dbgym_workspace/ folder). 3 | """ 4 | 5 | import logging 6 | import os 7 | import shutil 8 | import subprocess 9 | import time 10 | from datetime import datetime 11 | from pathlib import Path 12 | from typing import IO, Any, Optional 13 | 14 | import yaml 15 | 16 | WORKSPACE_PATH_PLACEHOLDER = Path("[workspace]") 17 | SYMLINKS_DNAME = "symlinks" 18 | TMP_DNAME = "tmp" 19 | RUNS_DNAME = "task_runs" 20 | DBGYM_APP_NAME = "dbgym" 21 | LATEST_RUN_FNAME = "latest_run" 22 | 23 | 24 | def is_linkname(name: str) -> bool: 25 | assert not name.endswith(".link.link") 26 | return name.endswith(".link") 27 | 28 | 29 | def name_to_linkname(name: str) -> str: 30 | assert not is_linkname(name) 31 | return f"{name}.link" 32 | 33 | 34 | def linkname_to_name(linkname: str) -> str: 35 | assert is_linkname(linkname) 36 | return linkname[: -len(".link")] 37 | 38 | 39 | def get_symlinks_path_from_workspace_path(workspace_path: Path) -> Path: 40 | return workspace_path / SYMLINKS_DNAME 41 | 42 | 43 | def get_tmp_path_from_workspace_path(workspace_path: Path) -> Path: 44 | return workspace_path / TMP_DNAME 45 | 46 | 47 | def get_runs_path_from_workspace_path(workspace_path: Path) -> Path: 48 | return workspace_path / RUNS_DNAME 49 | 50 | 51 | def get_latest_run_path_from_workspace_path(workspace_path: Path) -> Path: 52 | return get_runs_path_from_workspace_path(workspace_path) / name_to_linkname( 53 | LATEST_RUN_FNAME 54 | ) 55 | 56 | 57 | # Paths of config files in the codebase. These are always relative paths. 58 | # The reason these can be relative paths instead of functions taking in codebase_path as input is because relative paths are relative to the codebase root 59 | DEFAULT_BOOT_CONFIG_PATH = Path("dbms") / "postgres" / "default_boot_config.yaml" 60 | 61 | 62 | class DBGymWorkspace: 63 | """ 64 | Global configurations that apply to all parts of DB-Gym 65 | """ 66 | 67 | _num_times_created_this_run: int = 0 68 | 69 | def __init__(self, dbgym_workspace_path: Path): 70 | # The logic around dbgym_tmp_path assumes that DBGymWorkspace is only constructed once. 71 | # This is because DBGymWorkspace creates a new run_*/ dir when it's initialized. 72 | DBGymWorkspace._num_times_created_this_run += 1 73 | assert ( 74 | DBGymWorkspace._num_times_created_this_run == 1 75 | ), f"DBGymWorkspace has been created {DBGymWorkspace._num_times_created_this_run} times. It should only be created once per run." 76 | 77 | self.base_dbgym_repo_path = get_base_dbgym_repo_path() 78 | self.app_name = DBGYM_APP_NAME # TODO: discover this dynamically. app means dbgym or an agent 79 | 80 | # Set and create paths. 
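        # A rough sketch of the layout that the code below creates (directory names come from the
        # SYMLINKS_DNAME / RUNS_DNAME / TMP_DNAME constants defined at the top of this file):
        #   [dbgym_workspace_path]/
        #       symlinks/              <- per-app "latest result" links created by link_result()
        #       task_runs/
        #           run_<timestamp>/   <- this run's results (dbgym_this_run_path)
        #           latest_run.link    <- symlink to the most recent run_*/ dir
        #       tmp/                   <- scratch space for this run only, wiped on the next init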
81 | self.dbgym_workspace_path = dbgym_workspace_path 82 | self.dbgym_workspace_path.mkdir(parents=True, exist_ok=True) 83 | 84 | # Now that the workspace is guaranteed to be created, we can check if it's fully resolved. 85 | assert is_fully_resolved(self.dbgym_workspace_path) 86 | 87 | self.dbgym_runs_path = get_runs_path_from_workspace_path( 88 | self.dbgym_workspace_path 89 | ) 90 | self.dbgym_runs_path.mkdir(parents=True, exist_ok=True) 91 | self.dbgym_symlinks_path = get_symlinks_path_from_workspace_path( 92 | self.dbgym_workspace_path 93 | ) 94 | self.dbgym_symlinks_path.mkdir(parents=True, exist_ok=True) 95 | self.dbgym_cur_symlinks_path = self.dbgym_symlinks_path / self.app_name 96 | # tmp/ is a workspace for this run only 97 | # One use for it is to place the unzipped dbdata. 98 | # There's no need to save the actual dbdata dir in run_*/ because we just save a symlink to 99 | # the .tgz file we unzipped. 100 | self.dbgym_tmp_path = get_tmp_path_from_workspace_path( 101 | self.dbgym_workspace_path 102 | ) 103 | # The best place to delete the old dbgym_tmp_path is in DBGymWorkspace.__init__(). 104 | # This is better than deleting the dbgym_tmp_path is in DBGymWorkspace.__del__() because DBGymWorkspace may get deleted before execution has completed. 105 | # Also, by keeping the tmp directory around, you can look at it to debug issues. 106 | if self.dbgym_tmp_path.exists(): 107 | shutil.rmtree(self.dbgym_tmp_path) 108 | self.dbgym_tmp_path.mkdir(parents=True, exist_ok=True) 109 | 110 | # Set the path for this task run's results. 111 | for _ in range(2): 112 | try: 113 | self.dbgym_this_run_path = ( 114 | self.dbgym_runs_path 115 | / f"run_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}" 116 | ) 117 | # `exist_ok` is False because we don't want to override a previous task run's data. 118 | self.dbgym_this_run_path.mkdir(parents=True, exist_ok=False) 119 | # Break if it succeeds so we don't do it a second time. 120 | break 121 | except FileExistsError: 122 | # In case we call task.py twice in one second, sleeping here will fix it. 123 | # Waiting one second is enough since we assume there's only one task.py running at a time. 124 | time.sleep(1) 125 | except Exception as e: 126 | raise e 127 | 128 | self.dbgym_latest_run_path = get_latest_run_path_from_workspace_path( 129 | self.dbgym_workspace_path 130 | ) 131 | try_remove_file(self.dbgym_latest_run_path) 132 | try_create_symlink(self.dbgym_this_run_path, self.dbgym_latest_run_path) 133 | 134 | # TODO(phw2): refactor our manual symlinking in postgres/cli.py to use link_result() instead 135 | def link_result( 136 | self, 137 | result_path: Path, 138 | custom_link_name: Optional[str] = None, 139 | ) -> Path: 140 | """ 141 | result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path. 142 | Further, result_path must have been generated by this invocation to task.py. This also means that 143 | result_path itself can be a file or a dir but not a symlink. 144 | Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside 145 | symlinks/[codebase]/[org]/. 146 | Will override the old symlink if there is one, so that symlinks/ always contains the latest generated 147 | version of a file. 148 | This function will return the path to the symlink that was created. 
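        Illustrative usage (a sketch; "tuning_results.csv" is a hypothetical result name):

            result_path = dbgym_workspace.dbgym_this_run_path / "tuning_results.csv"
            ...  # write the result file
            link_path = dbgym_workspace.link_result(result_path)
            # link_path is now symlinks/dbgym/tuning_results.csv.link, pointing at result_path.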
149 | """ 150 | assert isinstance(result_path, Path) 151 | assert is_fully_resolved( 152 | result_path 153 | ), f"result_path ({result_path}) should be a fully resolved path" 154 | assert is_child_path( 155 | result_path, self.dbgym_this_run_path 156 | ), "The result must have been generated in *this* run_*/ dir" 157 | assert not os.path.islink(result_path) 158 | 159 | if type(custom_link_name) is str: 160 | link_name = custom_link_name 161 | else: 162 | if os.path.isfile(result_path): 163 | link_name = name_to_linkname(basename_of_path(result_path)) 164 | elif os.path.isdir(result_path): 165 | link_name = name_to_linkname(basename_of_path(result_path)) 166 | else: 167 | raise AssertionError("result_path must be either a file or dir") 168 | 169 | symlink_parent_path = self.dbgym_symlinks_path / self.app_name 170 | symlink_parent_path.mkdir(parents=True, exist_ok=True) 171 | 172 | # Remove the old symlink ("old" meaning created in an earlier run) if there is one 173 | # Note that in a multi-threaded setting, this might remove one created by a process in the same run, 174 | # meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink 175 | # file of the current run regardless of the order of threads. 176 | assert is_linkname( 177 | link_name 178 | ), f'link_name ({link_name}) should end with ".link"' 179 | symlink_path = symlink_parent_path / link_name 180 | try_remove_file(symlink_path) 181 | try_create_symlink(result_path, symlink_path) 182 | 183 | return symlink_path 184 | 185 | def get_run_path_from_path(self, path: Path) -> Path: 186 | run_path = path 187 | while not parent_path_of_path(run_path).samefile(self.dbgym_runs_path): 188 | run_path = parent_path_of_path(run_path) 189 | return run_path 190 | 191 | # TODO(phw2): really look at the clean PR to see what it changed 192 | # TODO(phw2): after merging agent-train, refactor some code in agent-train to use save_file() instead of open_and_save() 193 | def save_file(self, path: Path) -> None: 194 | """ 195 | If an external function takes in a file/directory as input, you will not be able to call open_and_save(). 196 | In these situations, just call save_file(). 197 | Like open_and_save(), this function only works with real absolute paths. 198 | "Saving" can mean either copying the file or creating a symlink to it 199 | We copy the file if it is a "config", meaning it just exists without having been generated 200 | We create a symlink if it is a "dependency", meaning a task.py command was run to generate it 201 | In these cases we create a symlink so we have full provenance for how the dependency was created 202 | 203 | **Notable Behavior** 204 | - When you save a dependency, it actually creates a link to the outermost directory still inside run_*/. 205 | - The second save will overwrite the first. 206 | - If you save the same file twice in the same run, the second save will overwrite the first. 207 | - If you save two configs with the same name, the second save will overwrite the first. 208 | - If you save two dependencies with the same *outermost* directory, or two dependencies with the same filename 209 | both directly inside run_*/, the second save will overwrite the first. 
210 | """ 211 | # validate path 212 | assert isinstance(path, Path) 213 | assert not os.path.islink(path), f"path ({path}) should not be a symlink" 214 | assert os.path.exists(path), f"path ({path}) does not exist" 215 | assert os.path.isfile(path), f"path ({path}) is not a file" 216 | assert not is_child_path( 217 | path, self.dbgym_this_run_path 218 | ), f"path ({path}) was generated in this task run ({self.dbgym_this_run_path}). You do not need to save it" 219 | 220 | # Save _something_ to dbgym_this_run_path. 221 | # Save a symlink if the opened file was generated by a run. This is for two reasons: 222 | # 1. Files or dirs generated by a run are supposed to be immutable so saving a symlink is safe. 223 | # 2. Files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them. 224 | if is_child_path(path, self.dbgym_runs_path): 225 | # If the path file is directly in run_path, we symlink the file directly. 226 | run_path = self.get_run_path_from_path(path) 227 | parent_path = parent_path_of_path(path) 228 | if parent_path.samefile(run_path): 229 | fname = basename_of_path(path) 230 | symlink_path = self.dbgym_this_run_path / name_to_linkname(fname) 231 | try_remove_file(symlink_path) 232 | try_create_symlink(path, symlink_path) 233 | # Otherwise, we know the path file is _not_ directly inside run_path dir. 234 | # We go as far back as we can while still staying in run_path and symlink that "base" dir. 235 | # This is because lots of runs create dirs within run_path and it creates too much clutter to symlink every individual file. 236 | # Further, this avoids an edge case where you both save a file and the dir it's in. 237 | else: 238 | # Set base_path such that its parent is run_path. 239 | base_path = parent_path 240 | while not parent_path_of_path(base_path).samefile(run_path): 241 | base_path = parent_path_of_path(base_path) 242 | 243 | # Create symlink 244 | open_base_dname = basename_of_path(base_path) 245 | symlink_path = self.dbgym_this_run_path / name_to_linkname( 246 | open_base_dname 247 | ) 248 | try_remove_file(symlink_path) 249 | try_create_symlink(base_path, symlink_path) 250 | # If the file wasn't generated by a run, we can't just symlink it because we don't know that it's immutable. 251 | else: 252 | fname = basename_of_path(path) 253 | # In this case, we want to copy instead of symlinking since it might disappear in the future. 254 | copy_path = self.dbgym_this_run_path / fname 255 | shutil.copy(path, copy_path) 256 | 257 | def open_and_save(self, open_path: Path, mode: str = "r") -> IO[Any]: 258 | """ 259 | Open a file and "save" it to [workspace]/task_runs/run_*/. 260 | It takes in a str | Path to match the interface of open(). 261 | This file does not work if open_path is a symlink, to make its interface identical to that of open(). 262 | Make sure to resolve all symlinks with fully_resolve_path(). 263 | To avoid confusion, I'm enforcing this function to only work with absolute paths. 264 | # TODO: maybe make it work on non-fully-resolved paths to better match open() 265 | See the comment of save_file() for what "saving" means 266 | If you are generating a "result" for the run, _do not_ use this. Just use the normal open(). 267 | This shouldn't be too hard to remember because this function crashes if open_path doesn't exist, 268 | and when you write results you're usually opening open_paths which do not exist. 
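        Illustrative usage (a sketch; the config filename is just an example):

            cfg_path = fully_resolve_path("dbgym_config.yaml")
            with dbgym_workspace.open_and_save(cfg_path) as f:
                cfg = yaml.safe_load(f)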
269 | """ 270 | # Validate open_path 271 | assert isinstance(open_path, Path) 272 | assert is_fully_resolved( 273 | open_path 274 | ), f"open_and_save(): open_path ({open_path}) should be a fully resolved path" 275 | assert not os.path.islink( 276 | open_path 277 | ), f"open_path ({open_path}) should not be a symlink" 278 | assert os.path.exists(open_path), f"open_path ({open_path}) does not exist" 279 | # `open_and_save`` *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save 280 | # a file though. we sometimes save a directory (see save_file() for details) 281 | assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file" 282 | 283 | # Save 284 | self.save_file(open_path) 285 | 286 | # Open 287 | return open(open_path, mode=mode) 288 | 289 | 290 | def get_workspace_path_from_config(dbgym_config_path: Path) -> Path: 291 | """ 292 | Returns the workspace path (as a fully resolved path) from the config file. 293 | """ 294 | with open(dbgym_config_path) as f: 295 | # We do *not* call fully_resolve_path() here because the workspace may not exist yet. 296 | return Path(yaml.safe_load(f)["dbgym_workspace_path"]).resolve().absolute() 297 | 298 | 299 | def make_standard_dbgym_workspace() -> DBGymWorkspace: 300 | """ 301 | The "standard" way to make a DBGymWorkspace using the DBGYM_CONFIG_PATH envvar and the 302 | default path of dbgym_config.yaml. 303 | """ 304 | dbgym_config_path = Path(os.getenv("DBGYM_CONFIG_PATH", "dbgym_config.yaml")) 305 | dbgym_workspace_path = get_workspace_path_from_config(dbgym_config_path) 306 | dbgym_workspace = DBGymWorkspace(dbgym_workspace_path) 307 | return dbgym_workspace 308 | 309 | 310 | def fully_resolve_path(inputpath: os.PathLike[str]) -> Path: 311 | """ 312 | Fully resolve any path to a real, absolute path. 313 | 314 | For flexibility, we take in any os.PathLike. However, for consistency, we always output a Path object. 315 | 316 | Whenever a path is required, the user is allowed to enter relative paths, absolute paths, or paths starting with ~. 317 | 318 | Relative paths are relative to the base dbgym repo dir. 319 | 320 | It *does not* check whether the path exists, since the user might be wanting to create a new file/dir. 321 | 322 | Raises RuntimeError for errors. 323 | """ 324 | # For simplicity, we only process Path objects. 325 | realabspath = Path(inputpath) 326 | # `expanduser()` is always "ok" to call first. 327 | realabspath = realabspath.expanduser() 328 | # The reason we don't call Path.absolute() is because the path should be relative to get_base_dbgym_repo_path(), 329 | # which is not necessary where cwd() points at the time of calling this function. 330 | if not realabspath.is_absolute(): 331 | realabspath = get_base_dbgym_repo_path() / realabspath 332 | # `resolve()` has two uses: normalize the path (remove ..) and resolve symlinks. 333 | # I believe the pathlib library (https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve) does these together this 334 | # way to avoid an edge case related to symlinks and normalizing paths (footnote 1 of the linked docs) 335 | realabspath = realabspath.resolve() 336 | assert is_fully_resolved( 337 | realabspath 338 | ), f"realabspath ({realabspath}) is not fully resolved" 339 | return realabspath 340 | 341 | 342 | def get_base_dbgym_repo_path() -> Path: 343 | path = Path(os.getcwd()) 344 | assert _is_base_dbgym_repo_path( 345 | path 346 | ), "This script should be invoked from the root of the dbgym repo." 
347 | return path 348 | 349 | 350 | def _is_base_dbgym_repo_path(path: Path) -> bool: 351 | """ 352 | Returns whether we are in the base directory of some git repository 353 | """ 354 | try: 355 | git_toplevel = subprocess.check_output( 356 | ["git", "rev-parse", "--show-toplevel"], encoding="utf-8" 357 | ).strip() 358 | return Path(git_toplevel) == path 359 | except subprocess.CalledProcessError: 360 | # This means we are not in _any_ git repo 361 | return False 362 | except Exception as e: 363 | raise e 364 | 365 | 366 | def is_fully_resolved(path: Path) -> bool: 367 | """ 368 | Checks if a path is fully resolved (exists, is absolute, and contains no symlinks in its entire ancestry). 369 | 370 | The reason we check for existence is because that's the only way we know that there are no symlinks in its entire ancestry. 371 | If we didn't check for existence, we could later create a new symlink in the path's ancestry. 372 | 373 | Even if a path exists, is absolute, and is not itself a symlink, it could still contain 374 | symlinks in its parent directories. For example: 375 | /home/user/ # Real directory 376 | /home/user/links/ # Symlink to /data/links 377 | /home/user/links/file.txt # Real file 378 | 379 | In this case, "/home/user/links/file.txt" exists and isn't itself a symlink, 380 | but it's not fully resolved because it contains a symlink in its ancestry. 381 | The fully resolved path would be "/data/links/file.txt". 382 | """ 383 | assert isinstance(path, Path) 384 | resolved_path = path.resolve() 385 | 386 | # Check if the path exists. 387 | if not resolved_path.exists(): 388 | return False 389 | 390 | # Check if the path contains no symlinks in its entire ancestry. 391 | # This also checks if the path is absolute because resolved_path is absolute. 392 | assert ( 393 | resolved_path.is_absolute() 394 | ), "resolved_path should be absolute (see comment above)" 395 | # Converting them to strings is the most unambiguously strict way of checking equality. 396 | # Stuff like Path.__eq__() or Path.samefile() might be more lenient. 397 | return str(resolved_path) == str(path) 398 | 399 | 400 | def parent_path_of_path(path: Path) -> Path: 401 | """ 402 | This function only calls Path.parent, but in a safer way. 403 | """ 404 | assert isinstance(path, Path) 405 | assert is_fully_resolved( 406 | path 407 | ), f"path must be fully resolved because Path.parent has weird behavior on non-resolved paths (see https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.parent)" 408 | parent_path = path.parent 409 | assert isinstance(parent_path, Path) 410 | return parent_path 411 | 412 | 413 | def basename_of_path(path: Path) -> str: 414 | """ 415 | This function only calls Path.name, but in a safer way. 416 | """ 417 | assert isinstance(path, Path) 418 | assert is_fully_resolved( 419 | path 420 | ), f'path must be fully resolved because Path.name has weird behavior on non-resolved paths (like giving ".." 
if the path ends with a "..")' 421 | path_dirname, path_basename = os.path.split(path) 422 | # this means the path ended with a '/' so all os.path.split() does is get rid of the slash 423 | if path_basename == "": 424 | return os.path.basename(path_dirname) 425 | else: 426 | return path_basename 427 | 428 | 429 | # TODO(phw2): refactor to use Path 430 | def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) -> bool: 431 | """ 432 | Checks whether child_path refers to a file/dir/link that is a child of the dir referred to by parent_path 433 | If the two paths are equal, this function returns FALSE 434 | """ 435 | assert os.path.isdir(parent_path) 436 | if os.path.samefile(child_path, parent_path): 437 | return False 438 | else: 439 | return os.path.samefile( 440 | os.path.commonpath([parent_path, child_path]), parent_path 441 | ) 442 | 443 | 444 | def extract_from_task_run_path( 445 | dbgym_workspace: DBGymWorkspace, task_run_path: Path 446 | ) -> tuple[Path, str, Path, str]: 447 | """ 448 | The task_runs/ folder is organized like task_runs/run_*/[codebase]/[org]/any/path/you/want. 449 | This function extracts the [codebase] and [org] components 450 | """ 451 | assert isinstance(task_run_path, Path) 452 | assert not task_run_path.is_symlink() 453 | parent_path = task_run_path.parent 454 | # TODO(phw2): make this a common function 455 | assert not parent_path.samefile( 456 | dbgym_workspace.dbgym_runs_path 457 | ), f"task_run_path ({task_run_path}) should be inside a run_*/ dir instead of directly in dbgym_workspace.dbgym_runs_path ({dbgym_workspace.dbgym_runs_path})" 458 | assert not parent_path_of_path(parent_path).samefile( 459 | dbgym_workspace.dbgym_runs_path 460 | ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" 461 | assert not parent_path_of_path(parent_path_of_path(parent_path)).samefile( 462 | dbgym_workspace.dbgym_runs_path 463 | ), f"task_run_path ({task_run_path}) should be inside a run_*/[codebase]/[organization]/ dir instead of directly in run_*/ ({dbgym_workspace.dbgym_runs_path})" 464 | # org_path is the run_*/[codebase]/[organization]/ dir that task_run_path is in 465 | org_path = parent_path 466 | while not parent_path_of_path( 467 | parent_path_of_path(parent_path_of_path(org_path)) 468 | ).samefile(dbgym_workspace.dbgym_runs_path): 469 | org_path = parent_path_of_path(org_path) 470 | org_dname = basename_of_path(org_path) 471 | codebase_path = parent_path_of_path(org_path) 472 | codebase_dname = basename_of_path(codebase_path) 473 | 474 | return codebase_path, codebase_dname, org_path, org_dname 475 | 476 | 477 | def try_create_symlink(src_path: Path, dst_path: Path) -> None: 478 | """ 479 | Our functions that create symlinks might be called by multiple processes at once 480 | during HPO. Thus, this is a thread-safe way to create a symlink. 481 | """ 482 | assert is_linkname(dst_path.name) 483 | try: 484 | os.symlink(src_path, dst_path) 485 | except FileExistsError: 486 | # it's ok if it exists 487 | pass 488 | 489 | 490 | def try_remove_file(path: Path) -> None: 491 | """ 492 | Our functions that remove files might be called by multiple processes at once 493 | during HPO. Thus, this is a thread-safe way to remove a file. 
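    Illustrative usage (a sketch mirroring how DBGymWorkspace refreshes latest_run.link; the
    variable names here are placeholders):

        try_remove_file(latest_run_link_path)
        try_create_symlink(this_run_path, latest_run_link_path)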
494 | """ 495 | try: 496 | os.remove(path) 497 | except FileNotFoundError: 498 | # it's ok if it doesn't exist 499 | pass 500 | 501 | 502 | def is_ssd(path: Path) -> bool: 503 | try: 504 | device = ( 505 | subprocess.check_output(["df", path]).decode().split("\n")[1].split()[0] 506 | ) 507 | device_basename = os.path.basename(device) 508 | lsblk_output = subprocess.check_output( 509 | ["lsblk", "-d", "-o", "name,rota"] 510 | ).decode() 511 | for line in lsblk_output.split("\n")[1:]: 512 | parts = line.split() 513 | if parts and parts[0] == device_basename: 514 | is_ssd = int(parts[1]) == 0 515 | return is_ssd 516 | return False 517 | except Exception as e: 518 | logging.error(f"An error occurred: {e}") 519 | return False 520 | -------------------------------------------------------------------------------- /gymlib_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gymlib" 7 | version = "0.1.0" 8 | 9 | [tool.setuptools.packages.find] 10 | where = ["."] 11 | include = ["gymlib*"] -------------------------------------------------------------------------------- /orchestrate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/orchestrate/__init__.py -------------------------------------------------------------------------------- /orchestrate/clean.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | from itertools import chain 5 | from pathlib import Path 6 | 7 | from gymlib.workspace import DBGymWorkspace, is_child_path, parent_path_of_path 8 | 9 | 10 | def add_symlinks_in_path( 11 | symlinks_stack: list[Path], root_path: Path, processed_symlinks: set[Path] 12 | ) -> None: 13 | """ 14 | Will modify symlinks_stack and processed_symlinks. 15 | """ 16 | for root_pathstr, dir_names, file_names in os.walk(root_path): 17 | root_path = Path(root_pathstr) 18 | # symlinks can either be files or directories, so we go through both dir_names and file_names 19 | for file_name in chain(dir_names, file_names): 20 | file_path = root_path / file_name 21 | if file_path.is_symlink() and file_path not in processed_symlinks: 22 | symlinks_stack.append(file_path) 23 | processed_symlinks.add(file_path) 24 | 25 | 26 | def count_files_in_workspace(dbgym_workspace: DBGymWorkspace) -> int: 27 | """ 28 | Counts the number of files (regular file or dir or symlink) in the workspace. 29 | """ 30 | total_count = 0 31 | for dirpath, dirnames, filenames in os.walk( 32 | dbgym_workspace.dbgym_workspace_path, followlinks=False 33 | ): 34 | # Check if any of the directories are symbolic links and remove them from dirnames 35 | dirnames[:] = [ 36 | d for d in dirnames if not os.path.islink(os.path.join(dirpath, d)) 37 | ] 38 | 39 | # Count files and directories (non-symlink directories already filtered) 40 | total_count += len(filenames) + len(dirnames) 41 | 42 | return total_count 43 | 44 | 45 | def clean_workspace( 46 | dbgym_workspace: DBGymWorkspace, 47 | mode: str = "safe", 48 | verbose: bool = False, 49 | ) -> None: 50 | """ 51 | Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks". 
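The two try_* helpers above are deliberately race-tolerant. A sketch of how they might be combined to repoint a link (the helper name and paths are hypothetical, and link_path is assumed to satisfy the is_linkname() naming check used by try_create_symlink()):

```
from pathlib import Path

from gymlib.workspace import try_create_symlink, try_remove_file

def repoint_link(new_target: Path, link_path: Path) -> None:
    # Hypothetical helper: both calls swallow the "already gone" / "already exists"
    # errors, so concurrent HPO workers repointing the same link won't crash each other.
    try_remove_file(link_path)
    try_create_symlink(new_target, link_path)
```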
52 | If mode is "aggressive", "active symlinks" means *only* the symlinks directly in [workspace]/symlinks/. 53 | If mode is "safe", "active symlinks" means the symlinks directly in [workspace]/symlinks/ as well as 54 | any symlinks referenced in task_runs/run_*/ directories we have already decided to keep. 55 | """ 56 | # This stack holds the symlinks that are left to be processed 57 | symlink_paths_to_process: list[Path] = [] 58 | # This set holds the symlinks that have already been processed to avoid infinite loops 59 | processed_symlinks: set[Path] = set() 60 | 61 | # 1. Initialize paths to process 62 | if dbgym_workspace.dbgym_symlinks_path.exists(): 63 | add_symlinks_in_path( 64 | symlink_paths_to_process, 65 | dbgym_workspace.dbgym_symlinks_path, 66 | processed_symlinks, 67 | ) 68 | 69 | # 2. Go through symlinks, figuring out which "children of task runs" to keep 70 | # Based on the rules of the framework, "children of task runs" should be run_*/ directories. 71 | # However, the user's workspace might happen to break these rules by putting directories not 72 | # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_paths" 73 | # instead of "run_paths". 74 | task_run_child_paths_to_keep = set() 75 | 76 | if dbgym_workspace.dbgym_runs_path.exists(): 77 | while symlink_paths_to_process: 78 | symlink_path: Path = symlink_paths_to_process.pop() 79 | assert symlink_path.is_symlink() 80 | # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer. 81 | # However, os.readlink() literally reads the string contents of the link. We need to do some 82 | # processing on the result of os.readlink() to convert it to an absolute path 83 | real_path = symlink_path.resolve() 84 | one_layer_resolved_path = os.readlink(symlink_path) 85 | assert str(real_path) == str( 86 | os.readlink(symlink_path) 87 | ), f"symlink_path ({symlink_path}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually." 88 | 89 | # If the file doesn't exist, we'll just ignore it. 90 | if not real_path.exists(): 91 | continue 92 | # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't 93 | # even a descendant, we don't care about it. 94 | if not is_child_path(real_path, dbgym_workspace.dbgym_runs_path): 95 | continue 96 | 97 | assert not real_path.samefile(dbgym_workspace.dbgym_runs_path) 98 | 99 | # Figure out the task_run_child_path to put into task_run_child_paths_to_keep 100 | task_run_child_path = None 101 | if parent_path_of_path(real_path).samefile(dbgym_workspace.dbgym_runs_path): 102 | # While it's true that it shouldn't be possible to symlink to a directory directly in task_runs/, 103 | # we'll just not delete it if the user happens to have one like this. Even if the user messed up 104 | # the structure somehow, it's just a good idea not to delete it. 105 | task_run_child_path = real_path 106 | else: 107 | # Technically, it's not allowed to symlink to any files not in task_runs/run_*/[codebase]/[organization]/. 108 | # However, as with above, we won't just nuke files if the workspace doesn't follow this rule for 109 | # some reason. 
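To make the "safe" vs. "aggressive" distinction concrete, consider a hypothetical workspace (all names invented):

```
symlinks/artifacts.link          -> task_runs/run_2/.../tuning_artifacts/
task_runs/run_2/.../dbdata.link  -> task_runs/run_1/.../dbdata/
task_runs/run_0/                 (referenced by nothing)

aggressive: keeps run_2 only; run_1 and run_0 are deleted.
safe:       keeps run_2 and run_1 (run_1 is reachable via the symlink inside run_2); only run_0 is deleted.
```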
110 |                 task_run_child_path = real_path
111 |                 while not parent_path_of_path(task_run_child_path).samefile(
112 |                     dbgym_workspace.dbgym_runs_path
113 |                 ):
114 |                     task_run_child_path = parent_path_of_path(task_run_child_path)
115 |             assert task_run_child_path is not None
116 |             assert parent_path_of_path(task_run_child_path).samefile(
117 |                 dbgym_workspace.dbgym_runs_path
118 |             ), f"task_run_child_path ({task_run_child_path}) is not a direct child of dbgym_workspace.dbgym_runs_path"
119 |             task_run_child_paths_to_keep.add(task_run_child_path)
120 | 
121 |             # If on safe mode, add symlinks inside the task_run_child_path to be processed
122 |             if mode == "safe":
123 |                 add_symlinks_in_path(
124 |                     symlink_paths_to_process,
125 |                     task_run_child_path,
126 |                     processed_symlinks,
127 |                 )
128 | 
129 |     # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep
130 |     # It's true that symlinks might link outside of task_runs/*. We'll just not care about those
131 |     starting_num_files = count_files_in_workspace(dbgym_workspace)
132 |     if dbgym_workspace.dbgym_runs_path.exists():
133 |         for child_path in dbgym_workspace.dbgym_runs_path.iterdir():
134 |             if child_path not in task_run_child_paths_to_keep:
135 |                 if child_path.is_dir():
136 |                     shutil.rmtree(child_path)
137 |                 else:
138 |                     os.remove(child_path)
139 |     ending_num_files = count_files_in_workspace(dbgym_workspace)
140 | 
141 |     if verbose:
142 |         logging.info(
143 |             f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files"
144 |         )
145 |         logging.info(
146 |             f"Workspace went from {starting_num_files} to {ending_num_files} files"
147 |         )
148 | 
-------------------------------------------------------------------------------- /orchestrate/cli.py: --------------------------------------------------------------------------------
1 | import click
2 | from gymlib.workspace import DBGymWorkspace
3 | 
4 | from orchestrate.clean import clean_workspace, count_files_in_workspace
5 | 
6 | 
7 | @click.group(name="manage")
8 | def manage_group() -> None:
9 |     pass
10 | 
11 | 
12 | @click.command("clean")
13 | @click.pass_obj
14 | @click.option(
15 |     "--mode",
16 |     type=click.Choice(["safe", "aggressive"]),
17 |     default="safe",
18 |     help='The mode to clean the workspace (default="safe"). "aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". "safe" means "in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping."',
19 | )
20 | def manage_clean(dbgym_workspace: DBGymWorkspace, mode: str) -> None:
21 |     clean_workspace(dbgym_workspace, mode=mode, verbose=True)
22 | 
23 | 
24 | @click.command("count")
25 | @click.pass_obj
26 | def manage_count(dbgym_workspace: DBGymWorkspace) -> None:
27 |     num_files = count_files_in_workspace(dbgym_workspace)
28 |     print(
29 |         f"The workspace ({dbgym_workspace.dbgym_workspace_path}) has {num_files} total files/dirs/symlinks."
30 | ) 31 | 32 | 33 | manage_group.add_command(manage_clean) 34 | manage_group.add_command(manage_count) 35 | -------------------------------------------------------------------------------- /orchestrate/replay.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from pathlib import Path 3 | 4 | from gymlib.pg import DEFAULT_POSTGRES_PORT 5 | from gymlib.pg_conn import PostgresConn 6 | from gymlib.tuning_artifacts import TuningArtifactsReader 7 | from gymlib.workload import Workload 8 | from gymlib.workspace import DBGymWorkspace 9 | 10 | 11 | def replay( 12 | dbgym_workspace: DBGymWorkspace, tuning_artifacts_path: Path 13 | ) -> list[tuple[float, int]]: 14 | """ 15 | Returns the total runtime and the number of timed out queries for each step. 16 | 17 | The first step will use no configuration changes. 18 | """ 19 | replay_data: list[tuple[float, int]] = [] 20 | 21 | reader = TuningArtifactsReader(tuning_artifacts_path) 22 | pg_conn = PostgresConn( 23 | dbgym_workspace, 24 | DEFAULT_POSTGRES_PORT, 25 | reader.get_metadata().pristine_dbdata_snapshot_path, 26 | reader.get_metadata().dbdata_parent_path, 27 | reader.get_metadata().pgbin_path, 28 | None, 29 | ) 30 | workload = Workload( 31 | dbgym_workspace, 32 | reader.get_metadata().workload_path, 33 | ) 34 | 35 | pg_conn.restore_pristine_snapshot() 36 | pg_conn.restart_postgres() 37 | qknobs: defaultdict[str, list[str]] = defaultdict(list) 38 | replay_data.append(time_workload(pg_conn, workload, qknobs)) 39 | 40 | for delta in reader.get_all_deltas_in_order(): 41 | pg_conn.restart_with_changes(delta.sysknobs) 42 | 43 | for index in delta.indexes: 44 | pg_conn.psql(index) 45 | 46 | for query, knobs in delta.qknobs.items(): 47 | # TODO: account for deleting a knob if we are representing knobs as deltas. 48 | qknobs[query].extend(knobs) 49 | 50 | replay_data.append(time_workload(pg_conn, workload, qknobs)) 51 | 52 | pg_conn.shutdown_postgres() 53 | return replay_data 54 | 55 | 56 | def time_workload( 57 | pg_conn: PostgresConn, workload: Workload, qknobs: dict[str, list[str]] 58 | ) -> tuple[float, int]: 59 | """ 60 | Returns the total runtime and the number of timed out queries. 
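A minimal sketch of driving replay() programmatically (the artifacts path is hypothetical; in the tests below it comes from a TuningArtifactsWriter):

```
from pathlib import Path

from gymlib.workspace import make_standard_dbgym_workspace

from orchestrate.replay import replay

workspace = make_standard_dbgym_workspace()
replay_data = replay(workspace, Path("/path/to/tuning_artifacts"))
for step, (runtime, num_timed_out) in enumerate(replay_data):
    # Step 0 is the unmodified configuration; each later step applies one more delta.
    print(f"step {step}: {runtime:.1f}s total, {num_timed_out} queries timed out")
```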
61 | """ 62 | total_runtime: float = 0 63 | num_timed_out_queries: int = 0 64 | 65 | for qid in workload.get_query_order(): 66 | query = workload.get_query(qid) 67 | this_query_knobs = qknobs[qid] 68 | runtime, did_time_out, _ = pg_conn.time_query( 69 | query, query_knobs=this_query_knobs 70 | ) 71 | total_runtime += runtime 72 | if did_time_out: 73 | num_timed_out_queries += 1 74 | 75 | return total_runtime, num_timed_out_queries 76 | -------------------------------------------------------------------------------- /orchestrate/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/orchestrate/tests/__init__.py -------------------------------------------------------------------------------- /orchestrate/tests/integtest_replay.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from gymlib.tests.gymlib_integtest_util import GymlibIntegtestManager 4 | from gymlib.tuning_artifacts import ( 5 | DBMSConfigDelta, 6 | IndexesDelta, 7 | QueryKnobsDelta, 8 | SysKnobsDelta, 9 | TuningArtifactsWriter, 10 | ) 11 | from gymlib.workspace import DBGymWorkspace 12 | 13 | from benchmark.tpch.constants import DEFAULT_TPCH_SEED 14 | from orchestrate.replay import replay 15 | 16 | 17 | class ReplayTests(unittest.TestCase): 18 | workspace: DBGymWorkspace 19 | 20 | @staticmethod 21 | def setUpClass() -> None: 22 | GymlibIntegtestManager.set_up_workspace() 23 | # Reset _num_times_created_this_run since previous tests may have created a workspace. 24 | DBGymWorkspace._num_times_created_this_run = 0 25 | ReplayTests.workspace = DBGymWorkspace( 26 | GymlibIntegtestManager.get_workspace_path() 27 | ) 28 | 29 | def test_replay(self) -> None: 30 | writer = TuningArtifactsWriter( 31 | ReplayTests.workspace, 32 | GymlibIntegtestManager.get_default_metadata(), 33 | ) 34 | writer.write_step( 35 | DBMSConfigDelta( 36 | indexes=IndexesDelta( 37 | ["CREATE INDEX idx_orders_custkey ON orders(o_custkey)"] 38 | ), 39 | sysknobs=SysKnobsDelta( 40 | {"shared_buffers": "2GB"}, 41 | ), 42 | qknobs=QueryKnobsDelta( 43 | { 44 | f"S{DEFAULT_TPCH_SEED}-Q1": [ 45 | "set enable_hashagg = off", 46 | "set enable_sort = on", 47 | ], 48 | } 49 | ), 50 | ) 51 | ) 52 | replay_data = replay( 53 | ReplayTests.workspace, 54 | writer.tuning_artifacts_path, 55 | ) 56 | 57 | # We do some very simple sanity checks here due to the inherent randomness of executing a workload. 58 | # We check that there is one data point for the initial config and one for the config change. 59 | self.assertEqual(len(replay_data), 2) 60 | # We check that the second step is faster. 61 | self.assertLess(replay_data[1][0], replay_data[0][0]) 62 | # We check that no queries timed out in either step. 
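Because replay() accumulates query knobs across steps (qknobs[query].extend(knobs)) rather than replacing them, a later step only needs to carry what changed. A hypothetical second step appended to the writer in this test could look like:

```
writer.write_step(
    DBMSConfigDelta(
        indexes=IndexesDelta([]),  # no new indexes in this hypothetical step
        sysknobs=SysKnobsDelta({"work_mem": "64MB"}),
        qknobs=QueryKnobsDelta(
            {f"S{DEFAULT_TPCH_SEED}-Q1": ["set enable_nestloop = off"]}
        ),
    )
)
```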
63 | self.assertEqual(replay_data[0][1], 0) 64 | self.assertEqual(replay_data[1][1], 0) 65 | 66 | 67 | if __name__ == "__main__": 68 | unittest.main() 69 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/_build_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This helper script creates a conda environment. 3 | # You should not run this directly. Instead, use build_agent_conda_env.sh or build_gym_conda_env.sh. 4 | 5 | set -euo pipefail 6 | 7 | # 1. Checks. 8 | # 1.1. Check that conda is installed. 9 | if ! command -v conda &> /dev/null; then 10 | echo "Error: Conda is not installed" 11 | exit 1 12 | fi 13 | 14 | # 1.2. Input validation. 15 | if [ "$#" -lt 3 ]; then 16 | echo "Usage: ./_build_conda_env.sh " 17 | exit 1 18 | fi 19 | 20 | env_name=$1 21 | python_version_path=$2 22 | requirements_path=$3 23 | 24 | # 1.3. Check that the environment doesn't already exist. 25 | if conda info --envs | grep -q "^$env_name "; then 26 | echo "Error: Conda environment '$env_name' already exists" 27 | exit 1 28 | fi 29 | 30 | # 2. Set up the environment. 31 | # Note: I am intentionally not using environment.yml. I am instead using 32 | # requirements.txt and .python_version. This is for two reasons: 33 | # 1. environment.yml sets the conda env name. However, I want to enforce 34 | # that the conda env name is the same as the agent name. 35 | # 2. requirements.txt can be used by pip and only contains packages and 36 | # not any additional conda-specific syntax, making it more modular 37 | # and flexible. 38 | 39 | # 2.1. Set python_version variable. 40 | if [ -f "$python_version_path" ]; then 41 | python_version=$(cat "$python_version_path") 42 | else 43 | echo "Info: .python_version not found in $python_version_path. Using default Python 3.10." 44 | python_version="3.10" 45 | fi 46 | 47 | # 2.2. Create conda environment with specified Python version. 48 | echo "Creating conda environment '$env_name' with Python $python_version..." 49 | eval "$(conda shell.bash hook)" 50 | conda create -y -n "$env_name" python="$python_version" 51 | 52 | # 2.3. Install the packages. 53 | conda activate "$env_name" 54 | 55 | if [ -f "$requirements_path" ]; then 56 | echo "Installing pip requirements from $requirements_path..." 57 | pip install -r "$requirements_path" 58 | else 59 | echo "Info: $requirements_path not found. Skipping pip install." 60 | fi 61 | 62 | # We always install gymlib so that the agent has access to it. 63 | if [ -d "gymlib_package" ]; then 64 | echo "Installing gymlib..." 65 | # Note that I don't use -e here. When I tried -e, the editor wouldn't be able to find gymlib. 66 | pip install ./gymlib_package 67 | else 68 | echo "Error: gymlib_package directory not found in $(pwd). Please ensure you're running this script from the right folder." 69 | exit 1 70 | fi 71 | 72 | conda deactivate 73 | 74 | # 2.4. Success message. 75 | echo "Conda environment '$env_name' created successfully." 76 | echo "It is not currently activated. To activate it, run 'conda activate $env_name'." 
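This helper is normally invoked through the wrapper scripts that follow (build_agent_conda_env.sh and build_dbgym_conda_env.sh). For a hypothetical agent checked out at agents/my_agent, the wrapper call and its direct equivalent are:

```
# From the dbgym repo root:
./scripts/build_agent_conda_env.sh my_agent
# ...which boils down to:
./scripts/_build_conda_env.sh my_agent agents/my_agent/.python_version agents/my_agent/requirements.txt
```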
77 | 
-------------------------------------------------------------------------------- /scripts/_load_per_machine_envvars.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | host=$(hostname)
3 | 
4 | if [ "$host" == "dev4" ]; then
5 |     export DBDATA_PARENT_PATH=/mnt/nvme1n1/phw2/dbgym_tmp/
6 |     export INTENDED_DBDATA_HARDWARE=ssd
7 | elif [ "$host" == "dev6" ]; then
8 |     export DBDATA_PARENT_PATH=/mnt/nvme0n1/phw2/dbgym_tmp/
9 |     export INTENDED_DBDATA_HARDWARE=ssd
10 | elif [ "$host" == "patnuc" ]; then
11 |     export DBDATA_PARENT_PATH=../dbgym_workspace/tmp/
12 |     export INTENDED_DBDATA_HARDWARE=hdd
13 | else
14 |     echo "Did not recognize host \"$host\""
15 |     exit 1
16 | fi
-------------------------------------------------------------------------------- /scripts/_run_tests.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 | 
5 | if __name__ == "__main__":
6 |     loader = unittest.TestLoader()
7 |     suite = loader.discover(".", pattern=sys.argv[1])
8 |     runner = unittest.TextTestRunner()
9 |     result = runner.run(suite)
10 |     if not result.wasSuccessful():
11 |         # This is needed so that the GHA fails if the unit tests fail.
12 |         sys.exit(1)
13 | 
-------------------------------------------------------------------------------- /scripts/build_agent_conda_env.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script creates a conda environment for a specific agent.
3 | # - Name matches the agent name.
4 | # - Python version from .python_version file in the agent's folder (if exists).
5 | # - Dependencies from requirements.txt file in the agent's folder (if exists).
6 | # - gymlib is installed.
7 | #
8 | # Using this script is *optional*. If you have a more complex environment setup
9 | # for your agent, just do that manually.
10 | #
11 | # Run it from the dbgym root folder (e.g. `./scripts/build_agent_conda_env.sh <agent_name>`).
12 | #
13 | # Before running this script, the user must update the folder of the agent
14 | # they want to create a conda environment for (e.g. by calling submodule update).
15 | # There are other things the user must do as well but these are all checked
16 | # automatically by this script.
17 | 
18 | set -euo pipefail
19 | 
20 | if [ -z "${1:-}" ]; then
21 |     echo "Usage: ./build_agent_conda_env.sh <agent_name>"
22 |     exit 1
23 | fi
24 | 
25 | agent_name=$1
26 | 
27 | if [ ! -d "agents/$agent_name" ]; then
28 |     echo "Error: Agent folder '$agent_name' does not exist"
29 |     exit 1
30 | fi
31 | 
32 | ./scripts/_build_conda_env.sh "$agent_name" "agents/$agent_name/.python_version" "agents/$agent_name/requirements.txt"
-------------------------------------------------------------------------------- /scripts/build_dbgym_conda_env.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script builds the conda environment used by the gym itself (i.e. the orchestrator).
3 | # This script is optional.
You don't need to use conda if you don't want to (the CI doesn't use conda, for instance) 4 | 5 | set -euo pipefail 6 | 7 | ./scripts/_build_conda_env.sh "dbgym" "scripts/configs/.python_version" "scripts/configs/requirements.txt" 8 | -------------------------------------------------------------------------------- /scripts/check_format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | # Ignore agents/ because those are all submodules. 5 | black . --check --exclude agents 6 | isort . --profile black -c --skip agents 7 | -------------------------------------------------------------------------------- /scripts/configs/.python_version: -------------------------------------------------------------------------------- 1 | 3.10.13 -------------------------------------------------------------------------------- /scripts/configs/apt_requirements.txt: -------------------------------------------------------------------------------- 1 | bison 2 | build-essential 3 | flex 4 | libreadline-dev 5 | rpm 6 | zlib1g-dev 7 | cbindgen 8 | redis-server 9 | redis-tools -------------------------------------------------------------------------------- /scripts/configs/e2e_test_dbgym_config.yaml: -------------------------------------------------------------------------------- 1 | dbgym_workspace_path: ../dbgym_e2etest_workspace 2 | boot_redis_port: 7379 3 | ray_gcs_port: 7380 -------------------------------------------------------------------------------- /scripts/configs/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | strict = True 3 | ignore_missing_imports = True 4 | -------------------------------------------------------------------------------- /scripts/configs/requirements.txt: -------------------------------------------------------------------------------- 1 | async-timeout==5.0.1 2 | black==24.10.0 3 | cffi==1.17.1 4 | click==8.1.8 5 | cryptography==44.0.0 6 | greenlet==3.1.1 7 | isort==5.13.2 8 | mypy==1.14.0 9 | mypy-extensions==1.0.0 10 | packaging==24.2 11 | pathspec==0.12.1 12 | pglast==7.2 13 | platformdirs==4.3.6 14 | plumbum==1.9.0 15 | psutil==6.1.1 16 | psycopg==3.2.3 17 | pycparser==2.22 18 | PyYAML==6.0.2 19 | redis==5.2.1 20 | SQLAlchemy==2.0.36 21 | tomli==2.2.1 22 | types-cffi==1.16.0.20241221 23 | types-pyOpenSSL==24.1.0.20240722 24 | types-PyYAML==6.0.12.20241221 25 | types-redis==4.6.0.20241004 26 | types-setuptools==75.6.0.20241223 27 | typing_extensions==4.12.2 28 | -------------------------------------------------------------------------------- /scripts/format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # Ignore agents/ because those are all submodules. 5 | black . --exclude agents 6 | isort . --profile black --skip agents 7 | -------------------------------------------------------------------------------- /scripts/install_sysdeps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # "sysdeps" stands for "system dependencies". 3 | # These are dependencies unrelated to Python that the dbgym needs. 
4 | cat scripts/configs/apt_requirements.txt | xargs sudo apt-get install -y 5 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 6 | -------------------------------------------------------------------------------- /scripts/mypy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ignore agents/ because those are all submodules. 3 | # Ignore gymlib_package/build/ to avoid the error of mypy finding two gymlib packages. 4 | mypy --config-file scripts/configs/mypy.ini . --exclude agents/ --exclude gymlib_package/build/ -------------------------------------------------------------------------------- /scripts/pat_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | . ./scripts/_load_per_machine_envvars.sh 6 | 7 | # space for testing. uncomment this to run individual commands from the script (copy pasting is harder because there are envvars) 8 | exit 0 9 | 10 | # benchmark 11 | python3 task.py benchmark job data 12 | python3 task.py benchmark job workload --query-subset demo 13 | 14 | # postgres 15 | python3 task.py dbms postgres build 16 | python3 task.py dbms postgres dbdata job --intended-dbdata-hardware $INTENDED_DBDATA_HARDWARE --dbdata-parent-path $DBDATA_PARENT_PATH -------------------------------------------------------------------------------- /scripts/pipfreeze.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ignore gymlib because we install it manually inside _build_conda_env.sh (not from requirements.txt). 3 | pip freeze | grep -v "^gymlib @" >scripts/configs/requirements.txt -------------------------------------------------------------------------------- /scripts/quickstart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | DBMS=$1 6 | BENCHMARK=$2 7 | SCALE_FACTOR=$3 8 | AGENT=$4 9 | 10 | # Benchmark 11 | python3 task.py benchmark $BENCHMARK data $SCALE_FACTOR 12 | python3 task.py benchmark $BENCHMARK workload --scale-factor $SCALE_FACTOR 13 | 14 | # DBMS 15 | python3 task.py dbms $DBMS build 16 | python3 task.py dbms $DBMS dbdata tpch --scale-factor $SCALE_FACTOR 17 | 18 | # Tune 19 | python3 task.py tune $AGENT embedding datagen tpch --scale-factor $SCALE_FACTOR --override-sample-limits "lineitem,32768" # long datagen so that train doesn't crash 20 | python3 task.py tune $AGENT embedding train tpch --scale-factor $SCALE_FACTOR --iterations-per-epoch 1 --num-points-to-sample 1 --num-batches 1 --batch-size 64 --start-epoch 15 --num-samples 4 --train-max-concurrent 4 --num-curate 2 21 | python3 task.py tune $AGENT agent hpo tpch --scale-factor $SCALE_FACTOR --num-samples 2 --max-concurrent 2 --workload-timeout 15 --query-timeout 1 --tune-duration-during-hpo 0.01 22 | python3 task.py tune $AGENT agent tune tpch --scale-factor $SCALE_FACTOR 23 | -------------------------------------------------------------------------------- /scripts/run_integration_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m scripts._run_tests "integtest_*.py" -------------------------------------------------------------------------------- /scripts/run_unit_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m scripts._run_tests "unittest_*.py" 3 | 
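Both test wrappers above just hand a filename pattern to scripts/_run_tests.py, which uses unittest discovery. To run a single test module instead (module path shown as an example), plain unittest works from the repo root:

```
python -m unittest orchestrate.tests.unittest_clean
```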
-------------------------------------------------------------------------------- /task.py: -------------------------------------------------------------------------------- 1 | import click 2 | from gymlib.workspace import make_standard_dbgym_workspace 3 | 4 | from benchmark.cli import benchmark_group 5 | from dbms.cli import dbms_group 6 | from orchestrate.cli import manage_group 7 | 8 | # TODO(phw2): Save commit, git diff, and run command. 9 | # TODO(phw2): Remove write permissions on old run_*/ dirs to enforce that they are immutable. 10 | # TODO(phw2): Rename run_*/ to the command used (e.g. tune_protox_*/). 11 | 12 | 13 | @click.group() 14 | @click.pass_context 15 | def task(ctx: click.Context) -> None: 16 | """🛢️ CMU-DB Database Gym: github.com/cmu-db/dbgym 🏋️""" 17 | dbgym_workspace = make_standard_dbgym_workspace() 18 | ctx.obj = dbgym_workspace 19 | 20 | 21 | if __name__ == "__main__": 22 | task.add_command(benchmark_group) 23 | task.add_command(manage_group) 24 | task.add_command(dbms_group) 25 | task() 26 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cmu-db/dbgym/1994c6f0de557fae2d03781b1aa85f8ea43d8dde/util/__init__.py -------------------------------------------------------------------------------- /util/shell.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import subprocess 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | 8 | def subprocess_run( 9 | c: str, 10 | cwd: Optional[Path] = None, 11 | check_returncode: bool = True, 12 | verbose: bool = True, 13 | ) -> subprocess.Popen[str]: 14 | """ 15 | We use this instead of subprocess.run() because of the cwd option. 16 | """ 17 | cwd_msg = f"(cwd: {cwd if cwd is not None else os.getcwd()})" 18 | 19 | if verbose: 20 | logging.info(f"Running {cwd_msg}: {c}") 21 | 22 | with subprocess.Popen( 23 | c, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.STDOUT, 26 | shell=True, 27 | cwd=cwd, 28 | text=True, 29 | bufsize=0, 30 | ) as proc: 31 | while True: 32 | loop = proc.poll() is None 33 | assert proc.stdout is not None 34 | for line in proc.stdout: 35 | if verbose: 36 | logging.info(line) 37 | if not loop: 38 | break 39 | if check_returncode and proc.returncode != 0: 40 | raise RuntimeError(f"Non-zero returncode {proc.returncode} for: {c}") 41 | 42 | return proc 43 | --------------------------------------------------------------------------------
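A short usage sketch for subprocess_run() (the command and cwd are illustrative):

```
from pathlib import Path

from util.shell import subprocess_run

# Streams the command's output through logging.info and raises RuntimeError
# on a non-zero exit code (because check_returncode defaults to True).
proc = subprocess_run("make clean && make", cwd=Path("/path/to/some/build/dir"))
print(proc.returncode)  # 0 on success
```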