├── docs
    ├── .gitignore
    ├── src
    │   ├── chapter_1.md
    │   ├── tutorial
    │   │   ├── 1_bao_setup.md
    │   │   ├── 6_final_notes.md
    │   │   ├── 4_2_analyze_workload_regblock.md
    │   │   ├── 3_example_workload.md
    │   │   ├── 3_2_analyze_workload.md
    │   │   ├── 2_bao_setup.md
    │   │   ├── 1_pg_setup.md
    │   │   ├── 4_exploration_mode.md
    │   │   └── 5_advisor.md
    │   ├── pg_vars.md
    │   ├── SUMMARY.md
    │   ├── tutorial.md
    │   ├── bao_vars.md
    │   ├── pg_vars_table.html
    │   └── introduction.md
    └── book.toml
├── bao_server
    ├── TreeConvolution
    │   ├── test
    │   │   ├── __init__.py
    │   │   ├── test_tree_conv.py
    │   │   └── test_utils.py
    │   ├── .gitignore
    │   ├── example.png
    │   ├── tcnn.py
    │   ├── example.py
    │   └── util.py
    ├── common.py
    ├── .gitignore
    ├── constants.py
    ├── config.py
    ├── bao.cfg
    ├── net.py
    ├── train.py
    ├── baoctl.py
    ├── storage.py
    ├── main.py
    ├── model.py
    ├── featurize.py
    └── reg_blocker.py
├── pg_extension
    ├── .gitignore
    ├── pg_bao--0.0.1.sql
    ├── pg_bao.control
    ├── Makefile
    ├── bao_configs.h
    ├── compile_commands.json
    ├── bao_bufferstate.h
    ├── bao_util.h
    ├── main.c
    └── bao_planner.h
├── .gitignore
├── branding
    ├── bao_loves_pg.odg
    ├── bao_loves_pg.pdf
    └── bao_loves_pg-crop.pdf
├── .drone.yml
├── sample_queries
    ├── q24_32a.sql
    ├── q27_3c.sql
    ├── q29_6e.sql
    ├── q11_17e.sql
    ├── q12_17a.sql
    ├── q6_16b.sql
    ├── q15_18a.sql
    ├── q25_13d.sql
    ├── q28_13a.sql
    ├── q30_18c.sql
    ├── q23_19d.sql
    ├── q35_1a1508.sql
    ├── q34_1a275.sql
    ├── q26_2a274.sql
    ├── q31_2a39.sql
    ├── q33_2a156.sql
    ├── q5_8a423.sql
    ├── q16_26c.sql
    ├── q22_8a27.sql
    ├── q10_2a265.sql
    ├── q9_5a48.sql
    ├── q21_2a396.sql
    ├── q4_8a122.sql
    ├── q39_2a2781.sql
    ├── q19_2a471.sql
    ├── q20_24b.sql
    ├── q2_8a82.sql
    ├── q38_2a1870.sql
    ├── q40_2a8120.sql
    ├── q37_2a1291.sql
    ├── q1_8a463.sql
    ├── q32_2a493.sql
    ├── q13_7a121.sql
    ├── q17_7a164.sql
    ├── q3_7a99.sql
    ├── q7_7a48.sql
    ├── q8_6a505.sql
    ├── q14_6a349.sql
    ├── q18_7a103.sql
    └── q36_7a136.sql
├── run_queries.py
├── README.md
└── analyze_bao.ipynb
/docs/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /docs/src/chapter_1.md: -------------------------------------------------------------------------------- 1 | # Chapter 1 2 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pg_extension/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.bc -------------------------------------------------------------------------------- /pg_extension/pg_bao--0.0.1.sql: -------------------------------------------------------------------------------- 1 | -- nothing to do 2 | -------------------------------------------------------------------------------- /docs/src/tutorial/1_bao_setup.md: -------------------------------------------------------------------------------- 1 | # Bao Server Setup 2 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | __pycache__ -------------------------------------------------------------------------------- /bao_server/common.py: -------------------------------------------------------------------------------- 1 | 2 | class BaoException(Exception): 3 | pass 4 |
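`common.py` above defines `BaoException`, the server's shared error type. A minimal usage sketch (the `parse_reward` helper below is hypothetical, not part of the repo) of how a `bao_server` module can raise it so that callers can tell Bao-specific failures apart from generic errors:

```python
# Hypothetical sketch -- parse_reward is illustrative, not repo code.
# Raising BaoException lets callers catch Bao-specific failures
# separately from unrelated exceptions.
from common import BaoException  # assumes the bao_server/ directory is on sys.path

def parse_reward(raw):
    try:
        return float(raw)
    except ValueError:
        raise BaoException(f"could not parse reward value: {raw!r}")

try:
    parse_reward("not-a-number")
except BaoException as exc:
    print(f"Bao error: {exc}")
```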
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.bc 3 | *.o 4 | *.so 5 | *\#* 6 | .clangd 7 | .ipynb* 8 | -------------------------------------------------------------------------------- /bao_server/.gitignore: -------------------------------------------------------------------------------- 1 | bao_*_model 2 | bao.db 3 | __pycache__ 4 | *.txt 5 | tmp_model 6 | -------------------------------------------------------------------------------- /branding/bao_loves_pg.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learnedsystems/BaoForPostgreSQL/HEAD/branding/bao_loves_pg.odg -------------------------------------------------------------------------------- /branding/bao_loves_pg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learnedsystems/BaoForPostgreSQL/HEAD/branding/bao_loves_pg.pdf -------------------------------------------------------------------------------- /pg_extension/pg_bao.control: -------------------------------------------------------------------------------- 1 | comment = 'PostgreSQL Bao Prototype' 2 | default_version = '0.0.1' 3 | relocatable = true 4 | -------------------------------------------------------------------------------- /branding/bao_loves_pg-crop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learnedsystems/BaoForPostgreSQL/HEAD/branding/bao_loves_pg-crop.pdf -------------------------------------------------------------------------------- /bao_server/TreeConvolution/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/learnedsystems/BaoForPostgreSQL/HEAD/bao_server/TreeConvolution/example.png -------------------------------------------------------------------------------- /docs/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Ryan Marcus"] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "Bao for PostgreSQL" 7 | -------------------------------------------------------------------------------- /bao_server/constants.py: -------------------------------------------------------------------------------- 1 | PG_OPTIMIZER_INDEX = 0 2 | DEFAULT_MODEL_PATH = "bao_default_model" 3 | TMP_MODEL_PATH = "bao_tmp_model" 4 | OLD_MODEL_PATH = "bao_previous_model" 5 | -------------------------------------------------------------------------------- /pg_extension/Makefile: -------------------------------------------------------------------------------- 1 | EXTENSION = pg_bao 2 | MODULE_big = pg_bao 3 | DATA = pg_bao--0.0.1.sql 4 | OBJS = main.o 5 | PG_CONFIG = pg_config 6 | PGXS := $(shell $(PG_CONFIG) --pgxs) 7 | #SHLIB_LINK = -lsqlite3 8 | include $(PGXS) 9 | -------------------------------------------------------------------------------- /.drone.yml: -------------------------------------------------------------------------------- 1 | kind: pipeline 2 | type: docker 3 | name: default 4 | 5 | steps: 6 | - name: build 7 | image: ryanmarcus/pg_bao:v0 8 | commands: 9 | - cargo install mdbook 10 | - /root/.cargo/bin/mdbook build docs 11 | - cd pg_extension 12 | - make USE_PGXS=1 install 13 | 14 | 
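The three paths in `constants.py` above suggest a safe swap pattern for retraining: train into a temporary location, keep the current model as a rollback copy, and only then promote the new model (the final notes later in this dump mention that Bao always saves the previous model). A minimal sketch of that idea, assuming the paths are directories; `promote_tmp_model` is illustrative, not the repo's actual promotion logic:

```python
# Illustrative sketch only -- not the repo's actual promotion code.
import os
import shutil

from constants import DEFAULT_MODEL_PATH, TMP_MODEL_PATH, OLD_MODEL_PATH

def promote_tmp_model():
    """Promote a freshly trained model, keeping the old one as a backup."""
    if os.path.exists(OLD_MODEL_PATH):
        shutil.rmtree(OLD_MODEL_PATH)                     # drop the stale backup
    if os.path.exists(DEFAULT_MODEL_PATH):
        shutil.move(DEFAULT_MODEL_PATH, OLD_MODEL_PATH)   # keep a rollback copy
    shutil.move(TMP_MODEL_PATH, DEFAULT_MODEL_PATH)       # new model goes live
```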
-------------------------------------------------------------------------------- /bao_server/config.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | 3 | def read_config(): 4 | config = configparser.ConfigParser() 5 | config.read("bao.cfg") 6 | 7 | if "bao" not in config: 8 | print("bao.cfg does not have a [bao] section.") 9 | exit(-1) 10 | 11 | config = config["bao"] 12 | return config 13 | -------------------------------------------------------------------------------- /docs/src/pg_vars.md: -------------------------------------------------------------------------------- 1 | # PostgreSQL Configuration Variables 2 | 3 | Bao can be configured through a number of *session level* PostgreSQL configuration variables. These variables are set to their default values every time you open a session (e.g., `psql` session or a connection from an application). 4 | 5 | {{#include pg_vars_table.html}} 6 | -------------------------------------------------------------------------------- /sample_queries/q24_32a.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(lt.link) AS link_type, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM keyword AS k, link_type AS lt, movie_keyword AS mk, movie_link AS ml, title AS t1, title AS t2 WHERE k.keyword ='10,000-mile-club' AND mk.keyword_id = k.id AND t1.id = mk.movie_id AND ml.movie_id = t1.id AND ml.linked_movie_id = t2.id AND lt.id = ml.link_type_id AND mk.movie_id = t1.id; 2 | -------------------------------------------------------------------------------- /sample_queries/q27_3c.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND t.production_year > 1990 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q29_6e.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q11_17e.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q12_17a.sql: 
-------------------------------------------------------------------------------- 1 | SELECT MIN(n.name) AS member_in_charnamed_american_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND n.name LIKE 'B%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; 2 | -------------------------------------------------------------------------------- /pg_extension/bao_configs.h: -------------------------------------------------------------------------------- 1 | #ifndef BAO_CONFIGS_H 2 | #define BAO_CONFIGS_H 3 | 4 | #include "c.h" 5 | 6 | #define BAO_MAX_ARMS 26 7 | 8 | // Each Bao config variable is linked to a PostgreSQL session variable. 9 | // See the string docs provided to the PG functions in main.c. 10 | static bool enable_bao = false; 11 | static bool enable_bao_rewards = false; 12 | static bool enable_bao_selection = false; 13 | static char* bao_host = NULL; 14 | static int bao_port = 9381; 15 | static int bao_num_arms = 5; 16 | static bool bao_include_json_in_explain = false; 17 | #endif 18 | -------------------------------------------------------------------------------- /sample_queries/q6_16b.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q15_18a.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(producer)', '(executive producer)') AND it1.info = 'budget' AND it2.info = 'votes' AND n.gender = 'm' and n.name like '%Tim%' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q25_13d.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND 
kt.kind ='movie' AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q28_13a.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(mi.info) AS release_date, MIN(miidx.info) AS rating, MIN(t.title) AS german_movie FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[de]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q30_18c.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; 2 | -------------------------------------------------------------------------------- /docs/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | - [Introduction](./introduction.md) 4 | - [Tutorial](./tutorial.md) 5 | - [PostgreSQL Setup](tutorial/1_pg_setup.md) 6 | - [Bao Server Setup](tutorial/2_bao_setup.md) 7 | - [Run an Example Workload](tutorial/3_example_workload.md) 8 | - [Analyze the Results](tutorial/3_2_analyze_workload.md) 9 | - [Use Exploration Mode](tutorial/4_exploration_mode.md) 10 | - [Analyze the Results with Exploration Mode](tutorial/4_2_analyze_workload_regblock.md) 11 | - [Use Bao as an Advisor](tutorial/5_advisor.md) 12 | - [Final Notes](tutorial/6_final_notes.md) 13 | - [PostgreSQL Configuration Variables](./pg_vars.md) 14 | - [Bao Server Configuration Variables](./bao_vars.md) 15 | -------------------------------------------------------------------------------- /sample_queries/q23_19d.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS jap_engl_voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) 
(uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND n.gender ='f' AND rt.role ='actress' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id; 2 | -------------------------------------------------------------------------------- /docs/src/tutorial.md: -------------------------------------------------------------------------------- 1 | # Bao Tutorial 2 | 3 | In this tutorial, we'll set up Bao with PostgreSQL, load the [IMDB dataset](https://git.io/imdb), and then execute a small sample workload with and without Bao to see the difference. Then, we'll look at how Bao's exploration mode can be used to prevent query regressions. Finally, we will look at how Bao can be used as a query advisor. 4 | 5 | 1. [Set up PostgreSQL with an example DB and the Bao extension](./tutorial/1_pg_setup.md) 6 | 2. [Start the Bao server](./tutorial/2_bao_setup.md) 7 | 3. [Run an example workload, and compare performance with PostgreSQL's optimizer](./tutorial/3_example_workload.md) 8 | 4. [Use exploration mode to eliminate query regressions](./tutorial/4_exploration_mode.md) 9 | 5. [Use Bao as an advisor](./tutorial/5_advisor.md) 10 | 6. [Notes about this tutorial](./tutorial/6_final_notes.md) 11 | -------------------------------------------------------------------------------- /bao_server/bao.cfg: -------------------------------------------------------------------------------- 1 | [bao] 2 | # ============================================================== 3 | # BAO SERVER SETTINGS 4 | # ============================================================== 5 | 6 | # port to listen on. Note that the corresponding PostgreSQL 7 | # variable, bao_port, must be set to match. 8 | Port = 9381 9 | 10 | # network address to listen on. If not localhost, don't forget 11 | # to set the PostgreSQL bao_host variable. 12 | ListenOn = localhost 13 | 14 | # ============================================================== 15 | # EXPLORATION MODE SETTINGS 16 | # ============================================================== 17 | 18 | # maximum time a query should reasonably take (used in 19 | # exploration mode).
20 | MaxQueryTimeSeconds = 120 21 | 22 | # psycopg2 / JDBC connection string to access PostgreSQL 23 | # (used by the experiment runner to prevent regressions) 24 | PostgreSQLConnectString = user=imdb 25 | -------------------------------------------------------------------------------- /sample_queries/q35_1a1508.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | movie_info as mi1, 4 | info_type as it1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n 10 | WHERE 11 | t.id = ci.movie_id 12 | AND t.id = mi1.movie_id 13 | AND t.id = mi2.movie_id 14 | AND mi1.movie_id = mi2.movie_id 15 | AND mi1.info_type_id = it1.id 16 | AND mi2.info_type_id = it2.id 17 | AND it1.id = '3' 18 | AND it2.id = '7' 19 | AND t.kind_id = kt.id 20 | AND ci.person_id = n.id 21 | AND ci.role_id = rt.id 22 | AND mi1.info IN ('Comedy','Crime','Fantasy','Mystery','Short') 23 | AND mi2.info IN ('LAB:FotoKem Laboratory, Burbank (CA), USA','MET:','MET:300 m','PCS:Spherical','RAT:1.33 : 1','RAT:1.66 : 1') 24 | AND kt.kind IN ('episode','movie','video movie') 25 | AND rt.role IN ('miscellaneous crew') 26 | AND n.gender IN ('f') 27 | AND t.production_year <= 2015 28 | AND 1925 < t.production_year 29 | -------------------------------------------------------------------------------- /sample_queries/q34_1a275.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | movie_info as mi1, 4 | info_type as it1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n 10 | WHERE 11 | t.id = ci.movie_id 12 | AND t.id = mi1.movie_id 13 | AND t.id = mi2.movie_id 14 | AND mi1.movie_id = mi2.movie_id 15 | AND mi1.info_type_id = it1.id 16 | AND mi2.info_type_id = it2.id 17 | AND it1.id = '8' 18 | AND it2.id = '4' 19 | AND t.kind_id = kt.id 20 | AND ci.person_id = n.id 21 | AND ci.role_id = rt.id 22 | AND mi1.info IN ('Brazil','India','Ireland','Italy','Netherlands','Philippines','Poland','USA') 23 | AND mi2.info IN ('English','French','Italian','Malayalam','Polish','Portuguese','Tagalog') 24 | AND kt.kind IN ('tv movie','tv series','video game') 25 | AND rt.role IN ('cinematographer','composer') 26 | AND n.gender IN ('m') 27 | AND t.production_year <= 2015 28 | AND 1990 < t.production_year 29 | -------------------------------------------------------------------------------- /sample_queries/q26_2a274.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('8')) 22 | AND (it2.id in ('2')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('Austria','Belgium','Brazil','Denmark','France','Mexico','Spain','UK','USA','Yugoslavia')) 27 | AND (mi2.info in ('Black and White','Color')) 28 | AND (kt.kind in ('episode','movie','tv movie')) 29 | AND (rt.role in 
('editor','writer')) 30 | AND (n.gender in ('f','m')) 31 | AND (t.production_year <= 1975) 32 | AND (t.production_year >= 1875) 33 | -------------------------------------------------------------------------------- /sample_queries/q31_2a39.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('8')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('MET:600 m','OFM:35 mm','PCS:Spherical','PFM:35 mm','RAT:1.37 : 1')) 27 | AND (mi2.info in ('France','Germany','Japan','Mexico','Portugal','Spain','UK','USA')) 28 | AND (kt.kind in ('tv series','video game','video movie')) 29 | AND (rt.role in ('producer')) 30 | AND (n.gender in ('m') OR n.gender IS NULL) 31 | AND (t.production_year <= 1975) 32 | AND (t.production_year >= 1925) 33 | -------------------------------------------------------------------------------- /sample_queries/q33_2a156.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('3')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('PCS:Spherical','PFM:16 mm','PFM:35 mm','RAT:1.33 : 1','RAT:1.66 : 1','RAT:1.85 : 1')) 27 | AND (mi2.info in ('Adult','Comedy','Documentary','Drama','Mystery','Romance','Short','Thriller','Western')) 28 | AND (kt.kind in ('tv series','video game','video movie')) 29 | AND (rt.role in ('miscellaneous crew','producer')) 30 | AND (n.gender in ('f','m')) 31 | AND (t.production_year <= 1990) 32 | AND (t.production_year >= 1950) 33 | -------------------------------------------------------------------------------- /sample_queries/q5_8a423.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | cast_info as ci, 6 | role_type as rt, 7 | name as n, 8 | movie_keyword as mk, 9 | keyword as k, 10 | movie_companies as mc, 11 | company_type as ct, 12 | company_name as cn 13 | WHERE 14 | t.id = ci.movie_id 15 | AND t.id = mc.movie_id 16 | AND t.id = mi1.movie_id 17 | AND t.id = mk.movie_id 18 | AND mc.company_type_id = ct.id 19 | AND mc.company_id = cn.id 20 | AND k.id = mk.keyword_id 21 | AND mi1.info_type_id = it1.id 22 | AND t.kind_id = kt.id 23 | AND ci.person_id = n.id 24 | AND ci.role_id = rt.id 25 | AND (it1.id IN ('2')) 26 | AND (mi1.info in ('Color')) 27 | AND 
(kt.kind in ('movie','tv series')) 28 | AND (rt.role in ('actor','producer')) 29 | AND (n.gender in ('m')) 30 | AND (n.name_pcode_cf in ('B6261','B6526','J5252','M6352','R363','S3152','S5362')) 31 | AND (t.production_year <= 1990) 32 | AND (t.production_year >= 1950) 33 | AND (cn.name in ('American Broadcasting Company (ABC)','Columbia Broadcasting System (CBS)','National Broadcasting Company (NBC)')) 34 | AND (ct.kind in ('distributors')) 35 | -------------------------------------------------------------------------------- /sample_queries/q16_26c.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; 2 | -------------------------------------------------------------------------------- /docs/src/bao_vars.md: -------------------------------------------------------------------------------- 1 | # Bao Server Configuration Variables 2 | 3 | The Bao server is configured through the `bao.cfg` file in the `bao_server` directory. The default configuration file, reproduced below, contains a description of each variable. 4 | 5 | ```ini 6 | [bao] 7 | # ============================================================== 8 | # BAO SERVER SETTINGS 9 | # ============================================================== 10 | 11 | # port to listen on. Note that the corresponding PostgreSQL 12 | # variable, bao_port, must be set to match. 13 | Port = 9381 14 | 15 | # network address to listen on. If not localhost, don't forget 16 | # to set the PostgreSQL bao_host variable. 17 | ListenOn = localhost 18 | 19 | # ============================================================== 20 | # EXPLORATION MODE SETTINGS 21 | # ============================================================== 22 | 23 | # maximum time a query should reasonably take (used in 24 | # exploration mode). 25 | MaxQueryTimeSeconds = 120 26 | 27 | # psycopg2 / JDBC connection string to access PostgreSQL 28 | # (used by the experiment runner to prevent regressions) 29 | PostgreSQLConnectString = user=imdb 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/src/tutorial/6_final_notes.md: -------------------------------------------------------------------------------- 1 | # Final Notes 2 | 3 | This tutorial was intended to give you an idea of what Bao could do and how the various components of Bao work.
However, please note that the gains we saw in this tutorial might not be reflected in your real workload. Here are a few things to keep in mind: 4 | 5 | * This workload was artificially constructed to represent a mixture of queries where the PostgreSQL optimizer found the best plan, and where PostgreSQL found a terrible plan. Most queries fall somewhere in between these two extremes. Check out [the Bao paper](https://rm.cab/bao) for a more detailed analysis of more realistic workloads. 6 | * Bao's query optimizer has overhead. For queries that run very quickly (under 500ms), Bao is unlikely to be helpful, and may just slow down these queries with additional optimization time. 7 | * Bao's value model needs to be retrained and verified, beyond what exploration mode could do. Most of the time, the verification currently done by Bao is sufficient. If training goes awry, Bao always saves the previous model. Training could easily be configured with a `cron` job. We're currently working on more advanced mechanisms for this, but currently it has to be done manually. 8 | -------------------------------------------------------------------------------- /sample_queries/q22_8a27.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | cast_info as ci, 6 | role_type as rt, 7 | name as n, 8 | movie_keyword as mk, 9 | keyword as k, 10 | movie_companies as mc, 11 | company_type as ct, 12 | company_name as cn 13 | WHERE 14 | t.id = ci.movie_id 15 | AND t.id = mc.movie_id 16 | AND t.id = mi1.movie_id 17 | AND t.id = mk.movie_id 18 | AND mc.company_type_id = ct.id 19 | AND mc.company_id = cn.id 20 | AND k.id = mk.keyword_id 21 | AND mi1.info_type_id = it1.id 22 | AND t.kind_id = kt.id 23 | AND ci.person_id = n.id 24 | AND ci.role_id = rt.id 25 | AND (it1.id IN ('3')) 26 | AND (mi1.info in ('Adventure','Animation','Crime','Drama')) 27 | AND (kt.kind in ('movie')) 28 | AND (rt.role in ('actor','actress')) 29 | AND (n.gender in ('f','m')) 30 | AND (n.surname_pcode in ('C4','C62','C632','D5','F6','F63','G63','H2','L5','M245','S','S6')) 31 | AND (t.production_year <= 1975) 32 | AND (t.production_year >= 1875) 33 | AND (cn.name in ('Columbia Broadcasting System (CBS)','Metro-Goldwyn-Mayer (MGM)','Paramount Pictures','Pathé Frères','Universal Pictures','Warner Home Video')) 34 | AND (ct.kind in ('distributors','production companies')) 35 | -------------------------------------------------------------------------------- /sample_queries/q10_2a265.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('8')) 22 | AND (it2.id in ('6')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('Austria','Belgium','Brazil','Hungary','India','Mexico','Poland','Spain')) 27 | AND (mi2.info in ('Mono','Silent')) 28 | AND (kt.kind in ('episode','movie','tv movie')) 29 | AND (rt.role in ('costume
designer','production designer')) 30 | AND (n.gender IS NULL) 31 | AND (t.production_year <= 1975) 32 | AND (t.production_year >= 1875) 33 | AND (k.keyword IN ('based-on-play','cigarette-smoking','friendship','independent-film','jealousy','lesbian-sex','male-nudity','marriage','mother-daughter-relationship','one-word-title','oral-sex','police','singing','song')) 34 | -------------------------------------------------------------------------------- /sample_queries/q9_5a48.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = mi1.movie_id 14 | AND t.id = mii1.movie_id 15 | AND t.id = mii2.movie_id 16 | AND t.id = mk.movie_id 17 | AND mii2.movie_id = mii1.movie_id 18 | AND mi1.movie_id = mii1.movie_id 19 | AND mk.movie_id = mi1.movie_id 20 | AND mk.keyword_id = k.id 21 | AND mi1.info_type_id = it1.id 22 | AND mii1.info_type_id = it3.id 23 | AND mii2.info_type_id = it4.id 24 | AND t.kind_id = kt.id 25 | AND (kt.kind IN ('episode','movie')) 26 | AND (t.production_year <= 1975) 27 | AND (t.production_year >= 1925) 28 | AND (mi1.info IN ('PCS:Spherical','PFM:35 mm','RAT:1.33 : 1','RAT:1.37 : 1')) 29 | AND (it1.id IN ('1','16','7')) 30 | AND it3.id = '100' 31 | AND it4.id = '101' 32 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 7.0) 33 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 3.0 <= mii2.info::float) 34 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 35 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 1000.0) 36 | -------------------------------------------------------------------------------- /sample_queries/q21_2a396.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('5')) 22 | AND (it2.id in ('7')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('Argentina:Atp','Canada:G','Iceland:L','UK:X','USA:X')) 27 | AND (mi2.info in ('OFM:35 mm','PFM:35 mm','RAT:1.33 : 1')) 28 | AND (kt.kind in ('tv series','video movie')) 29 | AND (rt.role in ('director','producer')) 30 | AND (n.gender IS NULL) 31 | AND (t.production_year <= 1975) 32 | AND (t.production_year >= 1875) 33 | AND (k.keyword IN ('based-on-play','dog','family-relationships','father-son-relationship','female-nudity','hardcore','husband-wife-relationship','independent-film','lesbian-sex','love','marriage','mother-daughter-relationship','nudity','one-word-title','police','sequel','sex','singer')) 34 | -------------------------------------------------------------------------------- /sample_queries/q4_8a122.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 
3 | info_type as it1, 4 | movie_info as mi1, 5 | cast_info as ci, 6 | role_type as rt, 7 | name as n, 8 | movie_keyword as mk, 9 | keyword as k, 10 | movie_companies as mc, 11 | company_type as ct, 12 | company_name as cn 13 | WHERE 14 | t.id = ci.movie_id 15 | AND t.id = mc.movie_id 16 | AND t.id = mi1.movie_id 17 | AND t.id = mk.movie_id 18 | AND mc.company_type_id = ct.id 19 | AND mc.company_id = cn.id 20 | AND k.id = mk.keyword_id 21 | AND mi1.info_type_id = it1.id 22 | AND t.kind_id = kt.id 23 | AND ci.person_id = n.id 24 | AND ci.role_id = rt.id 25 | AND (it1.id IN ('2')) 26 | AND (mi1.info in ('Black and White','Color')) 27 | AND (kt.kind in ('movie','tv movie','tv series')) 28 | AND (rt.role in ('actor','composer','miscellaneous crew','producer','production designer')) 29 | AND (n.gender in ('m') OR n.gender IS NULL) 30 | AND (n.name_pcode_cf in ('A2365','A6252','C52','D1614','E1524','E2163','L1214','L2','P5215','Q5325','R2425','S1452','T5212','V4524','V4626')) 31 | AND (t.production_year <= 2015) 32 | AND (t.production_year >= 1990) 33 | AND (cn.name in ('ABS-CBN','American Broadcasting Company (ABC)','British Broadcasting Corporation (BBC)')) 34 | AND (ct.kind in ('distributors','production companies')) 35 | -------------------------------------------------------------------------------- /sample_queries/q39_2a2781.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('8')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('CAM:Panavision Cameras and Lenses','OFM:16 mm','OFM:35 mm','OFM:Video','PCS:Spherical','PFM:35 mm','RAT:1.33 : 1','RAT:1.37 : 1','RAT:1.66 : 1','RAT:1.78 : 1','RAT:2.35 : 1','RAT:4:3')) 27 | AND (mi2.info in ('East Germany','Hong Kong','Italy','Taiwan','UK','USA','West Germany')) 28 | AND (kt.kind in ('episode','movie')) 29 | AND (rt.role in ('production designer')) 30 | AND (n.gender in ('f')) 31 | AND (t.production_year <= 2010) 32 | AND (t.production_year >= 1950) 33 | AND (k.keyword IN ('father-son-relationship','combat','gay','independent-film','lesbian-sex','mother-daughter-relationship','murder','number-in-title')) 34 | -------------------------------------------------------------------------------- /sample_queries/q19_2a471.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('8')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND 
ci.role_id = rt.id 26 | AND (mi1.info in ('CAM:Panavision Cameras and Lenses','OFM:16 mm','OFM:35 mm','OFM:Video','PCS:Spherical','PFM:35 mm','RAT:1.33 : 1','RAT:1.37 : 1','RAT:1.66 : 1','RAT:1.78 : 1','RAT:2.35 : 1','RAT:4:3')) 27 | AND (mi2.info in ('East Germany','Hong Kong','Italy','Taiwan','UK','USA','West Germany')) 28 | AND (kt.kind in ('episode','movie')) 29 | AND (rt.role in ('production designer')) 30 | AND (n.gender in ('f')) 31 | AND (t.production_year <= 2010) 32 | AND (t.production_year >= 1950) 33 | AND (k.keyword IN ('death','father-son-relationship','fight','gay','independent-film','lesbian-sex','mother-daughter-relationship','murder','number-in-title')) 34 | -------------------------------------------------------------------------------- /sample_queries/q20_24b.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress_name, MIN(t.title) AS kung_fu_panda FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND cn.name = 'DreamWorks Animation' AND it.info = 'release dates' AND k.keyword in ('hero', 'martial-arts', 'hand-to-hand-combat', 'computer-animated-movie') AND mi.info is not null and (mi.info like 'Japan:%201%' or mi.info like 'USA:%201%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2010 AND t.title like 'Kung Fu Panda%' AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND ci.movie_id = mk.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND k.id = mk.keyword_id; 2 | -------------------------------------------------------------------------------- /sample_queries/q2_8a82.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | cast_info as ci, 6 | role_type as rt, 7 | name as n, 8 | movie_keyword as mk, 9 | keyword as k, 10 | movie_companies as mc, 11 | company_type as ct, 12 | company_name as cn 13 | WHERE 14 | t.id = ci.movie_id 15 | AND t.id = mc.movie_id 16 | AND t.id = mi1.movie_id 17 | AND t.id = mk.movie_id 18 | AND mc.company_type_id = ct.id 19 | AND mc.company_id = cn.id 20 | AND k.id = mk.keyword_id 21 | AND mi1.info_type_id = it1.id 22 | AND t.kind_id = kt.id 23 | AND ci.person_id = n.id 24 | AND ci.role_id = rt.id 25 | AND (it1.id IN ('2')) 26 | AND (mi1.info in ('Color')) 27 | AND (kt.kind in ('movie','tv movie','tv series')) 28 | AND (rt.role in ('actor','writer')) 29 | AND (n.gender in ('m')) 30 | AND (n.surname_pcode in ('B4','B6','C462','D12','D25','G65','H65','J525','K5','P6','P62','R3','S5') OR n.surname_pcode IS NULL) 31 | AND (t.production_year <= 2015) 32 | AND (t.production_year >= 1925) 33 | AND (cn.name in ('Columbia Broadcasting System (CBS)','Fox Network','Independent Television (ITV)','Metro-Goldwyn-Mayer (MGM)','National Broadcasting 
Company (NBC)','Shout! Factory','Universal Pictures','Universal TV','Warner Bros')) 34 | AND (ct.kind in ('distributors','production companies')) 35 | -------------------------------------------------------------------------------- /sample_queries/q38_2a1870.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('8')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('CAM:Panavision Cameras and Lenses','OFM:16 mm','OFM:35 mm','OFM:Video','PCS:Spherical','PFM:35 mm','RAT:1.33 : 1','RAT:1.37 : 1','RAT:1.66 : 1','RAT:1.78 : 1','RAT:2.35 : 1','RAT:4:3')) 27 | AND (mi2.info in ('East Germany','Hong Kong','Italy','Taiwan','UK','USA','West Germany')) 28 | AND (kt.kind in ('episode','movie')) 29 | AND (rt.role in ('production designer')) 30 | AND (n.gender in ('f')) 31 | AND (t.production_year <= 2011) 32 | AND (t.production_year >= 1949) 33 | AND (k.keyword IN ('death','father-son-relationship','bruce','gay','independent-film','lesbian-sex','mother-daughter-relationship','murder','number-in-title')) 34 | -------------------------------------------------------------------------------- /sample_queries/q40_2a8120.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('8')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('CAM:Panavision Cameras and Lenses','OFM:16 mm','OFM:35 mm','OFM:Video','PCS:Spherical','PFM:35 mm','RAT:1.33 : 1','RAT:1.37 : 1','RAT:1.66 : 1','RAT:1.78 : 1','RAT:2.35 : 1','RAT:4:3')) 27 | AND (mi2.info in ('East Germany','Hong Kong','Italy','Taiwan','UK','USA','West Germany')) 28 | AND (kt.kind in ('episode','movie')) 29 | AND (rt.role in ('production designer')) 30 | AND (n.gender in ('m')) 31 | AND (t.production_year <= 2008) 32 | AND (t.production_year >= 1952) 33 | AND (k.keyword IN ('death','father-son-relationship','fight','gay','independent-film','lesbian-sex','mother-daughter-relationship','murder','number-in-title')) 34 | -------------------------------------------------------------------------------- /sample_queries/q37_2a1291.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name 
as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('8')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('CAM:Panavision Cameras and Lenses','OFM:16 mm','OFM:35 mm','OFM:Video','PCS:Spherical','PFM:35 mm','RAT:1.33 : 1','RAT:1.37 : 1','RAT:1.66 : 1','RAT:1.78 : 1','RAT:2.35 : 1','RAT:4:3')) 27 | AND (mi2.info in ('East Germany','Hong Kong','Italy','Taiwan','UK','USA','West Germany')) 28 | AND (kt.kind in ('episode','movie')) 29 | AND (rt.role in ('production designer')) 30 | AND (n.gender in ('f')) 31 | AND (t.production_year <= 2010) 32 | AND (t.production_year >= 1952) 33 | AND (k.keyword IN ('death','elmo','father-son-relationship','fight','gay','independent-film','lesbian-sex','mother-daughter-relationship','murder','number-in-title')) 34 | -------------------------------------------------------------------------------- /sample_queries/q1_8a463.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | cast_info as ci, 6 | role_type as rt, 7 | name as n, 8 | movie_keyword as mk, 9 | keyword as k, 10 | movie_companies as mc, 11 | company_type as ct, 12 | company_name as cn 13 | WHERE 14 | t.id = ci.movie_id 15 | AND t.id = mc.movie_id 16 | AND t.id = mi1.movie_id 17 | AND t.id = mk.movie_id 18 | AND mc.company_type_id = ct.id 19 | AND mc.company_id = cn.id 20 | AND k.id = mk.keyword_id 21 | AND mi1.info_type_id = it1.id 22 | AND t.kind_id = kt.id 23 | AND ci.person_id = n.id 24 | AND ci.role_id = rt.id 25 | AND (it1.id IN ('7')) 26 | AND (mi1.info in ('MET:','OFM:35 mm','PCS:Digital Intermediate','PFM:35 mm','PFM:Video','RAT:1.33 : 1','RAT:1.37 : 1')) 27 | AND (kt.kind in ('episode','movie','tv movie')) 28 | AND (rt.role in ('actor','actress')) 29 | AND (n.gender in ('f','m') OR n.gender IS NULL) 30 | AND (n.name_pcode_cf in ('A5362','J5252','R1632','R2632','W4525')) 31 | AND (t.production_year <= 2015) 32 | AND (t.production_year >= 1925) 33 | AND (cn.name in ('Fox Network','Independent Television (ITV)','Metro-Goldwyn-Mayer (MGM)','National Broadcasting Company (NBC)','Paramount Pictures','Shout! 
Factory','Sony Pictures Home Entertainment','Universal Pictures','Universal TV')) 34 | AND (ct.kind in ('distributors','production companies')) 35 | -------------------------------------------------------------------------------- /sample_queries/q32_2a493.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) FROM title as t, 2 | kind_type as kt, 3 | info_type as it1, 4 | movie_info as mi1, 5 | movie_info as mi2, 6 | info_type as it2, 7 | cast_info as ci, 8 | role_type as rt, 9 | name as n, 10 | movie_keyword as mk, 11 | keyword as k 12 | WHERE 13 | t.id = ci.movie_id 14 | AND t.id = mi1.movie_id 15 | AND t.id = mi2.movie_id 16 | AND t.id = mk.movie_id 17 | AND k.id = mk.keyword_id 18 | AND mi1.movie_id = mi2.movie_id 19 | AND mi1.info_type_id = it1.id 20 | AND mi2.info_type_id = it2.id 21 | AND (it1.id in ('7')) 22 | AND (it2.id in ('18')) 23 | AND t.kind_id = kt.id 24 | AND ci.person_id = n.id 25 | AND ci.role_id = rt.id 26 | AND (mi1.info in ('OFM:35 mm','OFM:Live','PFM:35 mm','RAT:1.33 : 1')) 27 | AND (mi2.info in ('20th Century Fox Studios - 10201 Pico Blvd., Century City, Los Angeles, California, USA','Desilu Studios - 9336 W. Washington Blvd., Culver City, California, USA','Hal Roach Studios - 8822 Washington Blvd., Culver City, California, USA','New York City, New York, USA','Revue Studios, Hollywood, Los Angeles, California, USA','Universal Studios - 100 Universal City Plaza, Universal City, California, USA','Warner Brothers Burbank Studios - 4000 Warner Boulevard, Burbank, California, USA')) 28 | AND (kt.kind in ('tv series','video game','video movie')) 29 | AND (rt.role in ('actress','writer')) 30 | AND (n.gender in ('f','m')) 31 | AND (t.production_year <= 1975) 32 | AND (t.production_year >= 1925) 33 | -------------------------------------------------------------------------------- /bao_server/net.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from TreeConvolution.tcnn import BinaryTreeConv, TreeLayerNorm 3 | from TreeConvolution.tcnn import TreeActivation, DynamicPooling 4 | from TreeConvolution.util import prepare_trees 5 | 6 | def left_child(x): 7 | if len(x) != 3: 8 | return None 9 | return x[1] 10 | 11 | def right_child(x): 12 | if len(x) != 3: 13 | return None 14 | return x[2] 15 | 16 | def features(x): 17 | return x[0] 18 | 19 | class BaoNet(nn.Module): 20 | def __init__(self, in_channels): 21 | super(BaoNet, self).__init__() 22 | self.__in_channels = in_channels 23 | self.__cuda = False 24 | 25 | self.tree_conv = nn.Sequential( 26 | BinaryTreeConv(self.__in_channels, 256), 27 | TreeLayerNorm(), 28 | TreeActivation(nn.LeakyReLU()), 29 | BinaryTreeConv(256, 128), 30 | TreeLayerNorm(), 31 | TreeActivation(nn.LeakyReLU()), 32 | BinaryTreeConv(128, 64), 33 | TreeLayerNorm(), 34 | DynamicPooling(), 35 | nn.Linear(64, 32), 36 | nn.LeakyReLU(), 37 | nn.Linear(32, 1) 38 | ) 39 | 40 | def in_channels(self): 41 | return self.__in_channels 42 | 43 | def forward(self, x): 44 | trees = prepare_trees(x, features, left_child, right_child, 45 | cuda=self.__cuda) 46 | return self.tree_conv(trees) 47 | 48 | def cuda(self): 49 | self.__cuda = True 50 | return super().cuda() 51 | -------------------------------------------------------------------------------- /docs/src/tutorial/4_2_analyze_workload_regblock.md: -------------------------------------------------------------------------------- 1 | # Analyze the workload with exploration mode 2 | 3 | We'll use [the same 
notebook as before](./3_2_analyze_workload.md) to analyze our new `bao_with_regblock.txt` results. You'll need to change the `SHOW_RG = False` line in the 2nd cell to `SHOW_RG = True` to plot both our previous run (without exploration mode) and our new run. 4 | 5 | First, we'll look at queries completed vs. time. 6 | 7 |
8 | 9 | ![Queries vs. time](../assets/queries_vs_time_rg.svg) 10 | 11 |
12 | 13 | The new green line shows the performance of Bao with our three test queries entered into exploration mode. In terms of overall workload performance, exploration mode doesn't help all that much: the workload finishes only a little bit faster. 14 | 15 | Next, we'll look at the query latency CDFs. 16 | 17 |
18 | 19 | ![Query latency CDF](../assets/cdf_rg.svg) 20 | 21 |
22 | 23 | The green line shows that tail latency has been significantly reduced, which is accounted for almost entirely by avoiding a few regressing query plans. We can verify this with the same table we looked at before: 24 | 25 | {{#include table_rg.html}} 26 | 27 | The first column shows the latency from the query plans produced by the PostgreSQL optimizer. The next two columns show the latency from the query plans produced by the Bao optimizer. The final two columns show our new results, the latency from the query plans produced by the Bao optimizer with exploration mode. 28 | 29 | The large regressions on queries 2 and 3 are eliminated, with both having a much more reasonable worst-case time. 30 | 31 | -------------------------------------------------------------------------------- /sample_queries/q13_7a121.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k, 12 | aka_name as an, 13 | name as n, 14 | info_type as it5, 15 | person_info as pi1, 16 | cast_info as ci, 17 | role_type as rt 18 | WHERE 19 | t.id = mi1.movie_id 20 | AND t.id = ci.movie_id 21 | AND t.id = mii1.movie_id 22 | AND t.id = mii2.movie_id 23 | AND t.id = mk.movie_id 24 | AND mk.keyword_id = k.id 25 | AND mi1.info_type_id = it1.id 26 | AND mii1.info_type_id = it3.id 27 | AND mii2.info_type_id = it4.id 28 | AND t.kind_id = kt.id 29 | AND (kt.kind IN ('episode')) 30 | AND (t.production_year <= 1975) 31 | AND (t.production_year >= 1925) 32 | AND (mi1.info IN ('Mono')) 33 | AND (it1.id IN ('13','6','8')) 34 | AND it3.id = '100' 35 | AND it4.id = '101' 36 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 7.0) 37 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 3.0 <= mii2.info::float) 38 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 39 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 10000.0) 40 | AND n.id = ci.person_id 41 | AND ci.person_id = pi1.person_id 42 | AND it5.id = pi1.info_type_id 43 | AND n.id = pi1.person_id 44 | AND n.id = an.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('m') OR n.gender IS NULL) 47 | AND (n.name_pcode_nf in ('A4163','B6563','D1316','F6521','F6523','F6524','J5216','J5262','P3616')) 48 | AND (ci.note IS NULL) 49 | AND (rt.role in ('actor')) 50 | AND (it5.id in ('32')) 51 | -------------------------------------------------------------------------------- /sample_queries/q17_7a164.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k, 12 | aka_name as an, 13 | name as n, 14 | info_type as it5, 15 | person_info as pi1, 16 | cast_info as ci, 17 | role_type as rt 18 | WHERE 19 | t.id = mi1.movie_id 20 | AND t.id = ci.movie_id 21 | AND t.id = mii1.movie_id 22 | AND t.id = mii2.movie_id 23 | AND t.id = mk.movie_id 24 | AND mk.keyword_id = k.id 25 | AND mi1.info_type_id = it1.id 26 | AND mii1.info_type_id = it3.id 27 | AND mii2.info_type_id = it4.id 28 | AND t.kind_id = kt.id 29 | AND (kt.kind IN ('tv movie','video movie')) 30 | AND (t.production_year <=
2015) 31 | AND (t.production_year >= 1925) 32 | AND (mi1.info IN ('Italy','Japan','Nigeria','Spain','West Germany')) 33 | AND (it1.id IN ('8')) 34 | AND it3.id = '100' 35 | AND it4.id = '101' 36 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 8.0) 37 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii2.info::float) 38 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 1000.0 <= mii1.info::float) 39 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 10000.0) 40 | AND n.id = ci.person_id 41 | AND ci.person_id = pi1.person_id 42 | AND it5.id = pi1.info_type_id 43 | AND n.id = pi1.person_id 44 | AND n.id = an.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('f')) 47 | AND (n.name_pcode_nf in ('C6235','E4213') OR n.name_pcode_nf IS NULL) 48 | AND (ci.note IS NULL) 49 | AND (rt.role in ('actress')) 50 | AND (it5.id in ('25')) 51 | -------------------------------------------------------------------------------- /sample_queries/q3_7a99.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k, 12 | aka_name as an, 13 | name as n, 14 | info_type as it5, 15 | person_info as pi1, 16 | cast_info as ci, 17 | role_type as rt 18 | WHERE 19 | t.id = mi1.movie_id 20 | AND t.id = ci.movie_id 21 | AND t.id = mii1.movie_id 22 | AND t.id = mii2.movie_id 23 | AND t.id = mk.movie_id 24 | AND mk.keyword_id = k.id 25 | AND mi1.info_type_id = it1.id 26 | AND mii1.info_type_id = it3.id 27 | AND mii2.info_type_id = it4.id 28 | AND t.kind_id = kt.id 29 | AND (kt.kind IN ('episode')) 30 | AND (t.production_year <= 1975) 31 | AND (t.production_year >= 1875) 32 | AND (mi1.info IN ('Color','OFM:Live','OFM:Video','PFM:Video')) 33 | AND (it1.id IN ('103','2','7')) 34 | AND it3.id = '100' 35 | AND it4.id = '101' 36 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 11.0) 37 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 7.0 <= mii2.info::float) 38 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 39 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 10000.0) 40 | AND n.id = ci.person_id 41 | AND ci.person_id = pi1.person_id 42 | AND it5.id = pi1.info_type_id 43 | AND n.id = pi1.person_id 44 | AND n.id = an.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('m')) 47 | AND (n.name_pcode_nf in ('C6231','F6362','F6525','J513','R1631','R1632','R1636','R2631','S2153')) 48 | AND (ci.note IS NULL) 49 | AND (rt.role in ('actor')) 50 | AND (it5.id in ('25')) 51 | -------------------------------------------------------------------------------- /sample_queries/q7_7a48.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k, 12 | aka_name as an, 13 | name as n, 14 | info_type as it5, 15 | person_info as pi1, 16 | cast_info as ci, 17 | role_type as rt 18 | WHERE 19 | t.id = mi1.movie_id 20 | AND t.id = ci.movie_id 21 | AND t.id = mii1.movie_id 22 | AND t.id = mii2.movie_id 23 | AND t.id = mk.movie_id 24 | AND mk.keyword_id = k.id 
25 | AND mi1.info_type_id = it1.id 26 | AND mii1.info_type_id = it3.id 27 | AND mii2.info_type_id = it4.id 28 | AND t.kind_id = kt.id 29 | AND (kt.kind IN ('tv movie','video movie')) 30 | AND (t.production_year <= 2015) 31 | AND (t.production_year >= 1975) 32 | AND (mi1.info IN ('Biography','Fantasy','OFM:35 mm','OFM:Video','Romance','Sci-Fi','Sport','Thriller')) 33 | AND (it1.id IN ('3','7')) 34 | AND it3.id = '100' 35 | AND it4.id = '101' 36 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 7.0) 37 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 3.0 <= mii2.info::float) 38 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 5000.0 <= mii1.info::float) 39 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 500000.0) 40 | AND n.id = ci.person_id 41 | AND ci.person_id = pi1.person_id 42 | AND it5.id = pi1.info_type_id 43 | AND n.id = pi1.person_id 44 | AND n.id = an.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('f') OR n.gender IS NULL) 47 | AND (n.name_pcode_nf in ('C6235') OR n.name_pcode_nf IS NULL) 48 | AND (ci.note in ('(archive footage)') OR ci.note IS NULL) 49 | AND (rt.role in ('actress')) 50 | AND (it5.id in ('34')) 51 | -------------------------------------------------------------------------------- /sample_queries/q8_6a505.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | aka_name as an, 11 | name as n, 12 | info_type as it5, 13 | person_info as pi1, 14 | cast_info as ci, 15 | role_type as rt 16 | WHERE 17 | t.id = mi1.movie_id 18 | AND t.id = ci.movie_id 19 | AND t.id = mii1.movie_id 20 | AND t.id = mii2.movie_id 21 | AND mii2.movie_id = mii1.movie_id 22 | AND mi1.movie_id = mii1.movie_id 23 | AND mi1.info_type_id = it1.id 24 | AND mii1.info_type_id = it3.id 25 | AND mii2.info_type_id = it4.id 26 | AND t.kind_id = kt.id 27 | AND (kt.kind IN ('episode','movie')) 28 | AND (t.production_year <= 2015) 29 | AND (t.production_year >= 1925) 30 | AND (mi1.info IN ('Black and White')) 31 | AND (it1.id IN ('2')) 32 | AND it3.id = '100' 33 | AND it4.id = '101' 34 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 7.0) 35 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 3.0 <= mii2.info::float) 36 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 37 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 10000.0) 38 | AND n.id = ci.person_id 39 | AND ci.person_id = pi1.person_id 40 | AND it5.id = pi1.info_type_id 41 | AND n.id = pi1.person_id 42 | AND n.id = an.person_id 43 | AND ci.person_id = an.person_id 44 | AND an.person_id = pi1.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('f') OR n.gender IS NULL) 47 | AND (n.name_pcode_nf in ('A5136','B4532','C6435','H4524','J2451','J6362','L2525','M6415','S4125','W5245')) 48 | AND (ci.note in ('(writer)') OR ci.note IS NULL) 49 | AND (rt.role in ('actress','cinematographer','writer')) 50 | AND (it5.id in ('26')) 51 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/tcnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class BinaryTreeConv(nn.Module): 5 | def __init__(self, in_channels, out_channels): 6 | 
super(BinaryTreeConv, self).__init__() 7 | 8 | self.__in_channels = in_channels 9 | self.__out_channels = out_channels 10 | # we can think of the tree conv as a single dense layer 11 | # that we "drag" across the tree. 12 | self.weights = nn.Conv1d(in_channels, out_channels, stride=3, kernel_size=3) 13 | 14 | def forward(self, flat_data): 15 | trees, idxes = flat_data 16 | orig_idxes = idxes 17 | idxes = idxes.expand(-1, -1, self.__in_channels).transpose(1, 2) 18 | expanded = torch.gather(trees, 2, idxes) 19 | 20 | results = self.weights(expanded) 21 | 22 | # add a zero vector back on 23 | zero_vec = torch.zeros((trees.shape[0], self.__out_channels)).unsqueeze(2) 24 | zero_vec = zero_vec.to(results.device) 25 | results = torch.cat((zero_vec, results), dim=2) 26 | return (results, orig_idxes) 27 | 28 | class TreeActivation(nn.Module): 29 | def __init__(self, activation): 30 | super(TreeActivation, self).__init__() 31 | self.activation = activation 32 | 33 | def forward(self, x): 34 | return (self.activation(x[0]), x[1]) 35 | 36 | class TreeLayerNorm(nn.Module): 37 | def forward(self, x): 38 | data, idxes = x 39 | mean = torch.mean(data, dim=(1, 2)).unsqueeze(1).unsqueeze(1) 40 | std = torch.std(data, dim=(1, 2)).unsqueeze(1).unsqueeze(1) 41 | normd = (data - mean) / (std + 0.00001) 42 | return (normd, idxes) 43 | 44 | class DynamicPooling(nn.Module): 45 | def forward(self, x): 46 | return torch.max(x[0], dim=2).values 47 | 48 | -------------------------------------------------------------------------------- /sample_queries/q14_6a349.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | aka_name as an, 11 | name as n, 12 | info_type as it5, 13 | person_info as pi1, 14 | cast_info as ci, 15 | role_type as rt 16 | WHERE 17 | t.id = mi1.movie_id 18 | AND t.id = ci.movie_id 19 | AND t.id = mii1.movie_id 20 | AND t.id = mii2.movie_id 21 | AND mii2.movie_id = mii1.movie_id 22 | AND mi1.movie_id = mii1.movie_id 23 | AND mi1.info_type_id = it1.id 24 | AND mii1.info_type_id = it3.id 25 | AND mii2.info_type_id = it4.id 26 | AND t.kind_id = kt.id 27 | AND (kt.kind IN ('episode','movie')) 28 | AND (t.production_year <= 1990) 29 | AND (t.production_year >= 1950) 30 | AND (mi1.info IN ('OFM:35 mm','PCS:Spherical','PFM:35 mm','PFM:Video','RAT:1.33 : 1','RAT:1.37 : 1')) 31 | AND (it1.id IN ('15','7','98')) 32 | AND it3.id = '100' 33 | AND it4.id = '101' 34 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 11.0) 35 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 7.0 <= mii2.info::float) 36 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 37 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 1000.0) 38 | AND n.id = ci.person_id 39 | AND ci.person_id = pi1.person_id 40 | AND it5.id = pi1.info_type_id 41 | AND n.id = pi1.person_id 42 | AND n.id = an.person_id 43 | AND ci.person_id = an.person_id 44 | AND an.person_id = pi1.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('m') OR n.gender IS NULL) 47 | AND (n.name_pcode_nf in ('B6514','D1352','J5163','J5245','M2423','M6126','M6241','M6245','M6252','P3614','V2361','W4125','W4525')) 48 | AND (ci.note in ('(executive producer)') OR ci.note IS NULL) 49 | AND (rt.role in ('actor','producer')) 50 | AND (it5.id in ('19')) 51 | 
-------------------------------------------------------------------------------- /sample_queries/q18_7a103.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k, 12 | aka_name as an, 13 | name as n, 14 | info_type as it5, 15 | person_info as pi1, 16 | cast_info as ci, 17 | role_type as rt 18 | WHERE 19 | t.id = mi1.movie_id 20 | AND t.id = ci.movie_id 21 | AND t.id = mii1.movie_id 22 | AND t.id = mii2.movie_id 23 | AND t.id = mk.movie_id 24 | AND mk.keyword_id = k.id 25 | AND mi1.info_type_id = it1.id 26 | AND mii1.info_type_id = it3.id 27 | AND mii2.info_type_id = it4.id 28 | AND t.kind_id = kt.id 29 | AND (kt.kind IN ('episode','movie')) 30 | AND (t.production_year <= 2015) 31 | AND (t.production_year >= 1925) 32 | AND (mi1.info IN ('Buenos Aires, Federal District, Argentina','Los Angeles, California, USA','Mexico','New York City, New York, USA','Paramount Studios - 5555 Melrose Avenue, Hollywood, Los Angeles, California, USA')) 33 | AND (it1.id IN ('18')) 34 | AND it3.id = '100' 35 | AND it4.id = '101' 36 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 7.0) 37 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 3.0 <= mii2.info::float) 38 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 39 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 10000.0) 40 | AND n.id = ci.person_id 41 | AND ci.person_id = pi1.person_id 42 | AND it5.id = pi1.info_type_id 43 | AND n.id = pi1.person_id 44 | AND n.id = an.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender in ('m') OR n.gender IS NULL) 47 | AND (n.name_pcode_nf in ('D2313','E3261','G625','J5141','K1524','L2125','M2563','M265','M3215','P3625','S5326','S6256','T5252')) 48 | AND (ci.note in ('(writer)') OR ci.note IS NULL) 49 | AND (rt.role in ('actor','director','writer')) 50 | AND (it5.id in ('31')) 51 | -------------------------------------------------------------------------------- /pg_extension/compile_commands.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "arguments": [ 4 | "gcc", 5 | "-c", 6 | "-Wall", 7 | "-Wmissing-prototypes", 8 | "-Wpointer-arith", 9 | "-Wdeclaration-after-statement", 10 | "-Werror=vla", 11 | "-Wendif-labels", 12 | "-Wmissing-format-attribute", 13 | "-Wformat-security", 14 | "-fno-strict-aliasing", 15 | "-fwrapv", 16 | "-fexcess-precision=standard", 17 | "-Wno-format-truncation", 18 | "-Wno-stringop-truncation", 19 | "-march=x86-64", 20 | "-mtune=generic", 21 | "-O2", 22 | "-pipe", 23 | "-fno-plt", 24 | "-fPIC", 25 | "-I.", 26 | "-I./", 27 | "-I/usr/include/postgresql/server", 28 | "-I/usr/include/postgresql/internal", 29 | "-D_FORTIFY_SOURCE=2", 30 | "-D_GNU_SOURCE", 31 | "-I/usr/include/libxml2", 32 | "-o", 33 | "main.o", 34 | "main.c" 35 | ], 36 | "directory": "/home/ryan/Dropbox/projects/pg_bao", 37 | "file": "main.c" 38 | }, 39 | { 40 | "arguments": [ 41 | "/usr/bin/clang", 42 | "-c", 43 | "-Wno-ignored-attributes", 44 | "-fno-strict-aliasing", 45 | "-fwrapv", 46 | "-O2", 47 | "-I.", 48 | "-I./", 49 | "-I/usr/include/postgresql/server", 50 | "-I/usr/include/postgresql/internal", 51 | "-D_FORTIFY_SOURCE=2", 52 | "-D_GNU_SOURCE", 53 | "-I/usr/include/libxml2", 54 | "-flto=thin", 55 | "-emit-llvm", 56 | 
"-o", 57 | "main.bc", 58 | "main.c" 59 | ], 60 | "directory": "/home/ryan/Dropbox/projects/pg_bao", 61 | "file": "main.c" 62 | } 63 | ] -------------------------------------------------------------------------------- /docs/src/pg_vars_table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
<table>
  <thead>
    <tr><th>Variable</th><th>Default</th><th>Description</th></tr>
  </thead>
  <tbody>
    <tr><td><code>enable_bao</code></td><td><code>OFF</code></td><td>Top-level switch for all Bao functions. When disabled, Bao operates purely in advisor mode.</td></tr>
    <tr><td><code>enable_bao_rewards</code></td><td><code>ON</code></td><td>If this and <code>enable_bao</code> are on, Bao will report and record rewards from queries executed in this session.</td></tr>
    <tr><td><code>enable_bao_selection</code></td><td><code>ON</code></td><td>If this and <code>enable_bao</code> are on, Bao will use its value model to select query plans at optimization time.</td></tr>
    <tr><td><code>bao_host</code></td><td><code>localhost</code></td><td>Host where the Bao server is running. Can be changed to put the Bao server on a different machine than PostgreSQL.</td></tr>
    <tr><td><code>bao_port</code></td><td><code>9381</code></td><td>Port to access the Bao server. If you change this, you may also need to change the port the Bao server listens on.</td></tr>
    <tr><td><code>bao_num_arms</code></td><td><code>5</code></td><td>The number of arms Bao should examine during query optimization. Lower values decrease optimization time; higher values may lead to better query plans.</td></tr>
    <tr><td><code>bao_include_json_in_explain</code></td><td><code>OFF</code></td><td>Includes the JSON of the Bao plan in the output of <code>EXPLAIN</code>. Used internally.</td></tr>
  </tbody>
</table>
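For example, a session that wants Bao's plan selection but not reward collection could be configured as follows (a sketch; the variables are those in the table above, and the values are only illustrative):

```sql
SET enable_bao TO on;            -- master switch
SET enable_bao_selection TO on;  -- let Bao choose query plans
SET enable_bao_rewards TO off;   -- do not record feedback from this session
SET bao_num_arms TO 5;           -- number of hint sets ("arms") to consider
```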
47 | 48 | More information about each variable can be found in the `pg_settings` internal table: 49 | 50 | ```sql 51 | SELECT name, extra_desc FROM pg_settings WHERE name LIKE '%bao%'; 52 | ``` 53 | -------------------------------------------------------------------------------- /sample_queries/q36_7a136.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) 2 | FROM title as t, 3 | movie_info as mi1, 4 | kind_type as kt, 5 | info_type as it1, 6 | info_type as it3, 7 | info_type as it4, 8 | movie_info_idx as mii1, 9 | movie_info_idx as mii2, 10 | movie_keyword as mk, 11 | keyword as k, 12 | aka_name as an, 13 | name as n, 14 | info_type as it5, 15 | person_info as pi1, 16 | cast_info as ci, 17 | role_type as rt 18 | WHERE 19 | t.id = mi1.movie_id 20 | AND t.id = ci.movie_id 21 | AND t.id = mii1.movie_id 22 | AND t.id = mii2.movie_id 23 | AND t.id = mk.movie_id 24 | AND mk.keyword_id = k.id 25 | AND mi1.info_type_id = it1.id 26 | AND mii1.info_type_id = it3.id 27 | AND mii2.info_type_id = it4.id 28 | AND t.kind_id = kt.id 29 | AND (kt.kind IN ('episode','movie')) 30 | AND (t.production_year <= 2015) 31 | AND (t.production_year >= 1925) 32 | AND (mi1.info IN ('Austria','Czechoslovakia','Denmark','Hong Kong','Poland','Portugal','South Korea','Soviet Union','Sweden','Switzerland','Turkey','Yugoslavia')) 33 | AND (it1.id IN ('15','8','97')) 34 | AND it3.id = '100' 35 | AND it4.id = '101' 36 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii2.info::float <= 8.0) 37 | AND (mii2.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii2.info::float) 38 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND 0.0 <= mii1.info::float) 39 | AND (mii1.info ~ '^(?:[1-9]\d*|0)?(?:\.\d+)?$' AND mii1.info::float <= 1000.0) 40 | AND n.id = ci.person_id 41 | AND ci.person_id = pi1.person_id 42 | AND it5.id = pi1.info_type_id 43 | AND n.id = pi1.person_id 44 | AND n.id = an.person_id 45 | AND rt.id = ci.role_id 46 | AND (n.gender IS NULL) 47 | AND (n.name_pcode_nf in ('A4163','A4253','A5362','A6532','C5321','C6231','C6235','R516','R5316','S3152','S3521') OR n.name_pcode_nf IS NULL) 48 | AND (ci.note in ('(deviser)','(producer)','(production assistant)','(senior producer)','(supervising producer)','(writer)') OR ci.note IS NULL) 49 | AND (rt.role in ('cinematographer','composer','director','editor','miscellaneous crew','producer','production designer','writer')) 50 | AND (it5.id in ('19')) 51 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/test/test_tree_conv.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from torch import nn 4 | 5 | from util import prepare_trees 6 | import tcnn 7 | 8 | class TestTreeConvolution(unittest.TestCase): 9 | 10 | def test_example(self): 11 | # simple smoke test from the example file 12 | tree1 = ( 13 | (0, 1), 14 | ((1, 2), ((0, 1),), ((-1, 0),)), 15 | ((-3, 0), ((2, 3),), ((1, 2),)) 16 | ) 17 | 18 | tree2 = ( 19 | (16, 3), 20 | ((0, 1), ((5, 3),), ((2, 6),)), 21 | ((2, 9),) 22 | ) 23 | 24 | trees = [tree1, tree2] 25 | 26 | # function to extract the left child of a node 27 | def left_child(x): 28 | assert isinstance(x, tuple) 29 | if len(x) == 1: 30 | # leaf. 31 | return None 32 | return x[1] 33 | 34 | # function to extract the right child of node 35 | def right_child(x): 36 | assert isinstance(x, tuple) 37 | if len(x) == 1: 38 | # leaf. 
39 | return None 40 | return x[2] 41 | 42 | # function to transform a node into a (feature) vector, 43 | # should be a numpy array. 44 | def transformer(x): 45 | return np.array(x[0]) 46 | 47 | 48 | prepared_trees = prepare_trees(trees, transformer, left_child, right_child) 49 | net = nn.Sequential( 50 | tcnn.BinaryTreeConv(2, 16), 51 | tcnn.TreeLayerNorm(), 52 | tcnn.TreeActivation(nn.ReLU()), 53 | tcnn.BinaryTreeConv(16, 8), 54 | tcnn.TreeLayerNorm(), 55 | tcnn.TreeActivation(nn.ReLU()), 56 | tcnn.BinaryTreeConv(8, 4), 57 | tcnn.TreeLayerNorm(), 58 | tcnn.TreeActivation(nn.ReLU()), 59 | tcnn.DynamicPooling() 60 | ) 61 | 62 | # output: torch.Size([2, 4]) 63 | shape = tuple(net(prepared_trees).shape) 64 | self.assertEqual(shape, (2, 4)) 65 | 66 | if __name__ == '__main__': 67 | unittest.main() 68 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch import nn 3 | 4 | from util import prepare_trees 5 | import tcnn 6 | 7 | # First tree: 8 | # (0, 1) 9 | # (1, 2) (-3, 0) 10 | # (0, 1) (-1, 0) (2, 3) (1, 2) 11 | 12 | tree1 = ( 13 | (0, 1), 14 | ((1, 2), ((0, 1),), ((-1, 0),)), 15 | ((-3, 0), ((2, 3),), ((1, 2),)) 16 | ) 17 | 18 | # Second tree: 19 | # (16, 3) 20 | # (0, 1) (2, 9) 21 | # (5, 3) (2, 6) 22 | 23 | tree2 = ( 24 | (16, 3), 25 | ((0, 1), ((5, 3),), ((2, 6),)), 26 | ((2, 9),) 27 | ) 28 | 29 | 30 | trees = [tree1, tree2] 31 | 32 | # function to extract the left child of a node 33 | def left_child(x): 34 | assert isinstance(x, tuple) 35 | if len(x) == 1: 36 | # leaf. 37 | return None 38 | return x[1] 39 | 40 | # function to extract the right child of node 41 | def right_child(x): 42 | assert isinstance(x, tuple) 43 | if len(x) == 1: 44 | # leaf. 45 | return None 46 | return x[2] 47 | 48 | # function to transform a node into a (feature) vector, 49 | # should be a numpy array. 50 | def transformer(x): 51 | return np.array(x[0]) 52 | 53 | 54 | # this call to `prepare_trees` will create the correct input for 55 | # a `tcnn.BinaryTreeConv` operator. 56 | prepared_trees = prepare_trees(trees, transformer, left_child, right_child) 57 | 58 | # A tree convolution neural network mapping our input trees with 59 | # 2 channels to trees with 16 channels, then 8 channels, then 4 channels. 60 | # Between each mapping, we apply layer norm and then a ReLU activation. 61 | # Finally, we apply "dynamic pooling", which returns a flattened vector. 
62 | 63 | net = nn.Sequential( 64 | tcnn.BinaryTreeConv(2, 16), 65 | tcnn.TreeLayerNorm(), 66 | tcnn.TreeActivation(nn.ReLU()), 67 | tcnn.BinaryTreeConv(16, 8), 68 | tcnn.TreeLayerNorm(), 69 | tcnn.TreeActivation(nn.ReLU()), 70 | tcnn.BinaryTreeConv(8, 4), 71 | tcnn.TreeLayerNorm(), 72 | tcnn.TreeActivation(nn.ReLU()), 73 | tcnn.DynamicPooling() 74 | ) 75 | 76 | # output: torch.Size([2, 4]) 77 | print(net(prepared_trees).shape) 78 | -------------------------------------------------------------------------------- /run_queries.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import os 3 | import sys 4 | import random 5 | from time import time, sleep 6 | 7 | USE_BAO = True 8 | PG_CONNECTION_STR = "dbname=imdb user=imdb host=localhost" 9 | 10 | # https://stackoverflow.com/questions/312443/ 11 | def chunks(lst, n): 12 | """Yield successive n-sized chunks from lst.""" 13 | for i in range(0, len(lst), n): 14 | yield lst[i:i + n] 15 | 16 | 17 | def run_query(sql, bao_select=False, bao_reward=False): 18 | start = time() 19 | while True: 20 | try: 21 | conn = psycopg2.connect(PG_CONNECTION_STR) 22 | cur = conn.cursor() 23 | cur.execute(f"SET enable_bao TO {bao_select or bao_reward}") 24 | cur.execute(f"SET enable_bao_selection TO {bao_select}") 25 | cur.execute(f"SET enable_bao_rewards TO {bao_reward}") 26 | cur.execute("SET bao_num_arms TO 5") 27 | cur.execute("SET statement_timeout TO 300000") 28 | cur.execute(sql) 29 | cur.fetchall() 30 | conn.close() 31 | break 32 | except Exception: # reconnect and retry (e.g., lost connection or statement timeout) 33 | sleep(1) 34 | continue 35 | stop = time() 36 | return stop - start 37 | 38 | 39 | query_paths = sys.argv[1:] 40 | queries = [] 41 | for fp in query_paths: 42 | with open(fp) as f: 43 | query = f.read() 44 | queries.append((fp, query)) 45 | print("Read", len(queries), "queries.") 46 | print("Using Bao:", USE_BAO) 47 | 48 | random.seed(42) 49 | query_sequence = random.choices(queries, k=500) 50 | pg_chunks, *bao_chunks = list(chunks(query_sequence, 25)) 51 | 52 | print("Executing queries using PG optimizer for initial training") 53 | 54 | for fp, q in pg_chunks: 55 | pg_time = run_query(q, bao_reward=True) 56 | print("x", "x", time(), fp, pg_time, "PG", flush=True) 57 | 58 | for c_idx, chunk in enumerate(bao_chunks): 59 | if USE_BAO: 60 | os.system("cd bao_server && python3 baoctl.py --retrain") 61 | os.system("sync") 62 | for q_idx, (fp, q) in enumerate(chunk): 63 | q_time = run_query(q, bao_reward=USE_BAO, bao_select=USE_BAO) 64 | print(c_idx, q_idx, time(), fp, q_time, flush=True) 65 | -------------------------------------------------------------------------------- /bao_server/train.py: -------------------------------------------------------------------------------- 1 | import storage 2 | import model 3 | import os 4 | import shutil 5 | import reg_blocker 6 | 7 | class BaoTrainingException(Exception): 8 | pass 9 | 10 | def train_and_swap(fn, old, tmp, verbose=False): 11 | if os.path.exists(fn): 12 | old_model = model.BaoRegression(have_cache_data=True) 13 | old_model.load(fn) 14 | else: 15 | old_model = None 16 | 17 | new_model = train_and_save_model(tmp, verbose=verbose) 18 | max_retries = 5 19 | current_retry = 1 20 | while not reg_blocker.should_replace_model(old_model, new_model): 21 | if current_retry >= max_retries: 22 | print("Could not train model with better regression profile.") 23 | return 24 | 25 | print("New model rejected when compared with old model. 
" 26 | + "Trying to retrain with emphasis on regressions.") 27 | print("Retry #", current_retry) 28 | new_model = train_and_save_model(tmp, verbose=verbose, 29 | emphasize_experiments=current_retry) 30 | current_retry += 1 31 | 32 | if os.path.exists(fn): 33 | shutil.rmtree(old, ignore_errors=True) 34 | os.rename(fn, old) 35 | os.rename(tmp, fn) 36 | 37 | def train_and_save_model(fn, verbose=True, emphasize_experiments=0): 38 | all_experience = storage.experience() 39 | 40 | for _ in range(emphasize_experiments): 41 | all_experience.extend(storage.experiment_experience()) 42 | 43 | x = [i[0] for i in all_experience] 44 | y = [i[1] for i in all_experience] 45 | 46 | if not all_experience: 47 | raise BaoTrainingException("Cannot train a Bao model with no experience") 48 | 49 | if len(all_experience) < 20: 50 | print("Warning: trying to train a Bao model with fewer than 20 datapoints.") 51 | 52 | reg = model.BaoRegression(have_cache_data=True, verbose=verbose) 53 | reg.fit(x, y) 54 | reg.save(fn) 55 | return reg 56 | 57 | 58 | if __name__ == "__main__": 59 | import sys 60 | if len(sys.argv) != 2: 61 | print("Usage: train.py MODEL_FILE") 62 | exit(-1) 63 | train_and_save_model(sys.argv[1]) 64 | 65 | print("Model saved, attempting load...") 66 | reg = model.BaoRegression(have_cache_data=True) 67 | reg.load(sys.argv[1]) 68 | 69 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/test/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from util import prepare_trees, TreeConvolutionError 4 | 5 | 6 | class TestUtils(unittest.TestCase): 7 | 8 | def test_prepare(self): 9 | # simple smoke test from the example file 10 | tree1 = ( 11 | (0, 1), 12 | ((1, 2), ((0, 1),), ((-1, 0),)), 13 | ((-3, 0), ((2, 3),), ((1, 2),)) 14 | ) 15 | 16 | tree2 = ( 17 | (16, 3), 18 | ((0, 1), ((5, 3),), ((2, 6),)), 19 | ((2, 9),) 20 | ) 21 | 22 | trees = [tree1, tree2] 23 | 24 | # function to extract the left child of a node 25 | def left_child(x): 26 | assert isinstance(x, tuple) 27 | if len(x) == 1: 28 | # leaf. 29 | return None 30 | return x[1] 31 | 32 | # function to extract the right child of node 33 | def right_child(x): 34 | assert isinstance(x, tuple) 35 | if len(x) == 1: 36 | # leaf. 37 | return None 38 | return x[2] 39 | 40 | # function to transform a node into a (feature) vector, 41 | # should be a numpy array. 42 | def transformer(x): 43 | return np.array(x[0]) 44 | 45 | 46 | prepared_trees = prepare_trees(trees, transformer, left_child, right_child) 47 | self.assertEqual(len(prepared_trees), 2) 48 | 49 | def test_raises_on_malformed(self): 50 | # simple smoke test from the example file 51 | tree1 = ( 52 | (0, 1), 53 | ((1, 2), ((0, 1),), ((-1, 0),)), 54 | ((-3, 0), ((2, 3),), ((1, 2),)) 55 | ) 56 | 57 | tree2 = ( 58 | (16, 3, 2), 59 | ((0, 1), ((5, 3),), ((2, 6),)), 60 | ((2, 9),) 61 | ) 62 | 63 | trees = [tree1, tree2] 64 | 65 | # function to extract the left child of a node 66 | def left_child(x): 67 | assert isinstance(x, tuple) 68 | if len(x) == 1: 69 | # leaf. 70 | return None 71 | return x[1] 72 | 73 | # function to extract the right child of node 74 | def right_child(x): 75 | assert isinstance(x, tuple) 76 | if len(x) == 1: 77 | # leaf. 78 | return None 79 | return x[2] 80 | 81 | # function to transform a node into a (feature) vector, 82 | # should be a numpy array. 
83 | def transformer(x): 84 | return np.array(x[0]) 85 | 86 | 87 | with self.assertRaises(TreeConvolutionError): 88 | prepare_trees(trees, 89 | transformer, left_child, right_child) 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /docs/src/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This is the documentation for Bao for PostgreSQL. 4 | 5 | * GitHub: [https://learned.systems/bao](https://learned.systems/bao) 6 | 7 | 8 | Note that Bao requires at least PostgreSQL 12. 9 | 10 | Bao is a learned query optimizer for PostgreSQL. Bao works by providing automatic coarse-grained query hints (e.g., `SET enable_nestloop TO off`) on a per-query basis. Bao uses reinforcement learning, so Bao learns from its mistakes. 11 | 12 | Bao has two components: the *Bao server*, which is a standalone Python application, and the *PostgreSQL extension*, which integrates directly with PostgreSQL and communicates with the Bao server. The best way to try out Bao is to follow [the tutorial](./tutorial.md). 13 | 14 | This implementation has a number of features: 15 | 16 | * In the default configuration, Bao works as a learned query optimizer, providing coarse-grained hints to the PostgreSQL query planner and incorporating feedback from query execution to improve its recommendations. 17 | * Bao provides a continually-updated *query performance prediction* model that is custom-tailored to your DB and workload. Even if you do not use Bao for query optimization, you can still use it to predict the runtime of your queries. Runtime predictions are made available via `EXPLAIN`. 18 | * Bao can be used as an *advisor*, simply providing the coarse-grained hints that Bao would use if Bao were running as a full optimizer. This allows you to manually apply Bao's recommendations to only a few queries. 19 | * Since Bao uses reinforcement learning, Bao must balance exploration and exploitation, and will occasionally try out a query plan that may be slower than the one chosen by PostgreSQL; you have to make mistakes in order to learn! However, when regressions on certain queries are unacceptable, these special queries can be pre-explored using Bao's *exploratory mode*. Queries added to Bao's exploratory mode are tested at user-defined times, and future Bao models are checked to ensure they handle these queries properly. Bao will *never* pick a regressed query plan for a query processed in exploratory mode. 20 | * Bao runs as a separate server process that can live on a different machine from your database. You can offload model training, potentially to a machine with a GPU. You can also co-locate multiple Bao servers together, if you have multiple DBs, so they share training resources. 21 | 22 | Bao is provided under a GPLv3 license. Specifically, please note: 23 | 24 | > THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 25 | > APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 26 | > HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 27 | > OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 28 | > THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | > PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 30 | > IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 31 | > ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
32 | -------------------------------------------------------------------------------- /docs/src/tutorial/3_example_workload.md: -------------------------------------------------------------------------------- 1 | # Run an example workload 2 | 3 | Next, we'll use Bao to execute a small sample workload. Then, we'll execute that same workload without Bao, and analyze the results. 4 | 5 | The `sample_queries` folder in the repository root contains 40 sample queries drawn from the original join order benchmark (JOB)[^howgood] and from extended JOB[^neo]: 6 | 7 | ``` 8 | $ cd sample_queries 9 | $ ls 10 | q10_2a265.sql q17_7a164.sql q23_19d.sql q29_6e.sql q36_7a136.sql q5_8a423.sql 11 | q11_17e.sql q18_7a103.sql q24_32a.sql q30_18c.sql q37_2a1291.sql q6_16b.sql 12 | q12_17a.sql q1_8a463.sql q25_13d.sql q31_2a39.sql q3_7a99.sql q7_7a48.sql 13 | q13_7a121.sql q19_2a471.sql q26_2a274.sql q32_2a493.sql q38_2a1870.sql q8_6a505.sql 14 | q14_6a349.sql q20_24b.sql q27_3c.sql q33_2a156.sql q39_2a2781.sql q9_5a48.sql 15 | q15_18a.sql q21_2a396.sql q28_13a.sql q34_1a275.sql q40_2a8120.sql 16 | q16_26c.sql q22_8a27.sql q2_8a82.sql q35_1a1508.sql q4_8a122.sql 17 | ``` 18 | 19 | The `run_queries.py` script will execute a random workload with 500 queries drawn from these samples. First, 25 queries will be executed to provide some basic training data for Bao. Then, and for every 25 queries processed afterwards, the script will pause query execution to retrain Bao's model. 20 | 21 | The `run_queries.py` script assumes your DB is reachable on `localhost`, with the username `imdb` in the database `imdb`. If this is not the case, modify the `PG_CONNECTION_STR` variable at the top of the file. 22 | 23 | Start the run: 24 | ``` 25 | $ python3 run_queries.py sample_queries/*.sql | tee ~/bao_run.txt 26 | ``` 27 | 28 | We use the `tee` command to both show us the output and redirect the output to a file, which we analyze later. Grab a coffee, this run will take a while to finish (around 75 minutes on my hardware). 29 | 30 | Next, once this run is finished, change the line in `run_queries.py`: 31 | 32 | ```python 33 | USE_BAO = True 34 | ``` 35 | 36 | To: 37 | 38 | 39 | ```python 40 | USE_BAO = False 41 | ``` 42 | 43 | This will cause the `run_queries.py` script to execute the exact same workload, but without using Bao to select query plans and without retraining a model every 25 queries. Start the run: 44 | 45 | ``` 46 | $ python3 run_queries.py sample_queries/*.sql | tee ~/pg_run.txt 47 | 48 | ``` 49 | 50 | ... and grab another coffee. This took just under 3 hours on my hardware. The fact that the workload finishes faster with Bao enabled is already telling, but next we will analyze these two runs in detail. 51 | 52 | # Notes 53 | 54 | [^howgood]: Leis, Viktor, Andrey Gubichev, Atanas Mirchev, Peter Boncz, Alfons Kemper, and Thomas Neumann. “How Good Are Query Optimizers, Really?” PVLDB, VLDB ’15, 9, no. 3 (2015): 204–215. https://doi.org/10.14778/2850583.2850594. 55 | 56 | [^neo]: Marcus, Ryan, Parimarjan Negi, Hongzi Mao, Chi Zhang, Mohammad Alizadeh, Tim Kraska, Olga Papaemmanouil, and Nesime Tatbul. “Neo: A Learned Query Optimizer.” PVLDB, VLDB ’19, 12, no. 11 (2019): 1705–18. 57 | -------------------------------------------------------------------------------- /docs/src/tutorial/3_2_analyze_workload.md: -------------------------------------------------------------------------------- 1 | # Analyzing the executions 2 | 3 | With logs of our saved runs in `bao_run.txt` and `pg_run.txt`, we can next analyze them. 
If you are using the VM, you can move these files to `/vagrant/` to access them from outside the VM. 4 | 5 | To analyze the logs, run the `analyze_bao.ipynb` Python notebook. Make sure `bao_run.txt` and `pg_run.txt` are in the same folder as the notebook. You'll need [Jupyter](https://jupyter.org/) installed to run it. 6 | 7 | The notebook will first generate a queries-completed vs. time graph. Depending on your hardware, the results will vary. Below is what I saw on my hardware: 8 | 9 |
10 | 11 | ![Queries completed over time](../assets/queries_vs_time.svg) 12 | 13 |
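The plot above comes from the notebook, but you can also reproduce it with a standalone script. Here is a minimal sketch; it assumes the whitespace-separated log format that `run_queries.py` prints (chunk index, query index, UNIX timestamp, query file, latency), and the helper name `load_run` is ours, not part of the repository.

```python
import matplotlib.pyplot as plt

def load_run(path):
    """Parse a run_queries.py log into a list of (timestamp, latency) pairs."""
    rows = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue  # not a per-query result line
            # columns: chunk_idx, query_idx, unix_time, query_file, latency, ...
            try:
                rows.append((float(parts[2]), float(parts[4])))
            except ValueError:
                continue  # skip retraining output and other noise in the log
    return rows

for label, path in [("Bao", "bao_run.txt"), ("PostgreSQL", "pg_run.txt")]:
    rows = load_run(path)
    start = rows[0][0]
    elapsed = [t - start for t, _ in rows]
    plt.plot(elapsed, range(1, len(rows) + 1), label=label)

plt.xlabel("Seconds since start of workload")
plt.ylabel("Queries completed")
plt.legend()
plt.show()
```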
14 | 15 | The graph shows the number of queries that have been completed since the start of the experiment, *including the training time of Bao*. Each red circle indicates a time when query execution was paused and the Bao model was trained. Obviously, when deployed, one does not have to pause query execution to retrain the Bao model, and this retraining can either be offloaded to another machine or done at a fixed time during the day, concurrent with query processing. On my hardware, Bao executed the example workload about twice as fast as the default PostgreSQL optimizer. 16 | 17 | Where do Bao's gains come from? Finishing the workload faster could come from making every query a little faster, or making a few queries much faster. To examine this, we look at the CDF of query times, which is the next pair of graphs generated by the notebook. 18 | 19 |
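The notebook computes these CDFs for you; if you are scripting the analysis yourself, a minimal equivalent can reuse the per-query latencies parsed by the `load_run` sketch above (the use of `numpy` here is our choice, not necessarily what the notebook does):

```python
import numpy as np
import matplotlib.pyplot as plt

def cdf(latencies):
    """Return x (sorted latencies) and y (fraction of queries at or below x)."""
    xs = np.sort(np.asarray(latencies, dtype=float))
    ys = np.arange(1, len(xs) + 1) / len(xs)
    return xs, ys

for label, path in [("Bao", "bao_run.txt"), ("PostgreSQL", "pg_run.txt")]:
    xs, ys = cdf([lat for _, lat in load_run(path)])
    plt.plot(xs, ys, label=label)

plt.xlabel("Query latency (seconds)")
plt.ylabel("Fraction of queries")
plt.legend()
plt.show()
```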
20 | 21 | ![CDF of query completion time](../assets/cdf.svg) 22 | 23 |
24 | 25 | The left figure shows the CDF with a linear y-axis, whereas the right figure shows the same CDF with a log scale. We can see that the majority of Bao's gains come from decreasing latency "at the tail" of the distribution: the 90th and 95th percentiles and beyond. Bao's single longest-running query is slightly slower (best visible on the left plot), and Bao slightly increases the latency of the fastest queries (best visible on the right plot). 26 | 27 | Bao increases the latency of the fastest queries by a small margin because of increased optimization time: query planning with Bao requires executing the PostgreSQL planner several times and running inference through a neural network. If you care about a particular query taking 0.01 seconds instead of 0.05 seconds, you can always disable Bao on a per-query basis. 28 | 29 | What about query regressions? The next table shows the latency of each query when using the PostgreSQL optimizer, and the worst / best time achieved by Bao. 30 | 31 | {{#include table.html}} 32 | 33 | We can see immediately that Bao has huge gains on Q1, which takes almost 5 minutes for PostgreSQL to process. However, on Q2 and Q3, while Bao normally found a pretty good plan (the Bao best column), sometimes Bao picked a regressing plan that took significantly longer to execute than the PostgreSQL plan. 34 | 35 | For some applications, the raw workload speedup achieved by Bao may be more important than any of these query regressions. For other applications, these query regressions may be a huge problem. Next, we discuss how to use Bao's exploration mode to avoid these regressions. 36 | -------------------------------------------------------------------------------- /pg_extension/bao_bufferstate.h: -------------------------------------------------------------------------------- 1 | #ifndef BAO_BUFFERSTATE_H 2 | #define BAO_BUFFERSTATE_H 3 | 4 | #include <stdio.h> 5 | #include <stdlib.h> 6 | 7 | #include "postgres.h" 8 | #include "uthash.h" 9 | #include "postgres_ext.h" 10 | #include "storage/bufmgr.h" 11 | #include "storage/buf_internals.h" 12 | #include "utils/lsyscache.h" 13 | #include "utils/relfilenodemap.h" 14 | 15 | #include "bao_util.h" 16 | 17 | // Functions to create a JSON representation of the current PostgreSQL buffer state. 18 | 19 | // Used to track how many buffer blocks are used for a particular relation. 20 | struct buffer_counter { 21 | const char* key; 22 | int count; 23 | UT_hash_handle hh; 24 | }; 25 | 26 | 27 | // modified from pg_buffercache_pages 28 | static char* buffer_state() { 29 | // Generate a JSON string mapping each relation name to the number of buffered 30 | // blocks that relation has in the PG buffer cache. 31 | 32 | int i; 33 | Oid tablespace, relfilenode, relid; 34 | char* rel_name; 35 | char* buf; 36 | size_t json_size; 37 | FILE* stream; 38 | 39 | 40 | struct buffer_counter* map = NULL; 41 | struct buffer_counter* query = NULL; 42 | struct buffer_counter* tmp = NULL; 43 | 44 | // For each buffer, we either add or increment a hash table entry. 45 | for (i = 0; i < NBuffers; i++) { 46 | BufferDesc *bufHdr; 47 | 48 | bufHdr = GetBufferDescriptor(i); 49 | 50 | // In theory, we could lock each buffer header before reading it. 51 | // But this might slow down PG, and if our buffer cache is a little 52 | // inaccurate that is OK. Just keep in mind that the tablespace 53 | // and relfilenode we read from the buffer header may be inconsistent. 
54 | //buf_state = LockBufHdr(bufHdr); 55 | 56 | tablespace = bufHdr->tag.rnode.spcNode; 57 | relfilenode = bufHdr->tag.rnode.relNode; 58 | 59 | // Ensure both are valid. 60 | if (tablespace == InvalidOid || relfilenode == InvalidOid) 61 | continue; 62 | 63 | // Get the relation ID attached to this file node. 64 | relid = RelidByRelfilenode(tablespace, relfilenode); 65 | if (relid == InvalidOid) 66 | continue; 67 | 68 | // Convert the relid to an actual relation name. 69 | rel_name = get_rel_name(relid); 70 | if (rel_name == NULL) 71 | continue; 72 | 73 | // Exclude system tables. 74 | if (starts_with(rel_name, "pg_") || starts_with(rel_name, "sql_")) 75 | continue; 76 | 77 | // See if this string is already in the table. If so, increment the count. 78 | // If not, add a new entry. 79 | HASH_FIND_STR(map, rel_name, query); 80 | if (query) { 81 | query->count++; 82 | } else { 83 | query = (struct buffer_counter*) malloc(sizeof(struct buffer_counter)); 84 | query->key = rel_name; 85 | query->count = 1; 86 | HASH_ADD_KEYPTR(hh, map, query->key, strlen(query->key), query); 87 | } 88 | } 89 | 90 | // The hash table aggregation is done. Next, construct a JSON string. 91 | stream = open_memstream(&buf, &json_size); 92 | 93 | fprintf(stream, "{ "); // extra space here in case there is nothing in cache 94 | HASH_ITER(hh, map, query, tmp) { 95 | HASH_DEL(map, query); 96 | fprintf(stream, "\"%s\": %d,", query->key, query->count); 97 | } 98 | fprintf(stream, "}\n"); 99 | fclose(stream); 100 | 101 | // Replace the last trailing comma with a space. 102 | buf[json_size-3] = ' '; 103 | return buf; 104 | 105 | } 106 | 107 | 108 | #endif 109 | -------------------------------------------------------------------------------- /bao_server/baoctl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import socket 3 | import json 4 | 5 | def __json_bytes(obj): 6 | return (json.dumps(obj) + "\n").encode("UTF-8") 7 | 8 | def __connect(): 9 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 10 | s.connect(("localhost", 9381)) 11 | return s 12 | 13 | def send_model_load(path): 14 | with __connect() as s: 15 | s.sendall(__json_bytes({"type": "load model"})) 16 | s.sendall(__json_bytes({"path": path})) 17 | s.sendall(__json_bytes({"final": True})) 18 | 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser("Bao for PostgreSQL Controller") 22 | parser.add_argument("--load", 23 | metavar="PATH", 24 | help="Load the saved Bao model") 25 | parser.add_argument("--train", 26 | metavar="PATH", 27 | help="Train a Bao model and save it") 28 | parser.add_argument("--retrain", action="store_true", 29 | help="Force the Bao server to train a model and load it") 30 | parser.add_argument("--test-connection", action="store_true", 31 | help="Test the connection from the Bao server to the PostgreSQL instance.") 32 | parser.add_argument("--add-test-query", metavar="PATH", 33 | help="Add the SQL query in the file at PATH to the test query list") 34 | parser.add_argument("--status", action="store_true", 35 | help="Print out information about the Bao server.") 36 | parser.add_argument("--experiment", metavar="SECONDS", type=int, 37 | help="Conduct experiments on test queries for (up to) SECONDS seconds.") 38 | 39 | args = parser.parse_args() 40 | 41 | if args.train: 42 | import train 43 | print("Training Bao model from collected experience") 44 | train.train_and_save_model(args.train) 45 | exit(0) 46 | 47 | if args.load: 48 | import model 49 | print("Attempting 
to load the Bao model...") 50 | reg = model.BaoRegression(have_cache_data=True) 51 | reg.load(args.load) 52 | 53 | print("Model loaded. Sending message to Bao server...") 54 | send_model_load(args.load) 55 | print("Message sent to server.") 56 | exit(0) 57 | 58 | if args.retrain: 59 | import train 60 | from constants import DEFAULT_MODEL_PATH, OLD_MODEL_PATH, TMP_MODEL_PATH 61 | train.train_and_swap(DEFAULT_MODEL_PATH, OLD_MODEL_PATH, TMP_MODEL_PATH, 62 | verbose=True) 63 | send_model_load(DEFAULT_MODEL_PATH) 64 | exit(0) 65 | 66 | if args.test_connection: 67 | from reg_blocker import ExperimentRunner 68 | er = ExperimentRunner() 69 | if er.test_connection(): 70 | print("Connection successful!") 71 | exit(0) 72 | else: 73 | print("Could not connect to PostgreSQL.") 74 | exit(1) 75 | 76 | if args.add_test_query: 77 | from reg_blocker import ExperimentRunner 78 | er = ExperimentRunner() 79 | 80 | with open(args.add_test_query) as f: 81 | sql = f.read() 82 | 83 | er.add_experimental_query(sql) 84 | exit(0) 85 | 86 | if args.experiment: 87 | from reg_blocker import ExperimentRunner 88 | er = ExperimentRunner() 89 | er.explore(args.experiment) 90 | exit(0) 91 | 92 | if args.status: 93 | from reg_blocker import ExperimentRunner 94 | er = ExperimentRunner() 95 | info = er.status() 96 | 97 | max_key_length = max(len(x) for x in info.keys()) 98 | 99 | for k, v in info.items(): 100 | print(k.ljust(max_key_length), ":", v) 101 | 102 | exit(0) 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/src/tutorial/2_bao_setup.md: -------------------------------------------------------------------------------- 1 | # Bao Server Setup 2 | 3 | After configuring the PostgreSQL extension, we'll next set up the Bao server. The Bao server is responsible for loading, maintaining, and serving the learned query optimization model. For now, we'll assume that the Bao server will run on the same machine as PostgreSQL, although this is not required. 4 | 5 | The Bao server is configured with the `BaoForPostgreSQL/bao_server/bao.cfg` configuration file, but the defaults should work fine for this tutorial. Before starting the server, several Python dependencies are required. If you are using the VM, these are already installed. On Arch Linux, they can be installed with: 6 | 7 | ``` 8 | pacman -S python-scikit-learn python-numpy python-joblib python-pytorch-opt 9 | ``` 10 | 11 | If you are using a different Linux distribution, the package names may be slightly different. If you'd prefer, you can also install these dependencies with `pip`: 12 | 13 | ``` 14 | pip3 install scikit-learn numpy joblib 15 | pip3 install torch==1.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html 16 | ``` 17 | 18 | Once these dependencies are installed, we can launch the Bao server. 19 | 20 | ``` 21 | $ cd bao_server 22 | $ python3 main.py 23 | Listening on localhost port 9381 24 | Spawning server process... 25 | ``` 26 | 27 | With the Bao server running, we can now test to see if PostgreSQL can connect to it. If you are not running the Bao server on the same node as PostgreSQL, you'll need to change `ListenAddress` in `bao.cfg` and set the PostgreSQL variables `bao_host` and `bao_port`. 28 | 29 | ``` 30 | # add -h localhost if you are connecting to the VM from your host machine 31 | $ psql -U imdb 32 | psql (12.3) 33 | Type "help" for help. 
34 | 35 | imdb=# SET enable_bao TO on; 36 | SET 37 | imdb=# EXPLAIN SELECT count(*) FROM title; 38 | QUERY PLAN 39 | ------------------------------------------------------------------------------------------ 40 | Bao prediction: NaN 41 | Bao recommended hint: (no hint) 42 | Finalize Aggregate (cost=50166.51..50166.52 rows=1 width=8) 43 | -> Gather (cost=50166.29..50166.50 rows=2 width=8) 44 | Workers Planned: 2 45 | -> Partial Aggregate (cost=49166.29..49166.30 rows=1 width=8) 46 | -> Parallel Seq Scan on title (cost=0.00..46532.63 rows=1053463 width=0) 47 | (7 rows) 48 | ``` 49 | 50 | If everything is set up correctly, you should see two lines in the `EXPLAIN` output related to Bao -- the prediction and the hint. Since Bao currently has no experience, it has no model, so there's no prediction or hint provided. 51 | 52 | Next, we'll test to make sure Bao can correctly record feedback from PostgreSQL query executions. In the same session (with `enable_bao` set to `ON`), execute a query: 53 | 54 | ``` 55 | imdb=# SELECT count(*) FROM title; 56 | count 57 | --------- 58 | 2528312 59 | (1 row) 60 | 61 | ``` 62 | 63 | If you look at the `stdout` of the Bao server, you should see a line like: 64 | ``` 65 | Logged reward of 2103.556027 66 | ``` 67 | 68 | This indicates that Bao has recorded a runtime of roughly 2.1 seconds (rewards are reported in milliseconds) for this simple query plan. Note that Bao does not record your SQL queries, only a scrubbed version of your query plans. Bao keeps the type of each node, the estimated cost and cardinality, and the names of the involved relations. Bao never stores, for example, the predicate values from an executed query. Here's what Bao stores for the above query: 69 | 70 | ```json 71 | { 72 | "Plan": { 73 | "Node Type": "Other", 74 | "Node Type ID": "42", 75 | "Total Cost": 50166.515833, 76 | "Plan Rows": 1, 77 | "Plans": [ 78 | { 79 | "Node Type": "Other", 80 | "Node Type ID": "45", 81 | "Total Cost": 50166.500833, 82 | "Plan Rows": 2, 83 | "Plans": [ 84 | { 85 | "Node Type": "Other", 86 | "Node Type ID": "42", 87 | "Total Cost": 49166.300833, 88 | "Plan Rows": 1, 89 | "Plans": [ 90 | { 91 | "Node Type": "Seq Scan", 92 | "Node Type ID": "19", 93 | "Relation Name": "title", 94 | "Total Cost": 46532.633333, 95 | "Plan Rows": 1053463 96 | } 97 | ] 98 | } 99 | ] 100 | } 101 | ] 102 | }, 103 | "Buffers": { 104 | "title_pkey": 1, 105 | "kind_id_title": 1 106 | } 107 | } 108 | ``` 109 | 110 | -------------------------------------------------------------------------------- /bao_server/storage.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import json 3 | import itertools 4 | 5 | from common import BaoException 6 | 7 | def _bao_db(): 8 | conn = sqlite3.connect("bao.db") 9 | c = conn.cursor() 10 | c.execute(""" 11 | CREATE TABLE IF NOT EXISTS experience ( 12 | id INTEGER PRIMARY KEY, 13 | pg_pid INTEGER, 14 | plan TEXT, 15 | reward REAL 16 | )""") 17 | c.execute(""" 18 | CREATE TABLE IF NOT EXISTS experimental_query ( 19 | id INTEGER PRIMARY KEY, 20 | query TEXT UNIQUE 21 | )""") 22 | c.execute(""" 23 | CREATE TABLE IF NOT EXISTS experience_for_experimental ( 24 | experience_id INTEGER, 25 | experimental_id INTEGER, 26 | arm_idx INTEGER, 27 | FOREIGN KEY (experience_id) REFERENCES experience(id), 28 | FOREIGN KEY (experimental_id) REFERENCES experimental_query(id), 29 | PRIMARY KEY (experience_id, experimental_id, arm_idx) 30 | )""") 31 | conn.commit() 32 | return conn 33 | 34 | def record_reward(plan, reward, pid): 35 | with _bao_db() as conn: 36 | c 
= conn.cursor() 37 | c.execute("INSERT INTO experience (plan, reward, pg_pid) VALUES (?, ?, ?)", 38 | (json.dumps(plan), reward, pid)) 39 | conn.commit() 40 | 41 | print("Logged reward of", reward) 42 | 43 | def last_reward_from_pid(pid): 44 | with _bao_db() as conn: 45 | c = conn.cursor() 46 | c.execute("SELECT id FROM experience WHERE pg_pid = ? ORDER BY id DESC LIMIT 1", 47 | (pid,)) 48 | res = c.fetchall() 49 | if not res: 50 | return None 51 | return res[0][0] 52 | 53 | def experience(): 54 | with _bao_db() as conn: 55 | c = conn.cursor() 56 | c.execute("SELECT plan, reward FROM experience") 57 | return c.fetchall() 58 | 59 | def experiment_experience(): 60 | all_experiment_experience = [] 61 | for res in experiment_results(): 62 | all_experiment_experience.extend( 63 | [(x["plan"], x["reward"]) for x in res] 64 | ) 65 | return all_experiment_experience 66 | 67 | def experience_size(): 68 | with _bao_db() as conn: 69 | c = conn.cursor() 70 | c.execute("SELECT count(*) FROM experience") 71 | return c.fetchone()[0] 72 | 73 | def clear_experience(): 74 | with _bao_db() as conn: 75 | c = conn.cursor() 76 | c.execute("DELETE FROM experience") 77 | conn.commit() 78 | 79 | def record_experimental_query(sql): 80 | try: 81 | with _bao_db() as conn: 82 | c = conn.cursor() 83 | c.execute("INSERT INTO experimental_query (query) VALUES(?)", 84 | (sql,)) 85 | conn.commit() 86 | except sqlite3.IntegrityError as e: 87 | raise BaoException("Could not add experimental query. " 88 | + "Was it already added?") from e 89 | 90 | print("Added new test query.") 91 | 92 | def num_experimental_queries(): 93 | with _bao_db() as conn: 94 | c = conn.cursor() 95 | c.execute("SELECT count(*) FROM experimental_query") 96 | return c.fetchall()[0][0] 97 | 98 | def unexecuted_experiments(): 99 | with _bao_db() as conn: 100 | c = conn.cursor() 101 | c.execute("CREATE TEMP TABLE arms (arm_idx INTEGER)") 102 | c.execute("INSERT INTO arms (arm_idx) VALUES (0),(1),(2),(3),(4)") 103 | 104 | c.execute(""" 105 | SELECT eq.id, eq.query, arms.arm_idx 106 | FROM experimental_query eq, arms 107 | LEFT OUTER JOIN experience_for_experimental efe 108 | ON eq.id = efe.experimental_id AND arms.arm_idx = efe.arm_idx 109 | WHERE efe.experience_id IS NULL 110 | """) 111 | return [{"id": x[0], "query": x[1], "arm": x[2]} 112 | for x in c.fetchall()] 113 | 114 | def experiment_results(): 115 | with _bao_db() as conn: 116 | c = conn.cursor() 117 | c.execute(""" 118 | SELECT eq.id, e.reward, e.plan, efe.arm_idx 119 | FROM experimental_query eq, 120 | experience_for_experimental efe, 121 | experience e 122 | WHERE eq.id = efe.experimental_id AND e.id = efe.experience_id 123 | ORDER BY eq.id, efe.arm_idx; 124 | """) 125 | for eq_id, grp in itertools.groupby(c, key=lambda x: x[0]): 126 | yield ({"reward": x[1], "plan": x[2], "arm": x[3]} for x in grp) 127 | 128 | 129 | def record_experiment(experimental_id, experience_id, arm_idx): 130 | with _bao_db() as conn: 131 | c = conn.cursor() 132 | c.execute(""" 133 | INSERT INTO experience_for_experimental (experience_id, experimental_id, arm_idx) 134 | VALUES (?, ?, ?)""", (experience_id, experimental_id, arm_idx)) 135 | conn.commit() 136 | 137 | 138 | # select eq.id, efe.arm_idx, min(e.reward) from experimental_query eq, experience_for_experimental efe, experience e WHERE eq.id = efe.experimental_id AND e.id = efe.experience_id GROUP BY eq.id; 139 | -------------------------------------------------------------------------------- /bao_server/TreeConvolution/util.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class TreeConvolutionError(Exception): 6 | pass 7 | 8 | def _is_leaf(x, left_child, right_child): 9 | has_left = left_child(x) is not None 10 | has_right = right_child(x) is not None 11 | 12 | if has_left != has_right: 13 | raise TreeConvolutionError( 14 | "All nodes must have both a left and a right child or no children" 15 | ) 16 | 17 | return not has_left 18 | 19 | def _flatten(root, transformer, left_child, right_child): 20 | """ turns a tree into a flattened vector, preorder """ 21 | 22 | if not callable(transformer): 23 | raise TreeConvolutionError( 24 | "Transformer must be a function mapping a tree node to a vector" 25 | ) 26 | 27 | if not callable(left_child) or not callable(right_child): 28 | raise TreeConvolutionError( 29 | "left_child and right_child must be a function mapping a " 30 | + "tree node to its child, or None" 31 | ) 32 | 33 | 34 | accum = [] 35 | 36 | def recurse(x): 37 | if _is_leaf(x, left_child, right_child): 38 | accum.append(transformer(x)) 39 | return 40 | 41 | accum.append(transformer(x)) 42 | recurse(left_child(x)) 43 | recurse(right_child(x)) 44 | 45 | recurse(root) 46 | 47 | try: 48 | accum = [np.zeros(accum[0].shape)] + accum 49 | except: 50 | raise TreeConvolutionError( 51 | "Output of transformer must have a .shape (e.g., numpy array)" 52 | ) 53 | 54 | return np.array(accum) 55 | 56 | def _preorder_indexes(root, left_child, right_child, idx=1): 57 | """ transforms a tree into a tree of preorder indexes """ 58 | 59 | if not callable(left_child) or not callable(right_child): 60 | raise TreeConvolutionError( 61 | "left_child and right_child must be a function mapping a " + 62 | "tree node to its child, or None" 63 | ) 64 | 65 | 66 | if _is_leaf(root, left_child, right_child): 67 | # leaf 68 | return idx 69 | 70 | def rightmost(tree): 71 | if isinstance(tree, tuple): 72 | return rightmost(tree[2]) 73 | return tree 74 | 75 | left_subtree = _preorder_indexes(left_child(root), left_child, right_child, 76 | idx=idx+1) 77 | 78 | max_index_in_left = rightmost(left_subtree) 79 | right_subtree = _preorder_indexes(right_child(root), left_child, right_child, 80 | idx=max_index_in_left + 1) 81 | 82 | return (idx, left_subtree, right_subtree) 83 | 84 | def _tree_conv_indexes(root, left_child, right_child): 85 | """ 86 | Create indexes that, when used as indexes into the output of `flatten`, 87 | create an array such that a stride-3 1D convolution is the same as a 88 | tree convolution. 
89 | """ 90 | 91 | if not callable(left_child) or not callable(right_child): 92 | raise TreeConvolutionError( 93 | "left_child and right_child must be a function mapping a " 94 | + "tree node to its child, or None" 95 | ) 96 | 97 | index_tree = _preorder_indexes(root, left_child, right_child) 98 | 99 | def recurse(root): 100 | if isinstance(root, tuple): 101 | my_id = root[0] 102 | left_id = root[1][0] if isinstance(root[1], tuple) else root[1] 103 | right_id = root[2][0] if isinstance(root[2], tuple) else root[2] 104 | yield [my_id, left_id, right_id] 105 | 106 | yield from recurse(root[1]) 107 | yield from recurse(root[2]) 108 | else: 109 | yield [root, 0, 0] 110 | 111 | return np.array(list(recurse(index_tree))).flatten().reshape(-1, 1) 112 | 113 | def _pad_and_combine(x): 114 | assert len(x) >= 1 115 | assert len(x[0].shape) == 2 116 | 117 | for itm in x: 118 | if itm.dtype == np.dtype("object"): 119 | raise TreeConvolutionError( 120 | "Transformer outputs could not be unified into an array. " 121 | + "Are they all the same size?" 122 | ) 123 | 124 | second_dim = x[0].shape[1] 125 | for itm in x[1:]: 126 | assert itm.shape[1] == second_dim 127 | 128 | max_first_dim = max(arr.shape[0] for arr in x) 129 | 130 | vecs = [] 131 | for arr in x: 132 | padded = np.zeros((max_first_dim, second_dim)) 133 | padded[0:arr.shape[0]] = arr 134 | vecs.append(padded) 135 | 136 | return np.array(vecs) 137 | 138 | def prepare_trees(trees, transformer, left_child, right_child, cuda=False): 139 | flat_trees = [_flatten(x, transformer, left_child, right_child) for x in trees] 140 | flat_trees = _pad_and_combine(flat_trees) 141 | flat_trees = torch.Tensor(flat_trees) 142 | 143 | # flat trees is now batch x max tree nodes x channels 144 | flat_trees = flat_trees.transpose(1, 2) 145 | if cuda: 146 | flat_trees = flat_trees.cuda() 147 | 148 | indexes = [_tree_conv_indexes(x, left_child, right_child) for x in trees] 149 | indexes = _pad_and_combine(indexes) 150 | indexes = torch.Tensor(indexes).long() 151 | 152 | if cuda: 153 | indexes = indexes.cuda() 154 | 155 | return (flat_trees, indexes) 156 | 157 | 158 | -------------------------------------------------------------------------------- /docs/src/tutorial/1_pg_setup.md: -------------------------------------------------------------------------------- 1 | # PostgreSQL Setup 2 | 3 | In this tutorial, we'll use the IMDB dataset from the "How Good are Query Optimizers, Really?"[^howgood] paper. You can quickly set up a PostgreSQL database with this data [via a virtual machine](https://git.io/imdb), or you can download [a PostgreSQL dump from the Harvard dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/2QYZBT) and manually load the data yourself. 4 | 5 | Don't forget to configure PostgreSQL with sane defaults. The [VM setup](https://git.io/imdb) will do this automatically, but at a minimum you should set the `shared_buffers` variable to something larger than the default (around 25% to 40% of your total RAM [is recommended](https://wiki.postgresql.org/wiki/Tuning_Your_PostgreSQL_Server)). 6 | 7 | Assuming you've used the virtual machine, we can test that the DB was set up correctly: 8 | ``` 9 | # add -h localhost if you are connecting to the VM from your host machine 10 | $ psql -U imdb 11 | psql (12.3) 12 | Type "help" for help. 
13 | 14 | imdb=# select count(*) from title; 15 | count 16 | --------- 17 | 2528312 18 | (1 row) 19 | ``` 20 | 21 | ## Install the Bao extension 22 | 23 | With our PostgreSQL database set up, it is time to install Bao. If you are using a virtual machine, the following steps should be performed from within the VM (i.e., type `vagrant ssh` to log in). 24 | 25 | ```bash 26 | $ git clone https://github.com/learnedsystems/BaoForPostgreSQL 27 | $ cd BaoForPostgreSQL 28 | $ ls 29 | bao_server branding COPYING LICENSE pg_extension 30 | README.md run_queries.py sample_queries 31 | ``` 32 | 33 | The directory `pg_extension` contains the code for the PostgreSQL extension. We'll install that next, using the PGXS system. Make sure your machine has the required packages installed. 34 | 35 | For Ubuntu, you need these packages: 36 | ```bash 37 | sudo apt-get install postgresql-server-dev-all postgresql-common 38 | ``` 39 | 40 | For Arch Linux: 41 | ``` 42 | sudo pacman -S postgresql-libs 43 | ``` 44 | 45 | 46 | With the correct packages installed, we can proceed to install the Bao extension. First, turn off PostgreSQL: 47 | 48 | ```bash 49 | systemctl stop postgresql 50 | ``` 51 | 52 | Next, build and install the Bao PostgreSQL extension. 53 | 54 | ```bash 55 | # depending on your setup, this may require sudo 56 | cd pg_extension 57 | make USE_PGXS=1 install 58 | ``` 59 | 60 | If everything goes correctly, you should see output like the following: 61 | 62 | ``` 63 | $ make USE_PGXS=1 install 64 | /usr/bin/clang -Wno-ignored-attributes -fno-strict-aliasing -fwrapv -O2 -I. -I./ -I/usr/include/postgresql/server -I/usr/include/postgresql/internal -D_FORTIFY_SOURCE=2 -D_GNU_SOURCE -I/usr/include/libxml2 -flto=thin -emit-llvm -c -o main.bc main.c 65 | gcc -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Werror=vla -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -Wno-format-truncation -Wno-stringop-truncation -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC -I. -I./ -I/usr/include/postgresql/server -I/usr/include/postgresql/internal -D_FORTIFY_SOURCE=2 -D_GNU_SOURCE -I/usr/include/libxml2 -c -o main.o main.c 66 | gcc -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Werror=vla -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -fexcess-precision=standard -Wno-format-truncation -Wno-stringop-truncation -march=x86-64 -mtune=generic -O2 -pipe -fno-plt -fPIC -shared -o pg_bao.so main.o -L/usr/lib -Wl,-O1,--sort-common,--as-needed,-z,relro,-z,now -L/usr/lib -Wl,--as-needed 67 | /usr/bin/mkdir -p '/usr/lib/postgresql' 68 | /usr/bin/mkdir -p '/usr/share/postgresql/extension' 69 | /usr/bin/mkdir -p '/usr/share/postgresql/extension' 70 | /usr/bin/install -c -m 755 pg_bao.so '/usr/lib/postgresql/pg_bao.so' 71 | /usr/bin/install -c -m 644 .//pg_bao.control '/usr/share/postgresql/extension/' 72 | /usr/bin/install -c -m 644 .//pg_bao--0.0.1.sql '/usr/share/postgresql/extension/' 73 | /usr/bin/mkdir -p '/usr/lib/postgresql/bitcode/pg_bao' 74 | /usr/bin/mkdir -p '/usr/lib/postgresql/bitcode'/pg_bao/ 75 | /usr/bin/install -c -m 644 main.bc '/usr/lib/postgresql/bitcode'/pg_bao/./ 76 | cd '/usr/lib/postgresql/bitcode' && /usr/bin/llvm-lto -thinlto -thinlto-action=thinlink -o pg_bao.index.bc pg_bao/main.bc 77 | ``` 78 | 79 | Now that the Bao extension is installed, we have to tell PostgreSQL to load it. To do this, modify your `postgresql.conf` file to load the `pg_bao` shared library.
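On a stock PostgreSQL install, this means adding (or extending) a line like the one below in `postgresql.conf` (the exact path to the file depends on your distribution). Note that `shared_preload_libraries` takes a comma-separated list, so if your configuration already sets it, append `pg_bao` to the existing list instead of adding a second entry:

```
shared_preload_libraries = 'pg_bao'
```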
If you are using the VM, you can do this like so (as a superuser): 80 | 81 | ```bash 82 | echo "shared_preload_libraries = 'pg_bao'" >> /media/data/pg_data/data/postgresql.conf 83 | ``` 84 | 85 | Next, we restart PostgreSQL: 86 | ```bash 87 | systemctl restart postgresql 88 | ``` 89 | 90 | 91 | We can now reconnect to the database and test to make sure Bao is installed: 92 | ``` 93 | # add -h localhost if you are connecting to the VM from your host machine 94 | $ psql -U imdb 95 | psql (12.3) 96 | Type "help" for help. 97 | 98 | imdb=# SHOW enable_bao; 99 | enable_bao 100 | ------------ 101 | off 102 | (1 row) 103 | 104 | ``` 105 | 106 | If the `enable_bao` setting is present (it defaults to `off`), then the Bao extension is installed. **If instead you see an error like below, then the Bao extension has not been installed:** 107 | 108 | ``` 109 | imdb=# show enable_bao; 110 | ERROR: unrecognized configuration parameter "enable_bao" 111 | ``` 112 | 113 | # Notes 114 | 115 | [^howgood]: Leis, Viktor, Andrey Gubichev, Atanas Mirchev, Peter Boncz, Alfons Kemper, and Thomas Neumann. “How Good Are Query Optimizers, Really?” PVLDB 9, no. 3 (2015): 204–215. https://doi.org/10.14778/2850583.2850594. 116 | -------------------------------------------------------------------------------- /bao_server/main.py: -------------------------------------------------------------------------------- 1 | import socketserver 2 | import json 3 | import struct 4 | import sys 5 | import time 6 | import os 7 | import storage 8 | import model 9 | import train 10 | import baoctl 11 | import math 12 | import reg_blocker 13 | from constants import (PG_OPTIMIZER_INDEX, DEFAULT_MODEL_PATH, 14 | OLD_MODEL_PATH, TMP_MODEL_PATH) 15 | 16 | def add_buffer_info_to_plans(buffer_info, plans): 17 | for p in plans: 18 | p["Buffers"] = buffer_info 19 | return plans 20 | 21 | class BaoModel: 22 | def __init__(self): 23 | self.__current_model = None 24 | 25 | def select_plan(self, messages): 26 | start = time.time() 27 | # the last message is the buffer state 28 | *arms, buffers = messages 29 | 30 | # if we don't have a model, default to the PG optimizer 31 | if self.__current_model is None: 32 | return PG_OPTIMIZER_INDEX 33 | 34 | # if we do have a model, make predictions for each plan. 35 | arms = add_buffer_info_to_plans(buffers, arms) 36 | res = self.__current_model.predict(arms) 37 | idx = res.argmin() 38 | stop = time.time() 39 | print("Selected index", idx, 40 | "after", f"{round((stop - start) * 1000)}ms", 41 | "Predicted reward / PG:", res[idx][0], 42 | "/", res[0][0]) 43 | return idx 44 | 45 | def predict(self, messages): 46 | # the last message is the buffer state 47 | plan, buffers = messages 48 | 49 | # if we don't have a model, make a prediction of NaN 50 | if self.__current_model is None: 51 | return math.nan 52 | 53 | # if we do have a model, make a prediction for the plan.
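        # (we attach the current buffer state to the plan so the featurized
        # input matches the form used when rewards are recorded for training;
        # see the "reward" handler below)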
54 | plans = add_buffer_info_to_plans(buffers, [plan]) 55 | res = self.__current_model.predict(plans) 56 | return res[0][0] 57 | 58 | def load_model(self, fp): 59 | try: 60 | new_model = model.BaoRegression(have_cache_data=True) 61 | new_model.load(fp) 62 | 63 | if reg_blocker.should_replace_model( 64 | self.__current_model, 65 | new_model): 66 | self.__current_model = new_model 67 | print("Accepted new model.") 68 | else: 69 | print("Rejecting load of new model due to regression profile.") 70 | 71 | except Exception as e: 72 | print("Failed to load Bao model from", fp, 73 | "Exception:", sys.exc_info()[0]) 74 | raise e 75 | 76 | 77 | class JSONTCPHandler(socketserver.BaseRequestHandler): 78 | def handle(self): 79 | str_buf = "" 80 | while True: 81 | str_buf += self.request.recv(1024).decode("UTF-8") 82 | if not str_buf: 83 | # no more data, connection is finished. 84 | return 85 | 86 | if (null_loc := str_buf.find("\n")) != -1: 87 | json_msg = str_buf[:null_loc].strip() 88 | str_buf = str_buf[null_loc + 1:] 89 | if json_msg: 90 | try: 91 | if self.handle_json(json.loads(json_msg)): 92 | break 93 | except json.decoder.JSONDecodeError: 94 | print("Error decoding JSON:", json_msg) 95 | break 96 | 97 | 98 | class BaoJSONHandler(JSONTCPHandler): 99 | def setup(self): 100 | self.__messages = [] 101 | 102 | def handle_json(self, data): 103 | if "final" in data: 104 | message_type = self.__messages[0]["type"] 105 | self.__messages = self.__messages[1:] 106 | 107 | if message_type == "query": 108 | result = self.server.bao_model.select_plan(self.__messages) 109 | self.request.sendall(struct.pack("I", result)) 110 | self.request.close() 111 | elif message_type == "predict": 112 | result = self.server.bao_model.predict(self.__messages) 113 | self.request.sendall(struct.pack("d", result)) 114 | self.request.close() 115 | elif message_type == "reward": 116 | plan, buffers, obs_reward = self.__messages 117 | plan = add_buffer_info_to_plans(buffers, [plan])[0] 118 | storage.record_reward(plan, obs_reward["reward"], obs_reward["pid"]) 119 | elif message_type == "load model": 120 | path = self.__messages[0]["path"] 121 | self.server.bao_model.load_model(path) 122 | else: 123 | print("Unknown message type:", message_type) 124 | 125 | return True 126 | 127 | self.__messages.append(data) 128 | return False 129 | 130 | 131 | def start_server(listen_on, port): 132 | model = BaoModel() 133 | 134 | if os.path.exists(DEFAULT_MODEL_PATH): 135 | print("Loading existing model") 136 | model.load_model(DEFAULT_MODEL_PATH) 137 | 138 | socketserver.TCPServer.allow_reuse_address = True 139 | with socketserver.TCPServer((listen_on, port), BaoJSONHandler) as server: 140 | server.bao_model = model 141 | server.serve_forever() 142 | 143 | 144 | if __name__ == "__main__": 145 | from multiprocessing import Process 146 | from config import read_config 147 | 148 | config = read_config() 149 | port = int(config["Port"]) 150 | listen_on = config["ListenOn"] 151 | 152 | print(f"Listening on {listen_on} port {port}") 153 | 154 | server = Process(target=start_server, args=[listen_on, port]) 155 | 156 | print("Spawning server process...") 157 | server.start() 158 | -------------------------------------------------------------------------------- /pg_extension/bao_util.h: -------------------------------------------------------------------------------- 1 | #ifndef BAO_UTIL_H 2 | #define BAO_UTIL_H 3 | 4 | #include 5 | #include 6 | 7 | #include "postgres.h" 8 | #include "optimizer/planner.h" 9 | #include "optimizer/cost.h" 10 | #include
"parser/parsetree.h" 11 | #include "utils/lsyscache.h" 12 | #include "executor/execdesc.h" 13 | 14 | // Utility functions and common structs used throughout Bao. 15 | 16 | // JSON tags for sending to the Bao server. 17 | static const char* START_QUERY_MESSAGE = "{\"type\": \"query\"}\n"; 18 | static const char *START_FEEDBACK_MESSAGE = "{\"type\": \"reward\"}\n"; 19 | static const char* START_PREDICTION_MESSAGE = "{\"type\": \"predict\"}\n"; 20 | static const char* TERMINAL_MESSAGE = "{\"final\": true}\n"; 21 | 22 | 23 | // Bao-specific information associated with a query plan. 24 | typedef struct BaoQueryInfo { 25 | // A JSON representation of the query plan we can send to the Bao server. 26 | char* plan_json; 27 | 28 | // A JSON representation of the buffer state when the query was planned. 29 | char* buffer_json; 30 | } BaoQueryInfo; 31 | 32 | 33 | // A struct containing a PG query plan and the related Bao-specific information. 34 | typedef struct BaoPlan { 35 | BaoQueryInfo* query_info; 36 | 37 | // The PostgreSQL plan. 38 | PlannedStmt* plan; 39 | 40 | // The arm index we used to generate this plan. 41 | unsigned int selection; 42 | } BaoPlan; 43 | 44 | // Free a BaoQueryInfo struct. 45 | static void free_bao_query_info(BaoQueryInfo* info) { 46 | if (!info) return; 47 | if (info->plan_json) free(info->plan_json); 48 | if (info->buffer_json) free(info->buffer_json); 49 | free(info); 50 | } 51 | 52 | // Free a BaoPlan (including the contained BaoQueryInfo). 53 | static void free_bao_plan(BaoPlan* plan) { 54 | if (!plan) return; 55 | if (plan->query_info) free_bao_query_info(plan->query_info); 56 | free(plan); 57 | } 58 | 59 | // Determine if we should report the reward of this query or not. 60 | static bool should_report_reward(QueryDesc* queryDesc) { 61 | // before reporting a reward, check that: 62 | // (1) that the query ID is not zero (query ID is left as 0 for INSERT, UPDATE, etc.) 63 | // (2) that the query actually executed (e.g., was not an EXPLAIN). 64 | // (3) the the instrument_options is zero (e.g., was not an EXPLAIN ANALYZE) 65 | return (queryDesc->plannedstmt->queryId != 0 66 | && queryDesc->already_executed 67 | && queryDesc->instrument_options == 0); 68 | } 69 | 70 | // Determine if we should optimize this query or not. 71 | static bool should_bao_optimize(Query* parse) { 72 | Oid relid; 73 | char* namespace; 74 | 75 | // Don't try and optimize anything that isn't a SELECT query. 76 | if (parse->commandType != CMD_SELECT) return false; 77 | 78 | // Iterate over all the relations in this query. 79 | for (int i = 0; i < list_length(parse->rtable); i++) { 80 | relid = rt_fetch(i, parse->rtable)->relid; 81 | // A relid of zero seems to have a special meaning, and it causes 82 | // get_rel_namespace or get_namespace_name to crash. Relid of zero 83 | // doesn't seem to appear in "normal" queries though. 84 | if (!relid) return false; 85 | 86 | // Ignore queries that involve the pg_catalog (internal data used by PostgreSQL). 87 | namespace = get_namespace_name(get_rel_namespace(relid)); 88 | if (strcmp(namespace, "pg_catalog") == 0) return false; 89 | } 90 | 91 | return true; 92 | 93 | } 94 | 95 | 96 | // https://stackoverflow.com/a/4770992/1464282 97 | static bool starts_with(const char *str, const char *pre) { 98 | return strncmp(pre, str, strlen(pre)) == 0; 99 | } 100 | 101 | // Create a JSON object containing the reward, suitable to send to the Bao 102 | // server. 
103 | static char* reward_json(double reward) { 104 | char* buf; 105 | size_t json_size; 106 | FILE* stream; 107 | pid_t pid = getpid(); 108 | 109 | stream = open_memstream(&buf, &json_size); 110 | 111 | fprintf(stream, "{\"reward\": %f , \"pid\": %d }\n", reward, pid); 112 | fclose(stream); 113 | 114 | return buf; 115 | 116 | } 117 | 118 | // Write the entire string to the given socket. 119 | static void write_all_to_socket(int conn_fd, const char* json) { 120 | size_t json_length; 121 | ssize_t written, written_total; 122 | json_length = strlen(json); 123 | written_total = 0; 124 | 125 | while (written_total != json_length) { 126 | written = write(conn_fd, 127 | json + written_total, 128 | json_length - written_total); 129 | written_total += written; 130 | } 131 | } 132 | 133 | // Connect to the Bao server. 134 | static int connect_to_bao(const char* host, int port) { 135 | int ret, conn_fd; 136 | struct sockaddr_in server_addr = { 0 }; 137 | 138 | server_addr.sin_family = AF_INET; 139 | server_addr.sin_port = htons(port); 140 | inet_pton(AF_INET, host, &server_addr.sin_addr); 141 | conn_fd = socket(AF_INET, SOCK_STREAM, 0); 142 | if (conn_fd < 0) { 143 | return conn_fd; 144 | } 145 | 146 | ret = connect(conn_fd, (struct sockaddr*)&server_addr, sizeof(server_addr)); 147 | if (ret == -1) { 148 | return ret; 149 | } 150 | 151 | return conn_fd; 152 | 153 | } 154 | 155 | // Get the relation name of a particular plan node with a PostgreSQL 156 | // PlannedStmt. 157 | static char* get_relation_name(PlannedStmt* stmt, Plan* node) { 158 | Index rti; 159 | 160 | switch (node->type) { 161 | case T_SeqScan: 162 | case T_SampleScan: 163 | case T_IndexScan: 164 | case T_IndexOnlyScan: 165 | case T_BitmapHeapScan: 166 | case T_BitmapIndexScan: 167 | case T_TidScan: 168 | case T_ForeignScan: 169 | case T_CustomScan: 170 | case T_ModifyTable: 171 | rti = ((Scan*)node)->scanrelid; 172 | return get_rel_name(rt_fetch(rti, stmt->rtable)->relid); 173 | break; 174 | default: 175 | return NULL; 176 | } 177 | } 178 | 179 | 180 | #endif 181 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Bao loves PostgreSQL](https://github.com/LearnedSystems/BaoForPostgreSQL/blob/master/branding/bao_loves_pg.svg) 2 | 3 | This is a prototype implementation of Bao for PostgreSQL. Bao is a learned query optimizer that learns to "steer" the PostgreSQL optimizer by issuing coarse-grained query hints. For more information about Bao, [check out the paper](https://rm.cab/bao). 4 | 5 | Documentation, including a tutorial, is available here: https://rmarcus.info/bao_docs/ 6 | 7 | While this repository contains working prototype implementations of many of the pieces required to build a production-ready learned query optimizer, this code itself should not be used in production in its current form. Notable limitations include: 8 | 9 | * The reward function is currently restricted to being a user-supplied value or the query latency in wall time. Thus, results may be inconsistent with high degrees of parallelism. 10 | * The Bao server component does not perform any level of authentication or encryption. Do not run it on a machine directly accessible from an untrusted network. 11 | * The code has not been audited for security issues. Since the PostgreSQL integration is written using the C hooks system, there are almost certainly issues. 12 | 13 | This software is available under the AGPLv3 license. 
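To give a concrete sense of what "steering" with coarse-grained hints looks like, here is a minimal illustrative `psql` session (the `enable_bao` setting and the extra `EXPLAIN` lines are real features of the extension, but the numbers shown are examples taken from the tutorial and will differ on your system):

```
imdb=# SET enable_bao TO on;
SET
imdb=# EXPLAIN SELECT count(*) FROM title;
                 QUERY PLAN
--------------------------------------------
 Bao prediction: 894.349 ms
 Bao recommended hint: (no hint)
 ...
```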
14 | 15 | ## FAQ 16 | 17 | ### Bao seems to take a long time to optimize queries. Why is optimization time so high? 18 | 19 | For simplicity, this prototype plans each hint (arm) sequentially. That means that if you are using 5 arms, Bao calls the PostgreSQL query planner 5 times per query. This work is in-theory embarrassingly parallel, but this prototype does not use parallelism. 20 | 21 | To compensate for this in a rough way, you can measure the maximum time it takes to plan any arm, then pretend that time is how long the entire planning process took (i.e., perfect parallelism). Obviously, this will underestimate the planning time. *If you want true measurements of Bao's planning time, you'll need to implement parallel planning.* 22 | 23 | Note that parallel planning is not normally needed to get good performance. Each call to the planner typically takes 50-200ms. So if a query plan takes multiple minutes to execute, the additional time from planning will be inconsequential. However, *if you are optimizing shorter queries, this may not be the case.* 24 | 25 | For more information, see "Query optimization time" in Section 6.2 of the Bao paper. 26 | 27 | ### Bao isn't improving query performance for me, what's wrong? 28 | 29 | The two most common reasons for poor performance with Bao are: 30 | 31 | 1. **An untuned set of hints/arms**. By default, this prototype uses 5 arms that we found to be optimal for a GCP N1-4 VM. Since the IMDb dataset is much smaller than the average analytics dataset, we intentionally chose a small VM. *If you test on different hardware, you need to choose a different set of arms.* The easiest way to select a good set of arms is with manual testing: run all of your queries with all possible arms, then pick the set of arms that has the potential to improve performance the most. See Section 6.3 of the Bao paper for more information. 32 | 33 | 2. **Training with too little data**. While we think Bao's neural network model is much more sample efficient than prior work (e.g., Neo), Bao is still relatively "data hungry." You will need to train on, at minimum, hundreds of query executions in order to get reasonable results. Note that since this prototype uses a sliding-window-based approach (Section 3.2 of the Bao paper), Bao will only retrain its model every 25 queries. This means that if you execute 50 queries, the first 25 will be assigned the default optimizer plan, and the second 25 will use the Bao model trained on the first 25 queries. *Thus, for the last query executed, you are evaluating Bao with 25, not 49, training examples*. 34 | 35 | ### How can I test Bao on my own training / test splits? 36 | 37 | The core learning algorithm in Bao is a reinforcement learning based approach. The usual "training set" and "testing set" terminology do not typically apply: Bao is designed to continually learn from a stream of queries. In the paper, we describe this as "cross validation over time," where Bao makes a decision at time `t` based only on data from time `t-1`. This is technically true, but might not be the most intuitive way to think about how reinforcement learning works. 38 | 39 | Since Bao is not magic, it cannot extrapolate to totally novel and unseen queries. As an extreme example, consider a database with four tables `a`, `b`, `c`, and `d`. If Bao is "trained" on queries over `a` and `b`, and then "tested" on queries over `c` and `d`, performance will be poor! 
Bao takes advantage of the fact that all the queries issued up to time `t` *give you information about the query at time `t+1`*. If you engineer a scenario where this is not the case, Bao will unsurprisingly fail. 40 | 41 | Thus, if you want to test Bao on your own workload, we suggest putting your queries into a random order and running Bao as intended. To increase robustness, you can measure performance across multiple random orderings. If you don't have enough queries in your workload, you can either (a) add noise to your queries to create new ones, or (b) "loop" the workload by repeating each query 1 to 3 times (note that if you repeat each query too many times, Bao might have the opportunity to test out every single hint!). 42 | 43 | ## Other work 44 | 45 | A non-exhaustive list of extensions and applications of Bao, both from us and from others: 46 | 47 | * Microsoft published two papers describing how they built a Bao-like system into the SCOPE analytics system: [paper 1](https://dl.acm.org/doi/10.1145/3448016.3457568) [paper 2](https://dl.acm.org/doi/10.1145/3514221.3526052) 48 | * Woltmann et al. published [FASTgres](https://dl.acm.org/doi/10.14778/3611479.3611528), which combines clustering and supervised learning to train hint selection models in an offline fashion. 49 | * Anneser et al. published [AutoSteer](https://dl.acm.org/doi/10.14778/3611540.3611544), which shows how Meta built a Bao-like system for their dashboarding analytics system. 50 | * Yi et al. published [LimeQO](https://doi.org/10.1145/3663742.3663974), which learns ideal hints for an entire query workload at once, in an offline fashion. 51 | 52 | Feel free to open PRs or contact us to add more! -------------------------------------------------------------------------------- /bao_server/model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import torch 4 | import torch.optim 5 | import joblib 6 | import os 7 | from sklearn import preprocessing 8 | from sklearn.pipeline import Pipeline 9 | 10 | from torch.utils.data import DataLoader 11 | import net 12 | from featurize import TreeFeaturizer 13 | 14 | CUDA = torch.cuda.is_available() 15 | 16 | def _nn_path(base): 17 | return os.path.join(base, "nn_weights") 18 | 19 | def _x_transform_path(base): 20 | return os.path.join(base, "x_transform") 21 | 22 | def _y_transform_path(base): 23 | return os.path.join(base, "y_transform") 24 | 25 | def _channels_path(base): 26 | return os.path.join(base, "channels") 27 | 28 | def _n_path(base): 29 | return os.path.join(base, "n") 30 | 31 | 32 | def _inv_log1p(x): 33 | return np.exp(x) - 1 34 | 35 | class BaoData: 36 | def __init__(self, data): 37 | assert data 38 | self.__data = data 39 | 40 | def __len__(self): 41 | return len(self.__data) 42 | 43 | def __getitem__(self, idx): 44 | return (self.__data[idx]["tree"], 45 | self.__data[idx]["target"]) 46 | 47 | def collate(x): 48 | trees = [] 49 | targets = [] 50 | 51 | for tree, target in x: 52 | trees.append(tree) 53 | targets.append(target) 54 | 55 | targets = torch.tensor(targets) 56 | return trees, targets 57 | 58 | class BaoRegression: 59 | def __init__(self, verbose=False, have_cache_data=False): 60 | self.__net = None 61 | self.__verbose = verbose 62 | 63 | log_transformer = preprocessing.FunctionTransformer( 64 | np.log1p, _inv_log1p, 65 | validate=True) 66 | scale_transformer = preprocessing.MinMaxScaler() 67 | 68 | self.__pipeline = Pipeline([("log", log_transformer), 69 | ("scale",
scale_transformer)]) 70 | 71 | self.__tree_transform = TreeFeaturizer() 72 | self.__have_cache_data = have_cache_data 73 | self.__in_channels = None 74 | self.__n = 0 75 | 76 | def __log(self, *args): 77 | if self.__verbose: 78 | print(*args) 79 | 80 | def num_items_trained_on(self): 81 | return self.__n 82 | 83 | def load(self, path): 84 | with open(_n_path(path), "rb") as f: 85 | self.__n = joblib.load(f) 86 | with open(_channels_path(path), "rb") as f: 87 | self.__in_channels = joblib.load(f) 88 | 89 | self.__net = net.BaoNet(self.__in_channels) 90 | self.__net.load_state_dict(torch.load(_nn_path(path))) 91 | self.__net.eval() 92 | 93 | with open(_y_transform_path(path), "rb") as f: 94 | self.__pipeline = joblib.load(f) 95 | with open(_x_transform_path(path), "rb") as f: 96 | self.__tree_transform = joblib.load(f) 97 | 98 | def save(self, path): 99 | # try to create a directory here 100 | os.makedirs(path, exist_ok=True) 101 | 102 | torch.save(self.__net.state_dict(), _nn_path(path)) 103 | with open(_y_transform_path(path), "wb") as f: 104 | joblib.dump(self.__pipeline, f) 105 | with open(_x_transform_path(path), "wb") as f: 106 | joblib.dump(self.__tree_transform, f) 107 | with open(_channels_path(path), "wb") as f: 108 | joblib.dump(self.__in_channels, f) 109 | with open(_n_path(path), "wb") as f: 110 | joblib.dump(self.__n, f) 111 | 112 | def fit(self, X, y): 113 | if isinstance(y, list): 114 | y = np.array(y) 115 | 116 | X = [json.loads(x) if isinstance(x, str) else x for x in X] 117 | self.__n = len(X) 118 | 119 | # transform the set of trees into feature vectors using a log 120 | # (assuming the tail behavior exists, TODO investigate 121 | # the quantile transformer from scikit) 122 | y = self.__pipeline.fit_transform(y.reshape(-1, 1)).astype(np.float32) 123 | 124 | self.__tree_transform.fit(X) 125 | X = self.__tree_transform.transform(X) 126 | 127 | pairs = list(zip(X, y)) 128 | dataset = DataLoader(pairs, 129 | batch_size=16, 130 | shuffle=True, 131 | collate_fn=collate) 132 | 133 | # determine the initial number of channels 134 | for inp, _tar in dataset: 135 | in_channels = inp[0][0].shape[0] 136 | break 137 | 138 | self.__log("Initial input channels:", in_channels) 139 | 140 | if self.__have_cache_data: 141 | assert in_channels == self.__tree_transform.num_operators() + 3 142 | else: 143 | assert in_channels == self.__tree_transform.num_operators() + 2 144 | 145 | self.__net = net.BaoNet(in_channels) 146 | self.__in_channels = in_channels 147 | if CUDA: 148 | self.__net = self.__net.cuda() 149 | 150 | optimizer = torch.optim.Adam(self.__net.parameters()) 151 | loss_fn = torch.nn.MSELoss() 152 | 153 | losses = [] 154 | for epoch in range(100): 155 | loss_accum = 0 156 | for x, y in dataset: 157 | if CUDA: 158 | y = y.cuda() 159 | y_pred = self.__net(x) 160 | loss = loss_fn(y_pred, y) 161 | loss_accum += loss.item() 162 | 163 | optimizer.zero_grad() 164 | loss.backward() 165 | optimizer.step() 166 | 167 | loss_accum /= len(dataset) 168 | losses.append(loss_accum) 169 | if epoch % 15 == 0: 170 | self.__log("Epoch", epoch, "training loss:", loss_accum) 171 | 172 | # stopping condition 173 | if len(losses) > 10 and losses[-1] < 0.1: 174 | last_two = np.min(losses[-2:]) 175 | if last_two > losses[-10] or (losses[-10] - last_two < 0.0001): 176 | self.__log("Stopped training from convergence condition at epoch", epoch) 177 | break 178 | else: 179 | self.__log("Stopped training after max epochs") 180 | 181 | def predict(self, X): 182 | if not isinstance(X, list): 183 | X = [X] 184 | X 
= [json.loads(x) if isinstance(x, str) else x for x in X] 185 | 186 | X = self.__tree_transform.transform(X) 187 | 188 | self.__net.eval() 189 | pred = self.__net(X).cpu().detach().numpy() 190 | return self.__pipeline.inverse_transform(pred) 191 | 192 | -------------------------------------------------------------------------------- /docs/src/tutorial/4_exploration_mode.md: -------------------------------------------------------------------------------- 1 | # Use Exploration Mode 2 | 3 | Bao is powered by reinforcement learning, and must therefore strike a balance between the *exploration* of new plans and the *exploitation* of plans that are known to be good. If you do no exploration, you'll never do any better than PostgreSQL -- if you do too much exploration, you'll suffer many query regressions. 4 | 5 | Bao uses an algorithm called [Thompson sampling](https://en.wikipedia.org/wiki/Thompson_sampling), which has a theoretical guarantee about *regret*. Regret is the difference in performance between the plan chosen by Bao and the (unknown) best possible choice. Thompson sampling ensures that, in the long run, regret approaches zero.[^limit] This is considered to be an optimal balance of exploration and exploitation. 6 | 7 | However, sometimes "in the long run" isn't good enough for an application. For example, you may need to ensure that a particular query *never* regresses. In order to do this, Bao supports *exploration mode*, a special facility that lets Bao explore plans for specific queries offline, and then ensure the best plan is *always* chosen at runtime. 8 | 9 | At a high level, exploration mode works as follows. You tell Bao about a particular SQL query that you never want to regress. Then, you give Bao a fixed period of time -- like 20 minutes -- to run experiments against your database. Bao uses this time (and no more) to run as many experiments as it can, saving the results. When a new model is trained, that model is checked to make sure it would make the right decision for each executed experiment. If the model would not, the model is retrained, with increased emphasis on those experiments. 10 | 11 | We'll start from scratch, removing our previous Bao model and the experience we observed. Stop the Bao server (i.e., Control + C), then delete the model, Bao DB, and restart PostgreSQL: 12 | 13 | ``` 14 | $ rm -rf bao_server/bao_default_model 15 | $ rm bao_server/bao.db 16 | $ systemctl restart postgresql 17 | ``` 18 | 19 | ## Configure Bao to talk to PostgreSQL 20 | 21 | Until now, the PostgreSQL extension has communicated with the Bao server, but the Bao server has never directly connected to the database. For exploration mode, we'll need such a connection. Edit the `bao_server/bao.cfg` file to tell Bao how to connect to your PostgreSQL instance: 22 | 23 | ```ini 24 | # ============================================================== 25 | # EXPLORATION MODE SETTINGS 26 | # ============================================================== 27 | 28 | # maximum time a query should reasonably take (used in 29 | # exploration mode). 30 | MaxQueryTimeSeconds = 120 31 | 32 | # psycopg2 / JDBC connection string to access PostgreSQL 33 | # (used by the experiment runner to prevent regressions) 34 | PostgreSQLConnectString = user=imdb 35 | 36 | ``` 37 | 38 | * `MaxQueryTimeSeconds` is an upper-bound on how long any non-regressed query plan you add to exploration mode should take. For this sample workload, 120 seconds was a reasonable value. 
Bao uses this value as a cutoff for its experiments, assuming any plan that takes longer than this amount of time must be a regression. Don't set this value too tightly, however, because Bao can still gain knowledge from observing *how much* a query plan regressed. 39 | * `PostgreSQLConnectString` is the JDBC-like string used by the Bao server to connect to the PostgreSQL database. You can find documentation for it [from the psycopg docs](https://www.psycopg.org/docs/module.html#psycopg2.connect). 40 | 41 | 42 | ### Testing the connection 43 | 44 | To test the connection, we can use the `baoctl.py` script. You can find this script in the `bao_server` directory. It should be executed on the same machine that the Bao server is running on. 45 | 46 | Run `baoctl.py --test-connection` to see if Bao can connect to your PostgreSQL instance: 47 | 48 | ``` 49 | $ python3 baoctl.py --test-connection 50 | Connection successful! 51 | ``` 52 | 53 | ## Adding exploration queries 54 | 55 | Once Bao can connect to our PostgreSQL instance, we can add exploration queries. Let's add the first three workload queries: 56 | 57 | ``` 58 | $ python3 baoctl.py --add-test-query ../sample_queries/q1_8a463.sql 59 | Added new test query. 60 | $ python3 baoctl.py --add-test-query ../sample_queries/q2_8a82.sql 61 | Added new test query. 62 | $ python3 baoctl.py --add-test-query ../sample_queries/q3_7a99.sql 63 | Added new test query. 64 | ``` 65 | 66 | ## Start exploration 67 | 68 | You can give Bao as much or as little time to execute experiments as you'd like. It is a good idea to allow at least a little more time than `MaxQueryTimeSeconds`, to ensure that at least one experiment can be fully executed. 69 | 70 | Each query added creates 5 experiments, so we currently have 15 unexecuted experiments. We can see this by running `baoctl.py --status`: 71 | 72 | ``` 73 | $ python3 baoctl.py --status 74 | Unexecuted experiments : 15 75 | Completed experiments : 0 76 | Exploration queries : 3 77 | ``` 78 | 79 | We can start executing these 15 experiments by running `baoctl.py --experiment`. We'll give Bao 30 minutes to run these experiments so that they can all be finished: 80 | 81 | ``` 82 | $ python3 baoctl.py --experiment 1800 83 | We have 15 unexecuted experiment(s). 84 | Running on backend PID 57718 85 | ... 86 | Finished all experiments 87 | ``` 88 | 89 | 90 | Note: sometimes, a particular configuration will cause the PostgreSQL session to crash. When this is the case, PostgreSQL automatically restarts / recovers. However, since this can be an expensive operation, Bao ends any experimentation when this occurs. Bao notes when a configuration causes such a crash, and will not execute it again. If this occurs during the tutorial, restart the exploration process until every experiment finishes. When this happens, the output looks like this: 91 | 92 | ``` 93 | Time remaining: 1539696 ms 94 | Server down after experiment with arm 1 95 | Treating this as a timeout and ceasing further experiments. 96 | Logged reward of 240000 97 | Finished all experiments 98 | ``` 99 | 100 | Once these experiments finish, we can re-run the entire workload and analyze the performance: 101 | 102 | 103 | ``` 104 | $ python3 run_queries.py sample_queries/*.sql | tee ~/bao_with_regblock.txt 105 | ``` 106 | 107 | Grab yet another coffee. This one will be faster, but will still take some time. 108 | 109 | 110 | # Notes 111 | 112 | [^limit]: Precisely, the guarantee is that regret tends towards zero in the limit as time tends to infinity. 
This guarantee makes several assumptions about the underlying reward function, which do not strictly apply to query optimization. Nevertheless, Thompson sampling appears to be an effective algorithm in this domain. 113 | -------------------------------------------------------------------------------- /bao_server/featurize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | JOIN_TYPES = ["Nested Loop", "Hash Join", "Merge Join"] 4 | LEAF_TYPES = ["Seq Scan", "Index Scan", "Index Only Scan", "Bitmap Index Scan"] 5 | ALL_TYPES = JOIN_TYPES + LEAF_TYPES 6 | 7 | 8 | class TreeBuilderError(Exception): 9 | def __init__(self, msg): 10 | super().__init__(msg) 11 | self.__msg = msg 12 | 13 | def is_join(node): 14 | return node["Node Type"] in JOIN_TYPES 15 | 16 | def is_scan(node): 17 | return node["Node Type"] in LEAF_TYPES 18 | 19 | class TreeBuilder: 20 | def __init__(self, stats_extractor, relations): 21 | self.__stats = stats_extractor 22 | self.__relations = sorted(relations, key=lambda x: len(x), reverse=True) 23 | 24 | def __relation_name(self, node): 25 | if "Relation Name" in node: 26 | return node["Relation Name"] 27 | 28 | if node["Node Type"] == "Bitmap Index Scan": 29 | # find the first (longest) relation name that appears in the index name 30 | name_key = "Index Name" if "Index Name" in node else "Relation Name" 31 | if name_key not in node: 32 | print(node) 33 | raise TreeBuilderError("Bitmap operator did not have an index name or a relation name") 34 | for rel in self.__relations: 35 | if rel in node[name_key]: 36 | return rel 37 | 38 | raise TreeBuilderError("Could not find relation name for bitmap index scan") 39 | 40 | raise TreeBuilderError("Cannot extract relation type from node") 41 | 42 | def __featurize_join(self, node): 43 | assert is_join(node) 44 | arr = np.zeros(len(ALL_TYPES)) 45 | arr[ALL_TYPES.index(node["Node Type"])] = 1 46 | return np.concatenate((arr, self.__stats(node))) 47 | 48 | def __featurize_scan(self, node): 49 | assert is_scan(node) 50 | arr = np.zeros(len(ALL_TYPES)) 51 | arr[ALL_TYPES.index(node["Node Type"])] = 1 52 | return (np.concatenate((arr, self.__stats(node))), 53 | self.__relation_name(node)) 54 | 55 | def plan_to_feature_tree(self, plan): 56 | children = plan["Plans"] if "Plans" in plan else [] 57 | 58 | if len(children) == 1: 59 | return self.plan_to_feature_tree(children[0]) 60 | 61 | if is_join(plan): 62 | assert len(children) == 2 63 | my_vec = self.__featurize_join(plan) 64 | left = self.plan_to_feature_tree(children[0]) 65 | right = self.plan_to_feature_tree(children[1]) 66 | return (my_vec, left, right) 67 | 68 | if is_scan(plan): 69 | assert not children 70 | return self.__featurize_scan(plan) 71 | 72 | raise TreeBuilderError("Node wasn't transparent, a join, or a scan: " + str(plan)) 73 | 74 | def norm(x, lo, hi): 75 | return (np.log(x + 1) - lo) / (hi - lo) 76 | 77 | def get_buffer_count_for_leaf(leaf, buffers): 78 | total = 0 79 | if "Relation Name" in leaf: 80 | total += buffers.get(leaf["Relation Name"], 0) 81 | 82 | if "Index Name" in leaf: 83 | total += buffers.get(leaf["Index Name"], 0) 84 | 85 | return total 86 | 87 | class StatExtractor: 88 | def __init__(self, fields, mins, maxs): 89 | self.__fields = fields 90 | self.__mins = mins 91 | self.__maxs = maxs 92 | 93 | def __call__(self, inp): 94 | res = [] 95 | for f, lo, hi in zip(self.__fields, self.__mins, self.__maxs): 96 | if f not in inp: 97 | res.append(0) 98 | else: 99 | res.append(norm(inp[f], lo, hi)) 100 | return res 101 | 102 | def
get_plan_stats(data): 102 | costs = [] 103 | rows = [] 104 | bufs = [] 105 | 106 | def recurse(n, buffers=None): 107 | costs.append(n["Total Cost"]) 108 | rows.append(n["Plan Rows"]) 109 | if "Buffers" in n: 110 | bufs.append(n["Buffers"]) 111 | 112 | if "Plans" in n: 113 | for child in n["Plans"]: 114 | recurse(child) 115 | 116 | for plan in data: 117 | recurse(plan["Plan"], buffers=plan.get("Buffers", None)) 118 | 119 | costs = np.array(costs) 120 | rows = np.array(rows) 121 | bufs = np.array(bufs) 122 | 123 | costs = np.log(costs + 1) 124 | rows = np.log(rows + 1) 125 | bufs = np.log(bufs + 1) 126 | 127 | costs_min = np.min(costs) 128 | costs_max = np.max(costs) 129 | rows_min = np.min(rows) 130 | rows_max = np.max(rows) 131 | bufs_min = np.min(bufs) if len(bufs) != 0 else 0 132 | bufs_max = np.max(bufs) if len(bufs) != 0 else 0 133 | 134 | if len(bufs) != 0: 135 | return StatExtractor( 136 | ["Buffers", "Total Cost", "Plan Rows"], 137 | [bufs_min, costs_min, rows_min], 138 | [bufs_max, costs_max, rows_max] 139 | ) 140 | else: 141 | return StatExtractor( 142 | ["Total Cost", "Plan Rows"], 143 | [costs_min, rows_min], 144 | [costs_max, rows_max] 145 | ) 146 | 147 | 148 | def get_all_relations(data): 149 | all_rels = [] 150 | 151 | def recurse(plan): 152 | if "Relation Name" in plan: 153 | yield plan["Relation Name"] 154 | 155 | if "Plans" in plan: 156 | for child in plan["Plans"]: 157 | yield from recurse(child) 158 | 159 | for plan in data: 160 | all_rels.extend(list(recurse(plan["Plan"]))) 161 | 162 | return set(all_rels) 163 | 164 | def get_featurized_trees(data): 165 | all_rels = get_all_relations(data) 166 | stats_extractor = get_plan_stats(data) 167 | 168 | t = TreeBuilder(stats_extractor, all_rels) 169 | trees = [] 170 | 171 | for plan in data: 172 | tree = t.plan_to_feature_tree(plan) 173 | trees.append(tree) 174 | 175 | return trees 176 | 177 | def _attach_buf_data(tree): 178 | if "Buffers" not in tree: 179 | return 180 | 181 | buffers = tree["Buffers"] 182 | 183 | def recurse(n): 184 | if "Plans" in n: 185 | for child in n["Plans"]: 186 | recurse(child) 187 | return 188 | 189 | # it is a leaf 190 | n["Buffers"] = get_buffer_count_for_leaf(n, buffers) 191 | 192 | recurse(tree["Plan"]) 193 | 194 | class TreeFeaturizer: 195 | def __init__(self): 196 | self.__tree_builder = None 197 | 198 | def fit(self, trees): 199 | for t in trees: 200 | _attach_buf_data(t) 201 | all_rels = get_all_relations(trees) 202 | stats_extractor = get_plan_stats(trees) 203 | self.__tree_builder = TreeBuilder(stats_extractor, all_rels) 204 | 205 | def transform(self, trees): 206 | for t in trees: 207 | _attach_buf_data(t) 208 | return [self.__tree_builder.plan_to_feature_tree(x["Plan"]) for x in trees] 209 | 210 | def num_operators(self): 211 | return len(ALL_TYPES) 212 | -------------------------------------------------------------------------------- /docs/src/tutorial/5_advisor.md: -------------------------------------------------------------------------------- 1 | # Bao as an advisor 2 | 3 | For some applications, any amount of exploration -- and thus regression -- on any query is unacceptable. In these scenarios, it is possible to use Bao as an advisor instead of a full-blown optimizer. 4 | 5 | To demonstrate this, let's use a simple `psql` session. 6 | 7 | ``` 8 | $ psql -U imdb -h localhost 9 | psql (12.2, server 12.3) 10 | Type "help" for help. 11 | 12 | imdb=# 13 | ``` 14 | 15 | Bao can be controlled through three main session-level PostgreSQL configs. 
16 | 17 | * `enable_bao` is the top-level config. When set to `off`, the default, Bao does not observe or interfere with the query optimizer at all. When set to `on`, Bao will behave according to the values of `enable_bao_rewards` and `enable_bao_selection`. 18 | * `enable_bao_rewards` determines whether or not Bao collects additional experience from queries from this session. 19 | * `enable_bao_selection` determines whether or not Bao will use its value model to select query plans. 20 | 21 | To use Bao as a pure advisor, we can set `enable_bao` but disable `enable_bao_rewards` and `enable_bao_selection`. 22 | 23 | ``` 24 | imdb=# SET enable_bao TO on; 25 | SET 26 | imdb=# SET enable_bao_selection TO off; 27 | SET 28 | imdb=# SET enable_bao_rewards TO off; 29 | SET 30 | ``` 31 | 32 | Next, we'll execute a simple `EXPLAIN` statement: 33 | 34 | ``` 35 | imdb=# EXPLAIN SELECT count(*) FROM title; 36 | QUERY PLAN 37 | ------------------------------------------------------------------------------------------ 38 | Bao prediction: 894.349 ms 39 | Bao recommended hint: (no hint) 40 | Finalize Aggregate (cost=50165.40..50165.41 rows=1 width=8) 41 | -> Gather (cost=50165.19..50165.40 rows=2 width=8) 42 | Workers Planned: 2 43 | -> Partial Aggregate (cost=49165.19..49165.20 rows=1 width=8) 44 | -> Parallel Seq Scan on title (cost=0.00..46531.75 rows=1053375 width=0) 45 | (7 rows) 46 | 47 | imdb=# 48 | ``` 49 | 50 | Since `enable_bao_selection` is off, this plan is generated using the PostgreSQL optimizer, exactly as it would be if you were not using Bao. Two additional lines are added to the output of the `EXPLAIN` plan: 51 | 52 | * **Bao prediction** shows the time that Bao thinks this query plan will take to execute. In this case, about a second. 53 | * **Bao recommended hint** shows the query hint that Bao *would* use if `enable_bao_selection` were on. In this case, Bao would not use any query hints. 54 | 55 | Let's execute the query and run the same `EXPLAIN` statement again: 56 | 57 | ``` 58 | imdb=# SELECT count(*) FROM title; 59 | count 60 | --------- 61 | 2528312 62 | (1 row) 63 | 64 | imdb=# EXPLAIN SELECT count(*) FROM title; 65 | QUERY PLAN 66 | ------------------------------------------------------------------------------------------ 67 | Bao prediction: 661.193 ms 68 | Bao recommended hint: (no hint) 69 | Finalize Aggregate (cost=50165.40..50165.41 rows=1 width=8) 70 | -> Gather (cost=50165.19..50165.40 rows=2 width=8) 71 | Workers Planned: 2 72 | -> Partial Aggregate (cost=49165.19..49165.20 rows=1 width=8) 73 | -> Parallel Seq Scan on title (cost=0.00..46531.75 rows=1053375 width=0) 74 | (7 rows) 75 | ``` 76 | 77 | Bao's prediction for the query changed, even though the query plan, cost estimates, and cardinality estimates are all the same as before! This is because the buffer pool has changed state: after the execution of the `SELECT` query, more data relevant to this query has been cached, so Bao predicts that it will execute faster. 78 | 79 | Of course, predictions are exactly that -- predictions. While Bao's value model should get better over time, these predictions should be used as *advice*, and come with no error bounds whatsoever (although Bao will never predict a negative query runtime). 80 | 81 | Let's look at `q1` from our sample workload. You can copy and paste the below statement to see the `EXPLAIN` output. 
82 | 83 | ```sql 84 | EXPLAIN SELECT COUNT(*) FROM title as t, kind_type as kt, info_type as it1, movie_info as mi1, cast_info as ci, role_type as rt, name as n, movie_keyword as mk, keyword as k, movie_companies as mc, company_type as ct, company_name as cn WHERE t.id = ci.movie_id AND t.id = mc.movie_id AND t.id = mi1.movie_id AND t.id = mk.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND k.id = mk.keyword_id AND mi1.info_type_id = it1.id AND t.kind_id = kt.id AND ci.person_id = n.id AND ci.role_id = rt.id AND (it1.id IN ('7')) AND (mi1.info in ('MET:','OFM:35 mm','PCS:Digital Intermediate','PFM:35 mm','PFM:Video','RAT:1.33 : 1','RAT:1.37 : 1')) AND (kt.kind in ('episode','movie','tv movie')) AND (rt.role in ('actor','actress')) AND (n.gender in ('f','m') OR n.gender IS NULL) AND (n.name_pcode_cf in ('A5362','J5252','R1632','R2632','W4525')) AND (t.production_year <= 2015) AND (t.production_year >= 1925) AND (cn.name in ('Fox Network','Independent Television (ITV)','Metro-Goldwyn-Mayer (MGM)','National Broadcasting Company (NBC)','Paramount Pictures','Shout! Factory','Sony Pictures Home Entertainment','Universal Pictures','Universal TV')) AND (ct.kind in ('distributors','production companies')); 85 | ``` 86 | 87 | This should result in something like this: 88 | 89 | ``` 90 | imdb=# EXPLAIN SELECT COUNT(*) FROM title as t, kind_type as kt, info_type as it1, movie_info as mi1, cast_info as ci, role_type as rt, name as n, movie_keyword as mk, keyword as k, movie_companies as mc, company_type as ct, company_name as cn WHERE t.id = ci.movie_id AND t.id = mc.movie_id AND t.id = mi1.movie_id AND t.id = mk.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND k.id = mk.keyword_id AND mi1.info_type_id = it1.id AND t.kind_id = kt.id AND ci.person_id = n.id AND ci.role_id = rt.id AND (it1.id IN ('7')) AND (mi1.info in ('MET:','OFM:35 mm','PCS:Digital Intermediate','PFM:35 mm','PFM:Video','RAT:1.33 : 1','RAT:1.37 : 1')) AND (kt.kind in ('episode','movie','tv movie')) AND (rt.role in ('actor','actress')) AND (n.gender in ('f','m') OR n.gender IS NULL) AND (n.name_pcode_cf in ('A5362','J5252','R1632','R2632','W4525')) AND (t.production_year <= 2015) AND (t.production_year >= 1925) AND (cn.name in ('Fox Network','Independent Television (ITV)','Metro-Goldwyn-Mayer (MGM)','National Broadcasting Company (NBC)','Paramount Pictures','Shout! Factory','Sony Pictures Home Entertainment','Universal Pictures','Universal TV')) AND (ct.kind in ('distributors','production companies')); 91 | 92 | 93 | QUERY PLAN 94 | ---------------------------------------------------------------------- 95 | ------------------------------------------------------------------------------------------------------------------ 96 | Bao prediction: 74053.945 ms 97 | Bao recommended hint: SET enable_nestloop TO off; 98 | Aggregate (cost=9644.44..9644.45 rows=1 width=8) 99 | -> Gather (cost=1006.53..9644.44 rows=1 width=0) 100 | Workers Planned: 1 101 | -> Nested Loop (cost=6.53..8644.34 rows=1 width=0) 102 | -> Hash Join (cost=6.10..8643.30 rows=1 width=4) 103 | ... 104 | ``` 105 | 106 | Here, Bao thinks the query plan generated by PostgreSQL will take a little over a minute to execute. Bao recommends a hint to disable nested loop joins for this query. 
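Note that the recommended hint is an ordinary session-level setting: once applied with `SET`, it stays in effect for every subsequent query in the session, not just this one, until you reset it or disconnect:

```
imdb=# RESET enable_nestloop;
RESET
```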
Let's take the hint, and re-run the EXPLAIN: 107 | 108 | ``` 109 | imdb=# SET enable_nestloop TO off; 110 | SET 111 | imdb=# EXPLAIN SELECT COUNT(*) FROM title as t, kind_type as kt, info_type as it1, movie_info as mi1, cast_info as ci, role_type as rt, name as n, movie_keyword as mk, keyword as k, movie_companies as mc, company_type as ct, company_name as cn WHERE t.id = ci.movie_id AND t.id = mc.movie_id AND t.id = mi1.movie_id AND t.id = mk.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND k.id = mk.keyword_id AND mi1.info_type_id = it1.id AND t.kind_id = kt.id AND ci.person_id = n.id AND ci.role_id = rt.id AND (it1.id IN ('7')) AND (mi1.info in ('MET:','OFM:35 mm','PCS:Digital Intermediate','PFM:35 mm','PFM:Video','RAT:1.33 : 1','RAT:1.37 : 1')) AND (kt.kind in ('episode','movie','tv movie')) AND (rt.role in ('actor','actress')) AND (n.gender in ('f','m') OR n.gender IS NULL) AND (n.name_pcode_cf in ('A5362','J5252','R1632','R2632','W4525')) AND (t.production_year <= 2015) AND (t.production_year >= 1925) AND (cn.name in ('Fox Network','Independent Television (ITV)','Metro-Goldwyn-Mayer (MGM)','National Broadcasting Company (NBC)','Paramount Pictures','Shout! Factory','Sony Pictures Home Entertainment','Universal Pictures','Universal TV')) AND (ct.kind in ('distributors','production companies')); 112 | 113 | 114 | QUERY PLAN 115 | 116 | ---------------------------------------------------------------------------------- 117 | Bao prediction: 15032.299 ms 118 | Bao recommended hint: SET enable_nestloop TO off; 119 | Aggregate (cost=10000977592.82..10000977592.83 rows=1 width=8) 120 | -> Nested Loop (cost=10000683246.19..10000977592.82 rows=1 width=0) 121 | -> Seq Scan on info_type it1 (cost=0.00..2.41 rows=1 width=4) 122 | Filter: (id = 7) 123 | ... 124 | ``` 125 | 126 | For the new query plan, Bao predicts a time of only 15 seconds. If you'd like, you can execute both query plans to measure the quality of Bao's predictions. 127 | 128 | Keep in mind that using Bao as we've configured it here won't let the value model get any better. In order to improve its model, Bao needs experience. For any given session, you can allow Bao to collect experience and thus improve its model, but still prevent Bao from modifying any query plans. To do this, just set `enable_bao` and `enable_bao_rewards` to `ON`, but set `enable_bao_selection` to `OFF`. 
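If you would rather collect these predictions programmatically than through `psql`, the same session settings work from any PostgreSQL client. The following is a minimal sketch (illustrative, not part of the prototype) using `psycopg2`, assuming a local `imdb` database like the one in this tutorial; it runs an `EXPLAIN` in pure-advisor mode and parses out the prediction line:

```python
import psycopg2

def bao_predicted_latency_ms(sql, dsn="user=imdb host=localhost"):
    """Return Bao's predicted latency (ms) for the plan PostgreSQL would choose."""
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            # pure advisor mode: observe, but never select plans or log rewards
            cur.execute("SET enable_bao TO on")
            cur.execute("SET enable_bao_selection TO off")
            cur.execute("SET enable_bao_rewards TO off")
            cur.execute("EXPLAIN " + sql)
            for (line,) in cur.fetchall():
                # the extension prepends a line like "Bao prediction: 894.349 ms"
                if line.strip().startswith("Bao prediction:"):
                    return float(line.split(":", 1)[1].strip().split()[0])
    return None

print(bao_predicted_latency_ms("SELECT count(*) FROM title"))
```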
129 | -------------------------------------------------------------------------------- /analyze_bao.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "import pandas as pd\n", 11 | "import numpy as np\n", 12 | "import matplotlib\n", 13 | "from matplotlib import pyplot as plt\n", 14 | "import string\n", 15 | "from collections import defaultdict" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "plt.rcParams[\"font.size\"] = 16\n", 25 | "SHOW_RG = False" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "with open(\"pg_run.txt\") as f:\n", 35 | " data = f.read().split(\"\\n\")[2:]\n", 36 | "data = [x.split(\" \") for x in data if len(x) > 1 and (x[0] in string.digits or x[0] == \"x\")]\n", 37 | "\n", 38 | "data = [(x[0], x[1], float(x[2]), x[3], float(x[4])) for x in data]\n", 39 | "pg_data = data\n", 40 | "pg_times = np.array([x[2] for x in pg_data])\n", 41 | "pg_times -= np.min(pg_times)\n", 42 | "pg_times /= 60\n", 43 | "\n", 44 | "\n", 45 | "def read_bao_data(fp):\n", 46 | " with open(fp) as f:\n", 47 | " data = f.read().split(\"\\n\")[2:]\n", 48 | "\n", 49 | " training_times = []\n", 50 | " for idx in range(len(data)):\n", 51 | " if data[idx].strip().startswith(\"Initial input channels\"):\n", 52 | " prev_line = data[idx-1].split(\" \")\n", 53 | " if prev_line[0] == \"Retry\":\n", 54 | " continue\n", 55 | " training_times.append(float(prev_line[2]))\n", 56 | "\n", 57 | "\n", 58 | " training_times = np.array(training_times)\n", 59 | "\n", 60 | " data = [x.split(\" \") for x in data if len(x) > 1 and (x[0] in string.digits or x[0] == \"x\")]\n", 61 | " data = [(x[0], x[1], float(x[2]), x[3], float(x[4])) for x in data]\n", 62 | " bao_data = data\n", 63 | "\n", 64 | " bao_times = np.array([x[2] for x in bao_data])\n", 65 | " training_times -= np.min(bao_times)\n", 66 | " bao_times -= np.min(bao_times)\n", 67 | "\n", 68 | " bao_times /= 60\n", 69 | " training_times /= 60\n", 70 | " return bao_data, bao_times, training_times\n", 71 | "\n", 72 | "bao_data, bao_times, training_times = read_bao_data(\"bao_run.txt\")\n", 73 | "if SHOW_RG:\n", 74 | " bao_rb_data, bao_rb_times, training_rb_times = read_bao_data(\"bao_with_regblock.txt\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "queries_complete = np.arange(0, len(pg_times))\n", 84 | "\n", 85 | "fig, ax = plt.subplots(1, 1, constrained_layout=True)\n", 86 | "\n", 87 | "\n", 88 | "train_y = []\n", 89 | "train_rb_y = []\n", 90 | "for tt in training_times:\n", 91 | " idx = np.searchsorted(bao_times, tt)\n", 92 | " train_y.append(idx)\n", 93 | " \n", 94 | "if SHOW_RG:\n", 95 | " for tt in training_rb_times:\n", 96 | " idx = np.searchsorted(bao_rb_times, tt)\n", 97 | " train_rb_y.append(idx)\n", 98 | " \n", 99 | "plt.scatter(training_times, train_y, s=45, color=\"red\", label=\"Training\")\n", 100 | "\n", 101 | "ax.plot(pg_times, queries_complete, label=\"PostgreSQL\", lw=3)\n", 102 | "ax.plot(bao_times, queries_complete, label=\"Bao\", lw=3)\n", 103 | "\n", 104 | "if SHOW_RG:\n", 105 | " plt.scatter(training_rb_times, train_rb_y, s=45, color=\"red\")\n", 106 | " 
ax.plot(bao_rb_times, queries_complete, label=\"Bao (w/ exploration)\", lw=3)\n", 107 | "\n", 108 | "ax.set_xlabel(\"Time (m)\")\n", 109 | "ax.set_ylabel(\"Queries complete\")\n", 110 | "ax.set_title(\"PostgreSQL vs Bao Optimizer\")\n", 111 | "\n", 112 | "ax.grid(linestyle=\"--\", linewidth=1)\n", 113 | "ax.legend()\n", 114 | "fig.savefig(\"queries_vs_time.svg\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "all_pg_times = sorted([x[4] for x in pg_data])\n", 124 | "all_bao_times = sorted([x[4] for x in bao_data])\n", 125 | "\n", 126 | "if SHOW_RG:\n", 127 | " all_bao_rb_times = sorted([x[4] for x in bao_rb_data])\n", 128 | "\n", 129 | "\n", 130 | "fig, axes = plt.subplots(1, 2, figsize=(10, 4), constrained_layout=True)\n", 131 | "\n", 132 | "ax = axes[0]\n", 133 | "ax.plot(np.linspace(0, 1, len(all_pg_times)), all_pg_times, lw=3, label=\"PostgreSQL\")\n", 134 | "ax.plot(np.linspace(0, 1, len(all_pg_times)), all_bao_times, lw=3, label=\"Bao\")\n", 135 | "\n", 136 | "if SHOW_RG:\n", 137 | " ax.plot(np.linspace(0, 1, len(all_pg_times)), all_bao_rb_times, lw=3, label=\"Bao (w/ exploration)\")\n", 138 | "\n", 139 | "ax.grid(linestyle=\"--\", linewidth=1)\n", 140 | "ax.set_xlabel(\"Proportion of Queries\")\n", 141 | "ax.set_ylabel(\"Max Latency (s)\")\n", 142 | "ax.set_title(\"Query Latency CDF\")\n", 143 | "ax.legend()\n", 144 | "#ax.set_yscale(\"log\")\n", 145 | "\n", 146 | "\n", 147 | "ax = axes[1]\n", 148 | "ax.plot(np.linspace(0, 1, len(all_pg_times)), all_pg_times, lw=3, label=\"PostgreSQL\")\n", 149 | "ax.plot(np.linspace(0, 1, len(all_pg_times)), all_bao_times, lw=3, label=\"Bao\")\n", 150 | "\n", 151 | "if SHOW_RG:\n", 152 | " ax.plot(np.linspace(0, 1, len(all_pg_times)), all_bao_rb_times, lw=3, label=\"Bao (w/ exploration)\")\n", 153 | "\n", 154 | "ax.grid(linestyle=\"--\", linewidth=1)\n", 155 | "ax.set_xlabel(\"Proportion of Queries\")\n", 156 | "ax.set_ylabel(\"Max Latency (s)\")\n", 157 | "ax.set_title(\"Query Latency CDF (log scale)\")\n", 158 | "ax.legend()\n", 159 | "ax.set_yscale(\"log\")\n", 160 | "fig.savefig(\"cdf.svg\")\n", 161 | "fig" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# get the last PG time for each query\n", 171 | "pg_query_time = {}\n", 172 | "for itm in pg_data:\n", 173 | " pg_query_time[itm[3]] = itm[4]\n", 174 | " \n", 175 | "# get each Bao time\n", 176 | "bao_query_times = defaultdict(list)\n", 177 | "for itm in bao_data[50:]:\n", 178 | " bao_query_times[itm[3]].append(itm[4])\n", 179 | " \n", 180 | "if SHOW_RG:\n", 181 | " # get each Bao time\n", 182 | " bao_rb_query_times = defaultdict(list)\n", 183 | " for itm in bao_rb_data[50:]:\n", 184 | " bao_rb_query_times[itm[3]].append(itm[4])\n", 185 | "\n", 186 | "max_repeats = max(len(x) for x in bao_query_times.values())\n", 187 | "\n", 188 | "def extract_q_number(x):\n", 189 | " return int(x[x.find(\"/q\")+2:x.find(\"_\", x.find(\"/q\"))])\n", 190 | "\n", 191 | "q_order = sorted(bao_query_times.keys(), key=extract_q_number)\n", 192 | "\n", 193 | "grid = [bao_query_times[x] for x in q_order]\n", 194 | "\n", 195 | "if SHOW_RG:\n", 196 | " grid_rb = [bao_rb_query_times[x] for x in q_order]\n", 197 | "\n", 198 | "\n", 199 | "reg_data = []\n", 200 | "for idx, q in enumerate(q_order):\n", 201 | " if SHOW_RG:\n", 202 | " reg_data.append({\"Q\": f\"q{extract_q_number(q)}\", \n", 203 | " \"PG\": 
pg_query_time[q],\n", 204 | "\"Bao worst\": max(grid[idx]),\n", 205 | "\"Bao best\": min(grid[idx]),\n", 206 | "\"Bao + E worst\": max(grid_rb[idx]),\n", 207 | "\"Bao + E best\": min(grid_rb[idx])})\n", 208 | " else:\n", 209 | " reg_data.append({\"Q\": f\"q{extract_q_number(q)}\", \n", 210 | " \"PG\": pg_query_time[q],\n", 211 | " \"Bao worst\": max(grid[idx]),\n", 212 | " \"Bao best\": min(grid[idx])})\n", 213 | " \n", 214 | "\n", 215 | "\n", 216 | "def color_regression(col):\n", 217 | " def c_for_diff(diff):\n", 218 | " if diff < 2 and diff > -2:\n", 219 | " return \"background-color: white\"\n", 220 | " elif diff > 0.5:\n", 221 | " return \"background-color: #f27281\"\n", 222 | " else:\n", 223 | " return \"background-color: #9ee3ad\"\n", 224 | " \n", 225 | " to_r = [\"\"]\n", 226 | " \n", 227 | " if SHOW_RG:\n", 228 | " pg, bao_worst, bao_best, bao_rg_worst, bao_rg_best = col\n", 229 | " else:\n", 230 | " pg, bao_worst, bao_best = col\n", 231 | "\n", 232 | " \n", 233 | " to_r.append(c_for_diff(bao_worst - pg))\n", 234 | " to_r.append(c_for_diff(bao_best - pg))\n", 235 | " \n", 236 | " if SHOW_RG:\n", 237 | " to_r.append(c_for_diff(bao_rg_worst - pg)) \n", 238 | " to_r.append(c_for_diff(bao_rg_best - pg))\n", 239 | "\n", 240 | " return to_r\n", 241 | "\n", 242 | "reg_data = pd.DataFrame(reg_data).set_index(\"Q\")\n", 243 | "reg_data.style.apply(color_regression, axis=1)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.8.3" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 4 275 | } 276 | -------------------------------------------------------------------------------- /bao_server/reg_blocker.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import psycopg2 4 | import json 5 | 6 | import storage 7 | from common import BaoException 8 | from config import read_config 9 | 10 | # Code to block models that would create query regressions on important queries. 11 | # The basic methodology is to allow the user to submit the SQL of important queries, 12 | # which we store in an internal database. When triggered, we execute different 13 | # plans for those queries, and record their performance (this is "exploration mode"). 14 | # When a new model is proposed, we can compute its maximum regression on the known 15 | # queries.
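#
# For example (hypothetical numbers): if a stored query ran in 10s under
# arm 0 but in 4s under arm 2, a model whose predictions select arm 0's
# plan regresses that query by 6s. compute_regressions (below) counts and
# sums such regressions; should_replace_model accepts a new model only if
# it has no regressions, or if its regression profile is no worse than the
# current model's.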
16 | 17 | _ALL_OPTIONS = [ 18 | "enable_nestloop", "enable_hashjoin", "enable_mergejoin", 19 | "enable_seqscan", "enable_indexscan", "enable_indexonlyscan" 20 | ] 21 | 22 | def _arm_idx_to_hints(arm_idx): 23 | hints = [] 24 | for option in _ALL_OPTIONS: 25 | hints.append(f"SET {option} TO off") 26 | 27 | if arm_idx == 0: 28 | for option in _ALL_OPTIONS: 29 | hints.append(f"SET {option} TO on") 30 | elif arm_idx == 1: 31 | hints.append("SET enable_hashjoin TO on") 32 | hints.append("SET enable_indexonlyscan TO on") 33 | hints.append("SET enable_indexscan TO on") 34 | hints.append("SET enable_mergejoin TO on") 35 | hints.append("SET enable_seqscan TO on") 36 | elif arm_idx == 2: 37 | hints.append("SET enable_hashjoin TO on") 38 | hints.append("SET enable_indexonlyscan TO on") 39 | hints.append("SET enable_nestloop TO on") 40 | hints.append("SET enable_seqscan TO on") 41 | elif arm_idx == 3: 42 | hints.append("SET enable_hashjoin TO on") 43 | hints.append("SET enable_indexonlyscan TO on") 44 | hints.append("SET enable_seqscan TO on") 45 | elif arm_idx == 4: 46 | hints.append("SET enable_hashjoin TO on") 47 | hints.append("SET enable_indexonlyscan TO on") 48 | hints.append("SET enable_indexscan TO on") 49 | hints.append("SET enable_nestloop TO on") 50 | hints.append("SET enable_seqscan TO on") 51 | else: 52 | raise BaoException("RegBlocker only supports the first 5 arms") 53 | return hints 54 | 55 | class ExperimentRunner: 56 | def __init__(self): 57 | config = read_config() 58 | self.__pg_connect_str = config["PostgreSQLConnectString"] 59 | self.__max_query_time = int(config["MaxQueryTimeSeconds"]) * 1000 60 | 61 | def __get_pg_cursor(self): 62 | try: 63 | conn = psycopg2.connect(self.__pg_connect_str) 64 | return conn.cursor() 65 | except psycopg2.OperationalError as e: 66 | raise BaoException("Could not connect to PG database") from e 67 | 68 | def add_experimental_query(self, sql): 69 | sql = sql.strip() 70 | if not sql.upper().startswith("SELECT"): 71 | raise BaoException("Experiment queries must be SELECT queries.") 72 | 73 | # First, make sure this query parses and that we can connect to PG. 74 | with self.__get_pg_cursor() as cur: 75 | try: 76 | cur.execute(f"EXPLAIN {sql}") 77 | cur.fetchall() 78 | except psycopg2.errors.ProgrammingError as e: 79 | raise BaoException( 80 | "Could not generate EXPLAIN output for experimental query, " 81 | + "it will not be added.") from e 82 | 83 | # Add this as an experimental query. 
84 | storage.record_experimental_query(sql) 85 | 86 | def test_connection(self): 87 | with self.__get_pg_cursor() as cur: 88 | try: 89 | cur.execute("SELECT 1") 90 | cur.fetchall() 91 | except Exception as e: 92 | raise BaoException("Could not connect to the PostgreSQL database.") from e 93 | return True 94 | 95 | def status(self): 96 | to_r = {} 97 | 98 | to_r["Unexecuted experiments"] = len(storage.unexecuted_experiments()) 99 | to_r["Completed experiments"] = len(storage.experiment_experience()) 100 | to_r["Exploration queries"] = storage.num_experimental_queries() 101 | 102 | return to_r 103 | 104 | def explore(self, time_limit): 105 | start = time.time() 106 | unexecuted = storage.unexecuted_experiments() 107 | 108 | if not unexecuted: 109 | print("All experiments have been executed.") 110 | return 111 | 112 | print("We have", len(unexecuted), "unexecuted experiment(s).") 113 | random.shuffle(unexecuted) 114 | 115 | with self.__get_pg_cursor() as c: 116 | c.execute("SELECT pg_backend_pid()") 117 | pid = c.fetchall()[0][0] 118 | print("Running on backend PID", pid) 119 | c.execute("SET bao_include_json_in_explain TO on") 120 | c.execute("SET enable_bao TO on") 121 | c.execute("SET enable_bao_selection TO off") 122 | c.execute("SET enable_bao_rewards TO on") 123 | c.execute("commit") 124 | 125 | for experiment in unexecuted: 126 | experiment_id = experiment["id"] 127 | sql = experiment["query"] 128 | arm_idx = experiment["arm"] 129 | prev_id = storage.last_reward_from_pid(pid) 130 | 131 | time_remaining = round((time_limit - (time.time() - start)) * 1000.0) 132 | print("Time remaining:", time_remaining, "ms") 133 | if time_remaining < 0: 134 | break 135 | 136 | statement_timeout = min(self.__max_query_time, time_remaining) 137 | is_timeout_from_time_remaining = time_remaining < self.__max_query_time 138 | 139 | # set PG to timeout and to use the arm we want to test 140 | c.execute(f"SET statement_timeout TO {statement_timeout}") 141 | for stmt in _arm_idx_to_hints(arm_idx): 142 | c.execute(stmt) 143 | 144 | # get the Bao plan JSON so we can record a timeout if there is one 145 | c.execute("EXPLAIN (FORMAT JSON) " + sql) 146 | explain_json = c.fetchall()[0][0] 147 | 148 | bao_props, _qplan = explain_json 149 | 150 | bao_plan = json.loads(bao_props["Bao"]["Bao plan JSON"]) 151 | bao_buffer = json.loads(bao_props["Bao"]["Bao buffer JSON"]) 152 | 153 | try: 154 | c.execute(sql) 155 | c.fetchall() 156 | except psycopg2.errors.QueryCanceled as e: 157 | assert "timeout" in str(e) 158 | if is_timeout_from_time_remaining: 159 | print("Hit experimental timeout, stopping.") 160 | break 161 | 162 | # otherwise, the timeout was because we went past the 163 | # reasonable query limit. We should record that experience. 164 | print("Query hit timeout, recording 2*timeout as the reward.") 165 | bao_plan["Buffers"] = bao_buffer 166 | storage.record_reward(bao_plan, 2 * self.__max_query_time, 167 | pid) 168 | c.execute("rollback") 169 | except psycopg2.OperationalError as e: 170 | # this query caused the server to go down! give it a 171 | # bit to restart, then try again.
172 | print("Server down after experiment with arm", arm_idx) 173 | if arm_idx != 0: 174 | print("Treating this as a timeout and ceasing further experiments.") 175 | bao_plan["Buffers"] = bao_buffer 176 | storage.record_reward(bao_plan, 2 * self.__max_query_time, 177 | pid) 178 | raise BaoException(f"Server down after experiment with arm {arm_idx}") from e 179 | 180 | retries_remaining = 5 181 | while (last_id := storage.last_reward_from_pid(pid)) == prev_id: 182 | # wait a second to make sure the reward is flushed to the DB 183 | time.sleep(1) 184 | retries_remaining -= 1 185 | if retries_remaining <= 0: 186 | raise BaoException( 187 | "Reward for experiment did not appear after 5 seconds, " 188 | + "is the Bao server running?") 189 | 190 | # last_id is the ID of the experience for this experiment 191 | storage.record_experiment(experiment_id, last_id, arm_idx) 192 | print("Finished all experiments") 193 | 194 | 195 | def compute_regressions(bao_reg): 196 | total_regressed = 0 197 | total_regression = 0 198 | for plan_group in storage.experiment_results(): 199 | plan_group = list(plan_group) 200 | plans = [x["plan"] for x in plan_group] 201 | best_latency = min(plan_group, key=lambda x: x["reward"])["reward"] 202 | 203 | if bao_reg: 204 | selection = bao_reg.predict(plans).argmin() 205 | else: 206 | # If bao_reg is false-y, compare against PostgreSQL. 207 | selection = 0 208 | 209 | selected_plan_latency = plan_group[selection]["reward"] 210 | 211 | # Check to see if the regression is more than 1%. 212 | if selected_plan_latency > best_latency * 1.01: 213 | total_regressed += 1 214 | 215 | total_regression += selected_plan_latency - best_latency 216 | 217 | return (total_regressed, total_regression) 218 | 219 | 220 | def should_replace_model(old_model, new_model): 221 | # Check the trained model for regressions on experimental queries. 222 | new_num_reg, new_reg_amnt = compute_regressions(new_model) 223 | cur_num_reg, cur_reg_amnt = compute_regressions(old_model) 224 | 225 | print("Old model # regressions:", cur_num_reg, 226 | "regression amount:", cur_reg_amnt) 227 | print("New model # regressions:", new_num_reg, 228 | "regression amount:", new_reg_amnt) 229 | 230 | # If our new model has no regressions, always accept it. 231 | # Otherwise, see if our regression profile is strictly better than 232 | # the previous model. 233 | if new_num_reg == 0: 234 | print("New model had no regressions.") 235 | return True 236 | elif cur_num_reg >= new_num_reg and cur_reg_amnt >= new_reg_amnt: 237 | print("New model with better regression profile", 238 | "than the old model.") 239 | return True 240 | else: 241 | print("New model did not have a better regression profile.") 242 | return False 243 | 244 | 245 | if __name__ == "__main__": 246 | import model 247 | 248 | # Add Q1 and Q2 as sample queries. 
249 | tmp = ExperimentRunner() 250 | 251 | try: 252 | with open("../sample_queries/q1_8a463.sql") as f: 253 | sql = f.read() 254 | tmp.add_experimental_query(sql) 255 | 256 | with open("../sample_queries/q2_8a82.sql") as f: 257 | sql = f.read() 258 | tmp.add_experimental_query(sql) 259 | except BaoException: 260 | pass # already have them 261 | 262 | tmp.explore(5 * 60) 263 | 264 | new_model = model.BaoRegression(have_cache_data=True) 265 | new_model.load("bao_default_model") 266 | print(compute_regressions(new_model)) 267 | print(compute_regressions(None)) 268 | -------------------------------------------------------------------------------- /pg_extension/main.c: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> 2 | #include <math.h> 3 | #include <time.h> 4 | #include <unistd.h> 5 | #include <sys/socket.h> 6 | 7 | #include "bao_configs.h" 8 | #include "bao_util.h" 9 | #include "bao_bufferstate.h" 10 | #include "bao_planner.h" 11 | #include "postgres.h" 12 | #include "fmgr.h" 13 | #include "parser/parsetree.h" 14 | #include "executor/executor.h" 15 | #include "optimizer/planner.h" 16 | #include "utils/guc.h" 17 | #include "commands/explain.h" 18 | #include "tcop/tcopprot.h" 19 | 20 | 21 | 22 | PG_MODULE_MAGIC; 23 | void _PG_init(void); 24 | void _PG_fini(void); 25 | 26 | 27 | 28 | 29 | 30 | // Bao works by integrating with PostgreSQL's hook functionality. 31 | // 1) The bao_planner hook intercepts a query before the PG optimizer handles 32 | // it, and communicates with the Bao server. 33 | // 2) The bao_ExecutorStart hook sets up time recording for the given query. 34 | 35 | // 3) The bao_ExecutorEnd hook gets the query timing and sends the reward 36 | // for the query back to the Bao server. 37 | // 4) The bao_ExplainOneQuery hook adds the Bao suggested hint and the reward 38 | // prediction to the EXPLAIN output of a query. 39 | static PlannedStmt* bao_planner(Query *parse, 40 | int cursorOptions, ParamListInfo boundParams); 41 | static void bao_ExecutorStart(QueryDesc *queryDesc, int eflags); 42 | static void bao_ExecutorEnd(QueryDesc *queryDesc); 43 | static void bao_ExplainOneQuery(Query* query, int cursorOptions, IntoClause* into, 44 | ExplainState* es, const char* queryString, 45 | ParamListInfo params, QueryEnvironment *queryEnv); 46 | 47 | static planner_hook_type prev_planner_hook = NULL; 48 | static ExecutorStart_hook_type prev_ExecutorStart = NULL; 49 | static ExecutorEnd_hook_type prev_ExecutorEnd = NULL; 50 | static ExplainOneQuery_hook_type prev_ExplainOneQuery = NULL; 51 | 52 | void _PG_init(void) { 53 | // install each Bao hook 54 | prev_ExecutorStart = ExecutorStart_hook; 55 | ExecutorStart_hook = bao_ExecutorStart; 56 | 57 | prev_ExecutorEnd = ExecutorEnd_hook; 58 | ExecutorEnd_hook = bao_ExecutorEnd; 59 | 60 | prev_planner_hook = planner_hook; 61 | planner_hook = bao_planner; 62 | 63 | prev_ExplainOneQuery = ExplainOneQuery_hook; 64 | ExplainOneQuery_hook = bao_ExplainOneQuery; 65 | 66 | // define Bao user-visible variables 67 | DefineCustomBoolVariable( 68 | "enable_bao", 69 | "Enable the Bao optimizer", 70 | "Enables the Bao optimizer. When enabled, the variables enable_bao_rewards" 71 | " and enable_bao_selection can be used to control whether or not Bao records" 72 | " query latency or selects query plans.", 73 | &enable_bao, 74 | false, 75 | PGC_USERSET, 76 | 0, 77 | NULL, NULL, NULL); 78 | 79 | DefineCustomBoolVariable( 80 | "enable_bao_rewards", 81 | "Send reward info to Bao", 82 | "Enables reward collection.
When enabled, and when enable_bao is true, query latencies" 83 | " are sent to the Bao server after execution.", 84 | &enable_bao_rewards, 85 | true, 86 | PGC_USERSET, 87 | 0, 88 | NULL, NULL, NULL); 89 | 90 | DefineCustomBoolVariable( 91 | "enable_bao_selection", 92 | "Use Bao to select query plans", 93 | "Enables Bao query plan selection. When enabled, and when enable_bao is true, Bao" 94 | " will choose a query plan according to its learned model.", 95 | &enable_bao_selection, 96 | true, 97 | PGC_USERSET, 98 | 0, 99 | NULL, NULL, NULL); 100 | 101 | DefineCustomStringVariable( 102 | "bao_host", 103 | "Bao server host", NULL, 104 | &bao_host, 105 | "localhost", 106 | PGC_USERSET, 107 | 0, 108 | NULL, NULL, NULL); 109 | 110 | DefineCustomIntVariable( 111 | "bao_port", 112 | "Bao server port", NULL, 113 | &bao_port, 114 | 9381, 1, 65536, 115 | PGC_USERSET, 116 | 0, 117 | NULL, NULL, NULL); 118 | 119 | DefineCustomIntVariable( 120 | "bao_num_arms", 121 | "Number of arms to consider", 122 | "The number of arms to consider for each query plan. Each arm represents " 123 | "a planner configuration. Higher values give better plans, but higher " 124 | "optimization times. The standard planner is always considered.", 125 | &bao_num_arms, 126 | 5, 1, BAO_MAX_ARMS, 127 | PGC_USERSET, 128 | 0, 129 | NULL, NULL, NULL); 130 | 131 | DefineCustomBoolVariable( 132 | "bao_include_json_in_explain", 133 | "Includes Bao's JSON representation in EXPLAIN output.", 134 | "Includes Bao's JSON representation of a query plan in the " 135 | "output of EXPLAIN commands. Used by the Bao server.", 136 | &bao_include_json_in_explain, 137 | false, 138 | PGC_USERSET, 139 | 0, 140 | NULL, NULL, NULL); 141 | } 142 | 143 | 144 | void _PG_fini(void) { 145 | elog(LOG, "finished extension"); 146 | } 147 | 148 | static PlannedStmt* bao_planner(Query *parse, 149 | int cursorOptions, 150 | ParamListInfo boundParams) { 151 | // Bao planner. This is where we select a query plan. 152 | 153 | // The plan returned by the Bao planner, containing the PG plan, 154 | // the JSON query plan (for reward tracking), and the arm selected. 155 | BaoPlan* plan; 156 | 157 | // For timing Bao's overhead. 158 | clock_t t_start, t_final; 159 | double plan_time_ms; 160 | 161 | // Final PG plan to execute. 162 | PlannedStmt* to_return; 163 | 164 | if (prev_planner_hook) { 165 | elog(WARNING, "Skipping Bao hook, another planner hook is installed."); 166 | return prev_planner_hook(parse, cursorOptions, 167 | boundParams); 168 | } 169 | 170 | // Skip optimizing this query if it is not a SELECT statement (checked by 171 | // `should_bao_optimize`), or if Bao is not enabled. We do not check 172 | // enable_bao_selection here, because if enable_bao is on, we still need 173 | // to attach a query plan to the query to record the reward later. 174 | if (!should_bao_optimize(parse) || !enable_bao) { 175 | return standard_planner(parse, cursorOptions, 176 | boundParams); 177 | } 178 | 179 | 180 | t_start = clock(); 181 | 182 | // Call Bao query planning routine (in `bao_planner.h`). 183 | plan = plan_query(parse, cursorOptions, boundParams); 184 | 185 | if (plan == NULL) { 186 | // something went wrong, default to the PG plan. 187 | return standard_planner(parse, cursorOptions, boundParams); 188 | } 189 | 190 | // We need some way to associate this query with the BaoQueryInfo data. 191 | // Hack: connect the Bao plan info to this plan via the queryId field. 
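// (bao_ExecutorEnd undoes this trick: it casts queryId back to a
// BaoQueryInfo pointer and then zeroes the field. This assumes nothing
// else interprets queryId while the query is in flight.)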
192 | to_return = plan->plan; 193 | to_return->queryId = (uint64_t)(void*) plan->query_info; 194 | plan->query_info = NULL; 195 | 196 | t_final = clock(); 197 | plan_time_ms = ((double)(t_final - t_start) 198 | / (double)CLOCKS_PER_SEC) * (double)1000.0; 199 | 200 | elog(LOG, "Bao planning selected arm %d in %f CPU ms.", 201 | plan->selection, plan_time_ms); 202 | 203 | // Free the BaoPlan* object now that we have gotten the BaoQueryInfo 204 | // and after we have gotten the PG plan out of it. 205 | free_bao_plan(plan); 206 | 207 | return to_return; 208 | } 209 | 210 | 211 | static void bao_ExecutorStart(QueryDesc *queryDesc, int eflags) { 212 | // Code from pg_stat_statements. If needed, set up query timing 213 | // to use as Bao's reward signal. 214 | 215 | if (prev_ExecutorStart) 216 | prev_ExecutorStart(queryDesc, eflags); 217 | else 218 | standard_ExecutorStart(queryDesc, eflags); 219 | 220 | if (enable_bao_rewards 221 | && queryDesc->plannedstmt->queryId != 0) { 222 | if (queryDesc->totaltime == NULL) { 223 | MemoryContext oldcxt; 224 | 225 | oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt); 226 | queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER); 227 | MemoryContextSwitchTo(oldcxt); 228 | } 229 | } 230 | 231 | } 232 | 233 | static void bao_ExecutorEnd(QueryDesc *queryDesc) { 234 | // A query has finished. We need to check if it was a query Bao could optimize, 235 | // and if so, report the reward to the Bao server. 236 | 237 | BaoQueryInfo* bao_query_info; 238 | char* r_json; 239 | int conn_fd; 240 | 241 | if (enable_bao_rewards && should_report_reward(queryDesc)) { 242 | // We are tracking rewards for queries, and this query was 243 | // eligible for optimization by Bao. 244 | conn_fd = connect_to_bao(bao_host, bao_port); 245 | if (conn_fd < 0) { 246 | elog(WARNING, "Unable to connect to Bao server, reward for query will be dropped."); 247 | return; 248 | } 249 | 250 | if (!queryDesc->totaltime) { 251 | elog(WARNING, "Bao could not read instrumentation result, reward for query will be dropped."); 252 | return; 253 | } 254 | 255 | // Finalize the instrumentation so we can read the final time. 256 | InstrEndLoop(queryDesc->totaltime); 257 | 258 | // Generate a JSON blob with our reward. 259 | r_json = reward_json(queryDesc->totaltime->total * 1000.0); 260 | 261 | // Extract the BaoQueryInfo, which we hid inside the queryId of the 262 | // PlannedStmt. `should_report_reward` ensures it is set. 263 | bao_query_info = (BaoQueryInfo*)(void*)queryDesc->plannedstmt->queryId; 264 | queryDesc->plannedstmt->queryId = 0; 265 | 266 | // Write out the query plan, buffer information, and reward to the Bao 267 | // server.
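// (The feedback message framing is: START_FEEDBACK_MESSAGE, the plan JSON,
// the buffer JSON, the reward JSON, then TERMINAL_MESSAGE; the query and
// prediction messages in this extension use the same start/terminal pattern.)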
268 | write_all_to_socket(conn_fd, START_FEEDBACK_MESSAGE); 269 | write_all_to_socket(conn_fd, bao_query_info->plan_json); 270 | write_all_to_socket(conn_fd, bao_query_info->buffer_json); 271 | write_all_to_socket(conn_fd, r_json); 272 | write_all_to_socket(conn_fd, TERMINAL_MESSAGE); 273 | shutdown(conn_fd, SHUT_RDWR); 274 | 275 | free_bao_query_info(bao_query_info); 276 | } 277 | 278 | if (prev_ExecutorEnd) { 279 | prev_ExecutorEnd(queryDesc); 280 | } else { 281 | standard_ExecutorEnd(queryDesc); 282 | } 283 | } 284 | 285 | static void bao_ExplainOneQuery(Query* query, int cursorOptions, IntoClause* into, 286 | ExplainState* es, const char* queryString, 287 | ParamListInfo params, QueryEnvironment* queryEnv) { 288 | 289 | PlannedStmt* plan; 290 | BaoPlan* bao_plan; 291 | instr_time plan_start, plan_duration; 292 | int conn_fd; 293 | char* buffer_json; 294 | char* plan_json; 295 | double prediction; 296 | char* hint_text; 297 | bool old_selection_val; 298 | bool connected = false; 299 | 300 | 301 | // If there are no other EXPLAIN hooks, add to the EXPLAIN output Bao's estimate 302 | // of this query plan's execution time, as well as what hints would be used 303 | // by Bao. 304 | 305 | // TODO: right now we add to the start of the EXPLAIN output, because I cannot 306 | // figure out how to add to the end of it. 307 | 308 | if (prev_ExplainOneQuery) { 309 | prev_ExplainOneQuery(query, cursorOptions, into, es, 310 | queryString, params, queryEnv); 311 | } 312 | 313 | // There should really be a standard_ExplainOneQuery, but there 314 | // isn't, so we will do our best. We will replicate some PG code 315 | // here as a consequence. 316 | 317 | INSTR_TIME_SET_CURRENT(plan_start); 318 | plan = (planner_hook ? planner_hook(query, cursorOptions, params) 319 | : standard_planner(query, cursorOptions, params)); 320 | INSTR_TIME_SET_CURRENT(plan_duration); 321 | INSTR_TIME_SUBTRACT(plan_duration, plan_start); 322 | 323 | if (!enable_bao) { 324 | // Bao is disabled, do the default explain thing. 325 | ExplainOnePlan(plan, into, es, queryString, 326 | params, queryEnv, &plan_duration); 327 | return; 328 | } 329 | 330 | buffer_json = buffer_state(); 331 | plan_json = plan_to_json(plan); 332 | 333 | // Ask the Bao server for an estimate for this plan. 334 | conn_fd = connect_to_bao(bao_host, bao_port); 335 | if (conn_fd < 0) { 336 | elog(WARNING, "Unable to connect to Bao server, no prediction provided."); 337 | prediction = NAN; 338 | } else { 339 | write_all_to_socket(conn_fd, START_PREDICTION_MESSAGE); 340 | write_all_to_socket(conn_fd, plan_json); 341 | write_all_to_socket(conn_fd, buffer_json); 342 | write_all_to_socket(conn_fd, TERMINAL_MESSAGE); 343 | shutdown(conn_fd, SHUT_WR); 344 | 345 | // Read the response from the Bao server. 346 | if (read(conn_fd, &prediction, sizeof(double)) != sizeof(double)) { 347 | elog(WARNING, "Bao could not read the response from the server during EXPLAIN."); 348 | prediction = NAN; 349 | } 350 | 351 | connected = true; 352 | shutdown(conn_fd, SHUT_RDWR); 353 | } 354 | 355 | // Open a new explain group called "Bao" and add our prediction into it. 356 | ExplainOpenGroup("BaoProps", NULL, true, es); 357 | ExplainOpenGroup("Bao", "Bao", true, es); 358 | 359 | if (connected) { 360 | // The Bao server will (correctly) give a NaN if no model is available, 361 | // but PostgreSQL will dump that NaN into the raw JSON, causing parse bugs.
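// (Reporting the prediction as the text "NaN" rather than as a float keeps
// the JSON-format EXPLAIN output parseable.)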
362 | if (isnan(prediction)) 363 | ExplainPropertyText("Bao prediction", "NaN", es); 364 | else 365 | ExplainPropertyFloat("Bao prediction", "ms", prediction, 3, es); 366 | } 367 | 368 | if (bao_include_json_in_explain) { 369 | ExplainPropertyText("Bao plan JSON", plan_json, es); 370 | ExplainPropertyText("Bao buffer JSON", buffer_json, es); 371 | } 372 | 373 | free(plan_json); 374 | free(buffer_json); 375 | 376 | // Next, plan the query so that we can suggest a hint. If enable_bao_selection 377 | // was on, this repeats some work, as the query will be planned twice. That's OK 378 | // since EXPLAIN should still be fast. 379 | old_selection_val = enable_bao_selection; 380 | enable_bao_selection = true; 381 | bao_plan = plan_query(query, cursorOptions, params); 382 | enable_bao_selection = old_selection_val; 383 | 384 | if (!bao_plan) { 385 | elog(WARNING, "Could not plan query with Bao during explain, omitting hint."); 386 | } else { 387 | hint_text = arm_to_hint(bao_plan->selection); 388 | ExplainPropertyText("Bao recommended hint", 389 | (hint_text ? hint_text : "(no hint)"), 390 | es); 391 | free(hint_text); 392 | free_bao_plan(bao_plan); 393 | } 394 | 395 | ExplainCloseGroup("Bao", "Bao", true, es); 396 | ExplainCloseGroup("BaoProps", NULL, true, es); 397 | 398 | // Do the default explain thing. 399 | ExplainOnePlan(plan, into, es, queryString, 400 | params, queryEnv, &plan_duration); 401 | } 402 | -------------------------------------------------------------------------------- /pg_extension/bao_planner.h: -------------------------------------------------------------------------------- 1 | #ifndef BAO_PLANNER_H 2 | #define BAO_PLANNER_H 3 | 4 | #include <stdio.h> 5 | #include "bao_configs.h" 6 | #include "bao_util.h" 7 | #include "bao_bufferstate.h" 8 | 9 | 10 | // Functions to help with Bao query planning. 11 | 12 | 13 | // This macro can be used to wrap a code block to save and restore the current 14 | // values of the plan hints. 15 | #define save_arm_options(x) { \ 16 | bool hj = enable_hashjoin;\ 17 | bool mj = enable_mergejoin;\ 18 | bool nl = enable_nestloop;\ 19 | bool is = enable_indexscan;\ 20 | bool ss = enable_seqscan;\ 21 | bool io = enable_indexonlyscan;\ 22 | { x } \ 23 | enable_hashjoin = hj;\ 24 | enable_mergejoin = mj;\ 25 | enable_nestloop = nl;\ 26 | enable_indexscan = is;\ 27 | enable_seqscan = ss;\ 28 | enable_indexonlyscan = io; } 29 | 30 | 31 | 32 | 33 | // Connect to a Bao server, construct plans for each arm, have the server 34 | // select a plan. Has the same signature as the PG optimizer. 35 | BaoPlan *plan_query(Query *parse, int cursorOptions, ParamListInfo boundParams); 36 | 37 | // Translate an arm index into SQL statements to give the hint (used for EXPLAIN). 38 | char* arm_to_hint(int arm); 39 | 40 | 41 | 42 | // Set the planner hint options to the correct values for the passed-in arm 43 | // index. Should be called with the `save_arm_options` macro so we don't 44 | // blast away the user's config.
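//
// For example, plan_arm (below) uses it like this:
//
// save_arm_options({
//     set_arm_options(arm);
//     plan = standard_planner(query_copy, cursorOptions, boundParams);
// });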
45 | static void set_arm_options(int arm) { 46 | enable_hashjoin = false; 47 | enable_mergejoin = false; 48 | enable_nestloop = false; 49 | enable_indexscan = false; 50 | enable_seqscan = false; 51 | enable_indexonlyscan = false; 52 | 53 | switch (arm) { 54 | case 0: 55 | enable_hashjoin = true; 56 | enable_indexscan = true; 57 | enable_mergejoin = true; 58 | enable_nestloop = true; 59 | enable_seqscan = true; 60 | enable_indexonlyscan = true; 61 | break; 62 | 63 | case 1: 64 | enable_hashjoin = true; 65 | enable_indexonlyscan = true; 66 | enable_indexscan = true; 67 | enable_mergejoin = true; 68 | enable_seqscan = true; 69 | break; 70 | case 2: 71 | enable_hashjoin = true; 72 | enable_indexonlyscan = true; 73 | enable_nestloop = true; 74 | enable_seqscan = true; 75 | break; 76 | case 3: 77 | enable_hashjoin = true; 78 | enable_indexonlyscan = true; 79 | enable_seqscan = true; 80 | break; 81 | case 4: 82 | enable_hashjoin = true; 83 | enable_indexonlyscan = true; 84 | enable_indexscan = true; 85 | enable_nestloop = true; 86 | enable_seqscan = true; 87 | break; 88 | case 5: 89 | enable_hashjoin = true; 90 | enable_indexonlyscan = true; 91 | enable_mergejoin = true; 92 | enable_nestloop = true; 93 | break; 94 | case 6: 95 | enable_hashjoin = true; 96 | enable_indexscan = true; 97 | enable_mergejoin = true; 98 | enable_nestloop = true; 99 | break; 100 | case 7: 101 | enable_indexonlyscan = true; 102 | enable_mergejoin = true; 103 | enable_nestloop = true; 104 | break; 105 | case 8: 106 | enable_hashjoin = true; 107 | enable_indexonlyscan = true; 108 | break; 109 | case 9: 110 | enable_hashjoin = true; 111 | enable_indexonlyscan = true; 112 | enable_indexscan = true; 113 | enable_nestloop = true; 114 | break; 115 | case 10: 116 | enable_hashjoin = true; 117 | enable_indexonlyscan = true; 118 | enable_indexscan = true; 119 | enable_seqscan = true; 120 | break; 121 | case 11: 122 | enable_hashjoin = true; 123 | enable_indexonlyscan = true; 124 | enable_mergejoin = true; 125 | enable_nestloop = true; 126 | enable_seqscan = true; 127 | break; 128 | case 12: 129 | enable_hashjoin = true; 130 | enable_indexonlyscan = true; 131 | enable_mergejoin = true; 132 | enable_seqscan = true; 133 | break; 134 | case 13: 135 | enable_hashjoin = true; 136 | enable_indexscan = true; 137 | enable_nestloop = true; 138 | break; 139 | case 14: 140 | enable_indexscan = true; 141 | enable_nestloop = true; 142 | break; 143 | case 15: 144 | enable_indexscan = true; 145 | enable_mergejoin = true; 146 | enable_nestloop = true; 147 | enable_seqscan = true; 148 | break; 149 | case 16: 150 | enable_indexonlyscan = true; 151 | enable_indexscan = true; 152 | enable_nestloop = true; 153 | break; 154 | case 17: 155 | enable_hashjoin = true; 156 | enable_indexonlyscan = true; 157 | enable_indexscan = true; 158 | enable_mergejoin = true; 159 | enable_nestloop = true; 160 | break; 161 | case 18: 162 | enable_indexscan = true; 163 | enable_mergejoin = true; 164 | enable_nestloop = true; 165 | break; 166 | case 19: 167 | enable_indexonlyscan = true; 168 | enable_mergejoin = true; 169 | enable_nestloop = true; 170 | enable_seqscan = true; 171 | break; 172 | case 20: 173 | enable_indexonlyscan = true; 174 | enable_indexscan = true; 175 | enable_nestloop = true; 176 | enable_seqscan = true; 177 | break; 178 | case 21: 179 | enable_hashjoin = true; 180 | enable_indexonlyscan = true; 181 | enable_indexscan = true; 182 | enable_mergejoin = true; 183 | break; 184 | case 22: 185 | enable_hashjoin = true; 186 | enable_indexonlyscan = true; 187 
| enable_mergejoin = true; 188 | break; 189 | case 23: 190 | enable_hashjoin = true; 191 | enable_indexscan = true; 192 | enable_nestloop = true; 193 | enable_seqscan = true; 194 | break; 195 | case 24: 196 | enable_hashjoin = true; 197 | enable_indexscan = true; 198 | break; 199 | case 25: 200 | enable_hashjoin = true; 201 | enable_indexonlyscan = true; 202 | enable_nestloop = true; 203 | break; 204 | default: 205 | elog(ERROR, "Invalid arm index %d selected.", arm); 206 | break; 207 | } 208 | } 209 | 210 | 211 | // Get a query plan for a particular arm. 212 | static PlannedStmt* plan_arm(int arm, Query* parse, 213 | int cursorOptions, ParamListInfo boundParams) { 214 | 215 | PlannedStmt* plan = NULL; 216 | Query* query_copy = copyObject(parse); // create a copy of the query plan 217 | 218 | if (arm == -1) { 219 | // Use whatever the user has set as the current configuration. 220 | plan = standard_planner(query_copy, cursorOptions, boundParams); 221 | return plan; 222 | } 223 | 224 | // Preserving the user's options, set the config to match the arm index 225 | // and invoke the PG planner. 226 | save_arm_options({ 227 | set_arm_options(arm); 228 | plan = standard_planner(query_copy, cursorOptions, boundParams); 229 | }); 230 | 231 | return plan; 232 | } 233 | 234 | // A struct to represent a query plan before we transform it into JSON. 235 | typedef struct BaoPlanNode { 236 | // An integer representation of the PG NodeTag. 237 | unsigned int node_type; 238 | 239 | // The optimizer cost for this node (total cost). 240 | double optimizer_cost; 241 | 242 | // The cardinality estimate (plan rows) for this node. 243 | double cardinality_estimate; 244 | 245 | // If this is a scan or index lookup, the name of the underlying relation. 246 | char* relation_name; 247 | 248 | // Left child. 249 | struct BaoPlanNode* left; 250 | 251 | 252 | // Right child. 253 | struct BaoPlanNode* right; 254 | } BaoPlanNode; 255 | 256 | // Transform the operator types we care about from their PG tag to a 257 | // string. Call other operators "Other". 258 | static const char* node_type_to_string(NodeTag tag) { 259 | switch (tag) { 260 | case T_SeqScan: 261 | return "Seq Scan"; 262 | case T_IndexScan: 263 | return "Index Scan"; 264 | case T_IndexOnlyScan: 265 | return "Index Only Scan"; 266 | case T_BitmapIndexScan: 267 | return "Bitmap Index Scan"; 268 | case T_NestLoop: 269 | return "Nested Loop"; 270 | case T_MergeJoin: 271 | return "Merge Join"; 272 | case T_HashJoin: 273 | return "Hash Join"; 274 | default: 275 | return "Other"; 276 | } 277 | } 278 | 279 | // Allocate an empty BaoPlanNode. 280 | static BaoPlanNode* new_bao_plan() { 281 | return (BaoPlanNode*) malloc(sizeof(BaoPlanNode)); 282 | } 283 | 284 | // Free (recursively) an entire BaoPlanNode. Frees children as well. 285 | static void free_bao_plan_node(BaoPlanNode* node) { 286 | if (node->left) free_bao_plan_node(node->left); 287 | if (node->right) free_bao_plan_node(node->right); 288 | free(node); 289 | } 290 | 291 | // Emit a JSON representation of the given BaoPlanNode to the stream given. 292 | // Recursive function, the entry point is `plan_to_json`. 293 | static void emit_json(BaoPlanNode* node, FILE* stream) { 294 | fprintf(stream, "{\"Node Type\": \"%s\",", node_type_to_string(node->node_type)); 295 | fprintf(stream, "\"Node Type ID\": \"%d\",", node->node_type); 296 | if (node->relation_name) 297 | // TODO need to escape the relation name for JSON... 
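// (As-is, a relation name containing a double quote or a backslash would
// produce invalid JSON here.)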
298 | fprintf(stream, "\"Relation Name\": \"%s\",", node->relation_name); 299 | fprintf(stream, "\"Total Cost\": %f,", node->optimizer_cost); 300 | fprintf(stream, "\"Plan Rows\": %f", node->cardinality_estimate); 301 | if (!node->left && !node->right) { 302 | fprintf(stream, "}"); 303 | return; 304 | } 305 | 306 | fprintf(stream, ", \"Plans\": ["); 307 | if (node->left) emit_json(node->left, stream); 308 | if (node->right) { 309 | fprintf(stream, ", "); 310 | emit_json(node->right, stream); 311 | } 312 | fprintf(stream, "]}"); 313 | } 314 | 315 | // Transform a PostgreSQL PlannedStmt into a BaoPlanNode tree. 316 | static BaoPlanNode* transform_plan(PlannedStmt* stmt, Plan* node) { 317 | BaoPlanNode* result = new_bao_plan(); 318 | 319 | result->node_type = node->type; 320 | result->optimizer_cost = node->total_cost; 321 | result->cardinality_estimate = node->plan_rows; 322 | result->relation_name = get_relation_name(stmt, node); 323 | 324 | result->left = NULL; 325 | result->right = NULL; 326 | if (node->lefttree) result->left = transform_plan(stmt, node->lefttree); 327 | if (node->righttree) result->right = transform_plan(stmt, node->righttree); 328 | 329 | return result; 330 | } 331 | 332 | // Given a PostgreSQL PlannedStmt, produce the JSON representation we need to 333 | // send to the Bao server. 334 | static char* plan_to_json(PlannedStmt* plan) { 335 | char* buf; 336 | size_t json_size; 337 | FILE* stream; 338 | BaoPlanNode* transformed_plan; 339 | 340 | transformed_plan = transform_plan(plan, plan->planTree); 341 | 342 | stream = open_memstream(&buf, &json_size); 343 | fprintf(stream, "{\"Plan\": "); 344 | emit_json(transformed_plan, stream); 345 | fprintf(stream, "}\n"); 346 | fclose(stream); 347 | 348 | free_bao_plan_node(transformed_plan); 349 | 350 | return buf; 351 | } 352 | 353 | // Primary planning function. Invokes the PG planner for each arm, sends the 354 | // results to the Bao server, gets the response, and returns the corresponding 355 | // query plan (as a BaoPlan). 356 | BaoPlan* plan_query(Query *parse, int cursorOptions, ParamListInfo boundParams) { 357 | BaoPlan* plan; 358 | PlannedStmt* plan_for_arm[BAO_MAX_ARMS]; 359 | char* json_for_arm[BAO_MAX_ARMS]; 360 | Query* query_copy; 361 | int conn_fd; 362 | 363 | // Prepare the plan object to store a BaoQueryInfo instance. 364 | plan = (BaoPlan*) malloc(sizeof(BaoPlan)); 365 | plan->query_info = (BaoQueryInfo*) malloc(sizeof(BaoQueryInfo)); 366 | plan->selection = 0; 367 | 368 | // Connect this buffer state with the query. 369 | plan->query_info->buffer_json = buffer_state(); 370 | 371 | if (!enable_bao_selection) { 372 | // If Bao is not picking query plans, we use arm -1 to get the 373 | // default PostgreSQL plan. Note that we do *not* use arm 0, as 374 | // this would ignore the user's settings for things like 375 | // enable_nestloop. 376 | plan->plan = plan_arm(-1, parse, cursorOptions, boundParams); 377 | plan->query_info->plan_json = plan_to_json(plan->plan); 378 | return plan; 379 | } 380 | 381 | conn_fd = connect_to_bao(bao_host, bao_port); 382 | if (conn_fd == -1) { 383 | elog(WARNING, "Unable to connect to Bao server."); 384 | return NULL; 385 | } 386 | 387 | memset(plan_for_arm, 0, BAO_MAX_ARMS*sizeof(PlannedStmt*)); 388 | 389 | write_all_to_socket(conn_fd, START_QUERY_MESSAGE); 390 | for (int i = 0; i < bao_num_arms; i++) { 391 | // Plan the query for this arm.
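// (Note that plan_arm makes its own copy of the query as well, so each
// arm plans an independent tree.)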
392 | query_copy = copyObject(parse); 393 | plan_for_arm[i] = plan_arm(i, query_copy, cursorOptions, boundParams); 394 | 395 | // Transform it into JSON, transmit it to the Bao server. 396 | json_for_arm[i] = plan_to_json(plan_for_arm[i]); 397 | write_all_to_socket(conn_fd, json_for_arm[i]); 398 | } 399 | 400 | write_all_to_socket(conn_fd, plan->query_info->buffer_json); 401 | write_all_to_socket(conn_fd, TERMINAL_MESSAGE); 402 | shutdown(conn_fd, SHUT_WR); 403 | 404 | // Read the response. 405 | if (read(conn_fd, &plan->selection, sizeof(unsigned int)) != sizeof(unsigned int)) { 406 | shutdown(conn_fd, SHUT_RDWR); 407 | elog(WARNING, "Bao could not read the response from the server."); 408 | plan->selection = 0; 409 | } 410 | shutdown(conn_fd, SHUT_RDWR); 411 | 412 | if (plan->selection >= BAO_MAX_ARMS) { 413 | elog(ERROR, "Bao server returned arm index %d, which is outside the range.", 414 | plan->selection); 415 | plan->selection = 0; 416 | } 417 | 418 | // Keep the plan the Bao server selected, and associate the JSON representation 419 | // of the plan with the BaoPlan. Free everything else. 420 | plan->plan = plan_for_arm[plan->selection]; 421 | for (int i = 0; i < bao_num_arms; i++) { 422 | if (i == plan->selection) { 423 | plan->query_info->plan_json = json_for_arm[i]; 424 | } else { 425 | free(json_for_arm[i]); 426 | } 427 | } 428 | 429 | return plan; 430 | } 431 | 432 | // Given an arm index, produce the SQL statements that would cause PostgreSQL to 433 | // select the same query plan as Bao would. 434 | char* arm_to_hint(int arm) { 435 | char* buf; 436 | size_t size; 437 | FILE* stream; 438 | 439 | stream = open_memstream(&buf, &size); 440 | 441 | save_arm_options({ 442 | set_arm_options(arm); 443 | if (!enable_nestloop) fprintf(stream, "SET enable_nestloop TO off; "); 444 | if (!enable_hashjoin) fprintf(stream, "SET enable_hashjoin TO off; "); 445 | if (!enable_mergejoin) fprintf(stream, "SET enable_mergejoin TO off; "); 446 | if (!enable_seqscan) fprintf(stream, "SET enable_seqscan TO off; "); 447 | if (!enable_indexscan) fprintf(stream, "SET enable_indexscan TO off; "); 448 | if (!enable_indexonlyscan) fprintf(stream, "SET enable_indexonlyscan TO off; "); 449 | }); 450 | 451 | fclose(stream); 452 | 453 | if (size == 0) return NULL; 454 | return buf; 455 | } 456 | 457 | #endif 458 | --------------------------------------------------------------------------------