├── .env.sample ├── .github └── workflows │ └── build.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── benchmark ├── README.md ├── data │ ├── create-indexes.ddl │ └── create-tables.ddl ├── queries.sql ├── query-templates │ ├── 1.sql │ ├── 10.sql │ ├── 11.sql │ ├── 12.sql │ ├── 13.sql │ ├── 14.sql │ ├── 15.sql │ ├── 16.sql │ ├── 17.sql │ ├── 18.sql │ ├── 19.sql │ ├── 2.sql │ ├── 20.sql │ ├── 21.sql │ ├── 22.sql │ ├── 3.sql │ ├── 4.sql │ ├── 5.sql │ ├── 6.sql │ ├── 7.sql │ ├── 8.sql │ └── 9.sql └── scripts │ ├── generate-data.sh │ ├── load-pg-data.sh │ └── measure-memory.sh ├── build └── .gitkeep ├── devbox.json ├── devbox.lock ├── img ├── BemiDB.gif ├── architecture.png └── tpc-h_database_structure.png ├── scripts ├── build-darwin.sh ├── build-linux.sh ├── install.sh ├── publish-docker.sh ├── test-data-types.sql ├── test-partitioned-tables.sql └── test-schemas.sql └── src ├── capped_buffer.go ├── capped_buffer_test.go ├── config.go ├── config_test.go ├── custom_types.go ├── duckdb.go ├── duckdb_test.go ├── error_utils.go ├── go.mod ├── go.sum ├── iceberg_reader.go ├── iceberg_writer.go ├── iceberg_writer_table.go ├── iceberg_writer_table_test.go ├── init_test.go ├── logger.go ├── main.go ├── parser_a_expr.go ├── parser_column_ref.go ├── parser_function.go ├── parser_select.go ├── parser_show.go ├── parser_table.go ├── parser_type_cast.go ├── parser_utils.go ├── pg_constants.go ├── pg_schema_column.go ├── postgres.go ├── query_handler.go ├── query_handler_test.go ├── query_remapper.go ├── query_remapper_expression.go ├── query_remapper_function.go ├── query_remapper_select.go ├── query_remapper_show.go ├── query_remapper_table.go ├── storage_interface.go ├── storage_local.go ├── storage_local_test.go ├── storage_s3.go ├── storage_utils.go ├── syncer.go ├── syncer_table.go ├── syncer_table_test.go ├── syncer_test.go └── utils.go /.env.sample: -------------------------------------------------------------------------------- 1 | BEMIDB_PORT=54321 2 | BEMIDB_DATABASE=bemidb 3 | BEMIDB_USER= 4 | BEMIDB_PASSWORD= 5 | BEMIDB_HOST=127.0.0.1 6 | BEMIDB_INIT_SQL=./init.sql 7 | BEMIDB_LOG_LEVEL=INFO 8 | 9 | # Local storage 10 | BEMIDB_STORAGE_TYPE=LOCAL 11 | BEMIDB_STORAGE_PATH=./iceberg 12 | 13 | # S3 storage 14 | # BEMIDB_STORAGE_TYPE=S3 15 | # BEMIDB_STORAGE_PATH=iceberg 16 | # AWS_REGION=us-west-1 17 | # AWS_ENDPOINT=s3.amazonaws.com 18 | # AWS_S3_BUCKET=[REPLACE_ME] 19 | # AWS_ACCESS_KEY_ID=[REPLACE_ME] 20 | # AWS_SECRET_ACCESS_KEY=[REPLACE_ME] 21 | 22 | # BEMIDB_DISABLE_ANONYMOUS_ANALYTICS=true 23 | 24 | # Postgres syncing 25 | PG_DATABASE_URL=postgres://[USER]:[PASSWORD]@localhost:5432/[DATABASE] 26 | # PG_SYNC_INTERVAL=1h 27 | # PG_SCHEMA_PREFIX=mydb_ 28 | # PG_INCLUDE_TABLES=public.users,public.posts 29 | # PG_EXCLUDE_TABLES=public.logs 30 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: ['**'] 6 | pull_request: 7 | branches: ['**'] 8 | 9 | jobs: 10 | test: 11 | name: Test 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Code 15 | uses: actions/checkout@v4 16 | 17 | - name: Set Up Go 18 | uses: actions/setup-go@v5 19 | with: 20 | go-version: '1.24.3' 21 | 22 | - name: Install Dependencies 23 | run: go get . 24 | working-directory: ./src 25 | 26 | - name: Run Tests 27 | run: go test -v ./... 
28 | working-directory: ./src 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /iceberg 2 | /iceberg-test 3 | /src/iceberg 4 | /src/iceberg-test 5 | .DS_Store 6 | .vscode 7 | .env 8 | /benchmark/tpch-kit 9 | /benchmark/data/*.tbl 10 | /build 11 | bemidb 12 | /src/__debug* 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | Please see [Releases](https://github.com/BemiHQ/BemiDB/releases). 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PLATFORM 2 | ARG GOOS 3 | ARG GOARCH 4 | 5 | FROM --platform=$PLATFORM golang:1.24.3 AS builder 6 | 7 | WORKDIR /app 8 | 9 | COPY src/go.mod src/go.sum ./ 10 | RUN go mod download 11 | 12 | COPY src/ . 13 | RUN CGO_ENABLED=1 GOOS=$GOOS GOARCH=$GOARCH go build -o /app/bemidb 14 | 15 | ################################################################################ 16 | 17 | FROM --platform=$PLATFORM debian:bookworm-slim 18 | 19 | WORKDIR /app 20 | 21 | COPY --from=builder /app/bemidb /app/bemidb 22 | 23 | ENTRYPOINT ["/app/bemidb"] 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | sh: 2 | devbox --env-file .env shell 3 | 4 | install: 5 | devbox run "cd src && go mod tidy" 6 | 7 | up: 8 | devbox run --env-file .env "cd src && go run ." 9 | 10 | .PHONY: build 11 | build: 12 | rm -rf build/bemidb-* && \ 13 | devbox run "./scripts/build-darwin.sh" && \ 14 | ./scripts/build-linux.sh 15 | 16 | build-local: 17 | rm -rf build/bemidb-* && \ 18 | cd src && go build -o ../build/bemidb-darwin-arm64 19 | 20 | publish: 21 | ./scripts/publish-docker.sh 22 | 23 | sync: 24 | devbox run --env-file .env "cd src && go run . sync" 25 | 26 | test: 27 | devbox run "cd src && go test ./..." 28 | 29 | test-function: 30 | devbox run "cd src && go test ./... -run $(FUNC)" 31 | 32 | debug: 33 | devbox run "cd src && dlv test github.com/BemiHQ/BemiDB" 34 | 35 | lint: 36 | devbox run "cd src && go fmt && deadcode . && staticcheck ." 
37 | 38 | console: 39 | devbox run "cd src && gore" 40 | 41 | outdated: 42 | devbox run "cd src && go list -u -m -f '{{if and .Update (not .Indirect)}}{{.}}{{end}}' all" 43 | 44 | .PHONY: benchmark 45 | benchmark: 46 | devbox run "time psql postgres://127.0.0.1:54321/bemidb < ./benchmark/queries.sql" 47 | 48 | pg-init: 49 | devbox run initdb && \ 50 | sed -i "s/#log_statement = 'none'/log_statement = 'all'/g" ./.devbox/virtenv/postgresql/data/postgresql.conf && \ 51 | sed -i "s/#logging_collector = off/logging_collector = on/g" ./.devbox/virtenv/postgresql/data/postgresql.conf && \ 52 | sed -i "s/#log_directory = 'log'/log_directory = 'log'/g" ./.devbox/virtenv/postgresql/data/postgresql.conf 53 | 54 | pg-up: 55 | devbox services start postgresql 56 | 57 | pg-create: 58 | devbox run "(dropdb tpch || true) && \ 59 | createdb tpch && \ 60 | ./benchmark/scripts/load-pg-data.sh" 61 | 62 | pg-index: 63 | devbox run "psql postgres://127.0.0.1:5432/tpch -f ./benchmark/data/create-indexes.ddl" 64 | 65 | pg-benchmark: 66 | devbox run "psql postgres://127.0.0.1:5432/tpch -c 'ANALYZE VERBOSE' && \ 67 | time psql postgres://127.0.0.1:5432/tpch < ./benchmark/queries.sql" 68 | 69 | pg-down: 70 | devbox services stop postgresql 71 | 72 | pg-logs: 73 | tail -f .devbox/virtenv/postgresql/data/log/postgresql-*.log 74 | 75 | pg-sniff: 76 | sudo tshark -i lo0 -f 'tcp port 5432' -d tcp.port==5432,pgsql -O pgsql 77 | 78 | tpch-install: 79 | devbox run "cd benchmark && \ 80 | rm -rf tpch-kit && \ 81 | git clone https://github.com/gregrahn/tpch-kit.git && \ 82 | cd tpch-kit/dbgen && \ 83 | make MACHINE=$$MACHINE DATABASE=POSTGRESQL" 84 | 85 | tpch-generate: 86 | devbox run "./benchmark/scripts/generate-data.sh" 87 | 88 | sniff: 89 | sudo tshark -i lo0 -f 'tcp port 54321' -d tcp.port==54321,pgsql -O pgsql 90 | 91 | measure-mem: 92 | devbox run "./benchmark/scripts/measure-memory.sh" 93 | 94 | profile-mem: 95 | devbox run "watch -n 1 go tool pprof -top http://localhost:6060/debug/pprof/heap" 96 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # BemiDB Benchmark 2 | 3 | We use the standardized TPC-H benchmark to compare PostgreSQL with BemiDB. 4 | This benchmark measures the performance of databases that handle large volumes of data and perform business-oriented ad-hoc queries (OLAP).
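For a concrete sense of the kind of ad-hoc query being measured, here is a minimal sketch adapted from the Q1 template in `benchmark/query-templates/1.sql`, with the `:1` parameter substituted by an example value (`90 days`) and the connection string taken from the defaults in `.env.sample` (adjust both to your setup):

```sh
# Run a single TPC-H-style aggregation against a locally running BemiDB,
# instead of the full ./benchmark/queries.sql suite used by `make benchmark`.
psql postgres://127.0.0.1:54321/bemidb -c "
  select
    l_returnflag,
    l_linestatus,
    sum(l_quantity) as sum_qty,
    count(*) as count_order
  from lineitem
  where l_shipdate <= date '1998-12-01' - interval '90 days'
  group by l_returnflag, l_linestatus
  order by l_returnflag, l_linestatus;"
```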
5 | 6 | ![TPC-H database structure](/img/tpc-h_database_structure.png) 7 | 8 | ## Running the TPC-H Benchmark 9 | 10 | ### PostgreSQL 11 | 12 | Download and unzip `TPC-H_generated_data_s*.zip` from the latest release into the "benchmark/data" directory and then set up a local PostgreSQL database: 13 | 14 | ```sh 15 | make pg-init 16 | make pg-up 17 | make pg-create 18 | ``` 19 | 20 | Run the benchmark queries with PostgreSQL: 21 | 22 | ```sh 23 | make pg-benchmark 24 | ``` 25 | 26 | Run the benchmark queries with indexed PostgreSQL: 27 | 28 | ```sh 29 | make pg-index 30 | make pg-benchmark 31 | ``` 32 | 33 | ### BemiDB 34 | 35 | Set up a local BemiDB database: 36 | 37 | ```sh 38 | make sync 39 | make up 40 | ``` 41 | 42 | Run the benchmark queries with BemiDB: 43 | 44 | ```sh 45 | make benchmark 46 | ``` 47 | 48 | ## Generating the TPC-H Data 49 | 50 | Install the TPC-H benchmark kit: 51 | 52 | ```sh 53 | make tpch-install MACHINE=MACOS # MACHINE=LINUX for Linux 54 | make tpch-generate SCALE_FACTOR=1 55 | ``` 56 | -------------------------------------------------------------------------------- /benchmark/data/create-indexes.ddl: -------------------------------------------------------------------------------- 1 | CREATE INDEX IF NOT EXISTS idx_part_name ON part (p_name varchar_pattern_ops); 2 | CREATE INDEX IF NOT EXISTS idx_part_brand_container ON part (p_brand, p_container, p_partkey); 3 | CREATE INDEX IF NOT EXISTS idx_part_partkey ON part (p_partkey); 4 | 5 | CREATE INDEX IF NOT EXISTS idx_partsupp_part_supp ON partsupp (ps_partkey, ps_suppkey); 6 | CREATE INDEX IF NOT EXISTS idx_partsupp_suppkey ON partsupp (ps_suppkey); 7 | 8 | CREATE INDEX IF NOT EXISTS idx_lineitem_dates ON lineitem (l_shipdate); 9 | CREATE INDEX IF NOT EXISTS idx_lineitem_part_supp ON lineitem (l_partkey, l_suppkey); 10 | CREATE INDEX IF NOT EXISTS idx_lineitem_part_qty ON lineitem (l_partkey, l_quantity, l_extendedprice); 11 | 12 | CREATE INDEX IF NOT EXISTS idx_nation_name ON nation (n_name); 13 | CREATE INDEX IF NOT EXISTS idx_nation_nationkey ON nation (n_nationkey); 14 | 15 | CREATE INDEX IF NOT EXISTS idx_supplier_nationkey ON supplier (s_nationkey); 16 | -------------------------------------------------------------------------------- /benchmark/data/create-tables.ddl: -------------------------------------------------------------------------------- 1 | -- Sccsid: @(#)dss.ddl 2.1.8.1 2 | CREATE TABLE NATION ( N_NATIONKEY INTEGER NOT NULL, 3 | N_NAME CHAR(25) NOT NULL, 4 | N_REGIONKEY INTEGER NOT NULL, 5 | N_COMMENT VARCHAR(152)); 6 | 7 | CREATE TABLE REGION ( R_REGIONKEY INTEGER NOT NULL, 8 | R_NAME CHAR(25) NOT NULL, 9 | R_COMMENT VARCHAR(152)); 10 | 11 | CREATE TABLE PART ( P_PARTKEY INTEGER NOT NULL, 12 | P_NAME VARCHAR(55) NOT NULL, 13 | P_MFGR CHAR(25) NOT NULL, 14 | P_BRAND CHAR(10) NOT NULL, 15 | P_TYPE VARCHAR(25) NOT NULL, 16 | P_SIZE INTEGER NOT NULL, 17 | P_CONTAINER CHAR(10) NOT NULL, 18 | P_RETAILPRICE DECIMAL(15,2) NOT NULL, 19 | P_COMMENT VARCHAR(23) NOT NULL ); 20 | 21 | CREATE TABLE SUPPLIER ( S_SUPPKEY INTEGER NOT NULL, 22 | S_NAME CHAR(25) NOT NULL, 23 | S_ADDRESS VARCHAR(40) NOT NULL, 24 | S_NATIONKEY INTEGER NOT NULL, 25 | S_PHONE CHAR(15) NOT NULL, 26 | S_ACCTBAL DECIMAL(15,2) NOT NULL, 27 | S_COMMENT VARCHAR(101) NOT NULL); 28 | 29 | CREATE TABLE PARTSUPP ( PS_PARTKEY INTEGER NOT NULL, 30 | PS_SUPPKEY INTEGER NOT NULL, 31 | PS_AVAILQTY INTEGER NOT NULL, 32 | PS_SUPPLYCOST DECIMAL(15,2) NOT NULL, 33 | PS_COMMENT VARCHAR(199) NOT NULL ); 34 | 35 | CREATE TABLE CUSTOMER ( C_CUSTKEY INTEGER 
NOT NULL, 36 | C_NAME VARCHAR(25) NOT NULL, 37 | C_ADDRESS VARCHAR(40) NOT NULL, 38 | C_NATIONKEY INTEGER NOT NULL, 39 | C_PHONE CHAR(15) NOT NULL, 40 | C_ACCTBAL DECIMAL(15,2) NOT NULL, 41 | C_MKTSEGMENT CHAR(10) NOT NULL, 42 | C_COMMENT VARCHAR(117) NOT NULL); 43 | 44 | CREATE TABLE ORDERS ( O_ORDERKEY INTEGER NOT NULL, 45 | O_CUSTKEY INTEGER NOT NULL, 46 | O_ORDERSTATUS CHAR(1) NOT NULL, 47 | O_TOTALPRICE DECIMAL(15,2) NOT NULL, 48 | O_ORDERDATE DATE NOT NULL, 49 | O_ORDERPRIORITY CHAR(15) NOT NULL, 50 | O_CLERK CHAR(15) NOT NULL, 51 | O_SHIPPRIORITY INTEGER NOT NULL, 52 | O_COMMENT VARCHAR(79) NOT NULL); 53 | 54 | CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL, 55 | L_PARTKEY INTEGER NOT NULL, 56 | L_SUPPKEY INTEGER NOT NULL, 57 | L_LINENUMBER INTEGER NOT NULL, 58 | L_QUANTITY DECIMAL(15,2) NOT NULL, 59 | L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL, 60 | L_DISCOUNT DECIMAL(15,2) NOT NULL, 61 | L_TAX DECIMAL(15,2) NOT NULL, 62 | L_RETURNFLAG CHAR(1) NOT NULL, 63 | L_LINESTATUS CHAR(1) NOT NULL, 64 | L_SHIPDATE DATE NOT NULL, 65 | L_COMMITDATE DATE NOT NULL, 66 | L_RECEIPTDATE DATE NOT NULL, 67 | L_SHIPINSTRUCT CHAR(25) NOT NULL, 68 | L_SHIPMODE CHAR(10) NOT NULL, 69 | L_COMMENT VARCHAR(44) NOT NULL); 70 | 71 | -------------------------------------------------------------------------------- /benchmark/query-templates/1.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Pricing Summary Report Query (Q1) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | l_returnflag, 9 | l_linestatus, 10 | sum(l_quantity) as sum_qty, 11 | sum(l_extendedprice) as sum_base_price, 12 | sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, 13 | sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, 14 | avg(l_quantity) as avg_qty, 15 | avg(l_extendedprice) as avg_price, 16 | avg(l_discount) as avg_disc, 17 | count(*) as count_order 18 | from 19 | lineitem 20 | where 21 | l_shipdate <= date '1998-12-01' - interval ':1 day' 22 | group by 23 | l_returnflag, 24 | l_linestatus 25 | order by 26 | l_returnflag, 27 | l_linestatus; 28 | :n -1 29 | -------------------------------------------------------------------------------- /benchmark/query-templates/10.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Returned Item Reporting Query (Q10) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | c_custkey, 9 | c_name, 10 | sum(l_extendedprice * (1 - l_discount)) as revenue, 11 | c_acctbal, 12 | n_name, 13 | c_address, 14 | c_phone, 15 | c_comment 16 | from 17 | customer, 18 | orders, 19 | lineitem, 20 | nation 21 | where 22 | c_custkey = o_custkey 23 | and l_orderkey = o_orderkey 24 | and o_orderdate >= date ':1' 25 | and o_orderdate < date ':1' + interval '3 months' 26 | and l_returnflag = 'R' 27 | and c_nationkey = n_nationkey 28 | group by 29 | c_custkey, 30 | c_name, 31 | c_acctbal, 32 | c_phone, 33 | n_name, 34 | c_address, 35 | c_comment 36 | order by 37 | revenue desc; 38 | :n 20 39 | -------------------------------------------------------------------------------- /benchmark/query-templates/11.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Important Stock Identification Query (Q11) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | ps_partkey, 9 | sum(ps_supplycost * 
ps_availqty) as value 10 | from 11 | partsupp, 12 | supplier, 13 | nation 14 | where 15 | ps_suppkey = s_suppkey 16 | and s_nationkey = n_nationkey 17 | and n_name = ':1' 18 | group by 19 | ps_partkey having 20 | sum(ps_supplycost * ps_availqty) > ( 21 | select 22 | sum(ps_supplycost * ps_availqty) * :2 23 | from 24 | partsupp, 25 | supplier, 26 | nation 27 | where 28 | ps_suppkey = s_suppkey 29 | and s_nationkey = n_nationkey 30 | and n_name = ':1' 31 | ) 32 | order by 33 | value desc; 34 | :n -1 35 | -------------------------------------------------------------------------------- /benchmark/query-templates/12.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | l_shipmode, 9 | sum(case 10 | when o_orderpriority = '1-URGENT' 11 | or o_orderpriority = '2-HIGH' 12 | then 1 13 | else 0 14 | end) as high_line_count, 15 | sum(case 16 | when o_orderpriority <> '1-URGENT' 17 | and o_orderpriority <> '2-HIGH' 18 | then 1 19 | else 0 20 | end) as low_line_count 21 | from 22 | orders, 23 | lineitem 24 | where 25 | o_orderkey = l_orderkey 26 | and l_shipmode in (':1', ':2') 27 | and l_commitdate < l_receiptdate 28 | and l_shipdate < l_commitdate 29 | and l_receiptdate >= date ':3' 30 | and l_receiptdate < date ':3' + interval '1 year' 31 | group by 32 | l_shipmode 33 | order by 34 | l_shipmode; 35 | :n -1 36 | -------------------------------------------------------------------------------- /benchmark/query-templates/13.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Customer Distribution Query (Q13) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | c_count, 9 | count(*) as custdist 10 | from 11 | ( 12 | select 13 | c_custkey, 14 | count(o_orderkey) 15 | from 16 | customer left outer join orders on 17 | c_custkey = o_custkey 18 | and o_comment not like '%:1%:2%' 19 | group by 20 | c_custkey 21 | ) as c_orders (c_custkey, c_count) 22 | group by 23 | c_count 24 | order by 25 | custdist desc, 26 | c_count desc; 27 | :n -1 28 | -------------------------------------------------------------------------------- /benchmark/query-templates/14.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Promotion Effect Query (Q14) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | 100.00 * sum(case 9 | when p_type like 'PROMO%' 10 | then l_extendedprice * (1 - l_discount) 11 | else 0 12 | end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue 13 | from 14 | lineitem, 15 | part 16 | where 17 | l_partkey = p_partkey 18 | and l_shipdate >= date ':1' 19 | and l_shipdate < date ':1' + interval '1 month'; 20 | :n -1 21 | -------------------------------------------------------------------------------- /benchmark/query-templates/15.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Top Supplier Query (Q15) 3 | -- Variant A 4 | -- Approved February 1998 5 | -- 6 | -- BemiDB: Variant without "view" 7 | :x 8 | with revenue (supplier_no, total_revenue) as ( 9 | select 10 | l_suppkey, 11 | sum(l_extendedprice * (1-l_discount)) 12 | from 13 | lineitem 14 | where 15 | l_shipdate >= date ':1' 16 | and l_shipdate < date ':1' + interval '3 
months' 17 | group by 18 | l_suppkey 19 | ) 20 | 21 | :o 22 | select 23 | s_suppkey, 24 | s_name, 25 | s_address, 26 | s_phone, 27 | total_revenue 28 | from 29 | supplier, 30 | revenue 31 | where 32 | s_suppkey = supplier_no 33 | and total_revenue = ( 34 | select 35 | max(total_revenue) 36 | from 37 | revenue 38 | ) 39 | order by 40 | s_suppkey; 41 | :n -1 42 | -------------------------------------------------------------------------------- /benchmark/query-templates/16.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | p_brand, 9 | p_type, 10 | p_size, 11 | count(distinct ps_suppkey) as supplier_cnt 12 | from 13 | partsupp, 14 | part 15 | where 16 | p_partkey = ps_partkey 17 | and p_brand <> ':1' 18 | and p_type not like ':2%' 19 | and p_size in (:3, :4, :5, :6, :7, :8, :9, :10) 20 | and ps_suppkey not in ( 21 | select 22 | s_suppkey 23 | from 24 | supplier 25 | where 26 | s_comment like '%Customer%Complaints%' 27 | ) 28 | group by 29 | p_brand, 30 | p_type, 31 | p_size 32 | order by 33 | supplier_cnt desc, 34 | p_brand, 35 | p_type, 36 | p_size; 37 | :n -1 38 | -------------------------------------------------------------------------------- /benchmark/query-templates/17.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | -- 6 | -- BemiDB: 20s with unindexed PostgreSQL 7 | :x 8 | :o 9 | select 10 | sum(l_extendedprice) / 7.0 as avg_yearly 11 | from 12 | lineitem, 13 | part 14 | where 15 | p_partkey = l_partkey 16 | and p_brand = ':1' 17 | and p_container = ':2' 18 | and l_quantity < ( 19 | select 20 | 0.2 * avg(l_quantity) 21 | from 22 | lineitem 23 | where 24 | l_partkey = p_partkey 25 | ); 26 | :n -1 27 | -------------------------------------------------------------------------------- /benchmark/query-templates/18.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Large Volume Customer Query (Q18) 3 | -- Function Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | c_name, 9 | c_custkey, 10 | o_orderkey, 11 | o_orderdate, 12 | o_totalprice, 13 | sum(l_quantity) 14 | from 15 | customer, 16 | orders, 17 | lineitem 18 | where 19 | o_orderkey in ( 20 | select 21 | l_orderkey 22 | from 23 | lineitem 24 | group by 25 | l_orderkey having 26 | sum(l_quantity) > :1 27 | ) 28 | and c_custkey = o_custkey 29 | and o_orderkey = l_orderkey 30 | group by 31 | c_name, 32 | c_custkey, 33 | o_orderkey, 34 | o_orderdate, 35 | o_totalprice 36 | order by 37 | o_totalprice desc, 38 | o_orderdate; 39 | :n 100 40 | -------------------------------------------------------------------------------- /benchmark/query-templates/19.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Discounted Revenue Query (Q19) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | sum(l_extendedprice* (1 - l_discount)) as revenue 9 | from 10 | lineitem, 11 | part 12 | where 13 | ( 14 | p_partkey = l_partkey 15 | and p_brand = ':1' 16 | and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') 17 | and l_quantity >= :4 and l_quantity <= :4 + 10 18 | and p_size between 1 
and 5 19 | and l_shipmode in ('AIR', 'AIR REG') 20 | and l_shipinstruct = 'DELIVER IN PERSON' 21 | ) 22 | or 23 | ( 24 | p_partkey = l_partkey 25 | and p_brand = ':2' 26 | and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') 27 | and l_quantity >= :5 and l_quantity <= :5 + 10 28 | and p_size between 1 and 10 29 | and l_shipmode in ('AIR', 'AIR REG') 30 | and l_shipinstruct = 'DELIVER IN PERSON' 31 | ) 32 | or 33 | ( 34 | p_partkey = l_partkey 35 | and p_brand = ':3' 36 | and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') 37 | and l_quantity >= :6 and l_quantity <= :6 + 10 38 | and p_size between 1 and 15 39 | and l_shipmode in ('AIR', 'AIR REG') 40 | and l_shipinstruct = 'DELIVER IN PERSON' 41 | ); 42 | :n -1 43 | -------------------------------------------------------------------------------- /benchmark/query-templates/2.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | s_acctbal, 9 | s_name, 10 | n_name, 11 | p_partkey, 12 | p_mfgr, 13 | s_address, 14 | s_phone, 15 | s_comment 16 | from 17 | part, 18 | supplier, 19 | partsupp, 20 | nation, 21 | region 22 | where 23 | p_partkey = ps_partkey 24 | and s_suppkey = ps_suppkey 25 | and p_size = :1 26 | and p_type like '%:2' 27 | and s_nationkey = n_nationkey 28 | and n_regionkey = r_regionkey 29 | and r_name = ':3' 30 | and ps_supplycost = ( 31 | select 32 | min(ps_supplycost) 33 | from 34 | partsupp, 35 | supplier, 36 | nation, 37 | region 38 | where 39 | p_partkey = ps_partkey 40 | and s_suppkey = ps_suppkey 41 | and s_nationkey = n_nationkey 42 | and n_regionkey = r_regionkey 43 | and r_name = ':3' 44 | ) 45 | order by 46 | s_acctbal desc, 47 | n_name, 48 | s_name, 49 | p_partkey; 50 | :n 100 51 | -------------------------------------------------------------------------------- /benchmark/query-templates/20.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Potential Part Promotion Query (Q20) 3 | -- Function Query Definition 4 | -- Approved February 1998 5 | -- 6 | -- BemiDB: 1h 23m with unindexed PostgreSQL 7 | :x 8 | :o 9 | select 10 | s_name, 11 | s_address 12 | from 13 | supplier, 14 | nation 15 | where 16 | s_suppkey in ( 17 | select 18 | ps_suppkey 19 | from 20 | partsupp 21 | where 22 | ps_partkey in ( 23 | select 24 | p_partkey 25 | from 26 | part 27 | where 28 | p_name like ':1%' 29 | ) 30 | and ps_availqty > ( 31 | select 32 | 0.5 * sum(l_quantity) 33 | from 34 | lineitem 35 | where 36 | l_partkey = ps_partkey 37 | and l_suppkey = ps_suppkey 38 | and l_shipdate >= date ':2' 39 | and l_shipdate < date ':2' + interval '1 year' 40 | ) 41 | ) 42 | and s_nationkey = n_nationkey 43 | and n_name = ':3' 44 | order by 45 | s_name; 46 | :n -1 47 | -------------------------------------------------------------------------------- /benchmark/query-templates/21.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | s_name, 9 | count(*) as numwait 10 | from 11 | supplier, 12 | lineitem l1, 13 | orders, 14 | nation 15 | where 16 | s_suppkey = l1.l_suppkey 17 | and o_orderkey = l1.l_orderkey 18 | and o_orderstatus = 'F' 19 | and l1.l_receiptdate > l1.l_commitdate 20 | and 
exists ( 21 | select 22 | * 23 | from 24 | lineitem l2 25 | where 26 | l2.l_orderkey = l1.l_orderkey 27 | and l2.l_suppkey <> l1.l_suppkey 28 | ) 29 | and not exists ( 30 | select 31 | * 32 | from 33 | lineitem l3 34 | where 35 | l3.l_orderkey = l1.l_orderkey 36 | and l3.l_suppkey <> l1.l_suppkey 37 | and l3.l_receiptdate > l3.l_commitdate 38 | ) 39 | and s_nationkey = n_nationkey 40 | and n_name = ':1' 41 | group by 42 | s_name 43 | order by 44 | numwait desc, 45 | s_name; 46 | :n 100 47 | -------------------------------------------------------------------------------- /benchmark/query-templates/22.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Global Sales Opportunity Query (Q22) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | cntrycode, 9 | count(*) as numcust, 10 | sum(c_acctbal) as totacctbal 11 | from 12 | ( 13 | select 14 | substring(c_phone from 1 for 2) as cntrycode, 15 | c_acctbal 16 | from 17 | customer 18 | where 19 | substring(c_phone from 1 for 2) in 20 | (':1', ':2', ':3', ':4', ':5', ':6', ':7') 21 | and c_acctbal > ( 22 | select 23 | avg(c_acctbal) 24 | from 25 | customer 26 | where 27 | c_acctbal > 0.00 28 | and substring(c_phone from 1 for 2) in 29 | (':1', ':2', ':3', ':4', ':5', ':6', ':7') 30 | ) 31 | and not exists ( 32 | select 33 | * 34 | from 35 | orders 36 | where 37 | o_custkey = c_custkey 38 | ) 39 | ) as custsale 40 | group by 41 | cntrycode 42 | order by 43 | cntrycode; 44 | :n -1 45 | -------------------------------------------------------------------------------- /benchmark/query-templates/3.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Shipping Priority Query (Q3) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | l_orderkey, 9 | sum(l_extendedprice * (1 - l_discount)) as revenue, 10 | o_orderdate, 11 | o_shippriority 12 | from 13 | customer, 14 | orders, 15 | lineitem 16 | where 17 | c_mktsegment = ':1' 18 | and c_custkey = o_custkey 19 | and l_orderkey = o_orderkey 20 | and o_orderdate < date ':2' 21 | and l_shipdate > date ':2' 22 | group by 23 | l_orderkey, 24 | o_orderdate, 25 | o_shippriority 26 | order by 27 | revenue desc, 28 | o_orderdate; 29 | :n 10 30 | -------------------------------------------------------------------------------- /benchmark/query-templates/4.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Order Priority Checking Query (Q4) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | o_orderpriority, 9 | count(*) as order_count 10 | from 11 | orders 12 | where 13 | o_orderdate >= date ':1' 14 | and o_orderdate < date ':1' + interval '3 months' 15 | and exists ( 16 | select 17 | * 18 | from 19 | lineitem 20 | where 21 | l_orderkey = o_orderkey 22 | and l_commitdate < l_receiptdate 23 | ) 24 | group by 25 | o_orderpriority 26 | order by 27 | o_orderpriority; 28 | :n -1 29 | -------------------------------------------------------------------------------- /benchmark/query-templates/5.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Local Supplier Volume Query (Q5) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | n_name, 9 | sum(l_extendedprice * (1 - l_discount)) as revenue 10 | from 11 | 
customer, 12 | orders, 13 | lineitem, 14 | supplier, 15 | nation, 16 | region 17 | where 18 | c_custkey = o_custkey 19 | and l_orderkey = o_orderkey 20 | and l_suppkey = s_suppkey 21 | and c_nationkey = s_nationkey 22 | and s_nationkey = n_nationkey 23 | and n_regionkey = r_regionkey 24 | and r_name = ':1' 25 | and o_orderdate >= date ':2' 26 | and o_orderdate < date ':2' + interval '1 year' 27 | group by 28 | n_name 29 | order by 30 | revenue desc; 31 | :n -1 32 | -------------------------------------------------------------------------------- /benchmark/query-templates/6.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | sum(l_extendedprice * l_discount) as revenue 9 | from 10 | lineitem 11 | where 12 | l_shipdate >= date ':1' 13 | and l_shipdate < date ':1' + interval '1 year' 14 | and l_discount between :2 - 0.01 and :2 + 0.01 15 | and l_quantity < :3; 16 | :n -1 17 | -------------------------------------------------------------------------------- /benchmark/query-templates/7.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Volume Shipping Query (Q7) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | supp_nation, 9 | cust_nation, 10 | l_year, 11 | sum(volume) as revenue 12 | from 13 | ( 14 | select 15 | n1.n_name as supp_nation, 16 | n2.n_name as cust_nation, 17 | extract(year from l_shipdate) as l_year, 18 | l_extendedprice * (1 - l_discount) as volume 19 | from 20 | supplier, 21 | lineitem, 22 | orders, 23 | customer, 24 | nation n1, 25 | nation n2 26 | where 27 | s_suppkey = l_suppkey 28 | and o_orderkey = l_orderkey 29 | and c_custkey = o_custkey 30 | and s_nationkey = n1.n_nationkey 31 | and c_nationkey = n2.n_nationkey 32 | and ( 33 | (n1.n_name = ':1' and n2.n_name = ':2') 34 | or (n1.n_name = ':2' and n2.n_name = ':1') 35 | ) 36 | and l_shipdate between date '1995-01-01' and date '1996-12-31' 37 | ) as shipping 38 | group by 39 | supp_nation, 40 | cust_nation, 41 | l_year 42 | order by 43 | supp_nation, 44 | cust_nation, 45 | l_year; 46 | :n -1 47 | -------------------------------------------------------------------------------- /benchmark/query-templates/8.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R National Market Share Query (Q8) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | o_year, 9 | sum(case 10 | when nation = ':1' then volume 11 | else 0 12 | end) / sum(volume) as mkt_share 13 | from 14 | ( 15 | select 16 | extract(year from o_orderdate) as o_year, 17 | l_extendedprice * (1 - l_discount) as volume, 18 | n2.n_name as nation 19 | from 20 | part, 21 | supplier, 22 | lineitem, 23 | orders, 24 | customer, 25 | nation n1, 26 | nation n2, 27 | region 28 | where 29 | p_partkey = l_partkey 30 | and s_suppkey = l_suppkey 31 | and l_orderkey = o_orderkey 32 | and o_custkey = c_custkey 33 | and c_nationkey = n1.n_nationkey 34 | and n1.n_regionkey = r_regionkey 35 | and r_name = ':2' 36 | and s_nationkey = n2.n_nationkey 37 | and o_orderdate between date '1995-01-01' and date '1996-12-31' 38 | and p_type = ':3' 39 | ) as all_nations 40 | group by 41 | o_year 42 | order by 43 | o_year; 44 | :n -1 45 | 
-------------------------------------------------------------------------------- /benchmark/query-templates/9.sql: -------------------------------------------------------------------------------- 1 | -- $ID$ 2 | -- TPC-H/TPC-R Product Type Profit Measure Query (Q9) 3 | -- Functional Query Definition 4 | -- Approved February 1998 5 | :x 6 | :o 7 | select 8 | nation, 9 | o_year, 10 | sum(amount) as sum_profit 11 | from 12 | ( 13 | select 14 | n_name as nation, 15 | extract(year from o_orderdate) as o_year, 16 | l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount 17 | from 18 | part, 19 | supplier, 20 | lineitem, 21 | partsupp, 22 | orders, 23 | nation 24 | where 25 | s_suppkey = l_suppkey 26 | and ps_suppkey = l_suppkey 27 | and ps_partkey = l_partkey 28 | and p_partkey = l_partkey 29 | and o_orderkey = l_orderkey 30 | and s_nationkey = n_nationkey 31 | and p_name like '%:1%' 32 | ) as profit 33 | group by 34 | nation, 35 | o_year 36 | order by 37 | nation, 38 | o_year desc; 39 | :n -1 40 | -------------------------------------------------------------------------------- /benchmark/scripts/generate-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd benchmark 4 | 5 | # Structure 6 | cp ./tpch-kit/dbgen/dss.ddl ./data/create-tables.ddl 7 | 8 | # Data 9 | cd ./tpch-kit/dbgen 10 | export DSS_PATH=../../data 11 | export DSS_CONFIG=./ 12 | ./dbgen -vf -s $SCALE_FACTOR # 1 = 1GB 13 | 14 | # Queries 15 | cd - 16 | rm -rf /tmp/query-templates 17 | mkdir /tmp/query-templates 18 | for i in `ls query-templates/*.sql`; do 19 | tac $i | sed '2s/;//' | tac > /tmp/$i # Remove ";" 20 | done 21 | cd ./tpch-kit/dbgen 22 | export DSS_QUERY=/tmp/query-templates 23 | ./qgen -v -s 0.1 | sed 's/limit -1//' > ../../queries.sql 24 | -------------------------------------------------------------------------------- /benchmark/scripts/load-pg-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -L -o ./tpch.zip https://github.com/BemiHQ/BemiDB/releases/download/v0.1.0/TPC-H_generated_data_s0.1.zip 4 | unzip ./tpch.zip -d ./benchmark/data 5 | rm ./tpch.zip 6 | 7 | cd ./benchmark/data 8 | 9 | mv ./TPC-H_generated_data/* ./ 10 | rm -rf ./TPC-H_generated_data 11 | rm -rf ./__MACOSX 12 | 13 | psql postgres://127.0.0.1:5432/tpch -f ./create-tables.ddl 14 | 15 | for i in `ls *.tbl`; do 16 | table=${i/.tbl/} 17 | echo "Loading $table..." 18 | sed 's/|$//' $i > /tmp/$i 19 | psql postgres://127.0.0.1:5432/tpch -q -c "TRUNCATE $table" 20 | psql postgres://127.0.0.1:5432/tpch -c "\\copy $table FROM '/tmp/$i' CSV DELIMITER '|'" 21 | done 22 | -------------------------------------------------------------------------------- /benchmark/scripts/measure-memory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PID=$(ps | grep "/exe/BemiDB sync" | grep -v grep | awk '{print $1}') 4 | 5 | if ! 
ps -p "$PID" > /dev/null 2>&1; then 6 | echo "Error: Process with PID $PID does not exist" 7 | exit 1 8 | fi 9 | 10 | echo "PID Time Mem" 11 | 12 | max_memory=0 13 | 14 | while true; do 15 | current_time=$(date "+%Y-%m-%d %H:%M:%S") 16 | current_memory=$(top -pid $PID -stats mem -l 1 | tail -n 1 | sed 's/[^0-9]*//g') 17 | 18 | if [ "$current_memory" -gt "$max_memory" ]; then 19 | max_memory=$current_memory 20 | printf "%s %s %sMB (%sMB new max)\n" "$PID" "$current_time" "$current_memory" "$max_memory" 21 | else 22 | printf "%s %s %sMB\n" "$PID" "$current_time" "$current_memory" 23 | fi 24 | 25 | sleep 1 26 | done 27 | -------------------------------------------------------------------------------- /build/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BemiHQ/BemiDB/989c88ec373493fce3e76694a07a3cae0c429323/build/.gitkeep -------------------------------------------------------------------------------- /devbox.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/jetify-com/devbox/0.13.1/.schema/devbox.schema.json", 3 | "packages": [ 4 | "go@latest", 5 | "postgresql@latest", 6 | "gcc@latest" 7 | ], 8 | "shell": { 9 | "init_hook": [], 10 | "scripts": { 11 | "test": [ 12 | "echo \"Error: no test specified\" && exit 1" 13 | ] 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /devbox.lock: -------------------------------------------------------------------------------- 1 | { 2 | "lockfile_version": "1", 3 | "packages": { 4 | "gcc@latest": { 5 | "last_modified": "2025-02-07T11:26:36Z", 6 | "resolved": "github:NixOS/nixpkgs/d98abf5cf5914e5e4e9d57205e3af55ca90ffc1d#gcc", 7 | "source": "devbox-search", 8 | "version": "14-20241116", 9 | "systems": { 10 | "aarch64-darwin": { 11 | "outputs": [ 12 | { 13 | "name": "out", 14 | "path": "/nix/store/3k1dfk03xkmaf6cksgpk492k3m8brvmp-gcc-wrapper-14-20241116", 15 | "default": true 16 | }, 17 | { 18 | "name": "man", 19 | "path": "/nix/store/cf67mihrjf3a1w4sw8jkgw49kfi54wpq-gcc-wrapper-14-20241116-man", 20 | "default": true 21 | }, 22 | { 23 | "name": "info", 24 | "path": "/nix/store/jgsywypzhdim8s6x25cnr31ygm28lhin-gcc-wrapper-14-20241116-info" 25 | } 26 | ], 27 | "store_path": "/nix/store/3k1dfk03xkmaf6cksgpk492k3m8brvmp-gcc-wrapper-14-20241116" 28 | }, 29 | "aarch64-linux": { 30 | "outputs": [ 31 | { 32 | "name": "out", 33 | "path": "/nix/store/pavcqvq7ycdbpal1mfjsscvgngzsg9sp-gcc-wrapper-14-20241116", 34 | "default": true 35 | }, 36 | { 37 | "name": "man", 38 | "path": "/nix/store/srwbca27wpylwzaqz9ssbhlkx910ryv1-gcc-wrapper-14-20241116-man", 39 | "default": true 40 | }, 41 | { 42 | "name": "info", 43 | "path": "/nix/store/yrdz1bra65cc9i2n2ghsz98g4fx2jra3-gcc-wrapper-14-20241116-info" 44 | } 45 | ], 46 | "store_path": "/nix/store/pavcqvq7ycdbpal1mfjsscvgngzsg9sp-gcc-wrapper-14-20241116" 47 | }, 48 | "x86_64-darwin": { 49 | "outputs": [ 50 | { 51 | "name": "out", 52 | "path": "/nix/store/g2462k2svl4zn5q7yypqirp6xxq0s9aq-gcc-wrapper-14-20241116", 53 | "default": true 54 | }, 55 | { 56 | "name": "man", 57 | "path": "/nix/store/s0vndfpg8gsz40wmf3v11csk3n497kqm-gcc-wrapper-14-20241116-man", 58 | "default": true 59 | }, 60 | { 61 | "name": "info", 62 | "path": "/nix/store/3x75ibh7rj95vm09q38iqj8l5jwbrarl-gcc-wrapper-14-20241116-info" 63 | } 64 | ], 65 | "store_path": "/nix/store/g2462k2svl4zn5q7yypqirp6xxq0s9aq-gcc-wrapper-14-20241116" 66 | }, 67 
| "x86_64-linux": { 68 | "outputs": [ 69 | { 70 | "name": "out", 71 | "path": "/nix/store/4ijy8jbsiqmj37avrk83gn2m903486mr-gcc-wrapper-14-20241116", 72 | "default": true 73 | }, 74 | { 75 | "name": "man", 76 | "path": "/nix/store/vyax7fpbw688qwx32c1i8n1f5jbjkcii-gcc-wrapper-14-20241116-man", 77 | "default": true 78 | }, 79 | { 80 | "name": "info", 81 | "path": "/nix/store/m4641rrm7dw80bn23ab0812pm6aj6402-gcc-wrapper-14-20241116-info" 82 | } 83 | ], 84 | "store_path": "/nix/store/4ijy8jbsiqmj37avrk83gn2m903486mr-gcc-wrapper-14-20241116" 85 | } 86 | } 87 | }, 88 | "github:NixOS/nixpkgs/nixpkgs-unstable": { 89 | "resolved": "github:NixOS/nixpkgs/d9b69c3ec2a2e2e971c534065bdd53374bd68b97?lastModified=1740396192&narHash=sha256-ATMHHrg3sG1KgpQA5x8I%2BzcYpp5Sf17FaFj%2FfN%2B8OoQ%3D" 90 | }, 91 | "go@latest": { 92 | "last_modified": "2025-05-16T20:19:48Z", 93 | "resolved": "github:NixOS/nixpkgs/12a55407652e04dcf2309436eb06fef0d3713ef3#go", 94 | "source": "devbox-search", 95 | "version": "1.24.3", 96 | "systems": { 97 | "aarch64-darwin": { 98 | "outputs": [ 99 | { 100 | "name": "out", 101 | "path": "/nix/store/ps3admpzmc1ryvn9q7sw5xfd94dkrb3f-go-1.24.3", 102 | "default": true 103 | } 104 | ], 105 | "store_path": "/nix/store/ps3admpzmc1ryvn9q7sw5xfd94dkrb3f-go-1.24.3" 106 | }, 107 | "aarch64-linux": { 108 | "outputs": [ 109 | { 110 | "name": "out", 111 | "path": "/nix/store/45bnqhyyq40p91k3cjw0farx3hn1swx6-go-1.24.3", 112 | "default": true 113 | } 114 | ], 115 | "store_path": "/nix/store/45bnqhyyq40p91k3cjw0farx3hn1swx6-go-1.24.3" 116 | }, 117 | "x86_64-darwin": { 118 | "outputs": [ 119 | { 120 | "name": "out", 121 | "path": "/nix/store/9z2kb6hxij7pqi0fgcn9ijhpb7ajpazs-go-1.24.3", 122 | "default": true 123 | } 124 | ], 125 | "store_path": "/nix/store/9z2kb6hxij7pqi0fgcn9ijhpb7ajpazs-go-1.24.3" 126 | }, 127 | "x86_64-linux": { 128 | "outputs": [ 129 | { 130 | "name": "out", 131 | "path": "/nix/store/5xvi25nqmbrg58aixp4zgczilfnp7pwg-go-1.24.3", 132 | "default": true 133 | } 134 | ], 135 | "store_path": "/nix/store/5xvi25nqmbrg58aixp4zgczilfnp7pwg-go-1.24.3" 136 | } 137 | } 138 | }, 139 | "postgresql@latest": { 140 | "last_modified": "2025-03-25T17:32:05Z", 141 | "plugin_version": "0.0.2", 142 | "resolved": "github:NixOS/nixpkgs/25d1b84f5c90632a623c48d83a2faf156451e6b1#postgresql", 143 | "source": "devbox-search", 144 | "version": "17.4", 145 | "systems": { 146 | "aarch64-darwin": { 147 | "outputs": [ 148 | { 149 | "name": "out", 150 | "path": "/nix/store/prh52g9iwjdddxbv4n0b52gbnlxnnk6w-postgresql-17.4", 151 | "default": true 152 | }, 153 | { 154 | "name": "man", 155 | "path": "/nix/store/il144892arv36x68b5y95bkvrq32ym91-postgresql-17.4-man", 156 | "default": true 157 | }, 158 | { 159 | "name": "dev", 160 | "path": "/nix/store/7gf8hy13r7li2balcini7004aml54l5n-postgresql-17.4-dev" 161 | }, 162 | { 163 | "name": "doc", 164 | "path": "/nix/store/6x00505hxzfwjfpk15v6p4qqnbpk5dza-postgresql-17.4-doc" 165 | }, 166 | { 167 | "name": "lib", 168 | "path": "/nix/store/c9g6v34cjsf308m9xzcs7figc1vgbbw3-postgresql-17.4-lib" 169 | } 170 | ], 171 | "store_path": "/nix/store/prh52g9iwjdddxbv4n0b52gbnlxnnk6w-postgresql-17.4" 172 | }, 173 | "aarch64-linux": { 174 | "outputs": [ 175 | { 176 | "name": "out", 177 | "path": "/nix/store/1lgjdy1nm8l68y2jw6m1lhas4j5jcmk1-postgresql-17.4", 178 | "default": true 179 | }, 180 | { 181 | "name": "man", 182 | "path": "/nix/store/1v352rrzfv5p105jfaizxhd29nk41hgp-postgresql-17.4-man", 183 | "default": true 184 | }, 185 | { 186 | "name": "debug", 187 | "path": 
"/nix/store/5bywayb6ywgznzh9cck9wpya1bzg4v0a-postgresql-17.4-debug" 188 | }, 189 | { 190 | "name": "dev", 191 | "path": "/nix/store/zs35b02p7cay6jp7zr1xihwx8vzab17c-postgresql-17.4-dev" 192 | }, 193 | { 194 | "name": "doc", 195 | "path": "/nix/store/1qccl3dm5wcja6h2kjkhvs5r9l1bx4hz-postgresql-17.4-doc" 196 | }, 197 | { 198 | "name": "lib", 199 | "path": "/nix/store/kyh4l6wsjgwghvjw9810p7nn1ap106mj-postgresql-17.4-lib" 200 | } 201 | ], 202 | "store_path": "/nix/store/1lgjdy1nm8l68y2jw6m1lhas4j5jcmk1-postgresql-17.4" 203 | }, 204 | "x86_64-darwin": { 205 | "outputs": [ 206 | { 207 | "name": "out", 208 | "path": "/nix/store/0mzgv54qxafr66f4d7prz42fhs833mhk-postgresql-17.4", 209 | "default": true 210 | }, 211 | { 212 | "name": "man", 213 | "path": "/nix/store/akb98lb29c1x3mflzcwqy4a0gqfk331r-postgresql-17.4-man", 214 | "default": true 215 | }, 216 | { 217 | "name": "dev", 218 | "path": "/nix/store/gpkbg9yhx7jji2hr3jp89q06hi6v7qrk-postgresql-17.4-dev" 219 | }, 220 | { 221 | "name": "doc", 222 | "path": "/nix/store/9hxw6pf1qnlz1ygx5ximyvc48swb54n0-postgresql-17.4-doc" 223 | }, 224 | { 225 | "name": "lib", 226 | "path": "/nix/store/kyml5v1q498ympq67jvcnhgmsn8384zk-postgresql-17.4-lib" 227 | } 228 | ], 229 | "store_path": "/nix/store/0mzgv54qxafr66f4d7prz42fhs833mhk-postgresql-17.4" 230 | }, 231 | "x86_64-linux": { 232 | "outputs": [ 233 | { 234 | "name": "out", 235 | "path": "/nix/store/snfxmriwav4i0k1fxp78xk5w12hbv4q9-postgresql-17.4", 236 | "default": true 237 | }, 238 | { 239 | "name": "man", 240 | "path": "/nix/store/pcx190vq4awjcgpmj2flrbp9awhdc74q-postgresql-17.4-man", 241 | "default": true 242 | }, 243 | { 244 | "name": "lib", 245 | "path": "/nix/store/yja4rgfrwyxckwqf10rbr4armbn0p2y5-postgresql-17.4-lib" 246 | }, 247 | { 248 | "name": "debug", 249 | "path": "/nix/store/zrlrz84kzfvnxcx5mis53scr205p29hx-postgresql-17.4-debug" 250 | }, 251 | { 252 | "name": "dev", 253 | "path": "/nix/store/piqzr58swwmbsngl3jp98xgrf17a960n-postgresql-17.4-dev" 254 | }, 255 | { 256 | "name": "doc", 257 | "path": "/nix/store/ilc5sycwvqjjfa33978nb1p14x358l1c-postgresql-17.4-doc" 258 | } 259 | ], 260 | "store_path": "/nix/store/snfxmriwav4i0k1fxp78xk5w12hbv4q9-postgresql-17.4" 261 | } 262 | } 263 | } 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /img/BemiDB.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BemiHQ/BemiDB/989c88ec373493fce3e76694a07a3cae0c429323/img/BemiDB.gif -------------------------------------------------------------------------------- /img/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BemiHQ/BemiDB/989c88ec373493fce3e76694a07a3cae0c429323/img/architecture.png -------------------------------------------------------------------------------- /img/tpc-h_database_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BemiHQ/BemiDB/989c88ec373493fce3e76694a07a3cae0c429323/img/tpc-h_database_structure.png -------------------------------------------------------------------------------- /scripts/build-darwin.sh: -------------------------------------------------------------------------------- 1 | cd src 2 | go build -o ../build/bemidb-darwin-arm64 3 | 4 | create_dir_if_needed() { 5 | local dir=$1 6 | if [ ! 
-d "$dir" ]; then 7 | echo "Creating directory: $dir" 8 | sudo mkdir -p "$dir" 9 | sudo chmod 755 "$dir" 10 | fi 11 | } 12 | 13 | create_dir_if_needed "/usr/local/lib" 14 | cd ../build 15 | LIBCPP_OLD_PATH=$(otool -L ./bemidb-darwin-arm64 | grep -o '/.*/libc++\.1\.0\.dylib') 16 | if [ -z "$LIBCPP_OLD_PATH" ]; then 17 | echo "Error: Could not find libc++ dependency in binary" 18 | exit 1 19 | fi 20 | LIBCPP_NEW_PATH=/usr/local/lib/libc++.1.0.dylib 21 | sudo cp $LIBCPP_OLD_PATH $LIBCPP_NEW_PATH 22 | 23 | LIBCPPABI_OLD_PATH=$(otool -L $LIBCPP_NEW_PATH | grep -o '/.*/libc++abi\.1\.dylib') 24 | if [ -z "$LIBCPPABI_OLD_PATH" ]; then 25 | echo "Error: Could not find libc++abi dependency" 26 | exit 1 27 | fi 28 | LIBCPPABI_NEW_PATH=/usr/local/lib/libc++abi.1.dylib 29 | sudo cp $LIBCPPABI_OLD_PATH $LIBCPPABI_NEW_PATH 30 | 31 | sudo install_name_tool -change $LIBCPPABI_OLD_PATH $LIBCPPABI_NEW_PATH $LIBCPP_NEW_PATH 32 | sudo install_name_tool -change $LIBCPP_OLD_PATH $LIBCPP_NEW_PATH ./bemidb-darwin-arm64 33 | 34 | sudo cp $LIBCPP_NEW_PATH ./libc++.1.0.dylib 35 | sudo cp $LIBCPPABI_NEW_PATH ./libc++abi.1.dylib 36 | otool -L ./bemidb-darwin-arm64 37 | -------------------------------------------------------------------------------- /scripts/build-linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | platforms=("linux/amd64" "linux/arm64") 4 | 5 | version=$(grep -E 'VERSION = "[^"]+"' src/config.go | sed -E 's/.*VERSION = "([^"]+)".*/\1/') 6 | if [ -z "$version" ]; then 7 | echo "Error: Could not extract version from config.go" 8 | exit 1 9 | fi 10 | 11 | for platform in "${platforms[@]}" 12 | do 13 | os="${platform%/*}" 14 | arch="${platform#*/}" 15 | tag="ghcr.io/bemihq/bemidb:$version-$arch" 16 | 17 | echo "Building bemidb version $version for $os/$arch" 18 | 19 | docker buildx build \ 20 | --build-arg PLATFORM=$platform \ 21 | --build-arg GOOS=$os \ 22 | --build-arg GOARCH="$arch" \ 23 | -t $tag . 24 | 25 | docker create --name temp-container $tag 26 | docker cp temp-container:/app/bemidb ./build/bemidb-$os-$arch 27 | docker rm temp-container 28 | done 29 | -------------------------------------------------------------------------------- /scripts/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Detect OS and architecture 4 | OS=$(uname -s | tr '[:upper:]' '[:lower:]') 5 | ARCH=$(uname -m) 6 | 7 | # Map architecture to Go naming convention 8 | case $ARCH in 9 | x86_64|amd64) 10 | ARCH="amd64" 11 | ;; 12 | aarch64|arm64) 13 | ARCH="arm64" 14 | ;; 15 | *) 16 | echo "Unsupported architecture: $ARCH" 17 | exit 1 18 | ;; 19 | esac 20 | 21 | # Set the download URL and binary name 22 | BINARY_NAME="bemidb-${OS}-${ARCH}" 23 | DOWNLOAD_URL="https://github.com/BemiHQ/BemiDB/releases/latest/download/$BINARY_NAME" 24 | 25 | # Download the binary 26 | echo "Downloading $DOWNLOAD_URL..." 27 | curl -L "$DOWNLOAD_URL" -o ./bemidb 28 | 29 | if [ "$ARCH" = "arm64" ] && [ "$OS" = "darwin" ]; then 30 | # Ensure /usr/local/lib exists 31 | if [ ! 
-d "/usr/local/lib" ]; then 32 | sudo mkdir -p /usr/local/lib 33 | fi 34 | 35 | # Download the libc++ dynamic libraries for macOS (can't be statically linked) 36 | curl -sL "https://github.com/BemiHQ/BemiDB/releases/latest/download/libc++.1.0.dylib" -o ./libc++.1.0.dylib 37 | sudo mv ./libc++.1.0.dylib /usr/local/lib/libc++.1.0.dylib 38 | curl -sL "https://github.com/BemiHQ/BemiDB/releases/latest/download/libc++abi.1.dylib" -o ./libc++abi.1.dylib 39 | sudo mv ./libc++abi.1.dylib /usr/local/lib/libc++abi.1.dylib 40 | fi 41 | 42 | # Make the binary executable 43 | chmod +x ./bemidb 44 | -------------------------------------------------------------------------------- /scripts/publish-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version=$(grep -E 'VERSION = "[^"]+"' src/config.go | sed -E 's/.*VERSION = "([^"]+)".*/\1/') 4 | if [ -z "$version" ]; then 5 | echo "Error: Could not extract version from config.go" 6 | exit 1 7 | fi 8 | 9 | echo "Pushing bemidb version $version to ghcr.io" 10 | 11 | docker push ghcr.io/bemihq/bemidb:$version-amd64 12 | docker push ghcr.io/bemihq/bemidb:$version-arm64 13 | 14 | docker manifest inspect ghcr.io/bemihq/bemidb:$version &> /dev/null && docker manifest rm ghcr.io/bemihq/bemidb:$version 15 | docker manifest create ghcr.io/bemihq/bemidb:$version ghcr.io/bemihq/bemidb:$version-amd64 ghcr.io/bemihq/bemidb:$version-arm64 16 | docker manifest annotate ghcr.io/bemihq/bemidb:$version ghcr.io/bemihq/bemidb:$version-amd64 --arch amd64 17 | docker manifest annotate ghcr.io/bemihq/bemidb:$version ghcr.io/bemihq/bemidb:$version-arm64 --arch arm64 18 | docker manifest push ghcr.io/bemihq/bemidb:$version 19 | 20 | docker manifest inspect ghcr.io/bemihq/bemidb:latest &> /dev/null && docker manifest rm ghcr.io/bemihq/bemidb:latest 21 | docker manifest create ghcr.io/bemihq/bemidb:latest ghcr.io/bemihq/bemidb:$version-amd64 ghcr.io/bemihq/bemidb:$version-arm64 22 | docker manifest annotate ghcr.io/bemihq/bemidb:latest ghcr.io/bemihq/bemidb:$version-amd64 --arch amd64 23 | docker manifest annotate ghcr.io/bemihq/bemidb:latest ghcr.io/bemihq/bemidb:$version-arm64 --arch arm64 24 | docker manifest push ghcr.io/bemihq/bemidb:latest 25 | 26 | echo 27 | echo "See https://github.com/orgs/BemiHQ/packages/container/package/bemidb" 28 | -------------------------------------------------------------------------------- /scripts/test-data-types.sql: -------------------------------------------------------------------------------- 1 | -- Usage: psql postgres://127.0.0.1:5432/dbname -P pager=off -v ON_ERROR_STOP=on -f ./scripts/test-data-types.sql 2 | 3 | DROP TABLE IF EXISTS test_table; 4 | DROP TYPE IF EXISTS address; 5 | 6 | CREATE EXTENSION IF NOT EXISTS ltree; 7 | 8 | CREATE TYPE address AS ( 9 | city VARCHAR(50) 10 | ); 11 | 12 | CREATE TABLE test_table ( 13 | id SERIAL PRIMARY KEY, 14 | bit_column BIT, 15 | bool_column BOOLEAN, 16 | bpchar_column BPCHAR(10), 17 | varchar_column VARCHAR(255), 18 | text_column TEXT, 19 | int2_column INT2, 20 | int4_column INT4, 21 | int8_column INT8, 22 | hugeint_column NUMERIC(20, 0), 23 | xid_column XID, 24 | xid8_column XID8, 25 | float4_column FLOAT4, 26 | float8_column FLOAT8, 27 | numeric_column NUMERIC(40, 2), 28 | numeric_column_without_precision NUMERIC, 29 | date_column DATE, 30 | time_column TIME, 31 | timeMsColumn TIME(3), 32 | timetz_column TIMETZ, 33 | timetz_ms_column TIMETZ(3), 34 | timestamp_column TIMESTAMP, 35 | timestamp_ms_column TIMESTAMP(3), 36 | 
timestamptz_column TIMESTAMPTZ, 37 | timestamptz_ms_column TIMESTAMPTZ(3), 38 | uuid_column UUID, 39 | bytea_column BYTEA, 40 | interval_column INTERVAL, 41 | point_column POINT, 42 | inet_column INET, 43 | json_column JSON, 44 | jsonb_column JSONB, 45 | tsvector_column TSVECTOR, 46 | xml_column XML, 47 | pg_snapshot_column PG_SNAPSHOT, 48 | array_text_column TEXT[], 49 | array_int_column INT[], 50 | array_jsonb_column JSONB[], 51 | array_ltree_column LTREE[], 52 | user_defined_column address 53 | ); 54 | 55 | INSERT INTO test_table ( 56 | bit_column, 57 | bool_column, 58 | bpchar_column, 59 | varchar_column, 60 | text_column, 61 | int2_column, 62 | int4_column, 63 | int8_column, 64 | hugeint_column, 65 | xid_column, 66 | xid8_column, 67 | float4_column, 68 | float8_column, 69 | numeric_column, 70 | numeric_column_without_precision, 71 | date_column, 72 | time_column, 73 | timeMsColumn, 74 | timetz_column, 75 | timetz_ms_column, 76 | timestamp_column, 77 | timestamp_ms_column, 78 | timestamptz_column, 79 | timestamptz_ms_column, 80 | uuid_column, 81 | bytea_column, 82 | interval_column, 83 | point_column, 84 | inet_column, 85 | json_column, 86 | jsonb_column, 87 | tsvector_column, 88 | xml_column, 89 | pg_snapshot_column, 90 | array_text_column, 91 | array_int_column, 92 | array_jsonb_column, 93 | array_ltree_column, 94 | user_defined_column 95 | ) VALUES ( 96 | B'1', -- bit_column 97 | TRUE, -- bool_column 98 | 'bpchar', -- bpchar_column 99 | 'varchar', -- varchar_column 100 | 'text', -- text_column 101 | 32767::INT2, -- int2_column 102 | 2147483647::INT4, -- int4_column 103 | 9223372036854775807::INT8, -- int8_column 104 | 10000000000000000000, -- hugeint_column 105 | '4294967295'::XID, -- xid_column 106 | '18446744073709551615'::XID8, -- xid8_column 107 | 3.14::FLOAT4, -- float4_column 108 | 3.141592653589793::FLOAT8, -- float8_column 109 | 12345.67::NUMERIC(10, 2), -- numeric_column 110 | 12345.67, -- numeric_column_without_precision 111 | '2024-01-01', -- date_column 112 | '12:00:00.123456', -- time_column 113 | '12:00:00.123', -- timeMsColumn 114 | '12:00:00.123456-05', -- timetz_column 115 | '12:00:00.123-05', -- timetz_ms_column 116 | '2024-01-01 12:00:00.123456', -- timestamp_column 117 | '2024-01-01 12:00:00.123', -- timestamp_ms_column 118 | '2024-01-01 12:00:00.123456-05', -- timestamptz_column 119 | '2024-01-01 12:00:00.123-05', -- timestamptz_ms_column 120 | '58a7c845-af77-44b2-8664-7ca613d92f04', -- uuid_column 121 | decode('48656c6c6f', 'hex'), -- bytea_column 122 | '1 mon 2 days 01:00:01.000001'::INTERVAL, -- interval_column 123 | '(1, 2)'::POINT, -- point_column 124 | '192.168.0.1', -- inet_column 125 | '{"key": "value"}'::JSON, -- json_column 126 | '{"key": "value"}'::JSONB, -- jsonb_column 127 | to_tsvector('Sample text for tsvector'), -- tsvector_column 128 | 'text', -- xml_column 129 | pg_current_snapshot(), -- pg_snapshot_column 130 | '{"one", "two", "three"}', -- array_text_column 131 | '{1, 2, 3}', -- array_int_column 132 | '{"{\"key\": \"value1\"}", "{\"key\": \"value2\"}"}'::JSONB[], -- array_jsonb_column 133 | '{"a.b", "c.d"}'::LTREE[], -- array_ltree_column 134 | ROW('Toronto') -- user_defined_column 135 | ), ( 136 | NULL, -- bit_column 137 | FALSE, -- bool_column 138 | '', -- bpchar_column 139 | NULL, -- varchar_column 140 | '', -- text_column 141 | -32767::INT2, -- int2_column 142 | NULL, -- int4_column 143 | -9223372036854775807::INT8, -- int8_column 144 | NULL, -- hugeint_column 145 | NULL, -- xid_column 146 | NULL, -- xid8_column 147 | 'NaN', -- 
float4_column 148 | -3.141592653589793::FLOAT8, -- float8_column 149 | -12345.00::NUMERIC(10, 2), -- numeric_column 150 | NULL, -- numeric_column_without_precision 151 | '20025-11-12', -- date_column 152 | '12:00:00.123', -- time_column 153 | NULL, -- timeMsColumn 154 | '12:00:00.12300+05', -- timetz_column 155 | '12:00:00.1+05', -- timetz_ms_column 156 | '2024-01-01 12:00:00', -- timestamp_column 157 | NULL, -- timestamp_ms_column 158 | '2024-01-01 12:00:00.000123+05', -- timestamptz_column 159 | '2024-01-01 12:00:00.12+05', -- timestamptz_ms_column 160 | NULL, -- uuid_column 161 | NULL, -- bytea_column 162 | NULL, -- interval_column 163 | NULL, -- point_column 164 | NULL, -- inet_column 165 | NULL, -- json_column 166 | '{}'::JSONB, -- jsonb_column 167 | NULL, -- tsvector_column 168 | NULL, -- xml_column 169 | NULL, -- pg_snapshot_column 170 | NULL, -- array_text_column 171 | '{}', -- array_int_column 172 | NULL, -- array_jsonb_column 173 | NULL, -- array_ltree_column 174 | NULL -- user_defined_column 175 | ); 176 | 177 | SELECT 178 | table_schema, 179 | table_name, 180 | column_name, 181 | data_type, 182 | udt_name, 183 | is_nullable, 184 | character_maximum_length, 185 | numeric_precision, 186 | numeric_scale, 187 | datetime_precision 188 | FROM information_schema.columns 189 | WHERE table_schema = 'public' 190 | ORDER BY table_schema, table_name, ordinal_position; 191 | -------------------------------------------------------------------------------- /scripts/test-partitioned-tables.sql: -------------------------------------------------------------------------------- 1 | -- Usage: psql postgres://127.0.0.1:5432/dbname -P pager=off -v ON_ERROR_STOP=on -f ./scripts/test-partitioned-tables.sql 2 | 3 | DROP TABLE IF EXISTS test_table; 4 | 5 | CREATE TABLE test_table ( 6 | id SERIAL, 7 | created_at TIMESTAMP NOT NULL 8 | ) PARTITION BY RANGE (created_at); 9 | 10 | CREATE TABLE test_table_q1 PARTITION OF test_table FOR VALUES FROM ('2024-01-01') TO ('2024-04-01'); 11 | CREATE TABLE test_table_q2 PARTITION OF test_table FOR VALUES FROM ('2024-04-01') TO ('2024-07-01'); 12 | CREATE TABLE test_table_q3 PARTITION OF test_table FOR VALUES FROM ('2024-07-01') TO ('2024-10-01'); 13 | CREATE TABLE test_table_q4 PARTITION OF test_table FOR VALUES FROM ('2024-10-01') TO ('2025-01-01'); 14 | 15 | INSERT INTO test_table (created_at) VALUES 16 | ('2024-02-15 10:00:00'), 17 | ('2024-09-01 12:00:30'), 18 | ('2024-10-12 08:00:00'), 19 | ('2024-05-20 14:30:00'); 20 | -------------------------------------------------------------------------------- /scripts/test-schemas.sql: -------------------------------------------------------------------------------- 1 | -- Usage: psql postgres://127.0.0.1:5432/dbname -P pager=off -v ON_ERROR_STOP=on -f ./scripts/test-schemas.sql 2 | 3 | DROP SCHEMA IF EXISTS test_schema CASCADE; 4 | 5 | CREATE SCHEMA test_schema; 6 | 7 | CREATE TABLE test_schema.test_table ( 8 | id SERIAL PRIMARY KEY 9 | ); 10 | 11 | CREATE TABLE test_schema.simple_table ( 12 | id SERIAL PRIMARY KEY 13 | ); 14 | 15 | INSERT INTO test_schema.simple_table DEFAULT VALUES; 16 | 17 | SELECT 18 | table_schema, 19 | table_name, 20 | column_name, 21 | data_type, 22 | udt_name, 23 | is_nullable, 24 | character_maximum_length, 25 | numeric_precision, 26 | numeric_scale, 27 | datetime_precision 28 | FROM information_schema.columns 29 | WHERE table_schema = 'test_schema' 30 | ORDER BY table_schema, table_name, ordinal_position; 31 | -------------------------------------------------------------------------------- 
/src/capped_buffer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "sync" 7 | ) 8 | 9 | type CappedBuffer struct { 10 | config *Config 11 | maxSizeBytes int 12 | 13 | buffer []byte 14 | mutex sync.Mutex 15 | conditionalSync *sync.Cond 16 | 17 | closeOnceSync sync.Once 18 | closed bool 19 | } 20 | 21 | func NewCappedBuffer(maxSizeBytes int, config *Config) *CappedBuffer { 22 | sizedBuffer := &CappedBuffer{ 23 | config: config, 24 | buffer: make([]byte, 0, maxSizeBytes), 25 | maxSizeBytes: maxSizeBytes, 26 | } 27 | sizedBuffer.conditionalSync = sync.NewCond(&sizedBuffer.mutex) 28 | return sizedBuffer 29 | } 30 | 31 | // Implements io.Writer 32 | func (buf *CappedBuffer) Write(payload []byte) (writtenBytes int, err error) { 33 | if len(payload) == 0 { 34 | return 0, nil 35 | } 36 | 37 | buf.mutex.Lock() 38 | defer buf.mutex.Unlock() 39 | 40 | if buf.closed { 41 | return 0, errors.New("buffer is closed") 42 | } 43 | 44 | for len(buf.buffer)+len(payload) > buf.maxSizeBytes && !buf.closed { 45 | LogTrace(buf.config, ">> Waiting for more space in capped buffer...") 46 | buf.conditionalSync.Wait() // Wait for the reader 47 | } 48 | 49 | // Check again if buffer was closed while waiting 50 | if buf.closed { 51 | return 0, errors.New("buffer is closed") 52 | } 53 | 54 | writtenBytes = len(payload) 55 | buf.buffer = append(buf.buffer, payload...) 56 | LogTrace(buf.config, ">> Writing", writtenBytes, "bytes to capped buffer...") 57 | 58 | buf.conditionalSync.Broadcast() // Notify the reader that new data is available 59 | 60 | return writtenBytes, nil 61 | } 62 | 63 | // Implements io.Reader 64 | func (buf *CappedBuffer) Read(payload []byte) (readBytes int, err error) { 65 | if len(payload) == 0 { 66 | return 0, nil 67 | } 68 | 69 | buf.mutex.Lock() 70 | defer buf.mutex.Unlock() 71 | 72 | for len(buf.buffer) == 0 && !buf.closed { 73 | LogTrace(buf.config, "<< Waiting for more data in capped buffer...") 74 | buf.conditionalSync.Wait() // Wait for the writer 75 | } 76 | 77 | if len(buf.buffer) == 0 && buf.closed { 78 | return 0, io.EOF 79 | } 80 | 81 | maxReadBytes := len(payload) 82 | readBytes = copy(payload, buf.buffer) 83 | buf.buffer = buf.buffer[readBytes:] 84 | LogTrace(buf.config, "<< Reading "+IntToString(readBytes)+"/"+IntToString(maxReadBytes)+" bytes from capped buffer...") 85 | 86 | buf.conditionalSync.Broadcast() // Notify the writer that space is now available 87 | 88 | return readBytes, nil 89 | } 90 | 91 | func (buf *CappedBuffer) Close() error { 92 | buf.closeOnceSync.Do(func() { 93 | buf.mutex.Lock() 94 | 95 | LogTrace(buf.config, "== Closing capped buffer...") 96 | buf.closed = true 97 | 98 | buf.conditionalSync.Broadcast() // Wake up any waiting writers/readers 99 | 100 | buf.mutex.Unlock() 101 | }) 102 | return nil 103 | } 104 | -------------------------------------------------------------------------------- /src/capped_buffer_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "sync" 7 | "testing" 8 | "time" 9 | ) 10 | 11 | func initTestConfig() *Config { 12 | return &Config{ 13 | LogLevel: LOG_LEVEL_INFO, // Use INFO to avoid excessive logging during tests 14 | } 15 | } 16 | 17 | func TestCappedBufferWrite(t *testing.T) { 18 | t.Run("Writes data to buffer", func(t *testing.T) { 19 | config := initTestConfig() 20 | bufferSize := 100 // 100 bytes 21 | buffer := NewCappedBuffer(bufferSize, 
config) 22 | writeData := []byte("hello world") 23 | 24 | writtenBytes, err := buffer.Write(writeData) 25 | 26 | if err != nil { 27 | t.Fatalf("Failed to write to buffer: %v", err) 28 | } 29 | if writtenBytes != len(writeData) { 30 | t.Errorf("Expected to write %d bytes, but wrote %d", len(writeData), writtenBytes) 31 | } 32 | }) 33 | 34 | t.Run("Waits to write data to a full buffer", func(t *testing.T) { 35 | config := initTestConfig() 36 | bufferSize := 11 37 | buffer := NewCappedBuffer(bufferSize, config) 38 | writeDataFull := []byte("hello world") 39 | buffer.Write(writeDataFull) 40 | writeDataOverflow := []byte("overflow") 41 | done := make(chan struct{}) 42 | 43 | go func() { 44 | buffer.Write(writeDataOverflow) 45 | close(done) 46 | }() 47 | 48 | select { 49 | case <-done: 50 | t.Error("Write to full buffer should block, but it returned immediately") 51 | case <-time.After(100 * time.Millisecond): 52 | // This is expected - Write should block 53 | } 54 | }) 55 | 56 | t.Run("Writes data to a buffer after it was read (more space available)", func(t *testing.T) { 57 | config := initTestConfig() 58 | bufferSize := 11 59 | buffer := NewCappedBuffer(bufferSize, config) 60 | writeDataFull := []byte("hello world") 61 | buffer.Write(writeDataFull) 62 | writeDataOverflow := []byte("over") 63 | done := make(chan struct{}) 64 | 65 | go func() { 66 | writtenBytes, err := buffer.Write(writeDataOverflow) 67 | if err != nil { 68 | t.Errorf("Failed to write to buffer: %v", err) 69 | } 70 | if writtenBytes != len(writeDataOverflow) { 71 | t.Errorf("Expected to write %d bytes, but wrote %d", len(writeDataOverflow), writtenBytes) 72 | } 73 | 74 | close(done) 75 | }() 76 | 77 | readData := make([]byte, 5) 78 | readBytes, err := buffer.Read(readData) 79 | if err != nil { 80 | t.Fatalf("Failed to read from buffer: %v", err) 81 | } 82 | if readBytes != 5 { 83 | t.Errorf("Expected to read 5 bytes, but read %d", readBytes) 84 | } 85 | 86 | select { 87 | case <-done: 88 | // This is expected - Write should proceed 89 | case <-time.After(100 * time.Millisecond): 90 | t.Error("Write should have proceeded after Read") 91 | } 92 | }) 93 | 94 | t.Run("Receive error when writing to a closed buffer", func(t *testing.T) { 95 | config := initTestConfig() 96 | bufferSize := 100 97 | buffer := NewCappedBuffer(bufferSize, config) 98 | writeData := []byte("hello world") 99 | 100 | buffer.Close() 101 | 102 | writtenBytes, err := buffer.Write(writeData) 103 | 104 | if err == nil { 105 | t.Error("Write to closed buffer should return an error") 106 | } 107 | if err.Error() != "buffer is closed" { 108 | t.Errorf("Expected error message 'buffer is closed', but got: %v", err) 109 | } 110 | if writtenBytes != 0 { 111 | t.Errorf("Expected to write 0 bytes, but wrote %d", writtenBytes) 112 | } 113 | }) 114 | } 115 | 116 | func TestCappedBufferRead(t *testing.T) { 117 | t.Run("Reads data from buffer", func(t *testing.T) { 118 | config := initTestConfig() 119 | bufferSize := 100 // 100 bytes 120 | buffer := NewCappedBuffer(bufferSize, config) 121 | writeData := []byte("hello world") 122 | buffer.Write(writeData) 123 | readData := make([]byte, len(writeData)) 124 | 125 | readBytes, err := buffer.Read(readData) 126 | 127 | if err != nil { 128 | t.Fatalf("Failed to read from buffer: %v", err) 129 | } 130 | if readBytes != len(writeData) { 131 | t.Errorf("Expected to read %d bytes, but read %d", len(writeData), readBytes) 132 | } 133 | if !bytes.Equal(readData, writeData) { 134 | t.Errorf("Read data does not match written data. 
Got %q, want %q", readData, writeData) 135 | } 136 | }) 137 | 138 | t.Run("Waits to read data from an empty buffer", func(t *testing.T) { 139 | config := initTestConfig() 140 | bufferSize := 100 // 100 bytes 141 | buffer := NewCappedBuffer(bufferSize, config) 142 | done := make(chan struct{}) // Create a channel to signal when read is done 143 | readData := make([]byte, 10) 144 | 145 | // Start a goroutine to read from the buffer 146 | go func() { 147 | buffer.Read(readData) 148 | close(done) 149 | }() 150 | 151 | // Wait for a short time to see if Read blocks 152 | select { 153 | case <-done: 154 | t.Error("Read from empty buffer should block, but it returned immediately") 155 | case <-time.After(100 * time.Millisecond): 156 | // This is expected - Read should block 157 | } 158 | }) 159 | 160 | t.Run("Waits and reads data from a buffer after it was closed", func(t *testing.T) { 161 | config := initTestConfig() 162 | bufferSize := 100 // 100 bytes 163 | buffer := NewCappedBuffer(bufferSize, config) 164 | writeData := []byte("hello world") 165 | buffer.Write(writeData) 166 | done := make(chan struct{}) // Create a channel to signal when read is done 167 | readData := make([]byte, 11) 168 | 169 | // Start a goroutine to read from the buffer 170 | go func() { 171 | readBytes, err := buffer.Read(readData) 172 | if err != nil { 173 | t.Errorf("Failed to read from buffer: %v", err) 174 | } 175 | if readBytes != len(writeData) { 176 | t.Errorf("Expected to read %d bytes, but read %d", len(writeData), readBytes) 177 | } 178 | close(done) 179 | }() 180 | 181 | buffer.Close() // Close the buffer to unblock the read 182 | 183 | // Wait for the read to complete 184 | select { 185 | case <-done: 186 | if !bytes.Equal(readData, writeData) { 187 | t.Errorf("Read data does not match written data. Got %q, want %q", readData, writeData) 188 | } 189 | case <-time.After(100 * time.Millisecond): 190 | t.Error("Read should have returned after Close") 191 | } 192 | }) 193 | 194 | t.Run("Reads data from a buffer after it was written to (more data available)", func(t *testing.T) { 195 | config := initTestConfig() 196 | bufferSize := 11 197 | buffer := NewCappedBuffer(bufferSize, config) 198 | readData := make([]byte, 5) 199 | done := make(chan struct{}) 200 | writeData := []byte("hello world") 201 | 202 | go func() { 203 | readBytes, err := buffer.Read(readData) 204 | if err != nil { 205 | t.Errorf("Failed to read from buffer: %v", err) 206 | } 207 | if readBytes != len(readData) { 208 | t.Errorf("Expected to read %d bytes, but read %d", len(readData), readBytes) 209 | } 210 | if !bytes.Equal(readData, writeData[:len(readData)]) { 211 | t.Errorf("Read data does not match written data. 
Got %q, want %q", readData, writeData[:len(readData)]) 212 | } 213 | close(done) 214 | }() 215 | 216 | buffer.Write(writeData) 217 | 218 | select { 219 | case <-done: 220 | // This is expected - Read should proceed 221 | case <-time.After(100 * time.Millisecond): 222 | t.Error("Read should have proceeded after Write") 223 | } 224 | }) 225 | 226 | t.Run("Receive EOF when reading from a closed and empty buffer", func(t *testing.T) { 227 | config := initTestConfig() 228 | bufferSize := 100 229 | buffer := NewCappedBuffer(bufferSize, config) 230 | readData := make([]byte, 10) 231 | 232 | buffer.Close() 233 | 234 | readBytes, err := buffer.Read(readData) 235 | 236 | if err != io.EOF { 237 | t.Errorf("Read from closed and empty buffer should return EOF, but got: %v", err) 238 | } 239 | if readBytes != 0 { 240 | t.Errorf("Expected to read 0 bytes, but read %d", readBytes) 241 | } 242 | }) 243 | } 244 | 245 | func TestCappedBufferConcurrentReadWrite(t *testing.T) { 246 | t.Run("Concurrent read and write operations", func(t *testing.T) { 247 | config := initTestConfig() 248 | bufferSize := 100 // 100 bytes 249 | buffer := NewCappedBuffer(bufferSize, config) 250 | iterations := 100 251 | writeData := []byte("test data") 252 | 253 | // WaitGroup to wait for all goroutines to complete 254 | var wg sync.WaitGroup 255 | wg.Add(2) // One for reader, one for writer 256 | 257 | // Start writer goroutine 258 | go func() { 259 | defer wg.Done() 260 | for i := 0; i < iterations; i++ { 261 | _, err := buffer.Write(writeData) 262 | if err != nil { 263 | t.Errorf("Write error at iteration %d: %v", i, err) 264 | return 265 | } 266 | } 267 | }() 268 | 269 | // Start reader goroutine 270 | go func() { 271 | defer wg.Done() 272 | readData := make([]byte, len(writeData)) 273 | for i := 0; i < iterations; i++ { 274 | _, err := buffer.Read(readData) 275 | if err != nil { 276 | t.Errorf("Read error at iteration %d: %v", i, err) 277 | return 278 | } 279 | if !bytes.Equal(readData, writeData) { 280 | t.Errorf("Read data does not match at iteration %d. 
Got %q, want %q", i, readData, writeData) 281 | return 282 | } 283 | } 284 | }() 285 | 286 | // Wait for both goroutines to complete 287 | wg.Wait() 288 | }) 289 | 290 | t.Run("Multiple sequential read and write operations", func(t *testing.T) { 291 | config := initTestConfig() 292 | bufferSize := 20 293 | buffer := NewCappedBuffer(bufferSize, config) 294 | data1 := []byte("first") 295 | data2 := []byte("second") 296 | data3 := []byte("third") 297 | readData1 := make([]byte, 5) // "first" 298 | readData2 := make([]byte, 6) // "second" 299 | readData3 := make([]byte, 5) // "third" 300 | 301 | buffer.Write(data1) 302 | buffer.Write(data2) 303 | buffer.Write(data3) 304 | 305 | readBytes1, _ := buffer.Read(readData1) 306 | if readBytes1 != 5 || string(readData1) != "first" { 307 | t.Errorf("First read failed: got %q, want %q", readData1, "first") 308 | } 309 | 310 | readBytes2, _ := buffer.Read(readData2) 311 | if readBytes2 != 6 || string(readData2) != "second" { 312 | t.Errorf("Second read failed: got %q, want %q", readData2, "second") 313 | } 314 | 315 | readBytes3, _ := buffer.Read(readData3) 316 | if readBytes3 != 5 || string(readData3) != "third" { 317 | t.Errorf("Third read failed: got %q, want %q", readData3, "third") 318 | } 319 | }) 320 | } 321 | -------------------------------------------------------------------------------- /src/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "os" 6 | "slices" 7 | "strings" 8 | ) 9 | 10 | const ( 11 | VERSION = "0.51.1" 12 | 13 | ENV_PORT = "BEMIDB_PORT" 14 | ENV_DATABASE = "BEMIDB_DATABASE" 15 | ENV_USER = "BEMIDB_USER" 16 | ENV_PASSWORD = "BEMIDB_PASSWORD" 17 | ENV_HOST = "BEMIDB_HOST" 18 | ENV_INIT_SQL_FILEPATH = "BEMIDB_INIT_SQL" 19 | 20 | ENV_STORAGE_PATH = "BEMIDB_STORAGE_PATH" 21 | ENV_STORAGE_TYPE = "BEMIDB_STORAGE_TYPE" 22 | ENV_AWS_REGION = "AWS_REGION" 23 | ENV_AWS_S3_ENDPOINT = "AWS_S3_ENDPOINT" 24 | ENV_AWS_S3_BUCKET = "AWS_S3_BUCKET" 25 | ENV_AWS_ACCESS_KEY_ID = "AWS_ACCESS_KEY_ID" 26 | ENV_AWS_SECRET_ACCESS_KEY = "AWS_SECRET_ACCESS_KEY" 27 | 28 | ENV_PG_DATABASE_URL = "PG_DATABASE_URL" 29 | ENV_PG_SYNC_INTERVAL = "PG_SYNC_INTERVAL" 30 | ENV_PG_SCHEMA_PREFIX = "PG_SCHEMA_PREFIX" 31 | ENV_PG_INCLUDE_TABLES = "PG_INCLUDE_TABLES" 32 | ENV_PG_EXCLUDE_TABLES = "PG_EXCLUDE_TABLES" 33 | ENV_PG_INCREMENTALLY_REFRESHED_TABLES = "PG_INCREMENTALLY_REFRESHED_TABLES" 34 | ENV_PG_PRESERVE_UNSYNCED = "PG_PRESERVE_UNSYNCED" 35 | 36 | ENV_LOG_LEVEL = "BEMIDB_LOG_LEVEL" 37 | ENV_DISABLE_ANONYMOUS_ANALYTICS = "BEMIDB_DISABLE_ANONYMOUS_ANALYTICS" 38 | 39 | DEFAULT_PORT = "54321" 40 | DEFAULT_DATABASE = "bemidb" 41 | DEFAULT_USER = "" 42 | DEFAULT_PASSWORD = "" 43 | DEFAULT_HOST = "127.0.0.1" 44 | DEFAULT_STORAGE_PATH = "iceberg" 45 | DEFAULT_LOG_LEVEL = "INFO" 46 | DEFAULT_DB_STORAGE_TYPE = "LOCAL" 47 | 48 | DEFAULT_AWS_S3_ENDPOINT = "s3.amazonaws.com" 49 | 50 | STORAGE_TYPE_LOCAL = "LOCAL" 51 | STORAGE_TYPE_S3 = "S3" 52 | ) 53 | 54 | var STORAGE_TYPES = []string{STORAGE_TYPE_LOCAL, STORAGE_TYPE_S3} 55 | 56 | type AwsConfig struct { 57 | Region string 58 | S3Endpoint string // optional 59 | S3Bucket string 60 | AccessKeyId string 61 | SecretAccessKey string 62 | } 63 | 64 | type PgConfig struct { 65 | DatabaseUrl string 66 | SyncInterval string // optional 67 | SchemaPrefix string // optional 68 | IncludeTables []string // optional 69 | ExcludeTables []string // optional 70 | IncrementallyRefreshedTables []string // optional 71 | PreserveUnsynced bool // optional 
72 | } 73 | 74 | type Config struct { 75 | Host string 76 | Port string 77 | Database string 78 | User string 79 | EncryptedPassword string 80 | LogLevel string 81 | StorageType string 82 | StoragePath string 83 | Aws AwsConfig 84 | Pg PgConfig 85 | DisableAnonymousAnalytics bool 86 | } 87 | 88 | type configParseValues struct { 89 | password string 90 | pgIncludeTables string 91 | pgExcludeTables string 92 | pgIncrementallyRefreshedTables string 93 | } 94 | 95 | var _config Config 96 | var _configParseValues configParseValues 97 | 98 | func init() { 99 | registerFlags() 100 | } 101 | 102 | func registerFlags() { 103 | flag.StringVar(&_config.Host, "host", os.Getenv(ENV_HOST), "Database host. Default: \""+DEFAULT_HOST+"\"") 104 | flag.StringVar(&_config.Port, "port", os.Getenv(ENV_PORT), "Port for BemiDB to listen on. Default: \""+DEFAULT_PORT+"\"") 105 | flag.StringVar(&_config.Database, "database", os.Getenv(ENV_DATABASE), "Database name. Default: \""+DEFAULT_DATABASE+"\"") 106 | flag.StringVar(&_config.User, "user", os.Getenv(ENV_USER), "Database user. Default: \""+DEFAULT_USER+"\"") 107 | flag.StringVar(&_configParseValues.password, "password", os.Getenv(ENV_PASSWORD), "Database password. Default: \""+DEFAULT_PASSWORD+"\"") 108 | flag.StringVar(&_config.StoragePath, "storage-path", os.Getenv(ENV_STORAGE_PATH), "Path to the storage folder. Default: \""+DEFAULT_STORAGE_PATH+"\"") 109 | flag.StringVar(&_config.LogLevel, "log-level", os.Getenv(ENV_LOG_LEVEL), "Log level: \"ERROR\", \"WARN\", \"INFO\", \"DEBUG\", \"TRACE\". Default: \""+DEFAULT_LOG_LEVEL+"\"") 110 | flag.StringVar(&_config.StorageType, "storage-type", os.Getenv(ENV_STORAGE_TYPE), "Storage type: \"LOCAL\", \"S3\". Default: \""+DEFAULT_DB_STORAGE_TYPE+"\"") 111 | flag.StringVar(&_config.Pg.SchemaPrefix, "pg-schema-prefix", os.Getenv(ENV_PG_SCHEMA_PREFIX), "(Optional) Prefix for PostgreSQL schema names") 112 | flag.StringVar(&_config.Pg.SyncInterval, "pg-sync-interval", os.Getenv(ENV_PG_SYNC_INTERVAL), "(Optional) Interval between syncs. Valid units: \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"") 113 | flag.StringVar(&_configParseValues.pgIncludeTables, "pg-include-tables", os.Getenv(ENV_PG_INCLUDE_TABLES), "(Optional) Comma-separated list of tables to include in sync (format: schema.table)") 114 | flag.StringVar(&_configParseValues.pgExcludeTables, "pg-exclude-tables", os.Getenv(ENV_PG_EXCLUDE_TABLES), "(Optional) Comma-separated list of tables to exclude from sync (format: schema.table)") 115 | flag.StringVar(&_configParseValues.pgIncrementallyRefreshedTables, "pg-incrementally-refreshed-tables", os.Getenv(ENV_PG_INCREMENTALLY_REFRESHED_TABLES), "(Optional) Comma-separated list of tables to refresh incrementally (format: schema.table)") 116 | flag.BoolVar(&_config.Pg.PreserveUnsynced, "pg-preserve-unsynced", os.Getenv(ENV_PG_PRESERVE_UNSYNCED) == "true", "(Optional) Do not delete the existing tables in BemiDB that are not part of the sync") 117 | flag.StringVar(&_config.Pg.DatabaseUrl, "pg-database-url", os.Getenv(ENV_PG_DATABASE_URL), "PostgreSQL database URL to sync") 118 | flag.StringVar(&_config.Aws.Region, "aws-region", os.Getenv(ENV_AWS_REGION), "AWS region") 119 | flag.StringVar(&_config.Aws.S3Endpoint, "aws-s3-endpoint", os.Getenv(ENV_AWS_S3_ENDPOINT), "AWS S3 endpoint. 
Default: \""+DEFAULT_AWS_S3_ENDPOINT+"\"") 120 | flag.StringVar(&_config.Aws.S3Bucket, "aws-s3-bucket", os.Getenv(ENV_AWS_S3_BUCKET), "AWS S3 bucket name") 121 | flag.StringVar(&_config.Aws.AccessKeyId, "aws-access-key-id", os.Getenv(ENV_AWS_ACCESS_KEY_ID), "AWS access key ID") 122 | flag.StringVar(&_config.Aws.SecretAccessKey, "aws-secret-access-key", os.Getenv(ENV_AWS_SECRET_ACCESS_KEY), "AWS secret access key") 123 | flag.BoolVar(&_config.DisableAnonymousAnalytics, "disable-anonymous-analytics", os.Getenv(ENV_DISABLE_ANONYMOUS_ANALYTICS) == "true", "Disable anonymous analytics collection") 124 | } 125 | 126 | func parseFlags() { 127 | flag.Parse() 128 | 129 | if _config.Host == "" { 130 | _config.Host = DEFAULT_HOST 131 | } 132 | if _config.Port == "" { 133 | _config.Port = DEFAULT_PORT 134 | } 135 | if _config.Database == "" { 136 | _config.Database = DEFAULT_DATABASE 137 | } 138 | if _config.User == "" { 139 | _config.User = DEFAULT_USER 140 | } 141 | if _configParseValues.password == "" { 142 | _configParseValues.password = DEFAULT_PASSWORD 143 | } 144 | if _configParseValues.password != "" { 145 | if _config.User == "" { 146 | panic("Password is set without a user") 147 | } 148 | _config.EncryptedPassword = StringToScramSha256(_configParseValues.password) 149 | } 150 | if _config.StoragePath == "" { 151 | _config.StoragePath = DEFAULT_STORAGE_PATH 152 | } 153 | if _config.LogLevel == "" { 154 | _config.LogLevel = DEFAULT_LOG_LEVEL 155 | } else if !slices.Contains(LOG_LEVELS, _config.LogLevel) { 156 | panic("Invalid log level " + _config.LogLevel + ". Must be one of " + strings.Join(LOG_LEVELS, ", ")) 157 | } 158 | if _config.StorageType == "" { 159 | _config.StorageType = DEFAULT_DB_STORAGE_TYPE 160 | } else if !slices.Contains(STORAGE_TYPES, _config.StorageType) { 161 | panic("Invalid storage type " + _config.StorageType + ". 
Must be one of " + strings.Join(STORAGE_TYPES, ", ")) 162 | } 163 | 164 | if _config.StorageType == STORAGE_TYPE_S3 { 165 | if _config.Aws.Region == "" { 166 | panic("AWS region is required") 167 | } 168 | if _config.Aws.S3Endpoint == "" { 169 | _config.Aws.S3Endpoint = DEFAULT_AWS_S3_ENDPOINT 170 | } 171 | if _config.Aws.S3Bucket == "" { 172 | panic("AWS S3 bucket name is required") 173 | } 174 | if _config.Aws.AccessKeyId != "" && _config.Aws.SecretAccessKey == "" { 175 | panic("AWS secret access key is required") 176 | } 177 | if _config.Aws.AccessKeyId == "" && _config.Aws.SecretAccessKey != "" { 178 | panic("AWS access key ID is required") 179 | } 180 | } 181 | if _configParseValues.pgIncludeTables != "" { 182 | _config.Pg.IncludeTables = strings.Split(_configParseValues.pgIncludeTables, ",") 183 | } 184 | if _configParseValues.pgIncrementallyRefreshedTables != "" { 185 | _config.Pg.IncrementallyRefreshedTables = strings.Split(_configParseValues.pgIncrementallyRefreshedTables, ",") 186 | } 187 | if _configParseValues.pgExcludeTables != "" { 188 | _config.Pg.ExcludeTables = strings.Split(_configParseValues.pgExcludeTables, ",") 189 | } 190 | 191 | _configParseValues = configParseValues{} 192 | } 193 | 194 | func LoadConfig(reRegisterFlags ...bool) *Config { 195 | if reRegisterFlags != nil && reRegisterFlags[0] { 196 | flag.CommandLine = flag.NewFlagSet(os.Args[0], flag.ExitOnError) 197 | registerFlags() 198 | } 199 | parseFlags() 200 | return &_config 201 | } 202 | -------------------------------------------------------------------------------- /src/config_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestLoadConfig(t *testing.T) { 8 | t.Run("Uses default config values with local storage", func(t *testing.T) { 9 | config := LoadConfig(true) 10 | 11 | if config.Port != "54321" { 12 | t.Errorf("Expected port to be 54321, got %s", config.Port) 13 | } 14 | if config.Database != "bemidb" { 15 | t.Errorf("Expected database to be bemidb, got %s", config.Database) 16 | } 17 | if config.StoragePath != "iceberg" { 18 | t.Errorf("Expected StoragePath to be iceberg, got %s", config.StoragePath) 19 | } 20 | if config.LogLevel != "INFO" { 21 | t.Errorf("Expected logLevel to be INFO, got %s", config.LogLevel) 22 | } 23 | if config.StorageType != "LOCAL" { 24 | t.Errorf("Expected storageType to be LOCAL, got %s", config.StorageType) 25 | } 26 | if config.Pg.DatabaseUrl != "" { 27 | t.Errorf("Expected pgDatabaseUrl to be empty, got %s", config.Pg.DatabaseUrl) 28 | } 29 | if config.Pg.SyncInterval != "" { 30 | t.Errorf("Expected interval to be empty, got %s", config.Pg.SyncInterval) 31 | } 32 | if config.Pg.SchemaPrefix != "" { 33 | t.Errorf("Expected schemaPrefix to be empty, got %s", config.Pg.SchemaPrefix) 34 | } 35 | if config.Pg.IncludeTables != nil { 36 | t.Errorf("Expected includeTables to be empty, got %v", config.Pg.IncludeTables) 37 | } 38 | if config.Pg.ExcludeTables != nil { 39 | t.Errorf("Expected excludeTables to be empty, got %v", config.Pg.ExcludeTables) 40 | } 41 | }) 42 | 43 | t.Run("Uses config values from environment variables with LOCAL storage", func(t *testing.T) { 44 | t.Setenv("BEMIDB_PORT", "12345") 45 | t.Setenv("BEMIDB_DATABASE", "mydb") 46 | t.Setenv("BEMIDB_INIT_SQL", "./init/duckdb.sql") 47 | t.Setenv("BEMIDB_STORAGE_PATH", "storage-path") 48 | t.Setenv("BEMIDB_LOG_LEVEL", "ERROR") 49 | t.Setenv("BEMIDB_STORAGE_TYPE", "LOCAL") 50 | 51 | config := LoadConfig(true)
52 | 53 | if config.Port != "12345" { 54 | t.Errorf("Expected port to be 12345, got %s", config.Port) 55 | } 56 | if config.Database != "mydb" { 57 | t.Errorf("Expected database to be mydb, got %s", config.Database) 58 | } 59 | if config.StoragePath != "storage-path" { 60 | t.Errorf("Expected StoragePath to be storage-path, got %s", config.StoragePath) 61 | } 62 | if config.LogLevel != "ERROR" { 63 | t.Errorf("Expected logLevel to be ERROR, got %s", config.LogLevel) 64 | } 65 | if config.StorageType != "LOCAL" { 66 | t.Errorf("Expected storageType to be LOCAL, got %s", config.StorageType) 67 | } 68 | }) 69 | 70 | t.Run("Uses config values from environment variables with AWS S3 storage", func(t *testing.T) { 71 | t.Setenv("BEMIDB_PORT", "12345") 72 | t.Setenv("BEMIDB_DATABASE", "mydb") 73 | t.Setenv("BEMIDB_INIT_SQL", "./init/duckdb.sql") 74 | t.Setenv("BEMIDB_STORAGE_PATH", "storage-path") 75 | t.Setenv("BEMIDB_LOG_LEVEL", "ERROR") 76 | t.Setenv("BEMIDB_STORAGE_TYPE", "S3") 77 | t.Setenv("AWS_REGION", "us-west-1") 78 | t.Setenv("AWS_S3_ENDPOINT", "s3-us-west-1.amazonaws.com") 79 | t.Setenv("AWS_S3_BUCKET", "my_bucket") 80 | t.Setenv("AWS_ACCESS_KEY_ID", "my_access_key_id") 81 | t.Setenv("AWS_SECRET_ACCESS_KEY", "my_secret_access_key") 82 | 83 | config := LoadConfig(true) 84 | 85 | if config.Port != "12345" { 86 | t.Errorf("Expected port to be 12345, got %s", config.Port) 87 | } 88 | if config.Database != "mydb" { 89 | t.Errorf("Expected database to be mydb, got %s", config.Database) 90 | } 91 | if config.StoragePath != "storage-path" { 92 | t.Errorf("Expected StoragePath to be storage-path, got %s", config.StoragePath) 93 | } 94 | if config.LogLevel != "ERROR" { 95 | t.Errorf("Expected logLevel to be ERROR, got %s", config.LogLevel) 96 | } 97 | if config.StorageType != "S3" { 98 | t.Errorf("Expected storageType to be S3, got %s", config.StorageType) 99 | } 100 | if config.Aws.Region != "us-west-1" { 101 | t.Errorf("Expected awsRegion to be us-west-1, got %s", config.Aws.Region) 102 | } 103 | if config.Aws.S3Endpoint != "s3-us-west-1.amazonaws.com" { 104 | t.Errorf("Expected awsS3Endpoint to be s3-us-west-1.amazonaws.com, got %s", config.Aws.S3Endpoint) 105 | } 106 | if config.Aws.S3Bucket != "my_bucket" { 107 | t.Errorf("Expected awsS3Bucket to be my_bucket, got %s", config.Aws.S3Bucket) 108 | } 109 | if config.Aws.AccessKeyId != "my_access_key_id" { 110 | t.Errorf("Expected awsAccessKeyId to be my_access_key_id, got %s", config.Aws.AccessKeyId) 111 | } 112 | if config.Aws.SecretAccessKey != "my_secret_access_key" { 113 | t.Errorf("Expected awsSecretAccessKey to be my_secret_access_key, got %s", config.Aws.SecretAccessKey) 114 | } 115 | }) 116 | 117 | t.Run("Uses config values from environment variables for PG", func(t *testing.T) { 118 | t.Setenv("PG_DATABASE_URL", "postgres://user:password@localhost:5432/template1") 119 | t.Setenv("PG_SYNC_INTERVAL", "1h") 120 | t.Setenv("PG_SCHEMA_PREFIX", "mydb_") 121 | t.Setenv("PG_EXCLUDE_TABLES", "public.users,public.secrets") 122 | 123 | config := LoadConfig(true) 124 | 125 | if config.Pg.DatabaseUrl != "postgres://user:password@localhost:5432/template1" { 126 | t.Errorf("Expected pgDatabaseUrl to be postgres://user:password@localhost:5432/template1, got %s", config.Pg.DatabaseUrl) 127 | } 128 | if config.Pg.SyncInterval != "1h" { 129 | t.Errorf("Expected interval to be 1h, got %s", config.Pg.SyncInterval) 130 | } 131 | if config.Pg.SchemaPrefix != "mydb_" { 132 | t.Errorf("Expected schemaPrefix to be mydb_, got %s", config.Pg.SchemaPrefix) 133 | }
134 | if !HasExactOrWildcardMatch(config.Pg.ExcludeTables, "public.users") { 135 | t.Errorf("Expected ExcludeTables to contain public.users, got %v", config.Pg.ExcludeTables) 136 | } 137 | if !HasExactOrWildcardMatch(config.Pg.ExcludeTables, "public.secrets") { 138 | t.Errorf("Expected ExcludeTables to contain public.secrets, got %v", config.Pg.ExcludeTables) 139 | } 140 | }) 141 | 142 | t.Run("Panics when only AWS_ACCESS_KEY_ID is set without AWS_SECRET_ACCESS_KEY", func(t *testing.T) { 143 | t.Setenv("BEMIDB_STORAGE_TYPE", "S3") 144 | t.Setenv("AWS_ACCESS_KEY_ID", "my_access_key_id") 145 | 146 | defer func() { 147 | if r := recover(); r == nil { 148 | t.Error("Expected panic when only AWS_ACCESS_KEY_ID is set") 149 | } 150 | }() 151 | 152 | LoadConfig(true) 153 | }) 154 | 155 | t.Run("Panics when only AWS_SECRET_ACCESS_KEY is set without AWS_ACCESS_KEY_ID", func(t *testing.T) { 156 | t.Setenv("BEMIDB_STORAGE_TYPE", "S3") 157 | t.Setenv("AWS_SECRET_ACCESS_KEY", "my_secret_access_key") 158 | 159 | defer func() { 160 | if r := recover(); r == nil { 161 | t.Error("Expected panic when only AWS_SECRET_ACCESS_KEY is set") 162 | } 163 | }() 164 | 165 | LoadConfig(true) 166 | }) 167 | 168 | t.Run("Uses command line arguments", func(t *testing.T) { 169 | setTestArgs([]string{ 170 | "--port", "12345", 171 | "--database", "mydb", 172 | "--storage-path", "storage-path", 173 | "--log-level", "ERROR", 174 | "--storage-type", "LOCAL", 175 | "--pg-database-url", "postgres://user:password@localhost:5432/db", 176 | "--pg-sync-interval", "2h30m", 177 | "--pg-schema-prefix", "mydb_", 178 | "--pg-exclude-tables", "public.users,public.secrets", 179 | }) 180 | 181 | config := LoadConfig(true) 182 | 183 | if config.Port != "12345" { 184 | t.Errorf("Expected port to be 12345, got %s", config.Port) 185 | } 186 | if config.Database != "mydb" { 187 | t.Errorf("Expected database to be mydb, got %s", config.Database) 188 | } 189 | if config.StoragePath != "storage-path" { 190 | t.Errorf("Expected StoragePath to be storage-path, got %s", config.StoragePath) 191 | } 192 | if config.LogLevel != "ERROR" { 193 | t.Errorf("Expected logLevel to be ERROR, got %s", config.LogLevel) 194 | } 195 | if config.StorageType != "LOCAL" { 196 | t.Errorf("Expected storageType to be LOCAL, got %s", config.StorageType) 197 | } 198 | if config.Pg.DatabaseUrl != "postgres://user:password@localhost:5432/db" { 199 | t.Errorf("Expected pgDatabaseUrl to be postgres://user:password@localhost:5432/db, got %s", config.Pg.DatabaseUrl) 200 | } 201 | if config.Pg.SyncInterval != "2h30m" { 202 | t.Errorf("Expected interval to be 2h30m, got %s", config.Pg.SyncInterval) 203 | } 204 | if config.Pg.SchemaPrefix != "mydb_" { 205 | t.Errorf("Expected schemaPrefix to be mydb_, got %s", config.Pg.SchemaPrefix) 206 | } 207 | if !HasExactOrWildcardMatch(config.Pg.ExcludeTables, "public.users") { 208 | t.Errorf("Expected ExcludeTables to have public.users, got %v", config.Pg.ExcludeTables) 209 | } 210 | if !HasExactOrWildcardMatch(config.Pg.ExcludeTables, "public.secrets") { 211 | t.Errorf("Expected ExcludeTables to have public.secrets, got %v", config.Pg.ExcludeTables) 212 | } 213 | }) 214 | } 215 | -------------------------------------------------------------------------------- /src/custom_types.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | ) 7 | 8 | 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 9 | 10 | type Set[T comparable] map[T]struct{} 11 | 12 | func NewSet[T comparable](items []T) Set[T] { 13 | set := make(Set[T]) 14 | 15 | for _, item := range items { 16 | set.Add(item) 17 | } 18 | 19 | return set 20 | } 21 | 22 | func (set Set[T]) Add(item T) { 23 | set[item] = struct{}{} 24 | } 25 | 26 | func (set Set[T]) Contains(item T) bool { 27 | _, ok := set[item] 28 | return ok 29 | } 30 | 31 | func (set Set[T]) Values() []T { 32 | values := make([]T, 0, len(set)) 33 | for val := range set { 34 | values = append(values, val) 35 | } 36 | 37 | return values 38 | } 39 | 40 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 41 | 42 | type IcebergSchemaTable struct { 43 | Schema string 44 | Table string 45 | } 46 | 47 | func (schemaTable IcebergSchemaTable) String() string { 48 | return fmt.Sprintf(`"%s"."%s"`, schemaTable.Schema, schemaTable.Table) 49 | } 50 | 51 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 52 | 53 | type IcebergTableField struct { 54 | Name string 55 | Type string 56 | Required bool 57 | IsList bool 58 | } 59 | 60 | func (tableField IcebergTableField) ToSql() string { 61 | sql := fmt.Sprintf(`"%s" %s`, tableField.Name, tableField.Type) 62 | 63 | if tableField.IsList { 64 | sql += "[]" 65 | } 66 | 67 | if tableField.Required { 68 | sql += " NOT NULL" 69 | } 70 | 71 | return sql 72 | } 73 | 74 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 75 | 76 | type QuerySchemaTable struct { 77 | Schema string 78 | Table string 79 | Alias string 80 | } 81 | 82 | func NewQuerySchemaTableFromString(schemaTable string) QuerySchemaTable { 83 | parts := strings.Split(schemaTable, ".") 84 | 85 | qSchemaTable := QuerySchemaTable{ 86 | Table: parts[len(parts)-1], 87 | } 88 | if len(parts) > 1 { 89 | qSchemaTable.Schema = parts[0] 90 | } 91 | 92 | if !StringContainsUpper(qSchemaTable.Schema) { 93 | qSchemaTable.Schema = strings.ReplaceAll(qSchemaTable.Schema, "\"", "") 94 | } 95 | if !StringContainsUpper(qSchemaTable.Table) { 96 | qSchemaTable.Table = strings.ReplaceAll(qSchemaTable.Table, "\"", "") 97 | } 98 | 99 | return qSchemaTable 100 | } 101 | 102 | func (qSchemaTable QuerySchemaTable) ToIcebergSchemaTable() IcebergSchemaTable { 103 | if qSchemaTable.Schema == "" { 104 | return IcebergSchemaTable{ 105 | Schema: PG_SCHEMA_PUBLIC, 106 | Table: qSchemaTable.Table, 107 | } 108 | } 109 | 110 | return IcebergSchemaTable{ 111 | Schema: qSchemaTable.Schema, 112 | Table: qSchemaTable.Table, 113 | } 114 | } 115 | 116 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 117 | 118 | type QuerySchemaFunction struct { 119 | Schema string 120 | Function string 121 | } 122 | 123 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 124 | 125 | type PgSchemaTable struct { 126 | Schema string 127 | Table string 128 | ParentPartitionedTable string 129 | } 130 | 131 | func (pgSchemaTable PgSchemaTable) String() string { 132 | return fmt.Sprintf(`"%s"."%s"`, pgSchemaTable.Schema, pgSchemaTable.Table) 133 | } 134 | 135 | func (pgSchemaTable PgSchemaTable) ParentPartitionedTableString() string { 136 | if 
pgSchemaTable.ParentPartitionedTable == "" { 137 | return "" 138 | } 139 | 140 | return fmt.Sprintf(`"%s"."%s"`, pgSchemaTable.Schema, pgSchemaTable.ParentPartitionedTable) 141 | } 142 | 143 | func (pgSchemaTable PgSchemaTable) ToConfigArg() string { 144 | return fmt.Sprintf(`%s.%s`, pgSchemaTable.Schema, pgSchemaTable.Table) 145 | } 146 | 147 | func (pgSchemaTable PgSchemaTable) ToIcebergSchemaTable() IcebergSchemaTable { 148 | return IcebergSchemaTable{ 149 | Schema: pgSchemaTable.Schema, 150 | Table: pgSchemaTable.Table, 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/duckdb.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "context" 6 | "database/sql" 7 | "io" 8 | "regexp" 9 | "slices" 10 | "strings" 11 | "time" 12 | 13 | _ "github.com/marcboeker/go-duckdb" 14 | ) 15 | 16 | const ( 17 | DUCKDB_SCHEMA_MAIN = "main" 18 | REFRESH_IMPLICIT_AWS_CREDENTIALS_INTERVAL = 10 * time.Minute 19 | ) 20 | 21 | var DUCKDB_INIT_BOOT_QUERIES = []string{ 22 | // Set up Iceberg 23 | "INSTALL iceberg", 24 | "LOAD iceberg", 25 | 26 | // Set up schemas 27 | "SELECT oid FROM pg_catalog.pg_namespace", 28 | "CREATE SCHEMA public", 29 | 30 | // Configure DuckDB 31 | "SET scalar_subquery_error_on_multiple_rows=false", 32 | "SET timezone='UTC'", 33 | } 34 | 35 | type Duckdb struct { 36 | db *sql.DB 37 | config *Config 38 | stopImplicitAwsCredentialsRefreshChan chan struct{} 39 | } 40 | 41 | func NewDuckdb(config *Config, withPgCompatibility bool) *Duckdb { 42 | ctx := context.Background() 43 | db, err := sql.Open("duckdb", "") 44 | PanicIfError(config, err) 45 | 46 | duckdb := &Duckdb{ 47 | db: db, 48 | config: config, 49 | stopImplicitAwsCredentialsRefreshChan: make(chan struct{}), 50 | } 51 | 52 | bootQueries := []string{} 53 | if withPgCompatibility { 54 | bootQueries = slices.Concat( 55 | // Set up DuckDB 56 | DUCKDB_INIT_BOOT_QUERIES, 57 | 58 | // Create pg-compatible functions 59 | CreatePgCatalogMacroQueries(config), 60 | CreateInformationSchemaMacroQueries(config), 61 | 62 | // Create pg-compatible tables and views 63 | CreatePgCatalogTableQueries(config), 64 | CreateInformationSchemaTableQueries(config), 65 | 66 | // Use the public schema 67 | []string{"USE public"}, 68 | ) 69 | } 70 | 71 | for _, query := range bootQueries { 72 | _, err := duckdb.ExecContext(ctx, query, nil) 73 | PanicIfError(config, err) 74 | } 75 | 76 | switch config.StorageType { 77 | case STORAGE_TYPE_S3: 78 | if duckdb.config.Aws.AccessKeyId != "" && duckdb.config.Aws.SecretAccessKey != "" { 79 | duckdb.setExplicitAwsCredentials(ctx) 80 | } else { 81 | duckdb.setImplicitAwsCredentials(ctx) 82 | duckdb.autoRefreshImplicitAwsCredentials(ctx) 83 | } 84 | 85 | if IsLocalHost(config.Aws.S3Endpoint) { 86 | _, err = duckdb.ExecContext(ctx, "SET s3_use_ssl=false", nil) 87 | PanicIfError(config, err) 88 | } 89 | 90 | if config.Aws.S3Endpoint != DEFAULT_AWS_S3_ENDPOINT { 91 | // Use endpoint/bucket/key (path, deprecated on AWS) instead of bucket.endpoint/key (vhost) 92 | _, err = duckdb.ExecContext(ctx, "SET s3_url_style='path'", nil) 93 | PanicIfError(config, err) 94 | } 95 | 96 | if config.LogLevel == LOG_LEVEL_TRACE { 97 | _, err = duckdb.ExecContext(ctx, "SET enable_http_logging=true", nil) 98 | PanicIfError(config, err) 99 | } 100 | } 101 | 102 | return duckdb 103 | } 104 | 105 | func (duckdb *Duckdb) ExecContext(ctx context.Context, query string, args map[string]string) (sql.Result, error) { 
106 | LogDebug(duckdb.config, "Querying DuckDB:", query) 107 | return duckdb.db.ExecContext(ctx, replaceNamedStringArgs(query, args)) 108 | } 109 | 110 | func (duckdb *Duckdb) QueryContext(ctx context.Context, query string) (*sql.Rows, error) { 111 | LogDebug(duckdb.config, "Querying DuckDB:", query) 112 | return duckdb.db.QueryContext(ctx, query) 113 | } 114 | 115 | func (duckdb *Duckdb) PrepareContext(ctx context.Context, query string) (*sql.Stmt, error) { 116 | LogDebug(duckdb.config, "Preparing DuckDB statement:", query) 117 | return duckdb.db.PrepareContext(ctx, query) 118 | } 119 | 120 | func (duckdb *Duckdb) Close() { 121 | close(duckdb.stopImplicitAwsCredentialsRefreshChan) 122 | duckdb.db.Close() 123 | } 124 | 125 | func (duckdb *Duckdb) ExecTransactionContext(ctx context.Context, queries []string) error { 126 | tx, err := duckdb.db.Begin() 127 | LogDebug(duckdb.config, "Querying DuckDB: BEGIN") 128 | if err != nil { 129 | return err 130 | } 131 | 132 | for _, query := range queries { 133 | LogDebug(duckdb.config, "Querying DuckDB:", query) 134 | _, err := tx.ExecContext(ctx, query) 135 | if err != nil { 136 | tx.Rollback() 137 | return err 138 | } 139 | } 140 | 141 | LogDebug(duckdb.config, "Querying DuckDB: COMMIT") 142 | return tx.Commit() 143 | } 144 | 145 | func (duckdb *Duckdb) ExecFile(reader io.ReadCloser) { 146 | defer reader.Close() 147 | 148 | lines := []string{} 149 | scanner := bufio.NewScanner(reader) 150 | for scanner.Scan() { 151 | lines = append(lines, scanner.Text()) 152 | } 153 | PanicIfError(duckdb.config, scanner.Err()) 154 | 155 | ctx := context.Background() 156 | for _, sql := range lines { 157 | _, err := duckdb.ExecContext(ctx, sql, nil) 158 | PanicIfError(duckdb.config, err) 159 | } 160 | } 161 | 162 | func (duckdb *Duckdb) setExplicitAwsCredentials(ctx context.Context) { 163 | config := duckdb.config 164 | query := "CREATE OR REPLACE SECRET aws_s3_secret (TYPE S3, KEY_ID '$accessKeyId', SECRET '$secretAccessKey', REGION '$region', ENDPOINT '$endpoint', SCOPE '$s3Bucket')" 165 | _, err := duckdb.ExecContext(ctx, query, map[string]string{ 166 | "accessKeyId": config.Aws.AccessKeyId, 167 | "secretAccessKey": config.Aws.SecretAccessKey, 168 | "region": config.Aws.Region, 169 | "endpoint": config.Aws.S3Endpoint, 170 | "s3Bucket": "s3://" + config.Aws.S3Bucket, 171 | }) 172 | PanicIfError(config, err) 173 | } 174 | 175 | func (duckdb *Duckdb) setImplicitAwsCredentials(ctx context.Context) { 176 | config := duckdb.config 177 | query := "CREATE OR REPLACE SECRET aws_s3_secret (TYPE S3, PROVIDER CREDENTIAL_CHAIN, REGION '$region', ENDPOINT '$endpoint', SCOPE '$s3Bucket')" 178 | _, err := duckdb.ExecContext(ctx, query, map[string]string{ 179 | "region": config.Aws.Region, 180 | "endpoint": config.Aws.S3Endpoint, 181 | "s3Bucket": "s3://" + config.Aws.S3Bucket, 182 | }) 183 | PanicIfError(config, err) 184 | } 185 | 186 | func (duckdb *Duckdb) autoRefreshImplicitAwsCredentials(ctx context.Context) { 187 | ticker := time.NewTicker(REFRESH_IMPLICIT_AWS_CREDENTIALS_INTERVAL) 188 | go func() { 189 | for { 190 | select { 191 | case <-ticker.C: 192 | duckdb.setImplicitAwsCredentials(ctx) 193 | case <-duckdb.stopImplicitAwsCredentialsRefreshChan: 194 | ticker.Stop() 195 | return 196 | } 197 | } 198 | }() 199 | } 200 | 201 | func replaceNamedStringArgs(query string, args map[string]string) string { 202 | re := regexp.MustCompile(`['";]`) // Escape single quotes, double quotes, and semicolons from args 203 | 204 | for key, value := range args { 205 | query = 
strings.ReplaceAll(query, "$"+key, re.ReplaceAllString(value, "")) 206 | } 207 | return query 208 | } 209 | -------------------------------------------------------------------------------- /src/duckdb_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | func TestNewDuckdb(t *testing.T) { 11 | t.Run("Creates a new DuckDB instance", func(t *testing.T) { 12 | config := loadTestConfig() 13 | 14 | duckdb := NewDuckdb(config, false) 15 | 16 | defer duckdb.Close() 17 | rows, err := duckdb.QueryContext(context.Background(), "SELECT 1") 18 | if err != nil { 19 | t.Errorf("Expected query to succeed") 20 | } 21 | defer rows.Close() 22 | 23 | for rows.Next() { 24 | var result int 25 | err = rows.Scan(&result) 26 | if err != nil { 27 | t.Errorf("Expected query to return a result") 28 | } 29 | if result != 1 { 30 | t.Errorf("Expected query result to be 1, got %d", result) 31 | } 32 | } 33 | }) 34 | } 35 | 36 | func TestExecFile(t *testing.T) { 37 | t.Run("Executes SQL file", func(t *testing.T) { 38 | config := loadTestConfig() 39 | duckdb := NewDuckdb(config, false) 40 | defer duckdb.Close() 41 | fileContent := strings.Join([]string{ 42 | "CREATE TABLE test (id INTEGER);", 43 | "INSERT INTO test VALUES (1);", 44 | }, "\n") 45 | file := io.NopCloser(strings.NewReader(fileContent)) 46 | 47 | duckdb.ExecFile(file) 48 | 49 | rows, err := duckdb.QueryContext(context.Background(), "SELECT COUNT(*) FROM test") 50 | if err != nil { 51 | t.Errorf("Expected query to succeed") 52 | } 53 | defer rows.Close() 54 | var count int 55 | rows.Next() 56 | err = rows.Scan(&count) 57 | if err != nil { 58 | t.Errorf("Expected query to return a result") 59 | } 60 | if count != 1 { 61 | t.Errorf("Expected query result to be 1, got %d", count) 62 | } 63 | }) 64 | } 65 | -------------------------------------------------------------------------------- /src/error_utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "errors" 7 | "flag" 8 | "fmt" 9 | "net/http" 10 | "net/url" 11 | "os" 12 | "runtime" 13 | "runtime/debug" 14 | "strings" 15 | "time" 16 | ) 17 | 18 | func PanicIfError(config *Config, err error) { 19 | if err != nil { 20 | sendAnonymousErrorReport(config, err) 21 | printUnexpectedError(config, err) 22 | os.Exit(1) 23 | } 24 | } 25 | 26 | func Panic(config *Config, message string) { 27 | err := errors.New(message) 28 | PanicIfError(config, err) 29 | } 30 | 31 | func PrintErrorAndExit(config *Config, message string) { 32 | LogError(config, message+"\n") 33 | os.Exit(1) 34 | } 35 | 36 | func HandleUnexpectedError(config *Config, err error) { 37 | sendAnonymousErrorReport(config, err) 38 | printUnexpectedError(config, err) 39 | os.Exit(1) 40 | } 41 | 42 | func printUnexpectedError(config *Config, err error) { 43 | errorMessage := err.Error() 44 | stackTrace := string(debug.Stack()) 45 | 46 | title := "Unexpected error: " + strings.Split(errorMessage, "\n")[0] 47 | body := "* Version: " + VERSION + 48 | "\n* OS: " + runtime.GOOS + "-" + runtime.GOARCH + 49 | "\n\n```\n" + errorMessage + "\n\n" + stackTrace + "\n```" 50 | 51 | fmt.Println("Unexpected error:", errorMessage) 52 | fmt.Println(stackTrace) 53 | fmt.Println("________________________________________________________________________________") 54 | fmt.Println("\nPlease submit a new issue by simply visiting the following link:") 
55 | fmt.Println( 56 | "https://github.com/BemiHQ/BemiDB/issues/new?title=" + 57 | url.QueryEscape(title) + 58 | "&body=" + 59 | url.QueryEscape(body), 60 | ) 61 | fmt.Println("\nAlternatively, send us an email at hi@bemidb.com") 62 | } 63 | 64 | type AnonymousErrorData struct { 65 | Command string `json:"command"` 66 | OsName string `json:"osName"` 67 | Version string `json:"version"` 68 | Error string `json:"error"` 69 | StackTrace string `json:"stackTrace"` 70 | PgHost string `json:"pgHost"` 71 | } 72 | 73 | func sendAnonymousErrorReport(config *Config, err error) { 74 | if config.DisableAnonymousAnalytics { 75 | return 76 | } 77 | 78 | data := AnonymousErrorData{ 79 | Command: flag.Arg(0), 80 | OsName: runtime.GOOS + "-" + runtime.GOARCH, 81 | Version: VERSION, 82 | Error: err.Error(), 83 | StackTrace: string(debug.Stack()), 84 | PgHost: ParseDatabaseHost(config.Pg.DatabaseUrl), 85 | } 86 | 87 | jsonData, err := json.Marshal(data) 88 | if err != nil { 89 | return 90 | } 91 | 92 | client := http.Client{Timeout: 5 * time.Second} 93 | _, _ = client.Post("https://api.bemidb.com/api/errors", "application/json", bytes.NewBuffer(jsonData)) 94 | } 95 | -------------------------------------------------------------------------------- /src/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/BemiHQ/BemiDB 2 | 3 | go 1.24.3 4 | 5 | require ( 6 | github.com/aws/aws-sdk-go-v2 v1.32.3 7 | github.com/aws/aws-sdk-go-v2/config v1.28.1 8 | github.com/aws/aws-sdk-go-v2/credentials v1.17.42 9 | github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.35 10 | github.com/aws/aws-sdk-go-v2/service/s3 v1.66.2 11 | github.com/google/uuid v1.6.0 12 | github.com/jackc/pgx/v5 v5.7.2 13 | github.com/linkedin/goavro v2.1.0+incompatible 14 | github.com/marcboeker/go-duckdb v1.8.3 15 | github.com/pganalyze/pg_query_go/v5 v5.1.0 16 | github.com/xitongsys/parquet-go v1.6.3-0.20240813051905-693d3323dee0 17 | ) 18 | 19 | require ( 20 | github.com/xitongsys/parquet-go-source v0.0.0-20241021075129-b732d2ac9c9b 21 | golang.org/x/crypto v0.35.0 22 | golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 23 | ) 24 | 25 | require ( 26 | github.com/andybalholm/brotli v1.1.1 // indirect 27 | github.com/apache/arrow-go/v18 v18.0.0 // indirect 28 | github.com/apache/arrow/go/v12 v12.0.1 // indirect 29 | github.com/apache/thrift v0.21.0 // indirect 30 | github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.6 // indirect 31 | github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.18 // indirect 32 | github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.22 // indirect 33 | github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.22 // indirect 34 | github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 // indirect 35 | github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.22 // indirect 36 | github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.0 // indirect 37 | github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.4.3 // indirect 38 | github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.3 // indirect 39 | github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.3 // indirect 40 | github.com/aws/aws-sdk-go-v2/service/sso v1.24.3 // indirect 41 | github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.3 // indirect 42 | github.com/aws/aws-sdk-go-v2/service/sts v1.32.3 // indirect 43 | github.com/aws/smithy-go v1.22.0 // indirect 44 | github.com/goccy/go-json v0.10.3 // indirect 45 | github.com/goccy/go-reflect v1.2.0 // indirect 46 | github.com/golang/snappy 
v0.0.4 // indirect 47 | github.com/google/flatbuffers v24.3.25+incompatible // indirect 48 | github.com/jackc/pgpassfile v1.0.0 // indirect 49 | github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect 50 | github.com/klauspost/asmfmt v1.3.2 // indirect 51 | github.com/klauspost/compress v1.17.11 // indirect 52 | github.com/klauspost/cpuid/v2 v2.2.8 // indirect 53 | github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect 54 | github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect 55 | github.com/mitchellh/mapstructure v1.5.0 // indirect 56 | github.com/pierrec/lz4/v4 v4.1.21 // indirect 57 | github.com/zeebo/xxh3 v1.0.2 // indirect 58 | golang.org/x/mod v0.21.0 // indirect 59 | golang.org/x/sync v0.11.0 // indirect 60 | golang.org/x/sys v0.30.0 // indirect 61 | golang.org/x/text v0.22.0 // indirect 62 | golang.org/x/tools v0.26.0 // indirect 63 | golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect 64 | google.golang.org/protobuf v1.35.1 // indirect 65 | gopkg.in/linkedin/goavro.v1 v1.0.5 // indirect 66 | ) 67 | -------------------------------------------------------------------------------- /src/iceberg_reader.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | ) 6 | 7 | type IcebergReader struct { 8 | config *Config 9 | storage StorageInterface 10 | } 11 | 12 | func NewIcebergReader(config *Config) *IcebergReader { 13 | storage := NewStorage(config) 14 | return &IcebergReader{config: config, storage: storage} 15 | } 16 | 17 | func (reader *IcebergReader) Schemas() (icebergSchemas []string, err error) { 18 | LogDebug(reader.config, "Reading Iceberg schemas...") 19 | return reader.storage.IcebergSchemas() 20 | } 21 | 22 | func (reader *IcebergReader) SchemaTables() (icebergSchemaTables Set[IcebergSchemaTable], err error) { 23 | LogDebug(reader.config, "Reading Iceberg tables...") 24 | return reader.storage.IcebergSchemaTables() 25 | } 26 | 27 | func (reader *IcebergReader) TableFields(icebergSchemaTable IcebergSchemaTable) (icebergTableFields []IcebergTableField, err error) { 28 | LogDebug(reader.config, "Reading Iceberg table "+icebergSchemaTable.String()+" fields...") 29 | return reader.storage.IcebergTableFields(icebergSchemaTable) 30 | } 31 | 32 | func (reader *IcebergReader) MetadataFilePath(icebergSchemaTable IcebergSchemaTable) string { 33 | return reader.storage.IcebergMetadataFilePath(icebergSchemaTable) 34 | } 35 | 36 | func (reader *IcebergReader) InternalTableMetadata(pgSchemaTable PgSchemaTable) (internalTableMetadata InternalTableMetadata, err error) { 37 | LogDebug(reader.config, "Reading internal table metadata for "+pgSchemaTable.String()+"...") 38 | return reader.storage.InternalTableMetadata(pgSchemaTable) 39 | } 40 | 41 | func (reader *IcebergReader) InternalStartSqlFile() io.ReadCloser { 42 | return reader.storage.InternalStartSqlFile() 43 | } 44 | -------------------------------------------------------------------------------- /src/iceberg_writer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | type IcebergWriter struct { 4 | config *Config 5 | storage StorageInterface 6 | } 7 | 8 | func NewIcebergWriter(config *Config) *IcebergWriter { 9 | storage := NewStorage(config) 10 | return &IcebergWriter{config: config, storage: storage} 11 | } 12 | 13 | func (writer *IcebergWriter) DeleteSchema(icebergSchema string) (err error) { 14 | return 
writer.storage.DeleteSchema(icebergSchema) 15 | } 16 | 17 | func (writer *IcebergWriter) DeleteSchemaTable(icebergSchemaTable IcebergSchemaTable) (err error) { 18 | return writer.storage.DeleteSchemaTable(icebergSchemaTable) 19 | } 20 | 21 | func (writer *IcebergWriter) WriteInternalStartSqlFile(queries []string) (err error) { 22 | return writer.storage.WriteInternalStartSqlFile(queries) 23 | } 24 | -------------------------------------------------------------------------------- /src/iceberg_writer_table.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | type IcebergWriterTable struct { 4 | config *Config 5 | schemaTable IcebergSchemaTable 6 | storage StorageInterface 7 | pgSchemaColumns []PgSchemaColumn 8 | dynamicRowCountPerBatch int 9 | maxParquetPayloadThreshold int 10 | continuedRefresh bool 11 | } 12 | 13 | func NewIcebergWriterTable( 14 | config *Config, 15 | schemaTable IcebergSchemaTable, 16 | pgSchemaColumns []PgSchemaColumn, 17 | dynamicRowCountPerBatch int, 18 | maxParquetPayloadThreshold int, 19 | continuedRefresh bool, 20 | ) *IcebergWriterTable { 21 | return &IcebergWriterTable{ 22 | config: config, 23 | schemaTable: schemaTable, 24 | pgSchemaColumns: pgSchemaColumns, 25 | dynamicRowCountPerBatch: dynamicRowCountPerBatch, 26 | maxParquetPayloadThreshold: maxParquetPayloadThreshold, 27 | continuedRefresh: continuedRefresh, 28 | storage: NewStorage(config), 29 | } 30 | } 31 | 32 | func (writer *IcebergWriterTable) Write(loadRows func() ([][]string, InternalTableMetadata)) { 33 | dataDirPath := writer.storage.CreateDataDir(writer.schemaTable) 34 | metadataDirPath := writer.storage.CreateMetadataDir(writer.schemaTable) 35 | 36 | var lastSequenceNumber int 37 | newManifestListItemsSortedDesc := []ManifestListItem{} 38 | existingManifestListItemsSortedDesc := []ManifestListItem{} 39 | finalManifestListFilesSortedAsc := []ManifestListFile{} 40 | 41 | if writer.continuedRefresh { 42 | existingManifestListFilesSortedAsc, err := writer.storage.ExistingManifestListFiles(metadataDirPath) 43 | PanicIfError(writer.config, err) 44 | 45 | existingManifestListItemsSortedDesc, err = writer.storage.ExistingManifestListItems(existingManifestListFilesSortedAsc[len(existingManifestListFilesSortedAsc)-1]) 46 | PanicIfError(writer.config, err) 47 | 48 | lastSequenceNumber = existingManifestListItemsSortedDesc[0].SequenceNumber 49 | finalManifestListFilesSortedAsc = existingManifestListFilesSortedAsc 50 | } 51 | 52 | var firstNewParquetFile ParquetFile 53 | var newParquetCount int 54 | loadMoreRows := true 55 | 56 | for loadMoreRows { 57 | newParquetFile, newInternalTableMetadata, err := writer.storage.CreateParquet( 58 | dataDirPath, 59 | writer.pgSchemaColumns, 60 | writer.maxParquetPayloadThreshold, 61 | loadRows, 62 | ) 63 | PanicIfError(writer.config, err) 64 | 65 | // If Parquet is empty and we are continuing to refresh / process subsequent chunks, delete it, mark the sync as completed and exit (no trailing Parquet files) 66 | if newParquetFile.RecordCount == 0 && (writer.continuedRefresh || newParquetCount > 0) { 67 | err = writer.storage.DeleteParquet(newParquetFile) 68 | PanicIfError(writer.config, err) 69 | 70 | err = writer.storage.WriteInternalTableMetadata(metadataDirPath, newInternalTableMetadata) 71 | PanicIfError(writer.config, err) 72 | 73 | return 74 | } 75 | 76 | newParquetCount++ 77 | if firstNewParquetFile.Path == "" { 78 | firstNewParquetFile = newParquetFile 79 | } 80 | 81 | if writer.continuedRefresh { 82 | var 
overwrittenManifestListFilesSortedAsc []ManifestListFile 83 | 84 | existingManifestListItemsSortedDesc, overwrittenManifestListFilesSortedAsc, lastSequenceNumber = writer.overwriteExistingFiles( 85 | dataDirPath, 86 | metadataDirPath, 87 | existingManifestListItemsSortedDesc, 88 | newParquetFile, 89 | firstNewParquetFile, 90 | lastSequenceNumber, 91 | ) 92 | 93 | finalManifestListFilesSortedAsc = append(finalManifestListFilesSortedAsc, overwrittenManifestListFilesSortedAsc...) 94 | } 95 | 96 | newManifestFile, err := writer.storage.CreateManifest(metadataDirPath, newParquetFile) 97 | PanicIfError(writer.config, err) 98 | 99 | lastSequenceNumber++ 100 | newManifestListItem := ManifestListItem{SequenceNumber: lastSequenceNumber, ManifestFile: newManifestFile} 101 | newManifestListItemsSortedDesc = append([]ManifestListItem{newManifestListItem}, newManifestListItemsSortedDesc...) 102 | 103 | finalManifestListItemsSortedDesc := append(newManifestListItemsSortedDesc, existingManifestListItemsSortedDesc...) 104 | newManifestListFile, err := writer.storage.CreateManifestList(metadataDirPath, firstNewParquetFile.Uuid, finalManifestListItemsSortedDesc) 105 | PanicIfError(writer.config, err) 106 | 107 | finalManifestListFilesSortedAsc = append(finalManifestListFilesSortedAsc, newManifestListFile) 108 | _, err = writer.storage.CreateMetadata(metadataDirPath, writer.pgSchemaColumns, finalManifestListFilesSortedAsc) 109 | PanicIfError(writer.config, err) 110 | 111 | err = writer.storage.WriteInternalTableMetadata(metadataDirPath, newInternalTableMetadata) 112 | PanicIfError(writer.config, err) 113 | 114 | loadMoreRows = newInternalTableMetadata.IsInProgress() 115 | LogDebug(writer.config, "Written", newParquetCount, "Parquet file(s). Load more rows:", loadMoreRows) 116 | } 117 | } 118 | 119 | func (writer *IcebergWriterTable) overwriteExistingFiles( 120 | dataDirPath string, 121 | metadataDirPath string, 122 | originalExistingManifestListItemsSortedDesc []ManifestListItem, 123 | newParquetFile ParquetFile, 124 | firstNewParquetFile ParquetFile, 125 | originalLastSequenceNumber int, 126 | ) (existingManifestListItemsSortedDesc []ManifestListItem, overwrittenManifestListFilesSortedAsc []ManifestListFile, lastSequenceNumber int) { 127 | originalExistingManifestListItemsSortedAsc := Reverse(originalExistingManifestListItemsSortedDesc) 128 | lastSequenceNumber = originalLastSequenceNumber 129 | 130 | for i, existingManifestListItem := range originalExistingManifestListItemsSortedAsc { 131 | existingManifestFile := existingManifestListItem.ManifestFile 132 | existingParquetFilePath, err := writer.storage.ExistingParquetFilePath(existingManifestFile) 133 | PanicIfError(writer.config, err) 134 | 135 | overwrittenParquetFile, err := writer.storage.CreateOverwrittenParquet(dataDirPath, existingParquetFilePath, newParquetFile.Path, writer.pgSchemaColumns, writer.dynamicRowCountPerBatch) 136 | PanicIfError(writer.config, err) 137 | 138 | // Keep as is if no overlapping records found 139 | if overwrittenParquetFile.Path == "" { 140 | LogDebug(writer.config, "No overlapping records found") 141 | existingManifestListItemsSortedDesc = append([]ManifestListItem{existingManifestListItem}, existingManifestListItemsSortedDesc...) 
142 | continue 143 | } 144 | 145 | if overwrittenParquetFile.RecordCount == 0 { 146 | // DELETE 147 | LogDebug(writer.config, "Deleting", existingManifestFile.RecordCount, "record(s)...") 148 | 149 | deletedRecsManifestFile, err := writer.storage.CreateDeletedRecordsManifest(metadataDirPath, overwrittenParquetFile.Uuid, existingManifestFile) 150 | PanicIfError(writer.config, err) 151 | 152 | // Constructing a new manifest list without the previous manifest file and with the new "deleted" manifest file 153 | finalManifestListItemsSortedAsc := []ManifestListItem{} 154 | for j, existingItem := range originalExistingManifestListItemsSortedAsc { 155 | if i != j { 156 | finalManifestListItemsSortedAsc = append(finalManifestListItemsSortedAsc, existingItem) 157 | } 158 | } 159 | lastSequenceNumber++ 160 | overwrittenManifestListItem := ManifestListItem{SequenceNumber: lastSequenceNumber, ManifestFile: deletedRecsManifestFile} 161 | finalManifestListItemsSortedAsc = append(finalManifestListItemsSortedAsc, overwrittenManifestListItem) 162 | 163 | overwrittenManifestList, err := writer.storage.CreateManifestList(metadataDirPath, firstNewParquetFile.Uuid, Reverse(finalManifestListItemsSortedAsc)) 164 | PanicIfError(writer.config, err) 165 | overwrittenManifestListFilesSortedAsc = append(overwrittenManifestListFilesSortedAsc, overwrittenManifestList) 166 | continue 167 | } else { 168 | // UPDATE (overwrite) 169 | LogDebug(writer.config, "Overwriting", existingManifestFile.RecordCount, "record(s) with", overwrittenParquetFile.RecordCount, "record(s)...") 170 | 171 | deletedRecsManifestFile, err := writer.storage.CreateDeletedRecordsManifest(metadataDirPath, overwrittenParquetFile.Uuid, existingManifestFile) 172 | PanicIfError(writer.config, err) 173 | 174 | overwrittenManifestFile, err := writer.storage.CreateManifest(metadataDirPath, overwrittenParquetFile) 175 | PanicIfError(writer.config, err) 176 | 177 | lastSequenceNumber++ 178 | overwrittenManifestListItem := ManifestListItem{SequenceNumber: lastSequenceNumber, ManifestFile: overwrittenManifestFile} 179 | deletedRecsManifestListItem := ManifestListItem{SequenceNumber: lastSequenceNumber, ManifestFile: deletedRecsManifestFile} 180 | overwrittenManifestList, err := writer.storage.CreateManifestList(metadataDirPath, firstNewParquetFile.Uuid, []ManifestListItem{overwrittenManifestListItem, deletedRecsManifestListItem}) 181 | PanicIfError(writer.config, err) 182 | 183 | existingManifestListItemsSortedDesc = append([]ManifestListItem{overwrittenManifestListItem}, existingManifestListItemsSortedDesc...)
184 | overwrittenManifestListFilesSortedAsc = append(overwrittenManifestListFilesSortedAsc, overwrittenManifestList) 185 | } 186 | } 187 | 188 | return existingManifestListItemsSortedDesc, overwrittenManifestListFilesSortedAsc, lastSequenceNumber 189 | } 190 | -------------------------------------------------------------------------------- /src/logger.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | ) 6 | 7 | type LogLevel string 8 | 9 | const ( 10 | LOG_LEVEL_TRACE = "TRACE" 11 | LOG_LEVEL_DEBUG = "DEBUG" 12 | LOG_LEVEL_WARN = "WARN" 13 | LOG_LEVEL_INFO = "INFO" 14 | LOG_LEVEL_ERROR = "ERROR" 15 | ) 16 | 17 | var LOG_LEVELS = []string{ 18 | LOG_LEVEL_TRACE, 19 | LOG_LEVEL_DEBUG, 20 | LOG_LEVEL_WARN, 21 | LOG_LEVEL_INFO, 22 | LOG_LEVEL_ERROR, 23 | } 24 | 25 | func LogError(config *Config, message ...interface{}) { 26 | log.Println(append([]interface{}{"[ERROR]"}, message...)...) 27 | } 28 | 29 | func LogWarn(config *Config, message ...interface{}) { 30 | if config.LogLevel != LOG_LEVEL_ERROR { 31 | log.Println(append([]interface{}{"[WARN]"}, message...)...) 32 | } 33 | } 34 | 35 | func LogInfo(config *Config, message ...interface{}) { 36 | if config.LogLevel != LOG_LEVEL_ERROR && config.LogLevel != LOG_LEVEL_WARN { 37 | log.Println(append([]interface{}{"[INFO]"}, message...)...) 38 | } 39 | } 40 | 41 | func LogDebug(config *Config, message ...interface{}) { 42 | if config.LogLevel == LOG_LEVEL_DEBUG || config.LogLevel == LOG_LEVEL_TRACE { 43 | log.Println(append([]interface{}{"[DEBUG]"}, message...)...) 44 | } 45 | } 46 | 47 | func LogTrace(config *Config, message ...interface{}) { 48 | if config.LogLevel == LOG_LEVEL_TRACE { 49 | log.Println(append([]interface{}{"[TRACE]"}, message...)...) 
50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "log" 7 | "net/http" 8 | _ "net/http/pprof" 9 | "time" 10 | ) 11 | 12 | const ( 13 | COMMAND_START = "start" 14 | COMMAND_SYNC = "sync" 15 | COMMAND_VERSION = "version" 16 | ) 17 | 18 | func main() { 19 | config := LoadConfig() 20 | defer handlePanic(config) 21 | 22 | if config.LogLevel == LOG_LEVEL_TRACE { 23 | go enableProfiling() 24 | } 25 | 26 | command := flag.Arg(0) 27 | if len(flag.Args()) == 0 { 28 | command = COMMAND_START 29 | } 30 | 31 | switch command { 32 | case COMMAND_START: 33 | LogDebug(config, "Starting BemiDB v"+VERSION) 34 | start(config) 35 | case COMMAND_SYNC: 36 | LogDebug(config, "Syncing with BemiDB v"+VERSION) 37 | if config.Pg.SyncInterval != "" { 38 | duration, err := time.ParseDuration(config.Pg.SyncInterval) 39 | if err != nil { 40 | PrintErrorAndExit(config, "Invalid interval format: "+config.Pg.SyncInterval+".\n\n"+ 41 | "Supported formats: 1h, 20m, 30s.\n"+ 42 | "See https://github.com/BemiHQ/BemiDB#sync-command-options for more information.", 43 | ) 44 | 45 | } 46 | LogInfo(config, "Starting sync loop with interval:", config.Pg.SyncInterval) 47 | for { 48 | syncFromPg(config) 49 | LogInfo(config, "Sleeping for", config.Pg.SyncInterval) 50 | time.Sleep(duration) 51 | } 52 | } else { 53 | syncFromPg(config) 54 | } 55 | case COMMAND_VERSION: 56 | fmt.Println("BemiDB version:", VERSION) 57 | default: 58 | PrintErrorAndExit(config, "Unknown command: "+command+".\n\n"+ 59 | "Supported commands: "+COMMAND_START+", "+COMMAND_SYNC+", "+COMMAND_VERSION+".\n"+ 60 | "See https://github.com/BemiHQ/BemiDB#quickstart for more information.", 61 | ) 62 | } 63 | } 64 | 65 | func start(config *Config) { 66 | tcpListener := NewTcpListener(config) 67 | LogInfo(config, "BemiDB: Listening on", tcpListener.Addr()) 68 | 69 | duckdb := NewDuckdb(config, true) 70 | LogInfo(config, "DuckDB: Connected") 71 | defer duckdb.Close() 72 | 73 | icebergReader := NewIcebergReader(config) 74 | duckdb.ExecFile(icebergReader.InternalStartSqlFile()) 75 | 76 | queryHandler := NewQueryHandler(config, duckdb, icebergReader) 77 | 78 | for { 79 | conn := AcceptConnection(config, tcpListener) 80 | LogInfo(config, "BemiDB: Accepted connection from", conn.RemoteAddr()) 81 | postgres := NewPostgres(config, &conn) 82 | 83 | go func() { 84 | postgres.Run(queryHandler) 85 | defer postgres.Close() 86 | LogInfo(config, "BemiDB: Closed connection from", conn.RemoteAddr()) 87 | }() 88 | } 89 | } 90 | 91 | func syncFromPg(config *Config) { 92 | syncer := NewSyncer(config) 93 | syncer.SyncFromPostgres() 94 | LogInfo(config, "Sync from PostgreSQL completed successfully.") 95 | } 96 | 97 | func enableProfiling() { 98 | func() { log.Println(http.ListenAndServe(":6060", nil)) }() 99 | } 100 | 101 | func handlePanic(config *Config) { 102 | func() { 103 | if r := recover(); r != nil { 104 | err, _ := r.(error) 105 | HandleUnexpectedError(config, err) 106 | } 107 | }() 108 | } 109 | -------------------------------------------------------------------------------- /src/parser_a_expr.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "strings" 5 | 6 | pgQuery "github.com/pganalyze/pg_query_go/v5" 7 | ) 8 | 9 | type ParserAExpr struct { 10 | config *Config 11 | utils *ParserUtils 12 | } 13 | 14 | func 
NewParserAExpr(config *Config) *ParserAExpr { 15 | return &ParserAExpr{ 16 | config: config, 17 | utils: NewParserUtils(config), 18 | } 19 | } 20 | 21 | func (parser *ParserAExpr) AExpr(node *pgQuery.Node) *pgQuery.A_Expr { 22 | return node.GetAExpr() 23 | } 24 | 25 | // = ANY({schema_information}) -> IN (schema_information) 26 | func (parser *ParserAExpr) ConvertedRightAnyToIn(node *pgQuery.Node) *pgQuery.Node { 27 | aExpr := parser.AExpr(node) 28 | 29 | if aExpr.Kind != pgQuery.A_Expr_Kind_AEXPR_OP_ANY { 30 | return node 31 | } 32 | 33 | if aExpr.Rexpr.GetAConst() == nil { 34 | // NOTE: ... = ANY() on non-constants is not fully supported yet 35 | return parser.utils.MakeNullNode() 36 | } 37 | 38 | arrayStr := aExpr.Rexpr.GetAConst().GetSval().Sval 39 | arrayStr = strings.Trim(arrayStr, "{}") 40 | values := strings.Split(arrayStr, ",") 41 | 42 | items := make([]*pgQuery.Node, len(values)) 43 | for i, value := range values { 44 | value = strings.Trim(value, " ") 45 | items[i] = &pgQuery.Node{ 46 | Node: &pgQuery.Node_AConst{ 47 | AConst: &pgQuery.A_Const{ 48 | Val: &pgQuery.A_Const_Sval{ 49 | Sval: &pgQuery.String{ 50 | Sval: value, 51 | }, 52 | }, 53 | Location: 0, 54 | }, 55 | }, 56 | } 57 | } 58 | 59 | return &pgQuery.Node{ 60 | Node: &pgQuery.Node_AExpr{ 61 | AExpr: &pgQuery.A_Expr{ 62 | Kind: pgQuery.A_Expr_Kind_AEXPR_IN, 63 | Name: []*pgQuery.Node{{Node: &pgQuery.Node_String_{String_: &pgQuery.String{Sval: "="}}}}, 64 | Lexpr: aExpr.Lexpr, 65 | Rexpr: &pgQuery.Node{ 66 | Node: &pgQuery.Node_List{ 67 | List: &pgQuery.List{ 68 | Items: items, 69 | }, 70 | }, 71 | }, 72 | Location: aExpr.Location, 73 | }, 74 | }, 75 | } 76 | } 77 | 78 | // pg_catalog.[operator] -> [operator] 79 | func (parser *ParserAExpr) RemovePgCatalog(node *pgQuery.Node) { 80 | aExpr := parser.AExpr(node) 81 | 82 | if aExpr == nil || aExpr.Kind != pgQuery.A_Expr_Kind_AEXPR_OP { 83 | return 84 | } 85 | 86 | if len(aExpr.Name) == 2 && aExpr.Name[0].GetString_().Sval == PG_SCHEMA_PG_CATALOG { 87 | aExpr.Name = aExpr.Name[1:] // Remove the first element 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/parser_column_ref.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type ParserColumnRef struct { 8 | config *Config 9 | } 10 | 11 | func NewParserColumnRef(config *Config) *ParserColumnRef { 12 | return &ParserColumnRef{config: config} 13 | } 14 | 15 | func (parser *ParserColumnRef) FieldNames(node *pgQuery.Node) []string { 16 | columnRef := node.GetColumnRef() 17 | if columnRef == nil { 18 | return nil 19 | } 20 | 21 | fieldNames := make([]string, 0) 22 | for _, field := range columnRef.Fields { 23 | if field.GetString_() == nil { 24 | return nil 25 | } 26 | fieldNames = append(fieldNames, field.GetString_().Sval) 27 | } 28 | return fieldNames 29 | } 30 | 31 | func (parser *ParserColumnRef) SetFields(node *pgQuery.Node, fields []string) { 32 | columnRef := node.GetColumnRef() 33 | 34 | columnRef.Fields = make([]*pgQuery.Node, len(fields)) 35 | for i, field := range fields { 36 | columnRef.Fields[i] = pgQuery.MakeStrNode(field) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/parser_function.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "strings" 5 | 6 | pgQuery "github.com/pganalyze/pg_query_go/v5" 7 | ) 
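// ParserFunction (defined below) groups helpers for inspecting and rewriting function-call (FuncCall) nodes in the parsed Postgres query tree, e.g. remapping schemas, format()/printf(), encode(), and to_timestamp() rewrites.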
8 | 9 | type ParserFunction struct { 10 | config *Config 11 | utils *ParserUtils 12 | } 13 | 14 | func NewParserFunction(config *Config) *ParserFunction { 15 | return &ParserFunction{config: config, utils: NewParserUtils(config)} 16 | } 17 | 18 | func (parser *ParserFunction) FunctionCall(targetNode *pgQuery.Node) *pgQuery.FuncCall { 19 | return targetNode.GetResTarget().Val.GetFuncCall() 20 | } 21 | 22 | func (parser *ParserFunction) FirstArgumentToString(functionCall *pgQuery.FuncCall) string { 23 | if len(functionCall.Args) < 1 { 24 | return "" 25 | } 26 | return functionCall.Args[0].GetAConst().GetSval().Sval 27 | } 28 | 29 | // n from (FUNCTION()).n 30 | func (parser *ParserFunction) IndirectionName(targetNode *pgQuery.Node) string { 31 | indirection := targetNode.GetResTarget().Val.GetAIndirection() 32 | if indirection != nil { 33 | return indirection.Indirection[0].GetString_().Sval 34 | } 35 | 36 | return "" 37 | } 38 | 39 | func (parser *ParserFunction) NestedFunctionCalls(functionCall *pgQuery.FuncCall) []*pgQuery.FuncCall { 40 | nestedFunctionCalls := []*pgQuery.FuncCall{} 41 | 42 | for _, arg := range functionCall.Args { 43 | nestedFunctionCalls = append(nestedFunctionCalls, arg.GetFuncCall()) 44 | } 45 | 46 | return nestedFunctionCalls 47 | } 48 | 49 | func (parser *ParserFunction) SchemaFunction(functionCall *pgQuery.FuncCall) *QuerySchemaFunction { 50 | return parser.utils.SchemaFunction(functionCall) 51 | } 52 | 53 | // pg_catalog.func() -> main.func() 54 | func (parser *ParserFunction) RemapSchemaToMain(functionCall *pgQuery.FuncCall) *pgQuery.FuncCall { 55 | switch len(functionCall.Funcname) { 56 | case 1: 57 | functionCall.Funcname = append([]*pgQuery.Node{pgQuery.MakeStrNode(DUCKDB_SCHEMA_MAIN)}, functionCall.Funcname...) 58 | case 2: 59 | functionCall.Funcname[0] = pgQuery.MakeStrNode(DUCKDB_SCHEMA_MAIN) 60 | } 61 | 62 | return functionCall 63 | } 64 | 65 | // format('%s %1$s', str) -> printf('%1$s %1$s', str) 66 | func (parser *ParserFunction) RemapFormatToPrintf(functionCall *pgQuery.FuncCall) *pgQuery.FuncCall { 67 | format := parser.FirstArgumentToString(functionCall) 68 | for i := range functionCall.Args[1:] { 69 | format = strings.Replace(format, "%s", "%"+IntToString(i+1)+"$s", 1) 70 | } 71 | 72 | functionCall.Funcname = []*pgQuery.Node{pgQuery.MakeStrNode("printf")} 73 | functionCall.Args[0] = pgQuery.MakeAConstStrNode(format, 0) 74 | return functionCall 75 | } 76 | 77 | // encode(sha256(...), 'hex') -> sha256(...) 78 | func (parser *ParserFunction) RemoveEncode(functionCall *pgQuery.FuncCall) { 79 | if len(functionCall.Args) != 2 { 80 | return 81 | } 82 | 83 | firstArg := functionCall.Args[0] 84 | nestedFunctionCall := firstArg.GetFuncCall() 85 | schemaFunction := parser.utils.SchemaFunction(nestedFunctionCall) 86 | if schemaFunction.Function != "sha256" { 87 | return 88 | } 89 | 90 | secondArg := functionCall.Args[1] 91 | var format string 92 | if secondArg.GetAConst() != nil { 93 | format = secondArg.GetAConst().GetSval().Sval 94 | } else if secondArg.GetTypeCast() != nil { 95 | format = secondArg.GetTypeCast().Arg.GetAConst().GetSval().Sval 96 | } 97 | if format != "hex" { 98 | return 99 | } 100 | 101 | functionCall.Funcname = nestedFunctionCall.Funcname 102 | functionCall.Args = nestedFunctionCall.Args 103 | } 104 | 105 | // to_timestamp(...) 
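// Used by the bemidb_last_synced_at() remapping in query_remapper_function.go: the call is rewritten to to_timestamp(<last synced Unix seconds>), or to to_timestamp(NULL) when the timestamp is 0 (e.g., when the internal table metadata could not be read).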
106 | func (parser *ParserFunction) RemapToTimestamp(functionCall *pgQuery.FuncCall, timestamp int64) { 107 | functionCall.Funcname = []*pgQuery.Node{pgQuery.MakeStrNode("to_timestamp")} 108 | 109 | if timestamp == 0 { 110 | functionCall.Args[0] = parser.utils.MakeNullNode() 111 | } else { 112 | functionCall.Args[0] = pgQuery.MakeAConstIntNode(timestamp, 0) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/parser_select.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type ParserSelect struct { 8 | config *Config 9 | utils *ParserUtils 10 | } 11 | 12 | func NewParserSelect(config *Config) *ParserSelect { 13 | return &ParserSelect{config: config, utils: NewParserUtils(config)} 14 | } 15 | 16 | func (parser *ParserSelect) SetDefaultTargetName(targetNode *pgQuery.Node, name string) { 17 | target := targetNode.GetResTarget() 18 | 19 | if target.Name == "" { 20 | target.Name = name 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/parser_show.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type ParserShow struct { 8 | config *Config 9 | } 10 | 11 | func NewParserShow(config *Config) *ParserShow { 12 | return &ParserShow{config: config} 13 | } 14 | 15 | func (parser *ParserShow) VariableName(stmt *pgQuery.RawStmt) string { 16 | return stmt.Stmt.GetVariableShowStmt().Name 17 | } 18 | 19 | // SHOW var -> SELECT value AS var FROM duckdb_settings() WHERE LOWER(name) = 'var'; 20 | func (parser *ParserShow) MakeSelectFromDuckdbSettings(variableName string) *pgQuery.RawStmt { 21 | return &pgQuery.RawStmt{ 22 | Stmt: &pgQuery.Node{ 23 | Node: &pgQuery.Node_SelectStmt{ 24 | SelectStmt: &pgQuery.SelectStmt{ 25 | TargetList: []*pgQuery.Node{ 26 | pgQuery.MakeResTargetNodeWithNameAndVal( 27 | variableName, 28 | pgQuery.MakeColumnRefNode( 29 | []*pgQuery.Node{pgQuery.MakeStrNode("value")}, 30 | 0, 31 | ), 32 | 0, 33 | ), 34 | }, 35 | FromClause: []*pgQuery.Node{ 36 | pgQuery.MakeSimpleRangeFunctionNode( 37 | []*pgQuery.Node{ 38 | pgQuery.MakeListNode( 39 | []*pgQuery.Node{ 40 | pgQuery.MakeFuncCallNode( 41 | []*pgQuery.Node{pgQuery.MakeStrNode("duckdb_settings")}, 42 | nil, 43 | 0, 44 | ), 45 | }, 46 | ), 47 | }, 48 | ), 49 | }, 50 | WhereClause: pgQuery.MakeAExprNode( 51 | pgQuery.A_Expr_Kind_AEXPR_OP, 52 | []*pgQuery.Node{pgQuery.MakeStrNode("=")}, 53 | pgQuery.MakeFuncCallNode( 54 | []*pgQuery.Node{pgQuery.MakeStrNode("lower")}, 55 | []*pgQuery.Node{ 56 | pgQuery.MakeColumnRefNode( 57 | []*pgQuery.Node{pgQuery.MakeStrNode("name")}, 58 | 0, 59 | ), 60 | }, 61 | 0, 62 | ), 63 | pgQuery.MakeAConstStrNode(variableName, 0), 64 | 0, 65 | ), 66 | }, 67 | }, 68 | }, 69 | } 70 | } 71 | 72 | // SELECT value AS search_path -> SELECT CONCAT('"$user", ', value) AS search_path 73 | func (parser *ParserShow) SetTargetListForSearchPath(stmt *pgQuery.RawStmt) { 74 | stmt.Stmt.GetSelectStmt().TargetList = []*pgQuery.Node{ 75 | pgQuery.MakeResTargetNodeWithNameAndVal( 76 | PG_VAR_SEARCH_PATH, 77 | pgQuery.MakeFuncCallNode( 78 | []*pgQuery.Node{pgQuery.MakeStrNode("concat")}, 79 | []*pgQuery.Node{ 80 | pgQuery.MakeAConstStrNode(`"$user", `, 0), 81 | pgQuery.MakeColumnRefNode( 82 | []*pgQuery.Node{pgQuery.MakeStrNode("value")}, 83 | 0, 84 | ), 85 | }, 
86 | 0, 87 | ), 88 | 0, 89 | ), 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/parser_table.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type QueryToIcebergTable struct { 8 | QuerySchemaTable QuerySchemaTable 9 | IcebergTablePath string 10 | } 11 | 12 | type ParserTable struct { 13 | config *Config 14 | utils *ParserUtils 15 | } 16 | 17 | func NewParserTable(config *Config) *ParserTable { 18 | return &ParserTable{config: config, utils: NewParserUtils(config)} 19 | } 20 | 21 | func (parser *ParserTable) NodeToQuerySchemaTable(node *pgQuery.Node) QuerySchemaTable { 22 | rangeVar := node.GetRangeVar() 23 | var alias string 24 | 25 | if rangeVar.Alias != nil { 26 | alias = rangeVar.Alias.Aliasname 27 | } 28 | 29 | return QuerySchemaTable{ 30 | Schema: rangeVar.Schemaname, 31 | Table: rangeVar.Relname, 32 | Alias: alias, 33 | } 34 | } 35 | 36 | func (parser *ParserTable) RemapSchemaToMain(node *pgQuery.Node) { 37 | node.GetRangeVar().Schemaname = DUCKDB_SCHEMA_MAIN 38 | } 39 | 40 | // Other information_schema.* tables 41 | func (parser *ParserTable) IsTableFromInformationSchema(qSchemaTable QuerySchemaTable) bool { 42 | return qSchemaTable.Schema == PG_SCHEMA_INFORMATION_SCHEMA 43 | } 44 | 45 | // public.table -> FROM iceberg_scan('path', skip_schema_inference = true) table 46 | // schema.table -> FROM iceberg_scan('path', skip_schema_inference = true) schema_table 47 | func (parser *ParserTable) MakeIcebergTableNode(queryToIcebergTable QueryToIcebergTable) *pgQuery.Node { 48 | node := pgQuery.MakeSimpleRangeFunctionNode([]*pgQuery.Node{ 49 | pgQuery.MakeListNode([]*pgQuery.Node{ 50 | pgQuery.MakeFuncCallNode( 51 | []*pgQuery.Node{ 52 | pgQuery.MakeStrNode("iceberg_scan"), 53 | }, 54 | []*pgQuery.Node{ 55 | pgQuery.MakeAConstStrNode( 56 | queryToIcebergTable.IcebergTablePath, 57 | 0, 58 | ), 59 | pgQuery.MakeAExprNode( 60 | pgQuery.A_Expr_Kind_AEXPR_OP, 61 | []*pgQuery.Node{pgQuery.MakeStrNode("=")}, 62 | pgQuery.MakeColumnRefNode([]*pgQuery.Node{pgQuery.MakeStrNode("skip_schema_inference")}, 0), 63 | parser.utils.MakeAConstBoolNode(true), 64 | 0, 65 | ), 66 | }, 67 | 0, 68 | ), 69 | }), 70 | }) 71 | 72 | // DuckDB doesn't support aliases on iceberg_scan() functions, so we need to wrap it in a nested select that can have an alias 73 | selectStarNode := pgQuery.MakeResTargetNodeWithVal( 74 | pgQuery.MakeColumnRefNode( 75 | []*pgQuery.Node{pgQuery.MakeAStarNode()}, 76 | 0, 77 | ), 78 | 0, 79 | ) 80 | return parser.utils.MakeSubselectFromNode(queryToIcebergTable.QuerySchemaTable, []*pgQuery.Node{selectStarNode}, node) 81 | } 82 | 83 | func (parser *ParserTable) TopLevelSchemaFunction(rangeFunction *pgQuery.RangeFunction) *QuerySchemaFunction { 84 | if len(rangeFunction.Functions) == 0 || len(rangeFunction.Functions[0].GetList().Items) == 0 { 85 | return nil 86 | } 87 | 88 | functionNode := rangeFunction.Functions[0].GetList().Items[0] 89 | if functionNode.GetFuncCall() == nil { 90 | return nil // E.g., system PG calls like "... 
FROM user" => sqlvalue_function:{op:SVFOP_USER} 91 | } 92 | 93 | return parser.utils.SchemaFunction(functionNode.GetFuncCall()) 94 | } 95 | 96 | func (parser *ParserTable) TableFunctionCalls(rangeFunction *pgQuery.RangeFunction) []*pgQuery.FuncCall { 97 | functionCalls := []*pgQuery.FuncCall{} 98 | 99 | for _, funcNode := range rangeFunction.Functions { 100 | for _, funcItemNode := range funcNode.GetList().Items { 101 | functionCall := funcItemNode.GetFuncCall() 102 | if functionCall != nil { 103 | functionCalls = append(functionCalls, functionCall) 104 | } 105 | } 106 | } 107 | 108 | return functionCalls 109 | } 110 | 111 | func (parser *ParserTable) SetAliasIfNotExists(rangeFunction *pgQuery.RangeFunction, alias string) { 112 | if rangeFunction.GetAlias() != nil { 113 | return 114 | } 115 | 116 | rangeFunction.Alias = &pgQuery.Alias{Aliasname: alias} 117 | } 118 | -------------------------------------------------------------------------------- /src/parser_type_cast.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "strings" 5 | 6 | pgQuery "github.com/pganalyze/pg_query_go/v5" 7 | ) 8 | 9 | type ParserTypeCast struct { 10 | utils *ParserUtils 11 | config *Config 12 | } 13 | 14 | func NewParserTypeCast(config *Config) *ParserTypeCast { 15 | return &ParserTypeCast{utils: NewParserUtils(config), config: config} 16 | } 17 | 18 | func (parser *ParserTypeCast) TypeCast(node *pgQuery.Node) *pgQuery.TypeCast { 19 | if node.GetTypeCast() == nil { 20 | return nil 21 | } 22 | 23 | typeCast := node.GetTypeCast() 24 | if len(typeCast.TypeName.Names) == 0 { 25 | return nil 26 | } 27 | 28 | return typeCast 29 | } 30 | 31 | func (parser *ParserTypeCast) TypeName(typeCast *pgQuery.TypeCast) string { 32 | if typeCast == nil { 33 | return "" 34 | } 35 | 36 | typeNameNode := typeCast.TypeName 37 | var typeNames []string 38 | 39 | for _, name := range typeNameNode.Names { 40 | typeNames = append(typeNames, name.GetString_().Sval) 41 | } 42 | 43 | typeName := strings.Join(typeNames, ".") 44 | 45 | if typeNameNode.ArrayBounds != nil { 46 | typeName += "[]" 47 | } 48 | 49 | return typeName 50 | } 51 | 52 | func (parser *ParserTypeCast) NestedTypeCast(typeCast *pgQuery.TypeCast) *pgQuery.TypeCast { 53 | return parser.TypeCast(typeCast.Arg) 54 | } 55 | 56 | // "value" COLLATE pg_catalog.default -> "value" 57 | func (parser *ParserTypeCast) RemovedDefaultCollateClause(node *pgQuery.Node) *pgQuery.Node { 58 | collname := node.GetCollateClause().Collname 59 | 60 | if len(collname) == 2 && collname[0].GetString_().Sval == "pg_catalog" && collname[1].GetString_().Sval == "default" { 61 | return node.GetCollateClause().Arg 62 | } 63 | 64 | return node 65 | } 66 | 67 | func (parser *ParserTypeCast) ArgStringValue(typeCast *pgQuery.TypeCast) string { 68 | return typeCast.Arg.GetAConst().GetSval().Sval 69 | } 70 | 71 | // pg_catalog.[type] -> [type] 72 | func (parser *ParserTypeCast) RemovePgCatalog(typeCast *pgQuery.TypeCast) { 73 | if typeCast != nil && len(typeCast.TypeName.Names) == 2 && typeCast.TypeName.Names[0].GetString_().Sval == PG_SCHEMA_PG_CATALOG { 74 | typeCast.TypeName.Names = typeCast.TypeName.Names[1:] 75 | } 76 | } 77 | 78 | func (parser *ParserTypeCast) SetTypeCastArg(typeCast *pgQuery.TypeCast, arg *pgQuery.Node) { 79 | typeCast.Arg = arg 80 | } 81 | 82 | func (parser *ParserTypeCast) MakeListValueFromArray(node *pgQuery.Node) *pgQuery.Node { 83 | arrayStr := node.GetAConst().GetSval().Sval 84 | arrayStr = strings.Trim(arrayStr, 
"{}") 85 | elements := strings.Split(arrayStr, ",") 86 | 87 | funcCall := &pgQuery.FuncCall{ 88 | Funcname: []*pgQuery.Node{ 89 | pgQuery.MakeStrNode("list_value"), 90 | }, 91 | } 92 | 93 | for _, elem := range elements { 94 | funcCall.Args = append(funcCall.Args, 95 | pgQuery.MakeAConstStrNode(elem, 0)) 96 | } 97 | 98 | return &pgQuery.Node{ 99 | Node: &pgQuery.Node_FuncCall{ 100 | FuncCall: funcCall, 101 | }, 102 | } 103 | } 104 | 105 | // SELECT c.oid 106 | // FROM pg_class c 107 | // JOIN pg_namespace n ON n.oid = c.relnamespace 108 | // WHERE n.nspname = 'schema' AND c.relname = 'table' 109 | func (parser *ParserTypeCast) MakeSubselectOidBySchemaTableArg(argumentNode *pgQuery.Node) *pgQuery.Node { 110 | targetNode := pgQuery.MakeResTargetNodeWithVal( 111 | pgQuery.MakeColumnRefNode([]*pgQuery.Node{ 112 | pgQuery.MakeStrNode("c"), 113 | pgQuery.MakeStrNode("oid"), 114 | }, 0), 115 | 0, 116 | ) 117 | 118 | joinNode := pgQuery.MakeJoinExprNode( 119 | pgQuery.JoinType_JOIN_INNER, 120 | pgQuery.MakeFullRangeVarNode("", "pg_class", "c", 0), 121 | pgQuery.MakeFullRangeVarNode("", "pg_namespace", "n", 0), 122 | pgQuery.MakeAExprNode( 123 | pgQuery.A_Expr_Kind_AEXPR_OP, 124 | []*pgQuery.Node{ 125 | pgQuery.MakeStrNode("="), 126 | }, 127 | pgQuery.MakeColumnRefNode([]*pgQuery.Node{ 128 | pgQuery.MakeStrNode("n"), 129 | pgQuery.MakeStrNode("oid"), 130 | }, 0), 131 | pgQuery.MakeColumnRefNode([]*pgQuery.Node{ 132 | pgQuery.MakeStrNode("c"), 133 | pgQuery.MakeStrNode("relnamespace"), 134 | }, 0), 135 | 0, 136 | ), 137 | ) 138 | 139 | if argumentNode.GetAConst() == nil { 140 | // NOTE: ::regclass::oid on non-constants is not fully supported yet 141 | return parser.utils.MakeNullNode() 142 | } 143 | 144 | value := argumentNode.GetAConst().GetSval().Sval 145 | qSchemaTable := NewQuerySchemaTableFromString(value) 146 | if qSchemaTable.Schema == "" { 147 | qSchemaTable.Schema = PG_SCHEMA_PUBLIC 148 | } 149 | 150 | whereNode := pgQuery.MakeBoolExprNode( 151 | pgQuery.BoolExprType_AND_EXPR, 152 | []*pgQuery.Node{ 153 | pgQuery.MakeAExprNode( 154 | pgQuery.A_Expr_Kind_AEXPR_OP, 155 | []*pgQuery.Node{ 156 | pgQuery.MakeStrNode("="), 157 | }, 158 | pgQuery.MakeColumnRefNode([]*pgQuery.Node{ 159 | pgQuery.MakeStrNode("n"), 160 | pgQuery.MakeStrNode("nspname"), 161 | }, 0), 162 | pgQuery.MakeAConstStrNode(qSchemaTable.Schema, 0), 163 | 0, 164 | ), 165 | pgQuery.MakeAExprNode( 166 | pgQuery.A_Expr_Kind_AEXPR_OP, 167 | []*pgQuery.Node{ 168 | pgQuery.MakeStrNode("="), 169 | }, 170 | pgQuery.MakeColumnRefNode([]*pgQuery.Node{ 171 | pgQuery.MakeStrNode("c"), 172 | pgQuery.MakeStrNode("relname"), 173 | }, 0), 174 | pgQuery.MakeAConstStrNode(qSchemaTable.Table, 0), 175 | 0, 176 | ), 177 | }, 178 | 0, 179 | ) 180 | 181 | return &pgQuery.Node{ 182 | Node: &pgQuery.Node_SubLink{ 183 | SubLink: &pgQuery.SubLink{ 184 | SubLinkType: pgQuery.SubLinkType_EXPR_SUBLINK, 185 | Subselect: &pgQuery.Node{ 186 | Node: &pgQuery.Node_SelectStmt{ 187 | SelectStmt: &pgQuery.SelectStmt{ 188 | TargetList: []*pgQuery.Node{targetNode}, 189 | FromClause: []*pgQuery.Node{joinNode}, 190 | WhereClause: whereNode, 191 | }, 192 | }, 193 | }, 194 | }, 195 | }, 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /src/parser_utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type ParserUtils struct { 8 | config *Config 9 | } 10 | 11 | func 
NewParserUtils(config *Config) *ParserUtils { 12 | return &ParserUtils{config: config} 13 | } 14 | 15 | func (utils *ParserUtils) SchemaFunction(functionCall *pgQuery.FuncCall) *QuerySchemaFunction { 16 | switch len(functionCall.Funcname) { 17 | case 1: 18 | return &QuerySchemaFunction{ 19 | Schema: "", 20 | Function: functionCall.Funcname[0].GetString_().Sval, 21 | } 22 | case 2: 23 | return &QuerySchemaFunction{ 24 | Schema: functionCall.Funcname[0].GetString_().Sval, 25 | Function: functionCall.Funcname[1].GetString_().Sval, 26 | } 27 | default: 28 | Panic(utils.config, "Invalid function call") 29 | return nil 30 | } 31 | } 32 | 33 | func (utils *ParserUtils) MakeSubselectFromNode(qSchemaTable QuerySchemaTable, targetList []*pgQuery.Node, fromNode *pgQuery.Node) *pgQuery.Node { 34 | alias := qSchemaTable.Alias 35 | if alias == "" { 36 | if qSchemaTable.Schema == PG_SCHEMA_PUBLIC || qSchemaTable.Schema == "" { 37 | alias = qSchemaTable.Table 38 | } else { 39 | alias = qSchemaTable.Schema + "_" + qSchemaTable.Table 40 | } 41 | } 42 | 43 | return &pgQuery.Node{ 44 | Node: &pgQuery.Node_RangeSubselect{ 45 | RangeSubselect: &pgQuery.RangeSubselect{ 46 | Subquery: &pgQuery.Node{ 47 | Node: &pgQuery.Node_SelectStmt{ 48 | SelectStmt: &pgQuery.SelectStmt{ 49 | TargetList: targetList, 50 | FromClause: []*pgQuery.Node{fromNode}, 51 | }, 52 | }, 53 | }, 54 | Alias: &pgQuery.Alias{ 55 | Aliasname: alias, 56 | }, 57 | }, 58 | }, 59 | } 60 | } 61 | 62 | func (utils *ParserUtils) MakeAConstBoolNode(val bool) *pgQuery.Node { 63 | return &pgQuery.Node{ 64 | Node: &pgQuery.Node_AConst{ 65 | AConst: &pgQuery.A_Const{ 66 | Val: &pgQuery.A_Const_Boolval{ 67 | Boolval: &pgQuery.Boolean{ 68 | Boolval: val, 69 | }, 70 | }, 71 | Isnull: false, 72 | Location: 0, 73 | }, 74 | }, 75 | } 76 | } 77 | 78 | func (utils *ParserUtils) MakeNullNode() *pgQuery.Node { 79 | return &pgQuery.Node{ 80 | Node: &pgQuery.Node_AConst{ 81 | AConst: &pgQuery.A_Const{ 82 | Isnull: true, 83 | }, 84 | }, 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/pg_constants.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | const ( 4 | PG_SCHEMA_INFORMATION_SCHEMA = "information_schema" 5 | PG_SCHEMA_PG_CATALOG = "pg_catalog" 6 | PG_SCHEMA_PUBLIC = "public" 7 | 8 | PG_FUNCTION_FORMAT = "format" 9 | PG_FUNCTION_ENCODE = "encode" 10 | 11 | PG_TABLE_PG_CLASS = "pg_class" 12 | PG_TABLE_PG_STAT_USER_TABLES = "pg_stat_user_tables" 13 | PG_TABLE_TABLES = "tables" 14 | 15 | PG_VAR_SEARCH_PATH = "search_path" 16 | ) 17 | 18 | var PG_SYSTEM_TABLES = NewSet([]string{ 19 | "pg_aggregate", 20 | "pg_am", 21 | "pg_amop", 22 | "pg_amproc", 23 | "pg_attrdef", 24 | "pg_attribute", 25 | "pg_auth_members", 26 | "pg_authid", 27 | "pg_cast", 28 | "pg_class", 29 | "pg_collation", 30 | "pg_constraint", 31 | "pg_conversion", 32 | "pg_database", 33 | "pg_db_role_setting", 34 | "pg_default_acl", 35 | "pg_depend", 36 | "pg_description", 37 | "pg_enum", 38 | "pg_event_trigger", 39 | "pg_extension", 40 | "pg_foreign_data_wrapper", 41 | "pg_foreign_server", 42 | "pg_foreign_table", 43 | "pg_index", 44 | "pg_inherits", 45 | "pg_init_privs", 46 | "pg_language", 47 | "pg_largeobject", 48 | "pg_largeobject_metadata", 49 | "pg_matviews", 50 | "pg_namespace", 51 | "pg_opclass", 52 | "pg_operator", 53 | "pg_opfamily", 54 | "pg_parameter_acl", 55 | "pg_partitioned_table", 56 | "pg_policy", 57 | "pg_proc", 58 | "pg_publication", 59 | "pg_publication_namespace", 60 | 
"pg_publication_rel", 61 | "pg_user", 62 | "pg_range", 63 | "pg_replication_origin", 64 | "pg_replication_slots", 65 | "pg_rewrite", 66 | "pg_roles", 67 | "pg_seclabel", 68 | "pg_sequence", 69 | "pg_shadow", 70 | "pg_shdepend", 71 | "pg_shdescription", 72 | "pg_shseclabel", 73 | "pg_statistic", 74 | "pg_statistic_ext", 75 | "pg_statistic_ext_data", 76 | "pg_subscription", 77 | "pg_subscription_rel", 78 | "pg_tablespace", 79 | "pg_transform", 80 | "pg_trigger", 81 | "pg_ts_config", 82 | "pg_ts_config_map", 83 | "pg_ts_dict", 84 | "pg_ts_parser", 85 | "pg_ts_template", 86 | "pg_type", 87 | "pg_user_mapping", 88 | "pg_views", 89 | }) 90 | 91 | var PG_SYSTEM_VIEWS = NewSet([]string{ 92 | "pg_stat_activity", 93 | "pg_stat_replication", 94 | "pg_stat_wal_receiver", 95 | "pg_stat_recovery_prefetch", 96 | "pg_stat_subscription", 97 | "pg_stat_ssl", 98 | "pg_stat_gssapi", 99 | "pg_stat_progress_analyze", 100 | "pg_stat_progress_create_index", 101 | "pg_stat_progress_vacuum", 102 | "pg_stat_progress_cluster", 103 | "pg_stat_progress_basebackup", 104 | "pg_stat_progress_copy", 105 | "pg_stat_archiver", 106 | "pg_stat_bgwriter", 107 | "pg_stat_checkpointer", 108 | "pg_stat_database", 109 | "pg_stat_database_conflicts", 110 | "pg_stat_io", 111 | "pg_stat_replication_slots", 112 | "pg_stat_slru", 113 | "pg_stat_subscription_stats", 114 | "pg_stat_wal", 115 | "pg_stat_all_tables", 116 | "pg_stat_sys_tables", 117 | "pg_stat_user_tables", 118 | "pg_stat_xact_all_tables", 119 | "pg_stat_xact_sys_tables", 120 | "pg_stat_xact_user_tables", 121 | "pg_stat_all_indexes", 122 | "pg_stat_sys_indexes", 123 | "pg_stat_user_indexes", 124 | "pg_stat_user_functions", 125 | "pg_stat_xact_user_functions", 126 | "pg_statio_all_tables", 127 | "pg_statio_sys_tables", 128 | "pg_statio_user_tables", 129 | "pg_statio_all_indexes", 130 | "pg_statio_sys_indexes", 131 | "pg_statio_user_indexes", 132 | "pg_statio_all_sequences", 133 | "pg_statio_sys_sequences", 134 | "pg_statio_user_sequences", 135 | }) 136 | -------------------------------------------------------------------------------- /src/postgres.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "net" 6 | 7 | "github.com/jackc/pgx/v5/pgproto3" 8 | ) 9 | 10 | const ( 11 | PG_VERSION = "17.0" 12 | PG_ENCODING = "UTF8" 13 | PG_TX_STATUS_IDLE = 'I' 14 | 15 | SYSTEM_AUTH_USER = "bemidb" 16 | ) 17 | 18 | type Postgres struct { 19 | backend *pgproto3.Backend 20 | conn *net.Conn 21 | config *Config 22 | } 23 | 24 | func NewPostgres(config *Config, conn *net.Conn) *Postgres { 25 | return &Postgres{ 26 | conn: conn, 27 | backend: pgproto3.NewBackend(*conn, *conn), 28 | config: config, 29 | } 30 | } 31 | 32 | func NewTcpListener(config *Config) net.Listener { 33 | parsedIp := net.ParseIP(config.Host) 34 | if parsedIp == nil { 35 | PrintErrorAndExit(config, "Invalid host: "+config.Host+".") 36 | } 37 | 38 | var network, host string 39 | if parsedIp.To4() == nil { 40 | network = "tcp6" 41 | host = "[" + config.Host + "]" 42 | } else { 43 | network = "tcp4" 44 | host = config.Host 45 | } 46 | 47 | tcpListener, err := net.Listen(network, host+":"+config.Port) 48 | PanicIfError(config, err) 49 | return tcpListener 50 | } 51 | 52 | func AcceptConnection(config *Config, listener net.Listener) net.Conn { 53 | conn, err := listener.Accept() 54 | PanicIfError(config, err) 55 | return conn 56 | } 57 | 58 | func (postgres *Postgres) Run(queryHandler *QueryHandler) { 59 | err := postgres.handleStartup() 60 | if 
err != nil { 61 | LogError(postgres.config, "Error handling startup:", err) 62 | return // Terminate connection 63 | } 64 | 65 | for { 66 | message, err := postgres.backend.Receive() 67 | if err != nil { 68 | return // Terminate connection 69 | } 70 | 71 | switch message := message.(type) { 72 | case *pgproto3.Query: 73 | postgres.handleSimpleQuery(queryHandler, message) 74 | case *pgproto3.Parse: 75 | err = postgres.handleExtendedQuery(queryHandler, message) 76 | if err != nil { 77 | return // Terminate connection 78 | } 79 | case *pgproto3.Terminate: 80 | LogDebug(postgres.config, "Client terminated connection") 81 | return 82 | default: 83 | LogError(postgres.config, "Received message other than Query from client:", message) 84 | return // Terminate connection 85 | } 86 | } 87 | } 88 | 89 | func (postgres *Postgres) Close() error { 90 | return (*postgres.conn).Close() 91 | } 92 | 93 | func (postgres *Postgres) handleSimpleQuery(queryHandler *QueryHandler, queryMessage *pgproto3.Query) { 94 | LogDebug(postgres.config, "Received query:", queryMessage.String) 95 | messages, err := queryHandler.HandleSimpleQuery(queryMessage.String) 96 | if err != nil { 97 | postgres.writeError(err) 98 | return 99 | } 100 | messages = append(messages, &pgproto3.ReadyForQuery{TxStatus: PG_TX_STATUS_IDLE}) 101 | postgres.writeMessages(messages...) 102 | } 103 | 104 | func (postgres *Postgres) handleExtendedQuery(queryHandler *QueryHandler, parseMessage *pgproto3.Parse) error { 105 | LogDebug(postgres.config, "Parsing query", parseMessage.Query) 106 | messages, preparedStatement, err := queryHandler.HandleParseQuery(parseMessage) 107 | if err != nil { 108 | postgres.writeError(err) 109 | return nil 110 | } 111 | postgres.writeMessages(messages...) 112 | 113 | var previousErr error 114 | for { 115 | message, err := postgres.backend.Receive() 116 | if err != nil { 117 | return err 118 | } 119 | 120 | switch message := message.(type) { 121 | case *pgproto3.Bind: 122 | if previousErr != nil { // Skip processing the next message if there was an error in the previous message 123 | continue 124 | } 125 | 126 | LogDebug(postgres.config, "Binding query", message.PreparedStatement) 127 | messages, preparedStatement, err = queryHandler.HandleBindQuery(message, preparedStatement) 128 | if err != nil { 129 | postgres.writeError(err) 130 | previousErr = err 131 | } 132 | postgres.writeMessages(messages...) 133 | case *pgproto3.Describe: 134 | if previousErr != nil { // Skip processing the next message if there was an error in the previous message 135 | continue 136 | } 137 | 138 | LogDebug(postgres.config, "Describing query", message.Name, "("+string(message.ObjectType)+")") 139 | var messages []pgproto3.Message 140 | messages, preparedStatement, err = queryHandler.HandleDescribeQuery(message, preparedStatement) 141 | if err != nil { 142 | postgres.writeError(err) 143 | previousErr = err 144 | } 145 | postgres.writeMessages(messages...) 146 | case *pgproto3.Execute: 147 | if previousErr != nil { // Skip processing the next message if there was an error in the previous message 148 | continue 149 | } 150 | 151 | LogDebug(postgres.config, "Executing query", message.Portal) 152 | messages, err := queryHandler.HandleExecuteQuery(message, preparedStatement) 153 | if err != nil { 154 | postgres.writeError(err) 155 | previousErr = err 156 | } 157 | postgres.writeMessages(messages...) 
158 | case *pgproto3.Sync: 159 | LogDebug(postgres.config, "Syncing query") 160 | postgres.writeMessages( 161 | &pgproto3.ReadyForQuery{TxStatus: PG_TX_STATUS_IDLE}, 162 | ) 163 | 164 | // If there was an error or Parse->Bind->Sync (...) or Parse->Describe->Sync (e.g., Metabase) 165 | // it means that sync is the last message in the extended query protocol, we can exit handleExtendedQuery 166 | if previousErr != nil || preparedStatement.Bound || preparedStatement.Described { 167 | return nil 168 | } 169 | // Otherwise, wait for Bind/Describe/Execute/Sync. 170 | // For example, psycopg sends Parse->[extra Sync]->Bind->Describe->Execute->Sync 171 | } 172 | } 173 | } 174 | 175 | func (postgres *Postgres) writeMessages(messages ...pgproto3.Message) { 176 | var buf []byte 177 | for _, message := range messages { 178 | buf, _ = message.Encode(buf) 179 | } 180 | (*postgres.conn).Write(buf) 181 | } 182 | 183 | func (postgres *Postgres) writeError(err error) { 184 | LogError(postgres.config, err.Error()) 185 | 186 | postgres.writeMessages( 187 | &pgproto3.ErrorResponse{ 188 | Severity: "ERROR", 189 | Message: err.Error(), 190 | }, 191 | &pgproto3.ReadyForQuery{TxStatus: PG_TX_STATUS_IDLE}, 192 | ) 193 | } 194 | 195 | func (postgres *Postgres) handleStartup() error { 196 | startupMessage, err := postgres.backend.ReceiveStartupMessage() 197 | if err != nil { 198 | return err 199 | } 200 | 201 | switch startupMessage := startupMessage.(type) { 202 | case *pgproto3.StartupMessage: 203 | params := startupMessage.Parameters 204 | LogDebug(postgres.config, "BemiDB: startup message", params) 205 | 206 | if params["database"] != postgres.config.Database { 207 | postgres.writeError(errors.New("database " + params["database"] + " does not exist")) 208 | return errors.New("database does not exist") 209 | } 210 | 211 | if postgres.config.User != "" && params["user"] != postgres.config.User && params["user"] != SYSTEM_AUTH_USER { 212 | postgres.writeError(errors.New("role \"" + params["user"] + "\" does not exist")) 213 | return errors.New("role does not exist") 214 | } 215 | 216 | postgres.writeMessages( 217 | &pgproto3.AuthenticationOk{}, 218 | &pgproto3.ParameterStatus{Name: "client_encoding", Value: PG_ENCODING}, 219 | &pgproto3.ParameterStatus{Name: "server_version", Value: PG_VERSION}, 220 | &pgproto3.ReadyForQuery{TxStatus: PG_TX_STATUS_IDLE}, 221 | ) 222 | return nil 223 | case *pgproto3.SSLRequest: 224 | _, err = (*postgres.conn).Write([]byte("N")) 225 | if err != nil { 226 | return err 227 | } 228 | postgres.handleStartup() 229 | return nil 230 | default: 231 | return errors.New("unknown startup message") 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/query_remapper_expression.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "strings" 5 | 6 | pgQuery "github.com/pganalyze/pg_query_go/v5" 7 | ) 8 | 9 | type QueryRemapperExpression struct { 10 | parserTypeCast *ParserTypeCast 11 | parserColumnRef *ParserColumnRef 12 | parserAExpr *ParserAExpr 13 | config *Config 14 | } 15 | 16 | func NewQueryRemapperExpression(config *Config) *QueryRemapperExpression { 17 | remapper := &QueryRemapperExpression{ 18 | parserTypeCast: NewParserTypeCast(config), 19 | parserColumnRef: NewParserColumnRef(config), 20 | parserAExpr: NewParserAExpr(config), 21 | config: config, 22 | } 23 | return remapper 24 | } 25 | 26 | func (remapper *QueryRemapperExpression) RemappedExpression(node 
*pgQuery.Node) *pgQuery.Node { 27 | node = remapper.remappedTypeCast(node) 28 | node = remapper.remappedArithmeticExpression(node) 29 | node = remapper.remappedCollateClause(node) 30 | remapper.remapColumnReference(node) 31 | 32 | return node 33 | } 34 | 35 | // value::type or CAST(value AS type) 36 | func (remapper *QueryRemapperExpression) remappedTypeCast(node *pgQuery.Node) *pgQuery.Node { 37 | typeCast := remapper.parserTypeCast.TypeCast(node) 38 | if typeCast == nil { 39 | return node 40 | } 41 | 42 | remapper.parserTypeCast.RemovePgCatalog(typeCast) 43 | typeName := remapper.parserTypeCast.TypeName(typeCast) 44 | 45 | switch typeName { 46 | case "text[]": 47 | // '{a,b,c}'::text[] -> ARRAY['a', 'b', 'c'] 48 | return remapper.parserTypeCast.MakeListValueFromArray(typeCast.Arg) 49 | case "regproc": 50 | // 'schema.function_name'::regproc -> 'function_name' 51 | nameParts := strings.Split(remapper.parserTypeCast.ArgStringValue(typeCast), ".") 52 | return pgQuery.MakeAConstStrNode(nameParts[len(nameParts)-1], 0) 53 | case "regclass": 54 | // 'schema.table'::regclass -> SELECT c.oid FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'schema' AND c.relname = 'table' 55 | return remapper.parserTypeCast.MakeSubselectOidBySchemaTableArg(typeCast.Arg) 56 | case "oid": 57 | // 'schema.table'::regclass::oid -> SELECT c.oid FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = 'schema' AND c.relname = 'table' 58 | nestedTypeCast := remapper.parserTypeCast.NestedTypeCast(typeCast) 59 | remapper.parserTypeCast.RemovePgCatalog(nestedTypeCast) 60 | nestedTypeName := remapper.parserTypeCast.TypeName(nestedTypeCast) 61 | if nestedTypeName != "regclass" { 62 | return node 63 | } 64 | return remapper.parserTypeCast.MakeSubselectOidBySchemaTableArg(nestedTypeCast.Arg) 65 | case "text": 66 | // value::(regtype|regnamespace|regclass)::text -> value::text 67 | nestedTypeCast := remapper.parserTypeCast.NestedTypeCast(typeCast) 68 | remapper.parserTypeCast.RemovePgCatalog(nestedTypeCast) 69 | nestedTypeName := remapper.parserTypeCast.TypeName(nestedTypeCast) 70 | if nestedTypeName != "regtype" && nestedTypeName != "regnamespace" && nestedTypeName != "regclass" { 71 | return node 72 | } 73 | remapper.parserTypeCast.SetTypeCastArg(typeCast, nestedTypeCast.Arg) 74 | } 75 | 76 | return node 77 | } 78 | 79 | func (remapper *QueryRemapperExpression) remappedArithmeticExpression(node *pgQuery.Node) *pgQuery.Node { 80 | aExpr := remapper.parserAExpr.AExpr(node) 81 | if aExpr == nil { 82 | return node 83 | } 84 | 85 | // = ANY({schema_information}) -> IN (schema_information) 86 | node = remapper.parserAExpr.ConvertedRightAnyToIn(node) 87 | 88 | // pg_catalog.[operator] -> [operator] 89 | remapper.parserAExpr.RemovePgCatalog(node) 90 | 91 | return node 92 | } 93 | 94 | // public.table.column -> table.column 95 | // schema.table.column -> schema_table.column 96 | func (remapper *QueryRemapperExpression) remapColumnReference(node *pgQuery.Node) { 97 | fieldNames := remapper.parserColumnRef.FieldNames(node) 98 | if fieldNames == nil || len(fieldNames) != 3 { 99 | return 100 | } 101 | 102 | schema := fieldNames[0] 103 | if schema == PG_SCHEMA_PG_CATALOG || schema == PG_SCHEMA_INFORMATION_SCHEMA { 104 | return 105 | } 106 | 107 | table := fieldNames[1] 108 | column := fieldNames[2] 109 | if schema == PG_SCHEMA_PUBLIC { 110 | remapper.parserColumnRef.SetFields(node, []string{table, column}) 111 | return 112 | } 113 | 114 | remapper.parserColumnRef.SetFields(node, 
[]string{schema + "_" + table, column}) 115 | } 116 | 117 | // "value" COLLATE pg_catalog.default -> "value" 118 | func (remapper *QueryRemapperExpression) remappedCollateClause(node *pgQuery.Node) *pgQuery.Node { 119 | if node.GetCollateClause() == nil { 120 | return node 121 | } 122 | 123 | return remapper.parserTypeCast.RemovedDefaultCollateClause(node) 124 | } 125 | -------------------------------------------------------------------------------- /src/query_remapper_function.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | 7 | pgQuery "github.com/pganalyze/pg_query_go/v5" 8 | ) 9 | 10 | const ( 11 | BEMIDB_FUNCTION_LAST_SYNCED_AT = "bemidb_last_synced_at" 12 | ) 13 | 14 | var PG_CATALOG_MACRO_FUNCTION_NAMES = Set[string]{} 15 | var PG_INFORMATION_SCHEMA_MACRO_FUNCTION_NAMES = Set[string]{} 16 | 17 | func CreatePgCatalogMacroQueries(config *Config) []string { 18 | result := []string{ 19 | // Functions 20 | "CREATE MACRO aclexplode(aclitem_array) AS json(aclitem_array)", 21 | "CREATE MACRO current_setting(setting_name) AS '', (setting_name, missing_ok) AS ''", 22 | "CREATE MACRO pg_backend_pid() AS 0", 23 | "CREATE MACRO pg_encoding_to_char(encoding_int) AS 'UTF8'", 24 | "CREATE MACRO pg_get_expr(pg_node_tree, relation_oid) AS pg_catalog.pg_get_expr(pg_node_tree, relation_oid), (pg_node_tree, relation_oid, pretty_bool) AS pg_catalog.pg_get_expr(pg_node_tree, relation_oid)", 25 | "CREATE MACRO pg_get_function_identity_arguments(func_oid) AS ''", 26 | "CREATE MACRO pg_get_indexdef(index_oid) AS '', (index_oid, column_int) AS '', (index_oid, column_int, pretty_bool) AS ''", 27 | "CREATE MACRO pg_get_partkeydef(table_oid) AS ''", 28 | "CREATE MACRO pg_get_userbyid(role_id) AS 'bemidb'", 29 | "CREATE MACRO pg_get_viewdef(view_oid) AS pg_catalog.pg_get_viewdef(view_oid), (view_oid, pretty_bool) AS pg_catalog.pg_get_viewdef(view_oid)", 30 | "CREATE MACRO pg_indexes_size(regclass) AS 0", 31 | "CREATE MACRO pg_is_in_recovery() AS false", 32 | "CREATE MACRO pg_table_size(regclass) AS 0", 33 | "CREATE MACRO pg_tablespace_location(tablespace_oid) AS ''", 34 | "CREATE MACRO pg_total_relation_size(regclass) AS 0", 35 | "CREATE MACRO quote_ident(text) AS '\"' || text || '\"'", 36 | "CREATE MACRO row_to_json(record) AS to_json(record), (record, pretty_bool) AS to_json(record)", 37 | "CREATE MACRO set_config(setting_name, new_value, is_local) AS new_value", 38 | "CREATE MACRO version() AS 'PostgreSQL " + PG_VERSION + ", compiled by BemiDB'", 39 | "CREATE MACRO pg_get_statisticsobjdef_columns(oid) AS NULL", 40 | "CREATE MACRO pg_relation_is_publishable(val) AS NULL", 41 | `CREATE MACRO jsonb_extract_path_text(from_json, path_elems) AS 42 | CASE typeof(path_elems) LIKE '%[]' 43 | WHEN true THEN json_extract_path_text(from_json, path_elems)[1]::varchar 44 | ELSE json_extract_path_text(from_json, path_elems)::varchar 45 | END`, 46 | `CREATE MACRO json_build_object(k1, v1) AS json_object(k1, v1), 47 | (k1, v1, k2, v2) AS json_object(k1, v1, k2, v2), 48 | (k1, v1, k2, v2, k3, v3) AS json_object(k1, v1, k2, v2, k3, v3), 49 | (k1, v1, k2, v2, k3, v3, k4, v4) AS json_object(k1, v1, k2, v2, k3, v3, k4, v4)`, 50 | `CREATE MACRO array_upper(arr, dimension) AS 51 | CASE dimension 52 | WHEN 1 THEN len(arr) 53 | ELSE NULL 54 | END`, 55 | 56 | // Table functions 57 | "CREATE MACRO pg_is_in_recovery() AS TABLE SELECT false AS pg_is_in_recovery", 58 | `CREATE MACRO pg_show_all_settings() AS TABLE SELECT 59 | name, 60 | 
value AS setting, 61 | NULL::text AS unit, 62 | 'Settings' AS category, 63 | description AS short_desc, 64 | NULL::text AS extra_desc, 65 | 'user' AS context, 66 | input_type AS vartype, 67 | 'default' AS source, 68 | NULL::int4 AS min_val, 69 | NULL::int4 AS max_val, 70 | NULL::text[] AS enumvals, 71 | value AS boot_val, 72 | value AS reset_val, 73 | NULL::text AS sourcefile, 74 | NULL::int4 AS sourceline, 75 | FALSE AS pending_restart 76 | FROM duckdb_settings()`, 77 | `CREATE MACRO pg_get_keywords() AS TABLE SELECT 78 | keyword_name AS word, 79 | 'U' AS catcode, 80 | TRUE AS barelabel, 81 | keyword_category AS catdesc, 82 | 'can be bare label' AS baredesc 83 | FROM duckdb_keywords()`, 84 | } 85 | PG_CATALOG_MACRO_FUNCTION_NAMES = extractMacroNames(result) 86 | return result 87 | } 88 | 89 | func CreateInformationSchemaMacroQueries(config *Config) []string { 90 | result := []string{ 91 | "CREATE MACRO _pg_expandarray(arr) AS STRUCT_PACK(x := unnest(arr), n := unnest(generate_series(1, array_length(arr))))", 92 | } 93 | PG_INFORMATION_SCHEMA_MACRO_FUNCTION_NAMES = extractMacroNames(result) 94 | return result 95 | } 96 | 97 | var BUILTIN_DUCKDB_PG_FUNCTION_NAMES = NewSet([]string{ 98 | "array_to_string", 99 | "generate_series", 100 | }) 101 | 102 | type QueryRemapperFunction struct { 103 | parserFunction *ParserFunction 104 | icebergReader *IcebergReader 105 | config *Config 106 | } 107 | 108 | func NewQueryRemapperFunction(config *Config, icebergReader *IcebergReader) *QueryRemapperFunction { 109 | return &QueryRemapperFunction{ 110 | parserFunction: NewParserFunction(config), 111 | icebergReader: icebergReader, 112 | config: config, 113 | } 114 | } 115 | 116 | func (remapper *QueryRemapperFunction) SchemaFunction(functionCall *pgQuery.FuncCall) *QuerySchemaFunction { 117 | return remapper.parserFunction.SchemaFunction(functionCall) 118 | } 119 | 120 | // FUNCTION(...) -> ANOTHER_FUNCTION(...) 121 | func (remapper *QueryRemapperFunction) RemapFunctionCall(functionCall *pgQuery.FuncCall) *QuerySchemaFunction { 122 | schemaFunction := remapper.SchemaFunction(functionCall) 123 | 124 | // Pre-defined macro functions 125 | switch schemaFunction.Schema { 126 | 127 | // pg_catalog.func() -> main.func() 128 | case PG_SCHEMA_PG_CATALOG, "": 129 | if PG_CATALOG_MACRO_FUNCTION_NAMES.Contains(schemaFunction.Function) || BUILTIN_DUCKDB_PG_FUNCTION_NAMES.Contains(schemaFunction.Function) { 130 | remapper.parserFunction.RemapSchemaToMain(functionCall) 131 | return schemaFunction 132 | } 133 | 134 | // information_schema.func() -> main.func() 135 | case PG_SCHEMA_INFORMATION_SCHEMA: 136 | if PG_INFORMATION_SCHEMA_MACRO_FUNCTION_NAMES.Contains(schemaFunction.Function) { 137 | remapper.parserFunction.RemapSchemaToMain(functionCall) 138 | return schemaFunction 139 | } 140 | } 141 | 142 | switch { 143 | 144 | // format('%s %1$s', str) -> printf('%1$s %1$s', str) 145 | case schemaFunction.Function == PG_FUNCTION_FORMAT: 146 | remapper.parserFunction.RemapFormatToPrintf(functionCall) 147 | return schemaFunction 148 | 149 | // encode(sha256(...), 'hex') -> sha256(...) 
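// e.g., SELECT encode(sha256(col), 'hex') becomes SELECT sha256(col); DuckDB's sha256() already returns a hex-encoded string, so the encode() wrapper can be dropped.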
150 | case schemaFunction.Function == PG_FUNCTION_ENCODE: 151 | remapper.parserFunction.RemoveEncode(functionCall) 152 | return schemaFunction 153 | 154 | // bemidb_last_synced_at('schema.table') -> to_timestamp(internalTableMetadata.LastSyncedAt) 155 | case schemaFunction.Function == BEMIDB_FUNCTION_LAST_SYNCED_AT: 156 | schemaTableName := remapper.parserFunction.FirstArgumentToString(functionCall) 157 | schemaTableParts := strings.Split(schemaTableName, ".") 158 | var pgSchemaTable PgSchemaTable 159 | if len(schemaTableParts) == 2 { 160 | pgSchemaTable.Schema = schemaTableParts[0] 161 | pgSchemaTable.Table = schemaTableParts[1] 162 | } else { 163 | pgSchemaTable.Schema = PG_SCHEMA_PUBLIC 164 | pgSchemaTable.Table = schemaTableParts[0] 165 | } 166 | 167 | internalTableMetadata, err := remapper.icebergReader.InternalTableMetadata(pgSchemaTable) 168 | 169 | if err != nil { 170 | LogError(remapper.config, "Failed to get internal table metadata for %s: %v", pgSchemaTable, err) 171 | remapper.parserFunction.RemapToTimestamp(functionCall, 0) 172 | } else { 173 | remapper.parserFunction.RemapToTimestamp(functionCall, internalTableMetadata.LastSyncedAt) 174 | } 175 | 176 | return schemaFunction 177 | } 178 | 179 | return nil 180 | } 181 | 182 | func (remapper *QueryRemapperFunction) RemapNestedFunctionCalls(functionCall *pgQuery.FuncCall) { 183 | nestedFunctionCalls := remapper.parserFunction.NestedFunctionCalls(functionCall) 184 | if len(nestedFunctionCalls) == 0 { 185 | return 186 | } 187 | 188 | for _, nestedFunctionCall := range nestedFunctionCalls { 189 | if nestedFunctionCall == nil { 190 | continue 191 | } 192 | 193 | schemaFunction := remapper.RemapFunctionCall(nestedFunctionCall) 194 | if schemaFunction != nil { 195 | continue 196 | } 197 | 198 | remapper.RemapNestedFunctionCalls(nestedFunctionCall) // self-recursion 199 | } 200 | } 201 | 202 | func extractMacroNames(macros []string) Set[string] { 203 | names := make(Set[string]) 204 | re := regexp.MustCompile(`CREATE MACRO (\w+)\(`) 205 | 206 | for _, macro := range macros { 207 | matches := re.FindStringSubmatch(macro) 208 | if len(matches) > 1 { 209 | names.Add(matches[1]) 210 | } 211 | } 212 | 213 | return names 214 | } 215 | -------------------------------------------------------------------------------- /src/query_remapper_select.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type QueryRemapperSelect struct { 8 | parserSelect *ParserSelect 9 | parserFunction *ParserFunction 10 | config *Config 11 | } 12 | 13 | func NewQueryRemapperSelect(config *Config) *QueryRemapperSelect { 14 | return &QueryRemapperSelect{ 15 | parserSelect: NewParserSelect(config), 16 | parserFunction: NewParserFunction(config), 17 | config: config, 18 | } 19 | } 20 | 21 | // SELECT FUNCTION(...) -> SELECT FUNCTION(...) AS FUNCTION 22 | func (remapper *QueryRemapperSelect) SetDefaultTargetNameToFunctionName(targetNode *pgQuery.Node) *pgQuery.Node { 23 | functionCall := remapper.parserFunction.FunctionCall(targetNode) 24 | if functionCall != nil { 25 | schemaFunction := remapper.parserFunction.SchemaFunction(functionCall) 26 | // FUNCTION(...) -> FUNCTION(...) 
AS FUNCTION 27 | remapper.parserSelect.SetDefaultTargetName(targetNode, schemaFunction.Function) 28 | return targetNode 29 | } 30 | 31 | indirectionName := remapper.parserFunction.IndirectionName(targetNode) 32 | if indirectionName != "" { 33 | // (FUNCTION()).n -> (FUNCTION()).n AS n 34 | remapper.parserSelect.SetDefaultTargetName(targetNode, indirectionName) 35 | return targetNode 36 | } 37 | 38 | return targetNode 39 | } 40 | -------------------------------------------------------------------------------- /src/query_remapper_show.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pgQuery "github.com/pganalyze/pg_query_go/v5" 5 | ) 6 | 7 | type QueryRemapperShow struct { 8 | config *Config 9 | parserShow *ParserShow 10 | } 11 | 12 | func NewQueryRemapperShow(config *Config) *QueryRemapperShow { 13 | return &QueryRemapperShow{ 14 | config: config, 15 | parserShow: NewParserShow(config), 16 | } 17 | } 18 | 19 | func (remapper *QueryRemapperShow) RemapShowStatement(stmt *pgQuery.RawStmt) *pgQuery.RawStmt { 20 | parser := remapper.parserShow 21 | variableName := parser.VariableName(stmt) 22 | 23 | // SHOW var -> SELECT value AS var FROM duckdb_settings() WHERE LOWER(name) = 'var'; 24 | newStmt := parser.MakeSelectFromDuckdbSettings(variableName) 25 | 26 | // SELECT value AS search_path -> SELECT CONCAT('"$user", ', value) AS search_path 27 | if variableName == PG_VAR_SEARCH_PATH { 28 | parser.SetTargetListForSearchPath(newStmt) 29 | } 30 | 31 | return newStmt 32 | } 33 | -------------------------------------------------------------------------------- /src/storage_interface.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | ) 7 | 8 | type RefreshMode string 9 | 10 | const ( 11 | RefreshModeFull RefreshMode = "FULL" 12 | RefreshModeFullInProgress RefreshMode = "FULL_IN_PROGRESS" 13 | RefreshModeIncremental RefreshMode = "INCREMENTAL" 14 | RefreshModeIncrementalInProgress RefreshMode = "INCREMENTAL_IN_PROGRESS" 15 | 16 | INTERNAL_START_SQL_FILE_NAME = "bemidb-start.sql" 17 | ) 18 | 19 | type ParquetFileStats struct { 20 | ColumnSizes map[int]int64 21 | ValueCounts map[int]int64 22 | NullValueCounts map[int]int64 23 | LowerBounds map[int][]byte 24 | UpperBounds map[int][]byte 25 | SplitOffsets []int64 26 | } 27 | 28 | type ParquetFile struct { 29 | Uuid string 30 | Path string 31 | Size int64 32 | RecordCount int64 33 | Stats ParquetFileStats 34 | } 35 | 36 | type ManifestFile struct { 37 | RecordsDeleted bool 38 | SnapshotId int64 39 | Path string 40 | Size int64 41 | RecordCount int64 42 | DataFileSize int64 43 | } 44 | 45 | type ManifestListItem struct { 46 | SequenceNumber int 47 | ManifestFile ManifestFile 48 | } 49 | 50 | type ManifestListFile struct { 51 | SequenceNumber int 52 | SnapshotId int64 53 | TimestampMs int64 54 | Path string 55 | Operation string 56 | AddedFilesSize int64 57 | AddedDataFiles int64 58 | AddedRecords int64 59 | RemovedFilesSize int64 60 | DeletedDataFiles int64 61 | DeletedRecords int64 62 | } 63 | 64 | type MetadataFile struct { 65 | Version int64 66 | Path string 67 | } 68 | 69 | type InternalTableMetadata struct { 70 | LastRefreshMode RefreshMode `json:"last-refresh-mode"` 71 | LastSyncedAt int64 `json:"last-synced-at"` 72 | LastTxid int64 `json:"last-txid"` 73 | MaxXmin *uint32 `json:"max-xmin"` 74 | } 75 | 76 | func (internalTableMetadata InternalTableMetadata) IsInProgress() bool { 77 | return 
internalTableMetadata.LastRefreshMode == RefreshModeIncrementalInProgress || internalTableMetadata.LastRefreshMode == RefreshModeFullInProgress 78 | } 79 | 80 | func (internalTableMetadata InternalTableMetadata) MaxXminString() string { 81 | if internalTableMetadata.MaxXmin == nil { 82 | panic("MaxXmin is unexpectedly null. " + internalTableMetadata.String()) 83 | } 84 | return Uint32ToString(*internalTableMetadata.MaxXmin) 85 | } 86 | 87 | func (internalTableMetadata InternalTableMetadata) LastWrappedAroundTxidString() string { 88 | return Int64ToString(PgWraparoundTxid(internalTableMetadata.LastTxid)) 89 | } 90 | 91 | func (internalTableMetadata InternalTableMetadata) String() string { 92 | maxXmin := "null" 93 | if internalTableMetadata.MaxXmin != nil { 94 | maxXmin = Uint32ToString(*internalTableMetadata.MaxXmin) 95 | } 96 | 97 | return fmt.Sprintf( 98 | "LastRefreshMode: %s, LastSyncedAt: %d, MaxXmin: %s", 99 | internalTableMetadata.LastRefreshMode, 100 | internalTableMetadata.LastSyncedAt, 101 | maxXmin, 102 | ) 103 | } 104 | 105 | type StorageInterface interface { 106 | // Read 107 | IcebergSchemas() (icebergSchemas []string, err error) 108 | IcebergSchemaTables() (icebersSchemaTables Set[IcebergSchemaTable], err error) 109 | IcebergMetadataFilePath(icebergSchemaTable IcebergSchemaTable) (path string) 110 | IcebergTableFields(icebergSchemaTable IcebergSchemaTable) (icebergTableFields []IcebergTableField, err error) 111 | ExistingManifestListFiles(metadataDirPath string) (manifestListFilesSortedAsc []ManifestListFile, err error) 112 | ExistingManifestListItems(manifestListFile ManifestListFile) (manifestListItemsSortedDesc []ManifestListItem, err error) 113 | ExistingParquetFilePath(manifestFile ManifestFile) (parquetFilePath string, err error) 114 | 115 | // Write 116 | DeleteSchema(schema string) (err error) 117 | DeleteSchemaTable(schemaTable IcebergSchemaTable) (err error) 118 | CreateDataDir(schemaTable IcebergSchemaTable) (dataDirPath string) 119 | CreateMetadataDir(schemaTable IcebergSchemaTable) (metadataDirPath string) 120 | CreateParquet(dataDirPath string, pgSchemaColumns []PgSchemaColumn, maxPayloadThreshold int, loadRows func() ([][]string, InternalTableMetadata)) (parquetFile ParquetFile, internalTableMetadata InternalTableMetadata, err error) 121 | CreateOverwrittenParquet(dataDirPath string, existingParquetFilePath string, newParquetFilePath string, pgSchemaColumns []PgSchemaColumn, dynamicRowCountPerBatch int) (overwrittenParquetFile ParquetFile, err error) 122 | DeleteParquet(parquetFile ParquetFile) (err error) 123 | CreateManifest(metadataDirPath string, parquetFile ParquetFile) (manifestFile ManifestFile, err error) 124 | CreateDeletedRecordsManifest(metadataDirPath string, uuid string, existingManifestFile ManifestFile) (deletedRecsManifestFile ManifestFile, err error) 125 | CreateManifestList(metadataDirPath string, parquetFileUuid string, manifestListItemsSortedDesc []ManifestListItem) (manifestListFile ManifestListFile, err error) 126 | CreateMetadata(metadataDirPath string, pgSchemaColumns []PgSchemaColumn, manifestListFilesSortedAsc []ManifestListFile) (metadataFile MetadataFile, err error) 127 | 128 | // Read (internal) 129 | InternalStartSqlFile() (sqlFile io.ReadCloser) 130 | InternalTableMetadata(pgSchemaTable PgSchemaTable) (internalTableMetadata InternalTableMetadata, err error) 131 | // Write (internal) 132 | WriteInternalStartSqlFile(queries []string) (err error) 133 | WriteInternalTableMetadata(metadataDirPath string, internalTableMetadata 
InternalTableMetadata) (err error) 134 | } 135 | 136 | func NewStorage(config *Config) StorageInterface { 137 | switch config.StorageType { 138 | case STORAGE_TYPE_LOCAL: 139 | return NewLocalStorage(config) 140 | case STORAGE_TYPE_S3: 141 | return NewS3Storage(config) 142 | } 143 | 144 | return nil 145 | } 146 | -------------------------------------------------------------------------------- /src/storage_local_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/binary" 5 | "os" 6 | "testing" 7 | ) 8 | 9 | var TEST_STORAGE_PG_SCHEMA_COLUMNS = []PgSchemaColumn{ 10 | {ColumnName: "id", DataType: "integer", UdtName: "int4", IsNullable: "NO", NumericPrecision: "32", OrdinalPosition: "1", Namespace: "pg_catalog"}, 11 | {ColumnName: "name", DataType: "character varying", UdtName: "varchar", IsNullable: "YES", CharacterMaximumLength: "255", OrdinalPosition: "2", Namespace: "pg_catalog"}, 12 | } 13 | var TEST_STORAGE_ROWS = [][]string{ 14 | {"1", "John"}, 15 | {"2", PG_NULL_STRING}, 16 | } 17 | 18 | func TestCreateParquet(t *testing.T) { 19 | t.Run("Creates a parquet file", func(t *testing.T) { 20 | tempDir := os.TempDir() 21 | config := loadTestConfig() 22 | storage := NewLocalStorage(config) 23 | loadedRows := false 24 | loadRows := func() ([][]string, InternalTableMetadata) { 25 | if loadedRows { 26 | return [][]string{}, InternalTableMetadata{} 27 | } 28 | loadedRows = true 29 | return TEST_STORAGE_ROWS, InternalTableMetadata{} 30 | } 31 | 32 | parquetFile, _, err := storage.CreateParquet(tempDir, TEST_STORAGE_PG_SCHEMA_COLUMNS, 0, loadRows) 33 | 34 | if err != nil { 35 | t.Errorf("Expected no error, got %v", err) 36 | } 37 | if parquetFile.Uuid == "" { 38 | t.Errorf("Expected a non-empty UUID, got %v", parquetFile.Uuid) 39 | } 40 | if parquetFile.Path == "" { 41 | t.Errorf("Expected a non-empty path, got %v", parquetFile.Path) 42 | } 43 | if parquetFile.Size == 0 { 44 | t.Errorf("Expected a non-zero size, got %v", parquetFile.Size) 45 | } 46 | if parquetFile.RecordCount != 2 { 47 | t.Errorf("Expected a record count of 2, got %v", parquetFile.RecordCount) 48 | } 49 | if len(parquetFile.Stats.ColumnSizes) != 2 { 50 | t.Errorf("Expected 2 column sizes, got %v", len(parquetFile.Stats.ColumnSizes)) 51 | } 52 | if parquetFile.Stats.ColumnSizes[1] == 0 { 53 | t.Errorf("Expected a non-zero column size, got %v", parquetFile.Stats.ColumnSizes[1]) 54 | } 55 | if parquetFile.Stats.ColumnSizes[2] == 0 { 56 | t.Errorf("Expected a non-zero column size, got %v", parquetFile.Stats.ColumnSizes[2]) 57 | } 58 | if parquetFile.Stats.ValueCounts[1] != 2 { 59 | t.Errorf("Expected a value count of 2, got %v", parquetFile.Stats.ValueCounts[1]) 60 | } 61 | if parquetFile.Stats.ValueCounts[2] != 2 { 62 | t.Errorf("Expected a value count of 2, got %v", parquetFile.Stats.ValueCounts[2]) 63 | } 64 | if parquetFile.Stats.NullValueCounts[1] != 0 { 65 | t.Errorf("Expected a null value count of 0, got %v", parquetFile.Stats.NullValueCounts[1]) 66 | } 67 | if parquetFile.Stats.NullValueCounts[2] != 1 { 68 | t.Errorf("Expected a null value count of 1, got %v", parquetFile.Stats.NullValueCounts[2]) 69 | } 70 | if binary.LittleEndian.Uint32(parquetFile.Stats.LowerBounds[1]) != 1 { 71 | t.Errorf("Expected a lower bound of 1, got %v", binary.LittleEndian.Uint32(parquetFile.Stats.LowerBounds[1])) 72 | } 73 | if string(parquetFile.Stats.LowerBounds[2]) != "John" { 74 | t.Errorf("Expected a lower bound of John, got %v",
parquetFile.Stats.LowerBounds[2]) 75 | } 76 | if binary.LittleEndian.Uint32(parquetFile.Stats.UpperBounds[1]) != 2 { 77 | t.Errorf("Expected an upper bound of 2, got %v", binary.LittleEndian.Uint32(parquetFile.Stats.UpperBounds[1])) 78 | } 79 | if string(parquetFile.Stats.UpperBounds[2]) != "John" { 80 | t.Errorf("Expected an upper bound of John, got %v", parquetFile.Stats.UpperBounds[2]) 81 | } 82 | if len(parquetFile.Stats.SplitOffsets) != 0 { 83 | t.Errorf("Expected 0 split offsets, got %v", len(parquetFile.Stats.SplitOffsets)) 84 | } 85 | }) 86 | } 87 | 88 | func TestCreateManifest(t *testing.T) { 89 | t.Run("Creates a manifest file", func(t *testing.T) { 90 | tempDir := os.TempDir() 91 | config := loadTestConfig() 92 | storage := NewLocalStorage(config) 93 | parquetFile := createTestParquetFile(storage, tempDir) 94 | 95 | manifestFile, err := storage.CreateManifest(tempDir, parquetFile) 96 | 97 | if err != nil { 98 | t.Errorf("Expected no error, got %v", err) 99 | } 100 | if manifestFile.SnapshotId == 0 { 101 | t.Errorf("Expected a non-zero snapshot ID, got %v", manifestFile.SnapshotId) 102 | } 103 | if manifestFile.Path == "" { 104 | t.Errorf("Expected a non-empty path, got %v", manifestFile.Path) 105 | } 106 | if manifestFile.Size == 0 { 107 | t.Errorf("Expected a non-zero size, got %v", manifestFile.Size) 108 | } 109 | if manifestFile.RecordCount != parquetFile.RecordCount { 110 | t.Errorf("Expected a record count of %v, got %v", parquetFile.RecordCount, manifestFile.RecordCount) 111 | } 112 | if manifestFile.DataFileSize != parquetFile.Size { 113 | t.Errorf("Expected a data file size of %v, got %v", parquetFile.Size, manifestFile.DataFileSize) 114 | } 115 | }) 116 | } 117 | 118 | func TestCreateManifestList(t *testing.T) { 119 | t.Run("Creates a manifest list file", func(t *testing.T) { 120 | tempDir := os.TempDir() 121 | config := loadTestConfig() 122 | storage := NewLocalStorage(config) 123 | parquetFile := createTestParquetFile(storage, tempDir) 124 | manifestFile, err := storage.CreateManifest(tempDir, parquetFile) 125 | PanicIfError(config, err) 126 | manifestListItem := ManifestListItem{SequenceNumber: 1, ManifestFile: manifestFile} 127 | 128 | manifestListFile, err := storage.CreateManifestList(tempDir, parquetFile.Uuid, []ManifestListItem{manifestListItem}) 129 | 130 | if err != nil { 131 | t.Errorf("Expected no error, got %v", err) 132 | } 133 | if manifestListFile.SnapshotId != manifestFile.SnapshotId { 134 | t.Errorf("Expected a snapshot ID of %v, got %v", manifestFile.SnapshotId, manifestListFile.SnapshotId) 135 | } 136 | if manifestListFile.TimestampMs == 0 { 137 | t.Errorf("Expected a non-zero timestamp, got %v", manifestListFile.TimestampMs) 138 | } 139 | if manifestListFile.Path == "" { 140 | t.Errorf("Expected a non-empty path, got %v", manifestListFile.Path) 141 | } 142 | if manifestListFile.Operation != "append" { 143 | t.Errorf("Expected an operation of append, got %v", manifestListFile.Operation) 144 | } 145 | if manifestListFile.AddedFilesSize != parquetFile.Size { 146 | t.Errorf("Expected an added files size of %v, got %v", parquetFile.Size, manifestListFile.AddedFilesSize) 147 | } 148 | if manifestListFile.AddedDataFiles != 1 { 149 | t.Errorf("Expected an added data files count of 1, got %v", manifestListFile.AddedDataFiles) 150 | } 151 | if manifestListFile.AddedRecords != parquetFile.RecordCount { 152 | t.Errorf("Expected an added records count of %v, got %v", parquetFile.RecordCount, manifestListFile.AddedRecords) 153 | } 154 | }) 155 | } 156 | 157 | 
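// The storage tests follow the Iceberg metadata hierarchy bottom-up: a Parquet data file
// is referenced by a manifest, manifests are grouped into a manifest list (one per snapshot),
// and manifest lists are recorded in the table metadata file created below.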
func TestCreateMetadata(t *testing.T) { 158 | t.Run("Creates a metadata file", func(t *testing.T) { 159 | tempDir := os.TempDir() 160 | config := loadTestConfig() 161 | storage := NewLocalStorage(config) 162 | parquetFile := createTestParquetFile(storage, tempDir) 163 | manifestFile, err := storage.CreateManifest(tempDir, parquetFile) 164 | PanicIfError(config, err) 165 | manifestListItem := ManifestListItem{SequenceNumber: 1, ManifestFile: manifestFile} 166 | manifestListFile, err := storage.CreateManifestList(tempDir, parquetFile.Uuid, []ManifestListItem{manifestListItem}) 167 | PanicIfError(config, err) 168 | 169 | metadataFile, err := storage.CreateMetadata(tempDir, TEST_STORAGE_PG_SCHEMA_COLUMNS, []ManifestListFile{manifestListFile}) 170 | 171 | if err != nil { 172 | t.Errorf("Expected no error, got %v", err) 173 | } 174 | if metadataFile.Version != 1 { 175 | t.Errorf("Expected a version of 1, got %v", metadataFile.Version) 176 | } 177 | if metadataFile.Path == "" { 178 | t.Errorf("Expected a non-empty path, got %v", metadataFile.Path) 179 | } 180 | }) 181 | } 182 | 183 | func TestExistingManifestListFiles(t *testing.T) { 184 | t.Run("Returns existing manifest list files", func(t *testing.T) { 185 | tempDir := os.TempDir() 186 | config := loadTestConfig() 187 | storage := NewLocalStorage(config) 188 | parquetFile := createTestParquetFile(storage, tempDir) 189 | manifestFile, err := storage.CreateManifest(tempDir, parquetFile) 190 | PanicIfError(config, err) 191 | manifestListItem := ManifestListItem{SequenceNumber: 1, ManifestFile: manifestFile} 192 | manifestListFile, err := storage.CreateManifestList(tempDir, parquetFile.Uuid, []ManifestListItem{manifestListItem}) 193 | PanicIfError(config, err) 194 | _, err = storage.CreateMetadata(tempDir, TEST_STORAGE_PG_SCHEMA_COLUMNS, []ManifestListFile{manifestListFile}) 195 | PanicIfError(config, err) 196 | 197 | existingManifestListFiles, err := storage.ExistingManifestListFiles(tempDir) 198 | 199 | if err != nil { 200 | t.Errorf("Expected no error, got %v", err) 201 | } 202 | if len(existingManifestListFiles) != 1 { 203 | t.Errorf("Expected 1 existing manifest list file, got %v", len(existingManifestListFiles)) 204 | } 205 | if existingManifestListFiles[0].SnapshotId != manifestListFile.SnapshotId { 206 | t.Errorf("Expected a snapshot ID of %v, got %v", manifestListFile.SnapshotId, existingManifestListFiles[0].SnapshotId) 207 | } 208 | if existingManifestListFiles[0].TimestampMs != manifestListFile.TimestampMs { 209 | t.Errorf("Expected a timestamp of %v, got %v", manifestListFile.TimestampMs, existingManifestListFiles[0].TimestampMs) 210 | } 211 | if existingManifestListFiles[0].Path != manifestListFile.Path { 212 | t.Errorf("Expected a path of %v, got %v", manifestListFile.Path, existingManifestListFiles[0].Path) 213 | } 214 | if existingManifestListFiles[0].Operation != manifestListFile.Operation { 215 | t.Errorf("Expected an operation of %v, got %v", manifestListFile.Operation, existingManifestListFiles[0].Operation) 216 | } 217 | if existingManifestListFiles[0].AddedFilesSize != manifestListFile.AddedFilesSize { 218 | t.Errorf("Expected an added files size of %v, got %v", manifestListFile.AddedFilesSize, existingManifestListFiles[0].AddedFilesSize) 219 | } 220 | if existingManifestListFiles[0].AddedDataFiles != manifestListFile.AddedDataFiles { 221 | t.Errorf("Expected an added data files count of %v, got %v", manifestListFile.AddedDataFiles, existingManifestListFiles[0].AddedDataFiles) 222 | } 223 | if 
existingManifestListFiles[0].AddedRecords != manifestListFile.AddedRecords { 224 | t.Errorf("Expected an added records count of %v, got %v", manifestListFile.AddedRecords, existingManifestListFiles[0].AddedRecords) 225 | } 226 | }) 227 | } 228 | 229 | func TestExistingManifestFiles(t *testing.T) { 230 | t.Run("Returns existing manifest files", func(t *testing.T) { 231 | tempDir := os.TempDir() 232 | config := loadTestConfig() 233 | storage := NewLocalStorage(config) 234 | parquetFile := createTestParquetFile(storage, tempDir) 235 | manifestFile, err := storage.CreateManifest(tempDir, parquetFile) 236 | PanicIfError(config, err) 237 | manifestListItem := ManifestListItem{SequenceNumber: 1, ManifestFile: manifestFile} 238 | manifestListFile, err := storage.CreateManifestList(tempDir, parquetFile.Uuid, []ManifestListItem{manifestListItem}) 239 | PanicIfError(config, err) 240 | 241 | existingManifestListItems, err := storage.ExistingManifestListItems(manifestListFile) 242 | 243 | if err != nil { 244 | t.Errorf("Expected no error, got %v", err) 245 | } 246 | if len(existingManifestListItems) != 1 { 247 | t.Errorf("Expected 1 existing manifest file, got %v", len(existingManifestListItems)) 248 | } 249 | if existingManifestListItems[0].SequenceNumber != 1 { 250 | t.Errorf("Expected a sequence number of 1, got %v", existingManifestListItems[0].SequenceNumber) 251 | } 252 | if existingManifestListItems[0].ManifestFile.SnapshotId != manifestFile.SnapshotId { 253 | t.Errorf("Expected a snapshot ID of %v, got %v", manifestFile.SnapshotId, existingManifestListItems[0].ManifestFile.SnapshotId) 254 | } 255 | if existingManifestListItems[0].ManifestFile.Path != manifestFile.Path { 256 | t.Errorf("Expected a path of %v, got %v", manifestFile.Path, existingManifestListItems[0].ManifestFile.Path) 257 | } 258 | if existingManifestListItems[0].ManifestFile.Size != manifestFile.Size { 259 | t.Errorf("Expected a size of %v, got %v", manifestFile.Size, existingManifestListItems[0].ManifestFile.Size) 260 | } 261 | if existingManifestListItems[0].ManifestFile.RecordCount != manifestFile.RecordCount { 262 | t.Errorf("Expected a record count of %v, got %v", manifestFile.RecordCount, existingManifestListItems[0].ManifestFile.RecordCount) 263 | } 264 | }) 265 | } 266 | 267 | func createTestParquetFile(storage *StorageLocal, dir string) ParquetFile { 268 | loadedRows := false 269 | loadRows := func() ([][]string, InternalTableMetadata) { 270 | if loadedRows { 271 | return [][]string{}, InternalTableMetadata{} 272 | } 273 | loadedRows = true 274 | return TEST_STORAGE_ROWS, InternalTableMetadata{} 275 | } 276 | 277 | parquetFile, _, err := storage.CreateParquet(dir, TEST_STORAGE_PG_SCHEMA_COLUMNS, 0, loadRows) 278 | if err != nil { 279 | panic(err) 280 | } 281 | 282 | return parquetFile 283 | } 284 | -------------------------------------------------------------------------------- /src/syncer.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "net/http" 9 | "net/url" 10 | "runtime" 11 | "strings" 12 | "time" 13 | 14 | "github.com/jackc/pgx/v5" 15 | ) 16 | 17 | const ( 18 | MAX_IN_MEMORY_BUFFER_SIZE = 128 * 1024 * 1024 // 128 MB (expands to ~160 MB memory usage) 19 | MAX_PG_ROWS_BATCH_SIZE = 1 * 1024 * 1024 // 1 MB 20 | PING_PG_INTERVAL_SECONDS = 24 21 | 22 | MAX_PARQUET_PAYLOAD_THRESHOLD = 2 * 1024 * 1024 * 1024 // 2 GB (compressed to ~256 MB Parquet) 23 | ) 24 | 25 | type Syncer struct { 26 | 
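// Collaborators used by SyncFromPostgres below: icebergReader inspects the current Iceberg
// state, syncerTable copies one Postgres table at a time, and icebergWriter persists the
// start-up SQL file and removes stale schemas/tables.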
config *Config 27 | icebergWriter *IcebergWriter 28 | icebergReader *IcebergReader 29 | syncerTable *SyncerTable 30 | } 31 | 32 | func NewSyncer(config *Config) *Syncer { 33 | if config.Pg.DatabaseUrl == "" { 34 | PrintErrorAndExit(config, "Missing PostgreSQL database URL.\n\n"+ 35 | "See https://github.com/BemiHQ/BemiDB#sync-command-options for more information.", 36 | ) 37 | } 38 | 39 | icebergWriter := NewIcebergWriter(config) 40 | icebergReader := NewIcebergReader(config) 41 | return &Syncer{ 42 | config: config, 43 | icebergWriter: icebergWriter, 44 | icebergReader: icebergReader, 45 | syncerTable: NewSyncerTable(config), 46 | } 47 | } 48 | 49 | func (syncer *Syncer) SyncFromPostgres() { 50 | ctx := context.Background() 51 | if syncer.config.Pg.IncrementallyRefreshedTables == nil { 52 | syncer.sendAnonymousAnalytics("sync-start") 53 | } else { 54 | syncer.sendAnonymousAnalytics("sync-start-incremental") 55 | } 56 | 57 | databaseUrl := syncer.urlEncodePassword(syncer.config.Pg.DatabaseUrl) 58 | icebergSchemaTables, icebergSchemaTablesErr := syncer.icebergReader.SchemaTables() 59 | 60 | structureConn := syncer.newConnection(ctx, databaseUrl) 61 | defer structureConn.Close(ctx) 62 | 63 | copyConn := syncer.newConnection(ctx, databaseUrl) 64 | defer copyConn.Close(ctx) 65 | 66 | syncedPgSchemaTables := []PgSchemaTable{} 67 | 68 | for _, schema := range syncer.listPgSchemas(structureConn) { 69 | for _, pgSchemaTable := range syncer.listPgSchemaTables(structureConn, schema) { 70 | if syncer.shouldSyncTable(pgSchemaTable) { 71 | var internalTableMetadata InternalTableMetadata 72 | syncedPreviously := icebergSchemaTablesErr == nil && icebergSchemaTables.Contains(pgSchemaTable.ToIcebergSchemaTable()) 73 | if syncedPreviously { 74 | internalTableMetadata = syncer.readInternalTableMetadata(pgSchemaTable) 75 | } 76 | 77 | incrementalRefresh := syncer.config.Pg.IncrementallyRefreshedTables != nil && HasExactOrWildcardMatch(syncer.config.Pg.IncrementallyRefreshedTables, pgSchemaTable.ToConfigArg()) 78 | 79 | syncer.syncerTable.SyncPgTable(pgSchemaTable, structureConn, copyConn, internalTableMetadata, incrementalRefresh) 80 | LogInfo(syncer.config, "Finished writing to Iceberg\n") 81 | 82 | syncedPgSchemaTables = append(syncedPgSchemaTables, pgSchemaTable) 83 | } 84 | } 85 | } 86 | 87 | syncer.WriteInternalStartSqlFile(syncedPgSchemaTables) 88 | 89 | if !syncer.config.Pg.PreserveUnsynced { 90 | syncer.deleteOldIcebergSchemaTables(syncedPgSchemaTables) 91 | } 92 | 93 | if syncer.config.Pg.IncrementallyRefreshedTables == nil { 94 | syncer.sendAnonymousAnalytics("sync-finish") 95 | } else { 96 | syncer.sendAnonymousAnalytics("sync-finish-incremental") 97 | } 98 | } 99 | 100 | func (syncer *Syncer) WriteInternalStartSqlFile(pgSchemaTables []PgSchemaTable) { 101 | childTablesByParentTable := make(map[string][]string) 102 | for _, pgSchemaTable := range pgSchemaTables { 103 | if pgSchemaTable.ParentPartitionedTable != "" { 104 | parent := pgSchemaTable.ParentPartitionedTableString() 105 | childTablesByParentTable[parent] = append(childTablesByParentTable[parent], pgSchemaTable.String()) 106 | } 107 | } 108 | 109 | queryRemapper := NewQueryRemapper(syncer.config, syncer.icebergReader, nil) 110 | queries := []string{} 111 | 112 | for parent, children := range childTablesByParentTable { 113 | // CREATE OR REPLACE TABLE test_table AS 114 | // SELECT * FROM iceberg_scan('/iceberg/public/test_table_q1/metadata/v1.metadata.json', skip_schema_inference = true) 115 | // UNION ALL 116 | // SELECT * FROM 
iceberg_scan('/iceberg/public/test_table_q2/metadata/v1.metadata.json', skip_schema_inference = true) 117 | 118 | subqueries := []string{} 119 | for _, child := range children { 120 | originalSubquery := fmt.Sprintf("SELECT * FROM %s", child) 121 | queryStatements, _, err := queryRemapper.ParseAndRemapQuery(originalSubquery) 122 | PanicIfError(syncer.config, err) 123 | subqueries = append(subqueries, queryStatements[0]) 124 | } 125 | queries = append(queries, fmt.Sprintf("CREATE OR REPLACE TABLE %s AS %s", parent, strings.Join(subqueries, " UNION ALL "))) 126 | } 127 | 128 | syncer.icebergWriter.WriteInternalStartSqlFile(queries) 129 | } 130 | 131 | // Example: 132 | // - From postgres://username:pas$:wor^d@host:port/database 133 | // - To postgres://username:pas%24%3Awor%5Ed@host:port/database 134 | func (syncer *Syncer) urlEncodePassword(databaseUrl string) string { 135 | // No credentials 136 | if !strings.Contains(databaseUrl, "@") { 137 | return databaseUrl 138 | } 139 | 140 | password := strings.TrimPrefix(databaseUrl, "postgresql://") 141 | password = strings.TrimPrefix(password, "postgres://") 142 | passwordEndIndex := strings.LastIndex(password, "@") 143 | password = password[:passwordEndIndex] 144 | 145 | // Credentials without password 146 | if !strings.Contains(password, ":") { 147 | return databaseUrl 148 | } 149 | 150 | _, password, _ = strings.Cut(password, ":") 151 | decodedPassword, err := url.QueryUnescape(password) 152 | if err != nil { 153 | return databaseUrl 154 | } 155 | 156 | // Password is already encoded 157 | if decodedPassword != password { 158 | return databaseUrl 159 | } 160 | 161 | return strings.Replace(databaseUrl, ":"+password+"@", ":"+url.QueryEscape(password)+"@", 1) 162 | } 163 | 164 | func (syncer *Syncer) shouldSyncTable(pgSchemaTable PgSchemaTable) bool { 165 | if syncer.config.Pg.ExcludeTables != nil && HasExactOrWildcardMatch(syncer.config.Pg.ExcludeTables, pgSchemaTable.ToConfigArg()) { 166 | return false 167 | } 168 | 169 | if syncer.config.Pg.IncludeTables != nil { 170 | return HasExactOrWildcardMatch(syncer.config.Pg.IncludeTables, pgSchemaTable.ToConfigArg()) 171 | } 172 | 173 | return true 174 | } 175 | 176 | func (syncer *Syncer) listPgSchemas(conn *pgx.Conn) []string { 177 | var schemas []string 178 | 179 | schemasRows, err := conn.Query( 180 | context.Background(), 181 | "SELECT schema_name FROM information_schema.schemata WHERE schema_name NOT IN ('pg_catalog', 'pg_toast', 'information_schema')", 182 | ) 183 | PanicIfError(syncer.config, err) 184 | defer schemasRows.Close() 185 | 186 | for schemasRows.Next() { 187 | var schema string 188 | err = schemasRows.Scan(&schema) 189 | PanicIfError(syncer.config, err) 190 | schemas = append(schemas, schema) 191 | } 192 | 193 | return schemas 194 | } 195 | 196 | func (syncer *Syncer) listPgSchemaTables(conn *pgx.Conn, schema string) []PgSchemaTable { 197 | var pgSchemaTables []PgSchemaTable 198 | 199 | tablesRows, err := conn.Query( 200 | context.Background(), 201 | ` 202 | SELECT pg_class.relname AS table, COALESCE(parent.relname, '') AS parent_partitioned_table 203 | FROM pg_class 204 | JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace 205 | LEFT JOIN pg_inherits ON pg_inherits.inhrelid = pg_class.oid 206 | LEFT JOIN pg_class AS parent ON pg_inherits.inhparent = parent.oid 207 | WHERE pg_namespace.nspname = $1 AND pg_class.relkind = 'r'; 208 | `, 209 | schema, 210 | ) 211 | PanicIfError(syncer.config, err) 212 | defer tablesRows.Close() 213 | 214 | for tablesRows.Next() { 215 | 
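// Each row carries the table name plus, for declarative partitions, the parent partitioned
// table name (or an empty string, per the COALESCE in the query above).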
pgSchemaTable := PgSchemaTable{Schema: schema} 216 | err = tablesRows.Scan(&pgSchemaTable.Table, &pgSchemaTable.ParentPartitionedTable) 217 | PanicIfError(syncer.config, err) 218 | pgSchemaTables = append(pgSchemaTables, pgSchemaTable) 219 | } 220 | 221 | return pgSchemaTables 222 | } 223 | 224 | func (syncer *Syncer) newConnection(ctx context.Context, databaseUrl string) *pgx.Conn { 225 | conn, err := pgx.Connect(ctx, databaseUrl) 226 | PanicIfError(syncer.config, err) 227 | 228 | _, err = conn.Exec(ctx, "BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE") 229 | PanicIfError(syncer.config, err) 230 | 231 | return conn 232 | } 233 | 234 | func (syncer *Syncer) readInternalTableMetadata(pgSchemaTable PgSchemaTable) InternalTableMetadata { 235 | internalTableMetadata, err := syncer.icebergReader.InternalTableMetadata(pgSchemaTable) 236 | PanicIfError(syncer.config, err) 237 | return internalTableMetadata 238 | } 239 | 240 | func (syncer *Syncer) deleteOldIcebergSchemaTables(pgSchemaTables []PgSchemaTable) { 241 | var prefixedPgSchemaTables []PgSchemaTable 242 | for _, pgSchemaTable := range pgSchemaTables { 243 | prefixedPgSchemaTables = append( 244 | prefixedPgSchemaTables, 245 | PgSchemaTable{Schema: syncer.config.Pg.SchemaPrefix + pgSchemaTable.Schema, Table: pgSchemaTable.Table}, 246 | ) 247 | } 248 | 249 | icebergSchemas, err := syncer.icebergReader.Schemas() 250 | PanicIfError(syncer.config, err) 251 | 252 | for _, icebergSchema := range icebergSchemas { 253 | found := false 254 | for _, pgSchemaTable := range prefixedPgSchemaTables { 255 | if icebergSchema == pgSchemaTable.Schema { 256 | found = true 257 | break 258 | } 259 | } 260 | 261 | if !found { 262 | LogInfo(syncer.config, "Deleting", icebergSchema, "...") 263 | err := syncer.icebergWriter.DeleteSchema(icebergSchema) 264 | PanicIfError(syncer.config, err) 265 | } 266 | } 267 | 268 | icebergSchemaTables, err := syncer.icebergReader.SchemaTables() 269 | PanicIfError(syncer.config, err) 270 | 271 | for _, icebergSchemaTable := range icebergSchemaTables.Values() { 272 | found := false 273 | for _, pgSchemaTable := range prefixedPgSchemaTables { 274 | if icebergSchemaTable.String() == pgSchemaTable.String() { 275 | found = true 276 | break 277 | } 278 | } 279 | 280 | if !found { 281 | LogInfo(syncer.config, "Deleting", icebergSchemaTable.String(), "...") 282 | err := syncer.icebergWriter.DeleteSchemaTable(icebergSchemaTable) 283 | PanicIfError(syncer.config, err) 284 | } 285 | } 286 | } 287 | 288 | type AnonymousAnalyticsData struct { 289 | Command string `json:"command"` 290 | OsName string `json:"osName"` 291 | Version string `json:"version"` 292 | PgHost string `json:"pgHost"` 293 | } 294 | 295 | func (syncer *Syncer) sendAnonymousAnalytics(command string) { 296 | if syncer.config.DisableAnonymousAnalytics { 297 | return 298 | } 299 | 300 | data := AnonymousAnalyticsData{ 301 | Command: command, 302 | OsName: runtime.GOOS + "-" + runtime.GOARCH, 303 | Version: VERSION, 304 | PgHost: ParseDatabaseHost(syncer.config.Pg.DatabaseUrl), 305 | } 306 | if data.PgHost == "" || IsLocalHost(data.PgHost) { 307 | return 308 | } 309 | 310 | jsonData, err := json.Marshal(data) 311 | if err != nil { 312 | return 313 | } 314 | 315 | client := http.Client{Timeout: 5 * time.Second} 316 | _, _ = client.Post("https://api.bemidb.com/api/analytics", "application/json", bytes.NewBuffer(jsonData)) 317 | } 318 | -------------------------------------------------------------------------------- /src/syncer_table_test.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestCopyFromPgTableSql(t *testing.T) { 8 | config := &Config{} 9 | syncer := NewSyncerTable(config) 10 | pgSchemaTable := PgSchemaTable{Schema: "public", Table: "users"} 11 | 12 | t.Run("Full refresh", func(t *testing.T) { 13 | // [**************************************************************************************************] 14 | // 0 curr max xmin 15 | t.Run("Runs a full refresh if there is no previous internalTableMetadata", func(t *testing.T) { 16 | internalTableMetadata := InternalTableMetadata{} 17 | currentTxid := int64(100) 18 | 19 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, false) 20 | 21 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 22 | if sql != expected { 23 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 24 | } 25 | }) 26 | 27 | // [**************************************************************************************************] 28 | // 0 curr max xmin 29 | t.Run("Runs a full refresh after successful full sync", func(t *testing.T) { 30 | previousMaxXmin := uint32(500) 31 | initialTxid := int64(800) 32 | currentTxid := int64(1000) 33 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFull, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 34 | 35 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, false) 36 | 37 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 38 | if sql != expected { 39 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 40 | } 41 | }) 42 | }) 43 | 44 | t.Run("Continued in-progress refresh without a wraparound", func(t *testing.T) { 45 | // Full refresh in progress 46 | // [-----------------------|************************|************************|------------------------] 47 | // 0 prev max xmin init (wraparound) txid curr (wraparound) txid 32^2 48 | t.Run("Continues a full refresh before reaching the initial txid", func(t *testing.T) { 49 | previousMaxXmin := uint32(1_000_000_000) 50 | initialTxid := int64(2_000_000_000) + (int64(1) << 32) 51 | currentTxid := int64(3_000_000_000) + (int64(1) << 32) 52 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 53 | 54 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 55 | 56 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint >= 1000000000 AND xmin::text::bigint <= 3000000000 ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 57 | if sql != expected { 58 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 59 | } 60 | }) 61 | 62 | // Incremental refresh 63 | // [-----------------------|************************|************************|------------------------] 64 | // 0 prev max xmin init (wraparound) txid curr (wraparound) txid 32^2 65 | t.Run("Starts an incremental refresh before reaching the initial txid equal to the current txid", func(t *testing.T) { 66 | previousMaxXmin := uint32(1_000_000_000) 67 | initialTxid := int64(2_000_000_000) + (int64(1) << 32) 68 | currentTxid := int64(3_000_000_000) + 
(int64(1) << 32) 69 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeIncremental, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 70 | 71 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 72 | 73 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint > 1000000000 AND xmin::text::bigint <= 3000000000 ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 74 | if sql != expected { 75 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 76 | } 77 | }) 78 | 79 | // [-----------------------|************************|-------------------------------------------------] 80 | // 0 prev max xmin init (wraparound) txid 32^2 81 | // curr (wraparound) txid 82 | t.Run("Continues a full refresh before reaching the initial txid equal to the current txid", func(t *testing.T) { 83 | previousMaxXmin := uint32(1_000_000_000) 84 | initialTxid := int64(2_000_000_000) + (int64(1) << 32) 85 | currentTxid := int64(2_000_000_000) + (int64(1) << 32) 86 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 87 | 88 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 89 | 90 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint >= 1000000000 AND xmin::text::bigint <= 2000000000 ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 91 | if sql != expected { 92 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 93 | } 94 | }) 95 | 96 | // [-----------------------|------------------------|************************|------------------------] 97 | // 0 init (wraparound) txid prev max xmin curr (wraparound) txid 32^2 98 | t.Run("Continues a full refresh after reaching the initial txid", func(t *testing.T) { 99 | initialTxid := int64(1_000_000_000) 100 | previousMaxXmin := uint32(2_000_000_000) 101 | currentTxid := int64(3_000_000_000) 102 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 103 | 104 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 105 | 106 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint >= 2000000000 AND xmin::text::bigint <= 3000000000 ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 107 | if sql != expected { 108 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 109 | } 110 | }) 111 | }) 112 | 113 | t.Run("Continued in-progress refresh with a wraparound", func(t *testing.T) { 114 | // [***********************|------------------------|************************|************************] 115 | // 0 curr wraparound txid prev max xmin init (wraparound) txid 32^2 116 | t.Run("Continues a full refresh before reaching the initial txid", func(t *testing.T) { 117 | currentTxid := int64(1_000_000_000) + (int64(1) << 32) 118 | previousMaxXmin := uint32(2_000_000_000) 119 | initialTxid := int64(3_000_000_000) 120 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 121 | 122 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 123 | 124 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM 
\"public\".\"users\" WHERE xmin::text::bigint >= 2000000000 OR xmin::text::bigint <= 1000000000 ORDER BY xmin::text::bigint <= 1000000000 ASC, xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 125 | if sql != expected { 126 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 127 | } 128 | }) 129 | 130 | // [***********************|------------------------|------------------------|************************] 131 | // 0 curr wraparound txid init (wraparound) txid prev max xmin 32^2 132 | t.Run("Continues a full refresh after reaching the initial txid", func(t *testing.T) { 133 | currentTxid := int64(1_000_000_000) + (int64(1) << 32) 134 | initialTxid := int64(2_000_000_000) 135 | previousMaxXmin := uint32(3_000_000_000) 136 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 137 | 138 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 139 | 140 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint >= 3000000000 OR xmin::text::bigint <= 1000000000 ORDER BY xmin::text::bigint <= 1000000000 ASC, xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 141 | if sql != expected { 142 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 143 | } 144 | }) 145 | 146 | // [-----------------------|************************|------------------------|------------------------] 147 | // 0 prev max xmin curr wraparound txid init (wraparound) txid 32^2 148 | t.Run("Continues a full refresh if a wraparound occurred during a full sync and max xmin was reset", func(t *testing.T) { 149 | previousMaxXmin := uint32(1_000_000_000) 150 | currentTxid := int64(2_000_000_000) + (int64(1) << 32) 151 | initialTxid := int64(3_000_000_000) 152 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 153 | 154 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 155 | 156 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint >= 1000000000 AND xmin::text::bigint <= 2000000000 ORDER BY xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 157 | if sql != expected { 158 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 159 | } 160 | }) 161 | 162 | // [***********************|************************|------------------------|************************] 163 | // 0 init (wraparound) txid curr wraparound txid prev max xmin 32^2 164 | t.Run("Continues a full refresh after the current wrapparound txid exceeds the initial txid", func(t *testing.T) { 165 | initialTxid := int64(1_000_000_000) 166 | currentTxid := int64(2_000_000_000) + (int64(1) << 32) 167 | previousMaxXmin := uint32(3_000_000_000) 168 | internalTableMetadata := InternalTableMetadata{LastRefreshMode: RefreshModeFullInProgress, LastTxid: initialTxid, MaxXmin: &previousMaxXmin} 169 | 170 | sql := syncer.CopyFromPgTableSql(pgSchemaTable, internalTableMetadata, currentTxid, true) 171 | 172 | expected := "COPY (SELECT *, xmin::text::bigint AS xmin FROM \"public\".\"users\" WHERE xmin::text::bigint >= 3000000000 OR xmin::text::bigint <= 2000000000 ORDER BY xmin::text::bigint <= 2000000000 ASC, xmin::text::bigint ASC) TO STDOUT WITH CSV HEADER NULL 'BEMIDB_NULL'" 173 | if sql != expected { 174 | t.Errorf("Expected SQL:\n%s\nGot:\n%s", expected, sql) 175 | } 
176 | }) 177 | }) 178 | } 179 | -------------------------------------------------------------------------------- /src/syncer_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "testing" 4 | 5 | func TestShouldSyncTable(t *testing.T) { 6 | t.Run("returns true when no filters are set", func(t *testing.T) { 7 | config := &Config{ 8 | Pg: PgConfig{ 9 | DatabaseUrl: "postgres://user:pass@localhost:5432/db", 10 | }, 11 | } 12 | syncer := NewSyncer(config) 13 | pgSchemaTable := PgSchemaTable{Schema: "public", Table: "users"} 14 | 15 | if !syncer.shouldSyncTable(pgSchemaTable) { 16 | t.Error("Expected shouldSyncTable to return true when no filters are set") 17 | } 18 | }) 19 | 20 | t.Run("respects include filter", func(t *testing.T) { 21 | config := &Config{ 22 | Pg: PgConfig{ 23 | DatabaseUrl: "postgres://user:pass@localhost:5432/db", 24 | IncludeTables: []string{"public.users", "public.orders"}, 25 | }, 26 | } 27 | syncer := NewSyncer(config) 28 | 29 | pgSchemaTableIncluded := PgSchemaTable{Schema: "public", Table: "users"} 30 | if !syncer.shouldSyncTable(pgSchemaTableIncluded) { 31 | t.Error("Expected shouldSyncTable to return true for included table") 32 | } 33 | 34 | pgSchemaTableExcluded := PgSchemaTable{Schema: "public", Table: "secrets"} 35 | if syncer.shouldSyncTable(pgSchemaTableExcluded) { 36 | t.Error("Expected shouldSyncTable to return false for non-included table") 37 | } 38 | }) 39 | 40 | t.Run("respects exclude filter", func(t *testing.T) { 41 | config := &Config{ 42 | Pg: PgConfig{ 43 | DatabaseUrl: "postgres://user:pass@localhost:5432/db", 44 | ExcludeTables: []string{"public.secrets", "public.cache"}, 45 | }, 46 | } 47 | syncer := NewSyncer(config) 48 | 49 | pgSchemaTableIncluded := PgSchemaTable{Schema: "public", Table: "users"} 50 | if !syncer.shouldSyncTable(pgSchemaTableIncluded) { 51 | t.Error("Expected shouldSyncTable to return true for non-excluded table") 52 | } 53 | 54 | pgSchemaTableExcluded := PgSchemaTable{Schema: "public", Table: "secrets"} 55 | if syncer.shouldSyncTable(pgSchemaTableExcluded) { 56 | t.Error("Expected shouldSyncTable to return false for excluded table") 57 | } 58 | }) 59 | } 60 | -------------------------------------------------------------------------------- /src/utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "crypto/hmac" 5 | "crypto/rand" 6 | "crypto/sha256" 7 | "encoding/base64" 8 | "errors" 9 | "fmt" 10 | "net/url" 11 | "regexp" 12 | "strconv" 13 | "strings" 14 | "time" 15 | "unicode" 16 | 17 | "golang.org/x/crypto/pbkdf2" 18 | ) 19 | 20 | func IntToString(i int) string { 21 | return strconv.Itoa(i) 22 | } 23 | 24 | func Int64ToString(i int64) string { 25 | return strconv.FormatInt(i, 10) 26 | } 27 | 28 | func Uint32ToString(i uint32) string { 29 | return strconv.FormatUint(uint64(i), 10) 30 | } 31 | 32 | func StringToInt(s string) (int, error) { 33 | return strconv.Atoi(s) 34 | } 35 | 36 | func StringToInt64(s string) (int64, error) { 37 | return strconv.ParseInt(s, 10, 64) 38 | } 39 | 40 | func StringToUint32(s string) (uint32, error) { 41 | i, err := strconv.ParseUint(s, 10, 32) 42 | return uint32(i), err 43 | } 44 | 45 | func StringToScramSha256(password string) string { 46 | saltLength := 16 47 | digestLength := 32 48 | iterations := 4096 49 | clientKey := []byte("Client Key") 50 | serverKey := []byte("Server Key") 51 | 52 | salt := make([]byte, saltLength) 53 | _, err := rand.Read(salt) 
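// A fresh random salt makes each generated SCRAM-SHA-256 verifier unique even for identical
// passwords; the salt is base64-encoded into the returned verifier string below.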
54 | if err != nil { 55 | return "" 56 | } 57 | 58 | digestKey := pbkdf2.Key([]byte(password), salt, iterations, digestLength, sha256.New) 59 | clientKeyHash := hmacSha256Hash(digestKey, clientKey) 60 | serverKeyHash := hmacSha256Hash(digestKey, serverKey) 61 | storedKeyHash := sha256Hash(clientKeyHash) 62 | 63 | return fmt.Sprintf( 64 | "SCRAM-SHA-256$%d:%s$%s:%s", 65 | iterations, 66 | base64.StdEncoding.EncodeToString(salt), 67 | base64.StdEncoding.EncodeToString(storedKeyHash), 68 | base64.StdEncoding.EncodeToString(serverKeyHash), 69 | ) 70 | } 71 | 72 | func StringDateToTime(str string) (time.Time, error) { 73 | // Golang's time.Parse() function does not support parsing dates with 5+ digit years 74 | // So we need to handle this case manually by parsing the year separately 75 | var nonStandardYear int 76 | var err error 77 | parts := strings.Split(str, "-") 78 | if len(parts) == 3 && len(parts[0]) > 4 { 79 | nonStandardYear, err = StringToInt(parts[0]) 80 | if err != nil { 81 | return time.Time{}, errors.New("Invalid year: " + parts[0]) 82 | } 83 | 84 | str = str[len(parts[0])-4:] // Remove the prefix from str leaving only the standard 10 characters (YYYY-MM-DD) 85 | } 86 | 87 | parsedTime, err := time.Parse("2006-01-02", str) 88 | 89 | // If the year is non-standard, add the year difference to the parsed time after parsing 90 | if err == nil && nonStandardYear != 0 { 91 | parsedTime = parsedTime.AddDate(nonStandardYear-parsedTime.Year(), 0, 0) 92 | return parsedTime, nil 93 | } 94 | 95 | return parsedTime, err 96 | } 97 | 98 | func StringContainsUpper(str string) bool { 99 | for _, char := range str { 100 | if unicode.IsUpper(char) { 101 | return true 102 | } 103 | } 104 | return false 105 | } 106 | 107 | func Reverse[T any](originalSlice []T) []T { 108 | length := len(originalSlice) 109 | reversedSlice := make([]T, length) 110 | 111 | for i, elem := range originalSlice { 112 | reversedSlice[length-1-i] = elem 113 | } 114 | 115 | return reversedSlice 116 | } 117 | 118 | func HasExactOrWildcardMatch(strs []string, value string) bool { 119 | for _, str := range strs { 120 | if str == value { 121 | return true 122 | } 123 | 124 | if strings.Contains(str, "*") { 125 | pattern := strings.ReplaceAll(regexp.QuoteMeta(str), "\\*", ".*") 126 | matched, _ := regexp.MatchString("\\A"+pattern+"\\z", value) 127 | if matched { 128 | return true 129 | } 130 | } 131 | } 132 | 133 | return false 134 | } 135 | 136 | func ParseDatabaseHost(dbUrl string) string { 137 | if dbUrl == "" { 138 | return "" 139 | } 140 | 141 | url, err := url.Parse(dbUrl) 142 | if err != nil { 143 | return "" 144 | } 145 | 146 | return url.Hostname() 147 | } 148 | 149 | func IsLocalHost(host string) bool { 150 | return strings.HasPrefix(host, "127.0.0.1") || strings.HasPrefix(host, "localhost") 151 | } 152 | 153 | func PgWraparoundTxid(txid int64) int64 { 154 | return txid % (int64(1) << 32) 155 | } 156 | 157 | func IsPgWraparoundTxid(txid int64) bool { 158 | return txid > (int64(1) << 32) 159 | } 160 | 161 | func hmacSha256Hash(key []byte, message []byte) []byte { 162 | hash := hmac.New(sha256.New, key) 163 | hash.Write(message) 164 | return hash.Sum(nil) 165 | } 166 | 167 | func sha256Hash(data []byte) []byte { 168 | hash := sha256.New() 169 | hash.Write(data) 170 | return hash.Sum(nil) 171 | } 172 | --------------------------------------------------------------------------------