├── CNAME ├── .gitignore ├── duckdb ├── ddl.sql ├── uninstall.sh ├── install.sh ├── results │ ├── m6i.8xlarge_bluesky_1000m.errors │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── drop_table.sh ├── count.sh ├── total_size.sh ├── query_results.sh ├── physical_query_plans.sh ├── run_queries.sh ├── benchmark.sh ├── create_and_load.sh ├── queries.sql ├── load_data.sh └── main.sh ├── favicon.png ├── clickhouse ├── install.sh ├── start.sh ├── count.sh ├── total_size.sh ├── data_size.sh ├── drop_table.sh ├── index_size.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── index_usage.sh ├── physical_query_plans.sh ├── query_results.sh ├── ddl.sql ├── run_queries.sh ├── create_and_load.sh ├── queries.sql ├── benchmark.sh ├── main.sh ├── load_data.sh └── queries_formatted.sql ├── postgresql ├── uninstall.sh ├── install.sh ├── drop_tables.sh ├── data_size.sh ├── total_size.sh ├── count.sh ├── ddl.sql ├── index_size.sh ├── results │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_1000m.json │ └── m6i.8xlarge_bluesky_100m.json ├── query_results.sh ├── index_usage.sh ├── run_queries.sh ├── benchmark.sh ├── create_and_load.sh ├── queries.sql ├── main.sh └── load_data.sh ├── elasticsearch ├── uninstall.sh ├── config │ ├── ilm.json │ ├── elasticsearch.yml │ ├── index_template_source.json │ ├── index_template_no_source.json │ └── filebeat.yml ├── results │ ├── m6i.8xlarge_bluesky_source_1m.json │ ├── m6i.8xlarge_bluesky_source_10m.json │ ├── m6i.8xlarge_bluesky_no_source_10m.json │ ├── m6i.8xlarge_bluesky_no_source_1m.json │ ├── m6i.8xlarge_bluesky_source_1000m.json │ ├── m6i.8xlarge_bluesky_source_100m.json │ ├── m6i.8xlarge_bluesky_no_source_100m.json │ └── m6i.8xlarge_bluesky_no_source_1000m.json ├── count.sh ├── total_size.sh ├── install.sh ├── benchmark.sh ├── start.sh ├── drop_tables.sh ├── queries.txt ├── query_results.sh ├── queries_formatted.txt ├── run_queries.sh ├── create_and_load.sh └── main.sh ├── mongodb ├── uninstall.sh ├── ddl.js ├── drop_table.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── install.sh ├── data_size.sh ├── total_size.sh ├── index_size.sh ├── count.sh ├── create_and_load.sh ├── benchmark.sh ├── query_results.sh ├── queries.js ├── index_usage.sh ├── main.sh └── load_data.sh ├── doris ├── stop.sh ├── uninstall.sh ├── install.sh ├── total_size.sh ├── count.sh ├── drop_table.sh ├── start.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── ddl.sql ├── queries.sql ├── create_and_load.sh ├── benchmark.sh ├── run_queries.sh ├── queries_default.sql ├── main.sh └── load_data.sh ├── victorialogs ├── count.sh ├── index_size.sh ├── drop_tables.sh ├── start.sh ├── data_size.sh ├── install.sh ├── total_size.sh ├── query_results.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── queries.logsql ├── run_queries.sh ├── load_data.sh ├── main.sh └── queries_formatted.logsql ├── starrocks ├── uninstall.sh ├── count.sh ├── total_size.sh ├── ddl.sql ├── drop_table.sh ├── results │ ├── m6i.8xlarge_bluesky_10m.json │ ├── 
m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── physical_query_plans.sh ├── install.sh ├── run_queries.sh ├── create_and_load.sh ├── benchmark.sh ├── queries.sql └── main.sh ├── greptimedb ├── drop_tables.sh ├── count.sh ├── install.sh ├── data_size.sh ├── index_size.sh ├── total_size.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── pipeline.yaml ├── start.sh ├── run_queries.sh ├── queries.sql ├── load_data.sh ├── main.sh └── queries_formatted.sql ├── singlestore ├── uninstall.sh ├── count.sh ├── drop_table.sh ├── index_size.sh ├── data_size.sh ├── ddl.sql ├── total_size.sh ├── benchmark.sh ├── query_results.sh ├── install.sh ├── results │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_100m.json │ ├── m6i.8xlarge_bluesky_1000m.json │ └── _query_results │ │ └── _m6i.8xlarge_bluesky_1m.query_results ├── physical_query_plans.sh ├── run_queries.sh ├── create_and_load.sh ├── queries.sql └── load_data.sh ├── _files_gz ├── results │ ├── _files_bluesky_gz_1m.json │ ├── _files_bluesky_gz_10m.json │ ├── _files_bluesky_gz_1000m.json │ └── _files_bluesky_gz_100m.json ├── main.sh └── total_size.sh ├── _files_json ├── results │ ├── _files_bluesky_json_10m.json │ ├── _files_bluesky_json_1m.json │ ├── _files_bluesky_json_1000m.json │ └── _files_bluesky_json_100m.json ├── total_size.sh ├── load_data.sh └── main.sh ├── _files_lz4 ├── results │ ├── _files_bluesky_lz4_1m.json │ ├── _files_bluesky_lz4_100m.json │ ├── _files_bluesky_lz4_10m.json │ └── _files_bluesky_lz4_1000m.json ├── total_size.sh ├── main.sh └── load_data.sh ├── _files_zstd ├── results │ ├── _files_bluesky_zstd_10m.json │ ├── _files_bluesky_zstd_1m.json │ ├── _files_bluesky_zstd_1000m.json │ └── _files_bluesky_zstd_100m.json ├── total_size.sh ├── main.sh └── load_data.sh ├── .github └── workflows │ └── generate-results.yml ├── generate-results.sh └── download_data.sh /CNAME: -------------------------------------------------------------------------------- 1 | jsonbench.com -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .idea 3 | -------------------------------------------------------------------------------- /duckdb/ddl.sql: -------------------------------------------------------------------------------- 1 | create table bluesky (j JSON); -------------------------------------------------------------------------------- /duckdb/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf ~/.duckdb 4 | -------------------------------------------------------------------------------- /favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClickHouse/JSONBench/HEAD/favicon.png -------------------------------------------------------------------------------- /clickhouse/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl https://clickhouse.com/ | sh 4 | -------------------------------------------------------------------------------- /postgresql/uninstall.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get remove -y postgresql-common postgresql-16 2 
| -------------------------------------------------------------------------------- /elasticsearch/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get remove --yes elasticsearch filebeat 4 | -------------------------------------------------------------------------------- /mongodb/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo systemctl stop mongod 4 | sudo apt-get remove -y mongodb-org 5 | -------------------------------------------------------------------------------- /doris/stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ${DORIS_FULL_NAME}/be/bin/stop_be.sh 4 | ${DORIS_FULL_NAME}/fe/bin/stop_fe.sh 5 | -------------------------------------------------------------------------------- /duckdb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf ~/.duckdb # remove remainders 4 | curl https://install.duckdb.org | sh 5 | -------------------------------------------------------------------------------- /doris/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get remove -y mysql-client openjdk-17-jre-headless 4 | 5 | rm -rf ${DORIS_FULL_NAME} 6 | -------------------------------------------------------------------------------- /victorialogs/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | count() rows" 4 | -------------------------------------------------------------------------------- /starrocks/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker stop starrocks 4 | docker rm starrocks 5 | 6 | sudo apt-get remove -y mysql-client 7 | sudo snap remove --purge docker 8 | -------------------------------------------------------------------------------- /greptimedb/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping GreptimeDB" 4 | pidof greptime && kill `pidof greptime` 5 | 6 | echo "Dropping all data" 7 | rm -rf ./greptimedb_data 8 | -------------------------------------------------------------------------------- /postgresql/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.postgresql.org/download/linux/ubuntu/ 4 | 5 | sudo apt-get update 6 | sudo apt-get install -y postgresql-common postgresql-16 7 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.errors: -------------------------------------------------------------------------------- 1 | `Invalid Input Error: Malformed JSON at byte 3 of input: unexpected content after document. 
Input: ":"This user is a Sable!","lang":"en","name":"S..."` -------------------------------------------------------------------------------- /victorialogs/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | block_stats | sum(bloom_bytes) index_bytes | keep index_bytes" 4 | -------------------------------------------------------------------------------- /victorialogs/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping VictoriaLogs" 4 | pidof victoria-logs-prod && kill `pidof victoria-logs-prod` 5 | 6 | echo "Dropping all data" 7 | rm -rf victoria-logs-data 8 | -------------------------------------------------------------------------------- /singlestore/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # stop and remove all docker containers 4 | docker stop $(docker ps -a -q) 5 | docker rm $(docker ps -a -q) 6 | 7 | sudo apt-get remove -y mysql-client 8 | sudo snap remove --purge docker 9 | -------------------------------------------------------------------------------- /clickhouse/start.sh: -------------------------------------------------------------------------------- 1 | pidof clickhouse > /dev/null && exit 1 2 | 3 | ./clickhouse server > /dev/null 2>&1 & 4 | 5 | sleep 5 6 | 7 | while true 8 | do 9 | ./clickhouse client --query "SELECT 1" && break 10 | sleep 1 11 | done 12 | 13 | -------------------------------------------------------------------------------- /mongodb/ddl.js: -------------------------------------------------------------------------------- 1 | db.createCollection( 2 | "bluesky", 3 | { storageEngine: { wiredTiger: { configString: "block_compressor=zstd" } } } 4 | ); 5 | 6 | db.bluesky.createIndex({"kind": 1, "commit.operation": 1, "commit.collection": 1, "did": 1, "time_us": 1}); -------------------------------------------------------------------------------- /victorialogs/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Do we run already? 
4 | pidof victoria-logs-prod >/dev/null && exit 1 5 | 6 | echo "Starting VictoriaLogs" 7 | ./victoria-logs-prod -loggerOutput=stdout -retentionPeriod=20y -search.maxQueryDuration=5m > server.log & 8 | -------------------------------------------------------------------------------- /victorialogs/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | block_stats | sum(values_bytes) values_bytes, sum(dict_bytes) dict_bytes | math values_bytes + dict_bytes data_bytes | keep data_bytes" 4 | -------------------------------------------------------------------------------- /greptimedb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=select count(*) as cnt from bluesky" \ 6 | -d "format=json" \ 7 | | grep -o "cnt\":[0-9]*" | sed 's/cnt\"://g' -------------------------------------------------------------------------------- /victorialogs/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RELEASE_VERSION=v1.17.0-victorialogs 4 | 5 | wget -N https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${RELEASE_VERSION}/victoria-logs-linux-amd64-${RELEASE_VERSION}.tar.gz 6 | tar xzf victoria-logs-linux-amd64-${RELEASE_VERSION}.tar.gz 7 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 135176827 11 | } 12 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | "total_size": 1354902507 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | "total_size": 4858741288 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 480778277 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1m.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 208385826 11 | } 12 | -------------------------------------------------------------------------------- /victorialogs/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | block_stats | sum(values_bytes) values_bytes, sum(dict_bytes) dict_bytes, sum(bloom_bytes) bloom_bytes | math values_bytes + dict_bytes + bloom_bytes total_bytes | keep total_bytes" 4 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 134117979655 11 | } 12 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 13372936569 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 482108809691 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 47813179260 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 20591959778 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | 
"total_size": 2084888024 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | "total_size": 1269817486 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 126734406 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 206562787263 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 123797963671 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 12245368182 11 | } 12 | -------------------------------------------------------------------------------- /mongodb/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | DB_NAME="$1" 10 | 11 | echo "Dropping database: $DB_NAME" 12 | 13 | mongosh --eval "use $DB_NAME" --eval "db.dropDatabase()" 14 | -------------------------------------------------------------------------------- /postgresql/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | echo "Dropping database" 13 | sudo -u postgres psql -t -c "DROP DATABASE $DB_NAME" 14 | -------------------------------------------------------------------------------- /postgresql/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 
| if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_table_size('$TABLE_NAME')" -------------------------------------------------------------------------------- /clickhouse/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --database="$DB_NAME" --query "SELECT count() FROM '$TABLE_NAME';" 14 | -------------------------------------------------------------------------------- /postgresql/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_total_relation_size('$TABLE_NAME')" -------------------------------------------------------------------------------- /duckdb/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DATABASE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | 12 | echo "Dropping database: $DATABASE_NAME" 13 | 14 | rm -f ~/${DATABASE_NAME} 15 | rm -f ~/${DATABASE_NAME}-c 16 | -------------------------------------------------------------------------------- /postgresql/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Count the rows in the table 14 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT count(*) from $TABLE_NAME" -------------------------------------------------------------------------------- /postgresql/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | data JSONB COMPRESSION lz4 NOT NULL 3 | ); 4 | 5 | CREATE INDEX idx_bluesky 6 | ON bluesky ( 7 | (data ->> 'kind'), 8 | (data -> 'commit' ->> 'operation'), 9 | (data -> 'commit' ->> 'collection'), 10 | (data ->> 'did'), 11 | (TO_TIMESTAMP((data ->> 'time_us')::BIGINT / 1000000.0)) 12 | ); -------------------------------------------------------------------------------- /doris/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://apache-doris-releases.oss-accelerate.aliyuncs.com/${DORIS_FULL_NAME}.tar.gz 4 | mkdir ${DORIS_FULL_NAME} 5 | tar -xvf ${DORIS_FULL_NAME}.tar.gz --strip-components 1 -C ${DORIS_FULL_NAME} 6 | 7 | sudo apt-get update 8 | sudo apt-get install -y mysql-client openjdk-17-jre-headless # somehow _EXACTLY_ v17 is needed 9 | -------------------------------------------------------------------------------- /postgresql/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DB_NAME>" 6 | exit 1 7 | fi 8 | 9 | #
Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_relation_size(oid) FROM pg_class WHERE relname = 'idx_bluesky'" -------------------------------------------------------------------------------- /duckdb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DATABASE_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Fetch the count using DuckDB 14 | duckdb ~/$DATABASE_NAME -c "select count() from '$TABLE_NAME';" 15 | 16 | -------------------------------------------------------------------------------- /clickhouse/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --query "SELECT sum(bytes_on_disk) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" 14 | -------------------------------------------------------------------------------- /clickhouse/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --query "SELECT sum(data_compressed_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" 14 | -------------------------------------------------------------------------------- /clickhouse/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping ClickHouse" 4 | pidof clickhouse && kill -9 `pidof clickhouse` 5 | 6 | # 'DROP TABLE' has a built-in safety mechanism that prevents users from dropping large tables. We hit that with large 7 | # amounts of ingested data. Instead, make our lives easy and remove the persistence manually.
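# (The built-in guard is the server-level max_table_size_to_drop setting, 50 GB by default; it could also be bypassed by raising that setting or creating the flags/force_drop_table file, but wiping the on-disk state is the simplest full reset for the benchmark.)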
8 | echo "Dropping all data" 9 | rm -rf data/ metadata/ store/ 10 | -------------------------------------------------------------------------------- /clickhouse/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --query "SELECT sum(primary_key_size) + sum(marks_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" 14 | -------------------------------------------------------------------------------- /doris/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "ANALYZE TABLE $TABLE_NAME WITH SYNC" 14 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SHOW DATA FROM $TABLE_NAME" 15 | -------------------------------------------------------------------------------- /greptimedb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RELEASE_VERSION=v0.13.0-nightly-20250315 4 | 5 | # download greptimedb 6 | wget -N "https://github.com/GreptimeTeam/greptimedb/releases/download/${RELEASE_VERSION}/greptime-linux-amd64-${RELEASE_VERSION}.tar.gz" 7 | tar xzf greptime-linux-amd64-${RELEASE_VERSION}.tar.gz 8 | mv greptime-linux-amd64-${RELEASE_VERSION}/greptime ./ 9 | rm -rf greptime-linux-amd64-${RELEASE_VERSION} 10 | -------------------------------------------------------------------------------- /singlestore/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT count(*) FROM $DB_NAME.$TABLE_NAME" 17 | -------------------------------------------------------------------------------- /doris/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in starrocks/count.sh. 
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | 15 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;" 16 | -------------------------------------------------------------------------------- /greptimedb/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=SELECT sum(r.sst_size) as data_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \ 6 | -d "format=json" \ 7 | | grep -o "data_size\":[0-9]*" | sed 's/data_size\"://g' 8 | -------------------------------------------------------------------------------- /greptimedb/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=SELECT sum(r.index_size) as index_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \ 6 | -d "format=json" \ 7 | | grep -o "index_size\":[0-9]*" | sed 's/index_size\"://g' 8 | -------------------------------------------------------------------------------- /greptimedb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=SELECT sum(r.disk_size) as total_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \ 6 | -d "format=json" \ 7 | | grep -o "total_size\":[0-9]*" | sed 's/total_size\"://g' 8 | -------------------------------------------------------------------------------- /starrocks/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change doris/count.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | 15 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;" 16 | -------------------------------------------------------------------------------- /singlestore/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <ROOT_PASSWORD> <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | ROOT_PASSWORD="$1" 10 | DB_NAME="$2" 11 | TABLE_NAME="$3" 12 | 13 | echo "Dropping table: $DB_NAME.$TABLE_NAME" 14 | 15 | export MYSQL_PWD=${ROOT_PASSWORD} 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "DROP DATABASE IF EXISTS $DB_NAME" 17 | -------------------------------------------------------------------------------- /starrocks/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change doris/total_size.sh.
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | 15 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SHOW DATA FROM $TABLE_NAME" 16 | -------------------------------------------------------------------------------- /doris/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change starrocks/drop_table.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | DB_NAME="$1" 12 | TABLE_NAME="$2" 13 | 14 | echo "Dropping table: $DB_NAME.$TABLE_NAME" 15 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "DROP TABLE IF EXISTS $TABLE_NAME" 16 | -------------------------------------------------------------------------------- /starrocks/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | `id` BIGINT AUTO_INCREMENT, 3 | `data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON", 4 | 5 | sort_key VARBINARY AS encode_sort_key( 6 | get_json_string(data, 'kind'), 7 | get_json_string(data, 'commit.operation'), 8 | get_json_string(data, 'commit.collection'), 9 | get_json_string(data, 'did') 10 | ) 11 | ) 12 | ORDER BY (sort_key); 13 | -------------------------------------------------------------------------------- /elasticsearch/config/ilm.json: -------------------------------------------------------------------------------- 1 | { 2 | "policy": { 3 | "phases": { 4 | "hot": { 5 | "min_age": "0ms", 6 | "actions": { 7 | "rollover": { 8 | "max_age": "30d", 9 | "max_primary_shard_size": "50gb" 10 | }, 11 | "forcemerge": { 12 | "max_num_segments": 1 13 | }, 14 | "readonly": {} 15 | } 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /starrocks/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change doris/drop_table.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | DB_NAME="$1" 12 | TABLE_NAME="$2" 13 | 14 | echo "Dropping table: $DB_NAME.$TABLE_NAME" 15 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME" 16 | -------------------------------------------------------------------------------- /_files_json/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DIRECTORY>" 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist."
15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /_files_lz4/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DIRECTORY>" 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /_files_zstd/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DIRECTORY>" 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /singlestore/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <ROOT_PASSWORD> <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT sum(memory_use) FROM information_schema.index_statistics WHERE database_name = '$DB_NAME' AND table_name = '$TABLE_NAME'" 17 | -------------------------------------------------------------------------------- /doris/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64" 4 | sudo sysctl -w vm.max_map_count=2000000 5 | sudo sh -c "ulimit -n 655350" 6 | 7 | ${DORIS_FULL_NAME}/be/bin/start_be.sh --daemon 8 | ${DORIS_FULL_NAME}/fe/bin/start_fe.sh --daemon 9 | 10 | echo "Sleep 30 sec to wait for Doris to start" 11 | sleep 30s 12 | 13 | mysql -P 9030 -h 127.0.0.1 -u root -e "ALTER SYSTEM ADD BACKEND \"127.0.0.1:9050\";" 14 | 15 | echo "Sleep 10 sec to wait for the frontend to connect to the backend" 16 | sleep 10s 17 | -------------------------------------------------------------------------------- /singlestore/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <ROOT_PASSWORD> <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT sum(compressed_size) FROM information_schema.columnar_segments WHERE database_name = '$DB_NAME' AND table_name = '$TABLE_NAME'" 17 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1m.json:
-------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 485490688, 13 | "result": [ 14 | [3.554,0.112,0.111], 15 | [3.441,0.324,0.321], 16 | [2.921,0.329,0.339], 17 | [2.961,0.255,0.255], 18 | [3.131,0.262,0.262] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /singlestore/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | data JSON 4 | ); 5 | -- Notes: 6 | -- - Not using data structures to speed up scans. In SingleStore, no sort keys or indexes can be created on JSON sub-columns. 7 | -- - The only physical optimization we use is 'use_seekable_json' but that is implicitly on: https://docs.singlestore.com/db/v8.9/create-a-database/columnstore/columnstore-seekability-using-json/ 8 | -- - We _could_ run OPTIMIZE to force a merge but since we are also not doing this for other benchmarked databases, we omit that. 9 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999997, 12 | "total_size": 2043330691, 13 | "result": [ 14 | [0.09,0.03,0.03], 15 | [0.50,0.22,0.19], 16 | [0.12,0.06,0.05], 17 | [0.10,0.04,0.04], 18 | [0.10,0.04,0.04] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 207987146, 13 | "result": [ 14 | [0.14,0.05,0.05], 15 | [0.21,0.06,0.06], 16 | [0.34,0.05,0.05], 17 | [0.11,0.03,0.03], 18 | [0.09,0.03,0.03] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9700000, 12 | "total_size": 4753981440, 13 | "result": [ 14 | [36.379,0.612,0.609], 15 | [36.357,1.611,1.608], 16 | [36.310,1.551,1.561], 17 | [36.337,1.028,1.108], 18 | [36.372,1.113,1.118] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | 
"os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999984, 12 | "total_size": 18193481465, 13 | "result": [ 14 | [0.17,0.09,0.09], 15 | [7.14,1.07,1.04], 16 | [1.13,0.21,0.22], 17 | [0.18,0.12,0.12], 18 | [0.19,0.13,0.14] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /victorialogs/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | QUERY_NUM=1 4 | 5 | set -f 6 | cat queries.logsql | while read -r query; do 7 | 8 | # Print the query 9 | echo "------------------------------------------------------------------------------------------------------------------------" 10 | echo "Result for query Q$QUERY_NUM:" 11 | echo 12 | 13 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=$query" 14 | 15 | # Increment the query number 16 | QUERY_NUM=$((QUERY_NUM + 1)) 17 | done; 18 | -------------------------------------------------------------------------------- /duckdb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Fetch the total size using duckDB 14 | duckdb ~/$DATABASE_NAME -c "select '$TABLE_NAME' as table_name, count(distinct block_id) as num_blocks, count(distinct block_id) * (select block_size from pragma_database_size()) as num_bytes from pragma_storage_info('$TABLE_NAME') group by all;" 15 | 16 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 997999662, 12 | "total_size": 192981470543, 13 | "result": [ 14 | [0.69,0.52,0.51], 15 | [74.01,7.27,7.18], 16 | [20.58,1.51,1.45], 17 | [15.86,1.03,1.01], 18 | [1.18,1.10,1.11] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 98700000, 12 | "total_size": 47184347136, 13 | "result": [ 14 | [367.536,5.485,5.487], 15 | [367.771,15.209,15.225], 16 | [367.548,13.420,13.357], 17 | [367.689,7.544,7.576], 18 | [367.900,8.177,8.120] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | 
"retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 400974094, 13 | "result": [ 14 | [0.039,0.036,0.036], 15 | [0.344,0.303,0.305], 16 | [0.171,0.166,0.159], 17 | [0.047,0.047,0.049], 18 | [0.056,0.056,0.056] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999998, 12 | "total_size": 3785308904, 13 | "result": [ 14 | [0.286,0.290,0.287], 15 | [2.487,2.367,2.406], 16 | [1.747,1.671,1.656], 17 | [0.368,0.360,0.364], 18 | [0.423,0.424,0.422] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /singlestore/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | # No indexes are used, same query as in data_size.sh 17 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT sum(compressed_size) FROM information_schema.columnar_segments WHERE database_name = '$DB_NAME' AND table_name = '$TABLE_NAME'" 18 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999998, 12 | "total_size": 2834172690, 13 | "result": [ 14 | [0.270,0.263,0.275], 15 | [2.942,2.683,2.655], 16 | [2.014,2.008,2.037], 17 | [0.414,0.412,0.437], 18 | [0.562,0.470,0.463] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 400948257, 13 | "result": [ 14 | [0.041,0.037,0.035], 15 | [0.426,0.321,0.323], 16 | [0.192,0.186,0.213], 17 | [0.056,0.052,0.053], 18 | [0.099,0.061,0.060] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | 
"tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999101, 12 | "total_size": 386099682721, 13 | "result": [ 14 | [3.854,3.884,4.081], 15 | [37.078,29.084,28.548], 16 | [24.382,24.279,23.570], 17 | [8.106,8.228,8.080], 18 | [9.208,8.994,9.084] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999947, 12 | "total_size": 34182479705, 13 | "result": [ 14 | [2.765,2.718,2.799], 15 | [20.788,20.822,20.270], 16 | [16.306,16.642,15.693], 17 | [2.454,2.461,2.423], 18 | [2.761,2.768,2.784] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999947, 12 | "total_size": 21268479403, 13 | "result": [ 14 | [2.532,2.536,2.486], 15 | [23.194,22.932,23.188], 16 | [19.521,19.321,19.159], 17 | [2.867,2.791,2.884], 18 | [3.099,3.136,3.171] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 207785820, 13 | "data_size": 207785820, 14 | "result": [ 15 | [0.03,0.02,0.02], 16 | [0.06,0.04,0.05], 17 | [0.04,0.02,0.03], 18 | [0.03,0.02,0.02], 19 | [0.03,0.02,0.02] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999998998, 12 | "total_size": 235840659266, 13 | "result": [ 14 | [5.022,5.019,5.078], 15 | [51.486,45.510,45.713], 16 | [41.789,41.359,41.608], 17 | [8.807,8.812,8.711], 18 | [9.696,9.723,9.533] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 
8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 954368, 13 | "data_size": 4096, 14 | "index_size": 950272, 15 | "result": [ 16 | [1.324,1.338,1.341], 17 | [1.815,1.823,1.832], 18 | [1.555,1.603,1.532], 19 | [0.249,0.256,0.266], 20 | [0.267,0.278,0.276] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 10000000, 12 | "total_size": 2170032226, 13 | "data_size": 2170032226, 14 | "result": [ 15 | [0.05,0.04,0.04], 16 | [0.58,0.12,0.12], 17 | [0.06,0.04,0.05], 18 | [0.04,0.03,0.04], 19 | [0.04,0.04,0.04] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 974400000, 12 | "total_size": 472599756800, 13 | "result": [ 14 | [3734.026,3722.939,3717.611], 15 | [3737.451,3726.788,3721.045], 16 | [3734.092,3722.939,3717.631], 17 | [3737.381,3724.588,3719.273], 18 | [3737.908,3726.648,3722.804] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 100000000, 12 | "total_size": 21304111530, 13 | "data_size": 21304111530, 14 | "result": [ 15 | [0.23,0.19,0.18], 16 | [9.39,0.77,0.77], 17 | [1.12,0.19,0.19], 18 | [0.17,0.15,0.17], 19 | [0.19,0.17,0.16] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999994, 12 | "total_size": 214623810748, 13 | "data_size": 214623810748, 14 | "result": [ 15 | [2.02,1.63,1.63], 16 | [96.02,6.61,6.64], 17 | [21.89,2.15,1.78], 18 | [10.26,0.95,0.93], 19 | [1.04,1.03,1.04] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /mongodb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ 4 | 5 | sudo sudo apt-get install 
gnupg curl 6 | curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ 7 | sudo gpg --dearmor --yes -o /usr/share/keyrings/mongodb-server-8.0.gpg 8 | echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu noble/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list 9 | sudo apt-get update 10 | sudo apt-get install -y mongodb-org 11 | sudo systemctl start mongod 12 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 98534792, 13 | "data_size": 98424457, 14 | "index_size": 110328, 15 | "result": [ 16 | [0.007, 0.045, 0.004], 17 | [0.042, 0.035, 0.022], 18 | [0.022, 0.013, 0.012], 19 | [0.033, 0.017, 0.017], 20 | [0.041, 0.020, 0.019] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 7860000, 12 | "total_size": 1536905216, 13 | "data_size": 1171361792, 14 | "index_size": 365543424, 15 | "result": [ 16 | [10.334,10.266,10.298], 17 | [37.401,36.807,37.979], 18 | [13.209,12.799,12.889], 19 | [2.071,2.029,2.119], 20 | [2.165,2.076,2.119] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999994, 12 | "total_size": 1001159528, 13 | "data_size": 1000084752, 14 | "index_size": 1074709, 15 | "result": [ 16 | [0.010, 0.049, 0.006], 17 | [0.151, 0.107, 0.102], 18 | [0.051, 0.036, 0.037], 19 | [0.079, 0.043, 0.040], 20 | [0.075, 0.046, 0.045] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999968, 12 | "total_size": 9662241335, 13 | "data_size": 9645940572, 14 | "index_size": 16300557, 15 | "result": [ 16 | [0.077, 0.247, 0.027], 17 | [2.322, 0.404, 0.402], 18 | [2.680, 0.214, 0.220], 19 | [0.234, 0.081, 0.079], 20 | [0.513, 0.088, 0.091] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- 
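# Each "result" array in these result files holds the query runtimes in seconds: one row per benchmark query (Q1-Q5), three runs per row. The run_queries.sh scripts drop the Linux file system cache before every query, so the first value of a row is typically a cold run and the remaining two are warm runs.
# A minimal sketch (not a script shipped in this repository) for pulling the fastest run of each query out of a result file, assuming jq is installed:
#
#   jq -r '.result[] | min' results/m6i.8xlarge_bluesky_100m.json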
/greptimedb/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 112555783, 13 | "data_size": 112546830, 14 | "index_size": 7048, 15 | "result": [ 16 | [0.148, 0.012, 0.012], 17 | [0.184, 0.064, 0.064], 18 | [0.116, 0.014, 0.024], 19 | [0.111, 0.016, 0.014], 20 | [0.122, 0.017, 0.034] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999258, 12 | "total_size": 99560268152, 13 | "data_size": 99068986216, 14 | "index_size": 491281201, 15 | "result": [ 16 | [0.492, 0.617, 0.225], 17 | [16.298, 3.241, 3.236], 18 | [31.711, 2.136, 2.174], 19 | [6.985, 0.480, 0.479], 20 | [7.393, 0.518, 0.514] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /greptimedb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999994, 12 | "total_size": 1136875364, 13 | "data_size": 1136814387, 14 | "index_size": 55651, 15 | "result": [ 16 | [0.172, 0.029, 0.028], 17 | [0.46, 0.386, 0.421], 18 | [0.146, 0.044, 0.034], 19 | [0.121, 0.023, 0.023], 20 | [0.156, 0.064, 0.051] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 94408000, 12 | "total_size": 19315617792, 13 | "data_size": 13590446080, 14 | "index_size": 5725171712, 15 | "result": [ 16 | [113.722,108.665,108.909], 17 | [1551.96,1563.52,1542.5], 18 | [141.132,138.307,140.456], 19 | [21.948,21.717,21.424], 20 | [23.103,22.574,22.302] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | "version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 121728495, 13 | "data_size": 108413431, 14 | 
"index_size": 13315064, 15 | "result": [ 16 | [0.074, 0.006, 0.006], 17 | [0.128, 0.069, 0.046], 18 | [0.082, 0.015, 0.015], 19 | [0.094, 0.021, 0.017], 20 | [0.126, 0.052, 0.056] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /greptimedb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999968, 12 | "total_size": 10824010669, 13 | "data_size": 10823634560, 14 | "index_size": 348813, 15 | "result": [ 16 | [0.5, 0.167, 0.164], 17 | [10.711, 3.362, 3.324], 18 | [0.314, 0.225, 0.228], 19 | [0.146, 0.069, 0.068], 20 | [0.308, 0.22, 0.206] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | "version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999994, 12 | "total_size": 1242217952, 13 | "data_size": 1108712800, 14 | "index_size": 133505152, 15 | "result": [ 16 | [0.086, 0.007, 0.007], 17 | [0.47, 0.241, 0.329], 18 | [0.095, 0.027, 0.028], 19 | [0.199, 0.142, 0.128], 20 | [0.211, 0.148, 0.176] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 7000000, 12 | "total_size": 5794693120, 13 | "data_size": 4653178880, 14 | "index_size": 1141514240, 15 | "result": [ 16 | [35.1545,0.949281,0.949623], 17 | [51.8718,9.55589,9.57942], 18 | [36.1771,2.60652,2.59737], 19 | [175.424,1.99499,1.93142], 20 | [176.353,2.1341,2.11399] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 731111424, 13 | "data_size": 586768384, 14 | "index_size": 144343040, 15 | "result": [ 16 | [4.08449,0.133099,0.134603], 17 | [30.3611,2.15746,2.11889], 18 | [4.13365,0.35889,0.357372], 19 | [15.6849,0.487806,0.224725], 20 | [16.1242,0.251007,0.249576] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | 
"version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999968, 12 | "total_size": 11984344799, 13 | "data_size": 10684518543, 14 | "index_size": 1299826256, 15 | "result": [ 16 | [0.097, 0.009, 0.009], 17 | [9.057, 2.721, 2.406], 18 | [0.474, 0.151, 0.148], 19 | [0.542, 0.464, 0.453], 20 | [0.735, 0.623, 0.664] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 893632990, 12 | "total_size": 176737361920, 13 | "data_size": 131007311872, 14 | "index_size": 45730050048, 15 | "result": [ 16 | [1472.45,1109.15,1054.37], 17 | [20715.1,20484.9,20461.8], 18 | [1218.37,1216.24,1217.69], 19 | [169.069,170.265,168.797], 20 | [174.406,173.932,173.268] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 804000000, 12 | "total_size": 660356890624, 13 | "data_size": 512144687104, 14 | "index_size": 148180910080, 15 | "result": [ 16 | [3904.83,3884.18,3884.17], 17 | [32594.9,32590.5,4277.8], 18 | [4249.8,4253.34,4927.79], 19 | [4903.66,4907.09,4947.12], 20 | [4922.98,4913.5,4928.3] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 91000000, 12 | "total_size": 69064736768, 13 | "data_size": 54598713344, 14 | "index_size": 14465753088, 15 | "result": [ 16 | [416.392,10.3372,10.3301], 17 | [1868.87,1458.26,1457.45], 18 | [440.283,33.0992,33.1468], 19 | [477.82,17.7637,17.3674], 20 | [476.837,18.9056,18.4372] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /greptimedb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999233, 12 | "total_size": 108808568186, 13 | "data_size": 108803782584, 14 | "index_size": 4656890, 15 | "result": [ 16 | [13.643, 1.568, 1.553], 17 | [93.269, 22.235, 21.625], 18 | 
[2.176, 2.081, 2.086], 19 | [0.507, 0.427, 0.439], 20 | [1.694, 1.614, 1.606] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | "version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999241, 12 | "total_size": 121860870843, 13 | "data_size": 108388327979, 14 | "index_size": 13472542864, 15 | "result": [ 16 | [1.076, 0.029, 0.028], 17 | [98.958, 20.591, 19.794], 18 | [14.423, 1.607, 1.609], 19 | [5.065, 5.185, 5.216], 20 | [7.22, 6.963, 7.351] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /duckdb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | duckdb ~/$DATABASE_NAME -c "$query" 21 | 22 | # Increment the query number 23 | QUERY_NUM=$((QUERY_NUM + 1)) 24 | done; 25 | -------------------------------------------------------------------------------- /postgresql/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | sudo -u postgres psql -d "$DB_NAME" -c "$query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /singlestore/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 4 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | RESULT_FILE_RUNTIMES="$3" 13 | RESULT_FILE_MEMORY_USAGE="$4" 14 | 15 | # Construct the query log file name using $DB_NAME 16 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 17 | 18 | # Print the database name 19 | echo "Running queries on database: $DB_NAME" 20 | 21 | # Run queries and log the output 22 | ./run_queries.sh "$ROOT_PASSWORD" "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 23 | -------------------------------------------------------------------------------- /postgresql/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: 
$0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | 15 | cat queries.sql | while read -r query; do 16 | 17 | # Print the query number 18 | echo "------------------------------------------------------------------------------------------------------------------------" 19 | echo "Index usage for query Q$QUERY_NUM:" 20 | echo 21 | 22 | sudo -u postgres psql -d "$DB_NAME" -t -c "EXPLAIN $query" 23 | 24 | # Increment the query number 25 | QUERY_NUM=$((QUERY_NUM + 1)) 26 | 27 | done; -------------------------------------------------------------------------------- /duckdb/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | 21 | duckdb ~/$DATABASE_NAME -c "EXPLAIN $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /clickhouse/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Index usage for query Q$QUERY_NUM:" 19 | echo 20 | 21 | ./clickhouse client --database="$DB_NAME" --query="EXPLAIN indexes=1 $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /clickhouse/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | 21 | ./clickhouse client --database="$DB_NAME" --query="EXPLAIN PIPELINE $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /clickhouse/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo 
"------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | ./clickhouse client --database="$DB_NAME" --format=PrettyCompactMonoBlock --query="$query" --progress 0 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /mongodb/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.storageSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /mongodb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 
23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /starrocks/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "EXPLAIN $query" 21 | 22 | # Increment the query number 23 | QUERY_NUM=$((QUERY_NUM + 1)) 24 | done; 25 | -------------------------------------------------------------------------------- /mongodb/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalIndexSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /elasticsearch/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | echo $(curl -s -k -X GET "https://localhost:9200/${INDEX_NAME}/_count" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' | jq '.count') 19 | -------------------------------------------------------------------------------- /elasticsearch/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! 
-f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Get data size 19 | curl -k -XGET "https://localhost:9200/_data_stream/${INDEX_NAME}/_stats?human" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -------------------------------------------------------------------------------- /postgresql/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | sudo -u postgres psql -d "$DB_NAME" -t -c '\timing' -c "$query" | grep 'Time' 28 | done; 29 | done; -------------------------------------------------------------------------------- /singlestore/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | 13 | export MYSQL_PWD=${ROOT_PASSWORD} 14 | 15 | QUERY_NUM=1 16 | 17 | cat queries.sql | while read -r query; do 18 | 19 | # Print the query 20 | echo "------------------------------------------------------------------------------------------------------------------------" 21 | echo "Result for query Q$QUERY_NUM:" 22 | echo 23 | 24 | mysql -h 127.0.0.1 -P 3306 -u root -D $DB_NAME -e "$query" 25 | 26 | # Increment the query number 27 | QUERY_NUM=$((QUERY_NUM + 1)) 28 | done; 29 | -------------------------------------------------------------------------------- /singlestore/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | LICENSE_KEY="$1" 11 | ROOT_PASSWORD="$2" 12 | 13 | sudo snap install docker 14 | sudo apt-get update 15 | sudo apt-get install -y mysql-client 16 | 17 | docker run -i --init \ 18 | --name singlestore-ciab \ 19 | -e LICENSE_KEY="${LICENSE_KEY}" \ 20 | -e ROOT_PASSWORD="${ROOT_PASSWORD}" \ 21 | -p 3306:3306 -p 8080:8080 \ 22 | singlestore/cluster-in-a-box 23 | 24 | docker start singlestore-ciab 25 | 26 | while true 27 | do 28 | mysql -h 127.0.0.1 -P 3306 -u root --password="${ROOT_PASSWORD}" -e 'SELECT 1' && break 29 | sleep 1 30 | done 31 | -------------------------------------------------------------------------------- /starrocks/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo snap install docker 4 | sudo apt-get update 5 | sudo apt-get install -y mysql-client 6 | 7 | docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu:4.0.1 8 | 9 | echo "Starting StarRocks container..." 
10 | sleep 5 11 | 12 | # Monitor logs until "Enjoy" appears 13 | echo "Monitoring container logs for 'Enjoy' message..." 14 | timeout 300 docker logs -f starrocks | while read line; do 15 | echo "$line" 16 | if echo "$line" | grep -q "Enjoy"; then 17 | echo "Found 'Enjoy' message! Container is ready." 18 | # Kill the docker logs process 19 | pkill -f "docker logs -f starrocks" 20 | break 21 | fi 22 | done 23 | 24 | echo "StarRocks started successfully." 25 | -------------------------------------------------------------------------------- /victorialogs/queries.logsql: -------------------------------------------------------------------------------- 1 | * | by (commit.collection) count() count | sort (count desc) 2 | {kind=commit,commit.operation=create} | by (commit.collection) count() count, count_uniq(did) users | sort (count desc) 3 | {kind=commit,commit.operation=create,commit.collection=~"app\\.bsky\\.feed\\.(post|repost|like)"} | math floor(_time/1h)%24 hour_of_day | by (commit.collection, hour_of_day) count() count | sort (hour_of_day, commit.collection) 4 | {kind=commit,commit.operation=create,commit.collection=app.bsky.feed.post} | by (did) min(_time) first_post_ts | first 3 (first_post_ts) 5 | {kind=commit,commit.operation=create,commit.collection=app.bsky.feed.post} | by (did) min(_time) tmin, max(_time) tmax | math round((tmax-tmin)/1e6) activity_span | keep did, activity_span | first 3 (activity_span desc) 6 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 7000000, 12 | "total_size": 1943790266, 13 | "data_size": 1943790266, 14 | "index_size": 0, 15 | "result": [ 16 | [0.658, 0.444, 0.494], 17 | [2.831, 2.647, 2.836], 18 | [1.437, 1.456, 1.124], 19 | [1.253, 0.412, 0.434], 20 | [1.347, 0.553, 0.587] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 275466582, 13 | "data_size": 275466582, 14 | "index_size": 0, 15 | "result": [ 16 | [0.206, 0.083, 0.098], 17 | [0.615, 0.544, 0.452], 18 | [0.366, 0.268, 0.149], 19 | [0.259, 0.111, 0.106], 20 | [0.324, 0.167, 0.171] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /clickhouse/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | `data` JSON( 4 | max_dynamic_paths = 0, 5 | kind LowCardinality(String), 6 | 
commit.operation LowCardinality(String), 7 | commit.collection LowCardinality(String), 8 | did String, 9 | time_us UInt64) CODEC(ZSTD(1)) 10 | ) 11 | ORDER BY ( 12 | data.kind, 13 | data.commit.operation, 14 | data.commit.collection, 15 | data.did, 16 | fromUnixTimestamp64Micro(data.time_us)) 17 | -- Below settings are planned to be default soon 18 | SETTINGS object_serialization_version = 'v3', 19 | dynamic_serialization_version = 'v3', 20 | object_shared_data_serialization_version = 'advanced', 21 | object_shared_data_serialization_version_for_zero_level_parts='map_with_buckets' 22 | -------------------------------------------------------------------------------- /clickhouse/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | ./clickhouse client --database="$DB_NAME" --time --memory-usage --format=Null --query="$query" --progress 0 28 | done; 29 | done; 30 | -------------------------------------------------------------------------------- /singlestore/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | 13 | export MYSQL_PWD=${ROOT_PASSWORD} 14 | 15 | QUERY_NUM=1 16 | 17 | cat queries.sql | while read -r query; do 18 | 19 | # Print the query number 20 | echo "------------------------------------------------------------------------------------------------------------------------" 21 | echo "Physical query plan for query Q$QUERY_NUM:" 22 | echo 23 | 24 | mysql -h 127.0.0.1 -P 3306 -u root $DB_NAME -e "EXPLAIN $query" 25 | 26 | # Increment the query number 27 | QUERY_NUM=$((QUERY_NUM + 1)) 28 | done; 29 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 91000000, 12 | "total_size": 25757172429, 13 | "data_size": 25757172429, 14 | "index_size": 0, 15 | "result": [ 16 | [6.471, 5.066, 5.606], 17 | [37.551, 25.246, 25.060], 18 | [14.634, 12.810, 12.786], 19 | [34.688, 3.691, 3.702], 20 | [34.735, 4.934, 4.713] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /doris/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | kind 
VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.kind')) NOT NULL, 3 | operation VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.commit.operation')) NULL, 4 | collection VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.commit.collection')) NULL, 5 | did VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data,'$.did')) NOT NULL, 6 | time DATETIME GENERATED ALWAYS AS (from_microsecond(get_json_bigint(data, '$.time_us'))) NOT NULL, 7 | `data` variant<'kind': string, 'commit.operation' : string, 'commit.collection' : string, 'did' : string, 'time_us' : bigint, properties("variant_max_subcolumns_count" = "1024")> NOT NULL 8 | ) 9 | DUPLICATE KEY (kind, operation, collection, did) 10 | PROPERTIES ("replication_num"="1"); 11 | -------------------------------------------------------------------------------- /greptimedb/pipeline.yaml: -------------------------------------------------------------------------------- 1 | processors: 2 | - epoch: 3 | fields: 4 | - time_us 5 | resolution: microsecond 6 | - simple_extract: 7 | fields: 8 | - commit, commit_collection 9 | key: "collection" 10 | ignore_missing: true 11 | - simple_extract: 12 | fields: 13 | - commit, commit_operation 14 | key: "operation" 15 | ignore_missing: true 16 | 17 | transform: 18 | - fields: 19 | - did 20 | type: string 21 | - fields: 22 | - kind 23 | - commit_collection 24 | - commit_operation 25 | type: string 26 | index: inverted 27 | tag: true 28 | - fields: 29 | - commit 30 | type: json 31 | on_failure: ignore 32 | - fields: 33 | - time_us 34 | type: epoch, us 35 | index: timestamp 36 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 811999990, 12 | "total_size": 234878938296, 13 | "data_size": 234878938296, 14 | "index_size": 0, 15 | "result": [ 16 | [51.599, 43.98, 51.557], 17 | [321.517, 207.62, 196.39], 18 | [125.113, 113.956, 111.105], 19 | [322.197, 32.433, 32.407], 20 | [326.151, 40.692, 40.644] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /elasticsearch/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install elasticsearch 4 | wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor --yes -o /usr/share/keyrings/elasticsearch-keyring.gpg 5 | sudo apt-get install apt-transport-https 6 | echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list 7 | sudo apt-get update && sudo apt-get install elasticsearch 8 | 9 | # Install filebeat 10 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-8.17.0-amd64.deb 11 | sudo dpkg -i filebeat-8.17.0-amd64.deb 12 | 13 | # Overwrite configuration files 14 | sudo cp config/elasticsearch.yml /etc/elasticsearch/elasticsearch.yml 15 | sudo cp config/jvm.options /etc/elasticsearch/jvm.options 
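# Optional sanity check (a sketch only, not part of the benchmark flow): once start.sh has started the service and
# exported ELASTIC_PASSWORD, the cluster info endpoint should respond over the same self-signed HTTPS listener that
# the other scripts in this directory use:
#
#   curl -s -k -u "elastic:${ELASTIC_PASSWORD}" https://localhost:9200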
16 | -------------------------------------------------------------------------------- /.github/workflows/generate-results.yml: -------------------------------------------------------------------------------- 1 | name: "Generate index.html" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | env: 14 | CI_COMMIT_MESSAGE: "[bot] update index.html" 15 | CI_COMMIT_AUTHOR: github 16 | steps: 17 | - uses: actions/checkout@v3 18 | - if: github.event.commits[0].message != env.CI_COMMIT_MESSAGE 19 | run: | 20 | bash generate-results.sh 21 | 22 | git config --global user.name "${{ env.CI_COMMIT_AUTHOR }}" 23 | git config --global user.email "${{ env.CI_COMMIT_AUTHOR }}@users.noreply.github.com" 24 | 25 | git add -A 26 | if git status | grep -q modified 27 | then 28 | git commit -m "${{ env.CI_COMMIT_MESSAGE }}" 29 | git push 30 | fi 31 | -------------------------------------------------------------------------------- /greptimedb/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Do we run already? 4 | pidof greptime >/dev/null && exit 1 5 | 6 | BASEDIR=greptimedb_data 7 | 8 | echo "Starting GreptimeDB" 9 | export GREPTIMEDB_STANDALONE__WAL__DIR="${BASEDIR}/wal" 10 | export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME="${BASEDIR}" 11 | export GREPTIMEDB_STANDALONE__LOGGING__DIR="${BASEDIR}/logs" 12 | export GREPTIMEDB_STANDALONE__LOGGING__APPEND_STDOUT=false 13 | export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB 14 | export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s 15 | ./greptime standalone start & 16 | 17 | while true 18 | do 19 | curl -s --fail -o /dev/null http://localhost:4000/health && break 20 | sleep 1 21 | done 22 | echo "Started GreptimeDB." 23 | 24 | # init pipeline 25 | curl -s -XPOST 'http://localhost:4000/v1/events/pipelines/jsonbench' -F 'file=@pipeline.yaml' 26 | echo -e "\nPipeline initialized." 27 | -------------------------------------------------------------------------------- /_files_gz/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define prefix for output files 10 | OUTPUT_PREFIX="${2:-_files_gz}" 11 | 12 | # Check if the data directory exists 13 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 14 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
15 | exit 1 16 | fi 17 | 18 | 19 | # 1m 20 | ./total_size.sh "$DATA_DIRECTORY" 1 | tee "${OUTPUT_PREFIX}_1m.total_size" 21 | 22 | # 10m 23 | ./total_size.sh "$DATA_DIRECTORY" 10 | tee "${OUTPUT_PREFIX}_10m.total_size" 24 | 25 | # 100m 26 | ./total_size.sh "$DATA_DIRECTORY" 100 | tee "${OUTPUT_PREFIX}_100m.total_size" 27 | 28 | # 1000m 29 | ./total_size.sh "$DATA_DIRECTORY" 1000 | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /mongodb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the document count using mongosh 14 | document_count=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const count = db.getCollection('$COLLECTION_NAME').stats().count 17 | print(count); 18 | ") 19 | 20 | # Debugging information 21 | echo "Database: $DATABASE_NAME" 22 | echo "Collection: $COLLECTION_NAME" 23 | echo "Document count: $document_count" 24 | 25 | # Print the result 26 | if [[ -z "$document_count" ]]; then 27 | echo "Error: Unable to fetch document count. Ensure the database and collection exist." 28 | exit 1 29 | else 30 | echo $document_count 31 | fi -------------------------------------------------------------------------------- /duckdb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | DUCKDB_CMD="duckdb $HOME/$DB_NAME" # tilda somehow doesn't work 13 | 14 | TRIES=3 15 | 16 | LOG_FILE="query_results.log" 17 | > "$LOG_FILE" 18 | 19 | cat queries.sql | while read -r query; do 20 | # Clear filesystem cache between queries. 21 | sync 22 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 23 | 24 | echo "Running query: $query" 25 | for i in $(seq 1 $TRIES); do 26 | # Run query with timer enabled and extract the real time. 
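# The query is fed to the DuckDB CLI through a here-document so that ".timer on" and the query run in one session;
# the CLI output, including the timing line that contains the "real" value, is appended to $LOG_FILE, and the most
# recent "real" value is then read back from the log (tac reverses the file so grep -m 1 finds the last occurrence first).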
27 | OUTPUT=$($DUCKDB_CMD <<EOF >> "$LOG_FILE" 28 | .timer on 29 | $query 30 | EOF 31 | ) 32 | REAL_TIME=$(tac "$LOG_FILE" | grep -m 1 -oP 'real\s+\K[\d.]+') 33 | echo "Real time: $REAL_TIME seconds" 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /elasticsearch/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <INDEX_NAME> [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | INDEX_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the index name 14 | echo "Running queries on index: $INDEX_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$INDEX_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi -------------------------------------------------------------------------------- /greptimedb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TRIES=3 4 | 5 | set -f 6 | cat queries.sql | while read -r query; do 7 | # Clear the Linux file system cache 8 | echo "Clearing file system cache..." 9 | sync 10 | echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 11 | 12 | # Print the query 13 | echo "Running query: $query" 14 | 15 | # Execute the query multiple times 16 | echo -n "[" 17 | for i in $(seq 1 $TRIES); do 18 | t_start=$(date +%s%3N) 19 | curl -s --fail http://localhost:4000/v1/sql --data-urlencode "sql=$query" > /dev/null 20 | exit_code=$? 21 | t_end=$(date +%s%3N) 22 | duration=$((t_end-t_start)) 23 | RES=$(awk "BEGIN {print $duration / 1000}" | tr ',' '.') 24 | [[ "$exit_code" == "0" ]] && echo -n "${RES}" || echo -n "null" 25 | [[ "$i" != $TRIES ]] && echo -n ", " 26 | done 27 | echo "]" 28 | 29 | done -------------------------------------------------------------------------------- /mongodb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 <DB_NAME> <COLLECTION_NAME> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | DATA_DIRECTORY="$3" 13 | NUM_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate arguments 18 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 19 | [[ !
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 20 | 21 | # Create database and execute DDL file 22 | mongosh --quiet --eval " 23 | db = db.getSiblingDB('$DB_NAME'); 24 | load('$DDL_FILE'); 25 | " 26 | 27 | echo "Loading data" 28 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$COLLECTION_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 29 | -------------------------------------------------------------------------------- /duckdb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the database name 14 | echo "Running queries on database: $DATABASE_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$DATABASE_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Real time: \d+\.\d+ seconds' | sed -r -e 's/Real time: ([0-9]+\.[0-9]+) seconds/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi 30 | -------------------------------------------------------------------------------- /victorialogs/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TRIES=3 4 | 5 | set -f 6 | cat queries.logsql | while read -r query; do 7 | 8 | # Clear the Linux file system cache 9 | echo "Clearing file system cache..." 10 | sync 11 | echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 12 | 13 | # Print the query 14 | echo "Running query: $query" 15 | 16 | # Execute the query multiple times 17 | echo -n "[" 18 | for i in $(seq 1 $TRIES); do 19 | t_start=$(date +%s%3N) 20 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=$query" > /dev/null 21 | exit_code=$? 22 | t_end=$(date +%s%3N) 23 | duration=$((t_end-t_start)) 24 | RES=$(awk "BEGIN {print $duration / 1000}" | tr ',' '.') 25 | [[ "$exit_code" == "0" ]] && echo -n "${RES}" || echo -n "null" 26 | [[ "$i" != $TRIES ]] && echo -n ", " 27 | done 28 | echo "]" 29 | 30 | done 31 | -------------------------------------------------------------------------------- /duckdb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | echo "Create database and execute DDL" 24 | duckdb ~/$DB_NAME < "$DDL_FILE" 25 | 26 | echo "Load data" 27 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 28 | -------------------------------------------------------------------------------- /doris/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT collection AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT collection AS event, COUNT(*) AS count, COUNT(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT collection AS event, HOUR(time) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT did AS user_id, MIN(time) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT did AS user_id, MILLISECONDS_DIFF(MAX(time),MIN(time)) AS activity_span FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /_files_gz/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | N="$2" 12 | 13 | # Validate the data directory 14 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 15 | echo "Error: Directory '$DATA_DIRECTORY' does not exist." 16 | exit 1 17 | fi 18 | 19 | # Validate N is a positive integer 20 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 21 | echo "Error: N must be a positive integer." 
22 | exit 1 23 | fi 24 | 25 | # Get the first N files sorted by filename and calculate their total size 26 | TOTAL_SIZE=$(ls -1 "$DATA_DIRECTORY" | sort | head -n "$N" | while read -r file; do 27 | filepath="$DATA_DIRECTORY/$file" 28 | if [[ -f "$filepath" ]]; then 29 | stat --format="%s" "$filepath" 30 | fi 31 | done | awk '{sum += $1} END {print sum}') 32 | 33 | # Output the total size in bytes 34 | echo $TOTAL_SIZE -------------------------------------------------------------------------------- /elasticsearch/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting ElasticSearch" 4 | sudo systemctl start elasticsearch.service 5 | 6 | echo "Resetting and export ElasticSearch password" 7 | export ELASTIC_PASSWORD=$(sudo /usr/share/elasticsearch/bin/elasticsearch-reset-password -s -a -b -u elastic) 8 | 9 | echo "Saving ElasticSearch password in local file" 10 | echo "ELASTIC_PASSWORD=$ELASTIC_PASSWORD" > .elastic_password 11 | 12 | echo "Generating API key for filebeat" 13 | curl -s -k -X POST "https://localhost:9200/_security/api_key" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d ' 14 | { 15 | "name": "filebeat", 16 | "role_descriptors": { 17 | "filebeat_writer": { 18 | "cluster": ["monitor", "read_ilm", "read_pipeline"], 19 | "index": [ 20 | { 21 | "names": ["bluesky-*"], 22 | "privileges": ["view_index_metadata", "create_doc", "auto_configure"] 23 | } 24 | ] 25 | } 26 | } 27 | }' | jq -r '"\(.id):\(.api_key)"' > .filebeat_api_key 28 | -------------------------------------------------------------------------------- /mongodb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Execution time: \d+ms' | sed -r 's/Execution time: ([0-9]+)/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /postgresql/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 
'Time: \d+\.\d+ ms' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /doris/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DATA_DIRECTORY="$3" 13 | NUM_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate arguments 18 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 19 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 20 | 21 | 22 | echo "Create database" 23 | mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" 24 | 25 | echo "Execute DDL" 26 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "ddl.sql" 27 | 28 | echo "Load data" 29 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 30 | 31 | echo "Sleep 120 sec to collect data size" 32 | sleep 120s 33 | -------------------------------------------------------------------------------- /starrocks/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/run_queries.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 1 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | 14 | TRIES=3 15 | 16 | cat queries.sql | while read -r query; do 17 | 18 | # Clear the Linux file system cache 19 | echo "Clearing file system cache..." 20 | sync 21 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 22 | echo "File system cache cleared." 23 | 24 | # Print the query 25 | echo "Running query: $query" 26 | 27 | # Execute the query multiple times 28 | for i in $(seq 1 $TRIES); do 29 | RESP=$({ /usr/bin/time -f '%e' \ 30 | mysql --skip-auto-rehash --batch --silent -h "$DB_HOST" -P "$DB_MYSQL_PORT" -u"$DB_USER" "$DB_NAME" \ 31 | -e "$query" >/dev/null; } 2>&1) 32 | echo "Response time: ${RESP} s" 33 | done; 34 | done; 35 | -------------------------------------------------------------------------------- /clickhouse/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DATA_DIRECTORY="$3" 13 | NUM_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate arguments 18 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 19 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 20 | 21 | 22 | echo "Creating database $DB_NAME" 23 | ./clickhouse client --query "CREATE DATABASE IF NOT EXISTS $DB_NAME" 24 | 25 | echo "Executing DDL for database $DB_NAME" 26 | ./clickhouse client --database="$DB_NAME" --enable_json_type=1 --multiquery < ddl.sql 27 | 28 | echo "Loading data for database $DB_NAME" 29 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 30 | -------------------------------------------------------------------------------- /elasticsearch/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping ElasticSearch" 4 | sudo systemctl stop elasticsearch.service 5 | 6 | # My amateurish attempt to delete data from Elasticsearch led me to 7 | # - https://stackoverflow.com/questions/22924300/removing-data-from-elasticsearch 8 | # - https://stackoverflow.com/questions/23917327/delete-all-documents-from-index-without-deleting-index 9 | # but none of that worked for me so I gave up after debugging this mess for 90 minutes. 10 | 11 | # Let's try it the old-fashioned way. 12 | 13 | # echo "Nuking ElasticSearch directories" 14 | # sudo rm -rf /var/lib/elasticsearch/* 15 | # sudo rm -rf /var/log/elasticsearch/* 16 | 17 | # ^^ Haha. Fails silently, please `sudo su` and run above `rm` statements by hand. But don't delete the elasticsearch/ folders themselves, 18 | # otherwise elasticsearch will refuse to start and you will need to re-install it via apt. What a shameful disaster. If someone knows how to 19 | # perform the extremely simple task of deleting data from Elasticsearch, please send a pull request. 20 | -------------------------------------------------------------------------------- /greptimedb/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT commit_collection AS event, count(1) AS cnt FROM bluesky GROUP BY event ORDER BY cnt DESC; 2 | SELECT commit_collection AS event, count(1) AS cnt, count(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' GROUP BY event ORDER BY cnt DESC; 3 | SELECT commit_collection AS event, date_part('hour', time_us) AS hour_of_day, count(1) AS cnt FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT did AS user_id, min(time_us) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT did AS user_id, date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /elasticsearch/queries.txt: -------------------------------------------------------------------------------- 1 | FROM ${INDEX_NAME} | STATS count = COUNT() BY commit.collection | SORT count DESC 2 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" | STATS users = COUNT_DISTINCT(did, 40000), count = COUNT() BY commit.collection | SORT count DESC 3 | FROM ${INDEX_NAME} | WHERE 
kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection IN (\\\"app.bsky.feed.post\\\", \\\"app.bsky.feed.repost\\\", \\\"app.bsky.feed.like\\\") | STATS count = COUNT() BY commit.collection, DATE_EXTRACT(\\\"hour_of_day\\\", time_us) | SORT count, commit.collection 4 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS first_post_ts = MIN(time_us) BY did | SORT first_post_ts ASC | LIMIT 3 5 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS activity_span = date_diff(\\\"millisecond\\\",min(time_us), max(time_us)) BY did | SORT activity_span DESC | LIMIT 3 -------------------------------------------------------------------------------- /generate-results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # This script will substitute the benchmark results into the HTML page. 4 | # Note: editing HTML with sed may look strange, but at least we avoid using node.js and npm, and that's good. 5 | 6 | # This is needed on Mac OS. Do `brew install coreutils`. 7 | if [[ "$(uname)" == "Darwin" ]]; then 8 | if ! command -v gsed >/dev/null 2>&1 9 | then 10 | echo "On macOS, please install GNU sed through homebrew." 11 | exit 1 12 | else 13 | shopt -s expand_aliases 14 | alias sed='gsed' 15 | fi 16 | fi 17 | 18 | ( 19 | sed '/^const data = \[$/q' index.html 20 | 21 | FIRST=1 22 | LANG="" ls -1 */results*/*.json | while read -r file 23 | do 24 | [ "${FIRST}" = "0" ] && echo -n ',' 25 | jq --compact-output ". += {\"source\": \"${file}\"}" "${file}" || echo "Error in $file" >&2 26 | FIRST=0 27 | done 28 | 29 | echo ']; // end of data' 30 | sed '0,/^\]; \/\/ end of data$/d' index.html 31 | 32 | ) > index.html.new 33 | 34 | mv index.html index.html.bak 35 | mv index.html.new index.html 36 | -------------------------------------------------------------------------------- /singlestore/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | 13 | export MYSQL_PWD=${ROOT_PASSWORD} 14 | 15 | TRIES=3 16 | 17 | cat queries.sql | while read -r query; do 18 | 19 | # Clear the Linux file system cache 20 | echo "Clearing file system cache..." 21 | sync 22 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 23 | echo "File system cache cleared." 24 | 25 | # Print the query 26 | echo "Running query: $query" 27 | 28 | # Execute the query multiple times 29 | for i in $(seq 1 $TRIES); do 30 | time mysql -h 127.0.0.1 -P 3306 -u root -D $DB_NAME -e "$query" 31 | done; 32 | done; 33 | 34 | # The runtime measured by `time` is manually copied into the result .json file. 35 | # I couldn't find a way to figure out the per-query memory consumption, these are marked as "null" in the result .json files. Feel free to 36 | # re-produce and add memory consumption measurements! 37 | -------------------------------------------------------------------------------- /starrocks/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/create_and_load.sh. 
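# Illustrative usage sketch (argument values are assumptions, not benchmark defaults):
#   ./create_and_load.sh bluesky bluesky ~/data/bluesky 10 success.log error.log
# DB_HOST, DB_MYSQL_PORT and DB_USER are read from the environment further below and are
# assumed to be exported by the calling script.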
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 6 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | DATA_DIRECTORY="$3" 15 | NUM_FILES="$4" 16 | SUCCESS_LOG="$5" 17 | ERROR_LOG="$6" 18 | DDL_FILE="ddl.sql" 19 | 20 | # Validate arguments 21 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 22 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 23 | 24 | 25 | echo "Create database" 26 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" 27 | 28 | echo "Execute DDL" 29 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" < "$DDL_FILE" 30 | 31 | echo "Load data" 32 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 33 | -------------------------------------------------------------------------------- /postgresql/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | echo "Create database" 24 | sudo -u postgres psql -t -c "CREATE DATABASE $DB_NAME" 25 | 26 | echo "Execute DDL" 27 | sudo -u postgres psql "$DB_NAME" -t < "$DDL_FILE" 28 | 29 | echo "Load data" 30 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 31 | 32 | echo "Vacuum analyze the table" 33 | sudo -u postgres psql "$DB_NAME" -t -c "VACUUM ANALYZE $TABLE_NAME" 34 | -------------------------------------------------------------------------------- /doris/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in starrocks/benchmark.sh. 
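# Illustrative usage sketch (file names are assumptions):
#   ./benchmark.sh bluesky result_runtimes.txt queries.sql
# run_queries.sh executes each query three times, and the grep/sed/awk pipeline below folds
# every group of three "Response time" values into one bracketed triple per query, e.g.
#   [0.52,0.41,0.40],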
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 3 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | RESULT_FILE_RUNTIMES="$2" 14 | QUERIES_FILE="$3" 15 | 16 | # Construct the query log file name using $DB_NAME 17 | QUERY_LOG_FILE="query_log.txt" 18 | 19 | # Print the database name 20 | echo "Running queries on database: $DB_NAME" 21 | 22 | # Run queries and log the output 23 | ./run_queries.sh "$DB_NAME" "$QUERIES_FILE" 2>&1 | tee query_log.txt 24 | 25 | # Process the query log and prepare the result 26 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 27 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 28 | 29 | # Output the result 30 | if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then 31 | echo "$RESULT" > "$RESULT_FILE_RUNTIMES" 32 | echo "Result written to $RESULT_FILE_RUNTIMES" 33 | else 34 | echo "$RESULT" 35 | fi 36 | -------------------------------------------------------------------------------- /starrocks/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/benchmark.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 3 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | RESULT_FILE_RUNTIMES="$2" 14 | RESULT_FILE_MEMORY_USAGE="$3" 15 | 16 | # Construct the query log file name using $DB_NAME 17 | QUERY_LOG_FILE="query_log.txt" 18 | 19 | # Print the database name 20 | echo "Running queries on database: $DB_NAME" 21 | 22 | # Run queries and log the output 23 | ./run_queries.sh "$DB_NAME" 2>&1 | tee query_log.txt 24 | 25 | # Process the query log and prepare the result 26 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 27 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 28 | 29 | # Output the result 30 | if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then 31 | echo "$RESULT" > "$RESULT_FILE_RUNTIMES" 32 | echo "Result written to $RESULT_FILE_RUNTIMES" 33 | else 34 | echo "$RESULT" 35 | fi 36 | -------------------------------------------------------------------------------- /doris/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/run_queries.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | QUERIES_FILE="$2" 14 | 15 | TRIES=3 16 | 17 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "set global parallel_pipeline_task_num=32;" 18 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "set global enable_parallel_scan=false;" 19 | 20 | cat $QUERIES_FILE | while read -r query; do 21 | 22 | # Clear the Linux file system cache 23 | echo "Clearing file system cache..." 24 | sync 25 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 26 | echo "File system cache cleared." 
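# The page cache is dropped once per query, before the retry loop, so the first of the
# $TRIES runs measures a cold file system cache and the remaining runs benefit from warm caches.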
27 | 28 | # Print the query 29 | echo "Running query: $query" 30 | 31 | # Execute the query multiple times 32 | for i in $(seq 1 $TRIES); do 33 | RESP=$(mysql -vvv -h127.1 -P9030 -uroot "$DB_NAME" -e "$query" | perl -nle 'if (/\((?:(\d+) min )?(\d+\.\d+) sec\)/) { $t = ($1 || 0) * 60 + $2; printf "%.2f\n", $t }' ||:) 34 | echo "Response time: ${RESP} s" 35 | done; 36 | done; 37 | -------------------------------------------------------------------------------- /mongodb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | # File containing MongoDB queries (replace 'queries.js' with your file) 15 | QUERY_FILE="queries.js" 16 | 17 | # Check if the query file exists 18 | if [[ ! -f "$QUERY_FILE" ]]; then 19 | echo "Error: Query file '$QUERY_FILE' does not exist." 20 | exit 1 21 | fi 22 | 23 | # Read and execute each query 24 | cat "$QUERY_FILE" | while read -r query; do 25 | 26 | # Print the query 27 | echo "------------------------------------------------------------------------------------------------------------------------" 28 | echo "Result for query Q$QUERY_NUM:" 29 | echo 30 | 31 | # Escape the query for safe passing to mongosh 32 | ESCAPED_QUERY=$(echo "$query" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 33 | 34 | mongosh --eval " 35 | const db = db.getSiblingDB('$DB_NAME'); 36 | const result = eval(\"$ESCAPED_QUERY\"); 37 | printjson(result); 38 | " 39 | 40 | 41 | # Increment the query number 42 | QUERY_NUM=$((QUERY_NUM + 1)) 43 | 44 | done -------------------------------------------------------------------------------- /singlestore/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 8 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | DDL_FILE="$4" 14 | DATA_DIRECTORY="$5" 15 | NUM_FILES="$6" 16 | SUCCESS_LOG="$7" 17 | ERROR_LOG="$8" 18 | 19 | # Validate arguments 20 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 21 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 22 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 23 | 24 | export MYSQL_PWD=${ROOT_PASSWORD} 25 | 26 | echo "Creating database $DB_NAME" 27 | mysql -h 127.0.0.1 -P 3306 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" 28 | 29 | echo "Executing DDL for database $DB_NAME" 30 | mysql -h 127.0.0.1 -P 3306 -u root $DB_NAME < "$DDL_FILE" 31 | 32 | echo "Loading data for database $DB_NAME" 33 | ./load_data.sh "$ROOT_PASSWORD" "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 34 | -------------------------------------------------------------------------------- /clickhouse/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data.commit.collection AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data.commit.collection AS event, count() AS count, uniqExact(data.did) AS users FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data.commit.collection AS event, toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, count() AS count FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data.did::String as user_id, min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data.did::String as user_id, date_diff( 'milliseconds', min(fromUnixTimestamp64Micro(data.time_us)), max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /_files_json/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Get the sorted list of .json.gz files and extract the first N 33 | count=0 34 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 35 | if [[ $count -ge $N ]]; then 36 | break 37 | fi 38 | 39 | echo "Processing $file..." 40 | gzip -dkc "$file" > "$TARGET_DIRECTORY/$(basename "${file%.gz}")" # Extract to target directory 41 | count=$((count + 1)) 42 | done 43 | 44 | echo "Extraction of $count files completed. Extracted files are in '$TARGET_DIRECTORY'." 
-------------------------------------------------------------------------------- /clickhouse/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE_RUNTIMES="$2" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RUNTIME_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 1' | awk '{ 24 | if (NR % 3 == 1) { printf "["; } 25 | printf $1; 26 | if (NR % 3 == 0) { 27 | print "],"; 28 | } else { 29 | printf ", "; 30 | } 31 | }') 32 | 33 | MEMORY_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 0' | awk '{ 34 | if (NR % 3 == 1) { printf "["; } 35 | printf $1; 36 | if (NR % 3 == 0) { 37 | print "],"; 38 | } else { 39 | printf ", "; 40 | } 41 | }') 42 | 43 | # Output the runtime results 44 | echo "$RUNTIME_RESULTS" > "$RESULT_FILE_RUNTIMES" 45 | echo "Runtime results written to $RESULT_FILE_RUNTIMES" 46 | -------------------------------------------------------------------------------- /singlestore/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data::commit::collection AS event, count(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data::commit::collection AS event, count(*) AS count, count(distinct data::did) FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data::commit::collection AS event, hour(from_unixtime(data::time_us/1000000)) AS hour_of_day, count(*) AS count FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' AND data::commit::$collection IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data::$did AS user_id, min(from_unixtime(data::time_us/1000000)) AS first_post_ts FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' AND data::commit::$collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data::$did AS user_id, timestampdiff(microsecond, min(from_unixtime(data::time_us/1000000)), max(from_unixtime(data::time_us/1000000))) as activity_span FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' AND data::commit::$collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3 6 | -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Select the dataset size to download:" 4 | echo "1) 1m (default)" 5 | echo "2) 10m" 6 | echo "3) 100m" 7 | echo "4) 1000m" 8 | read -p "Enter the number corresponding to your choice: " choice 9 | 10 | case $choice in 11 | 2) 12 | # Download 10m dataset: files 0001 to 0010 13 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky --input-file <(seq --format 
"https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_%04g.json.gz" 1 10) 14 | ;; 15 | 3) 16 | # Download 100m dataset: files 0001 to 0100 17 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky --input-file <(seq --format "https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_%04g.json.gz" 1 100) 18 | ;; 19 | 4) 20 | # Download 1000m dataset: files 0001 to 1000 21 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky --input-file <(seq --format "https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_%04g.json.gz" 1 1000) 22 | ;; 23 | *) 24 | # Download 1m dataset: single file 25 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky "https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_0001.json.gz" 26 | ;; 27 | esac 28 | -------------------------------------------------------------------------------- /victorialogs/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 4 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | MAX_FILES="$2" 12 | SUCCESS_LOG="$3" 13 | ERROR_LOG="$4" 14 | 15 | # Validate arguments 16 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 17 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 18 | 19 | # Load data 20 | PARALLEL_WORKERS=8 21 | counter=0 22 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 23 | echo "Processing file: $file" 24 | 25 | zcat $file | curl -s --fail -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=time_us&_stream_fields=kind,commit.collection,commit.operation' \ 26 | && echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG" \ 27 | || echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed importing $file." >> "$ERROR_LOG" & 28 | 29 | [[ $(jobs -p -r | wc -l) -ge $PARALLEL_WORKERS ]] && wait -n 30 | 31 | counter=$((counter + 1)) 32 | if [[ $counter -ge $MAX_FILES ]]; then 33 | break 34 | fi 35 | done 36 | 37 | wait 38 | 39 | echo "Loaded $MAX_FILES data files from $DATA_DIRECTORY to victorialogs." 
40 | -------------------------------------------------------------------------------- /duckdb/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT j->>'$.commit.collection' AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT j->>'$.commit.collection' AS event,count() AS count,count(DISTINCT j->>'$.did') AS users FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') GROUP BY event ORDER BY count DESC; 3 | SELECT j->>'$.commit.collection' AS event,hour(TO_TIMESTAMP(CAST(j->>'$.time_us' AS BIGINT) / 1000000)) as hour_of_day,count() AS count FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT j->>'$.did'::String as user_id,TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000) AS first_post_date FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; 5 | SELECT j->>'$.did'::String as user_id,date_diff('milliseconds', TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000),TO_TIMESTAMP(CAST(MAX(j->>'$.time_us') AS BIGINT) / 1000000)) AS activity_span FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /elasticsearch/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | QUERY_NUM=1 19 | 20 | # File containing Elasticsearch ES|SQL queries 21 | QUERY_FILE="queries.txt" 22 | 23 | # Check if the query file exists 24 | if [[ ! -f "$QUERY_FILE" ]]; then 25 | echo "Error: Query file '$QUERY_FILE' does not exist." 
26 | exit 1 27 | fi 28 | 29 | cat 'queries.txt' | while read -r QUERY; do 30 | eval "QUERY=\"${QUERY}\"" 31 | # Print the query 32 | echo "------------------------------------------------------------------------------------------------------------------------" 33 | echo "Result for query Q$QUERY_NUM: " 34 | echo 35 | CURL_DATA="{\"query\": \"$QUERY\"}" 36 | curl -s -k -X POST "https://localhost:9200/_query?format=txt" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA" 37 | echo 38 | # Increment the query number 39 | QUERY_NUM=$((QUERY_NUM + 1)) 40 | done 41 | -------------------------------------------------------------------------------- /postgresql/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count, COUNT(DISTINCT data ->> 'did') AS users FROM bluesky WHERE data ->> 'kind' = 'commit' AND data -> 'commit' ->> 'operation' = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data->'commit'->>'collection' AS event, EXTRACT(HOUR FROM TO_TIMESTAMP((data->>'time_us')::BIGINT / 1000000)) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data->>'did' AS user_id, MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) AS first_post_ts FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data->>'did' AS user_id, EXTRACT(EPOCH FROM ( MAX( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) - MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) )) * 1000 AS activity_span FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /doris/queries_default.sql: -------------------------------------------------------------------------------- 1 | SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count, COUNT(DISTINCT cast(data['did'] AS TEXT )) AS users FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT cast(data['commit']['collection'] AS TEXT ) AS event, HOUR(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT cast(data['did'] AS TEXT ) AS user_id, MIN(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS 
first_post_ts FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT cast(data['did'] AS TEXT ) AS user_id, MILLISECONDS_DIFF(MAX(from_microsecond(CAST(data['time_us'] AS BIGINT))),MIN(from_microsecond(CAST(data['time_us'] AS BIGINT)))) AS activity_span FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /starrocks/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC; 3 | SELECT get_json_string(data, 'commit.collection') AS event, hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT get_json_string(data, 'did') as user_id, to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; 5 | SELECT get_json_string(data, 'did') as user_id, date_diff('millisecond', to_datetime(min(get_json_int(data, 'time_us')), 6), to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /greptimedb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 4 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | MAX_FILES="$2" 12 | SUCCESS_LOG="$3" 13 | ERROR_LOG="$4" 14 | 15 | # Validate arguments 16 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 17 | [[ ! 
"$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 18 | 19 | pushd $DATA_DIRECTORY 20 | counter=0 21 | for file in $(ls *.json.gz | head -n $MAX_FILES); do 22 | echo "Processing file: $file" 23 | 24 | curl "http://localhost:4000/v1/events/logs?table=bluesky&pipeline_name=jsonbench&ignore_errors=true" \ 25 | -H "Content-Type: application/x-ndjson" \ 26 | -H "Content-Encoding: gzip" \ 27 | --data-binary @$file 28 | echo "" 29 | 30 | first_attempt=$? 31 | if [[ $first_attempt -eq 0 ]]; then 32 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG" 33 | else 34 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed for $file. Giving up." >> "$ERROR_LOG" 35 | fi 36 | 37 | counter=$((counter + 1)) 38 | if [[ $counter -ge $MAX_FILES ]]; then 39 | break 40 | fi 41 | done 42 | 43 | curl -XPOST -H 'Content-Type: application/x-www-form-urlencoded' \ 44 | http://localhost:4000/v1/sql \ 45 | -d "sql=admin flush_table('bluesky')" \ 46 | -d "format=json" 47 | 48 | echo -e "\nLoaded $MAX_FILES data files from $DATA_DIRECTORY to GreptimeDB." 49 | -------------------------------------------------------------------------------- /mongodb/queries.js: -------------------------------------------------------------------------------- 1 | db.bluesky.aggregate([ { $group: { _id: "$commit.collection", count: { $sum: 1 } } }, { $sort: { count: -1 } } ]); 2 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create" } }, { $group: { _id: "$commit.collection", count: { $sum: 1 }, users: { $addToSet: "$did" } } }, { $project: { event: "$_id", count: 1, users: { $size: "$users" } } }, { $sort: { count: -1 } } ]); 3 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": { $in: ["app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like"] } } }, { $project: { _id: 0, event: "$commit.collection", hour_of_day: { $hour: { $toDate: { $divide: ["$time_us", 1000] } } } } }, { $group: { _id: { event: "$event", hour_of_day: "$hour_of_day" }, count: { $sum: 1 } } }, { $sort: { "_id.hour_of_day": 1, "_id.event": 1 } } ]); 4 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", first_post_ts: { $min: "$timestamp" } } }, { $sort: { first_post_ts: 1 } }, { $limit: 3 } ]); 5 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", min_timestamp: { $min: "$timestamp" }, max_timestamp: { $max: "$timestamp" } } }, { $project: { activity_span: { $dateDiff: { startDate: "$min_timestamp", endDate: "$max_timestamp", unit: "millisecond" } } } }, { $sort: { activity_span: -1 } }, { $limit: 3 } ]); 6 | -------------------------------------------------------------------------------- /elasticsearch/queries_formatted.txt: -------------------------------------------------------------------------------- 1 | -- Q1 - Top event types 2 | 3 | POST /_query?format=txt 4 | { 5 | "query": """FROM $INDEX_NAME 6 | | STATS count = COUNT() BY commit.collection 7 | | SORT count DESC""" 8 | } 9 | 10 | -- Q2 - Top event types together with unique users per event type 11 | -- Note, Elasticsearch does not support exact 
count. COUNT_DISTINCT returns only an estimate. 12 | 13 | POST /_query?format=txt 14 | { 15 | "query": """FROM $INDEX_NAME 16 | | WHERE kind == "commit" AND commit.operation == "create" 17 | | STATS users = COUNT_DISTINCT(did), count = COUNT() BY commit.collection 18 | | SORT count DESC""" 19 | } 20 | 21 | -- Q3 - When do people use BlueSky 22 | 23 | POST /_query?format=txt 24 | { 25 | "query": """FROM $INDEX_NAME 26 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection IN ("app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like") 27 | | STATS count = COUNT() BY commit.collection, DATE_EXTRACT("hour_of_day", time_us) 28 | | SORT count, commit.collection""" 29 | } 30 | 31 | -- Q4 - top 3 post veterans 32 | 33 | POST /_query?format=txt 34 | { 35 | "query": """FROM $INDEX_NAME 36 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 37 | | STATS first_post_ts = MIN(time_us) BY did 38 | | SORT first_post_ts ASC 39 | | LIMIT 3""" 40 | } 41 | 42 | -- Q5 - top 3 users with longest activity 43 | 44 | POST /_query?format=txt 45 | { 46 | "query": """FROM $INDEX_NAME 47 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 48 | | STATS activity_span = date_diff("millisecond",min(time_us), max(time_us)) BY did 49 | | SORT activity_span DESC 50 | | LIMIT 3""" 51 | } 52 | -------------------------------------------------------------------------------- /singlestore/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | # Arguments 11 | ROOT_PASSWORD="$1" 12 | DATA_DIRECTORY="$2" 13 | DB_NAME="$3" 14 | TABLE_NAME="$4" 15 | MAX_FILES="$5" 16 | SUCCESS_LOG="$6" 17 | ERROR_LOG="$7" 18 | 19 | # Validate arguments 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 22 | 23 | export MYSQL_PWD=${ROOT_PASSWORD} 24 | 25 | # Load data 26 | counter=0 27 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 28 | echo "Processing file: $file" 29 | 30 | # Note: If one or more JSON documents in the currently processed file cannot be parsed (because of extremely deep nesting, line breaks 31 | # in unexpected places, etc.), then SingleStore will skip the _entire_ file. This unfortunately reduces the "data quality" metric 32 | # (= the number of successfully inserted JSON documents) quite a bit. SingleStore's LOAD statement comes with a SKIP PARSER ERRORS 33 | # clause that would theoretically allow to skip individual documents, but it is not supported for JSON 34 | # (https://www.singlestore.com/forum/t/pipeline-skip-parser-errors-with-json/2794). 
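# For illustration, with the first file of the downloaded dataset (file name assumed from
# download_data.sh), the statement below expands to roughly:
#   LOAD DATA LOCAL INFILE "$DATA_DIRECTORY/file_0001.json.gz" INTO TABLE bluesky(data <- %) FORMAT JSON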
35 | mysql --local-infile=1 -h 127.0.0.1 -P 3306 -u root -D $DB_NAME -e "LOAD DATA LOCAL INFILE \"$file\" INTO TABLE bluesky(data <- %) FORMAT JSON" 36 | 37 | counter=$((counter + 1)) 38 | if [[ $counter -ge $MAX_FILES ]]; then 39 | break 40 | fi 41 | done 42 | -------------------------------------------------------------------------------- /elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ---------------------------------- Cluster ----------------------------------- 4 | # 5 | # Use a descriptive name for your cluster: 6 | # 7 | cluster.name: es-bench 8 | # ------------------------------------ Node ------------------------------------ 9 | # 10 | # Use a descriptive name for the node: 11 | # 12 | node.name: node-1 13 | # ----------------------------------- Paths ------------------------------------ 14 | # 15 | # Path to directory where to store the data (separate multiple locations by comma): 16 | # 17 | path.data: /var/lib/elasticsearch 18 | # 19 | # Path to log files: 20 | # 21 | path.logs: /var/log/elasticsearch 22 | # 23 | # ----------------------------------- Memory ----------------------------------- 24 | # 25 | # Lock the memory on startup: 26 | # 27 | bootstrap.memory_lock: true 28 | # -------------------------------------------------------------------------------- 29 | 30 | # Enable security features 31 | xpack.security.enabled: true 32 | xpack.security.enrollment.enabled: true 33 | 34 | # Enable encryption for HTTP API client connections, such as Kibana, Logstash, and Agents 35 | xpack.security.http.ssl: 36 | enabled: true 37 | keystore.path: certs/http.p12 38 | 39 | # Enable encryption and mutual authentication between cluster nodes 40 | xpack.security.transport.ssl: 41 | enabled: true 42 | verification_mode: certificate 43 | keystore.path: certs/transport.p12 44 | truststore.path: certs/transport.p12 45 | # Create a new cluster with the current node only 46 | # Additional nodes can still join the cluster later 47 | cluster.initial_master_nodes: ["node-1"] 48 | 49 | # Allow HTTP API connections from anywhere 50 | # Connections are encrypted and require user authentication 51 | http.host: 0.0.0.0 52 | 53 | #----------------------- END SECURITY AUTO CONFIGURATION ------------------------- -------------------------------------------------------------------------------- /duckdb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate that MAX_FILES is a number 18 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 19 | echo "Error: must be a positive integer." 
20 | exit 1 21 | fi 22 | 23 | # Ensure the log files exist 24 | touch "$SUCCESS_LOG" "$ERROR_LOG" 25 | 26 | counter=0 27 | 28 | # Loop through each .json.gz file in the directory 29 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 30 | # if [[ -f "$file" ]]; then 31 | # duckdb ~/$DB_NAME -c "insert into $TABLE_NAME select * from read_ndjson_objects('$file', ignore_errors=false, maximum_object_size=1048576000);" 32 | # fi 33 | if [[ -f "$file" ]]; then 34 | # Create a temporary directory for split files 35 | temp_dir=$(mktemp -d $DIRECTORY/temp.XXXXXX) 36 | 37 | # Decompress and split the file into smaller chunks of 100000 lines each 38 | gzip -dc "$file" | split -l 100000 - "$temp_dir/chunk_" 39 | 40 | # Insert each chunk into DuckDB 41 | for chunk in "$temp_dir"/chunk_*; do 42 | duckdb ~/$DB_NAME -c "insert into $TABLE_NAME select * from read_ndjson_objects('$chunk', ignore_errors=false, maximum_object_size=1048576000);" 43 | done 44 | 45 | # Clean up temporary directory 46 | rm -r "$temp_dir" 47 | counter=$((counter + 1)) 48 | fi 49 | 50 | # Stop processing if the max number of files is reached 51 | if [[ $counter -ge $MAX_FILES ]]; then 52 | echo "Copied maximum number of files: $MAX_FILES" 53 | break 54 | fi 55 | done 56 | -------------------------------------------------------------------------------- /elasticsearch/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Number of tries for each query 19 | TRIES=3 20 | 21 | # File containing Elasticsearch ES|SQL queries 22 | QUERY_FILE="queries.txt" 23 | LOG_FILE="query_log_$INDEX_NAME.log" 24 | > "$LOG_FILE" 25 | 26 | # Check if the query file exists 27 | if [[ ! -f "$QUERY_FILE" ]]; then 28 | echo "Error: Query file '$QUERY_FILE' does not exist." 29 | exit 1 30 | fi 31 | 32 | cat 'queries.txt' | while read -r QUERY; do 33 | # Clear filesystem cache between queries. 34 | sync 35 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 36 | # Clear query cache between queries. 
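# Timing note: the runtime reported for each of the $TRIES runs is the server-side 'took'
# value (in milliseconds) from the _query response, converted to seconds further below.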
37 | curl -k -X POST 'https://localhost:9200/hits/_cache/clear?pretty' -u "elastic:${ELASTIC_PASSWORD}" &>/dev/null 38 | eval "QUERY=\"${QUERY}\"" 39 | echo "Running query: $QUERY" 40 | for i in $(seq 1 $TRIES); do 41 | CURL_DATA="{\"query\": \"$QUERY\"}" 42 | RESPONSE=$(curl -s -k -X POST "https://localhost:9200/_query" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA") 43 | TOOK_MS=$(echo "$RESPONSE" | jq -r '.took' 2>/dev/null) 44 | 45 | # Convert 'took' to seconds (from ms to s) 46 | TOOK_S=$(bc <<< "scale=3; $TOOK_MS / 1000") 47 | TOOK_FORMATTED=$(printf "%.3f" "$TOOK_S") 48 | echo "$RESPONSE" >> "$LOG_FILE" 49 | echo "Response time: ${TOOK_FORMATTED} s" 50 | done 51 | done 52 | -------------------------------------------------------------------------------- /mongodb/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change it in ferretdb/index_usage.sh as well 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 1 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | 14 | QUERY_NUM=1 15 | 16 | # File containing MongoDB queries (replace 'queries.js' with your file) 17 | QUERY_FILE="queries.js" 18 | 19 | # Check if the query file exists 20 | if [[ ! -f "$QUERY_FILE" ]]; then 21 | echo "Error: Query file '$QUERY_FILE' does not exist." 22 | exit 1 23 | fi 24 | 25 | # Set the internalQueryPlannerGenerateCoveredWholeIndexScans parameter to true 26 | echo "Setting internalQueryPlannerGenerateCoveredWholeIndexScans to true..." 27 | mongosh --quiet --eval " 28 | const result = db.adminCommand({ setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true }); 29 | if (result.ok !== 1) { 30 | print('Failed to set internalQueryPlannerGenerateCoveredWholeIndexScans: ' + JSON.stringify(result)); 31 | quit(1); 32 | } else { 33 | print('Successfully set internalQueryPlannerGenerateCoveredWholeIndexScans to true'); 34 | } 35 | " 36 | 37 | cat "$QUERY_FILE" | while read -r query; do 38 | 39 | # Print the query number 40 | echo "------------------------------------------------------------------------------------------------------------------------" 41 | echo "Index usage for query Q$QUERY_NUM:" 42 | echo 43 | 44 | # Modify the query to include the explain option inside the aggregate call 45 | MODIFIED_QUERY=$(echo "$query" | sed 's/]);$/], { explain: "queryPlanner" });/') 46 | 47 | # Escape the modified query for safe passing to mongosh 48 | ESCAPED_QUERY=$(echo "$MODIFIED_QUERY" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 49 | 50 | mongosh --quiet --eval " 51 | const db = db.getSiblingDB('$DB_NAME'); 52 | const result = eval(\"$ESCAPED_QUERY\"); 53 | printjson(result.stages[0].\$cursor.queryPlanner.winningPlan); 54 | " 55 | 56 | # Increment the query number 57 | QUERY_NUM=$((QUERY_NUM + 1)) 58 | done; -------------------------------------------------------------------------------- /_files_lz4/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_lz4 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | 
OUTPUT_PREFIX="${3:-_files_lz4}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_json/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_json 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_json}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_zstd/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_zstd 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_zstd}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /elasticsearch/config/index_template_source.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": true 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | "properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /greptimedb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 
16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 43 | exit 1 44 | fi 45 | 46 | ./start.sh 47 | ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 48 | ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 49 | ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 50 | ./index_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 51 | ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 52 | ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 53 | ./drop_tables.sh 54 | } 55 | 56 | case $CHOICE in 57 | 2) 58 | benchmark 10 59 | ;; 60 | 3) 61 | benchmark 100 62 | ;; 63 | 4) 64 | benchmark 1000 65 | ;; 66 | 5) 67 | benchmark 1 68 | benchmark 10 69 | benchmark 100 70 | benchmark 1000 71 | ;; 72 | *) 73 | benchmark 1 74 | ;; 75 | esac 76 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_no_source.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": false 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | "properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /singlestore/results/_query_results/_m6i.8xlarge_bluesky_1m.query_results: -------------------------------------------------------------------------------- 1 | 
------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | event count 5 | "app.bsky.feed.like" 448944 6 | "app.bsky.graph.follow" 360374 7 | "app.bsky.feed.post" 90816 8 | "app.bsky.feed.repost" 58540 9 | "app.bsky.graph.block" 14040 10 | "app.bsky.actor.profile" 11762 11 | "app.bsky.graph.listitem" 8103 12 | NULL 5328 13 | "app.bsky.graph.listblock" 895 14 | "app.bsky.graph.starterpack" 405 15 | "app.bsky.graph.list" 356 16 | "app.bsky.feed.threadgate" 255 17 | "app.bsky.feed.postgate" 104 18 | "app.bsky.feed.generator" 74 19 | "app.bsky.labeler.service" 4 20 | ------------------------------------------------------------------------------------------------------------------------ 21 | Result for query Q2: 22 | 23 | event count count(distinct data::did) 24 | "app.bsky.feed.like" 444523 117617 25 | "app.bsky.graph.follow" 337978 63957 26 | "app.bsky.feed.post" 86812 50464 27 | "app.bsky.feed.repost" 56993 26581 28 | "app.bsky.graph.block" 13838 5785 29 | "app.bsky.graph.listitem" 7568 1078 30 | "app.bsky.actor.profile" 5337 5337 31 | "app.bsky.graph.listblock" 860 449 32 | "app.bsky.graph.list" 259 218 33 | "app.bsky.feed.threadgate" 228 196 34 | "app.bsky.graph.starterpack" 104 101 35 | "app.bsky.feed.postgate" 101 82 36 | "app.bsky.feed.generator" 10 9 37 | ------------------------------------------------------------------------------------------------------------------------ 38 | Result for query Q3: 39 | 40 | event hour_of_day count 41 | "app.bsky.feed.like" 16 444523 42 | "app.bsky.feed.post" 16 86812 43 | "app.bsky.feed.repost" 16 56993 44 | ------------------------------------------------------------------------------------------------------------------------ 45 | Result for query Q4: 46 | 47 | user_id first_post_ts 48 | did:plc:yj3sjq3blzpynh27cumnp5ks 2024-11-21 16:25:49.000167 49 | did:plc:l5o3qjrmfztir54cpwlv2eme 2024-11-21 16:25:49.001905 50 | did:plc:s4bwqchfzm6gjqfeb6mexgbu 2024-11-21 16:25:49.003907 51 | ------------------------------------------------------------------------------------------------------------------------ 52 | Result for query Q5: 53 | 54 | user_id activity_span 55 | did:plc:tsyymlun4eqjuw7hqrhmwagd 813006959 56 | did:plc:3ug235sfy2pz7cawmpsftb65 811602261 57 | did:plc:doxhhgtxqiv47tmcovpbcqai 811404021 58 | -------------------------------------------------------------------------------- /doris/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in starrocks/main.sh. 4 | 5 | export DORIS_FULL_NAME="apache-doris-3.1.0-bin-x64" 6 | 7 | DEFAULT_CHOICE=ask 8 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 9 | 10 | # Allow the user to optionally provide the scale factor ("choice") as an argument 11 | CHOICE="${1:-$DEFAULT_CHOICE}" 12 | 13 | # Allow the user to optionally provide the data directory as an argument 14 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 15 | 16 | # Define success and error log files 17 | SUCCESS_LOG="${3:-success.log}" 18 | ERROR_LOG="${4:-error.log}" 19 | 20 | # Define prefix for output files 21 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 22 | 23 | # Check if the directory exists 24 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 25 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
26 | exit 1 27 | fi 28 | 29 | if [ "$CHOICE" = "ask" ]; then 30 | echo "Select the dataset size to benchmark:" 31 | echo "1) 1m (default)" 32 | echo "2) 10m" 33 | echo "3) 100m" 34 | echo "4) 1000m" 35 | echo "5) all" 36 | read -p "Enter the number corresponding to your choice: " CHOICE 37 | fi; 38 | 39 | ./install.sh 40 | ./start.sh 41 | 42 | benchmark() { 43 | local size=$1 44 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 45 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 46 | if (( file_count < size )); then 47 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 48 | exit 1 49 | fi 50 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 51 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 52 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 53 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" "queries.sql" 54 | ./drop_table.sh "bluesky_${size}m" bluesky 55 | } 56 | 57 | case $CHOICE in 58 | 2) 59 | benchmark 10 60 | ;; 61 | 3) 62 | benchmark 100 63 | ;; 64 | 4) 65 | benchmark 1000 66 | ;; 67 | 5) 68 | benchmark 1 69 | benchmark 10 70 | benchmark 100 71 | benchmark 1000 72 | ;; 73 | *) 74 | benchmark 1 75 | ;; 76 | esac 77 | 78 | ./stop.sh 79 | ./uninstall.sh 80 | -------------------------------------------------------------------------------- /victorialogs/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
43 | exit 1 44 | fi 45 | ./start.sh 46 | ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 47 | sleep 1 # sleep for a while for settling down the data 48 | ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 49 | ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 50 | ./index_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 51 | ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 52 | #./query_results.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 53 | ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 54 | ./drop_tables.sh # also stops VictoriaLogs 55 | } 56 | 57 | case $CHOICE in 58 | 2) 59 | benchmark 10 60 | ;; 61 | 3) 62 | benchmark 100 63 | ;; 64 | 4) 65 | benchmark 1000 66 | ;; 67 | 5) 68 | benchmark 1 69 | benchmark 10 70 | benchmark 100 71 | benchmark 1000 72 | ;; 73 | *) 74 | benchmark 1 75 | ;; 76 | esac 77 | -------------------------------------------------------------------------------- /victorialogs/queries_formatted.logsql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | * 5 | | stats by (commit.collection) 6 | count() as count 7 | | sort by (count desc) 8 | 9 | ------------------------------------------------------------------------------------------------------------------------ 10 | -- Q2 - Top event types together with unique users per event type 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | {kind="commit", commit.operation="create"} 13 | | stats by (commit.collection) 14 | count() as count, 15 | count_uniq(did) as users 16 | | sort by (count desc) 17 | 18 | ------------------------------------------------------------------------------------------------------------------------ 19 | -- Q3 - When do people use BlueSky 20 | ------------------------------------------------------------------------------------------------------------------------ 21 | {kind="commit", commit.operation="create", commit.collection=~"app\\.bsky\\.feed\\.(post|repost|like)"} 22 | | math floor(_time/1h)%24 as hour_of_day 23 | | stats by (commit.collection, hour_of_day) 24 | count() as count 25 | | sort by (hour_of_day, commit.collection) 26 | 27 | ------------------------------------------------------------------------------------------------------------------------ 28 | -- Q4 - top 3 post veterans 29 | ------------------------------------------------------------------------------------------------------------------------ 30 | {kind="commit", commit.operation="create", commit.collection="app.bsky.feed.post"} 31 | | stats by (did) 32 | min(_time) as first_post_ts 33 | | first 3 (first_post_ts) 34 | 35 | ------------------------------------------------------------------------------------------------------------------------ 36 | -- Q5 - top 3 users with longest activity 37 | ------------------------------------------------------------------------------------------------------------------------ 38 | {kind="commit", commit.operation="create", commit.collection="app.bsky.feed.post"} 39 | | stats by (did) 40 | min(_time) tmin, 41 | max(_time) tmax 42 | | math round((tmax-tmin)/1e6) as activity_span 43 | | keep 
did, activity_span 44 | | first 3 (activity_span desc) 45 | -------------------------------------------------------------------------------- /elasticsearch/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the ELASTIC_PASSWORD env variable is set; if not, read it from the .elastic_password file 4 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 5 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 6 | export $(cat .elastic_password) 7 | fi 8 | 9 | # Check if the required arguments are provided 10 | if [[ $# -lt 6 ]]; then 11 | echo "Usage: $0 <INDEX_NAME> <INDEX_TEMPLATE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>" 12 | exit 1 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | INDEX_TEMPLATE_FILE="config/$2.json" 18 | DATA_DIRECTORY="$3" 19 | NUM_FILES="$4" 20 | SUCCESS_LOG="$5" 21 | ERROR_LOG="$6" 22 | 23 | # Validate arguments 24 | [[ ! -f "$INDEX_TEMPLATE_FILE" ]] && { echo "Error: Index template file '$INDEX_TEMPLATE_FILE' does not exist."; exit 1; } 25 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 26 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 27 | 28 | echo "Checking if ILM policy is installed, install if not" 29 | # If curl returns 404, the ILM policy is not installed 30 | http_code=$(curl -s -o /dev/null -k -w "%{http_code}" -X GET "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json') 31 | if [[ "$http_code" -eq 404 ]] ; then 32 | echo "Installing ILM policy" 33 | ILM_POLICY=$(cat "config/ilm.json") 34 | curl -s -k -X PUT "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$ILM_POLICY" 35 | fi 36 | 37 | echo "Installing index template" 38 | # Read index template file json from config/$INDEX_TEMPLATE_FILE 39 | INDEX_TEMPLATE=$(cat "$INDEX_TEMPLATE_FILE") 40 | JSON_DATA=$(cat $INDEX_TEMPLATE_FILE | sed "s/\${INDEX_NAME}/$INDEX_NAME/g") 41 | echo "Install index template" 42 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_index_template/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$JSON_DATA" 43 | 44 | echo "Creating the data stream" 45 | echo "Create the data stream" 46 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_data_stream/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' 47 | 48 | echo "Loading data" 49 | ./load_data.sh "$DATA_DIRECTORY" "$INDEX_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 50 | -------------------------------------------------------------------------------- /_files_lz4/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <DATA_DIRECTORY> <TARGET_DIRECTORY> <N>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if !
[[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Create a temporary directory inside the current directory 33 | TEMP_DIR="./temp_extraction" 34 | if [[ -d "$TEMP_DIR" ]]; then 35 | echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..." 36 | rm -rf "$TEMP_DIR" 37 | fi 38 | 39 | mkdir -p "$TEMP_DIR" 40 | 41 | # Trap to ensure cleanup of the temporary directory 42 | trap "rm -rf $TEMP_DIR" EXIT 43 | 44 | # Process the first N files 45 | count=0 46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 47 | if [[ $count -ge $N ]]; then 48 | break 49 | fi 50 | 51 | echo "Processing $file..." 52 | 53 | # Define paths for the temporary extracted file and compressed file 54 | extracted_file="$TEMP_DIR/$(basename "${file%.gz}")" 55 | compressed_file="$TEMP_DIR/$(basename "${file%.gz}.lz4")" 56 | 57 | # Extract the .json.gz file into the temporary directory 58 | gzip -c -d "$file" > "$extracted_file" 59 | if [[ $? -ne 0 ]]; then 60 | echo "Error: Failed to extract $file to $extracted_file" 61 | continue 62 | fi 63 | 64 | # Compress the extracted file with lz4 65 | lz4 "$extracted_file" "$compressed_file" 66 | if [[ $? -ne 0 ]]; then 67 | echo "Error: Failed to compress $extracted_file" 68 | continue 69 | fi 70 | 71 | # Copy the .lz4 file to the target directory 72 | cp "$compressed_file" "$TARGET_DIRECTORY/" 73 | if [[ $? -ne 0 ]]; then 74 | echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY" 75 | continue 76 | fi 77 | 78 | count=$((count + 1)) 79 | done 80 | 81 | # Cleanup (done automatically by the trap) 82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'." 83 | echo "Temporary directory '$TEMP_DIR' has been deleted." -------------------------------------------------------------------------------- /duckdb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | export PATH='/home/ubuntu/.duckdb/cli/latest':$PATH 37 | 38 | benchmark() { 39 | local size=$1 40 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 41 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 42 | if (( file_count < size )); then 43 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
44 | exit 1 45 | fi 46 | ./create_and_load.sh "db.duckdb_${size}" bluesky ddl.sql "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 47 | ./total_size.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 48 | ./count.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 49 | #./query_results.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 50 | ./physical_query_plans.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 51 | ./benchmark.sh "db.duckdb_${size}" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 52 | ./drop_table.sh "db.duckdb_${size}" 53 | } 54 | 55 | case $CHOICE in 56 | 2) 57 | benchmark 10 58 | ;; 59 | 3) 60 | benchmark 100 61 | ;; 62 | 4) 63 | benchmark 1000 64 | ;; 65 | 5) 66 | benchmark 1 67 | benchmark 10 68 | benchmark 100 69 | benchmark 1000 70 | ;; 71 | *) 72 | benchmark 1 73 | ;; 74 | esac 75 | 76 | 77 | ./uninstall.sh 78 | -------------------------------------------------------------------------------- /_files_zstd/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Create a temporary directory inside the current directory 33 | TEMP_DIR="./temp_extraction" 34 | if [[ -d "$TEMP_DIR" ]]; then 35 | echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..." 36 | rm -rf "$TEMP_DIR" 37 | fi 38 | 39 | mkdir -p "$TEMP_DIR" 40 | 41 | # Trap to ensure cleanup of the temporary directory 42 | trap "rm -rf $TEMP_DIR" EXIT 43 | 44 | # Process the first N files 45 | count=0 46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 47 | if [[ $count -ge $N ]]; then 48 | break 49 | fi 50 | 51 | echo "Processing $file..." 52 | 53 | # Define paths for the temporary extracted file and compressed file 54 | extracted_file="$TEMP_DIR/$(basename "${file%.gz}")" 55 | compressed_file="$TEMP_DIR/$(basename "${file%.gz}.zst")" 56 | 57 | # Extract the .json.gz file into the temporary directory 58 | gzip -c -d "$file" > "$extracted_file" 59 | if [[ $? -ne 0 ]]; then 60 | echo "Error: Failed to extract $file to $extracted_file" 61 | continue 62 | fi 63 | 64 | # Compress the extracted file with zstd 65 | zstd -1 "$extracted_file" -o "$compressed_file" 66 | if [[ $? -ne 0 ]]; then 67 | echo "Error: Failed to compress $extracted_file" 68 | continue 69 | fi 70 | 71 | # Copy the .zst file to the target directory 72 | cp "$compressed_file" "$TARGET_DIRECTORY/" 73 | if [[ $? -ne 0 ]]; then 74 | echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY" 75 | continue 76 | fi 77 | 78 | count=$((count + 1)) 79 | done 80 | 81 | # Cleanup (done automatically by the trap) 82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'." 
83 | echo "Temporary directory '$TEMP_DIR' has been deleted." -------------------------------------------------------------------------------- /mongodb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 43 | exit 1 44 | fi 45 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 46 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 47 | ./data_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 48 | ./index_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 49 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 50 | #./query_results.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 51 | ./index_usage.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_usage" 52 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 53 | ./drop_table.sh "bluesky_${size}m" 54 | } 55 | 56 | case $CHOICE in 57 | 2) 58 | benchmark 10 59 | ;; 60 | 3) 61 | benchmark 100 62 | ;; 63 | 4) 64 | benchmark 1000 65 | ;; 66 | 5) 67 | benchmark 1 68 | benchmark 10 69 | benchmark 100 70 | benchmark 1000 71 | ;; 72 | *) 73 | benchmark 1 74 | ;; 75 | esac 76 | 77 | ./uninstall.sh 78 | -------------------------------------------------------------------------------- /postgresql/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! 
-d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 43 | exit 1 44 | fi 45 | ./create_and_load.sh "bluesky_${size}m" bluesky "ddl.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 46 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 47 | ./data_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 48 | ./index_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 49 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 50 | ./index_usage.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_usage" 51 | #./query_results.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 52 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 53 | ./drop_tables.sh "bluesky_${size}m" 54 | } 55 | 56 | case $CHOICE in 57 | 2) 58 | benchmark 10 59 | ;; 60 | 3) 61 | benchmark 100 62 | ;; 63 | 4) 64 | benchmark 1000 65 | ;; 66 | 5) 67 | benchmark 1 68 | benchmark 10 69 | benchmark 100 70 | benchmark 1000 71 | ;; 72 | *) 73 | benchmark 1 74 | ;; 75 | esac 76 | 77 | ./uninstall.sh 78 | -------------------------------------------------------------------------------- /starrocks/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/main.sh. 4 | 5 | DEFAULT_CHOICE=ask 6 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 7 | 8 | # Allow the user to optionally provide the scale factor ("choice") as an argument 9 | CHOICE="${1:-$DEFAULT_CHOICE}" 10 | 11 | # Allow the user to optionally provide the data directory as an argument 12 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 13 | 14 | # Define success and error log files 15 | SUCCESS_LOG="${3:-success.log}" 16 | ERROR_LOG="${4:-error.log}" 17 | 18 | # Define prefix for output files 19 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 20 | 21 | export DB_HOST="127.0.0.1" 22 | export DB_USER="root" 23 | export DB_MYSQL_PORT="9030" 24 | export DB_HTTP_PORT="8030" # HTTP endpoint for stream load 25 | 26 | # Check if the directory exists 27 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 28 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
29 | exit 1 30 | fi 31 | 32 | if [ "$CHOICE" = "ask" ]; then 33 | echo "Select the dataset size to benchmark:" 34 | echo "1) 1m (default)" 35 | echo "2) 10m" 36 | echo "3) 100m" 37 | echo "4) 1000m" 38 | echo "5) all" 39 | read -p "Enter the number corresponding to your choice: " CHOICE 40 | fi; 41 | 42 | ./install.sh 43 | 44 | benchmark() { 45 | local size=$1 46 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 47 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 48 | if (( file_count < size )); then 49 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 50 | exit 1 51 | fi 52 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 53 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 54 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 55 | ./physical_query_plans.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 56 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m.results_memory_usage" 57 | ./drop_table.sh "bluesky_${size}m" bluesky 58 | } 59 | 60 | case $CHOICE in 61 | 2) 62 | benchmark 10 63 | ;; 64 | 3) 65 | benchmark 100 66 | ;; 67 | 4) 68 | benchmark 1000 69 | ;; 70 | 5) 71 | benchmark 1 72 | benchmark 10 73 | benchmark 100 74 | benchmark 1000 75 | ;; 76 | *) 77 | benchmark 1 78 | ;; 79 | esac 80 | 81 | ./uninstall.sh 82 | -------------------------------------------------------------------------------- /mongodb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | COLLECTION_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | MONGO_URI="mongodb://localhost:27017" # Replace with your MongoDB URI if necessary 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | # Create a temporary directory for uncompressed files 28 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 29 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 30 | 31 | # Counter to track processed files 32 | counter=0 33 | 34 | # Loop through each .json.gz file in the directory 35 | for file in $(ls "$DIRECTORY"/*.json.gz 2>/dev/null | sort); do 36 | if [[ -f "$file" ]]; then 37 | echo "Processing $file..." 38 | counter=$((counter + 1)) 39 | 40 | # Uncompress the file into the TEMP_DIR 41 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 42 | gunzip -c "$file" > "$uncompressed_file" 43 | 44 | # Check if uncompression was successful 45 | if [[ $? -ne 0 ]]; then 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 47 | continue 48 | fi 49 | 50 | # Import the uncompressed JSON file into MongoDB 51 | mongoimport --uri "$MONGO_URI" --db "$DB_NAME" --collection "$COLLECTION_NAME" --file "$uncompressed_file" 52 | import_status=$? 
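# Note: mongoimport ingests newline-delimited JSON by default (one document per line), so no --jsonArray flag is needed for these per-line event files; the exit status captured above decides below whether the file is logged as a success or a failure.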
53 | 54 | # Check if the import was successful 55 | if [[ $import_status -eq 0 ]]; then 56 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $uncompressed_file into MongoDB." >> "$SUCCESS_LOG" 57 | else 58 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $uncompressed_file into MongoDB." >> "$ERROR_LOG" 59 | fi 60 | 61 | # Remove the uncompressed file after processing 62 | rm -f "$uncompressed_file" 63 | 64 | # Stop processing if the max number of files is reached 65 | if [[ $counter -ge $MAX_FILES ]]; then 66 | echo "Processed maximum number of files: $MAX_FILES" 67 | break 68 | fi 69 | fi 70 | done 71 | 72 | if [[ $counter -eq 0 ]]; then 73 | echo "No .json.gz files found in the directory." 74 | fi 75 | 76 | echo "All files have been processed." -------------------------------------------------------------------------------- /clickhouse/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
43 | exit 1 44 | fi 45 | ./start.sh 46 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 47 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 48 | ./data_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 49 | ./index_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 50 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 51 | #./query_results.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 52 | ./index_usage.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_usage" 53 | ./physical_query_plans.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 54 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 55 | ./drop_table.sh # also stops ClickHouse 56 | } 57 | 58 | case $CHOICE in 59 | 2) 60 | benchmark 10 61 | ;; 62 | 3) 63 | benchmark 100 64 | ;; 65 | 4) 66 | benchmark 1000 67 | ;; 68 | 5) 69 | benchmark 1 70 | benchmark 10 71 | benchmark 100 72 | benchmark 1000 73 | ;; 74 | *) 75 | benchmark 1 76 | ;; 77 | esac 78 | -------------------------------------------------------------------------------- /greptimedb/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT commit_collection AS event, 5 | count(1) AS cnt 6 | FROM bluesky 7 | GROUP BY event 8 | ORDER BY cnt DESC; 9 | 10 | ------------------------------------------------------------------------------------------------------------------------ 11 | -- Q2 - Top event types together with unique users per event type 12 | ------------------------------------------------------------------------------------------------------------------------ 13 | SELECT commit_collection AS event, 14 | count(1) AS cnt, 15 | count(DISTINCT did) AS users 16 | FROM bluesky 17 | WHERE kind = 'commit' 18 | AND commit_operation = 'create' 19 | GROUP BY event 20 | ORDER BY cnt DESC; 21 | 22 | ------------------------------------------------------------------------------------------------------------------------ 23 | -- Q3 - When do people use BlueSky 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | SELECT commit_collection AS event, 26 | date_part('hour', time_us) AS hour_of_day, 27 | count(1) AS cnt 28 | FROM bluesky 29 | WHERE kind = 'commit' 30 | AND commit_operation = 'create' 31 | AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') 32 | GROUP BY event, 33 | hour_of_day 34 | ORDER BY hour_of_day, 35 | event; 36 | 37 | ------------------------------------------------------------------------------------------------------------------------ 38 | -- Q4 - top 3 post veterans 39 | ------------------------------------------------------------------------------------------------------------------------ 40 | SELECT did AS user_id, 41 | min(time_us) AS first_post_ts 42 | FROM bluesky 43 | WHERE kind = 'commit' 44 | AND commit_operation = 'create' 45 | AND commit_collection = 'app.bsky.feed.post' 46 | 
GROUP BY user_id 47 | ORDER BY first_post_ts ASC LIMIT 3; 48 | 49 | ------------------------------------------------------------------------------------------------------------------------ 50 | -- Q5 - top 3 users with longest activity 51 | ------------------------------------------------------------------------------------------------------------------------ 52 | SELECT did AS user_id, 53 | date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span 54 | FROM bluesky 55 | WHERE kind = 'commit' 56 | AND commit_operation = 'create' 57 | AND commit_collection = 'app.bsky.feed.post' 58 | GROUP BY user_id 59 | ORDER BY activity_span DESC LIMIT 3; 60 | -------------------------------------------------------------------------------- /elasticsearch/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | ./start.sh 39 | local size=$1 40 | local template=$2 41 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 42 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 43 | if (( file_count < size )); then 44 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
45 | exit 1 46 | fi 47 | ./create_and_load.sh "bluesky-${template}-${size}m" "index_template_${template}" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 48 | ./total_size.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.data_size" 49 | ./count.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.count" 50 | #./query_results.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.query_results" 51 | ./benchmark.sh "bluesky-${template}-${size}m" "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.results_runtime" 52 | ./drop_tables.sh 53 | } 54 | 55 | case $CHOICE in 56 | 2) 57 | benchmark 10 no_source 58 | benchmark 10 source 59 | ;; 60 | 3) 61 | benchmark 100 no_source 62 | benchmark 100 source 63 | ;; 64 | 4) 65 | benchmark 1000 no_source 66 | benchmark 1000 source 67 | ;; 68 | 5) 69 | benchmark 1 no_source 70 | benchmark 1 source 71 | benchmark 10 no_source 72 | benchmark 10 source 73 | benchmark 100 no_source 74 | benchmark 100 source 75 | benchmark 1000 no_source 76 | benchmark 1000 source 77 | ;; 78 | *) 79 | benchmark 1 no_source 80 | benchmark 1 source 81 | ;; 82 | esac 83 | 84 | ./uninstall.sh 85 | -------------------------------------------------------------------------------- /clickhouse/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>" 6 | exit 1 7 | fi 8 | 9 | 10 | # Arguments 11 | DATA_DIRECTORY="$1" 12 | DB_NAME="$2" 13 | TABLE_NAME="$3" 14 | MAX_FILES="$4" 15 | SUCCESS_LOG="$5" 16 | ERROR_LOG="$6" 17 | 18 | # Validate arguments 19 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 20 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 21 | 22 | 23 | # Create a temporary directory for uncompressed files 24 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 25 | trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit 26 | 27 | # Load data 28 | counter=0 29 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 30 | echo "Processing file: $file" 31 | 32 | # Uncompress the file into the TEMP_DIR 33 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 34 | gunzip -c "$file" > "$uncompressed_file" 35 | 36 | if [[ $? -ne 0 ]]; then 37 | echo "Error: Failed to uncompress $file" >> "$ERROR_LOG" 38 | continue 39 | fi 40 | 41 | # Attempt the first import 42 | ./clickhouse client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0 FORMAT JSONAsObject" < "$uncompressed_file" 43 | first_attempt=$? 44 | 45 | # Check if the first import was successful 46 | if [[ $first_attempt -eq 0 ]]; then 47 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG" 48 | rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing 49 | else 50 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] First attempt failed for $file. Trying again..." >> "$ERROR_LOG" 51 | 52 | echo "Processing $file... again..."
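# Note: the retry below additionally sets input_format_allow_errors_num and input_format_allow_errors_ratio, so ClickHouse skips malformed JSON rows instead of failing the whole INSERT when a file contains a few bad lines.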
53 | # Attempt the second import with a different command 54 | ./clickhouse client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0, input_format_allow_errors_num = 1_000_000_000, input_format_allow_errors_ratio=1 FORMAT JSONAsObject" < "$uncompressed_file" 55 | second_attempt=$? 56 | 57 | # Check if the second import was successful 58 | if [[ $second_attempt -eq 0 ]]; then 59 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file on second attempt." >> "$SUCCESS_LOG" 60 | rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing 61 | else 62 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Both attempts failed for $file. Giving up." >> "$ERROR_LOG" 63 | fi 64 | fi 65 | 66 | counter=$((counter + 1)) 67 | if [[ $counter -ge $MAX_FILES ]]; then 68 | break 69 | fi 70 | done 71 | -------------------------------------------------------------------------------- /clickhouse/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | data.commit.collection AS event, 6 | count() AS count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | SELECT 15 | data.commit.collection AS event, 16 | count() AS count, 17 | uniqExact(data.did) AS users 18 | FROM bluesky 19 | WHERE data.kind = 'commit' 20 | AND data.commit.operation = 'create' 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | SELECT 28 | data.commit.collection AS event, 29 | toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, 30 | count() AS count 31 | FROM bluesky 32 | WHERE data.kind = 'commit' 33 | AND data.commit.operation = 'create' 34 | AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] 35 | GROUP BY event, hour_of_day 36 | ORDER BY hour_of_day, event; 37 | 38 | ------------------------------------------------------------------------------------------------------------------------ 39 | -- Q4 - top 3 post veterans 40 | ------------------------------------------------------------------------------------------------------------------------ 41 | SELECT 42 | data.did::String as user_id, 43 | min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts 44 | FROM bluesky 45 | WHERE data.kind = 'commit' 46 | AND data.commit.operation = 'create' 47 | AND data.commit.collection = 'app.bsky.feed.post' 48 | GROUP BY user_id 49 | ORDER BY first_post_ts ASC 50 | LIMIT 3; 51 | 52 | ------------------------------------------------------------------------------------------------------------------------ 53 | -- Q5 - top 3 users with longest 
activity 54 | ------------------------------------------------------------------------------------------------------------------------ 55 | SELECT 56 | data.did::String as user_id, 57 | date_diff( 58 | 'milliseconds', 59 | min(fromUnixTimestamp64Micro(data.time_us)), 60 | max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span 61 | FROM bluesky 62 | WHERE data.kind = 'commit' 63 | AND data.commit.operation = 'create' 64 | AND data.commit.collection = 'app.bsky.feed.post' 65 | GROUP BY user_id 66 | ORDER BY activity_span DESC 67 | LIMIT 3; 68 | -------------------------------------------------------------------------------- /doris/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | # Arguments 11 | DATA_DIRECTORY="$1" 12 | DB_NAME="$2" 13 | TABLE_NAME="$3" 14 | MAX_FILES="$4" 15 | SUCCESS_LOG="$5" 16 | ERROR_LOG="$6" 17 | 18 | # Validate arguments 19 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 20 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 21 | 22 | # Create a temporary directory for uncompressed files 23 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 24 | trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit 25 | 26 | # Load data 27 | counter=0 28 | start=0 29 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 30 | echo "Processing file: $file" 31 | num=$(echo "$file" | sed -n 's/[^0-9]*\([0-9]\+\).*/\1/p') 32 | if [ "$num" -le "$start" ]; then 33 | continue 34 | fi 35 | 36 | # Uncompress the file into the TEMP_DIR 37 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 38 | gunzip -c "$file" > "$uncompressed_file" 39 | 40 | if [[ $? -ne 0 ]]; then 41 | echo "Error: Failed to uncompress $file" >> "$ERROR_LOG" 42 | continue 43 | fi 44 | MAX_ATTEMPT=10 45 | attempt=0 46 | while [ $attempt -lt $MAX_ATTEMPT ] 47 | do 48 | # Attempt the import 49 | http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "max_filter_ratio: 0.1" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load) 50 | response_body="$(cat /tmp/curl_body)" 51 | response_status="$(cat /tmp/curl_body | jq -r '.Status')" 52 | echo $response_status 53 | if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then 54 | if [ "$response_status" = "Success" ] 55 | then 56 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG" 57 | rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing 58 | attempt=$((MAX_ATTEMPT)) 59 | else 60 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG" 61 | attempt=$((attempt + 1)) 62 | sleep 2 63 | fi 64 | else 65 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. 
Response: $response_body" >> "$ERROR_LOG" 66 | attempt=$((attempt + 1)) 67 | sleep 2 68 | fi 69 | done 70 | 71 | counter=$((counter + 1)) 72 | if [[ $counter -ge $MAX_FILES ]]; then 73 | break 74 | fi 75 | done 76 | -------------------------------------------------------------------------------- /elasticsearch/config/filebeat.yml: -------------------------------------------------------------------------------- 1 | # ============================== Filebeat inputs =============================== 2 | filebeat.registry.flush: 5s 3 | filebeat.inputs: 4 | 5 | - type: filestream 6 | id: bluesky-events 7 | 8 | paths: 9 | - 10 | parsers: 11 | - ndjson: 12 | target: "" 13 | # ============================== Filebeat modules ============================== 14 | 15 | filebeat.config.modules: 16 | # Glob pattern for configuration loading 17 | path: ${path.config}/modules.d/*.yml 18 | 19 | # Set to true to enable config reloading 20 | reload.enabled: false 21 | 22 | # ======================= Elasticsearch template setting ======================= 23 | 24 | setup.template.enabled: false 25 | 26 | # ================================== Outputs =================================== 27 | 28 | # Configure what output to use when sending the data collected by the beat. 29 | 30 | # ---------------------------- Elasticsearch Output ---------------------------- 31 | 32 | output.elasticsearch: 33 | # Array of hosts to connect to. 34 | hosts: ["https://localhost:9200"] 35 | 36 | # Performance preset - one of "balanced", "throughput", "scale", 37 | # "latency", or "custom". 38 | preset: throughput 39 | compression_level: 1 40 | idle_connection_timeout: 30s 41 | # Protocol - either `http` (default) or `https`. 42 | protocol: "https" 43 | index: "" 44 | # Authentication credentials - either API key or username/password. 
--------------------------------------------------------------------------------
/postgresql/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if the required arguments are provided
4 | if [[ $# -lt 6 ]]; then
5 |     echo "Usage: $0 <DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
6 |     exit 1
7 | fi
8 | 
9 | # Arguments
10 | DIRECTORY="$1"
11 | DB_NAME="$2"
12 | TABLE_NAME="$3"
13 | MAX_FILES="$4"
14 | SUCCESS_LOG="$5"
15 | ERROR_LOG="$6"
16 | PSQL_CMD="sudo -u postgres psql -d $DB_NAME"
17 | 
18 | # Validate that MAX_FILES is a number
19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then
20 |     echo "Error: MAX_FILES must be a positive integer."
21 |     exit 1
22 | fi
23 | 
24 | # Ensure the log files exist
25 | touch "$SUCCESS_LOG" "$ERROR_LOG"
26 | 
27 | # Create a temporary directory in /var/tmp and ensure it's accessible
28 | TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX)
29 | chmod 777 "$TEMP_DIR" # Allow access for all users
30 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit
31 | 
32 | # Counter to track processed files
33 | counter=0
34 | 
35 | # Loop through each .json.gz file in the directory
36 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do
37 |     if [[ -f "$file" ]]; then
38 |         echo "Processing $file..."
39 |         counter=$((counter + 1))
40 | 
41 |         # Uncompress the file into the temporary directory
42 |         uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
43 |         gunzip -c "$file" > "$uncompressed_file"
44 | 
45 |         # Check if uncompression was successful
46 |         if [[ $? -ne 0 ]]; then
47 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG"
48 |             continue
49 |         fi
50 | 
51 |         # Preprocess the file to remove null characters
52 |         cleaned_file="$TEMP_DIR/$(basename "${uncompressed_file%.json}_cleaned.json")"
53 |         sed 's/\\u0000//g' "$uncompressed_file" > "$cleaned_file"
54 | 
55 |         # Grant read permissions for the postgres user
56 |         chmod 644 "$cleaned_file"
57 | 
58 |         # Import the cleaned JSON file into PostgreSQL
59 |         $PSQL_CMD -c "\COPY $TABLE_NAME FROM '$cleaned_file' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');"
60 |         import_status=$?
61 | 
62 |         # Check if the import was successful
63 |         if [[ $import_status -eq 0 ]]; then
64 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $cleaned_file into PostgreSQL." >> "$SUCCESS_LOG"
65 |             # Delete both the uncompressed and cleaned files after successful processing
66 |             rm -f "$uncompressed_file" "$cleaned_file"
67 |         else
68 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $cleaned_file. See errors above." >> "$ERROR_LOG"
69 |             # Keep the files for debugging purposes
70 |         fi
71 | 
72 |         # Stop processing if the max number of files is reached
73 |         if [[ $counter -ge $MAX_FILES ]]; then
74 |             echo "Processed maximum number of files: $MAX_FILES"
75 |             break
76 |         fi
77 |     else
78 |         echo "No .json.gz files found in the directory."
79 |     fi
80 | done
81 | 
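A minimal usage sketch for the PostgreSQL loader above, with hypothetical argument values. The \COPY options (csv format with control characters as quote, delimiter, and escape) load each newline-delimited JSON document as a single value, so a row count afterwards is a quick sanity check; the database and table name bluesky below are assumptions:

    # Hypothetical example run: import the first 10 files, then count the loaded rows.
    ./load_data.sh ~/data/bluesky bluesky bluesky 10 success.log error.log
    sudo -u postgres psql -d bluesky -c "SELECT count(*) FROM bluesky;"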
--------------------------------------------------------------------------------