├── CNAME ├── .gitignore ├── duckdb ├── ddl.sql ├── uninstall.sh ├── install.sh ├── results │ ├── m6i.8xlarge_bluesky_1000m.errors │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── drop_table.sh ├── count.sh ├── total_size.sh ├── query_results.sh ├── physical_query_plans.sh ├── run_queries.sh ├── benchmark.sh ├── create_and_load.sh ├── queries.sql ├── load_data.sh └── main.sh ├── favicon.png ├── clickhouse ├── install.sh ├── start.sh ├── count.sh ├── total_size.sh ├── data_size.sh ├── drop_table.sh ├── index_size.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── index_usage.sh ├── physical_query_plans.sh ├── query_results.sh ├── ddl.sql ├── run_queries.sh ├── create_and_load.sh ├── queries.sql ├── benchmark.sh ├── main.sh ├── load_data.sh └── queries_formatted.sql ├── postgresql ├── uninstall.sh ├── install.sh ├── drop_tables.sh ├── data_size.sh ├── total_size.sh ├── count.sh ├── ddl.sql ├── index_size.sh ├── results │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_1000m.json │ └── m6i.8xlarge_bluesky_100m.json ├── query_results.sh ├── index_usage.sh ├── run_queries.sh ├── benchmark.sh ├── create_and_load.sh ├── queries.sql ├── main.sh └── load_data.sh ├── elasticsearch ├── uninstall.sh ├── config │ ├── ilm.json │ ├── elasticsearch.yml │ ├── index_template_source.json │ ├── index_template_no_source.json │ └── filebeat.yml ├── results │ ├── m6i.8xlarge_bluesky_source_1m.json │ ├── m6i.8xlarge_bluesky_source_10m.json │ ├── m6i.8xlarge_bluesky_no_source_10m.json │ ├── m6i.8xlarge_bluesky_no_source_1m.json │ ├── m6i.8xlarge_bluesky_source_1000m.json │ ├── m6i.8xlarge_bluesky_source_100m.json │ ├── m6i.8xlarge_bluesky_no_source_100m.json │ └── m6i.8xlarge_bluesky_no_source_1000m.json ├── count.sh ├── total_size.sh ├── install.sh ├── benchmark.sh ├── start.sh ├── drop_tables.sh ├── queries.txt ├── query_results.sh ├── queries_formatted.txt ├── run_queries.sh ├── create_and_load.sh └── main.sh ├── mongodb ├── uninstall.sh ├── ddl.js ├── drop_table.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── install.sh ├── data_size.sh ├── total_size.sh ├── index_size.sh ├── count.sh ├── create_and_load.sh ├── benchmark.sh ├── query_results.sh ├── queries.js ├── index_usage.sh ├── main.sh └── load_data.sh ├── doris ├── stop.sh ├── uninstall.sh ├── install.sh ├── total_size.sh ├── count.sh ├── drop_table.sh ├── start.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── ddl.sql ├── queries.sql ├── create_and_load.sh ├── benchmark.sh ├── run_queries.sh ├── queries_default.sql ├── main.sh └── load_data.sh ├── victorialogs ├── count.sh ├── index_size.sh ├── drop_tables.sh ├── start.sh ├── data_size.sh ├── install.sh ├── total_size.sh ├── query_results.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── queries.logsql ├── run_queries.sh ├── load_data.sh ├── main.sh └── queries_formatted.logsql ├── starrocks ├── uninstall.sh ├── count.sh ├── total_size.sh ├── ddl.sql ├── drop_table.sh ├── results │ ├── m6i.8xlarge_bluesky_10m.json │ ├── 
m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── physical_query_plans.sh ├── install.sh ├── run_queries.sh ├── create_and_load.sh ├── benchmark.sh ├── queries.sql └── main.sh ├── greptimedb ├── drop_tables.sh ├── count.sh ├── install.sh ├── data_size.sh ├── index_size.sh ├── total_size.sh ├── results │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_100m.json │ └── m6i.8xlarge_bluesky_1000m.json ├── pipeline.yaml ├── start.sh ├── run_queries.sh ├── queries.sql ├── load_data.sh ├── main.sh └── queries_formatted.sql ├── singlestore ├── uninstall.sh ├── count.sh ├── drop_table.sh ├── index_size.sh ├── data_size.sh ├── ddl.sql ├── total_size.sh ├── benchmark.sh ├── query_results.sh ├── install.sh ├── results │ ├── m6i.8xlarge_bluesky_10m.json │ ├── m6i.8xlarge_bluesky_1m.json │ ├── m6i.8xlarge_bluesky_100m.json │ ├── m6i.8xlarge_bluesky_1000m.json │ └── _query_results │ │ └── _m6i.8xlarge_bluesky_1m.query_results ├── physical_query_plans.sh ├── run_queries.sh ├── create_and_load.sh ├── queries.sql └── load_data.sh ├── _files_gz ├── results │ ├── _files_bluesky_gz_1m.json │ ├── _files_bluesky_gz_10m.json │ ├── _files_bluesky_gz_1000m.json │ └── _files_bluesky_gz_100m.json ├── main.sh └── total_size.sh ├── _files_json ├── results │ ├── _files_bluesky_json_10m.json │ ├── _files_bluesky_json_1m.json │ ├── _files_bluesky_json_1000m.json │ └── _files_bluesky_json_100m.json ├── total_size.sh ├── load_data.sh └── main.sh ├── _files_lz4 ├── results │ ├── _files_bluesky_lz4_1m.json │ ├── _files_bluesky_lz4_100m.json │ ├── _files_bluesky_lz4_10m.json │ └── _files_bluesky_lz4_1000m.json ├── total_size.sh ├── main.sh └── load_data.sh ├── _files_zstd ├── results │ ├── _files_bluesky_zstd_10m.json │ ├── _files_bluesky_zstd_1m.json │ ├── _files_bluesky_zstd_1000m.json │ └── _files_bluesky_zstd_100m.json ├── total_size.sh ├── main.sh └── load_data.sh ├── .github └── workflows │ └── generate-results.yml ├── generate-results.sh └── download_data.sh /CNAME: -------------------------------------------------------------------------------- 1 | jsonbench.com -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .idea 3 | -------------------------------------------------------------------------------- /duckdb/ddl.sql: -------------------------------------------------------------------------------- 1 | create table bluesky (j JSON); -------------------------------------------------------------------------------- /duckdb/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf ~/.duckdb 4 | -------------------------------------------------------------------------------- /favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ClickHouse/JSONBench/HEAD/favicon.png -------------------------------------------------------------------------------- /clickhouse/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl https://clickhouse.com/ | sh 4 | -------------------------------------------------------------------------------- /postgresql/uninstall.sh: -------------------------------------------------------------------------------- 1 | sudo apt-get remove -y postgresql-common postgresql-16 2 
| -------------------------------------------------------------------------------- /elasticsearch/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get remove --yes elasticsearch filebeat 4 | -------------------------------------------------------------------------------- /mongodb/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo systemctl stop mongod 4 | sudo apt-get remove -y mongodb-org 5 | -------------------------------------------------------------------------------- /doris/stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ${DORIS_FULL_NAME}/be/bin/stop_be.sh 4 | ${DORIS_FULL_NAME}/fe/bin/stop_fe.sh 5 | -------------------------------------------------------------------------------- /duckdb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf ~/.duckdb # remove remainders 4 | curl https://install.duckdb.org | sh 5 | -------------------------------------------------------------------------------- /doris/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get remove -y mysql-client openjdk-17-jre-headless 4 | 5 | rm -rf ${DORIS_FULL_NAME} 6 | -------------------------------------------------------------------------------- /victorialogs/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | count() rows" 4 | -------------------------------------------------------------------------------- /starrocks/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker stop starrocks 4 | docker rm starrocks 5 | 6 | sudo apt-get remove -y mysql-client 7 | sudo snap remove --purge docker 8 | -------------------------------------------------------------------------------- /greptimedb/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping GreptimeDB" 4 | pidof greptime && kill `pidof greptime` 5 | 6 | echo "Dropping all data" 7 | rm -rf ./greptimedb_data 8 | -------------------------------------------------------------------------------- /postgresql/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.postgresql.org/download/linux/ubuntu/ 4 | 5 | sudo apt-get update 6 | sudo apt-get install -y postgresql-common postgresql-16 7 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.errors: -------------------------------------------------------------------------------- 1 | `Invalid Input Error: Malformed JSON at byte 3 of input: unexpected content after document. 
Input: ":"This user is a Sable!","lang":"en","name":"S..."` -------------------------------------------------------------------------------- /victorialogs/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | block_stats | sum(bloom_bytes) index_bytes | keep index_bytes" 4 | -------------------------------------------------------------------------------- /victorialogs/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping VictoriaLogs" 4 | pidof victoria-logs-prod && kill `pidof victoria-logs-prod` 5 | 6 | echo "Dropping all data" 7 | rm -rf victoria-logs-data 8 | -------------------------------------------------------------------------------- /singlestore/uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # stop and remove all docker containers 4 | docker stop $(docker ps -a -q) 5 | docker rm $(docker ps -a -q) 6 | 7 | sudo apt-get remove -y mysql-client 8 | sudo snap remove --purge docker 9 | -------------------------------------------------------------------------------- /clickhouse/start.sh: -------------------------------------------------------------------------------- 1 | pidof clickhouse > /dev/null && exit 1 2 | 3 | ./clickhouse server > /dev/null 2>&1 & 4 | 5 | sleep 5 6 | 7 | while true 8 | do 9 | ./clickhouse client --query "SELECT 1" && break 10 | sleep 1 11 | done 12 | 13 | -------------------------------------------------------------------------------- /mongodb/ddl.js: -------------------------------------------------------------------------------- 1 | db.createCollection( 2 | "bluesky", 3 | { storageEngine: { wiredTiger: { configString: "block_compressor=zstd" } } } 4 | ); 5 | 6 | db.bluesky.createIndex({"kind": 1, "commit.operation": 1, "commit.collection": 1, "did": 1, "time_us": 1}); -------------------------------------------------------------------------------- /victorialogs/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Do we run already? 
4 | pidof victoria-logs-prod >/dev/null && exit 1 5 | 6 | echo "Starting VictoriaLogs" 7 | ./victoria-logs-prod -loggerOutput=stdout -retentionPeriod=20y -search.maxQueryDuration=5m > server.log & 8 | -------------------------------------------------------------------------------- /victorialogs/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | block_stats | sum(values_bytes) values_bytes, sum(dict_bytes) dict_bytes | math values_bytes + dict_bytes data_bytes | keep data_bytes" 4 | -------------------------------------------------------------------------------- /greptimedb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=select count(*) as cnt from bluesky" \ 6 | -d "format=json" \ 7 | | grep -o "cnt\":[0-9]*" | sed 's/cnt\"://g' -------------------------------------------------------------------------------- /victorialogs/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RELEASE_VERSION=v1.17.0-victorialogs 4 | 5 | wget -N https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/${RELEASE_VERSION}/victoria-logs-linux-amd64-${RELEASE_VERSION}.tar.gz 6 | tar xzf victoria-logs-linux-amd64-${RELEASE_VERSION}.tar.gz 7 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 135176827 11 | } 12 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | "total_size": 1354902507 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | "total_size": 4858741288 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 480778277 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1m.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 208385826 11 | } 12 | -------------------------------------------------------------------------------- /victorialogs/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=* | block_stats | sum(values_bytes) values_bytes, sum(dict_bytes) dict_bytes, sum(bloom_bytes) bloom_bytes | math values_bytes + dict_bytes + bloom_bytes total_bytes | keep total_bytes" 4 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 134117979655 11 | } 12 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 13372936569 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 482108809691 11 | } 12 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 47813179260 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 20591959778 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | 
"total_size": 2084888024 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 10000000, 10 | "total_size": 1269817486 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000, 10 | "total_size": 126734406 11 | } 12 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 206562787263 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 1000000000, 10 | "total_size": 123797963671 11 | } 12 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "retains_structure": "yes", 7 | "tags": [ 8 | ], 9 | "dataset_size": 100000000, 10 | "total_size": 12245368182 11 | } 12 | -------------------------------------------------------------------------------- /mongodb/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | DB_NAME="$1" 10 | 11 | echo "Dropping database: $DB_NAME" 12 | 13 | mongosh --eval "use $DB_NAME" --eval "db.dropDatabase()" 14 | -------------------------------------------------------------------------------- /postgresql/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | echo "Dropping database" 13 | sudo -u postgres psql -t -c "DROP DATABASE $DB_NAME" 14 | -------------------------------------------------------------------------------- /postgresql/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 
| if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_table_size('$TABLE_NAME')" -------------------------------------------------------------------------------- /clickhouse/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --database="$DB_NAME" --query "SELECT count() FROM '$TABLE_NAME';" 14 | -------------------------------------------------------------------------------- /postgresql/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_total_relation_size('$TABLE_NAME')" -------------------------------------------------------------------------------- /duckdb/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DATABASE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | 12 | echo "Dropping database: $DATABASE_NAME" 13 | 14 | rm -f ~/${DATABASE_NAME} 15 | rm -f ~/${DATABASE_NAME}-c 16 | -------------------------------------------------------------------------------- /postgresql/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Count the rows in the table 14 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT count(*) from $TABLE_NAME" -------------------------------------------------------------------------------- /postgresql/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | data JSONB COMPRESSION lz4 NOT NULL 3 | ); 4 | 5 | CREATE INDEX idx_bluesky 6 | ON bluesky ( 7 | (data ->> 'kind'), 8 | (data -> 'commit' ->> 'operation'), 9 | (data -> 'commit' ->> 'collection'), 10 | (data ->> 'did'), 11 | (TO_TIMESTAMP((data ->> 'time_us')::BIGINT / 1000000.0)) 12 | ); -------------------------------------------------------------------------------- /doris/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://apache-doris-releases.oss-accelerate.aliyuncs.com/${DORIS_FULL_NAME}.tar.gz 4 | mkdir ${DORIS_FULL_NAME} 5 | tar -xvf ${DORIS_FULL_NAME}.tar.gz --strip-components 1 -C ${DORIS_FULL_NAME} 6 | 7 | sudo apt-get update 8 | sudo apt-get install -y mysql-client openjdk-17-jre-headless # somehow _EXACTLY_ v17 is needed 9 | -------------------------------------------------------------------------------- /postgresql/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DB_NAME>" 6 | exit 1 7 | fi 8 | 9 | #
Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_relation_size(oid) FROM pg_class WHERE relname = 'idx_bluesky'" -------------------------------------------------------------------------------- /duckdb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DATABASE_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Fetch the count using DuckDB 14 | duckdb ~/$DATABASE_NAME -c "select count() from '$TABLE_NAME';" 15 | 16 | -------------------------------------------------------------------------------- /clickhouse/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --query "SELECT sum(bytes_on_disk) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" 14 | -------------------------------------------------------------------------------- /clickhouse/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --query "SELECT sum(data_compressed_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" 14 | -------------------------------------------------------------------------------- /clickhouse/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping ClickHouse" 4 | pidof clickhouse && kill -9 `pidof clickhouse` 5 | 6 | # 'DROP TABLE' has a built-in safety mechanism that prevents users from dropping large tables. We hit that with large 7 | # amounts of ingested data. Instead, make our lives easy and remove the persistence manually.
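# (The built-in guard is the server-level max_table_size_to_drop setting, 50 GB by default; it could also be bypassed by raising that setting or creating the flags/force_drop_table file, but wiping the on-disk state is the simplest full reset for the benchmark.)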
8 | echo "Dropping all data" 9 | rm -rf data/ metadata/ store/ 10 | -------------------------------------------------------------------------------- /clickhouse/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | ./clickhouse client --query "SELECT sum(primary_key_size) + sum(marks_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" 14 | -------------------------------------------------------------------------------- /doris/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "ANALYZE TABLE $TABLE_NAME WITH SYNC" 14 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SHOW DATA FROM $TABLE_NAME" 15 | -------------------------------------------------------------------------------- /greptimedb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RELEASE_VERSION=v0.13.0-nightly-20250315 4 | 5 | # download greptimedb 6 | wget -N "https://github.com/GreptimeTeam/greptimedb/releases/download/${RELEASE_VERSION}/greptime-linux-amd64-${RELEASE_VERSION}.tar.gz" 7 | tar xzf greptime-linux-amd64-${RELEASE_VERSION}.tar.gz 8 | mv greptime-linux-amd64-${RELEASE_VERSION}/greptime ./ 9 | rm -rf greptime-linux-amd64-${RELEASE_VERSION} 10 | -------------------------------------------------------------------------------- /singlestore/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT count(*) FROM $DB_NAME.$TABLE_NAME" 17 | -------------------------------------------------------------------------------- /doris/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in starrocks/count.sh. 
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | 15 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;" 16 | -------------------------------------------------------------------------------- /greptimedb/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=SELECT sum(r.sst_size) as data_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \ 6 | -d "format=json" \ 7 | | grep -o "data_size\":[0-9]*" | sed 's/data_size\"://g' 8 | -------------------------------------------------------------------------------- /greptimedb/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=SELECT sum(r.index_size) as index_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \ 6 | -d "format=json" \ 7 | | grep -o "index_size\":[0-9]*" | sed 's/index_size\"://g' 8 | -------------------------------------------------------------------------------- /greptimedb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | curl -s --fail http://localhost:4000/v1/sql \ 4 | -H 'Content-Type: application/x-www-form-urlencoded' \ 5 | -d "sql=SELECT sum(r.disk_size) as total_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \ 6 | -d "format=json" \ 7 | | grep -o "total_size\":[0-9]*" | sed 's/total_size\"://g' 8 | -------------------------------------------------------------------------------- /starrocks/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change doris/count.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | 15 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;" 16 | -------------------------------------------------------------------------------- /singlestore/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <ROOT_PASSWORD> <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | ROOT_PASSWORD="$1" 10 | DB_NAME="$2" 11 | TABLE_NAME="$3" 12 | 13 | echo "Dropping table: $DB_NAME.$TABLE_NAME" 14 | 15 | export MYSQL_PWD=${ROOT_PASSWORD} 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "DROP DATABASE IF EXISTS $DB_NAME" 17 | -------------------------------------------------------------------------------- /starrocks/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change doris/total_size.sh.
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | 15 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SHOW DATA FROM $TABLE_NAME" 16 | -------------------------------------------------------------------------------- /doris/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change starrocks/drop_table.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | DB_NAME="$1" 12 | TABLE_NAME="$2" 13 | 14 | echo "Dropping table: $DB_NAME.$TABLE_NAME" 15 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "DROP TABLE IF EXISTS $TABLE_NAME" 16 | -------------------------------------------------------------------------------- /starrocks/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | `id` BIGINT AUTO_INCREMENT, 3 | `data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON", 4 | 5 | sort_key VARBINARY AS encode_sort_key( 6 | get_json_string(data, 'kind'), 7 | get_json_string(data, 'commit.operation'), 8 | get_json_string(data, 'commit.collection'), 9 | get_json_string(data, 'did') 10 | ) 11 | ) 12 | ORDER BY (sort_key); 13 | -------------------------------------------------------------------------------- /elasticsearch/config/ilm.json: -------------------------------------------------------------------------------- 1 | { 2 | "policy": { 3 | "phases": { 4 | "hot": { 5 | "min_age": "0ms", 6 | "actions": { 7 | "rollover": { 8 | "max_age": "30d", 9 | "max_primary_shard_size": "50gb" 10 | }, 11 | "forcemerge": { 12 | "max_num_segments": 1 13 | }, 14 | "readonly": {} 15 | } 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /starrocks/drop_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change doris/drop_table.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 <DB_NAME> <TABLE_NAME>" 8 | exit 1 9 | fi 10 | 11 | DB_NAME="$1" 12 | TABLE_NAME="$2" 13 | 14 | echo "Dropping table: $DB_NAME.$TABLE_NAME" 15 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME" 16 | -------------------------------------------------------------------------------- /_files_json/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DIRECTORY>" 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist."
15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /_files_lz4/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DIRECTORY>" 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /_files_zstd/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DIRECTORY>" 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /singlestore/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <ROOT_PASSWORD> <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT sum(memory_use) FROM information_schema.index_statistics WHERE database_name = '$DB_NAME' AND table_name = '$TABLE_NAME'" 17 | -------------------------------------------------------------------------------- /doris/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64" 4 | sudo sysctl -w vm.max_map_count=2000000 5 | sudo sh -c "ulimit -n 655350" 6 | 7 | ${DORIS_FULL_NAME}/be/bin/start_be.sh --daemon 8 | ${DORIS_FULL_NAME}/fe/bin/start_fe.sh --daemon 9 | 10 | echo "Sleep 30 sec to wait for Doris to start" 11 | sleep 30s 12 | 13 | mysql -P 9030 -h 127.0.0.1 -u root -e "ALTER SYSTEM ADD BACKEND \"127.0.0.1:9050\";" 14 | 15 | echo "Sleep 10 sec to wait for the frontend to connect to the backend" 16 | sleep 10s 17 | -------------------------------------------------------------------------------- /singlestore/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <ROOT_PASSWORD> <DB_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT sum(compressed_size) FROM information_schema.columnar_segments WHERE database_name = '$DB_NAME' AND table_name = '$TABLE_NAME'" 17 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1m.json:
-------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 485490688, 13 | "result": [ 14 | [3.554,0.112,0.111], 15 | [3.441,0.324,0.321], 16 | [2.921,0.329,0.339], 17 | [2.961,0.255,0.255], 18 | [3.131,0.262,0.262] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /singlestore/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | data JSON 4 | ); 5 | -- Notes: 6 | -- - Not using data structures to speed up scans. In SingleStore, no sort keys or indexes can be created on JSON sub-columns. 7 | -- - The only physical optimization we use is 'use_seekable_json' but that is implicitly on: https://docs.singlestore.com/db/v8.9/create-a-database/columnstore/columnstore-seekability-using-json/ 8 | -- - We _could_ run OPTIMIZE to force a merge but since we are also not doing this for other benchmarked databases, we omit that. 9 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999997, 12 | "total_size": 2043330691, 13 | "result": [ 14 | [0.09,0.03,0.03], 15 | [0.50,0.22,0.19], 16 | [0.12,0.06,0.05], 17 | [0.10,0.04,0.04], 18 | [0.10,0.04,0.04] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 207987146, 13 | "result": [ 14 | [0.14,0.05,0.05], 15 | [0.21,0.06,0.06], 16 | [0.34,0.05,0.05], 17 | [0.11,0.03,0.03], 18 | [0.09,0.03,0.03] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9700000, 12 | "total_size": 4753981440, 13 | "result": [ 14 | [36.379,0.612,0.609], 15 | [36.357,1.611,1.608], 16 | [36.310,1.551,1.561], 17 | [36.337,1.028,1.108], 18 | [36.372,1.113,1.118] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | 
"os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999984, 12 | "total_size": 18193481465, 13 | "result": [ 14 | [0.17,0.09,0.09], 15 | [7.14,1.07,1.04], 16 | [1.13,0.21,0.22], 17 | [0.18,0.12,0.12], 18 | [0.19,0.13,0.14] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /victorialogs/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | QUERY_NUM=1 4 | 5 | set -f 6 | cat queries.logsql | while read -r query; do 7 | 8 | # Print the query 9 | echo "------------------------------------------------------------------------------------------------------------------------" 10 | echo "Result for query Q$QUERY_NUM:" 11 | echo 12 | 13 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=$query" 14 | 15 | # Increment the query number 16 | QUERY_NUM=$((QUERY_NUM + 1)) 17 | done; 18 | -------------------------------------------------------------------------------- /duckdb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Fetch the total size using duckDB 14 | duckdb ~/$DATABASE_NAME -c "select '$TABLE_NAME' as table_name, count(distinct block_id) as num_blocks, count(distinct block_id) * (select block_size from pragma_database_size()) as num_bytes from pragma_storage_info('$TABLE_NAME') group by all;" 15 | 16 | -------------------------------------------------------------------------------- /starrocks/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Starrocks", 3 | "version": "4.0.1", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 997999662, 12 | "total_size": 192981470543, 13 | "result": [ 14 | [0.69,0.52,0.51], 15 | [74.01,7.27,7.18], 16 | [20.58,1.51,1.45], 17 | [15.86,1.03,1.01], 18 | [1.18,1.10,1.11] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 98700000, 12 | "total_size": 47184347136, 13 | "result": [ 14 | [367.536,5.485,5.487], 15 | [367.771,15.209,15.225], 16 | [367.548,13.420,13.357], 17 | [367.689,7.544,7.576], 18 | [367.900,8.177,8.120] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | 
"retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 400974094, 13 | "result": [ 14 | [0.039,0.036,0.036], 15 | [0.344,0.303,0.305], 16 | [0.171,0.166,0.159], 17 | [0.047,0.047,0.049], 18 | [0.056,0.056,0.056] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999998, 12 | "total_size": 3785308904, 13 | "result": [ 14 | [0.286,0.290,0.287], 15 | [2.487,2.367,2.406], 16 | [1.747,1.671,1.656], 17 | [0.368,0.360,0.364], 18 | [0.423,0.424,0.422] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /singlestore/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | 14 | export MYSQL_PWD=${ROOT_PASSWORD} 15 | 16 | # No indexes are used, same query as in data_size.sh 17 | mysql -h 127.0.0.1 -P 3306 -u root -e "SELECT sum(compressed_size) FROM information_schema.columnar_segments WHERE database_name = '$DB_NAME' AND table_name = '$TABLE_NAME'" 18 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999998, 12 | "total_size": 2834172690, 13 | "result": [ 14 | [0.270,0.263,0.275], 15 | [2.942,2.683,2.655], 16 | [2.014,2.008,2.037], 17 | [0.414,0.412,0.437], 18 | [0.562,0.470,0.463] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 400948257, 13 | "result": [ 14 | [0.041,0.037,0.035], 15 | [0.426,0.321,0.323], 16 | [0.192,0.186,0.213], 17 | [0.056,0.052,0.053], 18 | [0.099,0.061,0.060] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | 
"tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999101, 12 | "total_size": 386099682721, 13 | "result": [ 14 | [3.854,3.884,4.081], 15 | [37.078,29.084,28.548], 16 | [24.382,24.279,23.570], 17 | [8.106,8.228,8.080], 18 | [9.208,8.994,9.084] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999947, 12 | "total_size": 34182479705, 13 | "result": [ 14 | [2.765,2.718,2.799], 15 | [20.788,20.822,20.270], 16 | [16.306,16.642,15.693], 17 | [2.454,2.461,2.423], 18 | [2.761,2.768,2.784] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999947, 12 | "total_size": 21268479403, 13 | "result": [ 14 | [2.532,2.536,2.486], 15 | [23.194,22.932,23.188], 16 | [19.521,19.321,19.159], 17 | [2.867,2.791,2.884], 18 | [3.099,3.136,3.171] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 207785820, 13 | "data_size": 207785820, 14 | "result": [ 15 | [0.03,0.02,0.02], 16 | [0.06,0.04,0.05], 17 | [0.04,0.02,0.03], 18 | [0.03,0.02,0.02], 19 | [0.03,0.02,0.02] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999998998, 12 | "total_size": 235840659266, 13 | "result": [ 14 | [5.022,5.019,5.078], 15 | [51.486,45.510,45.713], 16 | [41.789,41.359,41.608], 17 | [8.807,8.812,8.711], 18 | [9.696,9.723,9.533] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 
8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 954368, 13 | "data_size": 4096, 14 | "index_size": 950272, 15 | "result": [ 16 | [1.324,1.338,1.341], 17 | [1.815,1.823,1.832], 18 | [1.555,1.603,1.532], 19 | [0.249,0.256,0.266], 20 | [0.267,0.278,0.276] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 10000000, 12 | "total_size": 2170032226, 13 | "data_size": 2170032226, 14 | "result": [ 15 | [0.05,0.04,0.04], 16 | [0.58,0.12,0.12], 17 | [0.06,0.04,0.05], 18 | [0.04,0.03,0.04], 19 | [0.04,0.04,0.04] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-27", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 974400000, 12 | "total_size": 472599756800, 13 | "result": [ 14 | [3734.026,3722.939,3717.611], 15 | [3737.451,3726.788,3721.045], 16 | [3734.092,3722.939,3717.631], 17 | [3737.381,3724.588,3719.273], 18 | [3737.908,3726.648,3722.804] 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 100000000, 12 | "total_size": 21304111530, 13 | "data_size": 21304111530, 14 | "result": [ 15 | [0.23,0.19,0.18], 16 | [9.39,0.77,0.77], 17 | [1.12,0.19,0.19], 18 | [0.17,0.15,0.17], 19 | [0.19,0.17,0.16] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /doris/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Apache Doris", 3 | "version": "doris-3.0.5-rc01-e277cfb83f", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-05-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999994, 12 | "total_size": 214623810748, 13 | "data_size": 214623810748, 14 | "result": [ 15 | [2.02,1.63,1.63], 16 | [96.02,6.61,6.64], 17 | [21.89,2.15,1.78], 18 | [10.26,0.95,0.93], 19 | [1.04,1.03,1.04] 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /mongodb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ 4 | 5 | sudo sudo apt-get install 
gnupg curl 6 | curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ 7 | sudo gpg --dearmor --yes -o /usr/share/keyrings/mongodb-server-8.0.gpg 8 | echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu noble/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list 9 | sudo apt-get update 10 | sudo apt-get install -y mongodb-org 11 | sudo systemctl start mongod 12 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 98534792, 13 | "data_size": 98424457, 14 | "index_size": 110328, 15 | "result": [ 16 | [0.007, 0.045, 0.004], 17 | [0.042, 0.035, 0.022], 18 | [0.022, 0.013, 0.012], 19 | [0.033, 0.017, 0.017], 20 | [0.041, 0.020, 0.019] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 7860000, 12 | "total_size": 1536905216, 13 | "data_size": 1171361792, 14 | "index_size": 365543424, 15 | "result": [ 16 | [10.334,10.266,10.298], 17 | [37.401,36.807,37.979], 18 | [13.209,12.799,12.889], 19 | [2.071,2.029,2.119], 20 | [2.165,2.076,2.119] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999994, 12 | "total_size": 1001159528, 13 | "data_size": 1000084752, 14 | "index_size": 1074709, 15 | "result": [ 16 | [0.010, 0.049, 0.006], 17 | [0.151, 0.107, 0.102], 18 | [0.051, 0.036, 0.037], 19 | [0.079, 0.043, 0.040], 20 | [0.075, 0.046, 0.045] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999968, 12 | "total_size": 9662241335, 13 | "data_size": 9645940572, 14 | "index_size": 16300557, 15 | "result": [ 16 | [0.077, 0.247, 0.027], 17 | [2.322, 0.404, 0.402], 18 | [2.680, 0.214, 0.220], 19 | [0.234, 0.081, 0.079], 20 | [0.513, 0.088, 0.091] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- 
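# Each "result" array in these result files holds the query runtimes in seconds: one row per benchmark query (Q1-Q5), three runs per row. The run_queries.sh scripts drop the Linux file system cache before every query, so the first value of a row is typically a cold run and the remaining two are warm runs.
# A minimal sketch (not a script shipped in this repository) for pulling the fastest run of each query out of a result file, assuming jq is installed:
#
#   jq -r '.result[] | min' results/m6i.8xlarge_bluesky_100m.json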
/greptimedb/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 112555783, 13 | "data_size": 112546830, 14 | "index_size": 7048, 15 | "result": [ 16 | [0.148, 0.012, 0.012], 17 | [0.184, 0.064, 0.064], 18 | [0.116, 0.014, 0.024], 19 | [0.111, 0.016, 0.014], 20 | [0.122, 0.017, 0.034] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse", 3 | "version": "25.11", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-11-15", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999258, 12 | "total_size": 99560268152, 13 | "data_size": 99068986216, 14 | "index_size": 491281201, 15 | "result": [ 16 | [0.492, 0.617, 0.225], 17 | [16.298, 3.241, 3.236], 18 | [31.711, 2.136, 2.174], 19 | [6.985, 0.480, 0.479], 20 | [7.393, 0.518, 0.514] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /greptimedb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999994, 12 | "total_size": 1136875364, 13 | "data_size": 1136814387, 14 | "index_size": 55651, 15 | "result": [ 16 | [0.172, 0.029, 0.028], 17 | [0.46, 0.386, 0.421], 18 | [0.146, 0.044, 0.034], 19 | [0.121, 0.023, 0.023], 20 | [0.156, 0.064, 0.051] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 94408000, 12 | "total_size": 19315617792, 13 | "data_size": 13590446080, 14 | "index_size": 5725171712, 15 | "result": [ 16 | [113.722,108.665,108.909], 17 | [1551.96,1563.52,1542.5], 18 | [141.132,138.307,140.456], 19 | [21.948,21.717,21.424], 20 | [23.103,22.574,22.302] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | "version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 121728495, 13 | "data_size": 108413431, 14 | 
"index_size": 13315064, 15 | "result": [ 16 | [0.074, 0.006, 0.006], 17 | [0.128, 0.069, 0.046], 18 | [0.082, 0.015, 0.015], 19 | [0.094, 0.021, 0.017], 20 | [0.126, 0.052, 0.056] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /greptimedb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999968, 12 | "total_size": 10824010669, 13 | "data_size": 10823634560, 14 | "index_size": 348813, 15 | "result": [ 16 | [0.5, 0.167, 0.164], 17 | [10.711, 3.362, 3.324], 18 | [0.314, 0.225, 0.228], 19 | [0.146, 0.069, 0.068], 20 | [0.308, 0.22, 0.206] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | "version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 9999994, 12 | "total_size": 1242217952, 13 | "data_size": 1108712800, 14 | "index_size": 133505152, 15 | "result": [ 16 | [0.086, 0.007, 0.007], 17 | [0.47, 0.241, 0.329], 18 | [0.095, 0.027, 0.028], 19 | [0.199, 0.142, 0.128], 20 | [0.211, 0.148, 0.176] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 7000000, 12 | "total_size": 5794693120, 13 | "data_size": 4653178880, 14 | "index_size": 1141514240, 15 | "result": [ 16 | [35.1545,0.949281,0.949623], 17 | [51.8718,9.55589,9.57942], 18 | [36.1771,2.60652,2.59737], 19 | [175.424,1.99499,1.93142], 20 | [176.353,2.1341,2.11399] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 731111424, 13 | "data_size": 586768384, 14 | "index_size": 144343040, 15 | "result": [ 16 | [4.08449,0.133099,0.134603], 17 | [30.3611,2.15746,2.11889], 18 | [4.13365,0.35889,0.357372], 19 | [15.6849,0.487806,0.224725], 20 | [16.1242,0.251007,0.249576] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | 
"version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 99999968, 12 | "total_size": 11984344799, 13 | "data_size": 10684518543, 14 | "index_size": 1299826256, 15 | "result": [ 16 | [0.097, 0.009, 0.009], 17 | [9.057, 2.721, 2.406], 18 | [0.474, 0.151, 0.148], 19 | [0.542, 0.464, 0.453], 20 | [0.735, 0.623, 0.664] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB", 3 | "version": "8.0.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 893632990, 12 | "total_size": 176737361920, 13 | "data_size": 131007311872, 14 | "index_size": 45730050048, 15 | "result": [ 16 | [1472.45,1109.15,1054.37], 17 | [20715.1,20484.9,20461.8], 18 | [1218.37,1216.24,1217.69], 19 | [169.069,170.265,168.797], 20 | [174.406,173.932,173.268] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 804000000, 12 | "total_size": 660356890624, 13 | "data_size": 512144687104, 14 | "index_size": 148180910080, 15 | "result": [ 16 | [3904.83,3884.18,3884.17], 17 | [32594.9,32590.5,4277.8], 18 | [4249.8,4253.34,4927.79], 19 | [4903.66,4907.09,4947.12], 20 | [4922.98,4913.5,4928.3] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 91000000, 12 | "total_size": 69064736768, 13 | "data_size": 54598713344, 14 | "index_size": 14465753088, 15 | "result": [ 16 | [416.392,10.3372,10.3301], 17 | [1868.87,1458.26,1457.45], 18 | [440.283,33.0992,33.1468], 19 | [477.82,17.7637,17.3674], 20 | [476.837,18.9056,18.4372] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /greptimedb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "GreptimeDB", 3 | "version": "v0.13.0-nightly-20250315", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-17", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999233, 12 | "total_size": 108808568186, 13 | "data_size": 108803782584, 14 | "index_size": 4656890, 15 | "result": [ 16 | [13.643, 1.568, 1.553], 17 | [93.269, 22.235, 21.625], 18 | 
[2.176, 2.081, 2.086], 19 | [0.507, 0.427, 0.439], 20 | [1.694, 1.614, 1.606] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /victorialogs/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "VictoriaLogs", 3 | "version": "v1.17.0-victorialogs", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-28", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "no", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 999999241, 12 | "total_size": 121860870843, 13 | "data_size": 108388327979, 14 | "index_size": 13472542864, 15 | "result": [ 16 | [1.076, 0.029, 0.028], 17 | [98.958, 20.591, 19.794], 18 | [14.423, 1.607, 1.609], 19 | [5.065, 5.185, 5.216], 20 | [7.22, 6.963, 7.351] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /duckdb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | duckdb ~/$DATABASE_NAME -c "$query" 21 | 22 | # Increment the query number 23 | QUERY_NUM=$((QUERY_NUM + 1)) 24 | done; 25 | -------------------------------------------------------------------------------- /postgresql/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | sudo -u postgres psql -d "$DB_NAME" -c "$query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /singlestore/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 4 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | RESULT_FILE_RUNTIMES="$3" 13 | RESULT_FILE_MEMORY_USAGE="$4" 14 | 15 | # Construct the query log file name using $DB_NAME 16 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 17 | 18 | # Print the database name 19 | echo "Running queries on database: $DB_NAME" 20 | 21 | # Run queries and log the output 22 | ./run_queries.sh "$ROOT_PASSWORD" "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 23 | -------------------------------------------------------------------------------- /postgresql/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: 
$0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | 15 | cat queries.sql | while read -r query; do 16 | 17 | # Print the query number 18 | echo "------------------------------------------------------------------------------------------------------------------------" 19 | echo "Index usage for query Q$QUERY_NUM:" 20 | echo 21 | 22 | sudo -u postgres psql -d "$DB_NAME" -t -c "EXPLAIN $query" 23 | 24 | # Increment the query number 25 | QUERY_NUM=$((QUERY_NUM + 1)) 26 | 27 | done; -------------------------------------------------------------------------------- /duckdb/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | 21 | duckdb ~/$DATABASE_NAME -c "EXPLAIN $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /clickhouse/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Index usage for query Q$QUERY_NUM:" 19 | echo 20 | 21 | ./clickhouse client --database="$DB_NAME" --query="EXPLAIN indexes=1 $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /clickhouse/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | 21 | ./clickhouse client --database="$DB_NAME" --query="EXPLAIN PIPELINE $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /clickhouse/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo 
"------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | ./clickhouse client --database="$DB_NAME" --format=PrettyCompactMonoBlock --query="$query" --progress 0 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 26 | -------------------------------------------------------------------------------- /mongodb/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.storageSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /mongodb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 
23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /starrocks/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "EXPLAIN $query" 21 | 22 | # Increment the query number 23 | QUERY_NUM=$((QUERY_NUM + 1)) 24 | done; 25 | -------------------------------------------------------------------------------- /mongodb/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalIndexSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /elasticsearch/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | echo $(curl -s -k -X GET "https://localhost:9200/${INDEX_NAME}/_count" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' | jq '.count') 19 | -------------------------------------------------------------------------------- /elasticsearch/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! 
-f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Get data size 19 | curl -k -XGET "https://localhost:9200/_data_stream/${INDEX_NAME}/_stats?human" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -------------------------------------------------------------------------------- /postgresql/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | sudo -u postgres psql -d "$DB_NAME" -t -c '\timing' -c "$query" | grep 'Time' 28 | done; 29 | done; -------------------------------------------------------------------------------- /singlestore/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | 13 | export MYSQL_PWD=${ROOT_PASSWORD} 14 | 15 | QUERY_NUM=1 16 | 17 | cat queries.sql | while read -r query; do 18 | 19 | # Print the query 20 | echo "------------------------------------------------------------------------------------------------------------------------" 21 | echo "Result for query Q$QUERY_NUM:" 22 | echo 23 | 24 | mysql -h 127.0.0.1 -P 3306 -u root -D $DB_NAME -e "$query" 25 | 26 | # Increment the query number 27 | QUERY_NUM=$((QUERY_NUM + 1)) 28 | done; 29 | -------------------------------------------------------------------------------- /singlestore/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | LICENSE_KEY="$1" 11 | ROOT_PASSWORD="$2" 12 | 13 | sudo snap install docker 14 | sudo apt-get update 15 | sudo apt-get install -y mysql-client 16 | 17 | docker run -i --init \ 18 | --name singlestore-ciab \ 19 | -e LICENSE_KEY="${LICENSE_KEY}" \ 20 | -e ROOT_PASSWORD="${ROOT_PASSWORD}" \ 21 | -p 3306:3306 -p 8080:8080 \ 22 | singlestore/cluster-in-a-box 23 | 24 | docker start singlestore-ciab 25 | 26 | while true 27 | do 28 | mysql -h 127.0.0.1 -P 3306 -u root --password="${ROOT_PASSWORD}" -e 'SELECT 1' && break 29 | sleep 1 30 | done 31 | -------------------------------------------------------------------------------- /starrocks/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo snap install docker 4 | sudo apt-get update 5 | sudo apt-get install -y mysql-client 6 | 7 | docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu:4.0.1 8 | 9 | echo "Starting StarRocks container..." 
10 | sleep 5 11 | 12 | # Monitor logs until "Enjoy" appears 13 | echo "Monitoring container logs for 'Enjoy' message..." 14 | timeout 300 docker logs -f starrocks | while read line; do 15 | echo "$line" 16 | if echo "$line" | grep -q "Enjoy"; then 17 | echo "Found 'Enjoy' message! Container is ready." 18 | # Kill the docker logs process 19 | pkill -f "docker logs -f starrocks" 20 | break 21 | fi 22 | done 23 | 24 | echo "StarRocks started successfully." 25 | -------------------------------------------------------------------------------- /victorialogs/queries.logsql: -------------------------------------------------------------------------------- 1 | * | by (commit.collection) count() count | sort (count desc) 2 | {kind=commit,commit.operation=create} | by (commit.collection) count() count, count_uniq(did) users | sort (count desc) 3 | {kind=commit,commit.operation=create,commit.collection=~"app\\.bsky\\.feed\\.(post|repost|like)"} | math floor(_time/1h)%24 hour_of_day | by (commit.collection, hour_of_day) count() count | sort (hour_of_day, commit.collection) 4 | {kind=commit,commit.operation=create,commit.collection=app.bsky.feed.post} | by (did) min(_time) first_post_ts | first 3 (first_post_ts) 5 | {kind=commit,commit.operation=create,commit.collection=app.bsky.feed.post} | by (did) min(_time) tmin, max(_time) tmax | math round((tmax-tmin)/1e6) activity_span | keep did, activity_span | first 3 (activity_span desc) 6 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "num_loaded_documents": 7000000, 12 | "total_size": 1943790266, 13 | "data_size": 1943790266, 14 | "index_size": 0, 15 | "result": [ 16 | [0.658, 0.444, 0.494], 17 | [2.831, 2.647, 2.836], 18 | [1.437, 1.456, 1.124], 19 | [1.253, 0.412, 0.434], 20 | [1.347, 0.553, 0.587] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "num_loaded_documents": 1000000, 12 | "total_size": 275466582, 13 | "data_size": 275466582, 14 | "index_size": 0, 15 | "result": [ 16 | [0.206, 0.083, 0.098], 17 | [0.615, 0.544, 0.452], 18 | [0.366, 0.268, 0.149], 19 | [0.259, 0.111, 0.106], 20 | [0.324, 0.167, 0.171] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /clickhouse/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | `data` JSON( 4 | max_dynamic_paths = 0, 5 | kind LowCardinality(String), 6 | 
commit.operation LowCardinality(String), 7 | commit.collection LowCardinality(String), 8 | did String, 9 | time_us UInt64) CODEC(ZSTD(1)) 10 | ) 11 | ORDER BY ( 12 | data.kind, 13 | data.commit.operation, 14 | data.commit.collection, 15 | data.did, 16 | fromUnixTimestamp64Micro(data.time_us)) 17 | -- Below settings are planned to be default soon 18 | SETTINGS object_serialization_version = 'v3', 19 | dynamic_serialization_version = 'v3', 20 | object_shared_data_serialization_version = 'advanced', 21 | object_shared_data_serialization_version_for_zero_level_parts='map_with_buckets' 22 | -------------------------------------------------------------------------------- /clickhouse/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | ./clickhouse client --database="$DB_NAME" --time --memory-usage --format=Null --query="$query" --progress 0 28 | done; 29 | done; 30 | -------------------------------------------------------------------------------- /singlestore/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | 13 | export MYSQL_PWD=${ROOT_PASSWORD} 14 | 15 | QUERY_NUM=1 16 | 17 | cat queries.sql | while read -r query; do 18 | 19 | # Print the query number 20 | echo "------------------------------------------------------------------------------------------------------------------------" 21 | echo "Physical query plan for query Q$QUERY_NUM:" 22 | echo 23 | 24 | mysql -h 127.0.0.1 -P 3306 -u root $DB_NAME -e "EXPLAIN $query" 25 | 26 | # Increment the query number 27 | QUERY_NUM=$((QUERY_NUM + 1)) 28 | done; 29 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "num_loaded_documents": 91000000, 12 | "total_size": 25757172429, 13 | "data_size": 25757172429, 14 | "index_size": 0, 15 | "result": [ 16 | [6.471, 5.066, 5.606], 17 | [37.551, 25.246, 25.060], 18 | [14.634, 12.810, 12.786], 19 | [34.688, 3.691, 3.702], 20 | [34.735, 4.934, 4.713] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /doris/ddl.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | kind 
VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.kind')) NOT NULL, 3 | operation VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.commit.operation')) NULL, 4 | collection VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.commit.collection')) NULL, 5 | did VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data,'$.did')) NOT NULL, 6 | time DATETIME GENERATED ALWAYS AS (from_microsecond(get_json_bigint(data, '$.time_us'))) NOT NULL, 7 | `data` variant<'kind': string, 'commit.operation' : string, 'commit.collection' : string, 'did' : string, 'time_us' : bigint, properties("variant_max_subcolumns_count" = "1024")> NOT NULL 8 | ) 9 | DUPLICATE KEY (kind, operation, collection, did) 10 | PROPERTIES ("replication_num"="1"); 11 | -------------------------------------------------------------------------------- /greptimedb/pipeline.yaml: -------------------------------------------------------------------------------- 1 | processors: 2 | - epoch: 3 | fields: 4 | - time_us 5 | resolution: microsecond 6 | - simple_extract: 7 | fields: 8 | - commit, commit_collection 9 | key: "collection" 10 | ignore_missing: true 11 | - simple_extract: 12 | fields: 13 | - commit, commit_operation 14 | key: "operation" 15 | ignore_missing: true 16 | 17 | transform: 18 | - fields: 19 | - did 20 | type: string 21 | - fields: 22 | - kind 23 | - commit_collection 24 | - commit_operation 25 | type: string 26 | index: inverted 27 | tag: true 28 | - fields: 29 | - commit 30 | type: json 31 | on_failure: ignore 32 | - fields: 33 | - time_us 34 | type: epoch, us 35 | index: timestamp 36 | -------------------------------------------------------------------------------- /singlestore/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "SingleStore", 3 | "version": "5.7.32", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-03-14", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "retains_structure": "yes", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "num_loaded_documents": 811999990, 12 | "total_size": 234878938296, 13 | "data_size": 234878938296, 14 | "index_size": 0, 15 | "result": [ 16 | [51.599, 43.98, 51.557], 17 | [321.517, 207.62, 196.39], 18 | [125.113, 113.956, 111.105], 19 | [322.197, 32.433, 32.407], 20 | [326.151, 40.692, 40.644] 21 | ], 22 | "memory_usage": [ 23 | [null, null, null], 24 | [null, null, null], 25 | [null, null, null], 26 | [null, null, null], 27 | [null, null, null] 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /elasticsearch/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install elasticsearch 4 | wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor --yes -o /usr/share/keyrings/elasticsearch-keyring.gpg 5 | sudo apt-get install apt-transport-https 6 | echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list 7 | sudo apt-get update && sudo apt-get install elasticsearch 8 | 9 | # Install filebeat 10 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-8.17.0-amd64.deb 11 | sudo dpkg -i filebeat-8.17.0-amd64.deb 12 | 13 | # Overwrite configuration files 14 | sudo cp config/elasticsearch.yml /etc/elasticsearch/elasticsearch.yml 15 | sudo cp config/jvm.options /etc/elasticsearch/jvm.options 
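# Optional sanity check (a sketch only, not part of the benchmark flow): once start.sh has started the service and
# exported ELASTIC_PASSWORD, the cluster info endpoint should respond over the same self-signed HTTPS listener that
# the other scripts in this directory use:
#
#   curl -s -k -u "elastic:${ELASTIC_PASSWORD}" https://localhost:9200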
16 | -------------------------------------------------------------------------------- /.github/workflows/generate-results.yml: -------------------------------------------------------------------------------- 1 | name: "Generate index.html" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | env: 14 | CI_COMMIT_MESSAGE: "[bot] update index.html" 15 | CI_COMMIT_AUTHOR: github 16 | steps: 17 | - uses: actions/checkout@v3 18 | - if: github.event.commits[0].message != env.CI_COMMIT_MESSAGE 19 | run: | 20 | bash generate-results.sh 21 | 22 | git config --global user.name "${{ env.CI_COMMIT_AUTHOR }}" 23 | git config --global user.email "${{ env.CI_COMMIT_AUTHOR }}@users.noreply.github.com" 24 | 25 | git add -A 26 | if git status | grep -q modified 27 | then 28 | git commit -m "${{ env.CI_COMMIT_MESSAGE }}" 29 | git push 30 | fi 31 | -------------------------------------------------------------------------------- /greptimedb/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Do we run already? 4 | pidof greptime >/dev/null && exit 1 5 | 6 | BASEDIR=greptimedb_data 7 | 8 | echo "Starting GreptimeDB" 9 | export GREPTIMEDB_STANDALONE__WAL__DIR="${BASEDIR}/wal" 10 | export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME="${BASEDIR}" 11 | export GREPTIMEDB_STANDALONE__LOGGING__DIR="${BASEDIR}/logs" 12 | export GREPTIMEDB_STANDALONE__LOGGING__APPEND_STDOUT=false 13 | export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB 14 | export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s 15 | ./greptime standalone start & 16 | 17 | while true 18 | do 19 | curl -s --fail -o /dev/null http://localhost:4000/health && break 20 | sleep 1 21 | done 22 | echo "Started GreptimeDB." 23 | 24 | # init pipeline 25 | curl -s -XPOST 'http://localhost:4000/v1/events/pipelines/jsonbench' -F 'file=@pipeline.yaml' 26 | echo -e "\nPipeline initialized." 27 | -------------------------------------------------------------------------------- /_files_gz/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define prefix for output files 10 | OUTPUT_PREFIX="${2:-_files_gz}" 11 | 12 | # Check if the data directory exists 13 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 14 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
15 | exit 1 16 | fi 17 | 18 | 19 | # 1m 20 | ./total_size.sh "$DATA_DIRECTORY" 1 | tee "${OUTPUT_PREFIX}_1m.total_size" 21 | 22 | # 10m 23 | ./total_size.sh "$DATA_DIRECTORY" 10 | tee "${OUTPUT_PREFIX}_10m.total_size" 24 | 25 | # 100m 26 | ./total_size.sh "$DATA_DIRECTORY" 100 | tee "${OUTPUT_PREFIX}_100m.total_size" 27 | 28 | # 1000m 29 | ./total_size.sh "$DATA_DIRECTORY" 1000 | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /mongodb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the document count using mongosh 14 | document_count=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const count = db.getCollection('$COLLECTION_NAME').stats().count 17 | print(count); 18 | ") 19 | 20 | # Debugging information 21 | echo "Database: $DATABASE_NAME" 22 | echo "Collection: $COLLECTION_NAME" 23 | echo "Document count: $document_count" 24 | 25 | # Print the result 26 | if [[ -z "$document_count" ]]; then 27 | echo "Error: Unable to fetch document count. Ensure the database and collection exist." 28 | exit 1 29 | else 30 | echo $document_count 31 | fi -------------------------------------------------------------------------------- /duckdb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | DUCKDB_CMD="duckdb $HOME/$DB_NAME" # tilda somehow doesn't work 13 | 14 | TRIES=3 15 | 16 | LOG_FILE="query_results.log" 17 | > "$LOG_FILE" 18 | 19 | cat queries.sql | while read -r query; do 20 | # Clear filesystem cache between queries. 21 | sync 22 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 23 | 24 | echo "Running query: $query" 25 | for i in $(seq 1 $TRIES); do 26 | # Run query with timer enabled and extract the real time. 
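# The query is fed to the DuckDB CLI through a here-document so that ".timer on" and the query run in one session;
# the CLI output, including the timing line that contains the "real" value, is appended to $LOG_FILE, and the most
# recent "real" value is then read back from the log (tac reverses the file so grep -m 1 finds the last occurrence first).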
27 | OUTPUT=$($DUCKDB_CMD <<EOF >> "$LOG_FILE" 28 | .timer on 29 | $query 30 | EOF 31 | ) 32 | REAL_TIME=$(tac "$LOG_FILE" | grep -m 1 -oP 'real\s+\K[\d.]+') 33 | echo "Real time: $REAL_TIME seconds" 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /elasticsearch/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <INDEX_NAME> [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | INDEX_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the index name 14 | echo "Running queries on index: $INDEX_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$INDEX_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi -------------------------------------------------------------------------------- /greptimedb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TRIES=3 4 | 5 | set -f 6 | cat queries.sql | while read -r query; do 7 | # Clear the Linux file system cache 8 | echo "Clearing file system cache..." 9 | sync 10 | echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 11 | 12 | # Print the query 13 | echo "Running query: $query" 14 | 15 | # Execute the query multiple times 16 | echo -n "[" 17 | for i in $(seq 1 $TRIES); do 18 | t_start=$(date +%s%3N) 19 | curl -s --fail http://localhost:4000/v1/sql --data-urlencode "sql=$query" > /dev/null 20 | exit_code=$? 21 | t_end=$(date +%s%3N) 22 | duration=$((t_end-t_start)) 23 | RES=$(awk "BEGIN {print $duration / 1000}" | tr ',' '.') 24 | [[ "$exit_code" == "0" ]] && echo -n "${RES}" || echo -n "null" 25 | [[ "$i" != $TRIES ]] && echo -n ", " 26 | done 27 | echo "]" 28 | 29 | done -------------------------------------------------------------------------------- /mongodb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 <DB_NAME> <COLLECTION_NAME> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | DATA_DIRECTORY="$3" 13 | NUM_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate arguments 18 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 19 | [[ !
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 20 | 21 | # Create database and execute DDL file 22 | mongosh --quiet --eval " 23 | db = db.getSiblingDB('$DB_NAME'); 24 | load('$DDL_FILE'); 25 | " 26 | 27 | echo "Loading data" 28 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$COLLECTION_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 29 | -------------------------------------------------------------------------------- /duckdb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the database name 14 | echo "Running queries on database: $DATABASE_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$DATABASE_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Real time: \d+\.\d+ seconds' | sed -r -e 's/Real time: ([0-9]+\.[0-9]+) seconds/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi 30 | -------------------------------------------------------------------------------- /victorialogs/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TRIES=3 4 | 5 | set -f 6 | cat queries.logsql | while read -r query; do 7 | 8 | # Clear the Linux file system cache 9 | echo "Clearing file system cache..." 10 | sync 11 | echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null 12 | 13 | # Print the query 14 | echo "Running query: $query" 15 | 16 | # Execute the query multiple times 17 | echo -n "[" 18 | for i in $(seq 1 $TRIES); do 19 | t_start=$(date +%s%3N) 20 | curl -s --fail http://localhost:9428/select/logsql/query --data-urlencode "query=$query" > /dev/null 21 | exit_code=$? 22 | t_end=$(date +%s%3N) 23 | duration=$((t_end-t_start)) 24 | RES=$(awk "BEGIN {print $duration / 1000}" | tr ',' '.') 25 | [[ "$exit_code" == "0" ]] && echo -n "${RES}" || echo -n "null" 26 | [[ "$i" != $TRIES ]] && echo -n ", " 27 | done 28 | echo "]" 29 | 30 | done 31 | -------------------------------------------------------------------------------- /duckdb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | echo "Create database and execute DDL" 24 | duckdb ~/$DB_NAME < "$DDL_FILE" 25 | 26 | echo "Load data" 27 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 28 | -------------------------------------------------------------------------------- /doris/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT collection AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT collection AS event, COUNT(*) AS count, COUNT(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT collection AS event, HOUR(time) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT did AS user_id, MIN(time) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT did AS user_id, MILLISECONDS_DIFF(MAX(time),MIN(time)) AS activity_span FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /_files_gz/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | N="$2" 12 | 13 | # Validate the data directory 14 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 15 | echo "Error: Directory '$DATA_DIRECTORY' does not exist." 16 | exit 1 17 | fi 18 | 19 | # Validate N is a positive integer 20 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 21 | echo "Error: N must be a positive integer." 
22 | exit 1 23 | fi 24 | 25 | # Get the first N files sorted by filename and calculate their total size 26 | TOTAL_SIZE=$(ls -1 "$DATA_DIRECTORY" | sort | head -n "$N" | while read -r file; do 27 | filepath="$DATA_DIRECTORY/$file" 28 | if [[ -f "$filepath" ]]; then 29 | stat --format="%s" "$filepath" 30 | fi 31 | done | awk '{sum += $1} END {print sum}') 32 | 33 | # Output the total size in bytes 34 | echo $TOTAL_SIZE -------------------------------------------------------------------------------- /elasticsearch/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting ElasticSearch" 4 | sudo systemctl start elasticsearch.service 5 | 6 | echo "Resetting and export ElasticSearch password" 7 | export ELASTIC_PASSWORD=$(sudo /usr/share/elasticsearch/bin/elasticsearch-reset-password -s -a -b -u elastic) 8 | 9 | echo "Saving ElasticSearch password in local file" 10 | echo "ELASTIC_PASSWORD=$ELASTIC_PASSWORD" > .elastic_password 11 | 12 | echo "Generating API key for filebeat" 13 | curl -s -k -X POST "https://localhost:9200/_security/api_key" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d ' 14 | { 15 | "name": "filebeat", 16 | "role_descriptors": { 17 | "filebeat_writer": { 18 | "cluster": ["monitor", "read_ilm", "read_pipeline"], 19 | "index": [ 20 | { 21 | "names": ["bluesky-*"], 22 | "privileges": ["view_index_metadata", "create_doc", "auto_configure"] 23 | } 24 | ] 25 | } 26 | } 27 | }' | jq -r '"\(.id):\(.api_key)"' > .filebeat_api_key 28 | -------------------------------------------------------------------------------- /mongodb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Execution time: \d+ms' | sed -r 's/Execution time: ([0-9]+)/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /postgresql/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 
'Time: \d+\.\d+ ms' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /doris/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DATA_DIRECTORY="$3" 13 | NUM_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate arguments 18 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 19 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 20 | 21 | 22 | echo "Create database" 23 | mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" 24 | 25 | echo "Execute DDL" 26 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "ddl.sql" 27 | 28 | echo "Load data" 29 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 30 | 31 | echo "Sleep 120 sec to collect data size" 32 | sleep 120s 33 | -------------------------------------------------------------------------------- /starrocks/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/run_queries.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 1 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | 14 | TRIES=3 15 | 16 | cat queries.sql | while read -r query; do 17 | 18 | # Clear the Linux file system cache 19 | echo "Clearing file system cache..." 20 | sync 21 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 22 | echo "File system cache cleared." 23 | 24 | # Print the query 25 | echo "Running query: $query" 26 | 27 | # Execute the query multiple times 28 | for i in $(seq 1 $TRIES); do 29 | RESP=$({ /usr/bin/time -f '%e' \ 30 | mysql --skip-auto-rehash --batch --silent -h "$DB_HOST" -P "$DB_MYSQL_PORT" -u"$DB_USER" "$DB_NAME" \ 31 | -e "$query" >/dev/null; } 2>&1) 32 | echo "Response time: ${RESP} s" 33 | done; 34 | done; 35 | -------------------------------------------------------------------------------- /clickhouse/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DATA_DIRECTORY="$3" 13 | NUM_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate arguments 18 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 19 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 20 | 21 | 22 | echo "Creating database $DB_NAME" 23 | ./clickhouse client --query "CREATE DATABASE IF NOT EXISTS $DB_NAME" 24 | 25 | echo "Executing DDL for database $DB_NAME" 26 | ./clickhouse client --database="$DB_NAME" --enable_json_type=1 --multiquery < ddl.sql 27 | 28 | echo "Loading data for database $DB_NAME" 29 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 30 | -------------------------------------------------------------------------------- /elasticsearch/drop_tables.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Stopping ElasticSearch" 4 | sudo systemctl stop elasticsearch.service 5 | 6 | # My amateurish attempt to delete data from Elasticsearch led me to 7 | # - https://stackoverflow.com/questions/22924300/removing-data-from-elasticsearch 8 | # - https://stackoverflow.com/questions/23917327/delete-all-documents-from-index-without-deleting-index 9 | # but none of that worked for me so I gave up after debugging this mess for 90 minutes. 10 | 11 | # Let's try it the old-fashioned way. 12 | 13 | # echo "Nuking ElasticSearch directories" 14 | # sudo rm -rf /var/lib/elasticsearch/* 15 | # sudo rm -rf /var/log/elasticsearch/* 16 | 17 | # ^^ Haha. Fails silently, please `sudo su` and run above `rm` statements by hand. But don't delete the elasticsearch/ folders themselves, 18 | # otherwise elasticsearch will refuse to start and you will need to re-install it via apt. What a shameful disaster. If someone knows how to 19 | # perform the extremely simple task of deleting data from Elasticsearch, please send a pull request. 20 | -------------------------------------------------------------------------------- /greptimedb/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT commit_collection AS event, count(1) AS cnt FROM bluesky GROUP BY event ORDER BY cnt DESC; 2 | SELECT commit_collection AS event, count(1) AS cnt, count(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' GROUP BY event ORDER BY cnt DESC; 3 | SELECT commit_collection AS event, date_part('hour', time_us) AS hour_of_day, count(1) AS cnt FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT did AS user_id, min(time_us) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT did AS user_id, date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /elasticsearch/queries.txt: -------------------------------------------------------------------------------- 1 | FROM ${INDEX_NAME} | STATS count = COUNT() BY commit.collection | SORT count DESC 2 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" | STATS users = COUNT_DISTINCT(did, 40000), count = COUNT() BY commit.collection | SORT count DESC 3 | FROM ${INDEX_NAME} | WHERE 
kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection IN (\\\"app.bsky.feed.post\\\", \\\"app.bsky.feed.repost\\\", \\\"app.bsky.feed.like\\\") | STATS count = COUNT() BY commit.collection, DATE_EXTRACT(\\\"hour_of_day\\\", time_us) | SORT count, commit.collection 4 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS first_post_ts = MIN(time_us) BY did | SORT first_post_ts ASC | LIMIT 3 5 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS activity_span = date_diff(\\\"millisecond\\\",min(time_us), max(time_us)) BY did | SORT activity_span DESC | LIMIT 3 -------------------------------------------------------------------------------- /generate-results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # This script will substitute the benchmark results into the HTML page. 4 | # Note: editing HTML with sed may look strange, but at least we avoid using node.js and npm, and that's good. 5 | 6 | # This is needed on Mac OS. Do `brew install coreutils`. 7 | if [[ "$(uname)" == "Darwin" ]]; then 8 | if ! command -v gsed >/dev/null 2>&1 9 | then 10 | echo "On macOS, please install GNU sed through homebrew." 11 | exit 1 12 | else 13 | shopt -s expand_aliases 14 | alias sed='gsed' 15 | fi 16 | fi 17 | 18 | ( 19 | sed '/^const data = \[$/q' index.html 20 | 21 | FIRST=1 22 | LANG="" ls -1 */results*/*.json | while read -r file 23 | do 24 | [ "${FIRST}" = "0" ] && echo -n ',' 25 | jq --compact-output ". += {\"source\": \"${file}\"}" "${file}" || echo "Error in $file" >&2 26 | FIRST=0 27 | done 28 | 29 | echo ']; // end of data' 30 | sed '0,/^\]; \/\/ end of data$/d' index.html 31 | 32 | ) > index.html.new 33 | 34 | mv index.html index.html.bak 35 | mv index.html.new index.html 36 | -------------------------------------------------------------------------------- /singlestore/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | 13 | export MYSQL_PWD=${ROOT_PASSWORD} 14 | 15 | TRIES=3 16 | 17 | cat queries.sql | while read -r query; do 18 | 19 | # Clear the Linux file system cache 20 | echo "Clearing file system cache..." 21 | sync 22 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 23 | echo "File system cache cleared." 24 | 25 | # Print the query 26 | echo "Running query: $query" 27 | 28 | # Execute the query multiple times 29 | for i in $(seq 1 $TRIES); do 30 | time mysql -h 127.0.0.1 -P 3306 -u root -D $DB_NAME -e "$query" 31 | done; 32 | done; 33 | 34 | # The runtime measured by `time` is manually copied into the result .json file. 35 | # I couldn't find a way to figure out the per-query memory consumption, these are marked as "null" in the result .json files. Feel free to 36 | # re-produce and add memory consumption measurements! 37 | -------------------------------------------------------------------------------- /starrocks/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/create_and_load.sh. 
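# Illustrative usage sketch (argument values are assumptions, not benchmark defaults):
#   ./create_and_load.sh bluesky bluesky ~/data/bluesky 10 success.log error.log
# DB_HOST, DB_MYSQL_PORT and DB_USER are read from the environment further below and are
# assumed to be exported by the calling script.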
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 6 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | TABLE_NAME="$2" 14 | DATA_DIRECTORY="$3" 15 | NUM_FILES="$4" 16 | SUCCESS_LOG="$5" 17 | ERROR_LOG="$6" 18 | DDL_FILE="ddl.sql" 19 | 20 | # Validate arguments 21 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 22 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 23 | 24 | 25 | echo "Create database" 26 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" 27 | 28 | echo "Execute DDL" 29 | mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" < "$DDL_FILE" 30 | 31 | echo "Load data" 32 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 33 | -------------------------------------------------------------------------------- /postgresql/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | echo "Create database" 24 | sudo -u postgres psql -t -c "CREATE DATABASE $DB_NAME" 25 | 26 | echo "Execute DDL" 27 | sudo -u postgres psql "$DB_NAME" -t < "$DDL_FILE" 28 | 29 | echo "Load data" 30 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 31 | 32 | echo "Vacuum analyze the table" 33 | sudo -u postgres psql "$DB_NAME" -t -c "VACUUM ANALYZE $TABLE_NAME" 34 | -------------------------------------------------------------------------------- /doris/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in starrocks/benchmark.sh. 
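# Illustrative usage sketch (file names are assumptions):
#   ./benchmark.sh bluesky result_runtimes.txt queries.sql
# run_queries.sh executes each query three times, and the grep/sed/awk pipeline below folds
# every group of three "Response time" values into one bracketed triple per query, e.g.
#   [0.52,0.41,0.40],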
4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 3 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | RESULT_FILE_RUNTIMES="$2" 14 | QUERIES_FILE="$3" 15 | 16 | # Construct the query log file name using $DB_NAME 17 | QUERY_LOG_FILE="query_log.txt" 18 | 19 | # Print the database name 20 | echo "Running queries on database: $DB_NAME" 21 | 22 | # Run queries and log the output 23 | ./run_queries.sh "$DB_NAME" "$QUERIES_FILE" 2>&1 | tee query_log.txt 24 | 25 | # Process the query log and prepare the result 26 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 27 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 28 | 29 | # Output the result 30 | if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then 31 | echo "$RESULT" > "$RESULT_FILE_RUNTIMES" 32 | echo "Result written to $RESULT_FILE_RUNTIMES" 33 | else 34 | echo "$RESULT" 35 | fi 36 | -------------------------------------------------------------------------------- /starrocks/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/benchmark.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 3 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | RESULT_FILE_RUNTIMES="$2" 14 | RESULT_FILE_MEMORY_USAGE="$3" 15 | 16 | # Construct the query log file name using $DB_NAME 17 | QUERY_LOG_FILE="query_log.txt" 18 | 19 | # Print the database name 20 | echo "Running queries on database: $DB_NAME" 21 | 22 | # Run queries and log the output 23 | ./run_queries.sh "$DB_NAME" 2>&1 | tee query_log.txt 24 | 25 | # Process the query log and prepare the result 26 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 27 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 28 | 29 | # Output the result 30 | if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then 31 | echo "$RESULT" > "$RESULT_FILE_RUNTIMES" 32 | echo "Result written to $RESULT_FILE_RUNTIMES" 33 | else 34 | echo "$RESULT" 35 | fi 36 | -------------------------------------------------------------------------------- /doris/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/run_queries.sh. 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 2 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | QUERIES_FILE="$2" 14 | 15 | TRIES=3 16 | 17 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "set global parallel_pipeline_task_num=32;" 18 | mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "set global enable_parallel_scan=false;" 19 | 20 | cat $QUERIES_FILE | while read -r query; do 21 | 22 | # Clear the Linux file system cache 23 | echo "Clearing file system cache..." 24 | sync 25 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 26 | echo "File system cache cleared." 
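# The page cache is dropped once per query, before the retry loop, so the first of the
# $TRIES runs measures a cold file system cache and the remaining runs benefit from warm caches.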
27 | 28 | # Print the query 29 | echo "Running query: $query" 30 | 31 | # Execute the query multiple times 32 | for i in $(seq 1 $TRIES); do 33 | RESP=$(mysql -vvv -h127.1 -P9030 -uroot "$DB_NAME" -e "$query" | perl -nle 'if (/\((?:(\d+) min )?(\d+\.\d+) sec\)/) { $t = ($1 || 0) * 60 + $2; printf "%.2f\n", $t }' ||:) 34 | echo "Response time: ${RESP} s" 35 | done; 36 | done; 37 | -------------------------------------------------------------------------------- /mongodb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | # File containing MongoDB queries (replace 'queries.js' with your file) 15 | QUERY_FILE="queries.js" 16 | 17 | # Check if the query file exists 18 | if [[ ! -f "$QUERY_FILE" ]]; then 19 | echo "Error: Query file '$QUERY_FILE' does not exist." 20 | exit 1 21 | fi 22 | 23 | # Read and execute each query 24 | cat "$QUERY_FILE" | while read -r query; do 25 | 26 | # Print the query 27 | echo "------------------------------------------------------------------------------------------------------------------------" 28 | echo "Result for query Q$QUERY_NUM:" 29 | echo 30 | 31 | # Escape the query for safe passing to mongosh 32 | ESCAPED_QUERY=$(echo "$query" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 33 | 34 | mongosh --eval " 35 | const db = db.getSiblingDB('$DB_NAME'); 36 | const result = eval(\"$ESCAPED_QUERY\"); 37 | printjson(result); 38 | " 39 | 40 | 41 | # Increment the query number 42 | QUERY_NUM=$((QUERY_NUM + 1)) 43 | 44 | done -------------------------------------------------------------------------------- /singlestore/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 8 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | ROOT_PASSWORD="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | DDL_FILE="$4" 14 | DATA_DIRECTORY="$5" 15 | NUM_FILES="$6" 16 | SUCCESS_LOG="$7" 17 | ERROR_LOG="$8" 18 | 19 | # Validate arguments 20 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 21 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 22 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 23 | 24 | export MYSQL_PWD=${ROOT_PASSWORD} 25 | 26 | echo "Creating database $DB_NAME" 27 | mysql -h 127.0.0.1 -P 3306 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME" 28 | 29 | echo "Executing DDL for database $DB_NAME" 30 | mysql -h 127.0.0.1 -P 3306 -u root $DB_NAME < "$DDL_FILE" 31 | 32 | echo "Loading data for database $DB_NAME" 33 | ./load_data.sh "$ROOT_PASSWORD" "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 34 | -------------------------------------------------------------------------------- /clickhouse/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data.commit.collection AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data.commit.collection AS event, count() AS count, uniqExact(data.did) AS users FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data.commit.collection AS event, toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, count() AS count FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data.did::String as user_id, min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data.did::String as user_id, date_diff( 'milliseconds', min(fromUnixTimestamp64Micro(data.time_us)), max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /_files_json/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Get the sorted list of .json.gz files and extract the first N 33 | count=0 34 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 35 | if [[ $count -ge $N ]]; then 36 | break 37 | fi 38 | 39 | echo "Processing $file..." 40 | gzip -dkc "$file" > "$TARGET_DIRECTORY/$(basename "${file%.gz}")" # Extract to target directory 41 | count=$((count + 1)) 42 | done 43 | 44 | echo "Extraction of $count files completed. Extracted files are in '$TARGET_DIRECTORY'." 
-------------------------------------------------------------------------------- /clickhouse/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE_RUNTIMES="$2" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RUNTIME_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 1' | awk '{ 24 | if (NR % 3 == 1) { printf "["; } 25 | printf $1; 26 | if (NR % 3 == 0) { 27 | print "],"; 28 | } else { 29 | printf ", "; 30 | } 31 | }') 32 | 33 | MEMORY_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 0' | awk '{ 34 | if (NR % 3 == 1) { printf "["; } 35 | printf $1; 36 | if (NR % 3 == 0) { 37 | print "],"; 38 | } else { 39 | printf ", "; 40 | } 41 | }') 42 | 43 | # Output the runtime results 44 | echo "$RUNTIME_RESULTS" > "$RESULT_FILE_RUNTIMES" 45 | echo "Runtime results written to $RESULT_FILE_RUNTIMES" 46 | -------------------------------------------------------------------------------- /singlestore/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data::commit::collection AS event, count(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data::commit::collection AS event, count(*) AS count, count(distinct data::did) FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data::commit::collection AS event, hour(from_unixtime(data::time_us/1000000)) AS hour_of_day, count(*) AS count FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' AND data::commit::$collection IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data::$did AS user_id, min(from_unixtime(data::time_us/1000000)) AS first_post_ts FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' AND data::commit::$collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data::$did AS user_id, timestampdiff(microsecond, min(from_unixtime(data::time_us/1000000)), max(from_unixtime(data::time_us/1000000))) as activity_span FROM bluesky WHERE data::$kind = 'commit' AND data::commit::$operation = 'create' AND data::commit::$collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3 6 | -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Select the dataset size to download:" 4 | echo "1) 1m (default)" 5 | echo "2) 10m" 6 | echo "3) 100m" 7 | echo "4) 1000m" 8 | read -p "Enter the number corresponding to your choice: " choice 9 | 10 | case $choice in 11 | 2) 12 | # Download 10m dataset: files 0001 to 0010 13 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky --input-file <(seq --format 
"https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_%04g.json.gz" 1 10) 14 | ;; 15 | 3) 16 | # Download 100m dataset: files 0001 to 0100 17 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky --input-file <(seq --format "https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_%04g.json.gz" 1 100) 18 | ;; 19 | 4) 20 | # Download 1000m dataset: files 0001 to 1000 21 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky --input-file <(seq --format "https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_%04g.json.gz" 1 1000) 22 | ;; 23 | *) 24 | # Download 1m dataset: single file 25 | wget --continue --timestamping --progress=dot:giga --directory-prefix ~/data/bluesky "https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_0001.json.gz" 26 | ;; 27 | esac 28 | -------------------------------------------------------------------------------- /victorialogs/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 4 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | MAX_FILES="$2" 12 | SUCCESS_LOG="$3" 13 | ERROR_LOG="$4" 14 | 15 | # Validate arguments 16 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 17 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 18 | 19 | # Load data 20 | PARALLEL_WORKERS=8 21 | counter=0 22 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 23 | echo "Processing file: $file" 24 | 25 | zcat $file | curl -s --fail -T - -X POST 'http://localhost:9428/insert/jsonline?_time_field=time_us&_stream_fields=kind,commit.collection,commit.operation' \ 26 | && echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG" \ 27 | || echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed importing $file." >> "$ERROR_LOG" & 28 | 29 | [[ $(jobs -p -r | wc -l) -ge $PARALLEL_WORKERS ]] && wait -n 30 | 31 | counter=$((counter + 1)) 32 | if [[ $counter -ge $MAX_FILES ]]; then 33 | break 34 | fi 35 | done 36 | 37 | wait 38 | 39 | echo "Loaded $MAX_FILES data files from $DATA_DIRECTORY to victorialogs." 
40 | -------------------------------------------------------------------------------- /duckdb/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT j->>'$.commit.collection' AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT j->>'$.commit.collection' AS event,count() AS count,count(DISTINCT j->>'$.did') AS users FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') GROUP BY event ORDER BY count DESC; 3 | SELECT j->>'$.commit.collection' AS event,hour(TO_TIMESTAMP(CAST(j->>'$.time_us' AS BIGINT) / 1000000)) as hour_of_day,count() AS count FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT j->>'$.did'::String as user_id,TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000) AS first_post_date FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; 5 | SELECT j->>'$.did'::String as user_id,date_diff('milliseconds', TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000),TO_TIMESTAMP(CAST(MAX(j->>'$.time_us') AS BIGINT) / 1000000)) AS activity_span FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /elasticsearch/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | QUERY_NUM=1 19 | 20 | # File containing Elasticsearch ES|SQL queries 21 | QUERY_FILE="queries.txt" 22 | 23 | # Check if the query file exists 24 | if [[ ! -f "$QUERY_FILE" ]]; then 25 | echo "Error: Query file '$QUERY_FILE' does not exist." 
26 | exit 1 27 | fi 28 | 29 | cat 'queries.txt' | while read -r QUERY; do 30 | eval "QUERY=\"${QUERY}\"" 31 | # Print the query 32 | echo "------------------------------------------------------------------------------------------------------------------------" 33 | echo "Result for query Q$QUERY_NUM: " 34 | echo 35 | CURL_DATA="{\"query\": \"$QUERY\"}" 36 | curl -s -k -X POST "https://localhost:9200/_query?format=txt" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA" 37 | echo 38 | # Increment the query number 39 | QUERY_NUM=$((QUERY_NUM + 1)) 40 | done 41 | -------------------------------------------------------------------------------- /postgresql/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count, COUNT(DISTINCT data ->> 'did') AS users FROM bluesky WHERE data ->> 'kind' = 'commit' AND data -> 'commit' ->> 'operation' = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data->'commit'->>'collection' AS event, EXTRACT(HOUR FROM TO_TIMESTAMP((data->>'time_us')::BIGINT / 1000000)) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data->>'did' AS user_id, MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) AS first_post_ts FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data->>'did' AS user_id, EXTRACT(EPOCH FROM ( MAX( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) - MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) )) * 1000 AS activity_span FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /doris/queries_default.sql: -------------------------------------------------------------------------------- 1 | SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count, COUNT(DISTINCT cast(data['did'] AS TEXT )) AS users FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT cast(data['commit']['collection'] AS TEXT ) AS event, HOUR(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT cast(data['did'] AS TEXT ) AS user_id, MIN(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS 
first_post_ts FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT cast(data['did'] AS TEXT ) AS user_id, MILLISECONDS_DIFF(MAX(from_microsecond(CAST(data['time_us'] AS BIGINT))),MIN(from_microsecond(CAST(data['time_us'] AS BIGINT)))) AS activity_span FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /starrocks/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC; 3 | SELECT get_json_string(data, 'commit.collection') AS event, hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT get_json_string(data, 'did') as user_id, to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; 5 | SELECT get_json_string(data, 'did') as user_id, date_diff('millisecond', to_datetime(min(get_json_int(data, 'time_us')), 6), to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /greptimedb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 4 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | MAX_FILES="$2" 12 | SUCCESS_LOG="$3" 13 | ERROR_LOG="$4" 14 | 15 | # Validate arguments 16 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 17 | [[ ! 
"$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 18 | 19 | pushd $DATA_DIRECTORY 20 | counter=0 21 | for file in $(ls *.json.gz | head -n $MAX_FILES); do 22 | echo "Processing file: $file" 23 | 24 | curl "http://localhost:4000/v1/events/logs?table=bluesky&pipeline_name=jsonbench&ignore_errors=true" \ 25 | -H "Content-Type: application/x-ndjson" \ 26 | -H "Content-Encoding: gzip" \ 27 | --data-binary @$file 28 | echo "" 29 | 30 | first_attempt=$? 31 | if [[ $first_attempt -eq 0 ]]; then 32 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG" 33 | else 34 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed for $file. Giving up." >> "$ERROR_LOG" 35 | fi 36 | 37 | counter=$((counter + 1)) 38 | if [[ $counter -ge $MAX_FILES ]]; then 39 | break 40 | fi 41 | done 42 | 43 | curl -XPOST -H 'Content-Type: application/x-www-form-urlencoded' \ 44 | http://localhost:4000/v1/sql \ 45 | -d "sql=admin flush_table('bluesky')" \ 46 | -d "format=json" 47 | 48 | echo -e "\nLoaded $MAX_FILES data files from $DATA_DIRECTORY to GreptimeDB." 49 | -------------------------------------------------------------------------------- /mongodb/queries.js: -------------------------------------------------------------------------------- 1 | db.bluesky.aggregate([ { $group: { _id: "$commit.collection", count: { $sum: 1 } } }, { $sort: { count: -1 } } ]); 2 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create" } }, { $group: { _id: "$commit.collection", count: { $sum: 1 }, users: { $addToSet: "$did" } } }, { $project: { event: "$_id", count: 1, users: { $size: "$users" } } }, { $sort: { count: -1 } } ]); 3 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": { $in: ["app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like"] } } }, { $project: { _id: 0, event: "$commit.collection", hour_of_day: { $hour: { $toDate: { $divide: ["$time_us", 1000] } } } } }, { $group: { _id: { event: "$event", hour_of_day: "$hour_of_day" }, count: { $sum: 1 } } }, { $sort: { "_id.hour_of_day": 1, "_id.event": 1 } } ]); 4 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", first_post_ts: { $min: "$timestamp" } } }, { $sort: { first_post_ts: 1 } }, { $limit: 3 } ]); 5 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", min_timestamp: { $min: "$timestamp" }, max_timestamp: { $max: "$timestamp" } } }, { $project: { activity_span: { $dateDiff: { startDate: "$min_timestamp", endDate: "$max_timestamp", unit: "millisecond" } } } }, { $sort: { activity_span: -1 } }, { $limit: 3 } ]); 6 | -------------------------------------------------------------------------------- /elasticsearch/queries_formatted.txt: -------------------------------------------------------------------------------- 1 | -- Q1 - Top event types 2 | 3 | POST /_query?format=txt 4 | { 5 | "query": """FROM $INDEX_NAME 6 | | STATS count = COUNT() BY commit.collection 7 | | SORT count DESC""" 8 | } 9 | 10 | -- Q2 - Top event types together with unique users per event type 11 | -- Note, Elasticsearch does not support exact 
count. COUNT_DISTINCT returns only an estimate. 12 | 13 | POST /_query?format=txt 14 | { 15 | "query": """FROM $INDEX_NAME 16 | | WHERE kind == "commit" AND commit.operation == "create" 17 | | STATS users = COUNT_DISTINCT(did), count = COUNT() BY commit.collection 18 | | SORT count DESC""" 19 | } 20 | 21 | -- Q3 - When do people use BlueSky 22 | 23 | POST /_query?format=txt 24 | { 25 | "query": """FROM $INDEX_NAME 26 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection IN ("app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like") 27 | | STATS count = COUNT() BY commit.collection, DATE_EXTRACT("hour_of_day", time_us) 28 | | SORT count, commit.collection""" 29 | } 30 | 31 | -- Q4 - top 3 post veterans 32 | 33 | POST /_query?format=txt 34 | { 35 | "query": """FROM $INDEX_NAME 36 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 37 | | STATS first_post_ts = MIN(time_us) BY did 38 | | SORT first_post_ts ASC 39 | | LIMIT 3""" 40 | } 41 | 42 | -- Q5 - top 3 users with longest activity 43 | 44 | POST /_query?format=txt 45 | { 46 | "query": """FROM $INDEX_NAME 47 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 48 | | STATS activity_span = date_diff("millisecond",min(time_us), max(time_us)) BY did 49 | | SORT activity_span DESC 50 | | LIMIT 3""" 51 | } 52 | -------------------------------------------------------------------------------- /singlestore/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | # Arguments 11 | ROOT_PASSWORD="$1" 12 | DATA_DIRECTORY="$2" 13 | DB_NAME="$3" 14 | TABLE_NAME="$4" 15 | MAX_FILES="$5" 16 | SUCCESS_LOG="$6" 17 | ERROR_LOG="$7" 18 | 19 | # Validate arguments 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 22 | 23 | export MYSQL_PWD=${ROOT_PASSWORD} 24 | 25 | # Load data 26 | counter=0 27 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 28 | echo "Processing file: $file" 29 | 30 | # Note: If one or more JSON documents in the currently processed file cannot be parsed (because of extremely deep nesting, line breaks 31 | # in unexpected places, etc.), then SingleStore will skip the _entire_ file. This unfortunately reduces the "data quality" metric 32 | # (= the number of successfully inserted JSON documents) quite a bit. SingleStore's LOAD statement comes with a SKIP PARSER ERRORS 33 | # clause that would theoretically allow to skip individual documents, but it is not supported for JSON 34 | # (https://www.singlestore.com/forum/t/pipeline-skip-parser-errors-with-json/2794). 
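# For illustration, with the first file of the downloaded dataset (file name assumed from
# download_data.sh), the statement below expands to roughly:
#   LOAD DATA LOCAL INFILE "$DATA_DIRECTORY/file_0001.json.gz" INTO TABLE bluesky(data <- %) FORMAT JSON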
35 | mysql --local-infile=1 -h 127.0.0.1 -P 3306 -u root -D $DB_NAME -e "LOAD DATA LOCAL INFILE \"$file\" INTO TABLE bluesky(data <- %) FORMAT JSON" 36 | 37 | counter=$((counter + 1)) 38 | if [[ $counter -ge $MAX_FILES ]]; then 39 | break 40 | fi 41 | done 42 | -------------------------------------------------------------------------------- /elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ---------------------------------- Cluster ----------------------------------- 4 | # 5 | # Use a descriptive name for your cluster: 6 | # 7 | cluster.name: es-bench 8 | # ------------------------------------ Node ------------------------------------ 9 | # 10 | # Use a descriptive name for the node: 11 | # 12 | node.name: node-1 13 | # ----------------------------------- Paths ------------------------------------ 14 | # 15 | # Path to directory where to store the data (separate multiple locations by comma): 16 | # 17 | path.data: /var/lib/elasticsearch 18 | # 19 | # Path to log files: 20 | # 21 | path.logs: /var/log/elasticsearch 22 | # 23 | # ----------------------------------- Memory ----------------------------------- 24 | # 25 | # Lock the memory on startup: 26 | # 27 | bootstrap.memory_lock: true 28 | # -------------------------------------------------------------------------------- 29 | 30 | # Enable security features 31 | xpack.security.enabled: true 32 | xpack.security.enrollment.enabled: true 33 | 34 | # Enable encryption for HTTP API client connections, such as Kibana, Logstash, and Agents 35 | xpack.security.http.ssl: 36 | enabled: true 37 | keystore.path: certs/http.p12 38 | 39 | # Enable encryption and mutual authentication between cluster nodes 40 | xpack.security.transport.ssl: 41 | enabled: true 42 | verification_mode: certificate 43 | keystore.path: certs/transport.p12 44 | truststore.path: certs/transport.p12 45 | # Create a new cluster with the current node only 46 | # Additional nodes can still join the cluster later 47 | cluster.initial_master_nodes: ["node-1"] 48 | 49 | # Allow HTTP API connections from anywhere 50 | # Connections are encrypted and require user authentication 51 | http.host: 0.0.0.0 52 | 53 | #----------------------- END SECURITY AUTO CONFIGURATION ------------------------- -------------------------------------------------------------------------------- /duckdb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | 17 | # Validate that MAX_FILES is a number 18 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 19 | echo "Error: must be a positive integer." 
20 | exit 1 21 | fi 22 | 23 | # Ensure the log files exist 24 | touch "$SUCCESS_LOG" "$ERROR_LOG" 25 | 26 | counter=0 27 | 28 | # Loop through each .json.gz file in the directory 29 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 30 | # if [[ -f "$file" ]]; then 31 | # duckdb ~/$DB_NAME -c "insert into $TABLE_NAME select * from read_ndjson_objects('$file', ignore_errors=false, maximum_object_size=1048576000);" 32 | # fi 33 | if [[ -f "$file" ]]; then 34 | # Create a temporary directory for split files 35 | temp_dir=$(mktemp -d $DIRECTORY/temp.XXXXXX) 36 | 37 | # Decompress and split the file into smaller chunks of 100000 lines each 38 | gzip -dc "$file" | split -l 100000 - "$temp_dir/chunk_" 39 | 40 | # Insert each chunk into DuckDB 41 | for chunk in "$temp_dir"/chunk_*; do 42 | duckdb ~/$DB_NAME -c "insert into $TABLE_NAME select * from read_ndjson_objects('$chunk', ignore_errors=false, maximum_object_size=1048576000);" 43 | done 44 | 45 | # Clean up temporary directory 46 | rm -r "$temp_dir" 47 | counter=$((counter + 1)) 48 | fi 49 | 50 | # Stop processing if the max number of files is reached 51 | if [[ $counter -ge $MAX_FILES ]]; then 52 | echo "Copied maximum number of files: $MAX_FILES" 53 | break 54 | fi 55 | done 56 | -------------------------------------------------------------------------------- /elasticsearch/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Number of tries for each query 19 | TRIES=3 20 | 21 | # File containing Elasticsearch ES|SQL queries 22 | QUERY_FILE="queries.txt" 23 | LOG_FILE="query_log_$INDEX_NAME.log" 24 | > "$LOG_FILE" 25 | 26 | # Check if the query file exists 27 | if [[ ! -f "$QUERY_FILE" ]]; then 28 | echo "Error: Query file '$QUERY_FILE' does not exist." 29 | exit 1 30 | fi 31 | 32 | cat 'queries.txt' | while read -r QUERY; do 33 | # Clear filesystem cache between queries. 34 | sync 35 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 36 | # Clear query cache between queries. 
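# Timing note: the runtime reported for each of the $TRIES runs is the server-side 'took'
# value (in milliseconds) from the _query response, converted to seconds further below.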
37 | curl -k -X POST 'https://localhost:9200/hits/_cache/clear?pretty' -u "elastic:${ELASTIC_PASSWORD}" &>/dev/null 38 | eval "QUERY=\"${QUERY}\"" 39 | echo "Running query: $QUERY" 40 | for i in $(seq 1 $TRIES); do 41 | CURL_DATA="{\"query\": \"$QUERY\"}" 42 | RESPONSE=$(curl -s -k -X POST "https://localhost:9200/_query" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA") 43 | TOOK_MS=$(echo "$RESPONSE" | jq -r '.took' 2>/dev/null) 44 | 45 | # Convert 'took' to seconds (from ms to s) 46 | TOOK_S=$(bc <<< "scale=3; $TOOK_MS / 1000") 47 | TOOK_FORMATTED=$(printf "%.3f" "$TOOK_S") 48 | echo "$RESPONSE" >> "$LOG_FILE" 49 | echo "Response time: ${TOOK_FORMATTED} s" 50 | done 51 | done 52 | -------------------------------------------------------------------------------- /mongodb/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please also change it in ferretdb/index_usage.sh as well 4 | 5 | # Check if the required arguments are provided 6 | if [[ $# -lt 1 ]]; then 7 | echo "Usage: $0 " 8 | exit 1 9 | fi 10 | 11 | # Arguments 12 | DB_NAME="$1" 13 | 14 | QUERY_NUM=1 15 | 16 | # File containing MongoDB queries (replace 'queries.js' with your file) 17 | QUERY_FILE="queries.js" 18 | 19 | # Check if the query file exists 20 | if [[ ! -f "$QUERY_FILE" ]]; then 21 | echo "Error: Query file '$QUERY_FILE' does not exist." 22 | exit 1 23 | fi 24 | 25 | # Set the internalQueryPlannerGenerateCoveredWholeIndexScans parameter to true 26 | echo "Setting internalQueryPlannerGenerateCoveredWholeIndexScans to true..." 27 | mongosh --quiet --eval " 28 | const result = db.adminCommand({ setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true }); 29 | if (result.ok !== 1) { 30 | print('Failed to set internalQueryPlannerGenerateCoveredWholeIndexScans: ' + JSON.stringify(result)); 31 | quit(1); 32 | } else { 33 | print('Successfully set internalQueryPlannerGenerateCoveredWholeIndexScans to true'); 34 | } 35 | " 36 | 37 | cat "$QUERY_FILE" | while read -r query; do 38 | 39 | # Print the query number 40 | echo "------------------------------------------------------------------------------------------------------------------------" 41 | echo "Index usage for query Q$QUERY_NUM:" 42 | echo 43 | 44 | # Modify the query to include the explain option inside the aggregate call 45 | MODIFIED_QUERY=$(echo "$query" | sed 's/]);$/], { explain: "queryPlanner" });/') 46 | 47 | # Escape the modified query for safe passing to mongosh 48 | ESCAPED_QUERY=$(echo "$MODIFIED_QUERY" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 49 | 50 | mongosh --quiet --eval " 51 | const db = db.getSiblingDB('$DB_NAME'); 52 | const result = eval(\"$ESCAPED_QUERY\"); 53 | printjson(result.stages[0].\$cursor.queryPlanner.winningPlan); 54 | " 55 | 56 | # Increment the query number 57 | QUERY_NUM=$((QUERY_NUM + 1)) 58 | done; -------------------------------------------------------------------------------- /_files_lz4/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_lz4 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | 
OUTPUT_PREFIX="${3:-_files_lz4}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_json/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_json 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_json}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_zstd/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_zstd 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_zstd}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /elasticsearch/config/index_template_source.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": true 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | "properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /greptimedb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 
16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 43 | exit 1 44 | fi 45 | 46 | ./start.sh 47 | ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 48 | ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 49 | ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 50 | ./index_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 51 | ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 52 | ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 53 | ./drop_tables.sh 54 | } 55 | 56 | case $CHOICE in 57 | 2) 58 | benchmark 10 59 | ;; 60 | 3) 61 | benchmark 100 62 | ;; 63 | 4) 64 | benchmark 1000 65 | ;; 66 | 5) 67 | benchmark 1 68 | benchmark 10 69 | benchmark 100 70 | benchmark 1000 71 | ;; 72 | *) 73 | benchmark 1 74 | ;; 75 | esac 76 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_no_source.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": false 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | "properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /singlestore/results/_query_results/_m6i.8xlarge_bluesky_1m.query_results: -------------------------------------------------------------------------------- 1 | 
------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | event count 5 | "app.bsky.feed.like" 448944 6 | "app.bsky.graph.follow" 360374 7 | "app.bsky.feed.post" 90816 8 | "app.bsky.feed.repost" 58540 9 | "app.bsky.graph.block" 14040 10 | "app.bsky.actor.profile" 11762 11 | "app.bsky.graph.listitem" 8103 12 | NULL 5328 13 | "app.bsky.graph.listblock" 895 14 | "app.bsky.graph.starterpack" 405 15 | "app.bsky.graph.list" 356 16 | "app.bsky.feed.threadgate" 255 17 | "app.bsky.feed.postgate" 104 18 | "app.bsky.feed.generator" 74 19 | "app.bsky.labeler.service" 4 20 | ------------------------------------------------------------------------------------------------------------------------ 21 | Result for query Q2: 22 | 23 | event count count(distinct data::did) 24 | "app.bsky.feed.like" 444523 117617 25 | "app.bsky.graph.follow" 337978 63957 26 | "app.bsky.feed.post" 86812 50464 27 | "app.bsky.feed.repost" 56993 26581 28 | "app.bsky.graph.block" 13838 5785 29 | "app.bsky.graph.listitem" 7568 1078 30 | "app.bsky.actor.profile" 5337 5337 31 | "app.bsky.graph.listblock" 860 449 32 | "app.bsky.graph.list" 259 218 33 | "app.bsky.feed.threadgate" 228 196 34 | "app.bsky.graph.starterpack" 104 101 35 | "app.bsky.feed.postgate" 101 82 36 | "app.bsky.feed.generator" 10 9 37 | ------------------------------------------------------------------------------------------------------------------------ 38 | Result for query Q3: 39 | 40 | event hour_of_day count 41 | "app.bsky.feed.like" 16 444523 42 | "app.bsky.feed.post" 16 86812 43 | "app.bsky.feed.repost" 16 56993 44 | ------------------------------------------------------------------------------------------------------------------------ 45 | Result for query Q4: 46 | 47 | user_id first_post_ts 48 | did:plc:yj3sjq3blzpynh27cumnp5ks 2024-11-21 16:25:49.000167 49 | did:plc:l5o3qjrmfztir54cpwlv2eme 2024-11-21 16:25:49.001905 50 | did:plc:s4bwqchfzm6gjqfeb6mexgbu 2024-11-21 16:25:49.003907 51 | ------------------------------------------------------------------------------------------------------------------------ 52 | Result for query Q5: 53 | 54 | user_id activity_span 55 | did:plc:tsyymlun4eqjuw7hqrhmwagd 813006959 56 | did:plc:3ug235sfy2pz7cawmpsftb65 811602261 57 | did:plc:doxhhgtxqiv47tmcovpbcqai 811404021 58 | -------------------------------------------------------------------------------- /doris/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in starrocks/main.sh. 4 | 5 | export DORIS_FULL_NAME="apache-doris-3.1.0-bin-x64" 6 | 7 | DEFAULT_CHOICE=ask 8 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 9 | 10 | # Allow the user to optionally provide the scale factor ("choice") as an argument 11 | CHOICE="${1:-$DEFAULT_CHOICE}" 12 | 13 | # Allow the user to optionally provide the data directory as an argument 14 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 15 | 16 | # Define success and error log files 17 | SUCCESS_LOG="${3:-success.log}" 18 | ERROR_LOG="${4:-error.log}" 19 | 20 | # Define prefix for output files 21 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 22 | 23 | # Check if the directory exists 24 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 25 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
26 | exit 1 27 | fi 28 | 29 | if [ "$CHOICE" = "ask" ]; then 30 | echo "Select the dataset size to benchmark:" 31 | echo "1) 1m (default)" 32 | echo "2) 10m" 33 | echo "3) 100m" 34 | echo "4) 1000m" 35 | echo "5) all" 36 | read -p "Enter the number corresponding to your choice: " CHOICE 37 | fi; 38 | 39 | ./install.sh 40 | ./start.sh 41 | 42 | benchmark() { 43 | local size=$1 44 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 45 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 46 | if (( file_count < size )); then 47 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 48 | exit 1 49 | fi 50 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 51 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 52 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 53 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" "queries.sql" 54 | ./drop_table.sh "bluesky_${size}m" bluesky 55 | } 56 | 57 | case $CHOICE in 58 | 2) 59 | benchmark 10 60 | ;; 61 | 3) 62 | benchmark 100 63 | ;; 64 | 4) 65 | benchmark 1000 66 | ;; 67 | 5) 68 | benchmark 1 69 | benchmark 10 70 | benchmark 100 71 | benchmark 1000 72 | ;; 73 | *) 74 | benchmark 1 75 | ;; 76 | esac 77 | 78 | ./stop.sh 79 | ./uninstall.sh 80 | -------------------------------------------------------------------------------- /victorialogs/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
43 | exit 1 44 | fi 45 | ./start.sh 46 | ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 47 | sleep 1 # sleep for a while for settling down the data 48 | ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 49 | ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 50 | ./index_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 51 | ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 52 | #./query_results.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 53 | ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 54 | ./drop_tables.sh # also stops VictoriaLogs 55 | } 56 | 57 | case $CHOICE in 58 | 2) 59 | benchmark 10 60 | ;; 61 | 3) 62 | benchmark 100 63 | ;; 64 | 4) 65 | benchmark 1000 66 | ;; 67 | 5) 68 | benchmark 1 69 | benchmark 10 70 | benchmark 100 71 | benchmark 1000 72 | ;; 73 | *) 74 | benchmark 1 75 | ;; 76 | esac 77 | -------------------------------------------------------------------------------- /victorialogs/queries_formatted.logsql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | * 5 | | stats by (commit.collection) 6 | count() as count 7 | | sort by (count desc) 8 | 9 | ------------------------------------------------------------------------------------------------------------------------ 10 | -- Q2 - Top event types together with unique users per event type 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | {kind="commit", commit.operation="create"} 13 | | stats by (commit.collection) 14 | count() as count, 15 | count_uniq(did) as users 16 | | sort by (count desc) 17 | 18 | ------------------------------------------------------------------------------------------------------------------------ 19 | -- Q3 - When do people use BlueSky 20 | ------------------------------------------------------------------------------------------------------------------------ 21 | {kind="commit", commit.operation="create", commit.collection=~"app\\.bsky\\.feed\\.(post|repost|like)"} 22 | | math floor(_time/1h)%24 as hour_of_day 23 | | stats by (commit.collection, hour_of_day) 24 | count() as count 25 | | sort by (hour_of_day, commit.collection) 26 | 27 | ------------------------------------------------------------------------------------------------------------------------ 28 | -- Q4 - top 3 post veterans 29 | ------------------------------------------------------------------------------------------------------------------------ 30 | {kind="commit", commit.operation="create", commit.collection="app.bsky.feed.post"} 31 | | stats by (did) 32 | min(_time) as first_post_ts 33 | | first 3 (first_post_ts) 34 | 35 | ------------------------------------------------------------------------------------------------------------------------ 36 | -- Q5 - top 3 users with longest activity 37 | ------------------------------------------------------------------------------------------------------------------------ 38 | {kind="commit", commit.operation="create", commit.collection="app.bsky.feed.post"} 39 | | stats by (did) 40 | min(_time) tmin, 41 | max(_time) tmax 42 | | math round((tmax-tmin)/1e6) as activity_span 43 | | keep 
did, activity_span 44 | | first 3 (activity_span desc) 45 | -------------------------------------------------------------------------------- /elasticsearch/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the ELASTIC_PASSWORD env variable is set; if not, read it from the .elastic_password file 4 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 5 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 6 | export $(cat .elastic_password) 7 | fi 8 | 9 | # Check if the required arguments are provided 10 | if [[ $# -lt 6 ]]; then 11 | echo "Usage: $0 <INDEX_NAME> <INDEX_TEMPLATE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>" 12 | exit 1 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | INDEX_TEMPLATE_FILE="config/$2.json" 18 | DATA_DIRECTORY="$3" 19 | NUM_FILES="$4" 20 | SUCCESS_LOG="$5" 21 | ERROR_LOG="$6" 22 | 23 | # Validate arguments 24 | [[ ! -f "$INDEX_TEMPLATE_FILE" ]] && { echo "Error: Index template file '$INDEX_TEMPLATE_FILE' does not exist."; exit 1; } 25 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 26 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 27 | 28 | echo "Checking if ILM policy is installed, install if not" 29 | # If curl returns 404, the ILM policy is not installed 30 | http_code=$(curl -s -o /dev/null -k -w "%{http_code}" -X GET "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json') 31 | if [[ "$http_code" -eq 404 ]] ; then 32 | echo "Installing ILM policy" 33 | ILM_POLICY=$(cat "config/ilm.json") 34 | curl -s -k -X PUT "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$ILM_POLICY" 35 | fi 36 | 37 | echo "Installing index template" 38 | # Read index template file json from config/$INDEX_TEMPLATE_FILE 39 | INDEX_TEMPLATE=$(cat "$INDEX_TEMPLATE_FILE") 40 | JSON_DATA=$(cat $INDEX_TEMPLATE_FILE | sed "s/\${INDEX_NAME}/$INDEX_NAME/g") 41 | echo "Install index template" 42 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_index_template/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$JSON_DATA" 43 | 44 | echo "Creating the data stream" 45 | echo "Create the data stream" 46 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_data_stream/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' 47 | 48 | echo "Loading data" 49 | ./load_data.sh "$DATA_DIRECTORY" "$INDEX_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 50 | -------------------------------------------------------------------------------- /_files_lz4/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 <DATA_DIRECTORY> <TARGET_DIRECTORY> <N>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if !
[[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Create a temporary directory inside the current directory 33 | TEMP_DIR="./temp_extraction" 34 | if [[ -d "$TEMP_DIR" ]]; then 35 | echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..." 36 | rm -rf "$TEMP_DIR" 37 | fi 38 | 39 | mkdir -p "$TEMP_DIR" 40 | 41 | # Trap to ensure cleanup of the temporary directory 42 | trap "rm -rf $TEMP_DIR" EXIT 43 | 44 | # Process the first N files 45 | count=0 46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 47 | if [[ $count -ge $N ]]; then 48 | break 49 | fi 50 | 51 | echo "Processing $file..." 52 | 53 | # Define paths for the temporary extracted file and compressed file 54 | extracted_file="$TEMP_DIR/$(basename "${file%.gz}")" 55 | compressed_file="$TEMP_DIR/$(basename "${file%.gz}.lz4")" 56 | 57 | # Extract the .json.gz file into the temporary directory 58 | gzip -c -d "$file" > "$extracted_file" 59 | if [[ $? -ne 0 ]]; then 60 | echo "Error: Failed to extract $file to $extracted_file" 61 | continue 62 | fi 63 | 64 | # Compress the extracted file with lz4 65 | lz4 "$extracted_file" "$compressed_file" 66 | if [[ $? -ne 0 ]]; then 67 | echo "Error: Failed to compress $extracted_file" 68 | continue 69 | fi 70 | 71 | # Copy the .lz4 file to the target directory 72 | cp "$compressed_file" "$TARGET_DIRECTORY/" 73 | if [[ $? -ne 0 ]]; then 74 | echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY" 75 | continue 76 | fi 77 | 78 | count=$((count + 1)) 79 | done 80 | 81 | # Cleanup (done automatically by the trap) 82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'." 83 | echo "Temporary directory '$TEMP_DIR' has been deleted." -------------------------------------------------------------------------------- /duckdb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | export PATH='/home/ubuntu/.duckdb/cli/latest':$PATH 37 | 38 | benchmark() { 39 | local size=$1 40 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 41 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 42 | if (( file_count < size )); then 43 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
44 | exit 1 45 | fi 46 | ./create_and_load.sh "db.duckdb_${size}" bluesky ddl.sql "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 47 | ./total_size.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 48 | ./count.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 49 | #./query_results.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 50 | ./physical_query_plans.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 51 | ./benchmark.sh "db.duckdb_${size}" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 52 | ./drop_table.sh "db.duckdb_${size}" 53 | } 54 | 55 | case $CHOICE in 56 | 2) 57 | benchmark 10 58 | ;; 59 | 3) 60 | benchmark 100 61 | ;; 62 | 4) 63 | benchmark 1000 64 | ;; 65 | 5) 66 | benchmark 1 67 | benchmark 10 68 | benchmark 100 69 | benchmark 1000 70 | ;; 71 | *) 72 | benchmark 1 73 | ;; 74 | esac 75 | 76 | 77 | ./uninstall.sh 78 | -------------------------------------------------------------------------------- /_files_zstd/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Create a temporary directory inside the current directory 33 | TEMP_DIR="./temp_extraction" 34 | if [[ -d "$TEMP_DIR" ]]; then 35 | echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..." 36 | rm -rf "$TEMP_DIR" 37 | fi 38 | 39 | mkdir -p "$TEMP_DIR" 40 | 41 | # Trap to ensure cleanup of the temporary directory 42 | trap "rm -rf $TEMP_DIR" EXIT 43 | 44 | # Process the first N files 45 | count=0 46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 47 | if [[ $count -ge $N ]]; then 48 | break 49 | fi 50 | 51 | echo "Processing $file..." 52 | 53 | # Define paths for the temporary extracted file and compressed file 54 | extracted_file="$TEMP_DIR/$(basename "${file%.gz}")" 55 | compressed_file="$TEMP_DIR/$(basename "${file%.gz}.zst")" 56 | 57 | # Extract the .json.gz file into the temporary directory 58 | gzip -c -d "$file" > "$extracted_file" 59 | if [[ $? -ne 0 ]]; then 60 | echo "Error: Failed to extract $file to $extracted_file" 61 | continue 62 | fi 63 | 64 | # Compress the extracted file with zstd 65 | zstd -1 "$extracted_file" -o "$compressed_file" 66 | if [[ $? -ne 0 ]]; then 67 | echo "Error: Failed to compress $extracted_file" 68 | continue 69 | fi 70 | 71 | # Copy the .zst file to the target directory 72 | cp "$compressed_file" "$TARGET_DIRECTORY/" 73 | if [[ $? -ne 0 ]]; then 74 | echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY" 75 | continue 76 | fi 77 | 78 | count=$((count + 1)) 79 | done 80 | 81 | # Cleanup (done automatically by the trap) 82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'." 
83 | echo "Temporary directory '$TEMP_DIR' has been deleted." -------------------------------------------------------------------------------- /mongodb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 43 | exit 1 44 | fi 45 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 46 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 47 | ./data_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 48 | ./index_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 49 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 50 | #./query_results.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 51 | ./index_usage.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_usage" 52 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 53 | ./drop_table.sh "bluesky_${size}m" 54 | } 55 | 56 | case $CHOICE in 57 | 2) 58 | benchmark 10 59 | ;; 60 | 3) 61 | benchmark 100 62 | ;; 63 | 4) 64 | benchmark 1000 65 | ;; 66 | 5) 67 | benchmark 1 68 | benchmark 10 69 | benchmark 100 70 | benchmark 1000 71 | ;; 72 | *) 73 | benchmark 1 74 | ;; 75 | esac 76 | 77 | ./uninstall.sh 78 | -------------------------------------------------------------------------------- /postgresql/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! 
-d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 43 | exit 1 44 | fi 45 | ./create_and_load.sh "bluesky_${size}m" bluesky "ddl.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 46 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 47 | ./data_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 48 | ./index_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 49 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 50 | ./index_usage.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_usage" 51 | #./query_results.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 52 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 53 | ./drop_tables.sh "bluesky_${size}m" 54 | } 55 | 56 | case $CHOICE in 57 | 2) 58 | benchmark 10 59 | ;; 60 | 3) 61 | benchmark 100 62 | ;; 63 | 4) 64 | benchmark 1000 65 | ;; 66 | 5) 67 | benchmark 1 68 | benchmark 10 69 | benchmark 100 70 | benchmark 1000 71 | ;; 72 | *) 73 | benchmark 1 74 | ;; 75 | esac 76 | 77 | ./uninstall.sh 78 | -------------------------------------------------------------------------------- /starrocks/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you change something in this file, please change also in doris/main.sh. 4 | 5 | DEFAULT_CHOICE=ask 6 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 7 | 8 | # Allow the user to optionally provide the scale factor ("choice") as an argument 9 | CHOICE="${1:-$DEFAULT_CHOICE}" 10 | 11 | # Allow the user to optionally provide the data directory as an argument 12 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 13 | 14 | # Define success and error log files 15 | SUCCESS_LOG="${3:-success.log}" 16 | ERROR_LOG="${4:-error.log}" 17 | 18 | # Define prefix for output files 19 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 20 | 21 | export DB_HOST="127.0.0.1" 22 | export DB_USER="root" 23 | export DB_MYSQL_PORT="9030" 24 | export DB_HTTP_PORT="8030" # HTTP endpoint for stream load 25 | 26 | # Check if the directory exists 27 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 28 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
29 | exit 1 30 | fi 31 | 32 | if [ "$CHOICE" = "ask" ]; then 33 | echo "Select the dataset size to benchmark:" 34 | echo "1) 1m (default)" 35 | echo "2) 10m" 36 | echo "3) 100m" 37 | echo "4) 1000m" 38 | echo "5) all" 39 | read -p "Enter the number corresponding to your choice: " CHOICE 40 | fi; 41 | 42 | ./install.sh 43 | 44 | benchmark() { 45 | local size=$1 46 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 47 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 48 | if (( file_count < size )); then 49 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 50 | exit 1 51 | fi 52 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 53 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 54 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 55 | ./physical_query_plans.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 56 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m.results_memory_usage" 57 | ./drop_table.sh "bluesky_${size}m" bluesky 58 | } 59 | 60 | case $CHOICE in 61 | 2) 62 | benchmark 10 63 | ;; 64 | 3) 65 | benchmark 100 66 | ;; 67 | 4) 68 | benchmark 1000 69 | ;; 70 | 5) 71 | benchmark 1 72 | benchmark 10 73 | benchmark 100 74 | benchmark 1000 75 | ;; 76 | *) 77 | benchmark 1 78 | ;; 79 | esac 80 | 81 | ./uninstall.sh 82 | -------------------------------------------------------------------------------- /mongodb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | COLLECTION_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | MONGO_URI="mongodb://localhost:27017" # Replace with your MongoDB URI if necessary 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | # Create a temporary directory for uncompressed files 28 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 29 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 30 | 31 | # Counter to track processed files 32 | counter=0 33 | 34 | # Loop through each .json.gz file in the directory 35 | for file in $(ls "$DIRECTORY"/*.json.gz 2>/dev/null | sort); do 36 | if [[ -f "$file" ]]; then 37 | echo "Processing $file..." 38 | counter=$((counter + 1)) 39 | 40 | # Uncompress the file into the TEMP_DIR 41 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 42 | gunzip -c "$file" > "$uncompressed_file" 43 | 44 | # Check if uncompression was successful 45 | if [[ $? -ne 0 ]]; then 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 47 | continue 48 | fi 49 | 50 | # Import the uncompressed JSON file into MongoDB 51 | mongoimport --uri "$MONGO_URI" --db "$DB_NAME" --collection "$COLLECTION_NAME" --file "$uncompressed_file" 52 | import_status=$? 
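# Note: mongoimport ingests newline-delimited JSON by default (one document per line), so no --jsonArray flag is needed for these per-line event files; the exit status captured above decides below whether the file is logged as a success or a failure.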
53 | 54 | # Check if the import was successful 55 | if [[ $import_status -eq 0 ]]; then 56 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $uncompressed_file into MongoDB." >> "$SUCCESS_LOG" 57 | else 58 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $uncompressed_file into MongoDB." >> "$ERROR_LOG" 59 | fi 60 | 61 | # Remove the uncompressed file after processing 62 | rm -f "$uncompressed_file" 63 | 64 | # Stop processing if the max number of files is reached 65 | if [[ $counter -ge $MAX_FILES ]]; then 66 | echo "Processed maximum number of files: $MAX_FILES" 67 | break 68 | fi 69 | fi 70 | done 71 | 72 | if [[ $counter -eq 0 ]]; then 73 | echo "No .json.gz files found in the directory." 74 | fi 75 | 76 | echo "All files have been processed." -------------------------------------------------------------------------------- /clickhouse/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | local size=$1 39 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 40 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 41 | if (( file_count < size )); then 42 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
43 | exit 1 44 | fi 45 | ./start.sh 46 | ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 47 | ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" 48 | ./data_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 49 | ./index_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" 50 | ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 51 | #./query_results.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 52 | ./index_usage.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_usage" 53 | ./physical_query_plans.sh "bluesky_${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 54 | ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 55 | ./drop_table.sh # also stops ClickHouse 56 | } 57 | 58 | case $CHOICE in 59 | 2) 60 | benchmark 10 61 | ;; 62 | 3) 63 | benchmark 100 64 | ;; 65 | 4) 66 | benchmark 1000 67 | ;; 68 | 5) 69 | benchmark 1 70 | benchmark 10 71 | benchmark 100 72 | benchmark 1000 73 | ;; 74 | *) 75 | benchmark 1 76 | ;; 77 | esac 78 | -------------------------------------------------------------------------------- /greptimedb/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT commit_collection AS event, 5 | count(1) AS cnt 6 | FROM bluesky 7 | GROUP BY event 8 | ORDER BY cnt DESC; 9 | 10 | ------------------------------------------------------------------------------------------------------------------------ 11 | -- Q2 - Top event types together with unique users per event type 12 | ------------------------------------------------------------------------------------------------------------------------ 13 | SELECT commit_collection AS event, 14 | count(1) AS cnt, 15 | count(DISTINCT did) AS users 16 | FROM bluesky 17 | WHERE kind = 'commit' 18 | AND commit_operation = 'create' 19 | GROUP BY event 20 | ORDER BY cnt DESC; 21 | 22 | ------------------------------------------------------------------------------------------------------------------------ 23 | -- Q3 - When do people use BlueSky 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | SELECT commit_collection AS event, 26 | date_part('hour', time_us) AS hour_of_day, 27 | count(1) AS cnt 28 | FROM bluesky 29 | WHERE kind = 'commit' 30 | AND commit_operation = 'create' 31 | AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') 32 | GROUP BY event, 33 | hour_of_day 34 | ORDER BY hour_of_day, 35 | event; 36 | 37 | ------------------------------------------------------------------------------------------------------------------------ 38 | -- Q4 - top 3 post veterans 39 | ------------------------------------------------------------------------------------------------------------------------ 40 | SELECT did AS user_id, 41 | min(time_us) AS first_post_ts 42 | FROM bluesky 43 | WHERE kind = 'commit' 44 | AND commit_operation = 'create' 45 | AND commit_collection = 'app.bsky.feed.post' 46 | 
GROUP BY user_id 47 | ORDER BY first_post_ts ASC LIMIT 3; 48 | 49 | ------------------------------------------------------------------------------------------------------------------------ 50 | -- Q5 - top 3 users with longest activity 51 | ------------------------------------------------------------------------------------------------------------------------ 52 | SELECT did AS user_id, 53 | date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span 54 | FROM bluesky 55 | WHERE kind = 'commit' 56 | AND commit_operation = 'create' 57 | AND commit_collection = 'app.bsky.feed.post' 58 | GROUP BY user_id 59 | ORDER BY activity_span DESC LIMIT 3; 60 | -------------------------------------------------------------------------------- /elasticsearch/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DEFAULT_CHOICE=ask 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the scale factor ("choice") as an argument 7 | CHOICE="${1:-$DEFAULT_CHOICE}" 8 | 9 | # Allow the user to optionally provide the data directory as an argument 10 | DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}" 11 | 12 | # Define success and error log files 13 | SUCCESS_LOG="${3:-success.log}" 14 | ERROR_LOG="${4:-error.log}" 15 | 16 | # Define prefix for output files 17 | OUTPUT_PREFIX="${5:-_m6i.8xlarge}" 18 | 19 | # Check if the directory exists 20 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 21 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 22 | exit 1 23 | fi 24 | 25 | if [ "$CHOICE" = "ask" ]; then 26 | echo "Select the dataset size to benchmark:" 27 | echo "1) 1m (default)" 28 | echo "2) 10m" 29 | echo "3) 100m" 30 | echo "4) 1000m" 31 | echo "5) all" 32 | read -p "Enter the number corresponding to your choice: " CHOICE 33 | fi 34 | 35 | ./install.sh 36 | 37 | benchmark() { 38 | ./start.sh 39 | local size=$1 40 | local template=$2 41 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 42 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 43 | if (( file_count < size )); then 44 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
45 | exit 1 46 | fi 47 | ./create_and_load.sh "bluesky-${template}-${size}m" "index_template_${template}" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 48 | ./total_size.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.data_size" 49 | ./count.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.count" 50 | #./query_results.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.query_results" 51 | ./benchmark.sh "bluesky-${template}-${size}m" "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.results_runtime" 52 | ./drop_tables.sh 53 | } 54 | 55 | case $CHOICE in 56 | 2) 57 | benchmark 10 no_source 58 | benchmark 10 source 59 | ;; 60 | 3) 61 | benchmark 100 no_source 62 | benchmark 100 source 63 | ;; 64 | 4) 65 | benchmark 1000 no_source 66 | benchmark 1000 source 67 | ;; 68 | 5) 69 | benchmark 1 no_source 70 | benchmark 1 source 71 | benchmark 10 no_source 72 | benchmark 10 source 73 | benchmark 100 no_source 74 | benchmark 100 source 75 | benchmark 1000 no_source 76 | benchmark 1000 source 77 | ;; 78 | *) 79 | benchmark 1 no_source 80 | benchmark 1 source 81 | ;; 82 | esac 83 | 84 | ./uninstall.sh 85 | -------------------------------------------------------------------------------- /clickhouse/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>" 6 | exit 1 7 | fi 8 | 9 | 10 | # Arguments 11 | DATA_DIRECTORY="$1" 12 | DB_NAME="$2" 13 | TABLE_NAME="$3" 14 | MAX_FILES="$4" 15 | SUCCESS_LOG="$5" 16 | ERROR_LOG="$6" 17 | 18 | # Validate arguments 19 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 20 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 21 | 22 | 23 | # Create a temporary directory for uncompressed files 24 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 25 | trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit 26 | 27 | # Load data 28 | counter=0 29 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 30 | echo "Processing file: $file" 31 | 32 | # Uncompress the file into the TEMP_DIR 33 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 34 | gunzip -c "$file" > "$uncompressed_file" 35 | 36 | if [[ $? -ne 0 ]]; then 37 | echo "Error: Failed to uncompress $file" >> "$ERROR_LOG" 38 | continue 39 | fi 40 | 41 | # Attempt the first import 42 | ./clickhouse client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0 FORMAT JSONAsObject" < "$uncompressed_file" 43 | first_attempt=$? 44 | 45 | # Check if the first import was successful 46 | if [[ $first_attempt -eq 0 ]]; then 47 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG" 48 | rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing 49 | else 50 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] First attempt failed for $file. Trying again..." >> "$ERROR_LOG" 51 | 52 | echo "Processing $file... again..."
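# Note: the retry below additionally sets input_format_allow_errors_num and input_format_allow_errors_ratio, so ClickHouse skips malformed JSON rows instead of failing the whole INSERT when a file contains a few bad lines.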
53 | # Attempt the second import with a different command 54 | ./clickhouse client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0, input_format_allow_errors_num = 1_000_000_000, input_format_allow_errors_ratio=1 FORMAT JSONAsObject" < "$uncompressed_file" 55 | second_attempt=$? 56 | 57 | # Check if the second import was successful 58 | if [[ $second_attempt -eq 0 ]]; then 59 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file on second attempt." >> "$SUCCESS_LOG" 60 | rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing 61 | else 62 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Both attempts failed for $file. Giving up." >> "$ERROR_LOG" 63 | fi 64 | fi 65 | 66 | counter=$((counter + 1)) 67 | if [[ $counter -ge $MAX_FILES ]]; then 68 | break 69 | fi 70 | done 71 | -------------------------------------------------------------------------------- /clickhouse/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | data.commit.collection AS event, 6 | count() AS count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | SELECT 15 | data.commit.collection AS event, 16 | count() AS count, 17 | uniqExact(data.did) AS users 18 | FROM bluesky 19 | WHERE data.kind = 'commit' 20 | AND data.commit.operation = 'create' 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | SELECT 28 | data.commit.collection AS event, 29 | toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, 30 | count() AS count 31 | FROM bluesky 32 | WHERE data.kind = 'commit' 33 | AND data.commit.operation = 'create' 34 | AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] 35 | GROUP BY event, hour_of_day 36 | ORDER BY hour_of_day, event; 37 | 38 | ------------------------------------------------------------------------------------------------------------------------ 39 | -- Q4 - top 3 post veterans 40 | ------------------------------------------------------------------------------------------------------------------------ 41 | SELECT 42 | data.did::String as user_id, 43 | min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts 44 | FROM bluesky 45 | WHERE data.kind = 'commit' 46 | AND data.commit.operation = 'create' 47 | AND data.commit.collection = 'app.bsky.feed.post' 48 | GROUP BY user_id 49 | ORDER BY first_post_ts ASC 50 | LIMIT 3; 51 | 52 | ------------------------------------------------------------------------------------------------------------------------ 53 | -- Q5 - top 3 users with longest 
activity 54 | ------------------------------------------------------------------------------------------------------------------------ 55 | SELECT 56 | data.did::String as user_id, 57 | date_diff( 58 | 'milliseconds', 59 | min(fromUnixTimestamp64Micro(data.time_us)), 60 | max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span 61 | FROM bluesky 62 | WHERE data.kind = 'commit' 63 | AND data.commit.operation = 'create' 64 | AND data.commit.collection = 'app.bsky.feed.post' 65 | GROUP BY user_id 66 | ORDER BY activity_span DESC 67 | LIMIT 3; 68 | -------------------------------------------------------------------------------- /doris/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | 10 | # Arguments 11 | DATA_DIRECTORY="$1" 12 | DB_NAME="$2" 13 | TABLE_NAME="$3" 14 | MAX_FILES="$4" 15 | SUCCESS_LOG="$5" 16 | ERROR_LOG="$6" 17 | 18 | # Validate arguments 19 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 20 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; } 21 | 22 | # Create a temporary directory for uncompressed files 23 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 24 | trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit 25 | 26 | # Load data 27 | counter=0 28 | start=0 29 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do 30 | echo "Processing file: $file" 31 | num=$(echo "$file" | sed -n 's/[^0-9]*\([0-9]\+\).*/\1/p') 32 | if [ "$num" -le "$start" ]; then 33 | continue 34 | fi 35 | 36 | # Uncompress the file into the TEMP_DIR 37 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 38 | gunzip -c "$file" > "$uncompressed_file" 39 | 40 | if [[ $? -ne 0 ]]; then 41 | echo "Error: Failed to uncompress $file" >> "$ERROR_LOG" 42 | continue 43 | fi 44 | MAX_ATTEMPT=10 45 | attempt=0 46 | while [ $attempt -lt $MAX_ATTEMPT ] 47 | do 48 | # Attempt the import 49 | http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "max_filter_ratio: 0.1" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load) 50 | response_body="$(cat /tmp/curl_body)" 51 | response_status="$(cat /tmp/curl_body | jq -r '.Status')" 52 | echo $response_status 53 | if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then 54 | if [ "$response_status" = "Success" ] 55 | then 56 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG" 57 | rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing 58 | attempt=$((MAX_ATTEMPT)) 59 | else 60 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG" 61 | attempt=$((attempt + 1)) 62 | sleep 2 63 | fi 64 | else 65 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. 
Response: $response_body" >> "$ERROR_LOG" 66 | attempt=$((attempt + 1)) 67 | sleep 2 68 | fi 69 | done 70 | 71 | counter=$((counter + 1)) 72 | if [[ $counter -ge $MAX_FILES ]]; then 73 | break 74 | fi 75 | done 76 | -------------------------------------------------------------------------------- /elasticsearch/config/filebeat.yml: -------------------------------------------------------------------------------- 1 | # ============================== Filebeat inputs =============================== 2 | filebeat.registry.flush: 5s 3 | filebeat.inputs: 4 | 5 | - type: filestream 6 | id: bluesky-events 7 | 8 | paths: 9 | - 10 | parsers: 11 | - ndjson: 12 | target: "" 13 | # ============================== Filebeat modules ============================== 14 | 15 | filebeat.config.modules: 16 | # Glob pattern for configuration loading 17 | path: ${path.config}/modules.d/*.yml 18 | 19 | # Set to true to enable config reloading 20 | reload.enabled: false 21 | 22 | # ======================= Elasticsearch template setting ======================= 23 | 24 | setup.template.enabled: false 25 | 26 | # ================================== Outputs =================================== 27 | 28 | # Configure what output to use when sending the data collected by the beat. 29 | 30 | # ---------------------------- Elasticsearch Output ---------------------------- 31 | 32 | output.elasticsearch: 33 | # Array of hosts to connect to. 34 | hosts: ["https://localhost:9200"] 35 | 36 | # Performance preset - one of "balanced", "throughput", "scale", 37 | # "latency", or "custom". 38 | preset: throughput 39 | compression_level: 1 40 | idle_connection_timeout: 30s 41 | # Protocol - either `http` (default) or `https`. 42 | protocol: "https" 43 | index: "" 44 | # Authentication credentials - either API key or username/password. 
--------------------------------------------------------------------------------
/postgresql/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if the required arguments are provided
4 | if [[ $# -lt 6 ]]; then
5 |     echo "Usage: $0 <DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
6 |     exit 1
7 | fi
8 | 
9 | # Arguments
10 | DIRECTORY="$1"
11 | DB_NAME="$2"
12 | TABLE_NAME="$3"
13 | MAX_FILES="$4"
14 | SUCCESS_LOG="$5"
15 | ERROR_LOG="$6"
16 | PSQL_CMD="sudo -u postgres psql -d $DB_NAME"
17 | 
18 | # Validate that MAX_FILES is a number
19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then
20 |     echo "Error: MAX_FILES must be a positive integer."
21 |     exit 1
22 | fi
23 | 
24 | # Ensure the log files exist
25 | touch "$SUCCESS_LOG" "$ERROR_LOG"
26 | 
27 | # Create a temporary directory in /var/tmp and ensure it's accessible
28 | TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX)
29 | chmod 777 "$TEMP_DIR" # Allow access for all users
30 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit
31 | 
32 | # Counter to track processed files
33 | counter=0
34 | 
35 | # Loop through each .json.gz file in the directory
36 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do
37 |     if [[ -f "$file" ]]; then
38 |         echo "Processing $file..."
39 |         counter=$((counter + 1))
40 | 
41 |         # Uncompress the file into the temporary directory
42 |         uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
43 |         gunzip -c "$file" > "$uncompressed_file"
44 | 
45 |         # Check if uncompression was successful
46 |         if [[ $? -ne 0 ]]; then
47 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG"
48 |             continue
49 |         fi
50 | 
51 |         # Preprocess the file to remove null characters
52 |         cleaned_file="$TEMP_DIR/$(basename "${uncompressed_file%.json}_cleaned.json")"
53 |         sed 's/\\u0000//g' "$uncompressed_file" > "$cleaned_file"
54 | 
55 |         # Grant read permissions for the postgres user
56 |         chmod 644 "$cleaned_file"
57 | 
58 |         # Import the cleaned JSON file into PostgreSQL
59 |         $PSQL_CMD -c "\COPY $TABLE_NAME FROM '$cleaned_file' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');"
60 |         import_status=$?
61 | 
62 |         # Check if the import was successful
63 |         if [[ $import_status -eq 0 ]]; then
64 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $cleaned_file into PostgreSQL." >> "$SUCCESS_LOG"
65 |             # Delete both the uncompressed and cleaned files after successful processing
66 |             rm -f "$uncompressed_file" "$cleaned_file"
67 |         else
68 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $cleaned_file. See errors above." >> "$ERROR_LOG"
69 |             # Keep the files for debugging purposes
70 |         fi
71 | 
72 |         # Stop processing if the max number of files is reached
73 |         if [[ $counter -ge $MAX_FILES ]]; then
74 |             echo "Processed maximum number of files: $MAX_FILES"
75 |             break
76 |         fi
77 |     else
78 |         echo "No .json.gz files found in the directory."
79 |     fi
80 | done
81 | 
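A minimal usage sketch for the PostgreSQL loader above, with hypothetical argument values. The \COPY options (csv format with control characters as quote, delimiter, and escape) load each newline-delimited JSON document as a single value, so a row count afterwards is a quick sanity check; the database and table name bluesky below are assumptions:

    # Hypothetical example run: import the first 10 files, then count the loaded rows.
    ./load_data.sh ~/data/bluesky bluesky bluesky 10 success.log error.log
    sudo -u postgres psql -d bluesky -c "SELECT count(*) FROM bluesky;"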
--------------------------------------------------------------------------------