├── .github └── workflows │ └── generate-results.yml ├── .gitignore ├── CNAME ├── LICENSE ├── README.md ├── _files_gz ├── main.sh ├── results │ ├── _files_bluesky_gz_1000m.json │ ├── _files_bluesky_gz_100m.json │ ├── _files_bluesky_gz_10m.json │ └── _files_bluesky_gz_1m.json └── total_size.sh ├── _files_json ├── load_data.sh ├── main.sh ├── results │ ├── _files_bluesky_json_1000m.json │ ├── _files_bluesky_json_100m.json │ ├── _files_bluesky_json_10m.json │ └── _files_bluesky_json_1m.json └── total_size.sh ├── _files_lz4 ├── load_data.sh ├── main.sh ├── results │ ├── _files_bluesky_lz4_1000m.json │ ├── _files_bluesky_lz4_100m.json │ ├── _files_bluesky_lz4_10m.json │ └── _files_bluesky_lz4_1m.json └── total_size.sh ├── _files_zstd ├── load_data.sh ├── main.sh ├── results │ ├── _files_bluesky_zstd_1000m.json │ ├── _files_bluesky_zstd_100m.json │ ├── _files_bluesky_zstd_10m.json │ └── _files_bluesky_zstd_1m.json └── total_size.sh ├── clickhouse ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── data_size.sh ├── ddl_lz4.sql ├── ddl_zstd.sql ├── index_size.sh ├── index_usage.sh ├── install.sh ├── load_data.sh ├── main.sh ├── physical_query_plans.sh ├── queries.sql ├── queries_formatted.sql ├── query_results.sh ├── results │ ├── _index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_lz4.index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_lz4.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_lz4.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_1m_lz4.index_usage │ │ └── _m6i.8xlarge_bluesky_1m_zstd.index_usage │ ├── _physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1000m_lz4.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_100m_lz4.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_10m_lz4.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1m_lz4.physical_query_plans │ │ └── _m6i.8xlarge_bluesky_1m_zstd.physical_query_plans │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky_1m_lz4.query_results │ ├── m6i.8xlarge_bluesky_1000m_lz4.json │ ├── m6i.8xlarge_bluesky_1000m_zstd.json │ ├── m6i.8xlarge_bluesky_100m_lz4.json │ ├── m6i.8xlarge_bluesky_100m_zstd.json │ ├── m6i.8xlarge_bluesky_10m_lz4.json │ ├── m6i.8xlarge_bluesky_10m_zstd.json │ ├── m6i.8xlarge_bluesky_1m_lz4.json │ └── m6i.8xlarge_bluesky_1m_zstd.json ├── run_queries.sh └── total_size.sh ├── copy_data.sh ├── duckdb ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── ddl.sql ├── install.sh ├── load_data.sh ├── main.sh ├── physical_query_plans.sh ├── queries.sql ├── queries_formatted.sql ├── query_results.sh ├── results │ ├── _physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1000m.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_100m.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_10m.physical_query_plans │ │ └── _m6i.8xlarge_bluesky_1m.physical_query_plans │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky_1m.query_results │ ├── m6i.8xlarge_bluesky_1000m.errors │ ├── m6i.8xlarge_bluesky_1000m.json │ ├── m6i.8xlarge_bluesky_100m.json │ ├── m6i.8xlarge_bluesky_10m.json │ └── m6i.8xlarge_bluesky_1m.json ├── run_queries.sh └── total_size.sh ├── elasticsearch ├── benchmark.sh ├── config │ ├── elasticsearch.yml │ ├── filebeat.yml │ ├── ilm.json │ ├── index_template_no_source_best_compression.json │ ├── 
index_template_no_source_default_compression.json │ ├── index_template_source_best_compression.json │ ├── index_template_source_default_compression.json │ └── jvm.options ├── count.sh ├── create_and_load.sh ├── install.sh ├── load_data.sh ├── main.sh ├── queries.txt ├── queries_formatted.txt ├── query_results.sh ├── results │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky-no_source_best_compression-1m.query_results │ ├── m6i.8xlarge_bluesky_no_source_1000m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_1000m_default_compression.json │ ├── m6i.8xlarge_bluesky_no_source_100m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_100m_default_compression.json │ ├── m6i.8xlarge_bluesky_no_source_10m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_10m_default_compression.json │ ├── m6i.8xlarge_bluesky_no_source_1m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_1m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_1000m_best_compression.json │ ├── m6i.8xlarge_bluesky_source_1000m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_100m_best_compression.json │ ├── m6i.8xlarge_bluesky_source_100m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_10m_best_compression.json │ ├── m6i.8xlarge_bluesky_source_10m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_1m_best_compression.json │ └── m6i.8xlarge_bluesky_source_1m_default_compression.json ├── run_queries.sh └── total_size.sh ├── favicon.png ├── generate-results.sh ├── index.html ├── mongodb ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── data_size.sh ├── ddl_snappy.js ├── ddl_zstd.js ├── index_size.sh ├── index_usage.sh ├── install.sh ├── load_data.sh ├── main.sh ├── queries.js ├── queries_formatted.js ├── query_results.sh ├── results │ ├── _index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_1m_snappy.index_usage │ │ └── _m6i.8xlarge_bluesky_1m_zstd.index_usage │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky_1m_snappy.query_results │ ├── m6i.8xlarge_bluesky_1000m_snappy.json │ ├── m6i.8xlarge_bluesky_1000m_zstd.json │ ├── m6i.8xlarge_bluesky_100m_snappy.json │ ├── m6i.8xlarge_bluesky_100m_zstd.json │ ├── m6i.8xlarge_bluesky_10m_snappy.json │ ├── m6i.8xlarge_bluesky_10m_zstd.json │ ├── m6i.8xlarge_bluesky_1m_snappy.json │ └── m6i.8xlarge_bluesky_1m_zstd.json ├── results_without_covered_index_scans │ ├── _index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_1m_snappy.index_usage │ │ └── _m6i.8xlarge_bluesky_1m_zstd.index_usage │ ├── m6i.8xlarge_bluesky_1000m_snappy.json │ ├── m6i.8xlarge_bluesky_1000m_zstd.json │ ├── m6i.8xlarge_bluesky_100m_snappy.json │ ├── m6i.8xlarge_bluesky_100m_zstd.json │ ├── m6i.8xlarge_bluesky_10m_snappy.json │ ├── m6i.8xlarge_bluesky_10m_zstd.json │ ├── m6i.8xlarge_bluesky_1m_snappy.json │ └── m6i.8xlarge_bluesky_1m_zstd.json ├── run_queries.sh └── total_size.sh └── postgresql ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── data_size.sh ├── 
ddl_lz4.sql ├── ddl_pglz.sql ├── index_size.sh ├── index_usage.sh ├── install.sh ├── load_data.sh ├── main.sh ├── queries.sql ├── queries_formatted.sql ├── query_results.sh ├── results ├── _index_usage │ ├── _m6i.8xlarge_bluesky_1000m_lz4.index_usage │ ├── _m6i.8xlarge_bluesky_1000m_pglz.index_usage │ ├── _m6i.8xlarge_bluesky_100m_lz4.index_usage │ ├── _m6i.8xlarge_bluesky_100m_pglz.index_usage │ ├── _m6i.8xlarge_bluesky_10m_lz4.index_usage │ ├── _m6i.8xlarge_bluesky_10m_pglz.index_usage │ ├── _m6i.8xlarge_bluesky_1m_lz4.index_usage │ └── _m6i.8xlarge_bluesky_1m_pglz.index_usage ├── _query_results │ └── _m6i.8xlarge_bluesky_1m_lz4.query_results ├── m6i.8xlarge_bluesky_1000m_lz4.json ├── m6i.8xlarge_bluesky_1000m_pglz.json ├── m6i.8xlarge_bluesky_100m_lz4.json ├── m6i.8xlarge_bluesky_100m_pglz.json ├── m6i.8xlarge_bluesky_10m_lz4.json ├── m6i.8xlarge_bluesky_10m_pglz.json ├── m6i.8xlarge_bluesky_1m_lz4.json └── m6i.8xlarge_bluesky_1m_pglz.json ├── run_queries.sh └── total_size.sh /.github/workflows/generate-results.yml: -------------------------------------------------------------------------------- 1 | name: "Generate index.html" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | env: 14 | CI_COMMIT_MESSAGE: "[bot] update index.html" 15 | CI_COMMIT_AUTHOR: github 16 | steps: 17 | - uses: actions/checkout@v3 18 | - if: github.event.commits[0].message != env.CI_COMMIT_MESSAGE 19 | run: | 20 | bash generate-results.sh 21 | 22 | git config --global user.name "${{ env.CI_COMMIT_AUTHOR }}" 23 | git config --global user.email "${{ env.CI_COMMIT_AUTHOR }}@users.noreply.github.com" 24 | 25 | git add -A 26 | if git status | grep -q modified 27 | then 28 | git commit -m "${{ env.CI_COMMIT_MESSAGE }}" 29 | git push 30 | fi 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .idea 3 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | jsonbench.com -------------------------------------------------------------------------------- /_files_gz/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define prefix for output files 10 | OUTPUT_PREFIX="${2:-_files_gz}" 11 | 12 | # Check if the data directory exists 13 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 14 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
15 | exit 1 16 | fi 17 | 18 | 19 | # 1m 20 | ./total_size.sh "$DATA_DIRECTORY" 1 | tee "${OUTPUT_PREFIX}_1m.total_size" 21 | 22 | # 10m 23 | ./total_size.sh "$DATA_DIRECTORY" 10 | tee "${OUTPUT_PREFIX}_10m.total_size" 24 | 25 | # 100m 26 | ./total_size.sh "$DATA_DIRECTORY" 100 | tee "${OUTPUT_PREFIX}_100m.total_size" 27 | 28 | # 1000m 29 | ./total_size.sh "$DATA_DIRECTORY" 1000 | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "gz", 13 | "total_size": 134117979655, 14 | "total_size_readable": "134.12 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "gz", 13 | "total_size": 13372936569, 14 | "total_size_readable": "13.37 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "gz", 13 | "total_size": 1354902507, 14 | "total_size_readable": "1.35 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": "gz", 13 | "total_size": 135176827, 14 | "total_size_readable": "135.17 MB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | N="$2" 12 | 13 | # Validate the data directory 14 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 15 | echo "Error: Directory '$DATA_DIRECTORY' does not exist." 16 | exit 1 17 | fi 18 | 19 | # Validate N is a positive integer 20 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 21 | echo "Error: N must be a positive integer." 
22 | exit 1 23 | fi 24 | 25 | # Get the first N files sorted by filename and calculate their total size 26 | TOTAL_SIZE=$(ls -1 "$DATA_DIRECTORY" | sort | head -n "$N" | while read -r file; do 27 | filepath="$DATA_DIRECTORY/$file" 28 | if [[ -f "$filepath" ]]; then 29 | stat --format="%s" "$filepath" 30 | fi 31 | done | awk '{sum += $1} END {print sum}') 32 | 33 | # Output the total size in bytes 34 | echo $TOTAL_SIZE -------------------------------------------------------------------------------- /_files_json/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Get the sorted list of .json.gz files and extract the first N 33 | count=0 34 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 35 | if [[ $count -ge $N ]]; then 36 | break 37 | fi 38 | 39 | echo "Processing $file..." 40 | gzip -dkc "$file" > "$TARGET_DIRECTORY/$(basename "${file%.gz}")" # Extract to target directory 41 | count=$((count + 1)) 42 | done 43 | 44 | echo "Extraction of $count files completed. Extracted files are in '$TARGET_DIRECTORY'." -------------------------------------------------------------------------------- /_files_json/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_json 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_json}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "none", 13 | "total_size": 482108809691, 14 | "total_size_readable": "482.11 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "none", 13 | "total_size": 47813179260, 14 | "total_size_readable": "47.81 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "none", 13 | "total_size": 4858741288, 14 | "total_size_readable": "4.86 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": "none", 13 | "total_size": 
480778277,
14 |     "total_size_readable": "480.78 MB"
15 | }
16 | 
--------------------------------------------------------------------------------
/_files_json/total_size.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if the required argument is provided
4 | if [[ $# -lt 1 ]]; then
5 |     echo "Usage: $0 <directory>"
6 |     exit 1
7 | fi
8 | 
9 | # Argument
10 | DIRECTORY="$1"
11 | 
12 | # Check if the directory exists
13 | if [[ ! -d "$DIRECTORY" ]]; then
14 |     echo "Error: Directory '$DIRECTORY' does not exist."
15 |     exit 1
16 | fi
17 | 
18 | # Get the total size in bytes and suppress the directory name
19 | du -sb "$DIRECTORY" | awk '{print $1}'
--------------------------------------------------------------------------------
/_files_lz4/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if required arguments are provided
4 | if [[ $# -lt 3 ]]; then
5 |     echo "Usage: $0 <data_directory> <target_directory> <N>"
6 |     exit 1
7 | fi
8 | 
9 | # Arguments
10 | DATA_DIRECTORY="$1"
11 | TARGET_DIRECTORY="$2"
12 | N="$3"
13 | 
14 | # Validate the source directory
15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then
16 |     echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
17 |     exit 1
18 | fi
19 | 
20 | # Validate the target directory
21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then
22 |     echo "Error: Target directory '$TARGET_DIRECTORY' does not exist."
23 |     exit 1
24 | fi
25 | 
26 | # Validate N is a positive integer
27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then
28 |     echo "Error: N must be a positive integer."
29 |     exit 1
30 | fi
31 | 
32 | # Create a temporary directory inside the current directory
33 | TEMP_DIR="./temp_extraction"
34 | if [[ -d "$TEMP_DIR" ]]; then
35 |     echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..."
36 |     rm -rf "$TEMP_DIR"
37 | fi
38 | 
39 | mkdir -p "$TEMP_DIR"
40 | 
41 | # Trap to ensure cleanup of the temporary directory
42 | trap "rm -rf $TEMP_DIR" EXIT
43 | 
44 | # Process the first N files
45 | count=0
46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do
47 |     if [[ $count -ge $N ]]; then
48 |         break
49 |     fi
50 | 
51 |     echo "Processing $file..."
52 | 
53 |     # Define paths for the temporary extracted file and compressed file
54 |     extracted_file="$TEMP_DIR/$(basename "${file%.gz}")"
55 |     compressed_file="$TEMP_DIR/$(basename "${file%.gz}.lz4")"
56 | 
57 |     # Extract the .json.gz file into the temporary directory
58 |     gzip -c -d "$file" > "$extracted_file"
59 |     if [[ $? -ne 0 ]]; then
60 |         echo "Error: Failed to extract $file to $extracted_file"
61 |         continue
62 |     fi
63 | 
64 |     # Compress the extracted file with lz4
65 |     lz4 "$extracted_file" "$compressed_file"
66 |     if [[ $? -ne 0 ]]; then
67 |         echo "Error: Failed to compress $extracted_file"
68 |         continue
69 |     fi
70 | 
71 |     # Copy the .lz4 file to the target directory
72 |     cp "$compressed_file" "$TARGET_DIRECTORY/"
73 |     if [[ $? -ne 0 ]]; then
74 |         echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY"
75 |         continue
76 |     fi
77 | 
78 |     count=$((count + 1))
79 | done
80 | 
81 | # Cleanup (done automatically by the trap)
82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'."
83 | echo "Temporary directory '$TEMP_DIR' has been deleted."
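
A minimal usage sketch for the script above, assuming the same default layout that _files_lz4/main.sh (the next file) wires in: ~/data/bluesky as the source of .json.gz files and a per-size target subdirectory. Both paths are illustrative, not requirements of the script.

    # Recompress the first 10 source files (roughly the 10m-document slice) from gzip to lz4
    mkdir -p ~/data/bluesky_lz4/10m
    ./load_data.sh ~/data/bluesky ~/data/bluesky_lz4/10m 10

    # Then record the resulting on-disk size in bytes
    ./total_size.sh ~/data/bluesky_lz4/10m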
-------------------------------------------------------------------------------- /_files_lz4/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_lz4 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_lz4}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "lz4", 13 | "total_size": 206562787263, 14 | "total_size_readable": "206.56 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "lz4", 13 | "total_size": 20591959778, 14 | "total_size_readable": "20.59 GB" 15 | } 16 | 
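
The *_readable fields in these result files appear to be the raw byte counts rendered in decimal (1000-based) units, e.g. 20591959778 bytes becomes "20.59 GB". A small sketch of that conversion, assuming decimal units and two decimal places; how the benchmark actually derives these strings is not shown in this listing and may differ in rounding details.

    bytes=20591959778
    awk -v b="$bytes" 'BEGIN {
        split("B KB MB GB TB", unit, " ")
        i = 1
        while (b >= 1000 && i < 5) { b /= 1000; i++ }
        printf "%.2f %s\n", b, unit[i]   # prints: 20.59 GB
    }'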
-------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "lz4", 13 | "total_size": 2084888024, 14 | "total_size_readable": "2.08 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": "lz4", 13 | "total_size": 208385826, 14 | "total_size_readable": "208.39 MB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_lz4/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /_files_zstd/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Create a temporary directory inside the current directory 33 | TEMP_DIR="./temp_extraction" 34 | if [[ -d "$TEMP_DIR" ]]; then 35 | echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..." 36 | rm -rf "$TEMP_DIR" 37 | fi 38 | 39 | mkdir -p "$TEMP_DIR" 40 | 41 | # Trap to ensure cleanup of the temporary directory 42 | trap "rm -rf $TEMP_DIR" EXIT 43 | 44 | # Process the first N files 45 | count=0 46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 47 | if [[ $count -ge $N ]]; then 48 | break 49 | fi 50 | 51 | echo "Processing $file..." 
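    # The steps below stage each file through $TEMP_DIR: gunzip the .json.gz to a
    # temporary .json, recompress it with "zstd -1" (the level reported as
    # "zstd(1)" in the result files), and copy the resulting .zst into the target
    # directory. A single pipeline such as
    #     gzip -dc "$file" | zstd -1 -o "$compressed_file"
    # would avoid the intermediate JSON file; the two-step form used here keeps a
    # separate error check per stage.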
52 | 53 | # Define paths for the temporary extracted file and compressed file 54 | extracted_file="$TEMP_DIR/$(basename "${file%.gz}")" 55 | compressed_file="$TEMP_DIR/$(basename "${file%.gz}.zst")" 56 | 57 | # Extract the .json.gz file into the temporary directory 58 | gzip -c -d "$file" > "$extracted_file" 59 | if [[ $? -ne 0 ]]; then 60 | echo "Error: Failed to extract $file to $extracted_file" 61 | continue 62 | fi 63 | 64 | # Compress the extracted file with zstd 65 | zstd -1 "$extracted_file" -o "$compressed_file" 66 | if [[ $? -ne 0 ]]; then 67 | echo "Error: Failed to compress $extracted_file" 68 | continue 69 | fi 70 | 71 | # Copy the .zst file to the target directory 72 | cp "$compressed_file" "$TARGET_DIRECTORY/" 73 | if [[ $? -ne 0 ]]; then 74 | echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY" 75 | continue 76 | fi 77 | 78 | count=$((count + 1)) 79 | done 80 | 81 | # Cleanup (done automatically by the trap) 82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'." 83 | echo "Temporary directory '$TEMP_DIR' has been deleted." -------------------------------------------------------------------------------- /_files_zstd/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_zstd 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_zstd}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "zstd(1)", 13 | "total_size": 123797963671, 14 | "total_size_readable": "123.80 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "zstd(1)", 13 | "total_size": 12245368182, 14 | "total_size_readable": "12.25 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "zstd(1)", 13 | "total_size": 1269817486, 14 | "total_size_readable": "1.27 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": 
"zstd(1)", 13 | "total_size": 126734406, 14 | "total_size_readable": "126.73 MB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /clickhouse/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE_RUNTIMES="$2" 12 | RESULT_FILE_MEMORY_USAGE="$3" 13 | 14 | # Construct the query log file name using $DB_NAME 15 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 16 | 17 | # Print the database name 18 | echo "Running queries on database: $DB_NAME" 19 | 20 | # Run queries and log the output 21 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 22 | 23 | # Process the query log and prepare the result 24 | RUNTIME_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 1' | awk '{ 25 | if (NR % 3 == 1) { printf "["; } 26 | printf $1; 27 | if (NR % 3 == 0) { 28 | print "],"; 29 | } else { 30 | printf ", "; 31 | } 32 | }') 33 | 34 | MEMORY_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 0' | awk '{ 35 | if (NR % 3 == 1) { printf "["; } 36 | printf $1; 37 | if (NR % 3 == 0) { 38 | print "],"; 39 | } else { 40 | printf ", "; 41 | } 42 | }') 43 | 44 | # Output the runtime results 45 | echo "$RUNTIME_RESULTS" > "$RESULT_FILE_RUNTIMES" 46 | echo "Runtime results written to $RESULT_FILE_RUNTIMES" 47 | 48 | # Output the memory usage results 49 | echo "$MEMORY_RESULTS" > "$RESULT_FILE_MEMORY_USAGE" 50 | echo "Memory usage results written to $RESULT_FILE_MEMORY_USAGE" -------------------------------------------------------------------------------- /clickhouse/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --database="$DB_NAME" --query "SELECT count() FROM '$TABLE_NAME';" -------------------------------------------------------------------------------- /clickhouse/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | 24 | # Create database 25 | clickhouse-client --query "CREATE DATABASE IF NOT EXISTS $DB_NAME" 26 | 27 | # Execute DDL 28 | clickhouse-client --database="$DB_NAME" --enable_json_type=1 --multiquery < "$DDL_FILE" 29 | 30 | # Load data 31 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 32 | 33 | echo "Script completed successfully." -------------------------------------------------------------------------------- /clickhouse/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --query "SELECT sum(data_compressed_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" -------------------------------------------------------------------------------- /clickhouse/ddl_lz4.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | `data` JSON( 4 | kind LowCardinality(String), 5 | commit.operation LowCardinality(String), 6 | commit.collection LowCardinality(String), 7 | did String, 8 | time_us UInt64) 9 | ) 10 | ORDER BY ( 11 | data.kind, 12 | data.commit.operation, 13 | data.commit.collection, 14 | data.did, 15 | fromUnixTimestamp64Micro(data.time_us)); -------------------------------------------------------------------------------- /clickhouse/ddl_zstd.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | `data` JSON( 4 | kind LowCardinality(String), 5 | commit.operation LowCardinality(String), 6 | commit.collection LowCardinality(String), 7 | did String, 8 | time_us UInt64) CODEC(ZSTD(1)) 9 | ) 10 | ORDER BY ( 11 | data.kind, 12 | data.commit.operation, 13 | data.commit.collection, 14 | data.did, 15 | fromUnixTimestamp64Micro(data.time_us)); -------------------------------------------------------------------------------- /clickhouse/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --query "SELECT sum(primary_key_size) + sum(marks_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" -------------------------------------------------------------------------------- /clickhouse/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Index usage for query Q$QUERY_NUM:" 19 | echo 20 | 21 | clickhouse-client --database="$DB_NAME" --query="EXPLAIN indexes=1 $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 
--------------------------------------------------------------------------------
/clickhouse/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | curl https://clickhouse.com/ | sh
4 | sudo ./clickhouse install --noninteractive
5 | sudo clickhouse start
6 | 
7 | while true
8 | do
9 |     clickhouse-client --query "SELECT 1" && break
10 |     sleep 1
11 | done
12 | 
13 | 
--------------------------------------------------------------------------------
/clickhouse/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if the required arguments are provided
4 | if [[ $# -lt 6 ]]; then
5 |     echo "Usage: $0 <data_directory> <db_name> <table_name> <max_files> <success_log> <error_log>"
6 |     exit 1
7 | fi
8 | 
9 | 
10 | # Arguments
11 | DATA_DIRECTORY="$1"
12 | DB_NAME="$2"
13 | TABLE_NAME="$3"
14 | MAX_FILES="$4"
15 | SUCCESS_LOG="$5"
16 | ERROR_LOG="$6"
17 | 
18 | # Validate arguments
19 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
20 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }
21 | 
22 | 
23 | # Create a temporary directory for uncompressed files
24 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX)
25 | trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit
26 | 
27 | # Load data
28 | counter=0
29 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
30 |     echo "Processing file: $file"
31 | 
32 |     # Uncompress the file into the TEMP_DIR
33 |     uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
34 |     gunzip -c "$file" > "$uncompressed_file"
35 | 
36 |     if [[ $? -ne 0 ]]; then
37 |         echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
38 |         continue
39 |     fi
40 | 
41 |     # Attempt the first import
42 |     clickhouse-client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0 FORMAT JSONAsObject" < "$uncompressed_file"
43 |     first_attempt=$?
44 | 
45 |     # Check if the first import was successful
46 |     if [[ $first_attempt -eq 0 ]]; then
47 |         echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG"
48 |         rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
49 |     else
50 |         echo "[$(date '+%Y-%m-%d %H:%M:%S')] First attempt failed for $file. Trying again..." >> "$ERROR_LOG"
51 | 
52 |         echo "Processing $file... again..."
53 |         # Attempt the second import with a different command
54 |         clickhouse-client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0, input_format_allow_errors_num = 1_000_000_000, input_format_allow_errors_ratio=1 FORMAT JSONAsObject" < "$uncompressed_file"
55 |         second_attempt=$?
56 | 
57 |         # Check if the second import was successful
58 |         if [[ $second_attempt -eq 0 ]]; then
59 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file on second attempt." >> "$SUCCESS_LOG"
60 |             rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
61 |         else
62 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Both attempts failed for $file. Giving up." >> "$ERROR_LOG"
63 |         fi
64 |     fi
65 | 
66 |     counter=$((counter + 1))
67 |     if [[ $counter -ge $MAX_FILES ]]; then
68 |         break
69 |     fi
70 | done
71 | 
72 | echo "Script completed successfully."
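
main.sh (the next file) drives this loader indirectly through create_and_load.sh. A hand-run equivalent for a single input file with the LZ4 DDL, using the defaults from main.sh (~/data/bluesky, success.log, error.log), is sketched below; the database name follows the bluesky_<size>m_<codec> pattern used there and is an example.

    ./create_and_load.sh bluesky_1m_lz4 bluesky ddl_lz4.sql ~/data/bluesky 1 success.log error.log

    # which creates the database, applies ddl_lz4.sql, and then calls:
    ./load_data.sh ~/data/bluesky bluesky_1m_lz4 bluesky 1 success.log error.log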
-------------------------------------------------------------------------------- /clickhouse/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local suffix=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky_${size}m_${suffix}" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.total_size" 43 | ./data_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.data_size" 44 | ./index_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.index_size" 45 | ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count" 46 | #./query_results.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.query_results" 47 | ./index_usage.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.index_usage" 48 | ./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans" 49 | ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage" 50 | } 51 | 52 | case $choice in 53 | 2) 54 | benchmark 10 lz4 55 | benchmark 10 zstd 56 | ;; 57 | 3) 58 | benchmark 100 lz4 59 | benchmark 100 zstd 60 | ;; 61 | 4) 62 | benchmark 1000 lz4 63 | benchmark 1000 zstd 64 | ;; 65 | 5) 66 | benchmark 1 lz4 67 | benchmark 1 zstd 68 | benchmark 10 lz4 69 | benchmark 10 zstd 70 | benchmark 100 lz4 71 | benchmark 100 zstd 72 | benchmark 1000 lz4 73 | benchmark 1000 zstd 74 | ;; 75 | *) 76 | benchmark 1 lz4 77 | benchmark 1 zstd 78 | ;; 79 | esac -------------------------------------------------------------------------------- /clickhouse/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo 
"------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | 21 | clickhouse-client --database="$DB_NAME" --query="EXPLAIN PIPELINE $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /clickhouse/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data.commit.collection AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data.commit.collection AS event, count() AS count, uniqExact(data.did) AS users FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data.commit.collection AS event, toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, count() AS count FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data.did::String as user_id, min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data.did::String as user_id, date_diff( 'milliseconds', min(fromUnixTimestamp64Micro(data.time_us)), max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; -------------------------------------------------------------------------------- /clickhouse/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | data.commit.collection AS event, 6 | count() AS count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | SELECT 15 | data.commit.collection AS event, 16 | count() AS count, 17 | uniqExact(data.did) AS users 18 | FROM bluesky 19 | WHERE data.kind = 'commit' 20 | AND data.commit.operation = 'create' 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | SELECT 28 | data.commit.collection AS event, 29 | toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, 30 | count() AS count 31 | FROM bluesky 32 | WHERE data.kind = 'commit' 33 | AND data.commit.operation = 
'create' 34 | AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] 35 | GROUP BY event, hour_of_day 36 | ORDER BY hour_of_day, event; 37 | 38 | ------------------------------------------------------------------------------------------------------------------------ 39 | -- Q4 - top 3 post veterans 40 | ------------------------------------------------------------------------------------------------------------------------ 41 | SELECT 42 | data.did::String as user_id, 43 | min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts 44 | FROM bluesky 45 | WHERE data.kind = 'commit' 46 | AND data.commit.operation = 'create' 47 | AND data.commit.collection = 'app.bsky.feed.post' 48 | GROUP BY user_id 49 | ORDER BY first_post_ts ASC 50 | LIMIT 3; 51 | 52 | ------------------------------------------------------------------------------------------------------------------------ 53 | -- Q5 - top 3 users with longest activity 54 | ------------------------------------------------------------------------------------------------------------------------ 55 | SELECT 56 | data.did::String as user_id, 57 | date_diff( 58 | 'milliseconds', 59 | min(fromUnixTimestamp64Micro(data.time_us)), 60 | max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span 61 | FROM bluesky 62 | WHERE data.kind = 'commit' 63 | AND data.commit.operation = 'create' 64 | AND data.commit.collection = 'app.bsky.feed.post' 65 | GROUP BY user_id 66 | ORDER BY activity_span DESC 67 | LIMIT 3; 68 | -------------------------------------------------------------------------------- /clickhouse/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | clickhouse-client --database="$DB_NAME" --format=PrettyCompactMonoBlock --query="$query" --progress 0 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /clickhouse/results/_query_results/_m6i.8xlarge_bluesky_1m_lz4.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | ┌─event──────────────────────┬──count─┐ 5 | 1. │ app.bsky.feed.like │ 448944 │ 6 | 2. │ app.bsky.graph.follow │ 360374 │ 7 | 3. │ app.bsky.feed.post │ 90816 │ 8 | 4. │ app.bsky.feed.repost │ 58540 │ 9 | 5. │ app.bsky.graph.block │ 14040 │ 10 | 6. │ app.bsky.actor.profile │ 11762 │ 11 | 7. │ app.bsky.graph.listitem │ 8103 │ 12 | 8. │ │ 5328 │ 13 | 9. │ app.bsky.graph.listblock │ 895 │ 14 | 10. │ app.bsky.graph.starterpack │ 405 │ 15 | 11. │ app.bsky.graph.list │ 356 │ 16 | 12. │ app.bsky.feed.threadgate │ 255 │ 17 | 13. │ app.bsky.feed.postgate │ 104 │ 18 | 14. │ app.bsky.feed.generator │ 74 │ 19 | 15. 
│ app.bsky.labeler.service │ 4 │ 20 | └────────────────────────────┴────────┘ 21 | ------------------------------------------------------------------------------------------------------------------------ 22 | Result for query Q2: 23 | 24 | ┌─event──────────────────────┬──count─┬──users─┐ 25 | 1. │ app.bsky.feed.like │ 444523 │ 117617 │ 26 | 2. │ app.bsky.graph.follow │ 337978 │ 63957 │ 27 | 3. │ app.bsky.feed.post │ 86812 │ 50464 │ 28 | 4. │ app.bsky.feed.repost │ 56993 │ 26581 │ 29 | 5. │ app.bsky.graph.block │ 13838 │ 5785 │ 30 | 6. │ app.bsky.graph.listitem │ 7568 │ 1078 │ 31 | 7. │ app.bsky.actor.profile │ 5337 │ 5337 │ 32 | 8. │ app.bsky.graph.listblock │ 860 │ 449 │ 33 | 9. │ app.bsky.graph.list │ 259 │ 218 │ 34 | 10. │ app.bsky.feed.threadgate │ 228 │ 196 │ 35 | 11. │ app.bsky.graph.starterpack │ 104 │ 101 │ 36 | 12. │ app.bsky.feed.postgate │ 101 │ 82 │ 37 | 13. │ app.bsky.feed.generator │ 10 │ 9 │ 38 | └────────────────────────────┴────────┴────────┘ 39 | ------------------------------------------------------------------------------------------------------------------------ 40 | Result for query Q3: 41 | 42 | ┌─event────────────────┬─hour_of_day─┬──count─┐ 43 | 1. │ app.bsky.feed.like │ 16 │ 444523 │ 44 | 2. │ app.bsky.feed.post │ 16 │ 86812 │ 45 | 3. │ app.bsky.feed.repost │ 16 │ 56993 │ 46 | └──────────────────────┴─────────────┴────────┘ 47 | ------------------------------------------------------------------------------------------------------------------------ 48 | Result for query Q4: 49 | 50 | ┌─user_id──────────────────────────┬──────────────first_post_ts─┐ 51 | 1. │ did:plc:yj3sjq3blzpynh27cumnp5ks │ 2024-11-21 16:25:49.000167 │ 52 | 2. │ did:plc:l5o3qjrmfztir54cpwlv2eme │ 2024-11-21 16:25:49.001905 │ 53 | 3. │ did:plc:s4bwqchfzm6gjqfeb6mexgbu │ 2024-11-21 16:25:49.003907 │ 54 | └──────────────────────────────────┴────────────────────────────┘ 55 | ------------------------------------------------------------------------------------------------------------------------ 56 | Result for query Q5: 57 | 58 | ┌─user_id──────────────────────────┬─activity_span─┐ 59 | 1. │ did:plc:tsyymlun4eqjuw7hqrhmwagd │ 813007 │ 60 | 2. │ did:plc:3ug235sfy2pz7cawmpsftb65 │ 811602 │ 61 | 3. 
│ did:plc:doxhhgtxqiv47tmcovpbcqai │ 811404 │ 62 | └──────────────────────────────────┴───────────────┘ -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1000m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999258, 14 | "data_compression": "lz4", 15 | "total_size": 159834794888, 16 | "total_size_readable": "159.83 GB", 17 | "data_size": 158959086724, 18 | "data_size_readable": "158.96 GB", 19 | "index_size": 875707939, 20 | "index_size_readable": "875.71 MB", 21 | "result": [ 22 | [0.401, 0.395, 0.389], 23 | [21.445, 5.542, 5.587], 24 | [34.907, 2.334, 2.351], 25 | [8.173, 0.563, 0.556], 26 | [8.178, 0.610, 0.573] 27 | ], 28 | "result_readable": [ 29 | "401.00 msec, 395.00 msec, 389.00 msec", 30 | "21.45 sec, 5.54 sec, 5.59 sec", 31 | "34.91 sec, 2.33 sec, 2.35 sec", 32 | "8.17 sec, 563.00 msec, 556.00 msec", 33 | "8.18 sec, 610.00 msec, 573.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [2418416, 1362125, 1934870], 37 | [4857932616, 4839715303, 4870295739], 38 | [148886114, 139232115, 140070546], 39 | [2500677923, 2573855740, 2556181514], 40 | [2694349228, 3086817972, 2623063375] 41 | ], 42 | "memory_usage_readable": [ 43 | "2.42 MB, 1.36 MB, 1.93 MB", 44 | "4.86 GB, 4.84 GB, 4.87 GB", 45 | "148.89 MB, 139.23 MB, 140.07 MB", 46 | "2.50 GB, 2.57 GB, 2.56 GB", 47 | "2.69 GB, 3.09 GB, 2.62 GB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1000m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999258, 14 | "data_compression": "zstd", 15 | "total_size": 99230044699, 16 | "total_size_readable": "99.23 GB", 17 | "data_size": 98355025153, 18 | "data_size_readable": "98.36 GB", 19 | "index_size": 875019321, 20 | "index_size_readable": "875.02 MB", 21 | "result": [ 22 | [0.405, 0.395, 0.394], 23 | [11.854, 5.632, 5.749], 24 | [28.899, 2.466, 2.488], 25 | [5.384, 0.609, 0.596], 26 | [5.405, 0.640, 0.637] 27 | ], 28 | "result_readable": [ 29 | "405.00 msec, 395.00 msec, 394.00 msec", 30 | "11.85 sec, 5.63 sec, 5.75 sec", 31 | "28.90 sec, 2.47 sec, 2.49 sec", 32 | "5.38 sec, 609.00 msec, 596.00 msec", 33 | "5.41 sec, 640.00 msec, 637.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [2779174, 1363689, 783683], 37 | [4823948860, 4871634449, 4918802526], 38 | [147631790, 139732345, 139867057], 39 | [2537597752, 2638168515, 2609289232], 40 | [2796756189, 2927194160, 2788741563] 41 | ], 42 | "memory_usage_readable": [ 43 | "2.78 MB, 1.36 MB, 783.68 KB", 44 | "4.82 GB, 4.87 GB, 4.92 GB", 45 | "147.63 MB, 139.73 MB, 139.87 MB", 46 | "2.54 GB, 2.64 GB, 2.61 GB", 47 | "2.80 GB, 2.93 GB, 2.79 GB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- 
/clickhouse/results/m6i.8xlarge_bluesky_100m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999968, 14 | "data_compression": "lz4", 15 | "total_size": 15793562624, 16 | "total_size_readable": "15.79 GB", 17 | "data_size": 15771339413, 18 | "data_size_readable": "15.77 GB", 19 | "index_size": 222231402, 20 | "index_size_readable": " 22.22 MB", 21 | "result": [ 22 | [0.054, 0.041, 0.041], 23 | [2.344, 0.724, 0.724], 24 | [2.114, 0.226, 0.228], 25 | [0.380, 0.087, 0.085], 26 | [0.648, 0.088, 0.092] 27 | ], 28 | "result_readable": [ 29 | "54.00 msec, 41.00 msec, 41.00 msec", 30 | "2.34 sec, 724.00 msec, 724.00 msec", 31 | "2.11 sec, 226.00 msec, 228.00 msec", 32 | "380.00 msec, 87.00 msec, 85.00 msec", 33 | "648.00 msec, 88.00 msec, 92.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [713196, 1512167, 859315], 37 | [1044281095, 1061751337, 1065857874], 38 | [143081365, 142096261, 142096261], 39 | [595252210, 568894822, 585137315], 40 | [657713321, 690210788, 697155477] 41 | ], 42 | "memory_usage_readable": [ 43 | "713.20 KB, 1.51 MB, 859.32 KB", 44 | "1.04 GB, 1.06 GB, 1.07 GB", 45 | "143.08 MB, 142.10 MB, 142.10 MB", 46 | "595.25 MB, 568.89 MB, 585.14 MB", 47 | "657.71 MB, 690.21 MB, 697.16 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_100m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999968, 14 | "data_compression": "zstd", 15 | "total_size": 9684754080, 16 | "total_size_readable": "9.68 GB", 17 | "data_size": 9662584921, 18 | "data_size_readable": "9.66 GB", 19 | "index_size": 22169088, 20 | "index_size_readable": "22.17 MB", 21 | "result": [ 22 | [0.052, 0.043, 0.041], 23 | [1.001, 0.763, 0.765], 24 | [1.502, 0.247, 0.250], 25 | [0.124, 0.094, 0.092], 26 | [0.195, 0.095, 0.096] 27 | ], 28 | "result_readable": [ 29 | "52.00 msec, 43.00 msec, 41.00 msec", 30 | "1.00 sec, 763.00 msec, 765.00 msec", 31 | "1.50 sec, 247.00 msec, 250.00 msec", 32 | "124.00 msec, 94.00 msec, 92.00 msec", 33 | "195.00 msec, 95.00 msec, 96.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [784015, 891324, 1395384], 37 | [1034703427, 1069950332, 1052027034], 38 | [141832133, 140988117, 140988117], 39 | [573490495, 584182275, 597232896], 40 | [690147517, 681601522, 684611565] 41 | ], 42 | "memory_usage_readable": [ 43 | "784.01 KB, 891.32 KB, 1.40 MB", 44 | "1.03 GB, 1.07 GB, 1.05 GB", 45 | "141.83 MB, 140.99 MB, 140.99 MB", 46 | "573.49 MB, 584.18 MB, 597.23 MB", 47 | "690.15 MB, 681.60 MB, 684.61 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_10m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": 
"25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999994, 14 | "data_compression": "lz4", 15 | "total_size": 1636685315, 16 | "total_size_readable": "1.64 GB", 17 | "data_size": 1635155418, 18 | "data_size_readable": "1.64 GB", 19 | "index_size": 1529863, 20 | "index_size_readable": "1.53 MB", 21 | "result": [ 22 | [0.014, 0.008, 0.008], 23 | [0.155, 0.133, 0.134], 24 | [0.051, 0.037, 0.040], 25 | [0.038, 0.023, 0.025], 26 | [0.042, 0.029, 0.025] 27 | ], 28 | "result_readable": [ 29 | "14.00 msec, 8.00 msec, 8.00 msec", 30 | "155.00 msec, 133.00 msec, 134.00 msec", 31 | "51.00 msec, 37.00 msec, 40.00 msec", 32 | "38.00 msec, 23.00 msec, 25.00 msec", 33 | "42.00 msec, 29.00 msec, 25.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [1025969, 979623, 850534], 37 | [289693872, 274696015, 263016878], 38 | [141884177, 141597001, 141597001], 39 | [204374002, 200220685, 200101066], 40 | [204576293, 208646332, 200235195] 41 | ], 42 | "memory_usage_readable": [ 43 | "1.03 MB, 979.62 KB, 850.53 KB", 44 | "289.69 MB, 274.70 MB, 263.02 MB", 45 | "141.88 MB, 141.60 MB, 141.60 MB", 46 | "204.37 MB, 200.22 MB, 200.10 MB", 47 | "204.58 MB, 208.65 MB, 200.24 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_10m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999994, 14 | "data_compression": "zstd", 15 | "total_size": 995124869, 16 | "total_size_readable": "995.12 MB", 17 | "993599478": 1056861138, 18 | "data_size_readable": "993.60 MB", 19 | "index_size": 1525357, 20 | "index_size_readable": "1.53 MB", 21 | "result": [ 22 | [0.016, 0.008, 0.008], 23 | [0.166, 0.141, 0.150], 24 | [0.064, 0.040, 0.042], 25 | [0.044, 0.026, 0.024], 26 | [0.042, 0.028, 0.026] 27 | ], 28 | "result_readable": [ 29 | "16.00 msec, 8.00 msec, 8.00 msec", 30 | "166.00 msec, 141.00 msec, 150.00 msec", 31 | "64.00 msec, 40.00 msec, 42.00 msec", 32 | "44.00 msec, 26.00 msec, 24.00 msec", 33 | "42.00 msec, 28.00 msec, 26.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [590559, 1051274, 1205734], 37 | [270187643, 282291213, 261653591], 38 | [141094195, 140810923, 140810923], 39 | [199567823, 195201258, 195267941], 40 | [203996437, 203772727, 199532552] 41 | ], 42 | "memory_usage_readable": [ 43 | "590.56 KB, 1.05 MB, 1.21 MB", 44 | "270.19 MB, 282.29 MB, 261.65 MB", 45 | "141.09 MB, 140.81 MB, 140.81 MB", 46 | "199.57 MB, 195.20 MB, 195.27 MB", 47 | "204.00 MB, 203.77 MB, 199.53 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 
12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 162735429, 16 | "total_size_readable": "162.74 MB", 17 | "data_size": 162623843, 18 | "data_size_readable": "162.62 MB", 19 | "index_size": 111579, 20 | "index_size_readable": "111.58 KB", 21 | "result": [ 22 | [0.009, 0.004, 0.004], 23 | [0.036, 0.023, 0.024], 24 | [0.021, 0.011, 0.010], 25 | [0.034, 0.019, 0.017], 26 | [0.038, 0.021, 0.017] 27 | ], 28 | "result_readable": [ 29 | "9.00 msec, 4.00 msec, 4.00 msec", 30 | "36.00 msec, 23.00 msec, 24.00 msec", 31 | "21.00 msec, 11.00 msec, 10.00 msec", 32 | "34.00 msec, 19.00 msec, 17.00 msec", 33 | "38.00 msec, 21.00 msec, 17.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [1791510, 1634290, 1486871], 37 | [77337064, 77275976, 77275976], 38 | [7416935, 4754228, 6768205], 39 | [13084446, 13068638, 13068638], 40 | [15183014, 15165990, 15165990] 41 | ], 42 | "memory_usage_readable": [ 43 | "1.79 MB, 1.63 MB, 1.49 MB", 44 | "77.34 MB, 77.28 MB, 77.28 MB", 45 | "7.42 MB, 4.75 MB, 6.77 MB", 46 | "13.08 MB, 13.07 MB, 13.07 MB", 47 | "15.18 MB, 15.17 MB, 15.17 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 98560583, 16 | "total_size_readable": "98.56 MB", 17 | "data_size": 98449386, 18 | "data_size_readable": "98.45 MB", 19 | "index_size": 111190, 20 | "index_size_readable": "111.19 KB", 21 | "result": [ 22 | [0.011, 0.004, 0.004], 23 | [0.033, 0.024, 0.022], 24 | [0.023, 0.011, 0.010], 25 | [0.033, 0.026, 0.029], 26 | [0.037, 0.023, 0.021] 27 | ], 28 | "result_readable": [ 29 | "11.00 msec, 4.00 msec, 4.00 msec", 30 | "33.00 msec, 24.00 msec, 22.00 msec", 31 | "23.00 msec, 11.00 msec, 10.00 msec", 32 | "33.00 msec, 26.00 msec, 29.00 msec", 33 | "37.00 msec, 23.00 msec, 21.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [884312, 1849931, 1633834], 37 | [76410278, 72165557, 76362246], 38 | [5557964, 2580507, 2713186], 39 | [12950030, 12938382, 12938382], 40 | [15048278, 15035734, 15035734] 41 | ], 42 | "memory_usage_readable": [ 43 | "884.31 KB, 1.85 MB, 1.63 MB", 44 | "76.41 MB, 72.17 MB, 76.36 MB", 45 | "5.56 MB, 2.58 MB, 2.71 MB", 46 | "12.95 MB, 12.94 MB, 12.94 MB", 47 | "15.05 MB, 15.04 MB, 15.04 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 
21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | clickhouse-client --database="$DB_NAME" --time --memory-usage --format=Null --query="$query" --progress 0 28 | done; 29 | done; -------------------------------------------------------------------------------- /clickhouse/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --query "SELECT sum(bytes_on_disk) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" -------------------------------------------------------------------------------- /copy_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Select the dataset size to download:" 4 | echo "1) 1m (default)" 5 | echo "2) 10m" 6 | echo "3) 100m" 7 | echo "4) 1000m" 8 | read -p "Enter the number corresponding to your choice: " choice 9 | 10 | case $choice in 11 | 2) 12 | # Download 10m dataset 13 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_{0001..0010}.json.gz -P ~/data/bluesky -N 14 | ;; 15 | 3) 16 | # Download 100m dataset 17 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_{0001..0100}.json.gz -P ~/data/bluesky -N 18 | ;; 19 | 4) 20 | # Download 1000m dataset 21 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_{0001..1000}.json.gz -P ~/data/bluesky -N 22 | ;; 23 | *) 24 | # Download 1m dataset 25 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_0001.json.gz -P ~/data/bluesky -N 26 | ;; 27 | esac -------------------------------------------------------------------------------- /duckdb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the database name 14 | echo "Running queries on database: $DB_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$DB_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Real time: \d+\.\d+ seconds' | sed -r -e 's/Real time: ([0-9]+\.[0-9]+) seconds/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi -------------------------------------------------------------------------------- /duckdb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | DUCKDB_CMD="duckdb $DATABASE_NAME" 13 | 14 | 15 | # Fetch the count using duckDB 16 | $DUCKDB_CMD -c "select count() from '$TABLE_NAME';" 17 | 18 | 
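# Example invocation (shown as a comment only so the script's behavior is unchanged;
# the database file and table name match what main.sh passes for the 1m dataset):
#   ./count.sh db.duckdb_1 bluesky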
-------------------------------------------------------------------------------- /duckdb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | # Create database and execute DDL 24 | duckdb $DB_NAME < "$DDL_FILE" 25 | 26 | # Load data 27 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 28 | 29 | echo "Script completed successfully." -------------------------------------------------------------------------------- /duckdb/ddl.sql: -------------------------------------------------------------------------------- 1 | create table bluesky (j JSON); -------------------------------------------------------------------------------- /duckdb/install.sh: -------------------------------------------------------------------------------- 1 | sudo snap install duckdb 2 | -------------------------------------------------------------------------------- /duckdb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | DUCKDB_CMD="duckdb $DB_NAME" 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | counter=0 28 | 29 | # Loop through each .json.gz file in the directory 30 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 31 | # if [[ -f "$file" ]]; then 32 | # $DUCKDB_CMD -c "insert into $TABLE_NAME select * from read_ndjson_objects('$file', ignore_errors=false, maximum_object_size=1048576000);" 33 | # fi 34 | if [[ -f "$file" ]]; then 35 | # Create a temporary directory for split files 36 | temp_dir=$(mktemp -d $DIRECTORY/temp.XXXXXX) 37 | 38 | # Decompress and split the file into smaller chunks of 100000 lines each 39 | gzip -dc "$file" | split -l 100000 - "$temp_dir/chunk_" 40 | 41 | # Insert each chunk into DuckDB 42 | for chunk in "$temp_dir"/chunk_*; do 43 | $DUCKDB_CMD -c "insert into $TABLE_NAME select * from read_ndjson_objects('$chunk', ignore_errors=false, maximum_object_size=1048576000);" 44 | done 45 | 46 | # Clean up temporary directory 47 | rm -r "$temp_dir" 48 | counter=$((counter + 1)) 49 | fi 50 | 51 | # Stop processing if the max number of files is reached 52 | if [[ $counter -ge $MAX_FILES ]]; then 53 | echo "Copied maximum number of files: $MAX_FILES" 54 | break 55 | fi 56 | done 57 | 58 | echo "All files have been imported." 
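# Example invocation (comment only; the argument order mirrors the call made by
# create_and_load.sh, and the values are those main.sh uses for the 1m dataset;
# the data directory is the repository default and may differ on your machine):
#   ./load_data.sh ~/data/bluesky db.duckdb_1 bluesky 1 success.log error.log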
-------------------------------------------------------------------------------- /duckdb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 35 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 36 | if (( file_count < size )); then 37 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 38 | exit 1 39 | fi 40 | ./create_and_load.sh "db.duckdb_${size}" bluesky ddl.sql "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 41 | ./total_size.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 42 | ./count.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 43 | #./query_results.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 44 | ./physical_query_plans.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 45 | ./benchmark.sh "db.duckdb_${size}" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 46 | } 47 | 48 | case $choice in 49 | 2) 50 | benchmark 10 51 | ;; 52 | 3) 53 | benchmark 100 54 | ;; 55 | 4) 56 | benchmark 1000 57 | ;; 58 | 5) 59 | benchmark 1 60 | benchmark 10 61 | benchmark 100 62 | benchmark 1000 63 | ;; 64 | *) 65 | benchmark 1 66 | ;; 67 | esac -------------------------------------------------------------------------------- /duckdb/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | DUCKDB_CMD="duckdb $DB_NAME" 15 | 16 | cat queries.sql | while read -r query; do 17 | 18 | # Print the query number 19 | echo "------------------------------------------------------------------------------------------------------------------------" 20 | echo "Physical query plan for query Q$QUERY_NUM:" 21 | echo 22 | 23 | $DUCKDB_CMD -c "EXPLAIN $query" 24 | 25 | # Increment the query number 26 | QUERY_NUM=$((QUERY_NUM + 1)) 27 | done; -------------------------------------------------------------------------------- /duckdb/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT j->>'$.commit.collection' AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT j->>'$.commit.collection' AS event,count() AS count,count(DISTINCT j->>'$.did') AS users FROM bluesky WHERE (j->>'$.kind' = 
'commit') AND (j->>'$.commit.operation' = 'create') GROUP BY event ORDER BY count DESC; 3 | SELECT j->>'$.commit.collection' AS event,hour(TO_TIMESTAMP(CAST(j->>'$.time_us' AS BIGINT) / 1000000)) as hour_of_day,count() AS count FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT j->>'$.did'::String as user_id,TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000) AS first_post_date FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; 5 | SELECT j->>'$.did'::String as user_id,date_diff('milliseconds', TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000),TO_TIMESTAMP(CAST(MAX(j->>'$.time_us') AS BIGINT) / 1000000)) AS activity_span FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /duckdb/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | j->>'$.commit.collection' AS event, 6 | count() AS count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | 15 | SELECT 16 | j->>'$.commit.collection' AS event, 17 | count() AS count,count(DISTINCT j->>'$.did') AS users 18 | FROM bluesky 19 | WHERE (j->>'$.kind' = 'commit') 20 | AND (j->>'$.commit.operation' = 'create') 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | 28 | SELECT 29 | j->>'$.commit.collection' AS event, 30 | hour(TO_TIMESTAMP(CAST(j->>'$.time_us' AS BIGINT) / 1000000)) as hour_of_day, 31 | count() AS count 32 | FROM bluesky 33 | WHERE (j->>'$.kind' = 'commit') 34 | AND (j->>'$.commit.operation' = 'create') 35 | AND (j->>'$.commit.collection' in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']) 36 | GROUP BY event, hour_of_day 37 | ORDER BY hour_of_day, event; 38 | 39 | ------------------------------------------------------------------------------------------------------------------------ 40 | -- Q4 - top 3 post veterans 41 | ------------------------------------------------------------------------------------------------------------------------ 42 | 43 | SELECT 44 | j->>'$.did'::String as user_id, 45 | TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000) AS 
first_post_date 46 | FROM bluesky 47 | WHERE (j->>'$.kind' = 'commit') 48 | AND (j->>'$.commit.operation' = 'create') 49 | AND (j->>'$.commit.collection' = 'app.bsky.feed.post') 50 | GROUP BY user_id 51 | ORDER BY first_post_date ASC 52 | LIMIT 3; 53 | 54 | ------------------------------------------------------------------------------------------------------------------------ 55 | -- Q5 - top 3 users with longest activity 56 | ------------------------------------------------------------------------------------------------------------------------ 57 | 58 | SELECT 59 | j->>'$.did'::String as user_id, 60 | date_diff('milliseconds', TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000), 61 | TO_TIMESTAMP(CAST(MAX(j->>'$.time_us') AS BIGINT) / 1000000)) AS activity_span 62 | FROM bluesky 63 | WHERE (j->>'$.kind' = 'commit') 64 | AND (j->>'$.commit.operation' = 'create') 65 | AND (j->>'$.commit.collection' = 'app.bsky.feed.post') 66 | GROUP BY user_id 67 | ORDER BY activity_span DESC 68 | LIMIT 3; -------------------------------------------------------------------------------- /duckdb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | DUCKDB_CMD="duckdb $DB_NAME" 13 | 14 | QUERY_NUM=1 15 | 16 | cat queries.sql | while read -r query; do 17 | 18 | # Print the query 19 | echo "------------------------------------------------------------------------------------------------------------------------" 20 | echo "Result for query Q$QUERY_NUM:" 21 | echo 22 | $DUCKDB_CMD -c "$query" 23 | 24 | # Increment the query number 25 | QUERY_NUM=$((QUERY_NUM + 1)) 26 | done; -------------------------------------------------------------------------------- /duckdb/results/_query_results/_m6i.8xlarge_bluesky_1m.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | ┌────────────────────────────┬────────┐ 5 | │ event │ count │ 6 | │ varchar │ int64 │ 7 | ├────────────────────────────┼────────┤ 8 | │ app.bsky.feed.like │ 448944 │ 9 | │ app.bsky.graph.follow │ 360374 │ 10 | │ app.bsky.feed.post │ 90816 │ 11 | │ app.bsky.feed.repost │ 58540 │ 12 | │ app.bsky.graph.block │ 14040 │ 13 | │ app.bsky.actor.profile │ 11762 │ 14 | │ app.bsky.graph.listitem │ 8103 │ 15 | │ │ 5328 │ 16 | │ app.bsky.graph.listblock │ 895 │ 17 | │ app.bsky.graph.starterpack │ 405 │ 18 | │ app.bsky.graph.list │ 356 │ 19 | │ app.bsky.feed.threadgate │ 255 │ 20 | │ app.bsky.feed.postgate │ 104 │ 21 | │ app.bsky.feed.generator │ 74 │ 22 | │ app.bsky.labeler.service │ 4 │ 23 | ├────────────────────────────┴────────┤ 24 | │ 15 rows 2 columns │ 25 | └─────────────────────────────────────┘ 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | Result for query Q2: 28 | 29 | ┌────────────────────────────┬────────┬────────┐ 30 | │ event │ count │ users │ 31 | │ varchar │ int64 │ int64 │ 32 | ├────────────────────────────┼────────┼────────┤ 33 | │ app.bsky.feed.like │ 444523 │ 117617 │ 34 | │ app.bsky.graph.follow │ 337978 │ 63957 │ 35 | │ app.bsky.feed.post │ 86812 │ 50464 │ 36 | │ app.bsky.feed.repost │ 56993 │ 26581 │ 37 | │ app.bsky.graph.block │ 
13838 │ 5785 │ 38 | │ app.bsky.graph.listitem │ 7568 │ 1078 │ 39 | │ app.bsky.actor.profile │ 5337 │ 5337 │ 40 | │ app.bsky.graph.listblock │ 860 │ 449 │ 41 | │ app.bsky.graph.list │ 259 │ 218 │ 42 | │ app.bsky.feed.threadgate │ 228 │ 196 │ 43 | │ app.bsky.graph.starterpack │ 104 │ 101 │ 44 | │ app.bsky.feed.postgate │ 101 │ 82 │ 45 | │ app.bsky.feed.generator │ 10 │ 9 │ 46 | ├────────────────────────────┴────────┴────────┤ 47 | │ 13 rows 3 columns │ 48 | └──────────────────────────────────────────────┘ 49 | ------------------------------------------------------------------------------------------------------------------------ 50 | Result for query Q3: 51 | 52 | ┌──────────────────────┬─────────────┬────────┐ 53 | │ event │ hour_of_day │ count │ 54 | │ varchar │ int64 │ int64 │ 55 | ├──────────────────────┼─────────────┼────────┤ 56 | │ app.bsky.feed.like │ 16 │ 444523 │ 57 | │ app.bsky.feed.post │ 16 │ 86812 │ 58 | │ app.bsky.feed.repost │ 16 │ 56993 │ 59 | └──────────────────────┴─────────────┴────────┘ 60 | ------------------------------------------------------------------------------------------------------------------------ 61 | Result for query Q4: 62 | 63 | ┌──────────────────────────────────┬───────────────────────────────┐ 64 | │ user_id │ first_post_date │ 65 | │ varchar │ timestamp with time zone │ 66 | ├──────────────────────────────────┼───────────────────────────────┤ 67 | │ did:plc:yj3sjq3blzpynh27cumnp5ks │ 2024-11-21 16:25:49.000167+00 │ 68 | │ did:plc:l5o3qjrmfztir54cpwlv2eme │ 2024-11-21 16:25:49.001905+00 │ 69 | │ did:plc:s4bwqchfzm6gjqfeb6mexgbu │ 2024-11-21 16:25:49.003907+00 │ 70 | └──────────────────────────────────┴───────────────────────────────┘ 71 | ------------------------------------------------------------------------------------------------------------------------ 72 | Result for query Q5: 73 | 74 | ┌──────────────────────────────────┬───────────────┐ 75 | │ user_id │ activity_span │ 76 | │ varchar │ int64 │ 77 | ├──────────────────────────────────┼───────────────┤ 78 | │ did:plc:tsyymlun4eqjuw7hqrhmwagd │ 813007 │ 79 | │ did:plc:3ug235sfy2pz7cawmpsftb65 │ 811602 │ 80 | │ did:plc:doxhhgtxqiv47tmcovpbcqai │ 811404 │ 81 | └──────────────────────────────────┴───────────────┘ -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.errors: -------------------------------------------------------------------------------- 1 | `Invalid Input Error: Malformed JSON at byte 3 of input: unexpected content after document. 
Input: ":"This user is a Sable!","lang":"en","name":"S..."` -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 974400000, 14 | "data_compression": "auto", 15 | "total_size": 472594513920, 16 | "total_size_readable": "472 GB", 17 | "result": [ 18 | [3741.938,3732.441,3731.713], 19 | [3748.548,3737.248,3741.860], 20 | [3746.684,3731.458,3729.542], 21 | [3746.500,3735.854,3739.261], 22 | [3748.528,3736.934,3740.101] 23 | ], 24 | "result_readable": [ 25 | "1 hr 2 min 21.94 sec, 1 hr 2 min 12.44 sec, 1 hr 2 min 11.71 sec", 26 | "1 hr 2 min 28.55 sec, 1 hr 2 min 17.25 sec, 1 hr 2 min 21.86 sec", 27 | "1 hr 2 min 26.68 sec, 1 hr 2 min 11.46 sec, 1 hr 2 min 9.54 sec", 28 | "1 hr 2 min 26.50 sec, 1 hr 2 min 15.85 sec, 1 hr 2 min 19.26 sec", 29 | "1 hr 2 min 28.53 sec, 1 hr 2 min 16.93 sec, 1 hr 2 min 20.10 sec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 100000000, 14 | "data_compression": "auto", 15 | "total_size": 47934865408, 16 | "total_size_readable": "47.93 GB", 17 | "result": [ 18 | [373.230,10.743,10.760], 19 | [373.570,14.974,14.978], 20 | [373.282,13.486,13.443], 21 | [373.364,7.549,7.530], 22 | [373.560,8.071,8.060] 23 | ], 24 | "result_readable": [ 25 | "6 min 13.23 sec, 10.74 sec, 10.76 sec", 26 | "6 min 13.57 sec, 14.97 sec, 14.98 sec", 27 | "6 min 13.28 sec, 13.49 sec, 13.44 sec", 28 | "6 min 13.36 sec, 7.55 sec, 7.53 sec", 29 | "6 min 13.56 sec, 8.07 sec, 8.06 sec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 10000000, 14 | "data_compression": "auto", 15 | "total_size": 4893442048, 16 | "total_size_readable": "4.89 GB", 17 | "result": [ 18 | [37.332,1.179,1.196], 19 | [37.361,1.639,1.645], 20 | [37.343,1.598,1.613], 21 | [37.363,1.060,1.065], 22 | [37.439,1.183,1.213] 23 | ], 24 | "result_readable": [ 25 | "37.33 sec, 1.18 sec, 1.20 sec", 26 | "37.36 sec, 1.64 sec, 1.65 sec", 27 | "37.34 sec, 1.60 sec, 1.61 sec", 28 | "37.36 sec, 1.06 sec, 1.06 sec", 29 | "37.44 sec, 1.18 sec, 1.21 sec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1m.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "auto", 15 | "total_size": 484704256, 16 | "total_size_readable": "484 MB", 17 | "result": [ 18 | [2.797,0.230,0.228], 19 | [2.810,0.319,0.315], 20 | [2.809,0.327,0.330], 21 | [2.802,0.250,0.247], 22 | [2.822,0.259,0.256] 23 | ], 24 | "result_readable": [ 25 | "2.80 sec, 230.00 msec, 228.00 msec", 26 | "2.81 sec, 319.00 msec, 315.00 msec", 27 | "2.81 sec, 327.00 msec, 330.00 msec", 28 | "2.80 sec, 250.00 msec, 247.00 msec", 29 | "2.82 sec, 259.00 msec, 256.00 msec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DB_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | DUCKDB_CMD="duckdb $DB_NAME" 13 | 14 | TRIES=3 15 | 16 | LOG_FILE="query_results.log" 17 | > "$LOG_FILE" 18 | 19 | cat queries.sql | while read -r query; do 20 | # Clear filesystem cache between queries. 21 | sync 22 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 23 | 24 | echo "Running query: $query" 25 | for i in $(seq 1 $TRIES); do 26 | # Run query with timer enabled and extract the real time. 27 | OUTPUT=$($DUCKDB_CMD <<EOF >> "$LOG_FILE" 28 | .timer on 29 | $query 30 | EOF 31 | ) 32 | REAL_TIME=$(tac "$LOG_FILE" | grep -m 1 -oP 'real\s+\K[\d.]+') 33 | echo "Real time: $REAL_TIME seconds" 34 | done 35 | done -------------------------------------------------------------------------------- /duckdb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DATABASE_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | DUCKDB_CMD="duckdb $DATABASE_NAME" 13 | 14 | # Fetch the total size using DuckDB 15 | $DUCKDB_CMD -c "select '$TABLE_NAME' as table_name, count(distinct block_id) as num_blocks, count(distinct block_id) * (select block_size from pragma_database_size()) as num_bytes from pragma_storage_info('$TABLE_NAME') group by all;" 16 | 17 | -------------------------------------------------------------------------------- /elasticsearch/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <INDEX_NAME> [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | INDEX_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the index name 14 | echo "Running queries on index: $INDEX_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$INDEX_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the
result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi -------------------------------------------------------------------------------- /elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ---------------------------------- Cluster ----------------------------------- 4 | # 5 | # Use a descriptive name for your cluster: 6 | # 7 | cluster.name: es-bench 8 | # ------------------------------------ Node ------------------------------------ 9 | # 10 | # Use a descriptive name for the node: 11 | # 12 | node.name: node-1 13 | # ----------------------------------- Paths ------------------------------------ 14 | # 15 | # Path to directory where to store the data (separate multiple locations by comma): 16 | # 17 | path.data: /var/lib/elasticsearch 18 | # 19 | # Path to log files: 20 | # 21 | path.logs: /var/log/elasticsearch 22 | # 23 | # ----------------------------------- Memory ----------------------------------- 24 | # 25 | # Lock the memory on startup: 26 | # 27 | bootstrap.memory_lock: true 28 | # -------------------------------------------------------------------------------- 29 | 30 | # Enable security features 31 | xpack.security.enabled: true 32 | xpack.security.enrollment.enabled: true 33 | 34 | # Enable encryption for HTTP API client connections, such as Kibana, Logstash, and Agents 35 | xpack.security.http.ssl: 36 | enabled: true 37 | keystore.path: certs/http.p12 38 | 39 | # Enable encryption and mutual authentication between cluster nodes 40 | xpack.security.transport.ssl: 41 | enabled: true 42 | verification_mode: certificate 43 | keystore.path: certs/transport.p12 44 | truststore.path: certs/transport.p12 45 | # Create a new cluster with the current node only 46 | # Additional nodes can still join the cluster later 47 | cluster.initial_master_nodes: ["node-1"] 48 | 49 | # Allow HTTP API connections from anywhere 50 | # Connections are encrypted and require user authentication 51 | http.host: 0.0.0.0 52 | 53 | #----------------------- END SECURITY AUTO CONFIGURATION ------------------------- -------------------------------------------------------------------------------- /elasticsearch/config/filebeat.yml: -------------------------------------------------------------------------------- 1 | # ============================== Filebeat inputs =============================== 2 | filebeat.registry.flush: 5s 3 | filebeat.inputs: 4 | 5 | - type: filestream 6 | id: bluesky-events 7 | 8 | paths: 9 | - 10 | parsers: 11 | - ndjson: 12 | target: "" 13 | # ============================== Filebeat modules ============================== 14 | 15 | filebeat.config.modules: 16 | # Glob pattern for configuration loading 17 | path: ${path.config}/modules.d/*.yml 18 | 19 | # Set to true to enable config reloading 20 | reload.enabled: false 21 | 22 | # ======================= Elasticsearch template setting ======================= 23 | 24 | setup.template.enabled: false 25 | 26 | # ================================== Outputs =================================== 27 | 28 | # Configure what output to use when sending the data collected by the beat. 29 | 30 | # ---------------------------- Elasticsearch Output ---------------------------- 31 | 32 | output.elasticsearch: 33 | # Array of hosts to connect to. 
34 | hosts: ["https://localhost:9200"] 35 | 36 | # Performance preset - one of "balanced", "throughput", "scale", 37 | # "latency", or "custom". 38 | preset: throughput 39 | compression_level: 1 40 | idle_connection_timeout: 30s 41 | # Protocol - either `http` (default) or `https`. 42 | protocol: "https" 43 | index: "" 44 | # Authentication credentials - either API key or username/password. 45 | api_key: "" 46 | ssl: 47 | enabled: true 48 | verification_mode: "none" 49 | 50 | http.enabled: true 51 | http.host: localhost 52 | http.port: 5066 53 | 54 | logging.level: info 55 | logging.to_files: true 56 | logging.files: 57 | path: /var/log/filebeat 58 | name: filebeat 59 | keepfiles: 7 60 | permissions: 0640 61 | 62 | processors: 63 | - rename: 64 | when: 65 | and: 66 | - has_fields: ["commit.record.subject"] 67 | - not: 68 | has_fields: ["commit.record.subject.cid"] 69 | fields: 70 | - from: "commit.record.subject" 71 | to: "commit.record.subject.value" 72 | - rename: 73 | when: 74 | and: 75 | - has_fields: ["commit.record.embed.images.data"] 76 | - not: 77 | has_fields: ["commit.record.subject.cid"] 78 | fields: 79 | - from: "commit.record.embed.images.data" 80 | to: "commit.record.embed.images.data.value" 81 | - drop_fields: 82 | fields: ["log", "agent", "ecs","host", "input"] 83 | ignore_missing: true 84 | - script: 85 | lang: javascript 86 | source: > 87 | function process(event){ 88 | var time_us = event.Get("time_us"); 89 | if (typeof time_us === 'string') { 90 | time_us = BigInt(time_us); // If time_us is a string, cast it to a BigInt 91 | } else if (typeof time_us !== 'number') { 92 | return; // Exit the function if time_us is not a valid number 93 | } 94 | var time_us_ms = time_us / 1000; 95 | event.Put("time_us", time_us_ms.toString()); 96 | } -------------------------------------------------------------------------------- /elasticsearch/config/ilm.json: -------------------------------------------------------------------------------- 1 | { 2 | "policy": { 3 | "phases": { 4 | "hot": { 5 | "min_age": "0ms", 6 | "actions": { 7 | "rollover": { 8 | "max_age": "30d", 9 | "max_primary_shard_size": "50gb" 10 | }, 11 | "forcemerge": { 12 | "max_num_segments": 1 13 | }, 14 | "readonly": {} 15 | } 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_no_source_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": false 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | 
"properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_no_source_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "routing": { 13 | "allocation": { 14 | "include": { 15 | "_tier_preference": "data_hot" 16 | } 17 | } 18 | }, 19 | "mapping": { 20 | "total_fields": { 21 | "limit": "10000" 22 | } 23 | }, 24 | "refresh_interval": "30s", 25 | "number_of_shards": "1", 26 | "max_docvalue_fields_search": "200", 27 | "sort": { 28 | "field": [ 29 | "kind", 30 | "commit.operation", 31 | "commit.collection", 32 | "did", 33 | "time_us" 34 | ], 35 | "order": [ 36 | "asc", 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc" 41 | ] 42 | }, 43 | "number_of_replicas": "0" 44 | } 45 | }, 46 | "mappings": { 47 | "_source": { 48 | "enabled": false 49 | }, 50 | "dynamic_templates": [ 51 | { 52 | "strings_as_keyword": { 53 | "match_mapping_type": "string", 54 | "mapping": { 55 | "ignore_above": 1024, 56 | "type": "keyword" 57 | } 58 | } 59 | } 60 | ], 61 | "properties": { 62 | "kind": { 63 | "type": "keyword" 64 | }, 65 | "commit": { 66 | "properties": { 67 | "collection": { 68 | "type": "keyword" 69 | }, 70 | "operation": { 71 | "type": "keyword" 72 | } 73 | } 74 | }, 75 | "did": { 76 | "type": "keyword" 77 | }, 78 | "time_us": { 79 | "type": "date" 80 | } 81 | } 82 | }, 83 | "aliases": {} 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_source_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": true 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | "properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 
82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_source_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "routing": { 13 | "allocation": { 14 | "include": { 15 | "_tier_preference": "data_hot" 16 | } 17 | } 18 | }, 19 | "mapping": { 20 | "total_fields": { 21 | "limit": "10000" 22 | } 23 | }, 24 | "refresh_interval": "30s", 25 | "number_of_shards": "1", 26 | "max_docvalue_fields_search": "200", 27 | "sort": { 28 | "field": [ 29 | "kind", 30 | "commit.operation", 31 | "commit.collection", 32 | "did", 33 | "time_us" 34 | ], 35 | "order": [ 36 | "asc", 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc" 41 | ] 42 | }, 43 | "number_of_replicas": "0" 44 | } 45 | }, 46 | "mappings": { 47 | "_source": { 48 | "enabled": true 49 | }, 50 | "dynamic_templates": [ 51 | { 52 | "strings_as_keyword": { 53 | "match_mapping_type": "string", 54 | "mapping": { 55 | "ignore_above": 1024, 56 | "type": "keyword" 57 | } 58 | } 59 | } 60 | ], 61 | "properties": { 62 | "kind": { 63 | "type": "keyword" 64 | }, 65 | "commit": { 66 | "properties": { 67 | "collection": { 68 | "type": "keyword" 69 | }, 70 | "operation": { 71 | "type": "keyword" 72 | } 73 | } 74 | }, 75 | "did": { 76 | "type": "keyword" 77 | }, 78 | "time_us": { 79 | "type": "date" 80 | } 81 | } 82 | }, 83 | "aliases": {} 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /elasticsearch/config/jvm.options: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | ## 3 | ## JVM configuration 4 | ## 5 | ################################################################ 6 | ## 7 | ## WARNING: DO NOT EDIT THIS FILE. If you want to override the 8 | ## JVM options in this file, or set any additional options, you 9 | ## should create one or more files in the jvm.options.d 10 | ## directory containing your adjustments. 11 | ## 12 | ## See https://www.elastic.co/guide/en/elasticsearch/reference/8.15/jvm-options.html 13 | ## for more information. 14 | ## 15 | ################################################################ 16 | 17 | 18 | 19 | ################################################################ 20 | ## IMPORTANT: JVM heap size 21 | ################################################################ 22 | ## 23 | ## The heap size is automatically configured by Elasticsearch 24 | ## based on the available memory in your system and the roles 25 | ## each node is configured to fulfill. If specifying heap is 26 | ## required, it should be done through a file in jvm.options.d, 27 | ## which should be named with .options suffix, and the min and 28 | ## max should be set to the same value. 
For example, to set the 29 | ## heap to 4 GB, create a new file in the jvm.options.d 30 | ## directory containing these lines: 31 | ## 32 | -Xms31g 33 | -Xmx31g 34 | ## 35 | ## See https://www.elastic.co/guide/en/elasticsearch/reference/8.15/heap-size.html 36 | ## for more information 37 | ## 38 | ################################################################ 39 | 40 | 41 | ################################################################ 42 | ## Expert settings 43 | ################################################################ 44 | ## 45 | ## All settings below here are considered expert settings. Do 46 | ## not adjust them unless you understand what you are doing. Do 47 | ## not edit them in this file; instead, create a new file in the 48 | ## jvm.options.d directory containing your adjustments. 49 | ## 50 | ################################################################ 51 | 52 | -XX:+UseG1GC 53 | 54 | ## JVM temporary directory 55 | -Djava.io.tmpdir=${ES_TMPDIR} 56 | 57 | # Leverages accelerated vector hardware instructions; removing this may 58 | # result in less optimal vector performance 59 | 20-:--add-modules=jdk.incubator.vector 60 | 61 | ## heap dumps 62 | 63 | # generate a heap dump when an allocation from the Java heap fails; heap dumps 64 | # are created in the working directory of the JVM unless an alternative path is 65 | # specified 66 | -XX:+HeapDumpOnOutOfMemoryError 67 | 68 | # exit right after heap dump on out of memory error 69 | -XX:+ExitOnOutOfMemoryError 70 | 71 | # specify an alternative path for heap dumps; ensure the directory exists and 72 | # has sufficient space 73 | -XX:HeapDumpPath=/var/lib/elasticsearch 74 | 75 | # specify an alternative path for JVM fatal error logs 76 | -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log 77 | 78 | ## GC logging 79 | -Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,level,pid,tags:filecount=32,filesize=64m -------------------------------------------------------------------------------- /elasticsearch/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | echo $(curl -s -k -X GET "https://localhost:9200/${INDEX_NAME}/_count" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' | jq '.count') 19 | -------------------------------------------------------------------------------- /elasticsearch/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 4 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 5 | [[ ! 
-f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 6 | export $(cat .elastic_password) 7 | fi 8 | 9 | # Check if the required arguments are provided 10 | if [[ $# -lt 6 ]]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | INDEX_TEMPLATE_FILE="config/$2.json" 18 | DATA_DIRECTORY="$3" 19 | NUM_FILES="$4" 20 | SUCCESS_LOG="$5" 21 | ERROR_LOG="$6" 22 | 23 | # Validate arguments 24 | [[ ! -f "$INDEX_TEMPLATE_FILE" ]] && { echo "Error: Index template file '$INDEX_TEMPLATE_FILE' does not exist."; exit 1; } 25 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 26 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 27 | 28 | # Check ilm policy is installed, install if not 29 | # If curl return 404, means ILM policy is not installed 30 | 31 | http_code=$(curl -s -o /dev/null -k -w "%{http_code}" -X GET "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json') 32 | if [[ "$http_code" -eq 404 ]] ; then 33 | echo "Installing ILM policy" 34 | ILM_POLICY=$(cat "config/ilm.json") 35 | curl -s -k -X PUT "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$ILM_POLICY" 36 | fi 37 | 38 | # Install index template 39 | # Read index template file json from config/$INDEX_TEMPLATE_FILE 40 | INDEX_TEMPLATE=$(cat "$INDEX_TEMPLATE_FILE") 41 | JSON_DATA=$(cat $INDEX_TEMPLATE_FILE | sed "s/\${INDEX_NAME}/$INDEX_NAME/g") 42 | echo "Install index template" 43 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_index_template/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$JSON_DATA" 44 | 45 | # Create the data stream 46 | echo "Create the data stream" 47 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_data_stream/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' 48 | 49 | # Load data 50 | ./load_data.sh "$DATA_DIRECTORY" "$INDEX_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 51 | 52 | echo "Script completed successfully." 
-------------------------------------------------------------------------------- /elasticsearch/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install elasticsearch 4 | wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg 5 | sudo apt-get install apt-transport-https 6 | echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list 7 | sudo apt-get update && sudo apt-get install elasticsearch 8 | 9 | # # Install filebeat 10 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-8.17.0-amd64.deb 11 | sudo dpkg -i filebeat-8.17.0-amd64.deb 12 | 13 | # # Overwrite configuration files 14 | sudo cp config/elasticsearch.yml /etc/elasticsearch/elasticsearch.yml 15 | sudo cp config/jvm.options /etc/elasticsearch/jvm.options 16 | 17 | # # Start elasticsearch 18 | sudo systemctl start elasticsearch.service 19 | 20 | # Reset and export elastic password 21 | export ELASTIC_PASSWORD=$(sudo /usr/share/elasticsearch/bin/elasticsearch-reset-password -s -a -b -u elastic) 22 | 23 | # Save elastic password in local file 24 | echo "ELASTIC_PASSWORD=$ELASTIC_PASSWORD" > .elastic_password 25 | 26 | # Generate api key for filebeat 27 | curl -s -k -X POST "https://localhost:9200/_security/api_key" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d ' 28 | { 29 | "name": "filebeat", 30 | "role_descriptors": { 31 | "filebeat_writer": { 32 | "cluster": ["monitor", "read_ilm", "read_pipeline"], 33 | "index": [ 34 | { 35 | "names": ["bluesky-*"], 36 | "privileges": ["view_index_metadata", "create_doc", "auto_configure"] 37 | } 38 | ] 39 | } 40 | } 41 | }' | jq -r '"\(.id):\(.api_key)"' > .filebeat_api_key 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /elasticsearch/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 5 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | DIRECTORY="$1" 17 | INDEX_NAME="$2" 18 | MAX_FILES="$3" 19 | SUCCESS_LOG="$4" 20 | ERROR_LOG="$5" 21 | 22 | # Validate that MAX_FILES is a number 23 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 24 | echo "Error: must be a positive integer." 25 | exit 1 26 | fi 27 | 28 | # Ensure the log files exist 29 | touch "$SUCCESS_LOG" "$ERROR_LOG" 30 | 31 | # Create a temporary directory in /var/tmp and ensure it's accessible 32 | TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX) 33 | chmod 777 "$TEMP_DIR" # Allow access for all users 34 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 35 | 36 | # Copy all files to temp location and uncompress them 37 | # Counter to track processed files 38 | counter=0 39 | 40 | # Loop through each .json.gz file in the directory 41 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 42 | if [[ -f "$file" ]]; then 43 | echo "Processing $file..." 
44 | counter=$((counter + 1)) 45 | 46 | # Uncompress the file into the temporary directory 47 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 48 | gunzip -c "$file" > "$uncompressed_file" 49 | 50 | # Check if uncompression was successful 51 | if [[ $? -ne 0 ]]; then 52 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 53 | continue 54 | fi 55 | 56 | # Grant read permissions 57 | chmod 644 "$uncompressed_file" 58 | # Stop processing if the max number of files is reached 59 | if [[ $counter -ge $MAX_FILES ]]; then 60 | echo "Processed maximum number of files: $MAX_FILES" 61 | break 62 | fi 63 | else 64 | echo "No .json.gz files found in the directory." 65 | fi 66 | done 67 | 68 | echo "All files have been copied to temp location." 69 | 70 | echo "Prepare filebeat for ingestion" 71 | 72 | # Prepare Filebeat configuration 73 | FILEBEAT_API_KEY=$(cat .filebeat_api_key) 74 | FILEBEAT_CONFIG=$(sed -e "s||$FILEBEAT_API_KEY|g" -e "s||$INDEX_NAME|g" -e "s||"${TEMP_DIR}/*"|g" config/filebeat.yml) 75 | echo "$FILEBEAT_CONFIG" | sudo tee /etc/filebeat/filebeat.yml > /dev/null 76 | sudo rm -rf /var/lib/filebeat/registry 77 | sudo service filebeat start 78 | trap "sudo service filebeat stop" EXIT # Stop filebeat on exit 79 | 80 | # wait until all files have been ingested 81 | total_processed=0 82 | max_events=$MAX_FILES*1000000 83 | while [[ $total_processed -lt $max_events ]]; do 84 | sleep 30 85 | total_processed=$(curl -k -s -XGET 'localhost:5066/stats' | jq .filebeat.events.done) 86 | echo "Total processed files: $total_processed" 87 | done 88 | 89 | sudo service filebeat stop 90 | 91 | echo "Force merge indices" 92 | curl -k -X POST "https://localhost:9200/$INDEX_NAME/_forcemerge?max_num_segments=1" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' 93 | 94 | echo "All files have been processed." -------------------------------------------------------------------------------- /elasticsearch/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local template=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky-${template}-${size}m" "index_template_${template}" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.data_size" 43 | ./count.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.count" 44 | #./query_results.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.query_results" 45 | ./benchmark.sh "bluesky-${template}-${size}m" "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.results_runtime" 46 | } 47 | 48 | case $choice in 49 | 2) 50 | benchmark 10 no_source_best_compression 51 | benchmark 10 source_best_compression 52 | benchmark 10 source_default_compression 53 | benchmark 10 no_source_default_compression 54 | ;; 55 | 3) 56 | benchmark 100 no_source_best_compression 57 | benchmark 100 source_best_compression 58 | benchmark 100 source_default_compression 59 | benchmark 100 no_source_default_compression 60 | ;; 61 | 4) 62 | benchmark 1000 no_source_best_compression 63 | benchmark 1000 source_best_compression 64 | benchmark 1000 source_default_compression 65 | benchmark 1000 no_source_default_compression 66 | ;; 67 | 5) 68 | benchmark 1 no_source_best_compression 69 | benchmark 1 source_best_compression 70 | benchmark 1 source_default_compression 71 | benchmark 1 no_source_default_compression 72 | benchmark 10 no_source_best_compression 73 | benchmark 10 source_best_compression 74 | benchmark 10 source_default_compression 75 | benchmark 10 no_source_default_compression 76 | benchmark 100 no_source_best_compression 77 | benchmark 100 source_best_compression 78 | benchmark 100 source_default_compression 79 | benchmark 100 no_source_default_compression 80 | benchmark 1000 no_source_best_compression 81 | benchmark 1000 source_best_compression 82 | benchmark 1000 source_default_compression 83 | benchmark 1000 no_source_default_compression 84 | ;; 85 | *) 86 | benchmark 1 no_source_best_compression 87 | benchmark 1 source_best_compression 88 | benchmark 1 source_default_compression 89 | benchmark 1 no_source_default_compression 90 | ;; 91 | esac 92 | -------------------------------------------------------------------------------- /elasticsearch/queries.txt: -------------------------------------------------------------------------------- 1 | FROM ${INDEX_NAME} | STATS count = COUNT() BY commit.collection | SORT count DESC 2 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" | STATS users = COUNT_DISTINCT(did, 40000), count = COUNT() BY commit.collection | SORT count DESC 3 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection IN (\\\"app.bsky.feed.post\\\", \\\"app.bsky.feed.repost\\\", \\\"app.bsky.feed.like\\\") | STATS count = COUNT() BY commit.collection, DATE_EXTRACT(\\\"hour_of_day\\\", time_us) | SORT count, commit.collection 4 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS first_post_ts = MIN(time_us) BY did | SORT first_post_ts ASC | LIMIT 3 5 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS activity_span = date_diff(\\\"millisecond\\\",min(time_us), max(time_us)) BY did | SORT activity_span DESC | LIMIT 3 
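The \\\" sequences in queries.txt are consumed twice: once by the eval in run_queries.sh and query_results.sh, and once as JSON string escapes in the request body sent to the _query endpoint. A small bash sketch of that expansion (illustrative only; the index name is a placeholder):

# Sketch only: how one line of queries.txt becomes the ES|QL request body.
INDEX_NAME="bluesky-source_default_compression-1m"   # placeholder index name
QUERY='FROM ${INDEX_NAME} | STATS count = COUNT() BY commit.collection | SORT count DESC'
eval "QUERY=\"${QUERY}\""             # substitutes ${INDEX_NAME}; in the quoted queries, \\\" collapses to \"
CURL_DATA="{\"query\": \"$QUERY\"}"   # JSON body for: curl -X POST https://localhost:9200/_query?format=txt
echo "$CURL_DATA"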
-------------------------------------------------------------------------------- /elasticsearch/queries_formatted.txt: -------------------------------------------------------------------------------- 1 | -- Q1 - Top event types 2 | 3 | POST /_query?format=txt 4 | { 5 | "query": """FROM $INDEX_NAME 6 | | STATS count = COUNT() BY commit.collection 7 | | SORT count DESC""" 8 | } 9 | 10 | -- Q2 - Top event types together with unique users per event type 11 | 12 | POST /_query?format=txt 13 | { 14 | "query": """FROM $INDEX_NAME 15 | | WHERE kind == "commit" AND commit.operation == "create" 16 | | STATS users = COUNT_DISTINCT(did), count = COUNT() BY commit.collection 17 | | SORT count DESC""" 18 | } 19 | 20 | -- Q3 - When do people use BlueSky 21 | 22 | POST /_query?format=txt 23 | { 24 | "query": """FROM $INDEX_NAME 25 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection IN ("app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like") 26 | | STATS count = COUNT() BY commit.collection, DATE_EXTRACT("hour_of_day", time_us) 27 | | SORT count, commit.collection""" 28 | } 29 | 30 | -- Q4 - top 3 post veterans 31 | 32 | POST /_query?format=txt 33 | { 34 | "query": """FROM $INDEX_NAME 35 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 36 | | STATS first_post_ts = MIN(time_us) BY did 37 | | SORT first_post_ts ASC 38 | | LIMIT 3""" 39 | } 40 | 41 | -- Q5 - top 3 users with longest activity 42 | 43 | POST /_query?format=txt 44 | { 45 | "query": """FROM $INDEX_NAME 46 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 47 | | STATS activity_span = date_diff("millisecond",min(time_us), max(time_us)) BY did 48 | | SORT activity_span DESC 49 | | LIMIT 3""" 50 | } -------------------------------------------------------------------------------- /elasticsearch/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | QUERY_NUM=1 19 | 20 | # File containing Elasticsearch ES|SQL queries 21 | QUERY_FILE="queries.txt" 22 | 23 | # Check if the query file exists 24 | if [[ ! -f "$QUERY_FILE" ]]; then 25 | echo "Error: Query file '$QUERY_FILE' does not exist." 
26 | exit 1 27 | fi 28 | 29 | cat 'queries.txt' | while read -r QUERY; do 30 | eval "QUERY=\"${QUERY}\"" 31 | # Print the query 32 | echo "------------------------------------------------------------------------------------------------------------------------" 33 | echo "Result for query Q$QUERY_NUM: " 34 | echo 35 | CURL_DATA="{\"query\": \"$QUERY\"}" 36 | curl -s -k -X POST "https://localhost:9200/_query?format=txt" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA" 37 | echo 38 | # Increment the query number 39 | QUERY_NUM=$((QUERY_NUM + 1)) 40 | done 41 | -------------------------------------------------------------------------------- /elasticsearch/results/_query_results/_m6i.8xlarge_bluesky-no_source_best_compression-1m.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | count | commit.collection 5 | ---------------+-------------------------- 6 | 448944 |app.bsky.feed.like 7 | 360374 |app.bsky.graph.follow 8 | 90816 |app.bsky.feed.post 9 | 58540 |app.bsky.feed.repost 10 | 14040 |app.bsky.graph.block 11 | 11762 |app.bsky.actor.profile 12 | 8103 |app.bsky.graph.listitem 13 | 5328 |null 14 | 895 |app.bsky.graph.listblock 15 | 405 |app.bsky.graph.starterpack 16 | 356 |app.bsky.graph.list 17 | 255 |app.bsky.feed.threadgate 18 | 104 |app.bsky.feed.postgate 19 | 74 |app.bsky.feed.generator 20 | 4 |app.bsky.labeler.service 21 | 22 | ------------------------------------------------------------------------------------------------------------------------ 23 | Result for query Q2: 24 | 25 | users | count | commit.collection 26 | ---------------+---------------+-------------------------- 27 | 117184 |444523 |app.bsky.feed.like 28 | 63873 |337978 |app.bsky.graph.follow 29 | 50383 |86812 |app.bsky.feed.post 30 | 26580 |56993 |app.bsky.feed.repost 31 | 5783 |13838 |app.bsky.graph.block 32 | 1078 |7568 |app.bsky.graph.listitem 33 | 5337 |5337 |app.bsky.actor.profile 34 | 449 |860 |app.bsky.graph.listblock 35 | 218 |259 |app.bsky.graph.list 36 | 196 |228 |app.bsky.feed.threadgate 37 | 101 |104 |app.bsky.graph.starterpack 38 | 82 |101 |app.bsky.feed.postgate 39 | 9 |10 |app.bsky.feed.generator 40 | 41 | ------------------------------------------------------------------------------------------------------------------------ 42 | Result for query Q3: 43 | 44 | count | commit.collection |DATE_EXTRACT("hour_of_day", time_us) 45 | ---------------+--------------------+------------------------------------ 46 | 56993 |app.bsky.feed.repost|16 47 | 86812 |app.bsky.feed.post |16 48 | 444523 |app.bsky.feed.like |16 49 | 50 | ------------------------------------------------------------------------------------------------------------------------ 51 | Result for query Q4: 52 | 53 | first_post_ts | did 54 | ------------------------+-------------------------------- 55 | 2024-11-21T16:25:49.000Z|did:plc:yj3sjq3blzpynh27cumnp5ks 56 | 2024-11-21T16:25:49.001Z|did:plc:l5o3qjrmfztir54cpwlv2eme 57 | 2024-11-21T16:25:49.003Z|did:plc:s4bwqchfzm6gjqfeb6mexgbu 58 | 59 | ------------------------------------------------------------------------------------------------------------------------ 60 | Result for query Q5: 61 | 62 | activity_span | did 63 | ---------------+-------------------------------- 64 | 813007 |did:plc:tsyymlun4eqjuw7hqrhmwagd 65 | 811602 |did:plc:3ug235sfy2pz7cawmpsftb65 66 | 811404 
|did:plc:doxhhgtxqiv47tmcovpbcqai -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1000m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999998998, 14 | "data_compression": "zstd", 15 | "total_size": 235840659266, 16 | "total_size_readable": "219.6 GB", 17 | "result": [ 18 | [5.022,5.019,5.078], 19 | [51.486,45.510,45.713], 20 | [41.789,41.359,41.608], 21 | [8.807,8.812,8.711], 22 | [9.696,9.723,9.533] 23 | ], 24 | "result_readable": [ 25 | "5.02 sec, 5.02 sec, 5.08 sec", 26 | "51.49 sec, 45.51 sec, 45.71 sec", 27 | "41.79 sec, 41.36 sec, 41.61 sec", 28 | "8.81 sec, 8.81 sec, 8.71 sec", 29 | "9.70 sec, 9.72 sec, 9.53 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1000m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999106, 14 | "data_compression": "lz4", 15 | "total_size": 238598578300, 16 | "total_size_readable": "222 GB", 17 | "result": [ 18 | [4.427,4.294,4.382], 19 | [46.690,41.966,42.483], 20 | [39.386,39.241,38.131], 21 | [8.575,8.428,8.386], 22 | [9.362,9.322,9.299] 23 | ], 24 | "result_readable": [ 25 | "4.43 sec, 4.29 sec, 4.38 sec", 26 | "46.69 sec, 41.97 sec, 42.48 sec", 27 | "39.39 sec, 39.24 sec, 38.13 sec", 28 | "8.57 sec, 8.43 sec, 8.39 sec", 29 | "9.36 sec, 9.32 sec, 9.30 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_100m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "zstd", 15 | "data_size": 21268479403, 16 | "data_size_readable": "19.8 GB", 17 | "result": [ 18 | [2.532,2.536,2.486], 19 | [23.194,22.932,23.188], 20 | [19.521,19.321,19.159], 21 | [2.867,2.791,2.884], 22 | [3.099,3.136,3.171] 23 | ], 24 | "result_readable": [ 25 | "2.53 sec, 2.54 sec, 2.49 sec", 26 | "23.19 sec, 22.93 sec, 23.19 sec", 27 | "19.52 sec, 19.32 sec, 19.16 sec", 28 | "2.87 sec, 2.79 sec, 2.88 sec", 29 | "3.10 sec, 3.14 sec, 3.17 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_100m_default_compression.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "lz4", 15 | "total_size": 21240210194, 16 | "total_size_readable": "19.7 GB", 17 | "result": [ 18 | [2.519,2.524,2.483], 19 | [23.610,23.215,23.230], 20 | [19.762,20.236,19.979], 21 | [2.868,2.809,2.841], 22 | [3.075,3.103,3.117] 23 | ], 24 | "result_readable": [ 25 | "2.52 sec, 2.52 sec, 2.48 sec", 26 | "23.61 sec, 23.21 sec, 23.23 sec", 27 | "19.76 sec, 20.24 sec, 19.98 sec", 28 | "2.87 sec, 2.81 sec, 2.84 sec", 29 | "3.08 sec, 3.10 sec, 3.12 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_10m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "zstd", 15 | "total_size": 2834172690, 16 | "total_size_readable": "2.6 GB", 17 | "result": [ 18 | [0.270,0.263,0.275], 19 | [2.942,2.683,2.655], 20 | [2.014,2.008,2.037], 21 | [0.414,0.412,0.437], 22 | [0.562,0.470,0.463] 23 | ], 24 | "result_readable": [ 25 | "270.00 msec, 263.00 msec, 275.00 msec", 26 | "2.94 sec, 2.68 sec, 2.65 sec", 27 | "2.01 sec, 2.01 sec, 2.04 sec", 28 | "414.00 msec, 412.00 msec, 437.00 msec", 29 | "562.00 msec, 470.00 msec, 463.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_10m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "lz4", 15 | "total_size": 3128506511, 16 | "total_size_readable": "2.9 GB", 17 | "result": [ 18 | [0.268,0.269,0.259], 19 | [2.631,2.638,2.557], 20 | [2.099,2.049,2.066], 21 | [0.412,0.407,0.405], 22 | [0.468,0.466,0.462] 23 | ], 24 | "result_readable": [ 25 | "268.00 msec, 269.00 msec, 259.00 msec", 26 | "2.63 sec, 2.64 sec, 2.56 sec", 27 | "2.10 sec, 2.05 sec, 2.07 sec", 28 | "412.00 msec, 407.00 msec, 405.00 msec", 29 | "468.00 msec, 466.00 msec, 462.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": 
"m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 400948257, 16 | "total_size_readable": "382.3 MB", 17 | "result": [ 18 | [0.041,0.037,0.035], 19 | [0.426,0.321,0.323], 20 | [0.192,0.186,0.213], 21 | [0.056,0.052,0.053], 22 | [0.099,0.061,0.060] 23 | ], 24 | "result_readable": [ 25 | "41.00 msec, 37.00 msec, 35.00 msec", 26 | "426.00 msec, 321.00 msec, 323.00 msec", 27 | "192.00 msec, 186.00 msec, 213.00 msec", 28 | "56.00 msec, 52.00 msec, 53.00 msec", 29 | "99.00 msec, 61.00 msec, 60.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 481917965, 16 | "total_size_readable": "459.4 MB", 17 | "result": [ 18 | [0.035,0.035,0.033], 19 | [0.343,0.330,0.317], 20 | [0.192,0.196,0.189], 21 | [0.049,0.049,0.049], 22 | [0.057,0.057,0.057] 23 | ], 24 | "result_readable": [ 25 | "35.00 msec, 35.00 msec, 33.00 msec", 26 | "343.00 msec, 330.00 msec, 317.00 msec", 27 | "192.00 msec, 196.00 msec, 189.00 msec", 28 | "49.00 msec, 49.00 msec, 49.00 msec", 29 | "57.00 msec, 57.00 msec, 57.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1000m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999101, 14 | "data_compression": "zstd", 15 | "total_size": 386099682721, 16 | "total_size_readable": "359.6 GB", 17 | "result": [ 18 | [3.854,3.884,4.081], 19 | [37.078,29.084,28.548], 20 | [24.382,24.279,23.570], 21 | [8.106,8.228,8.080], 22 | [9.208,8.994,9.084] 23 | ], 24 | "result_readable": [ 25 | "3.85 sec, 3.88 sec, 4.08 sec", 26 | "37.08 sec, 29.08 sec, 28.55 sec", 27 | "24.38 sec, 24.28 sec, 23.57 sec", 28 | "8.11 sec, 8.23 sec, 8.08 sec", 29 | "9.21 sec, 8.99 sec, 9.08 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1000m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999153, 14 | "data_compression": "lz4", 15 | 
"total_size": 488061838541, 16 | "total_size_readable": "454.5 GB", 17 | "result": [ 18 | [3.087,3.026,3.060], 19 | [35.117,27.380,27.745], 20 | [24.882,25.534,24.787], 21 | [8.899,8.912,8.731], 22 | [9.748,9.733,9.797] 23 | ], 24 | "result_readable": [ 25 | "3.09 sec, 3.03 sec, 3.06 sec", 26 | "35.12 sec, 27.38 sec, 27.75 sec", 27 | "24.88 sec, 25.53 sec, 24.79 sec", 28 | "8.90 sec, 8.91 sec, 8.73 sec", 29 | "9.75 sec, 9.73 sec, 9.80 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_100m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "zstd", 15 | "total_size": 34182479705, 16 | "total_size_readable": "31.8 GB", 17 | "result": [ 18 | [2.765,2.718,2.799], 19 | [20.788,20.822,20.270], 20 | [16.306,16.642,15.693], 21 | [2.454,2.461,2.423], 22 | [2.761,2.768,2.784] 23 | ], 24 | "result_readable": [ 25 | "2.77 sec, 2.72 sec, 2.80 sec", 26 | "20.79 sec, 20.82 sec, 20.27 sec", 27 | "16.31 sec, 16.64 sec, 15.69 sec", 28 | "2.45 sec, 2.46 sec, 2.42 sec", 29 | "2.76 sec, 2.77 sec, 2.78 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_100m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "lz4", 15 | "total_size": 42432097529, 16 | "total_size_readable": "39.5 GB", 17 | "result": [ 18 | [2.520,2.507,2.592], 19 | [23.055,22.842,22.868], 20 | [19.873,20.199,19.585], 21 | [2.891,2.819,2.791], 22 | [3.144,3.183,3.171] 23 | ], 24 | "result_readable": [ 25 | "2.52 sec, 2.51 sec, 2.59 sec", 26 | "23.05 sec, 22.84 sec, 22.87 sec", 27 | "19.87 sec, 20.20 sec, 19.59 sec", 28 | "2.89 sec, 2.82 sec, 2.79 sec", 29 | "3.14 sec, 3.18 sec, 3.17 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_10m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "zstd", 15 | "total_size": 3785308904, 16 | "total_size_readable": "3.5 GB", 17 | "result": [ 18 | [0.286,0.290,0.287], 19 | [2.487,2.367,2.406], 20 | [1.747,1.671,1.656], 21 | [0.368,0.360,0.364], 22 | [0.423,0.424,0.422] 23 | ], 24 | "result_readable": [ 25 | "286.00 msec, 290.00 msec, 287.00 msec", 26 | 
"2.49 sec, 2.37 sec, 2.41 sec", 27 | "1.75 sec, 1.67 sec, 1.66 sec", 28 | "368.00 msec, 360.00 msec, 364.00 msec", 29 | "423.00 msec, 424.00 msec, 422.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_10m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "lz4", 15 | "total_size": 4600428060, 16 | "total_size_readable": "4.2 GB", 17 | "result": [ 18 | [0.266,0.274,0.266], 19 | [2.753,2.634,2.620], 20 | [2.048,2.130,2.020], 21 | [0.417,0.399,0.402], 22 | [0.465,0.466,0.480] 23 | ], 24 | "result_readable": [ 25 | "266.00 msec, 274.00 msec, 266.00 msec", 26 | "2.75 sec, 2.63 sec, 2.62 sec", 27 | "2.05 sec, 2.13 sec, 2.02 sec", 28 | "417.00 msec, 399.00 msec, 402.00 msec", 29 | "465.00 msec, 466.00 msec, 480.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 400974094, 16 | "total_size_readable": "382.3 MB", 17 | "result": [ 18 | [0.039,0.036,0.036], 19 | [0.344,0.303,0.305], 20 | [0.171,0.166,0.159], 21 | [0.047,0.047,0.049], 22 | [0.056,0.056,0.056] 23 | ], 24 | "result_readable": [ 25 | "39.00 msec, 36.00 msec, 36.00 msec", 26 | "344.00 msec, 303.00 msec, 305.00 msec", 27 | "171.00 msec, 166.00 msec, 159.00 msec", 28 | "47.00 msec, 47.00 msec, 49.00 msec", 29 | "56.00 msec, 56.00 msec, 56.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 932793512, 16 | "total_size_readable": "889.5 MB", 17 | "result": [ 18 | [0.037,0.034,0.035], 19 | [0.350,0.320,0.328], 20 | [0.199,0.193,0.190], 21 | [0.048,0.050,0.049], 22 | [0.057,0.059,0.058] 23 | ], 24 | "result_readable": [ 25 | "37.00 msec, 34.00 msec, 35.00 msec", 26 | "350.00 msec, 320.00 msec, 328.00 msec", 27 | "199.00 msec, 193.00 msec, 190.00 msec", 28 | "48.00 msec, 50.00 msec, 49.00 msec", 29 | "57.00 msec, 59.00 msec, 58.00 msec" 30 | ] 31 | } -------------------------------------------------------------------------------- 
/elasticsearch/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Number of tries for each query 19 | TRIES=3 20 | 21 | # File containing Elasticsearch ES|SQL queries 22 | QUERY_FILE="queries.txt" 23 | LOG_FILE="query_log_$INDEX_NAME.log" 24 | > "$LOG_FILE" 25 | 26 | # Check if the query file exists 27 | if [[ ! -f "$QUERY_FILE" ]]; then 28 | echo "Error: Query file '$QUERY_FILE' does not exist." 29 | exit 1 30 | fi 31 | 32 | cat 'queries.txt' | while read -r QUERY; do 33 | # Clear filesystem cache between queries. 34 | sync 35 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 36 | # Clear query cache between queries. 37 | curl -k -X POST 'https://localhost:9200/hits/_cache/clear?pretty' -u "elastic:${ELASTIC_PASSWORD}" &>/dev/null 38 | eval "QUERY=\"${QUERY}\"" 39 | echo "Running query: $QUERY" 40 | for i in $(seq 1 $TRIES); do 41 | CURL_DATA="{\"query\": \"$QUERY\"}" 42 | RESPONSE=$(curl -s -k -X POST "https://localhost:9200/_query" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA") 43 | TOOK_MS=$(echo "$RESPONSE" | jq -r '.took' 2>/dev/null) 44 | 45 | # Convert 'took' to seconds (from ms to s) 46 | TOOK_S=$(bc <<< "scale=3; $TOOK_MS / 1000") 47 | TOOK_FORMATTED=$(printf "%.3f" "$TOOK_S") 48 | echo "$RESPONSE" >> "$LOG_FILE" 49 | echo "Response time: ${TOOK_FORMATTED} s" 50 | done 51 | done 52 | -------------------------------------------------------------------------------- /elasticsearch/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Get data size 19 | curl -k -XGET "https://localhost:9200/_data_stream/${INDEX_NAME}/_stats?human" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -------------------------------------------------------------------------------- /favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Terry7879/JSONBench/3e43960d581c4a3718ea7ec6c8812fadbceac02d/favicon.png -------------------------------------------------------------------------------- /generate-results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # This script will substitute the benchmark results into the HTML page. 4 | # Note: editing HTML with sed may look strange, but at least we avoid using node.js and npm, and that's good. 
5 | 6 | # This is needed on Mac OS. Do `brew install coreutils`. 7 | [ -n "$HOMEBREW_PREFIX" ] && PATH="${HOMEBREW_PREFIX}/opt/coreutils/libexec/gnubin:${PATH}" 8 | if command -v gsed >/dev/null 2>&1 9 | then 10 | alias sed='gsed' 11 | fi 12 | 13 | ( 14 | sed '/^const data = \[$/q' index.html 15 | 16 | FIRST=1 17 | LANG="" ls -1 */results*/*.json | while read -r file 18 | do 19 | [ "${FIRST}" = "0" ] && echo -n ',' 20 | jq --compact-output ". += {\"source\": \"${file}\"}" "${file}" || echo "Error in $file" >&2 21 | FIRST=0 22 | done 23 | 24 | echo ']; // end of data' 25 | sed '0,/^\]; \/\/ end of data$/d' index.html 26 | 27 | ) > index.html.new 28 | 29 | mv index.html index.html.bak 30 | mv index.html.new index.html 31 | -------------------------------------------------------------------------------- /mongodb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Execution time: \d+ms' | sed -r 's/Execution time: ([0-9]+)/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /mongodb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the document count using mongosh 14 | document_count=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const count = db.getCollection('$COLLECTION_NAME').stats().count 17 | print(count); 18 | ") 19 | 20 | # Debugging information 21 | echo "Database: $DATABASE_NAME" 22 | echo "Collection: $COLLECTION_NAME" 23 | echo "Document count: $document_count" 24 | 25 | # Print the result 26 | if [[ -z "$document_count" ]]; then 27 | echo "Error: Unable to fetch document count. Ensure the database and collection exist." 28 | exit 1 29 | else 30 | echo $document_count 31 | fi -------------------------------------------------------------------------------- /mongodb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! 
-f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | # Create database and execute DDL file 24 | mongosh --quiet --eval " 25 | db = db.getSiblingDB('$DB_NAME'); 26 | load('$DDL_FILE'); 27 | " 28 | 29 | # Load data 30 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$COLLECTION_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 31 | 32 | echo "Script completed successfully." -------------------------------------------------------------------------------- /mongodb/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.storageSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /mongodb/ddl_snappy.js: -------------------------------------------------------------------------------- 1 | db.createCollection( 2 | "bluesky", 3 | { storageEngine: { wiredTiger: { configString: "block_compressor=snappy" } } } 4 | ); 5 | 6 | db.bluesky.createIndex({"kind": 1, "commit.operation": 1, "commit.collection": 1, "did": 1, "time_us": 1}); -------------------------------------------------------------------------------- /mongodb/ddl_zstd.js: -------------------------------------------------------------------------------- 1 | db.createCollection( 2 | "bluesky", 3 | { storageEngine: { wiredTiger: { configString: "block_compressor=zstd" } } } 4 | ); 5 | 6 | db.bluesky.createIndex({"kind": 1, "commit.operation": 1, "commit.collection": 1, "did": 1, "time_us": 1}); -------------------------------------------------------------------------------- /mongodb/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalIndexSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 
23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /mongodb/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | # File containing MongoDB queries (replace 'queries.js' with your file) 15 | QUERY_FILE="queries.js" 16 | 17 | # Check if the query file exists 18 | if [[ ! -f "$QUERY_FILE" ]]; then 19 | echo "Error: Query file '$QUERY_FILE' does not exist." 20 | exit 1 21 | fi 22 | 23 | # Set the internalQueryPlannerGenerateCoveredWholeIndexScans parameter to true 24 | echo "Setting internalQueryPlannerGenerateCoveredWholeIndexScans to true..." 25 | mongosh --quiet --eval " 26 | const result = db.adminCommand({ setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true }); 27 | if (result.ok !== 1) { 28 | print('Failed to set internalQueryPlannerGenerateCoveredWholeIndexScans: ' + JSON.stringify(result)); 29 | quit(1); 30 | } else { 31 | print('Successfully set internalQueryPlannerGenerateCoveredWholeIndexScans to true'); 32 | } 33 | " 34 | 35 | cat "$QUERY_FILE" | while read -r query; do 36 | 37 | # Print the query number 38 | echo "------------------------------------------------------------------------------------------------------------------------" 39 | echo "Index usage for query Q$QUERY_NUM:" 40 | echo 41 | 42 | # Modify the query to include the explain option inside the aggregate call 43 | MODIFIED_QUERY=$(echo "$query" | sed 's/]);$/], { explain: "queryPlanner" });/') 44 | 45 | # Escape the modified query for safe passing to mongosh 46 | ESCAPED_QUERY=$(echo "$MODIFIED_QUERY" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 47 | 48 | mongosh --quiet --eval " 49 | const db = db.getSiblingDB('$DB_NAME'); 50 | const result = eval(\"$ESCAPED_QUERY\"); 51 | printjson(result.stages[0].\$cursor.queryPlanner.winningPlan); 52 | " 53 | 54 | # Increment the query number 55 | QUERY_NUM=$((QUERY_NUM + 1)) 56 | done; -------------------------------------------------------------------------------- /mongodb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ 4 | 5 | sudo sudo apt-get install gnupg curl 6 | curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ 7 | sudo gpg -o /usr/share/keyrings/mongodb-server-8.0.gpg \ 8 | --dearmor 9 | echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu noble/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list 10 | sudo apt-get update 11 | sudo apt-get install -y mongodb-org 12 | sudo systemctl start mongod 13 | sudo systemctl status mongod -------------------------------------------------------------------------------- /mongodb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | COLLECTION_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | MONGO_URI="mongodb://localhost:27017" # Replace with your MongoDB URI if 
necessary 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | # Create a temporary directory for uncompressed files 28 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 29 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 30 | 31 | # Counter to track processed files 32 | counter=0 33 | 34 | # Loop through each .json.gz file in the directory 35 | for file in $(ls "$DIRECTORY"/*.json.gz 2>/dev/null | sort); do 36 | if [[ -f "$file" ]]; then 37 | echo "Processing $file..." 38 | counter=$((counter + 1)) 39 | 40 | # Uncompress the file into the TEMP_DIR 41 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 42 | gunzip -c "$file" > "$uncompressed_file" 43 | 44 | # Check if uncompression was successful 45 | if [[ $? -ne 0 ]]; then 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 47 | continue 48 | fi 49 | 50 | # Import the uncompressed JSON file into MongoDB 51 | mongoimport --uri "$MONGO_URI" --db "$DB_NAME" --collection "$COLLECTION_NAME" --file "$uncompressed_file" 52 | import_status=$? 53 | 54 | # Check if the import was successful 55 | if [[ $import_status -eq 0 ]]; then 56 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $uncompressed_file into MongoDB." >> "$SUCCESS_LOG" 57 | else 58 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $uncompressed_file into MongoDB." >> "$ERROR_LOG" 59 | fi 60 | 61 | # Remove the uncompressed file after processing 62 | rm -f "$uncompressed_file" 63 | 64 | # Stop processing if the max number of files is reached 65 | if [[ $counter -ge $MAX_FILES ]]; then 66 | echo "Processed maximum number of files: $MAX_FILES" 67 | break 68 | fi 69 | fi 70 | done 71 | 72 | if [[ $counter -eq 0 ]]; then 73 | echo "No .json.gz files found in the directory." 74 | fi 75 | 76 | echo "All files have been processed." -------------------------------------------------------------------------------- /mongodb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local compression=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky_${size}m_${compression}" bluesky "ddl_${compression}.js" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.total_size" 43 | ./data_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.data_size" 44 | ./index_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_size" 45 | ./count.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.count" 46 | #./query_results.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.query_results" 47 | ./index_usage.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_usage" 48 | ./benchmark.sh "bluesky_${size}m_${compression}" "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.results_runtime" 49 | } 50 | 51 | case $choice in 52 | 2) 53 | benchmark 10 snappy 54 | benchmark 10 zstd 55 | ;; 56 | 3) 57 | benchmark 100 snappy 58 | benchmark 100 zstd 59 | ;; 60 | 4) 61 | benchmark 1000 snappy 62 | benchmark 1000 zstd 63 | ;; 64 | 5) 65 | benchmark 1 snappy 66 | benchmark 1 zstd 67 | benchmark 10 snappy 68 | benchmark 10 zstd 69 | benchmark 100 snappy 70 | benchmark 100 zstd 71 | benchmark 1000 snappy 72 | benchmark 1000 zstd 73 | ;; 74 | *) 75 | benchmark 1 snappy 76 | benchmark 1 zstd 77 | ;; 78 | esac -------------------------------------------------------------------------------- /mongodb/queries.js: -------------------------------------------------------------------------------- 1 | db.bluesky.aggregate([ { $group: { _id: "$commit.collection", count: { $sum: 1 } } }, { $sort: { count: -1 } } ]); 2 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create" } }, { $group: { _id: "$commit.collection", count: { $sum: 1 }, users: { $addToSet: "$did" } } }, { $project: { event: "$_id", count: 1, users: { $size: "$users" } } }, { $sort: { count: -1 } } ]); 3 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": { $in: ["app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like"] } } }, { $project: { _id: 0, event: "$commit.collection", hour_of_day: { $hour: { $toDate: { $divide: ["$time_us", 1000] } } } } }, { $group: { _id: { event: "$event", hour_of_day: "$hour_of_day" }, count: { $sum: 1 } } }, { $sort: { "_id.hour_of_day": 1, "_id.event": 1 } } ]); 4 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", first_post_ts: { $min: "$timestamp" } } }, { $sort: { first_post_ts: 1 } }, { $limit: 3 } ]); 5 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", min_timestamp: { $min: "$timestamp" }, max_timestamp: { $max: "$timestamp" } } }, { $project: { activity_span: { $dateDiff: { startDate: "$min_timestamp", endDate: "$max_timestamp", unit: "millisecond" } } } }, { $sort: { activity_span: -1 } }, { $limit: 3 } ]); 
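For a quick manual check outside the benchmark scripts, Q1 can be run directly with mongosh against a database created by main.sh; a sketch follows (the database name assumes the 1m/snappy run):

# Sketch only: run Q1 by hand and print the result.
mongosh --quiet --eval '
  db = db.getSiblingDB("bluesky_1m_snappy");
  printjson(db.bluesky.aggregate([
    { $group: { _id: "$commit.collection", count: { $sum: 1 } } },
    { $sort: { count: -1 } }
  ]).toArray());
'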
-------------------------------------------------------------------------------- /mongodb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | # File containing MongoDB queries (replace 'queries.js' with your file) 15 | QUERY_FILE="queries.js" 16 | 17 | # Check if the query file exists 18 | if [[ ! -f "$QUERY_FILE" ]]; then 19 | echo "Error: Query file '$QUERY_FILE' does not exist." 20 | exit 1 21 | fi 22 | 23 | # Read and execute each query 24 | cat "$QUERY_FILE" | while read -r query; do 25 | 26 | # Print the query 27 | echo "------------------------------------------------------------------------------------------------------------------------" 28 | echo "Result for query Q$QUERY_NUM:" 29 | echo 30 | 31 | # Escape the query for safe passing to mongosh 32 | ESCAPED_QUERY=$(echo "$query" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 33 | 34 | mongosh --eval " 35 | const db = db.getSiblingDB('$DB_NAME'); 36 | const result = eval(\"$ESCAPED_QUERY\"); 37 | printjson(result); 38 | " 39 | 40 | 41 | # Increment the query number 42 | QUERY_NUM=$((QUERY_NUM + 1)) 43 | 44 | done -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1000m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "snappy", 15 | "total_size": 248580009984, 16 | "total_size_readable": "248.58 GB", 17 | "data_size": 221332566016, 18 | "data_size_readable": "221.33 GB", 19 | "index_size": 27247443968, 20 | "index_size_readable": "27.25 GB", 21 | "result": [ 22 | [987.157,984.441,988.002], 23 | [21177.1,21130,21520.6], 24 | [1242.34,1229.8,1235.88], 25 | [161.845,162.1,162.285], 26 | [166.271,166.134,165.7] 27 | ], 28 | "result_readable": [ 29 | "16 min 27.16 sec, 16 min 24.44 sec, 16 min 28.00 sec", 30 | "5 hr 52 min 57.10 sec, 5 hr 52 min 10.00 sec, 5 hr 58 min 40.60 sec", 31 | "20 min 42.34 sec, 20 min 29.80 sec, 20 min 35.88 sec", 32 | "2 min 41.84 sec, 2 min 42.10 sec, 2 min 42.28 sec", 33 | "2 min 46.27 sec, 2 min 46.13 sec, 2 min 45.70 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1000m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "zstd", 15 | "total_size": 158276513792, 16 | "total_size_readable": "158.28 GB", 17 | "data_size": 130998910976, 18 | "data_size_readable": "131.00 GB", 19 | "index_size": 27277602816, 20 | "index_size_readable": "27.28 GB", 21 | "result": [ 22 | 
[992.518,977.968,983.272], 23 | [21558.6,21530.5,21379.2], 24 | [1238.16,1231,1242.69], 25 | [162.668,162.236,162.897], 26 | [165.783,166.72,165.989] 27 | ], 28 | "result_readable": [ 29 | "16 min 32.52 sec, 16 min 17.97 sec, 16 min 23.27 sec", 30 | "5 hr 59 min 18.60 sec, 5 hr 58 min 50.50 sec, 5 hr 56 min 19.20 sec", 31 | "20 min 38.16 sec, 20 min 31.00 sec, 20 min 42.69 sec", 32 | "2 min 42.67 sec, 2 min 42.24 sec, 2 min 42.90 sec", 33 | "2 min 45.78 sec, 2 min 46.72 sec, 2 min 45.99 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_100m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "snappy", 15 | "total_size": 26336047104, 16 | "total_size_readable": "26.34 GB", 17 | "data_size": 23520026624, 18 | "data_size_readable": "23.52 GB", 19 | "index_size": 2816020480, 20 | "index_size_readable": "2.82 GB", 21 | "result": [ 22 | [8.824,8.558,8.519], 23 | [32.831,34.321,34.477], 24 | [10.825,10.725,10.763], 25 | [1.723,1.762,1.736], 26 | [1.841,1.869,1.857] 27 | ], 28 | "result_readable": [ 29 | "8.82 sec, 8.56 sec, 8.52 sec", 30 | "32.83 sec, 34.32 sec, 34.48 sec", 31 | "10.82 sec, 10.72 sec, 10.76 sec", 32 | "1.72 sec, 1.76 sec, 1.74 sec", 33 | "1.84 sec, 1.87 sec, 1.86 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_100m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "zstd", 15 | "total_size": 16577900544, 16 | "total_size_readable": "16.57 GB", 17 | "data_size": 13758480384, 18 | "data_size_readable": "13.76 GB", 19 | "index_size": 2819420160, 20 | "index_size_readable": "2.82 GB", 21 | "result": [ 22 | [105.051,102.811,101.4], 23 | [1440.91,1472.03,1440.5], 24 | [123.522,123.082,123.149], 25 | [17.125,17.251,17.238], 26 | [17.962,17.913,17.852] 27 | ], 28 | "result_readable": [ 29 | "1 min 45.05 sec, 1 min 42.81 sec, 1 min 41.40 sec", 30 | "24 min 0.91 sec, 24 min 32.03 sec, 24 min 0.50 sec", 31 | "2 min 3.52 sec, 2 min 3.08 sec, 2 min 3.15 sec", 32 | "17.12 sec, 17.25 sec, 17.24 sec", 33 | "17.96 sec, 17.91 sec, 17.85 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_10m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 
7860000, 14 | "data_compression": "snappy", 15 | "total_size": 2200559616, 16 | "total_size_readable": "2.20 GB", 17 | "data_size": 1947783168, 18 | "data_size_readable": "1.95 GB", 19 | "index_size": 252776448, 20 | "index_size_readable": "252.78", 21 | "result": [ 22 | [8.824,8.558,8.519], 23 | [32.831,34.321,34.477], 24 | [10.825,10.725,10.763], 25 | [1.723,1.762,1.736], 26 | [1.841,1.869,1.857] 27 | ], 28 | "result_readable": [ 29 | "8.82 sec, 8.56 sec, 8.52 sec", 30 | "32.83 sec, 34.32 sec, 34.48 sec", 31 | "10.82 sec, 10.72 sec, 10.76 sec", 32 | "1.72 sec, 1.76 sec, 1.74 sec", 33 | "1.84 sec, 1.87 sec, 1.86 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_10m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7860000, 14 | "data_compression": "zstd", 15 | "total_size": 1465458688, 16 | "total_size_readable": "1.47 GB", 17 | "data_size": 1212796928, 18 | "data_size_readable": "1.21 GB", 19 | "index_size": 252661760, 20 | "index_size_readable": "252.66 MB", 21 | "result": [ 22 | [8.845,8.541,8.476], 23 | [30.208,31.527,33.274], 24 | [10.843,10.723,10.697], 25 | [1.721,1.714,1.741], 26 | [1.851,1.859,1.843] 27 | ], 28 | "result_readable": [ 29 | "8.85 sec, 8.54 sec, 8.48 sec", 30 | "30.21 sec, 31.53 sec, 33.27 sec", 31 | "10.84 sec, 10.72 sec, 10.70 sec", 32 | "1.72 sec, 1.71 sec, 1.74 sec", 33 | "1.85 sec, 1.86 sec, 1.84 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "snappy", 15 | "total_size": 287158272, 16 | "total_size_readable": "287.16 MB", 17 | "data_size": 253288448, 18 | "data_size_readable": "253.29 MB", 19 | "index_size": 33869824, 20 | "index_size_readable": "33.87 MB", 21 | "result": [ 22 | [1.157,1.098,1.102], 23 | [1.742,1.747,1.768], 24 | [1.344,1.335,1.326], 25 | [0.224,0.22,0.224], 26 | [0.243,0.239,0.242] 27 | ], 28 | "result_readable": [ 29 | "1.16 sec, 1.10 sec, 1.10 sec", 30 | "1.74 sec, 1.75 sec, 1.77 sec", 31 | "1.34 sec, 1.33 sec, 1.33 sec", 32 | "224.00 msec, 220.00 msec, 224.00 msec", 33 | "243.00 msec, 239.00 msec, 242.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": 
"1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 189222912, 16 | "total_size_readable": "189.22 MB", 17 | "data_size": 155348992, 18 | "data_size_readable": "155.35 MB", 19 | "index_size": 33873920, 20 | "index_size_readable": "33.87 MB", 21 | "result": [ 22 | [1.159,1.108,1.1], 23 | [1.765,1.717,1.737], 24 | [1.37,1.34,1.325], 25 | [0.22,0.248,0.228], 26 | [0.241,0.242,0.244] 27 | ], 28 | "result_readable": [ 29 | "1.16 sec, 1.11 sec, 1.10 sec", 30 | "1.76 sec, 1.72 sec, 1.74 sec", 31 | "1.37 sec, 1.34 sec, 1.32 sec", 32 | "220.00 msec, 248.00 msec, 228.00 msec", 33 | "241.00 msec, 242.00 msec, 244.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1000m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "snappy", 15 | "total_size": 245411815424, 16 | "total_size_readable": "245.41 GB", 17 | "data_size": 221332566016, 18 | "data_size_readable": "221.33 GB", 19 | "index_size": 24079249408, 20 | "index_size_readable": "24.08 GB", 21 | "result": [ 22 | [1974.18,1956.12,1957.9], 23 | [45189.1,43107.3,42923.6], 24 | [6354.06,6348.55,6347.45], 25 | [2031.61,2022.65,2020.14], 26 | [2054.2,2041.93,2040.19] 27 | ], 28 | "result_readable": [ 29 | "32 min 54.18 sec, 32 min 36.12 sec, 32 min 37.90 sec", 30 | "12 hr 33 min 9.10 sec, 11 hr 58 min 27.30 sec, 11 hr 55 min 23.60 sec", 31 | "1 hr 45 min 54.06 sec, 1 hr 45 min 48.55 sec, 1 hr 45 min 47.45 sec", 32 | "33 min 51.61 sec, 33 min 42.65 sec, 33 min 40.14 sec", 33 | "34 min 14.20 sec, 34 min 1.93 sec, 34 min 0.19 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1000m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "zstd", 15 | "total_size": 155395039232, 16 | "total_size_readable": "155.40 GB", 17 | "data_size": 130998910976, 18 | "data_size_readable": "131.00 GB", 19 | "index_size": 24396128256, 20 | "index_size_readable": "24.40 GB", 21 | "result": [ 22 | [1689.1,1667.27,1664.9], 23 | [41945.3,40318.9,40753.6], 24 | [5459.58,5456.72,5451.42], 25 | [1692.29,1674.86,1674.57], 26 | [1714.64,1698.16,1699.13] 27 | ], 28 | "result_readable": [ 29 | "28 min 9.10 sec, 27 min 47.27 sec, 27 min 44.90 sec", 30 | "11 hr 39 min 5.30 sec, 11 hr 11 min 58.90 sec, 11 hr 19 min 13.60 sec", 31 | "1 hr 30 min 59.58 sec, 1 hr 30 min 56.72 sec, 1 hr 30 min 51.42 sec", 32 | "28 min 12.29 sec, 27 min 54.86 sec, 27 min 54.57 sec", 33 | "28 min 34.64 sec, 28 min 18.16 sec, 28 min 19.13 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- 
/mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_100m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "snappy", 15 | "total_size": 25923018752, 16 | "total_size_readable": "25.92 GB", 17 | "data_size": 23520026624, 18 | "data_size_readable": "23.52 GB", 19 | "index_size": 2402992128, 20 | "index_size_readable": "2.40 GB", 21 | "result": [ 22 | [99.24,64.444,48.249], 23 | [4129.17,3504.34,2926.89], 24 | [188.384,164.718,154.082], 25 | [36.798,35.852,37.303], 26 | [36.978,38.413,41.326] 27 | ], 28 | "result_readable": [ 29 | "1 min 39.24 sec, 1 min 4.44 sec, 48.25 sec", 30 | "1 hr 8 min 49.17 sec, 58 min 24.34 sec, 48 min 46.89 sec", 31 | "3 min 8.38 sec, 2 min 44.72 sec, 2 min 34.08 sec", 32 | "36.80 sec, 35.85 sec, 37.30 sec", 33 | "36.98 sec, 38.41 sec, 41.33 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_100m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "zstd", 15 | "total_size": 16198197248, 16 | "total_size_readable": "16.20 GB", 17 | "data_size": 13758480384, 18 | "data_size_readable": "13.76 GB", 19 | "index_size": 2439716864, 20 | "index_size_readable": "2.44 GB", 21 | "result": [ 22 | [171.13,56.34,50.349], 23 | [2673.28,2573.21,2576.39], 24 | [208.486,183.036,160.959], 25 | [45.227,41.365,43.751], 26 | [47.175,44.368,41.472] 27 | ], 28 | "result_readable": [ 29 | "2 min 51.13 sec, 56.34 sec, 50.35 sec", 30 | "44 min 33.28 sec, 42 min 53.21 sec, 42 min 56.39 sec", 31 | "3 min 28.49 sec, 3 min 3.04 sec, 2 min 40.96 sec", 32 | "45.23 sec, 41.37 sec, 43.75 sec", 33 | "47.17 sec, 44.37 sec, 41.47 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_10m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7860000, 14 | "data_compression": "snappy", 15 | "total_size": 2145968128, 16 | "total_size_readable": "2.15 GB", 17 | "data_size": 1947783168, 18 | "data_size_readable": "1.95 GB", 19 | "index_size": 198184960, 20 | "index_size_readable": "198.18", 21 | "result": [ 22 | [16.363,2.861,2.807], 23 | [33.32,33.482,33.416], 24 | [11.605,11.562,11.561], 25 | [1.88,1.932,1.899], 26 | [2.029,2.025,2.028] 27 | ], 28 | "result_readable": [ 29 | "16.36 sec, 2.86 sec, 2.81 sec", 30 | "33.32 sec, 
33.48 sec, 33.42 sec", 31 | "11.61 sec, 11.56 sec, 11.56 sec", 32 | "1.88 sec, 1.93 sec, 1.90 sec", 33 | "2.03 sec, 2.02 sec, 2.03 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_10m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7860000, 14 | "data_compression": "zstd", 15 | "total_size": 1411428352, 16 | "total_size_readable": "1.41 GB", 17 | "data_size": 1212796928, 18 | "data_size_readable": "1.21 GB", 19 | "index_size": 198631424, 20 | "index_size_readable": "198.63 MB", 21 | "result": [ 22 | [15.462,3.001,2.833], 23 | [33.838,33.59,33.529], 24 | [11.42,11.431,11.508], 25 | [1.854,1.9,1.908], 26 | [2.043,2.013,2.014] 27 | ], 28 | "result_readable": [ 29 | "15.46 sec, 3.00 sec, 2.83 sec", 30 | "33.84 sec, 33.59 sec, 33.53 sec", 31 | "11.42 sec, 11.43 sec, 11.51 sec", 32 | "1.85 sec, 1.90 sec, 1.91 sec", 33 | "2.04 sec, 2.01 sec, 2.01 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "snappy", 15 | "total_size": 279212032, 16 | "total_size_readable": "279.21 MB", 17 | "data_size": 253288448, 18 | "data_size_readable": "253.29 MB", 19 | "index_size": 25923584, 20 | "index_size_readable": "25.92 MB", 21 | "result": [ 22 | [0.375,0.365,0.368], 23 | [1.789,1.759,1.752], 24 | [1.431,1.421,1.436], 25 | [0.237,0.235,0.235], 26 | [0.257,0.255,0.263] 27 | ], 28 | "result_readable": [ 29 | "375.00 msec, 365.00 msec, 368.00 msec", 30 | "1.79 sec, 1.76 sec, 1.75 sec", 31 | "1.43 sec, 1.42 sec, 1.44 sec", 32 | "237.00 msec, 235.00 msec, 235.00 msec", 33 | "257.00 msec, 255.00 msec, 263.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 179613696, 16 | "total_size_readable": "179.61 MB", 17 | "data_size": 155348992, 18 | "data_size_readable": "155.35 MB", 19 | "index_size": 24264704, 20 | "index_size_readable": "24.26 MB", 21 | "result": [ 22 | [1.017,0.377,0.365], 23 | [1.802,1.759,1.746], 24 | [1.419,1.411,1.439], 25 | [0.235,0.238,0.235], 26 | [0.254,0.266,0.257] 27 | ], 28 | 
"result_readable": [ 29 | "1.02 sec, 377.00 msec, 365.00 msec", 30 | "1.80 sec, 1.76 sec, 1.75 sec", 31 | "1.42 sec, 1.41 sec, 1.44 sec", 32 | "235.00 msec, 238.00 msec, 235.00 msec", 33 | "254.00 msec, 266.00 msec, 257.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | # Number of tries for each query 13 | TRIES=3 14 | 15 | # File containing MongoDB queries (replace 'queries.js' with your file) 16 | QUERY_FILE="queries.js" 17 | 18 | # Check if the query file exists 19 | if [[ ! -f "$QUERY_FILE" ]]; then 20 | echo "Error: Query file '$QUERY_FILE' does not exist." 21 | exit 1 22 | fi 23 | 24 | # Set the internalQueryMaxAddToSetBytes parameter to 1 GB 25 | echo "Setting internalQueryMaxAddToSetBytes to 1 GB..." 26 | mongosh --quiet --eval " 27 | const result = db.adminCommand({ setParameter: 1, internalQueryMaxAddToSetBytes: 1073741824 }); 28 | if (result.ok !== 1) { 29 | print('Failed to set internalQueryMaxAddToSetBytes: ' + JSON.stringify(result)); 30 | quit(1); 31 | } else { 32 | print('Successfully set internalQueryMaxAddToSetBytes to 1 GB'); 33 | } 34 | " 35 | 36 | # Set the internalQueryPlannerGenerateCoveredWholeIndexScans parameter to true 37 | echo "Setting internalQueryPlannerGenerateCoveredWholeIndexScans to true..." 38 | mongosh --quiet --eval " 39 | const result = db.adminCommand({ setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true }); 40 | if (result.ok !== 1) { 41 | print('Failed to set internalQueryPlannerGenerateCoveredWholeIndexScans: ' + JSON.stringify(result)); 42 | quit(1); 43 | } else { 44 | print('Successfully set internalQueryPlannerGenerateCoveredWholeIndexScans to true'); 45 | } 46 | " 47 | 48 | # Read and execute each query 49 | cat "$QUERY_FILE" | while read -r query; do 50 | 51 | # Clear the Linux file system cache 52 | echo "Clearing file system cache..." 53 | sync 54 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 55 | echo "File system cache cleared." 56 | 57 | # Print the query 58 | echo "Running query: $query" 59 | 60 | # Escape the query for safe passing to mongosh 61 | ESCAPED_QUERY=$(echo "$query" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 62 | 63 | # Execute the query multiple times 64 | for i in $(seq 1 $TRIES); do 65 | mongosh --quiet --eval " 66 | const db = db.getSiblingDB('$DB_NAME'); 67 | const start = new Date(); 68 | const result = eval(\"$ESCAPED_QUERY\"); 69 | // Force query execution -> When using commands like aggregate() or find(), 70 | // the query is not fully executed until the data is actually fetched or processed. 
71 | if (Array.isArray(result)) { 72 | result.length; // Access the length to force evaluation for arrays 73 | } else if (typeof result === 'object' && typeof result.toArray === 'function') { 74 | result.toArray(); // Force execution for cursors 75 | } 76 | const end = new Date(); 77 | print('Execution time: ' + (end.getTime() - start.getTime()) + 'ms'); 78 | " 79 | done 80 | done -------------------------------------------------------------------------------- /mongodb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /postgresql/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Time: \d+\.\d+ ms' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /postgresql/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Corrected SQL query 14 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT count(*) from $TABLE_NAME" -------------------------------------------------------------------------------- /postgresql/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! 
-d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | # Create database 24 | sudo -u postgres psql -t -c "CREATE DATABASE $DB_NAME" 25 | 26 | # Execute DDL 27 | sudo -u postgres psql "$DB_NAME" -t < "$DDL_FILE" 28 | 29 | # Load data 30 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 31 | 32 | # Vacuum analyze the table 33 | sudo -u postgres psql "$DB_NAME" -t -c "VACUUM ANALYZE $TABLE_NAME" 34 | 35 | echo "Script completed successfully." -------------------------------------------------------------------------------- /postgresql/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_table_size('$TABLE_NAME')" -------------------------------------------------------------------------------- /postgresql/ddl_lz4.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | data JSONB COMPRESSION lz4 NOT NULL 3 | ); 4 | 5 | CREATE INDEX idx_bluesky 6 | ON bluesky ( 7 | (data ->> 'kind'), 8 | (data -> 'commit' ->> 'operation'), 9 | (data -> 'commit' ->> 'collection'), 10 | (data ->> 'did'), 11 | (TO_TIMESTAMP((data ->> 'time_us')::BIGINT / 1000000.0)) 12 | ); -------------------------------------------------------------------------------- /postgresql/ddl_pglz.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | data JSONB COMPRESSION pglz NOT NULL 3 | ); 4 | 5 | CREATE INDEX idx_bluesky 6 | ON bluesky ( 7 | (data ->> 'kind'), 8 | (data -> 'commit' ->> 'operation'), 9 | (data -> 'commit' ->> 'collection'), 10 | (data ->> 'did'), 11 | (TO_TIMESTAMP((data ->> 'time_us')::BIGINT / 1000000.0)) 12 | ); -------------------------------------------------------------------------------- /postgresql/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_relation_size(oid) FROM pg_class WHERE relname = 'idx_bluesky'" -------------------------------------------------------------------------------- /postgresql/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | 15 | cat queries.sql | while read -r query; do 16 | 17 | # Print the query number 18 | echo "------------------------------------------------------------------------------------------------------------------------" 19 | echo "Index usage for query Q$QUERY_NUM:" 20 | echo 21 | 22 | sudo -u postgres psql -d "$DB_NAME" -t -c "EXPLAIN $query" 23 | 24 | # Increment the query number 25 | QUERY_NUM=$((QUERY_NUM + 1)) 26 | 27 | done; 
-------------------------------------------------------------------------------- /postgresql/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.postgresql.org/download/linux/ubuntu/ 4 | 5 | sudo apt-get update 6 | sudo apt-get install -y postgresql-common 7 | sudo apt-get install -y postgresql-16 -------------------------------------------------------------------------------- /postgresql/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | PSQL_CMD="sudo -u postgres psql -d $DB_NAME" 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | # Create a temporary directory in /var/tmp and ensure it's accessible 28 | TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX) 29 | chmod 777 "$TEMP_DIR" # Allow access for all users 30 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 31 | 32 | # Counter to track processed files 33 | counter=0 34 | 35 | # Loop through each .json.gz file in the directory 36 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 37 | if [[ -f "$file" ]]; then 38 | echo "Processing $file..." 39 | counter=$((counter + 1)) 40 | 41 | # Uncompress the file into the temporary directory 42 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 43 | gunzip -c "$file" > "$uncompressed_file" 44 | 45 | # Check if uncompression was successful 46 | if [[ $? -ne 0 ]]; then 47 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 48 | continue 49 | fi 50 | 51 | # Preprocess the file to remove null characters 52 | cleaned_file="$TEMP_DIR/$(basename "${uncompressed_file%.json}_cleaned.json")" 53 | sed 's/\\u0000//g' "$uncompressed_file" > "$cleaned_file" 54 | 55 | # Grant read permissions for the postgres user 56 | chmod 644 "$cleaned_file" 57 | 58 | # Import the cleaned JSON file into PostgreSQL 59 | $PSQL_CMD -c "\COPY $TABLE_NAME FROM '$cleaned_file' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');" 60 | import_status=$? 61 | 62 | # Check if the import was successful 63 | if [[ $import_status -eq 0 ]]; then 64 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $cleaned_file into PostgreSQL." >> "$SUCCESS_LOG" 65 | # Delete both the uncompressed and cleaned files after successful processing 66 | rm -f "$uncompressed_file" "$cleaned_file" 67 | else 68 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $cleaned_file. See errors above." >> "$ERROR_LOG" 69 | # Keep the files for debugging purposes 70 | fi 71 | 72 | # Stop processing if the max number of files is reached 73 | if [[ $counter -ge $MAX_FILES ]]; then 74 | echo "Processed maximum number of files: $MAX_FILES" 75 | break 76 | fi 77 | else 78 | echo "No .json.gz files found in the directory." 79 | fi 80 | done 81 | 82 | echo "All files have been processed." 
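Note: load_data.sh loads newline-delimited JSON into the single JSONB column by using COPY's csv format with control characters as quote, delimiter and escape (\x01 and \x02 never occur in the data), so every input line is read as exactly one field and cast to JSONB. It also strips \u0000 escapes first, because jsonb cannot store the NUL code point. Below is a minimal sketch of loading one file by hand with the same trick; the database name and file path are assumed examples.

#!/bin/bash
# Hedged sketch (not part of the repo): load a single newline-delimited JSON file
# into the bluesky table the same way load_data.sh does.
DB_NAME="${1:-bluesky_1m_lz4}"
FILE="${2:-/var/tmp/file_0001.json}"
CLEANED="/var/tmp/$(basename "$FILE" .json)_cleaned.json"

# jsonb rejects \u0000, so remove it first and make the file readable for postgres.
sed 's/\\u0000//g' "$FILE" > "$CLEANED"
chmod 644 "$CLEANED"

# Control-character quote/delimiter/escape: each line becomes one CSV field that
# PostgreSQL casts to the JSONB column of the bluesky table.
sudo -u postgres psql -d "$DB_NAME" \
  -c "\COPY bluesky FROM '$CLEANED' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');"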
-------------------------------------------------------------------------------- /postgresql/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local compression=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky_${size}m_${compression}" bluesky "ddl_${compression}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.total_size" 43 | ./data_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.data_size" 44 | ./index_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_size" 45 | ./count.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.count" 46 | ./index_usage.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_usage" 47 | #./query_results.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.query_results" 48 | ./benchmark.sh "bluesky_${size}m_${compression}" "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.results_runtime" 49 | } 50 | 51 | case $choice in 52 | 2) 53 | benchmark 10 lz4 54 | benchmark 10 pglz 55 | ;; 56 | 3) 57 | benchmark 100 lz4 58 | benchmark 100 pglz 59 | ;; 60 | 4) 61 | benchmark 1000 lz4 62 | benchmark 1000 pglz 63 | ;; 64 | 5) 65 | benchmark 1 lz4 66 | benchmark 1 pglz 67 | benchmark 10 lz4 68 | benchmark 10 pglz 69 | benchmark 100 lz4 70 | benchmark 100 pglz 71 | benchmark 1000 lz4 72 | benchmark 1000 pglz 73 | ;; 74 | *) 75 | benchmark 1 lz4 76 | benchmark 1 pglz 77 | ;; 78 | esac -------------------------------------------------------------------------------- /postgresql/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count, COUNT(DISTINCT data ->> 'did') AS users FROM bluesky WHERE data ->> 'kind' = 'commit' AND data -> 'commit' ->> 'operation' = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data->'commit'->>'collection' AS event, EXTRACT(HOUR FROM 
TO_TIMESTAMP((data->>'time_us')::BIGINT / 1000000)) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data->>'did' AS user_id, MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) AS first_post_ts FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data->>'did' AS user_id, EXTRACT(EPOCH FROM ( MAX( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) - MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) )) * 1000 AS activity_span FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; -------------------------------------------------------------------------------- /postgresql/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | data -> 'commit' ->> 'collection' AS event, 6 | COUNT(*) as count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | SELECT 15 | data -> 'commit' ->> 'collection' AS event, 16 | COUNT(*) as count, 17 | COUNT(DISTINCT data ->> 'did') AS users 18 | FROM bluesky 19 | WHERE data ->> 'kind' = 'commit' 20 | AND data -> 'commit' ->> 'operation' = 'create' 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | SELECT 28 | data->'commit'->>'collection' AS event, 29 | EXTRACT(HOUR FROM TO_TIMESTAMP((data->>'time_us')::BIGINT / 1000000)) AS hour_of_day, 30 | COUNT(*) AS count 31 | FROM bluesky 32 | WHERE data->>'kind' = 'commit' 33 | AND data->'commit'->>'operation' = 'create' 34 | AND data->'commit'->>'collection' IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') 35 | GROUP BY event, hour_of_day 36 | ORDER BY hour_of_day, event; 37 | 38 | ------------------------------------------------------------------------------------------------------------------------ 39 | -- Q4 - top 3 post veterans 40 | ------------------------------------------------------------------------------------------------------------------------ 41 | SELECT 42 | data->>'did' AS user_id, 43 | MIN( 44 | TIMESTAMP WITH TIME ZONE 'epoch' + 45 | INTERVAL '1 microsecond' * 
(data->>'time_us')::BIGINT 46 | ) AS first_post_ts 47 | FROM bluesky 48 | WHERE data->>'kind' = 'commit' 49 | AND data->'commit'->>'operation' = 'create' 50 | AND data->'commit'->>'collection' = 'app.bsky.feed.post' 51 | GROUP BY user_id 52 | ORDER BY first_post_ts ASC 53 | LIMIT 3; 54 | 55 | ------------------------------------------------------------------------------------------------------------------------ 56 | -- Q5 - top 3 users with longest activity 57 | ------------------------------------------------------------------------------------------------------------------------ 58 | SELECT 59 | data->>'did' AS user_id, 60 | EXTRACT(EPOCH FROM ( 61 | MAX( 62 | TIMESTAMP WITH TIME ZONE 'epoch' + 63 | INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT 64 | ) - 65 | MIN( 66 | TIMESTAMP WITH TIME ZONE 'epoch' + 67 | INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT 68 | ) 69 | )) * 1000 AS activity_span 70 | FROM bluesky 71 | WHERE data->>'kind' = 'commit' 72 | AND data->'commit'->>'operation' = 'create' 73 | AND data->'commit'->>'collection' = 'app.bsky.feed.post' 74 | GROUP BY user_id 75 | ORDER BY activity_span DESC 76 | LIMIT 3; -------------------------------------------------------------------------------- /postgresql/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | sudo -u postgres psql -d "$DB_NAME" -c "$query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /postgresql/results/_query_results/_m6i.8xlarge_bluesky_1m_lz4.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | event | count 5 | ----------------------------+-------- 6 | app.bsky.feed.like | 448944 7 | app.bsky.graph.follow | 360374 8 | app.bsky.feed.post | 90816 9 | app.bsky.feed.repost | 58540 10 | app.bsky.graph.block | 14040 11 | app.bsky.actor.profile | 11762 12 | app.bsky.graph.listitem | 8103 13 | | 5328 14 | app.bsky.graph.listblock | 895 15 | app.bsky.graph.starterpack | 405 16 | app.bsky.graph.list | 356 17 | app.bsky.feed.threadgate | 255 18 | app.bsky.feed.postgate | 104 19 | app.bsky.feed.generator | 74 20 | app.bsky.labeler.service | 4 21 | (15 rows) 22 | 23 | ------------------------------------------------------------------------------------------------------------------------ 24 | Result for query Q2: 25 | 26 | event | count | users 27 | ----------------------------+--------+-------- 28 | app.bsky.feed.like | 444523 | 117617 29 | app.bsky.graph.follow | 337978 | 63957 30 | app.bsky.feed.post | 86812 | 50464 31 | app.bsky.feed.repost | 56993 | 26581 32 | app.bsky.graph.block | 13838 | 5785 33 | app.bsky.graph.listitem | 7568 | 1078 34 | app.bsky.actor.profile | 5337 | 5337 35 | app.bsky.graph.listblock | 860 | 449 36 | app.bsky.graph.list | 259 | 218 37 | 
app.bsky.feed.threadgate | 228 | 196 38 | app.bsky.graph.starterpack | 104 | 101 39 | app.bsky.feed.postgate | 101 | 82 40 | app.bsky.feed.generator | 10 | 9 41 | (13 rows) 42 | 43 | ------------------------------------------------------------------------------------------------------------------------ 44 | Result for query Q3: 45 | 46 | event | hour_of_day | count 47 | ----------------------+-------------+-------- 48 | app.bsky.feed.like | 16 | 444523 49 | app.bsky.feed.post | 16 | 86812 50 | app.bsky.feed.repost | 16 | 56993 51 | (3 rows) 52 | 53 | ------------------------------------------------------------------------------------------------------------------------ 54 | Result for query Q4: 55 | 56 | user_id | first_post_ts 57 | ----------------------------------+------------------------------- 58 | did:plc:yj3sjq3blzpynh27cumnp5ks | 2024-11-21 16:25:49.000167+00 59 | did:plc:l5o3qjrmfztir54cpwlv2eme | 2024-11-21 16:25:49.001905+00 60 | did:plc:s4bwqchfzm6gjqfeb6mexgbu | 2024-11-21 16:25:49.003907+00 61 | (3 rows) 62 | 63 | ------------------------------------------------------------------------------------------------------------------------ 64 | Result for query Q5: 65 | 66 | user_id | activity_span 67 | ----------------------------------+--------------- 68 | did:plc:tsyymlun4eqjuw7hqrhmwagd | 813006.959000 69 | did:plc:3ug235sfy2pz7cawmpsftb65 | 811602.261000 70 | did:plc:doxhhgtxqiv47tmcovpbcqai | 811404.021000 71 | (3 rows) -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1000m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 804000000, 14 | "data_compression": "lz4", 15 | "total_size": 654739636224, 16 | "total_size_readable": "654.74 GB", 17 | "data_size": 506726694912, 18 | "data_size_readable": "506.73 GB", 19 | "index_size": 147981623296, 20 | "index_size_readable": "147.98 GB", 21 | "result": [ 22 | [3863.35,3843.05,3843.04], 23 | [32553.3,32554.5,32553.9], 24 | [4222.61,4212.91,4208.16], 25 | [4890.82,4873.02,4871.11], 26 | [4900.32,4883.62,4873.68] 27 | ], 28 | "result_readable": [ 29 | "1 hr 4 min 23.35 sec, 1 hr 4 min 3.05 sec, 1 hr 4 min 3.04 sec", 30 | "9 hr 2 min 33.30 sec, 9 hr 2 min 34.50 sec, 9 hr 2 min 33.90 sec", 31 | "1 hr 10 min 22.61 sec, 1 hr 10 min 12.91 sec, 1 hr 10 min 8.16 sec", 32 | "1 hr 21 min 30.82 sec, 1 hr 21 min 13.02 sec, 1 hr 21 min 11.11 sec", 33 | "1 hr 21 min 40.32 sec, 1 hr 21 min 23.62 sec, 1 hr 21 min 13.68 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1000m_pglz.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 804000000, 14 | "data_compression": "pglz", 15 | "total_size": 660828643328, 16 | "total_size_readable": "660.83 GB", 17 | "data_size": 512573440000, 
18 | "data_size_readable": "512.57 GB", 19 | "index_size": 148222607360, 20 | "index_size_readable": "148.22 GB", 21 | "result": [ 22 | [3907.81,3891.66,3887.55], 23 | [32598,32600.7,32598], 24 | [4267.02,4246.18,4248.89], 25 | [4902.93,4887.97,4870.85], 26 | [4919.4,4894.7,4914.49] 27 | ], 28 | "result_readable": [ 29 | "1 hr 5 min 7.81 sec, 1 hr 4 min 51.66 sec, 1 hr 4 min 47.55 sec", 30 | "9 hr 3 min 18.00 sec, 9 hr 3 min 20.70 sec, 9 hr 3 min 18.00 sec", 31 | "1 hr 11 min 7.02 sec, 1 hr 10 min 46.18 sec, 1 hr 10 min 48.89 sec", 32 | "1 hr 21 min 42.93 sec, 1 hr 21 min 27.97 sec, 1 hr 21 min 10.85 sec", 33 | "1 hr 21 min 59.40 sec, 1 hr 21 min 34.70 sec, 1 hr 21 min 54.49 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_100m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 91000000, 14 | "data_compression": "lz4", 15 | "total_size": 69069152256, 16 | "total_size_readable": "69.07 GB", 17 | "data_size": 54598787072, 18 | "data_size_readable": "54.60 GB", 19 | "index_size": 14470062080, 20 | "index_size_readable": "14.47 GB", 21 | "result": [ 22 | [416.294,10.3327,10.3401], 23 | [1865.37,1454.93,1457.57], 24 | [439.522,31.3951,31.3661], 25 | [477.483,17.4918,17.2812], 26 | [478.399,19.4407,18.4989] 27 | ], 28 | "result_readable": [ 29 | "6 min 56.29 sec, 10.33 sec, 10.34 sec", 30 | "31 min 5.37 sec, 24 min 14.93 sec, 24 min 17.57 sec", 31 | "7 min 19.52 sec, 31.40 sec, 31.37 sec", 32 | "7 min 57.48 sec, 17.49 sec, 17.28 sec", 33 | "7 min 58.40 sec, 19.44 sec, 18.50 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_100m_pglz.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 91000000, 14 | "data_compression": "pglz", 15 | "total_size": 69085896704, 16 | "total_size_readable": "69.09 GB", 17 | "data_size": 54617038848, 18 | "data_size_readable": "54.62 GB", 19 | "index_size": 14468587520, 20 | "index_size_readable": "14.47 GB", 21 | "result": [ 22 | [416.449,10.4025,10.3982], 23 | [1865.43,1457.18,1455.51], 24 | [439.424,31.7577,31.7307], 25 | [477.52,17.5615,17.6512], 26 | [479.118,18.9199,19.5657] 27 | ], 28 | "result_readable": [ 29 | "6 min 56.45 sec, 10.40 sec, 10.40 sec", 30 | "31 min 5.43 sec, 24 min 17.18 sec, 24 min 15.51 sec", 31 | "7 min 19.42 sec, 31.76 sec, 31.73 sec", 32 | "7 min 57.52 sec, 17.56 sec, 17.65 sec", 33 | "7 min 59.12 sec, 18.92 sec, 19.57 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_10m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | 
"date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7000000, 14 | "data_compression": "lz4", 15 | "total_size": 5759737856, 16 | "total_size_readable": "5.76 GB", 17 | "data_size": 4653178880, 18 | "data_size_readable": "4.65 GB", 19 | "index_size": 1106558976, 20 | "index_size_readable": "1.11 GB", 21 | "result": [ 22 | [35.3379,0.945249,0.938868], 23 | [41.45,9.23267,9.23034], 24 | [36.1707,2.49331,2.4911], 25 | [174.534,1.97768,1.90803], 26 | [175.691,2.11244,2.10785] 27 | ], 28 | "result_readable": [ 29 | "35.34 sec, 945.25 msec, 938.87 msec", 30 | "41.45 sec, 9.23 sec, 9.23 sec", 31 | "36.17 sec, 2.49 sec, 2.49 sec", 32 | "2 min 54.53 sec, 1.98 sec, 1.91 sec", 33 | "2 min 55.69 sec, 2.11 sec, 2.11 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_10m_pglz.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7000000, 14 | "data_compression": "pglz", 15 | "total_size": 5778792448, 16 | "total_size_readable": "5.78 GB", 17 | "data_size": 4652179456, 18 | "data_size_readable": "4.65 GB", 19 | "index_size": 1126612992, 20 | "index_size_readable": "1.13 GB", 21 | "result": [ 22 | [35.1722,0.948157,0.947581], 23 | [48.0881,9.34658,9.3514], 24 | [35.668,2.53469,2.53258], 25 | [174.552,2.07684,1.95298], 26 | [175.98,2.18434,2.21651] 27 | ], 28 | "result_readable": [ 29 | "35.17 sec, 948.16 msec, 947.58 msec", 30 | "48.09 sec, 9.35 sec, 9.35 sec", 31 | "35.67 sec, 2.53 sec, 2.53 sec", 32 | "2 min 54.55 sec, 2.08 sec, 1.95 sec", 33 | "2 min 55.98 sec, 2.18 sec, 2.22 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 731111424, 16 | "total_size_readable": "731.11 MB", 17 | "data_size": 586768384, 18 | "data_size_readable": "586.77 MB", 19 | "index_size": 144343040, 20 | "index_size_readable": "144.34 MB", 21 | "result": [ 22 | [3.63966,0.135007,0.136015], 23 | [36.808,2.1238,2.13312], 24 | [4.05291,0.344643,0.344627], 25 | [15.3146,0.504157,0.224458], 26 | [15.7782,0.238114,0.247374] 27 | ], 28 | "result_readable": [ 29 | "3.64 sec, 135.01 msec, 136.01 msec", 30 | "36.81 sec, 2.12 sec, 2.13 sec", 31 | "4.05 sec, 344.64 msec, 344.63 msec", 32 | "15.31 sec, 504.16 msec, 224.46 msec", 33 | "15.78 sec, 238.11 msec, 247.37 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1m_pglz.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "pglz", 15 | "total_size": 731168768, 16 | "total_size_readable": "731.17 MB", 17 | "data_size": 586825728, 18 | "data_size_readable": "586.83 MB", 19 | "index_size": 144343040, 20 | "index_size_readable": "144.34 MB", 21 | "result": [ 22 | [4.05346,0.135037,0.13426], 23 | [30.2945,2.1009,2.07414], 24 | [4.05195,0.348947,0.347623], 25 | [15.7542,0.50897,0.224949], 26 | [16.1238,0.251958,0.251934] 27 | ], 28 | "result_readable": [ 29 | "4.05 sec, 135.04 msec, 134.26 msec", 30 | "30.29 sec, 2.10 sec, 2.07 sec", 31 | "4.05 sec, 348.95 msec, 347.62 msec", 32 | "15.75 sec, 508.97 msec, 224.95 msec", 33 | "16.12 sec, 251.96 msec, 251.93 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | sudo -u postgres psql -d "$DB_NAME" -t -c '\timing' -c "$query" | grep 'Time' 28 | done; 29 | done; -------------------------------------------------------------------------------- /postgresql/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_total_relation_size('$TABLE_NAME')" --------------------------------------------------------------------------------
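Note: the three PostgreSQL size scripts report complementary numbers: data_size.sh uses pg_table_size (heap plus TOAST, excluding indexes), index_size.sh uses pg_relation_size of idx_bluesky, and total_size.sh uses pg_total_relation_size, which equals pg_table_size plus pg_indexes_size. A minimal sketch for cross-checking all three in one call; the database name is only an example of the repo's naming scheme.

#!/bin/bash
# Hedged sketch (not part of the repo): report total, data and index sizes in one
# query so the outputs of total_size.sh, data_size.sh and index_size.sh can be
# compared. With a single index, pg_indexes_size should be close to the value
# reported by index_size.sh.
DB_NAME="${1:-bluesky_1m_lz4}"

sudo -u postgres psql -d "$DB_NAME" -t -c "
SELECT pg_total_relation_size('bluesky') AS total_size,
       pg_table_size('bluesky')          AS data_size,
       pg_indexes_size('bluesky')        AS index_size;
"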