├── .github └── workflows │ └── generate-results.yml ├── .gitignore ├── CNAME ├── LICENSE ├── README.md ├── _files_gz ├── main.sh ├── results │ ├── _files_bluesky_gz_1000m.json │ ├── _files_bluesky_gz_100m.json │ ├── _files_bluesky_gz_10m.json │ └── _files_bluesky_gz_1m.json └── total_size.sh ├── _files_json ├── load_data.sh ├── main.sh ├── results │ ├── _files_bluesky_json_1000m.json │ ├── _files_bluesky_json_100m.json │ ├── _files_bluesky_json_10m.json │ └── _files_bluesky_json_1m.json └── total_size.sh ├── _files_lz4 ├── load_data.sh ├── main.sh ├── results │ ├── _files_bluesky_lz4_1000m.json │ ├── _files_bluesky_lz4_100m.json │ ├── _files_bluesky_lz4_10m.json │ └── _files_bluesky_lz4_1m.json └── total_size.sh ├── _files_zstd ├── load_data.sh ├── main.sh ├── results │ ├── _files_bluesky_zstd_1000m.json │ ├── _files_bluesky_zstd_100m.json │ ├── _files_bluesky_zstd_10m.json │ └── _files_bluesky_zstd_1m.json └── total_size.sh ├── clickhouse ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── data_size.sh ├── ddl_lz4.sql ├── ddl_zstd.sql ├── index_size.sh ├── index_usage.sh ├── install.sh ├── load_data.sh ├── main.sh ├── physical_query_plans.sh ├── queries.sql ├── queries_formatted.sql ├── query_results.sh ├── results │ ├── _index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_lz4.index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_lz4.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_lz4.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_1m_lz4.index_usage │ │ └── _m6i.8xlarge_bluesky_1m_zstd.index_usage │ ├── _physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1000m_lz4.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_100m_lz4.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_10m_lz4.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1m_lz4.physical_query_plans │ │ └── _m6i.8xlarge_bluesky_1m_zstd.physical_query_plans │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky_1m_lz4.query_results │ ├── m6i.8xlarge_bluesky_1000m_lz4.json │ ├── m6i.8xlarge_bluesky_1000m_zstd.json │ ├── m6i.8xlarge_bluesky_100m_lz4.json │ ├── m6i.8xlarge_bluesky_100m_zstd.json │ ├── m6i.8xlarge_bluesky_10m_lz4.json │ ├── m6i.8xlarge_bluesky_10m_zstd.json │ ├── m6i.8xlarge_bluesky_1m_lz4.json │ └── m6i.8xlarge_bluesky_1m_zstd.json ├── run_queries.sh └── total_size.sh ├── copy_data.sh ├── duckdb ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── ddl.sql ├── install.sh ├── load_data.sh ├── main.sh ├── physical_query_plans.sh ├── queries.sql ├── queries_formatted.sql ├── query_results.sh ├── results │ ├── _physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_1000m.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_100m.physical_query_plans │ │ ├── _m6i.8xlarge_bluesky_10m.physical_query_plans │ │ └── _m6i.8xlarge_bluesky_1m.physical_query_plans │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky_1m.query_results │ ├── m6i.8xlarge_bluesky_1000m.errors │ ├── m6i.8xlarge_bluesky_1000m.json │ ├── m6i.8xlarge_bluesky_100m.json │ ├── m6i.8xlarge_bluesky_10m.json │ └── m6i.8xlarge_bluesky_1m.json ├── run_queries.sh └── total_size.sh ├── elasticsearch ├── benchmark.sh ├── config │ ├── elasticsearch.yml │ ├── filebeat.yml │ ├── ilm.json │ ├── index_template_no_source_best_compression.json │ ├── 
index_template_no_source_default_compression.json │ ├── index_template_source_best_compression.json │ ├── index_template_source_default_compression.json │ └── jvm.options ├── count.sh ├── create_and_load.sh ├── install.sh ├── load_data.sh ├── main.sh ├── queries.txt ├── queries_formatted.txt ├── query_results.sh ├── results │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky-no_source_best_compression-1m.query_results │ ├── m6i.8xlarge_bluesky_no_source_1000m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_1000m_default_compression.json │ ├── m6i.8xlarge_bluesky_no_source_100m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_100m_default_compression.json │ ├── m6i.8xlarge_bluesky_no_source_10m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_10m_default_compression.json │ ├── m6i.8xlarge_bluesky_no_source_1m_best_compression.json │ ├── m6i.8xlarge_bluesky_no_source_1m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_1000m_best_compression.json │ ├── m6i.8xlarge_bluesky_source_1000m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_100m_best_compression.json │ ├── m6i.8xlarge_bluesky_source_100m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_10m_best_compression.json │ ├── m6i.8xlarge_bluesky_source_10m_default_compression.json │ ├── m6i.8xlarge_bluesky_source_1m_best_compression.json │ └── m6i.8xlarge_bluesky_source_1m_default_compression.json ├── run_queries.sh └── total_size.sh ├── favicon.png ├── generate-results.sh ├── index.html ├── mongodb ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── data_size.sh ├── ddl_snappy.js ├── ddl_zstd.js ├── index_size.sh ├── index_usage.sh ├── install.sh ├── load_data.sh ├── main.sh ├── queries.js ├── queries_formatted.js ├── query_results.sh ├── results │ ├── _index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_1m_snappy.index_usage │ │ └── _m6i.8xlarge_bluesky_1m_zstd.index_usage │ ├── _query_results │ │ └── _m6i.8xlarge_bluesky_1m_snappy.query_results │ ├── m6i.8xlarge_bluesky_1000m_snappy.json │ ├── m6i.8xlarge_bluesky_1000m_zstd.json │ ├── m6i.8xlarge_bluesky_100m_snappy.json │ ├── m6i.8xlarge_bluesky_100m_zstd.json │ ├── m6i.8xlarge_bluesky_10m_snappy.json │ ├── m6i.8xlarge_bluesky_10m_zstd.json │ ├── m6i.8xlarge_bluesky_1m_snappy.json │ └── m6i.8xlarge_bluesky_1m_zstd.json ├── results_without_covered_index_scans │ ├── _index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_1000m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_100m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_snappy.index_usage │ │ ├── _m6i.8xlarge_bluesky_10m_zstd.index_usage │ │ ├── _m6i.8xlarge_bluesky_1m_snappy.index_usage │ │ └── _m6i.8xlarge_bluesky_1m_zstd.index_usage │ ├── m6i.8xlarge_bluesky_1000m_snappy.json │ ├── m6i.8xlarge_bluesky_1000m_zstd.json │ ├── m6i.8xlarge_bluesky_100m_snappy.json │ ├── m6i.8xlarge_bluesky_100m_zstd.json │ ├── m6i.8xlarge_bluesky_10m_snappy.json │ ├── m6i.8xlarge_bluesky_10m_zstd.json │ ├── m6i.8xlarge_bluesky_1m_snappy.json │ └── m6i.8xlarge_bluesky_1m_zstd.json ├── run_queries.sh └── total_size.sh └── postgresql ├── benchmark.sh ├── count.sh ├── create_and_load.sh ├── data_size.sh ├── 
ddl_lz4.sql ├── ddl_pglz.sql ├── index_size.sh ├── index_usage.sh ├── install.sh ├── load_data.sh ├── main.sh ├── queries.sql ├── queries_formatted.sql ├── query_results.sh ├── results ├── _index_usage │ ├── _m6i.8xlarge_bluesky_1000m_lz4.index_usage │ ├── _m6i.8xlarge_bluesky_1000m_pglz.index_usage │ ├── _m6i.8xlarge_bluesky_100m_lz4.index_usage │ ├── _m6i.8xlarge_bluesky_100m_pglz.index_usage │ ├── _m6i.8xlarge_bluesky_10m_lz4.index_usage │ ├── _m6i.8xlarge_bluesky_10m_pglz.index_usage │ ├── _m6i.8xlarge_bluesky_1m_lz4.index_usage │ └── _m6i.8xlarge_bluesky_1m_pglz.index_usage ├── _query_results │ └── _m6i.8xlarge_bluesky_1m_lz4.query_results ├── m6i.8xlarge_bluesky_1000m_lz4.json ├── m6i.8xlarge_bluesky_1000m_pglz.json ├── m6i.8xlarge_bluesky_100m_lz4.json ├── m6i.8xlarge_bluesky_100m_pglz.json ├── m6i.8xlarge_bluesky_10m_lz4.json ├── m6i.8xlarge_bluesky_10m_pglz.json ├── m6i.8xlarge_bluesky_1m_lz4.json └── m6i.8xlarge_bluesky_1m_pglz.json ├── run_queries.sh └── total_size.sh /.github/workflows/generate-results.yml: -------------------------------------------------------------------------------- 1 | name: "Generate index.html" 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | env: 14 | CI_COMMIT_MESSAGE: "[bot] update index.html" 15 | CI_COMMIT_AUTHOR: github 16 | steps: 17 | - uses: actions/checkout@v3 18 | - if: github.event.commits[0].message != env.CI_COMMIT_MESSAGE 19 | run: | 20 | bash generate-results.sh 21 | 22 | git config --global user.name "${{ env.CI_COMMIT_AUTHOR }}" 23 | git config --global user.email "${{ env.CI_COMMIT_AUTHOR }}@users.noreply.github.com" 24 | 25 | git add -A 26 | if git status | grep -q modified 27 | then 28 | git commit -m "${{ env.CI_COMMIT_MESSAGE }}" 29 | git push 30 | fi 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .idea 3 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | jsonbench.com -------------------------------------------------------------------------------- /_files_gz/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define prefix for output files 10 | OUTPUT_PREFIX="${2:-_files_gz}" 11 | 12 | # Check if the data directory exists 13 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 14 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 
15 | exit 1 16 | fi 17 | 18 | 19 | # 1m 20 | ./total_size.sh "$DATA_DIRECTORY" 1 | tee "${OUTPUT_PREFIX}_1m.total_size" 21 | 22 | # 10m 23 | ./total_size.sh "$DATA_DIRECTORY" 10 | tee "${OUTPUT_PREFIX}_10m.total_size" 24 | 25 | # 100m 26 | ./total_size.sh "$DATA_DIRECTORY" 100 | tee "${OUTPUT_PREFIX}_100m.total_size" 27 | 28 | # 1000m 29 | ./total_size.sh "$DATA_DIRECTORY" 1000 | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "gz", 13 | "total_size": 134117979655, 14 | "total_size_readable": "134.12 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "gz", 13 | "total_size": 13372936569, 14 | "total_size_readable": "13.37 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "gz", 13 | "total_size": 1354902507, 14 | "total_size_readable": "1.35 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/results/_files_bluesky_gz_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.gz", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": "gz", 13 | "total_size": 135176827, 14 | "total_size_readable": "135.17 MB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_gz/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | N="$2" 12 | 13 | # Validate the data directory 14 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 15 | echo "Error: Directory '$DATA_DIRECTORY' does not exist." 16 | exit 1 17 | fi 18 | 19 | # Validate N is a positive integer 20 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 21 | echo "Error: N must be a positive integer." 
22 | exit 1 23 | fi 24 | 25 | # Get the first N files sorted by filename and calculate their total size 26 | TOTAL_SIZE=$(ls -1 "$DATA_DIRECTORY" | sort | head -n "$N" | while read -r file; do 27 | filepath="$DATA_DIRECTORY/$file" 28 | if [[ -f "$filepath" ]]; then 29 | stat --format="%s" "$filepath" 30 | fi 31 | done | awk '{sum += $1} END {print sum}') 32 | 33 | # Output the total size in bytes 34 | echo $TOTAL_SIZE -------------------------------------------------------------------------------- /_files_json/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Get the sorted list of .json.gz files and extract the first N 33 | count=0 34 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 35 | if [[ $count -ge $N ]]; then 36 | break 37 | fi 38 | 39 | echo "Processing $file..." 40 | gzip -dkc "$file" > "$TARGET_DIRECTORY/$(basename "${file%.gz}")" # Extract to target directory 41 | count=$((count + 1)) 42 | done 43 | 44 | echo "Extraction of $count files completed. Extracted files are in '$TARGET_DIRECTORY'." -------------------------------------------------------------------------------- /_files_json/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_json 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_json}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "none", 13 | "total_size": 482108809691, 14 | "total_size_readable": "482.11 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "none", 13 | "total_size": 47813179260, 14 | "total_size_readable": "47.81 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "none", 13 | "total_size": 4858741288, 14 | "total_size_readable": "4.86 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_json/results/_files_bluesky_json_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": "none", 13 | "total_size": 
480778277,
14 |     "total_size_readable": "480.78 MB"
15 | }
16 | 
--------------------------------------------------------------------------------
/_files_json/total_size.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if the required argument is provided
4 | if [[ $# -lt 1 ]]; then
5 |     echo "Usage: $0 <directory>"
6 |     exit 1
7 | fi
8 | 
9 | # Argument
10 | DIRECTORY="$1"
11 | 
12 | # Check if the directory exists
13 | if [[ ! -d "$DIRECTORY" ]]; then
14 |     echo "Error: Directory '$DIRECTORY' does not exist."
15 |     exit 1
16 | fi
17 | 
18 | # Get the total size in bytes and suppress the directory name
19 | du -sb "$DIRECTORY" | awk '{print $1}'
--------------------------------------------------------------------------------
/_files_lz4/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if required arguments are provided
4 | if [[ $# -lt 3 ]]; then
5 |     echo "Usage: $0 <data_directory> <target_directory> <N>"
6 |     exit 1
7 | fi
8 | 
9 | # Arguments
10 | DATA_DIRECTORY="$1"
11 | TARGET_DIRECTORY="$2"
12 | N="$3"
13 | 
14 | # Validate the source directory
15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then
16 |     echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
17 |     exit 1
18 | fi
19 | 
20 | # Validate the target directory
21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then
22 |     echo "Error: Target directory '$TARGET_DIRECTORY' does not exist."
23 |     exit 1
24 | fi
25 | 
26 | # Validate N is a positive integer
27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then
28 |     echo "Error: N must be a positive integer."
29 |     exit 1
30 | fi
31 | 
32 | # Create a temporary directory inside the current directory
33 | TEMP_DIR="./temp_extraction"
34 | if [[ -d "$TEMP_DIR" ]]; then
35 |     echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..."
36 |     rm -rf "$TEMP_DIR"
37 | fi
38 | 
39 | mkdir -p "$TEMP_DIR"
40 | 
41 | # Trap to ensure cleanup of the temporary directory
42 | trap "rm -rf $TEMP_DIR" EXIT
43 | 
44 | # Process the first N files
45 | count=0
46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do
47 |     if [[ $count -ge $N ]]; then
48 |         break
49 |     fi
50 | 
51 |     echo "Processing $file..."
52 | 
53 |     # Define paths for the temporary extracted file and compressed file
54 |     extracted_file="$TEMP_DIR/$(basename "${file%.gz}")"
55 |     compressed_file="$TEMP_DIR/$(basename "${file%.gz}.lz4")"
56 | 
57 |     # Extract the .json.gz file into the temporary directory
58 |     gzip -c -d "$file" > "$extracted_file"
59 |     if [[ $? -ne 0 ]]; then
60 |         echo "Error: Failed to extract $file to $extracted_file"
61 |         continue
62 |     fi
63 | 
64 |     # Compress the extracted file with lz4
65 |     lz4 "$extracted_file" "$compressed_file"
66 |     if [[ $? -ne 0 ]]; then
67 |         echo "Error: Failed to compress $extracted_file"
68 |         continue
69 |     fi
70 | 
71 |     # Copy the .lz4 file to the target directory
72 |     cp "$compressed_file" "$TARGET_DIRECTORY/"
73 |     if [[ $? -ne 0 ]]; then
74 |         echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY"
75 |         continue
76 |     fi
77 | 
78 |     count=$((count + 1))
79 | done
80 | 
81 | # Cleanup (done automatically by the trap)
82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'."
83 | echo "Temporary directory '$TEMP_DIR' has been deleted."
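
A minimal usage sketch for the script above, assuming the same default layout that _files_lz4/main.sh (the next file) wires in: ~/data/bluesky as the source of .json.gz files and a per-size target subdirectory. Both paths are illustrative, not requirements of the script.

    # Recompress the first 10 source files (roughly the 10m-document slice) from gzip to lz4
    mkdir -p ~/data/bluesky_lz4/10m
    ./load_data.sh ~/data/bluesky ~/data/bluesky_lz4/10m 10

    # Then record the resulting on-disk size in bytes
    ./total_size.sh ~/data/bluesky_lz4/10m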
-------------------------------------------------------------------------------- /_files_lz4/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_lz4 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_lz4}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "lz4", 13 | "total_size": 206562787263, 14 | "total_size_readable": "206.56 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "lz4", 13 | "total_size": 20591959778, 14 | "total_size_readable": "20.59 GB" 15 | } 16 | 
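
The *_readable fields in these result files appear to be the raw byte counts rendered in decimal (1000-based) units, e.g. 20591959778 bytes becomes "20.59 GB". A small sketch of that conversion, assuming decimal units and two decimal places; how the benchmark actually derives these strings is not shown in this listing and may differ in rounding details.

    bytes=20591959778
    awk -v b="$bytes" 'BEGIN {
        split("B KB MB GB TB", unit, " ")
        i = 1
        while (b >= 1000 && i < 5) { b /= 1000; i++ }
        printf "%.2f %s\n", b, unit[i]   # prints: 20.59 GB
    }'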
-------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "lz4", 13 | "total_size": 2084888024, 14 | "total_size_readable": "2.08 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_lz4/results/_files_bluesky_lz4_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.lz4", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": "lz4", 13 | "total_size": 208385826, 14 | "total_size_readable": "208.39 MB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_lz4/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /_files_zstd/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATA_DIRECTORY="$1" 11 | TARGET_DIRECTORY="$2" 12 | N="$3" 13 | 14 | # Validate the source directory 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Validate the target directory 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Error: Target directory '$TARGET_DIRECTORY' does not exist." 23 | exit 1 24 | fi 25 | 26 | # Validate N is a positive integer 27 | if ! [[ "$N" =~ ^[0-9]+$ ]]; then 28 | echo "Error: N must be a positive integer." 29 | exit 1 30 | fi 31 | 32 | # Create a temporary directory inside the current directory 33 | TEMP_DIR="./temp_extraction" 34 | if [[ -d "$TEMP_DIR" ]]; then 35 | echo "Temporary directory '$TEMP_DIR' already exists. Deleting it first..." 36 | rm -rf "$TEMP_DIR" 37 | fi 38 | 39 | mkdir -p "$TEMP_DIR" 40 | 41 | # Trap to ensure cleanup of the temporary directory 42 | trap "rm -rf $TEMP_DIR" EXIT 43 | 44 | # Process the first N files 45 | count=0 46 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | sort); do 47 | if [[ $count -ge $N ]]; then 48 | break 49 | fi 50 | 51 | echo "Processing $file..." 
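    # The steps below stage each file through $TEMP_DIR: gunzip the .json.gz to a
    # temporary .json, recompress it with "zstd -1" (the level reported as
    # "zstd(1)" in the result files), and copy the resulting .zst into the target
    # directory. A single pipeline such as
    #     gzip -dc "$file" | zstd -1 -o "$compressed_file"
    # would avoid the intermediate JSON file; the two-step form used here keeps a
    # separate error check per stage.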
52 | 53 | # Define paths for the temporary extracted file and compressed file 54 | extracted_file="$TEMP_DIR/$(basename "${file%.gz}")" 55 | compressed_file="$TEMP_DIR/$(basename "${file%.gz}.zst")" 56 | 57 | # Extract the .json.gz file into the temporary directory 58 | gzip -c -d "$file" > "$extracted_file" 59 | if [[ $? -ne 0 ]]; then 60 | echo "Error: Failed to extract $file to $extracted_file" 61 | continue 62 | fi 63 | 64 | # Compress the extracted file with zstd 65 | zstd -1 "$extracted_file" -o "$compressed_file" 66 | if [[ $? -ne 0 ]]; then 67 | echo "Error: Failed to compress $extracted_file" 68 | continue 69 | fi 70 | 71 | # Copy the .zst file to the target directory 72 | cp "$compressed_file" "$TARGET_DIRECTORY/" 73 | if [[ $? -ne 0 ]]; then 74 | echo "Error: Failed to copy $compressed_file to $TARGET_DIRECTORY" 75 | continue 76 | fi 77 | 78 | count=$((count + 1)) 79 | done 80 | 81 | # Cleanup (done automatically by the trap) 82 | echo "Processed $count files. Compressed files are in '$TARGET_DIRECTORY'." 83 | echo "Temporary directory '$TEMP_DIR' has been deleted." -------------------------------------------------------------------------------- /_files_zstd/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | DEFAULT_TARGET_DIRECTORY=~/data/bluesky_zstd 6 | 7 | # Allow the user to optionally provide the data and target directories as arguments 8 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 9 | TARGET_DIRECTORY="${2:-$DEFAULT_TARGET_DIRECTORY}" 10 | 11 | # Define prefix for output files 12 | OUTPUT_PREFIX="${3:-_files_zstd}" 13 | 14 | # Check if the data directory exists 15 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 16 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 17 | exit 1 18 | fi 19 | 20 | # Ensure the target directory exists 21 | if [[ ! -d "$TARGET_DIRECTORY" ]]; then 22 | echo "Target directory '$TARGET_DIRECTORY' does not exist. Creating it..." 23 | mkdir -p "$TARGET_DIRECTORY" 24 | if [[ $? -ne 0 ]]; then 25 | echo "Error: Failed to create target directory '$TARGET_DIRECTORY'." 
26 | exit 1 27 | fi 28 | fi 29 | 30 | 31 | # 1m 32 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1m" 33 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 34 | mkdir -p "$TARGET_SUB_DIRECTORY" 35 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1 36 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1m.total_size" 37 | 38 | # 10m 39 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/10m" 40 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 41 | mkdir -p "$TARGET_SUB_DIRECTORY" 42 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 10 43 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_10m.total_size" 44 | 45 | # 100m 46 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/100m" 47 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 48 | mkdir -p "$TARGET_SUB_DIRECTORY" 49 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 100 50 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_100m.total_size" 51 | 52 | # 1000m 53 | TARGET_SUB_DIRECTORY="$TARGET_DIRECTORY/1000m" 54 | echo "Creating subdirectory: $TARGET_SUB_DIRECTORY" 55 | mkdir -p "$TARGET_SUB_DIRECTORY" 56 | ./load_data.sh "$DATA_DIRECTORY" "$TARGET_SUB_DIRECTORY" 1000 57 | ./total_size.sh "$TARGET_SUB_DIRECTORY" | tee "${OUTPUT_PREFIX}_1000m.total_size" -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000000, 11 | "dataset_size_readable": "1000m", 12 | "data_compression": "zstd(1)", 13 | "total_size": 123797963671, 14 | "total_size_readable": "123.80 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 100000000, 11 | "dataset_size_readable": "100m", 12 | "data_compression": "zstd(1)", 13 | "total_size": 12245368182, 14 | "total_size_readable": "12.25 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 10000000, 11 | "dataset_size_readable": "10m", 12 | "data_compression": "zstd(1)", 13 | "total_size": 1269817486, 14 | "total_size_readable": "1.27 GB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/results/_files_bluesky_zstd_1m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "data.json.zstd", 3 | "fake": true, 4 | "date": "2025-01-13", 5 | "machine": "m6i.8xlarge, 10000gib gp3", 6 | "cluster_size": 1, 7 | "comment": "", 8 | "tags": [ 9 | ], 10 | "dataset_size": 1000000, 11 | "dataset_size_readable": "1m", 12 | "data_compression": 
"zstd(1)", 13 | "total_size": 126734406, 14 | "total_size_readable": "126.73 MB" 15 | } 16 | -------------------------------------------------------------------------------- /_files_zstd/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required argument is provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Argument 10 | DIRECTORY="$1" 11 | 12 | # Check if the directory exists 13 | if [[ ! -d "$DIRECTORY" ]]; then 14 | echo "Error: Directory '$DIRECTORY' does not exist." 15 | exit 1 16 | fi 17 | 18 | # Get the total size in bytes and suppress the directory name 19 | du -sb "$DIRECTORY" | awk '{print $1}' -------------------------------------------------------------------------------- /clickhouse/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 3 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE_RUNTIMES="$2" 12 | RESULT_FILE_MEMORY_USAGE="$3" 13 | 14 | # Construct the query log file name using $DB_NAME 15 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 16 | 17 | # Print the database name 18 | echo "Running queries on database: $DB_NAME" 19 | 20 | # Run queries and log the output 21 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 22 | 23 | # Process the query log and prepare the result 24 | RUNTIME_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 1' | awk '{ 25 | if (NR % 3 == 1) { printf "["; } 26 | printf $1; 27 | if (NR % 3 == 0) { 28 | print "],"; 29 | } else { 30 | printf ", "; 31 | } 32 | }') 33 | 34 | MEMORY_RESULTS=$(grep -E '^[0-9]' "$QUERY_LOG_FILE" | awk 'NR % 2 == 0' | awk '{ 35 | if (NR % 3 == 1) { printf "["; } 36 | printf $1; 37 | if (NR % 3 == 0) { 38 | print "],"; 39 | } else { 40 | printf ", "; 41 | } 42 | }') 43 | 44 | # Output the runtime results 45 | echo "$RUNTIME_RESULTS" > "$RESULT_FILE_RUNTIMES" 46 | echo "Runtime results written to $RESULT_FILE_RUNTIMES" 47 | 48 | # Output the memory usage results 49 | echo "$MEMORY_RESULTS" > "$RESULT_FILE_MEMORY_USAGE" 50 | echo "Memory usage results written to $RESULT_FILE_MEMORY_USAGE" -------------------------------------------------------------------------------- /clickhouse/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --database="$DB_NAME" --query "SELECT count() FROM '$TABLE_NAME';" -------------------------------------------------------------------------------- /clickhouse/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! 
"$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | 24 | # Create database 25 | clickhouse-client --query "CREATE DATABASE IF NOT EXISTS $DB_NAME" 26 | 27 | # Execute DDL 28 | clickhouse-client --database="$DB_NAME" --enable_json_type=1 --multiquery < "$DDL_FILE" 29 | 30 | # Load data 31 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 32 | 33 | echo "Script completed successfully." -------------------------------------------------------------------------------- /clickhouse/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --query "SELECT sum(data_compressed_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" -------------------------------------------------------------------------------- /clickhouse/ddl_lz4.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | `data` JSON( 4 | kind LowCardinality(String), 5 | commit.operation LowCardinality(String), 6 | commit.collection LowCardinality(String), 7 | did String, 8 | time_us UInt64) 9 | ) 10 | ORDER BY ( 11 | data.kind, 12 | data.commit.operation, 13 | data.commit.collection, 14 | data.did, 15 | fromUnixTimestamp64Micro(data.time_us)); -------------------------------------------------------------------------------- /clickhouse/ddl_zstd.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky 2 | ( 3 | `data` JSON( 4 | kind LowCardinality(String), 5 | commit.operation LowCardinality(String), 6 | commit.collection LowCardinality(String), 7 | did String, 8 | time_us UInt64) CODEC(ZSTD(1)) 9 | ) 10 | ORDER BY ( 11 | data.kind, 12 | data.commit.operation, 13 | data.commit.collection, 14 | data.did, 15 | fromUnixTimestamp64Micro(data.time_us)); -------------------------------------------------------------------------------- /clickhouse/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --query "SELECT sum(primary_key_size) + sum(marks_bytes) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" -------------------------------------------------------------------------------- /clickhouse/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Index usage for query Q$QUERY_NUM:" 19 | echo 20 | 21 | clickhouse-client --database="$DB_NAME" --query="EXPLAIN indexes=1 $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; 
--------------------------------------------------------------------------------
/clickhouse/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | curl https://clickhouse.com/ | sh
4 | sudo ./clickhouse install --noninteractive
5 | sudo clickhouse start
6 | 
7 | while true
8 | do
9 |     clickhouse-client --query "SELECT 1" && break
10 |     sleep 1
11 | done
12 | 
13 | 
--------------------------------------------------------------------------------
/clickhouse/load_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Check if the required arguments are provided
4 | if [[ $# -lt 6 ]]; then
5 |     echo "Usage: $0 <data_directory> <db_name> <table_name> <max_files> <success_log> <error_log>"
6 |     exit 1
7 | fi
8 | 
9 | 
10 | # Arguments
11 | DATA_DIRECTORY="$1"
12 | DB_NAME="$2"
13 | TABLE_NAME="$3"
14 | MAX_FILES="$4"
15 | SUCCESS_LOG="$5"
16 | ERROR_LOG="$6"
17 | 
18 | # Validate arguments
19 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
20 | [[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }
21 | 
22 | 
23 | # Create a temporary directory for uncompressed files
24 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX)
25 | trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit
26 | 
27 | # Load data
28 | counter=0
29 | for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
30 |     echo "Processing file: $file"
31 | 
32 |     # Uncompress the file into the TEMP_DIR
33 |     uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
34 |     gunzip -c "$file" > "$uncompressed_file"
35 | 
36 |     if [[ $? -ne 0 ]]; then
37 |         echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
38 |         continue
39 |     fi
40 | 
41 |     # Attempt the first import
42 |     clickhouse-client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0 FORMAT JSONAsObject" < "$uncompressed_file"
43 |     first_attempt=$?
44 | 
45 |     # Check if the first import was successful
46 |     if [[ $first_attempt -eq 0 ]]; then
47 |         echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG"
48 |         rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
49 |     else
50 |         echo "[$(date '+%Y-%m-%d %H:%M:%S')] First attempt failed for $file. Trying again..." >> "$ERROR_LOG"
51 | 
52 |         echo "Processing $file... again..."
53 |         # Attempt the second import with a different command
54 |         clickhouse-client --query="INSERT INTO $DB_NAME.$TABLE_NAME SETTINGS min_insert_block_size_rows = 1_000_000, min_insert_block_size_bytes = 0, input_format_allow_errors_num = 1_000_000_000, input_format_allow_errors_ratio=1 FORMAT JSONAsObject" < "$uncompressed_file"
55 |         second_attempt=$?
56 | 
57 |         # Check if the second import was successful
58 |         if [[ $second_attempt -eq 0 ]]; then
59 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file on second attempt." >> "$SUCCESS_LOG"
60 |             rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
61 |         else
62 |             echo "[$(date '+%Y-%m-%d %H:%M:%S')] Both attempts failed for $file. Giving up." >> "$ERROR_LOG"
63 |         fi
64 |     fi
65 | 
66 |     counter=$((counter + 1))
67 |     if [[ $counter -ge $MAX_FILES ]]; then
68 |         break
69 |     fi
70 | done
71 | 
72 | echo "Script completed successfully."
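
main.sh (the next file) drives this loader indirectly through create_and_load.sh. A hand-run equivalent for a single input file with the LZ4 DDL, using the defaults from main.sh (~/data/bluesky, success.log, error.log), is sketched below; the database name follows the bluesky_<size>m_<codec> pattern used there and is an example.

    ./create_and_load.sh bluesky_1m_lz4 bluesky ddl_lz4.sql ~/data/bluesky 1 success.log error.log

    # which creates the database, applies ddl_lz4.sql, and then calls:
    ./load_data.sh ~/data/bluesky bluesky_1m_lz4 bluesky 1 success.log error.log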
-------------------------------------------------------------------------------- /clickhouse/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local suffix=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky_${size}m_${suffix}" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.total_size" 43 | ./data_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.data_size" 44 | ./index_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.index_size" 45 | ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count" 46 | #./query_results.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.query_results" 47 | ./index_usage.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.index_usage" 48 | ./physical_query_plans.sh "bluesky_${size}m_${suffix}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.physical_query_plans" 49 | ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage" 50 | } 51 | 52 | case $choice in 53 | 2) 54 | benchmark 10 lz4 55 | benchmark 10 zstd 56 | ;; 57 | 3) 58 | benchmark 100 lz4 59 | benchmark 100 zstd 60 | ;; 61 | 4) 62 | benchmark 1000 lz4 63 | benchmark 1000 zstd 64 | ;; 65 | 5) 66 | benchmark 1 lz4 67 | benchmark 1 zstd 68 | benchmark 10 lz4 69 | benchmark 10 zstd 70 | benchmark 100 lz4 71 | benchmark 100 zstd 72 | benchmark 1000 lz4 73 | benchmark 1000 zstd 74 | ;; 75 | *) 76 | benchmark 1 lz4 77 | benchmark 1 zstd 78 | ;; 79 | esac -------------------------------------------------------------------------------- /clickhouse/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query number 17 | echo 
"------------------------------------------------------------------------------------------------------------------------" 18 | echo "Physical query plan for query Q$QUERY_NUM:" 19 | echo 20 | 21 | clickhouse-client --database="$DB_NAME" --query="EXPLAIN PIPELINE $query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /clickhouse/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data.commit.collection AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data.commit.collection AS event, count() AS count, uniqExact(data.did) AS users FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data.commit.collection AS event, toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, count() AS count FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data.did::String as user_id, min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data.did::String as user_id, date_diff( 'milliseconds', min(fromUnixTimestamp64Micro(data.time_us)), max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; -------------------------------------------------------------------------------- /clickhouse/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | data.commit.collection AS event, 6 | count() AS count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | SELECT 15 | data.commit.collection AS event, 16 | count() AS count, 17 | uniqExact(data.did) AS users 18 | FROM bluesky 19 | WHERE data.kind = 'commit' 20 | AND data.commit.operation = 'create' 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | SELECT 28 | data.commit.collection AS event, 29 | toHour(fromUnixTimestamp64Micro(data.time_us)) as hour_of_day, 30 | count() AS count 31 | FROM bluesky 32 | WHERE data.kind = 'commit' 33 | AND data.commit.operation = 
'create' 34 | AND data.commit.collection in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'] 35 | GROUP BY event, hour_of_day 36 | ORDER BY hour_of_day, event; 37 | 38 | ------------------------------------------------------------------------------------------------------------------------ 39 | -- Q4 - top 3 post veterans 40 | ------------------------------------------------------------------------------------------------------------------------ 41 | SELECT 42 | data.did::String as user_id, 43 | min(fromUnixTimestamp64Micro(data.time_us)) as first_post_ts 44 | FROM bluesky 45 | WHERE data.kind = 'commit' 46 | AND data.commit.operation = 'create' 47 | AND data.commit.collection = 'app.bsky.feed.post' 48 | GROUP BY user_id 49 | ORDER BY first_post_ts ASC 50 | LIMIT 3; 51 | 52 | ------------------------------------------------------------------------------------------------------------------------ 53 | -- Q5 - top 3 users with longest activity 54 | ------------------------------------------------------------------------------------------------------------------------ 55 | SELECT 56 | data.did::String as user_id, 57 | date_diff( 58 | 'milliseconds', 59 | min(fromUnixTimestamp64Micro(data.time_us)), 60 | max(fromUnixTimestamp64Micro(data.time_us))) AS activity_span 61 | FROM bluesky 62 | WHERE data.kind = 'commit' 63 | AND data.commit.operation = 'create' 64 | AND data.commit.collection = 'app.bsky.feed.post' 65 | GROUP BY user_id 66 | ORDER BY activity_span DESC 67 | LIMIT 3; 68 | -------------------------------------------------------------------------------- /clickhouse/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | clickhouse-client --database="$DB_NAME" --format=PrettyCompactMonoBlock --query="$query" --progress 0 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /clickhouse/results/_query_results/_m6i.8xlarge_bluesky_1m_lz4.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | ┌─event──────────────────────┬──count─┐ 5 | 1. │ app.bsky.feed.like │ 448944 │ 6 | 2. │ app.bsky.graph.follow │ 360374 │ 7 | 3. │ app.bsky.feed.post │ 90816 │ 8 | 4. │ app.bsky.feed.repost │ 58540 │ 9 | 5. │ app.bsky.graph.block │ 14040 │ 10 | 6. │ app.bsky.actor.profile │ 11762 │ 11 | 7. │ app.bsky.graph.listitem │ 8103 │ 12 | 8. │ │ 5328 │ 13 | 9. │ app.bsky.graph.listblock │ 895 │ 14 | 10. │ app.bsky.graph.starterpack │ 405 │ 15 | 11. │ app.bsky.graph.list │ 356 │ 16 | 12. │ app.bsky.feed.threadgate │ 255 │ 17 | 13. │ app.bsky.feed.postgate │ 104 │ 18 | 14. │ app.bsky.feed.generator │ 74 │ 19 | 15. 
│ app.bsky.labeler.service │ 4 │ 20 | └────────────────────────────┴────────┘ 21 | ------------------------------------------------------------------------------------------------------------------------ 22 | Result for query Q2: 23 | 24 | ┌─event──────────────────────┬──count─┬──users─┐ 25 | 1. │ app.bsky.feed.like │ 444523 │ 117617 │ 26 | 2. │ app.bsky.graph.follow │ 337978 │ 63957 │ 27 | 3. │ app.bsky.feed.post │ 86812 │ 50464 │ 28 | 4. │ app.bsky.feed.repost │ 56993 │ 26581 │ 29 | 5. │ app.bsky.graph.block │ 13838 │ 5785 │ 30 | 6. │ app.bsky.graph.listitem │ 7568 │ 1078 │ 31 | 7. │ app.bsky.actor.profile │ 5337 │ 5337 │ 32 | 8. │ app.bsky.graph.listblock │ 860 │ 449 │ 33 | 9. │ app.bsky.graph.list │ 259 │ 218 │ 34 | 10. │ app.bsky.feed.threadgate │ 228 │ 196 │ 35 | 11. │ app.bsky.graph.starterpack │ 104 │ 101 │ 36 | 12. │ app.bsky.feed.postgate │ 101 │ 82 │ 37 | 13. │ app.bsky.feed.generator │ 10 │ 9 │ 38 | └────────────────────────────┴────────┴────────┘ 39 | ------------------------------------------------------------------------------------------------------------------------ 40 | Result for query Q3: 41 | 42 | ┌─event────────────────┬─hour_of_day─┬──count─┐ 43 | 1. │ app.bsky.feed.like │ 16 │ 444523 │ 44 | 2. │ app.bsky.feed.post │ 16 │ 86812 │ 45 | 3. │ app.bsky.feed.repost │ 16 │ 56993 │ 46 | └──────────────────────┴─────────────┴────────┘ 47 | ------------------------------------------------------------------------------------------------------------------------ 48 | Result for query Q4: 49 | 50 | ┌─user_id──────────────────────────┬──────────────first_post_ts─┐ 51 | 1. │ did:plc:yj3sjq3blzpynh27cumnp5ks │ 2024-11-21 16:25:49.000167 │ 52 | 2. │ did:plc:l5o3qjrmfztir54cpwlv2eme │ 2024-11-21 16:25:49.001905 │ 53 | 3. │ did:plc:s4bwqchfzm6gjqfeb6mexgbu │ 2024-11-21 16:25:49.003907 │ 54 | └──────────────────────────────────┴────────────────────────────┘ 55 | ------------------------------------------------------------------------------------------------------------------------ 56 | Result for query Q5: 57 | 58 | ┌─user_id──────────────────────────┬─activity_span─┐ 59 | 1. │ did:plc:tsyymlun4eqjuw7hqrhmwagd │ 813007 │ 60 | 2. │ did:plc:3ug235sfy2pz7cawmpsftb65 │ 811602 │ 61 | 3. 
│ did:plc:doxhhgtxqiv47tmcovpbcqai │ 811404 │ 62 | └──────────────────────────────────┴───────────────┘ -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1000m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999258, 14 | "data_compression": "lz4", 15 | "total_size": 159834794888, 16 | "total_size_readable": "159.83 GB", 17 | "data_size": 158959086724, 18 | "data_size_readable": "158.96 GB", 19 | "index_size": 875707939, 20 | "index_size_readable": "875.71 MB", 21 | "result": [ 22 | [0.401, 0.395, 0.389], 23 | [21.445, 5.542, 5.587], 24 | [34.907, 2.334, 2.351], 25 | [8.173, 0.563, 0.556], 26 | [8.178, 0.610, 0.573] 27 | ], 28 | "result_readable": [ 29 | "401.00 msec, 395.00 msec, 389.00 msec", 30 | "21.45 sec, 5.54 sec, 5.59 sec", 31 | "34.91 sec, 2.33 sec, 2.35 sec", 32 | "8.17 sec, 563.00 msec, 556.00 msec", 33 | "8.18 sec, 610.00 msec, 573.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [2418416, 1362125, 1934870], 37 | [4857932616, 4839715303, 4870295739], 38 | [148886114, 139232115, 140070546], 39 | [2500677923, 2573855740, 2556181514], 40 | [2694349228, 3086817972, 2623063375] 41 | ], 42 | "memory_usage_readable": [ 43 | "2.42 MB, 1.36 MB, 1.93 MB", 44 | "4.86 GB, 4.84 GB, 4.87 GB", 45 | "148.89 MB, 139.23 MB, 140.07 MB", 46 | "2.50 GB, 2.57 GB, 2.56 GB", 47 | "2.69 GB, 3.09 GB, 2.62 GB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1000m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999258, 14 | "data_compression": "zstd", 15 | "total_size": 99230044699, 16 | "total_size_readable": "99.23 GB", 17 | "data_size": 98355025153, 18 | "data_size_readable": "98.36 GB", 19 | "index_size": 875019321, 20 | "index_size_readable": "875.02 MB", 21 | "result": [ 22 | [0.405, 0.395, 0.394], 23 | [11.854, 5.632, 5.749], 24 | [28.899, 2.466, 2.488], 25 | [5.384, 0.609, 0.596], 26 | [5.405, 0.640, 0.637] 27 | ], 28 | "result_readable": [ 29 | "405.00 msec, 395.00 msec, 394.00 msec", 30 | "11.85 sec, 5.63 sec, 5.75 sec", 31 | "28.90 sec, 2.47 sec, 2.49 sec", 32 | "5.38 sec, 609.00 msec, 596.00 msec", 33 | "5.41 sec, 640.00 msec, 637.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [2779174, 1363689, 783683], 37 | [4823948860, 4871634449, 4918802526], 38 | [147631790, 139732345, 139867057], 39 | [2537597752, 2638168515, 2609289232], 40 | [2796756189, 2927194160, 2788741563] 41 | ], 42 | "memory_usage_readable": [ 43 | "2.78 MB, 1.36 MB, 783.68 KB", 44 | "4.82 GB, 4.87 GB, 4.92 GB", 45 | "147.63 MB, 139.73 MB, 139.87 MB", 46 | "2.54 GB, 2.64 GB, 2.61 GB", 47 | "2.80 GB, 2.93 GB, 2.79 GB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- 
/clickhouse/results/m6i.8xlarge_bluesky_100m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999968, 14 | "data_compression": "lz4", 15 | "total_size": 15793562624, 16 | "total_size_readable": "15.79 GB", 17 | "data_size": 15771339413, 18 | "data_size_readable": "15.77 GB", 19 | "index_size": 222231402, 20 | "index_size_readable": " 22.22 MB", 21 | "result": [ 22 | [0.054, 0.041, 0.041], 23 | [2.344, 0.724, 0.724], 24 | [2.114, 0.226, 0.228], 25 | [0.380, 0.087, 0.085], 26 | [0.648, 0.088, 0.092] 27 | ], 28 | "result_readable": [ 29 | "54.00 msec, 41.00 msec, 41.00 msec", 30 | "2.34 sec, 724.00 msec, 724.00 msec", 31 | "2.11 sec, 226.00 msec, 228.00 msec", 32 | "380.00 msec, 87.00 msec, 85.00 msec", 33 | "648.00 msec, 88.00 msec, 92.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [713196, 1512167, 859315], 37 | [1044281095, 1061751337, 1065857874], 38 | [143081365, 142096261, 142096261], 39 | [595252210, 568894822, 585137315], 40 | [657713321, 690210788, 697155477] 41 | ], 42 | "memory_usage_readable": [ 43 | "713.20 KB, 1.51 MB, 859.32 KB", 44 | "1.04 GB, 1.06 GB, 1.07 GB", 45 | "143.08 MB, 142.10 MB, 142.10 MB", 46 | "595.25 MB, 568.89 MB, 585.14 MB", 47 | "657.71 MB, 690.21 MB, 697.16 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_100m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999968, 14 | "data_compression": "zstd", 15 | "total_size": 9684754080, 16 | "total_size_readable": "9.68 GB", 17 | "data_size": 9662584921, 18 | "data_size_readable": "9.66 GB", 19 | "index_size": 22169088, 20 | "index_size_readable": "22.17 MB", 21 | "result": [ 22 | [0.052, 0.043, 0.041], 23 | [1.001, 0.763, 0.765], 24 | [1.502, 0.247, 0.250], 25 | [0.124, 0.094, 0.092], 26 | [0.195, 0.095, 0.096] 27 | ], 28 | "result_readable": [ 29 | "52.00 msec, 43.00 msec, 41.00 msec", 30 | "1.00 sec, 763.00 msec, 765.00 msec", 31 | "1.50 sec, 247.00 msec, 250.00 msec", 32 | "124.00 msec, 94.00 msec, 92.00 msec", 33 | "195.00 msec, 95.00 msec, 96.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [784015, 891324, 1395384], 37 | [1034703427, 1069950332, 1052027034], 38 | [141832133, 140988117, 140988117], 39 | [573490495, 584182275, 597232896], 40 | [690147517, 681601522, 684611565] 41 | ], 42 | "memory_usage_readable": [ 43 | "784.01 KB, 891.32 KB, 1.40 MB", 44 | "1.03 GB, 1.07 GB, 1.05 GB", 45 | "141.83 MB, 140.99 MB, 140.99 MB", 46 | "573.49 MB, 584.18 MB, 597.23 MB", 47 | "690.15 MB, 681.60 MB, 684.61 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_10m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": 
"25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999994, 14 | "data_compression": "lz4", 15 | "total_size": 1636685315, 16 | "total_size_readable": "1.64 GB", 17 | "data_size": 1635155418, 18 | "data_size_readable": "1.64 GB", 19 | "index_size": 1529863, 20 | "index_size_readable": "1.53 MB", 21 | "result": [ 22 | [0.014, 0.008, 0.008], 23 | [0.155, 0.133, 0.134], 24 | [0.051, 0.037, 0.040], 25 | [0.038, 0.023, 0.025], 26 | [0.042, 0.029, 0.025] 27 | ], 28 | "result_readable": [ 29 | "14.00 msec, 8.00 msec, 8.00 msec", 30 | "155.00 msec, 133.00 msec, 134.00 msec", 31 | "51.00 msec, 37.00 msec, 40.00 msec", 32 | "38.00 msec, 23.00 msec, 25.00 msec", 33 | "42.00 msec, 29.00 msec, 25.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [1025969, 979623, 850534], 37 | [289693872, 274696015, 263016878], 38 | [141884177, 141597001, 141597001], 39 | [204374002, 200220685, 200101066], 40 | [204576293, 208646332, 200235195] 41 | ], 42 | "memory_usage_readable": [ 43 | "1.03 MB, 979.62 KB, 850.53 KB", 44 | "289.69 MB, 274.70 MB, 263.02 MB", 45 | "141.88 MB, 141.60 MB, 141.60 MB", 46 | "204.37 MB, 200.22 MB, 200.10 MB", 47 | "204.58 MB, 208.65 MB, 200.24 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_10m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999994, 14 | "data_compression": "zstd", 15 | "total_size": 995124869, 16 | "total_size_readable": "995.12 MB", 17 | "993599478": 1056861138, 18 | "data_size_readable": "993.60 MB", 19 | "index_size": 1525357, 20 | "index_size_readable": "1.53 MB", 21 | "result": [ 22 | [0.016, 0.008, 0.008], 23 | [0.166, 0.141, 0.150], 24 | [0.064, 0.040, 0.042], 25 | [0.044, 0.026, 0.024], 26 | [0.042, 0.028, 0.026] 27 | ], 28 | "result_readable": [ 29 | "16.00 msec, 8.00 msec, 8.00 msec", 30 | "166.00 msec, 141.00 msec, 150.00 msec", 31 | "64.00 msec, 40.00 msec, 42.00 msec", 32 | "44.00 msec, 26.00 msec, 24.00 msec", 33 | "42.00 msec, 28.00 msec, 26.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [590559, 1051274, 1205734], 37 | [270187643, 282291213, 261653591], 38 | [141094195, 140810923, 140810923], 39 | [199567823, 195201258, 195267941], 40 | [203996437, 203772727, 199532552] 41 | ], 42 | "memory_usage_readable": [ 43 | "590.56 KB, 1.05 MB, 1.21 MB", 44 | "270.19 MB, 282.29 MB, 261.65 MB", 45 | "141.09 MB, 140.81 MB, 140.81 MB", 46 | "199.57 MB, 195.20 MB, 195.27 MB", 47 | "204.00 MB, 203.77 MB, 199.53 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (lz4)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 
12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 162735429, 16 | "total_size_readable": "162.74 MB", 17 | "data_size": 162623843, 18 | "data_size_readable": "162.62 MB", 19 | "index_size": 111579, 20 | "index_size_readable": "111.58 KB", 21 | "result": [ 22 | [0.009, 0.004, 0.004], 23 | [0.036, 0.023, 0.024], 24 | [0.021, 0.011, 0.010], 25 | [0.034, 0.019, 0.017], 26 | [0.038, 0.021, 0.017] 27 | ], 28 | "result_readable": [ 29 | "9.00 msec, 4.00 msec, 4.00 msec", 30 | "36.00 msec, 23.00 msec, 24.00 msec", 31 | "21.00 msec, 11.00 msec, 10.00 msec", 32 | "34.00 msec, 19.00 msec, 17.00 msec", 33 | "38.00 msec, 21.00 msec, 17.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [1791510, 1634290, 1486871], 37 | [77337064, 77275976, 77275976], 38 | [7416935, 4754228, 6768205], 39 | [13084446, 13068638, 13068638], 40 | [15183014, 15165990, 15165990] 41 | ], 42 | "memory_usage_readable": [ 43 | "1.79 MB, 1.63 MB, 1.49 MB", 44 | "77.34 MB, 77.28 MB, 77.28 MB", 45 | "7.42 MB, 4.75 MB, 6.77 MB", 46 | "13.08 MB, 13.07 MB, 13.07 MB", 47 | "15.18 MB, 15.17 MB, 15.17 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/results/m6i.8xlarge_bluesky_1m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "ClickHouse (zstd)", 3 | "version": "25.1.1.2571", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 98560583, 16 | "total_size_readable": "98.56 MB", 17 | "data_size": 98449386, 18 | "data_size_readable": "98.45 MB", 19 | "index_size": 111190, 20 | "index_size_readable": "111.19 KB", 21 | "result": [ 22 | [0.011, 0.004, 0.004], 23 | [0.033, 0.024, 0.022], 24 | [0.023, 0.011, 0.010], 25 | [0.033, 0.026, 0.029], 26 | [0.037, 0.023, 0.021] 27 | ], 28 | "result_readable": [ 29 | "11.00 msec, 4.00 msec, 4.00 msec", 30 | "33.00 msec, 24.00 msec, 22.00 msec", 31 | "23.00 msec, 11.00 msec, 10.00 msec", 32 | "33.00 msec, 26.00 msec, 29.00 msec", 33 | "37.00 msec, 23.00 msec, 21.00 msec" 34 | ], 35 | "memory_usage": [ 36 | [884312, 1849931, 1633834], 37 | [76410278, 72165557, 76362246], 38 | [5557964, 2580507, 2713186], 39 | [12950030, 12938382, 12938382], 40 | [15048278, 15035734, 15035734] 41 | ], 42 | "memory_usage_readable": [ 43 | "884.31 KB, 1.85 MB, 1.63 MB", 44 | "76.41 MB, 72.17 MB, 76.36 MB", 45 | "5.56 MB, 2.58 MB, 2.71 MB", 46 | "12.95 MB, 12.94 MB, 12.94 MB", 47 | "15.05 MB, 15.04 MB, 15.04 MB" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /clickhouse/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 
21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | clickhouse-client --database="$DB_NAME" --time --memory-usage --format=Null --query="$query" --progress 0 28 | done; 29 | done; -------------------------------------------------------------------------------- /clickhouse/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | clickhouse-client --query "SELECT sum(bytes_on_disk) FROM system.parts WHERE database = '$DB_NAME' AND table = '$TABLE_NAME' AND active" -------------------------------------------------------------------------------- /copy_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Select the dataset size to download:" 4 | echo "1) 1m (default)" 5 | echo "2) 10m" 6 | echo "3) 100m" 7 | echo "4) 1000m" 8 | read -p "Enter the number corresponding to your choice: " choice 9 | 10 | case $choice in 11 | 2) 12 | # Download 10m dataset 13 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_{0001..0010}.json.gz -P ~/data/bluesky -N 14 | ;; 15 | 3) 16 | # Download 100m dataset 17 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_{0001..0100}.json.gz -P ~/data/bluesky -N 18 | ;; 19 | 4) 20 | # Download 1000m dataset 21 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_{0001..1000}.json.gz -P ~/data/bluesky -N 22 | ;; 23 | *) 24 | # Download 1m dataset 25 | wget https://clickhouse-public-datasets.s3.amazonaws.com/bluesky/file_0001.json.gz -P ~/data/bluesky -N 26 | ;; 27 | esac -------------------------------------------------------------------------------- /duckdb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the database name 14 | echo "Running queries on database: $DB_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$DB_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Real time: \d+\.\d+ seconds' | sed -r -e 's/Real time: ([0-9]+\.[0-9]+) seconds/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi -------------------------------------------------------------------------------- /duckdb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | DUCKDB_CMD="duckdb $DATABASE_NAME" 13 | 14 | 15 | # Fetch the count using duckDB 16 | $DUCKDB_CMD -c "select count() from '$TABLE_NAME';" 17 | 18 | 
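# Example invocation (shown as a comment only so the script's behavior is unchanged;
# the database file and table name match what main.sh passes for the 1m dataset):
#   ./count.sh db.duckdb_1 bluesky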
-------------------------------------------------------------------------------- /duckdb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | # Create database and execute DDL 24 | duckdb $DB_NAME < "$DDL_FILE" 25 | 26 | # Load data 27 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 28 | 29 | echo "Script completed successfully." -------------------------------------------------------------------------------- /duckdb/ddl.sql: -------------------------------------------------------------------------------- 1 | create table bluesky (j JSON); -------------------------------------------------------------------------------- /duckdb/install.sh: -------------------------------------------------------------------------------- 1 | sudo snap install duckdb 2 | -------------------------------------------------------------------------------- /duckdb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | DUCKDB_CMD="duckdb $DB_NAME" 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | counter=0 28 | 29 | # Loop through each .json.gz file in the directory 30 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 31 | # if [[ -f "$file" ]]; then 32 | # $DUCKDB_CMD -c "insert into $TABLE_NAME select * from read_ndjson_objects('$file', ignore_errors=false, maximum_object_size=1048576000);" 33 | # fi 34 | if [[ -f "$file" ]]; then 35 | # Create a temporary directory for split files 36 | temp_dir=$(mktemp -d $DIRECTORY/temp.XXXXXX) 37 | 38 | # Decompress and split the file into smaller chunks of 100000 lines each 39 | gzip -dc "$file" | split -l 100000 - "$temp_dir/chunk_" 40 | 41 | # Insert each chunk into DuckDB 42 | for chunk in "$temp_dir"/chunk_*; do 43 | $DUCKDB_CMD -c "insert into $TABLE_NAME select * from read_ndjson_objects('$chunk', ignore_errors=false, maximum_object_size=1048576000);" 44 | done 45 | 46 | # Clean up temporary directory 47 | rm -r "$temp_dir" 48 | counter=$((counter + 1)) 49 | fi 50 | 51 | # Stop processing if the max number of files is reached 52 | if [[ $counter -ge $MAX_FILES ]]; then 53 | echo "Copied maximum number of files: $MAX_FILES" 54 | break 55 | fi 56 | done 57 | 58 | echo "All files have been imported." 
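# Example invocation (comment only; the argument order mirrors the call made by
# create_and_load.sh, and the values are those main.sh uses for the 1m dataset;
# the data directory is the repository default and may differ on your machine):
#   ./load_data.sh ~/data/bluesky db.duckdb_1 bluesky 1 success.log error.log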
-------------------------------------------------------------------------------- /duckdb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 35 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 36 | if (( file_count < size )); then 37 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 38 | exit 1 39 | fi 40 | ./create_and_load.sh "db.duckdb_${size}" bluesky ddl.sql "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 41 | ./total_size.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" 42 | ./count.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" 43 | #./query_results.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" 44 | ./physical_query_plans.sh "db.duckdb_${size}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.physical_query_plans" 45 | ./benchmark.sh "db.duckdb_${size}" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" 46 | } 47 | 48 | case $choice in 49 | 2) 50 | benchmark 10 51 | ;; 52 | 3) 53 | benchmark 100 54 | ;; 55 | 4) 56 | benchmark 1000 57 | ;; 58 | 5) 59 | benchmark 1 60 | benchmark 10 61 | benchmark 100 62 | benchmark 1000 63 | ;; 64 | *) 65 | benchmark 1 66 | ;; 67 | esac -------------------------------------------------------------------------------- /duckdb/physical_query_plans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | DUCKDB_CMD="duckdb $DB_NAME" 15 | 16 | cat queries.sql | while read -r query; do 17 | 18 | # Print the query number 19 | echo "------------------------------------------------------------------------------------------------------------------------" 20 | echo "Physical query plan for query Q$QUERY_NUM:" 21 | echo 22 | 23 | $DUCKDB_CMD -c "EXPLAIN $query" 24 | 25 | # Increment the query number 26 | QUERY_NUM=$((QUERY_NUM + 1)) 27 | done; -------------------------------------------------------------------------------- /duckdb/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT j->>'$.commit.collection' AS event,count() AS count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT j->>'$.commit.collection' AS event,count() AS count,count(DISTINCT j->>'$.did') AS users FROM bluesky WHERE (j->>'$.kind' = 
'commit') AND (j->>'$.commit.operation' = 'create') GROUP BY event ORDER BY count DESC; 3 | SELECT j->>'$.commit.collection' AS event,hour(TO_TIMESTAMP(CAST(j->>'$.time_us' AS BIGINT) / 1000000)) as hour_of_day,count() AS count FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']) GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT j->>'$.did'::String as user_id,TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000) AS first_post_date FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3; 5 | SELECT j->>'$.did'::String as user_id,date_diff('milliseconds', TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000),TO_TIMESTAMP(CAST(MAX(j->>'$.time_us') AS BIGINT) / 1000000)) AS activity_span FROM bluesky WHERE (j->>'$.kind' = 'commit') AND (j->>'$.commit.operation' = 'create') AND (j->>'$.commit.collection' = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; 6 | -------------------------------------------------------------------------------- /duckdb/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | j->>'$.commit.collection' AS event, 6 | count() AS count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | 15 | SELECT 16 | j->>'$.commit.collection' AS event, 17 | count() AS count,count(DISTINCT j->>'$.did') AS users 18 | FROM bluesky 19 | WHERE (j->>'$.kind' = 'commit') 20 | AND (j->>'$.commit.operation' = 'create') 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | 28 | SELECT 29 | j->>'$.commit.collection' AS event, 30 | hour(TO_TIMESTAMP(CAST(j->>'$.time_us' AS BIGINT) / 1000000)) as hour_of_day, 31 | count() AS count 32 | FROM bluesky 33 | WHERE (j->>'$.kind' = 'commit') 34 | AND (j->>'$.commit.operation' = 'create') 35 | AND (j->>'$.commit.collection' in ['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like']) 36 | GROUP BY event, hour_of_day 37 | ORDER BY hour_of_day, event; 38 | 39 | ------------------------------------------------------------------------------------------------------------------------ 40 | -- Q4 - top 3 post veterans 41 | ------------------------------------------------------------------------------------------------------------------------ 42 | 43 | SELECT 44 | j->>'$.did'::String as user_id, 45 | TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000) AS 
first_post_date 46 | FROM bluesky 47 | WHERE (j->>'$.kind' = 'commit') 48 | AND (j->>'$.commit.operation' = 'create') 49 | AND (j->>'$.commit.collection' = 'app.bsky.feed.post') 50 | GROUP BY user_id 51 | ORDER BY first_post_date ASC 52 | LIMIT 3; 53 | 54 | ------------------------------------------------------------------------------------------------------------------------ 55 | -- Q5 - top 3 users with longest activity 56 | ------------------------------------------------------------------------------------------------------------------------ 57 | 58 | SELECT 59 | j->>'$.did'::String as user_id, 60 | date_diff('milliseconds', TO_TIMESTAMP(CAST(MIN(j->>'$.time_us') AS BIGINT) / 1000000), 61 | TO_TIMESTAMP(CAST(MAX(j->>'$.time_us') AS BIGINT) / 1000000)) AS activity_span 62 | FROM bluesky 63 | WHERE (j->>'$.kind' = 'commit') 64 | AND (j->>'$.commit.operation' = 'create') 65 | AND (j->>'$.commit.collection' = 'app.bsky.feed.post') 66 | GROUP BY user_id 67 | ORDER BY activity_span DESC 68 | LIMIT 3; -------------------------------------------------------------------------------- /duckdb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | DUCKDB_CMD="duckdb $DB_NAME" 13 | 14 | QUERY_NUM=1 15 | 16 | cat queries.sql | while read -r query; do 17 | 18 | # Print the query 19 | echo "------------------------------------------------------------------------------------------------------------------------" 20 | echo "Result for query Q$QUERY_NUM:" 21 | echo 22 | $DUCKDB_CMD -c "$query" 23 | 24 | # Increment the query number 25 | QUERY_NUM=$((QUERY_NUM + 1)) 26 | done; -------------------------------------------------------------------------------- /duckdb/results/_query_results/_m6i.8xlarge_bluesky_1m.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | ┌────────────────────────────┬────────┐ 5 | │ event │ count │ 6 | │ varchar │ int64 │ 7 | ├────────────────────────────┼────────┤ 8 | │ app.bsky.feed.like │ 448944 │ 9 | │ app.bsky.graph.follow │ 360374 │ 10 | │ app.bsky.feed.post │ 90816 │ 11 | │ app.bsky.feed.repost │ 58540 │ 12 | │ app.bsky.graph.block │ 14040 │ 13 | │ app.bsky.actor.profile │ 11762 │ 14 | │ app.bsky.graph.listitem │ 8103 │ 15 | │ │ 5328 │ 16 | │ app.bsky.graph.listblock │ 895 │ 17 | │ app.bsky.graph.starterpack │ 405 │ 18 | │ app.bsky.graph.list │ 356 │ 19 | │ app.bsky.feed.threadgate │ 255 │ 20 | │ app.bsky.feed.postgate │ 104 │ 21 | │ app.bsky.feed.generator │ 74 │ 22 | │ app.bsky.labeler.service │ 4 │ 23 | ├────────────────────────────┴────────┤ 24 | │ 15 rows 2 columns │ 25 | └─────────────────────────────────────┘ 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | Result for query Q2: 28 | 29 | ┌────────────────────────────┬────────┬────────┐ 30 | │ event │ count │ users │ 31 | │ varchar │ int64 │ int64 │ 32 | ├────────────────────────────┼────────┼────────┤ 33 | │ app.bsky.feed.like │ 444523 │ 117617 │ 34 | │ app.bsky.graph.follow │ 337978 │ 63957 │ 35 | │ app.bsky.feed.post │ 86812 │ 50464 │ 36 | │ app.bsky.feed.repost │ 56993 │ 26581 │ 37 | │ app.bsky.graph.block │ 
13838 │ 5785 │ 38 | │ app.bsky.graph.listitem │ 7568 │ 1078 │ 39 | │ app.bsky.actor.profile │ 5337 │ 5337 │ 40 | │ app.bsky.graph.listblock │ 860 │ 449 │ 41 | │ app.bsky.graph.list │ 259 │ 218 │ 42 | │ app.bsky.feed.threadgate │ 228 │ 196 │ 43 | │ app.bsky.graph.starterpack │ 104 │ 101 │ 44 | │ app.bsky.feed.postgate │ 101 │ 82 │ 45 | │ app.bsky.feed.generator │ 10 │ 9 │ 46 | ├────────────────────────────┴────────┴────────┤ 47 | │ 13 rows 3 columns │ 48 | └──────────────────────────────────────────────┘ 49 | ------------------------------------------------------------------------------------------------------------------------ 50 | Result for query Q3: 51 | 52 | ┌──────────────────────┬─────────────┬────────┐ 53 | │ event │ hour_of_day │ count │ 54 | │ varchar │ int64 │ int64 │ 55 | ├──────────────────────┼─────────────┼────────┤ 56 | │ app.bsky.feed.like │ 16 │ 444523 │ 57 | │ app.bsky.feed.post │ 16 │ 86812 │ 58 | │ app.bsky.feed.repost │ 16 │ 56993 │ 59 | └──────────────────────┴─────────────┴────────┘ 60 | ------------------------------------------------------------------------------------------------------------------------ 61 | Result for query Q4: 62 | 63 | ┌──────────────────────────────────┬───────────────────────────────┐ 64 | │ user_id │ first_post_date │ 65 | │ varchar │ timestamp with time zone │ 66 | ├──────────────────────────────────┼───────────────────────────────┤ 67 | │ did:plc:yj3sjq3blzpynh27cumnp5ks │ 2024-11-21 16:25:49.000167+00 │ 68 | │ did:plc:l5o3qjrmfztir54cpwlv2eme │ 2024-11-21 16:25:49.001905+00 │ 69 | │ did:plc:s4bwqchfzm6gjqfeb6mexgbu │ 2024-11-21 16:25:49.003907+00 │ 70 | └──────────────────────────────────┴───────────────────────────────┘ 71 | ------------------------------------------------------------------------------------------------------------------------ 72 | Result for query Q5: 73 | 74 | ┌──────────────────────────────────┬───────────────┐ 75 | │ user_id │ activity_span │ 76 | │ varchar │ int64 │ 77 | ├──────────────────────────────────┼───────────────┤ 78 | │ did:plc:tsyymlun4eqjuw7hqrhmwagd │ 813007 │ 79 | │ did:plc:3ug235sfy2pz7cawmpsftb65 │ 811602 │ 80 | │ did:plc:doxhhgtxqiv47tmcovpbcqai │ 811404 │ 81 | └──────────────────────────────────┴───────────────┘ -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.errors: -------------------------------------------------------------------------------- 1 | `Invalid Input Error: Malformed JSON at byte 3 of input: unexpected content after document. 
Input: ":"This user is a Sable!","lang":"en","name":"S..."` -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1000m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 974400000, 14 | "data_compression": "auto", 15 | "total_size": 472594513920, 16 | "total_size_readable": "472 GB", 17 | "result": [ 18 | [3741.938,3732.441,3731.713], 19 | [3748.548,3737.248,3741.860], 20 | [3746.684,3731.458,3729.542], 21 | [3746.500,3735.854,3739.261], 22 | [3748.528,3736.934,3740.101] 23 | ], 24 | "result_readable": [ 25 | "1 hr 2 min 21.94 sec, 1 hr 2 min 12.44 sec, 1 hr 2 min 11.71 sec", 26 | "1 hr 2 min 28.55 sec, 1 hr 2 min 17.25 sec, 1 hr 2 min 21.86 sec", 27 | "1 hr 2 min 26.68 sec, 1 hr 2 min 11.46 sec, 1 hr 2 min 9.54 sec", 28 | "1 hr 2 min 26.50 sec, 1 hr 2 min 15.85 sec, 1 hr 2 min 19.26 sec", 29 | "1 hr 2 min 28.53 sec, 1 hr 2 min 16.93 sec, 1 hr 2 min 20.10 sec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_100m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 100000000, 14 | "data_compression": "auto", 15 | "total_size": 47934865408, 16 | "total_size_readable": "47.93 GB", 17 | "result": [ 18 | [373.230,10.743,10.760], 19 | [373.570,14.974,14.978], 20 | [373.282,13.486,13.443], 21 | [373.364,7.549,7.530], 22 | [373.560,8.071,8.060] 23 | ], 24 | "result_readable": [ 25 | "6 min 13.23 sec, 10.74 sec, 10.76 sec", 26 | "6 min 13.57 sec, 14.97 sec, 14.98 sec", 27 | "6 min 13.28 sec, 13.49 sec, 13.44 sec", 28 | "6 min 13.36 sec, 7.55 sec, 7.53 sec", 29 | "6 min 13.56 sec, 8.07 sec, 8.06 sec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_10m.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 10000000, 14 | "data_compression": "auto", 15 | "total_size": 4893442048, 16 | "total_size_readable": "4.89 GB", 17 | "result": [ 18 | [37.332,1.179,1.196], 19 | [37.361,1.639,1.645], 20 | [37.343,1.598,1.613], 21 | [37.363,1.060,1.065], 22 | [37.439,1.183,1.213] 23 | ], 24 | "result_readable": [ 25 | "37.33 sec, 1.18 sec, 1.20 sec", 26 | "37.36 sec, 1.64 sec, 1.65 sec", 27 | "37.34 sec, 1.60 sec, 1.61 sec", 28 | "37.36 sec, 1.06 sec, 1.06 sec", 29 | "37.44 sec, 1.18 sec, 1.21 sec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/results/m6i.8xlarge_bluesky_1m.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "DuckDB", 3 | "version": "1.1.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "auto", 15 | "total_size": 484704256, 16 | "total_size_readable": "484 MB", 17 | "result": [ 18 | [2.797,0.230,0.228], 19 | [2.810,0.319,0.315], 20 | [2.809,0.327,0.330], 21 | [2.802,0.250,0.247], 22 | [2.822,0.259,0.256] 23 | ], 24 | "result_readable": [ 25 | "2.80 sec, 230.00 msec, 228.00 msec", 26 | "2.81 sec, 319.00 msec, 315.00 msec", 27 | "2.81 sec, 327.00 msec, 330.00 msec", 28 | "2.80 sec, 250.00 msec, 247.00 msec", 29 | "2.82 sec, 259.00 msec, 256.00 msec" 30 | ] 31 | } -------------------------------------------------------------------------------- /duckdb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <DB_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | DUCKDB_CMD="duckdb $DB_NAME" 13 | 14 | TRIES=3 15 | 16 | LOG_FILE="query_results.log" 17 | > "$LOG_FILE" 18 | 19 | cat queries.sql | while read -r query; do 20 | # Clear filesystem cache between queries. 21 | sync 22 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 23 | 24 | echo "Running query: $query" 25 | for i in $(seq 1 $TRIES); do 26 | # Run query with timer enabled and extract the real time. 27 | OUTPUT=$($DUCKDB_CMD <<EOF >> "$LOG_FILE" 28 | .timer on 29 | $query 30 | EOF 31 | ) 32 | REAL_TIME=$(tac "$LOG_FILE" | grep -m 1 -oP 'real\s+\K[\d.]+') 33 | echo "Real time: $REAL_TIME seconds" 34 | done 35 | done -------------------------------------------------------------------------------- /duckdb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 <DATABASE_NAME> <TABLE_NAME>" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | TABLE_NAME="$2" 12 | DUCKDB_CMD="duckdb $DATABASE_NAME" 13 | 14 | # Fetch the total size using DuckDB 15 | $DUCKDB_CMD -c "select '$TABLE_NAME' as table_name, count(distinct block_id) as num_blocks, count(distinct block_id) * (select block_size from pragma_database_size()) as num_bytes from pragma_storage_info('$TABLE_NAME') group by all;" 16 | 17 | -------------------------------------------------------------------------------- /elasticsearch/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 <INDEX_NAME> [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | INDEX_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Print the index name 14 | echo "Running queries on index: $INDEX_NAME" 15 | 16 | # Run queries and log the output 17 | ./run_queries.sh "$INDEX_NAME" 2>&1 | tee query_log.txt 18 | 19 | # Process the query log and prepare the result 20 | RESULT=$(cat query_log.txt | grep -oP 'Response time: \d+\.\d+ s' | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \ 21 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 22 | 23 | # Output the
result 24 | if [[ -n "$RESULT_FILE" ]]; then 25 | echo "$RESULT" > "$RESULT_FILE" 26 | echo "Result written to $RESULT_FILE" 27 | else 28 | echo "$RESULT" 29 | fi -------------------------------------------------------------------------------- /elasticsearch/config/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | # ---------------------------------- Cluster ----------------------------------- 4 | # 5 | # Use a descriptive name for your cluster: 6 | # 7 | cluster.name: es-bench 8 | # ------------------------------------ Node ------------------------------------ 9 | # 10 | # Use a descriptive name for the node: 11 | # 12 | node.name: node-1 13 | # ----------------------------------- Paths ------------------------------------ 14 | # 15 | # Path to directory where to store the data (separate multiple locations by comma): 16 | # 17 | path.data: /var/lib/elasticsearch 18 | # 19 | # Path to log files: 20 | # 21 | path.logs: /var/log/elasticsearch 22 | # 23 | # ----------------------------------- Memory ----------------------------------- 24 | # 25 | # Lock the memory on startup: 26 | # 27 | bootstrap.memory_lock: true 28 | # -------------------------------------------------------------------------------- 29 | 30 | # Enable security features 31 | xpack.security.enabled: true 32 | xpack.security.enrollment.enabled: true 33 | 34 | # Enable encryption for HTTP API client connections, such as Kibana, Logstash, and Agents 35 | xpack.security.http.ssl: 36 | enabled: true 37 | keystore.path: certs/http.p12 38 | 39 | # Enable encryption and mutual authentication between cluster nodes 40 | xpack.security.transport.ssl: 41 | enabled: true 42 | verification_mode: certificate 43 | keystore.path: certs/transport.p12 44 | truststore.path: certs/transport.p12 45 | # Create a new cluster with the current node only 46 | # Additional nodes can still join the cluster later 47 | cluster.initial_master_nodes: ["node-1"] 48 | 49 | # Allow HTTP API connections from anywhere 50 | # Connections are encrypted and require user authentication 51 | http.host: 0.0.0.0 52 | 53 | #----------------------- END SECURITY AUTO CONFIGURATION ------------------------- -------------------------------------------------------------------------------- /elasticsearch/config/filebeat.yml: -------------------------------------------------------------------------------- 1 | # ============================== Filebeat inputs =============================== 2 | filebeat.registry.flush: 5s 3 | filebeat.inputs: 4 | 5 | - type: filestream 6 | id: bluesky-events 7 | 8 | paths: 9 | - 10 | parsers: 11 | - ndjson: 12 | target: "" 13 | # ============================== Filebeat modules ============================== 14 | 15 | filebeat.config.modules: 16 | # Glob pattern for configuration loading 17 | path: ${path.config}/modules.d/*.yml 18 | 19 | # Set to true to enable config reloading 20 | reload.enabled: false 21 | 22 | # ======================= Elasticsearch template setting ======================= 23 | 24 | setup.template.enabled: false 25 | 26 | # ================================== Outputs =================================== 27 | 28 | # Configure what output to use when sending the data collected by the beat. 29 | 30 | # ---------------------------- Elasticsearch Output ---------------------------- 31 | 32 | output.elasticsearch: 33 | # Array of hosts to connect to. 
34 | hosts: ["https://localhost:9200"] 35 | 36 | # Performance preset - one of "balanced", "throughput", "scale", 37 | # "latency", or "custom". 38 | preset: throughput 39 | compression_level: 1 40 | idle_connection_timeout: 30s 41 | # Protocol - either `http` (default) or `https`. 42 | protocol: "https" 43 | index: "" 44 | # Authentication credentials - either API key or username/password. 45 | api_key: "" 46 | ssl: 47 | enabled: true 48 | verification_mode: "none" 49 | 50 | http.enabled: true 51 | http.host: localhost 52 | http.port: 5066 53 | 54 | logging.level: info 55 | logging.to_files: true 56 | logging.files: 57 | path: /var/log/filebeat 58 | name: filebeat 59 | keepfiles: 7 60 | permissions: 0640 61 | 62 | processors: 63 | - rename: 64 | when: 65 | and: 66 | - has_fields: ["commit.record.subject"] 67 | - not: 68 | has_fields: ["commit.record.subject.cid"] 69 | fields: 70 | - from: "commit.record.subject" 71 | to: "commit.record.subject.value" 72 | - rename: 73 | when: 74 | and: 75 | - has_fields: ["commit.record.embed.images.data"] 76 | - not: 77 | has_fields: ["commit.record.subject.cid"] 78 | fields: 79 | - from: "commit.record.embed.images.data" 80 | to: "commit.record.embed.images.data.value" 81 | - drop_fields: 82 | fields: ["log", "agent", "ecs","host", "input"] 83 | ignore_missing: true 84 | - script: 85 | lang: javascript 86 | source: > 87 | function process(event){ 88 | var time_us = event.Get("time_us"); 89 | if (typeof time_us === 'string') { 90 | time_us = BigInt(time_us); // If time_us is a string, cast it to a BigInt 91 | } else if (typeof time_us !== 'number') { 92 | return; // Exit the function if time_us is not a valid number 93 | } 94 | var time_us_ms = time_us / 1000; 95 | event.Put("time_us", time_us_ms.toString()); 96 | } -------------------------------------------------------------------------------- /elasticsearch/config/ilm.json: -------------------------------------------------------------------------------- 1 | { 2 | "policy": { 3 | "phases": { 4 | "hot": { 5 | "min_age": "0ms", 6 | "actions": { 7 | "rollover": { 8 | "max_age": "30d", 9 | "max_primary_shard_size": "50gb" 10 | }, 11 | "forcemerge": { 12 | "max_num_segments": 1 13 | }, 14 | "readonly": {} 15 | } 16 | } 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_no_source_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": false 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | 
"properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_no_source_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "routing": { 13 | "allocation": { 14 | "include": { 15 | "_tier_preference": "data_hot" 16 | } 17 | } 18 | }, 19 | "mapping": { 20 | "total_fields": { 21 | "limit": "10000" 22 | } 23 | }, 24 | "refresh_interval": "30s", 25 | "number_of_shards": "1", 26 | "max_docvalue_fields_search": "200", 27 | "sort": { 28 | "field": [ 29 | "kind", 30 | "commit.operation", 31 | "commit.collection", 32 | "did", 33 | "time_us" 34 | ], 35 | "order": [ 36 | "asc", 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc" 41 | ] 42 | }, 43 | "number_of_replicas": "0" 44 | } 45 | }, 46 | "mappings": { 47 | "_source": { 48 | "enabled": false 49 | }, 50 | "dynamic_templates": [ 51 | { 52 | "strings_as_keyword": { 53 | "match_mapping_type": "string", 54 | "mapping": { 55 | "ignore_above": 1024, 56 | "type": "keyword" 57 | } 58 | } 59 | } 60 | ], 61 | "properties": { 62 | "kind": { 63 | "type": "keyword" 64 | }, 65 | "commit": { 66 | "properties": { 67 | "collection": { 68 | "type": "keyword" 69 | }, 70 | "operation": { 71 | "type": "keyword" 72 | } 73 | } 74 | }, 75 | "did": { 76 | "type": "keyword" 77 | }, 78 | "time_us": { 79 | "type": "date" 80 | } 81 | } 82 | }, 83 | "aliases": {} 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_source_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "codec": "best_compression", 13 | "routing": { 14 | "allocation": { 15 | "include": { 16 | "_tier_preference": "data_hot" 17 | } 18 | } 19 | }, 20 | "mapping": { 21 | "total_fields": { 22 | "limit": "10000" 23 | } 24 | }, 25 | "refresh_interval": "30s", 26 | "number_of_shards": "1", 27 | "max_docvalue_fields_search": "200", 28 | "sort": { 29 | "field": [ 30 | "kind", 31 | "commit.operation", 32 | "commit.collection", 33 | "did", 34 | "time_us" 35 | ], 36 | "order": [ 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc", 41 | "asc" 42 | ] 43 | }, 44 | "number_of_replicas": "0" 45 | } 46 | }, 47 | "mappings": { 48 | "_source": { 49 | "enabled": true 50 | }, 51 | "dynamic_templates": [ 52 | { 53 | "strings_as_keyword": { 54 | "match_mapping_type": "string", 55 | "mapping": { 56 | "ignore_above": 1024, 57 | "type": "keyword" 58 | } 59 | } 60 | } 61 | ], 62 | "properties": { 63 | "kind": { 64 | "type": "keyword" 65 | }, 66 | "commit": { 67 | "properties": { 68 | "collection": { 69 | "type": "keyword" 70 | }, 71 | "operation": { 72 | "type": "keyword" 73 | } 74 | } 75 | }, 76 | "did": { 77 | "type": "keyword" 78 | }, 79 | "time_us": { 80 | "type": "date" 81 | } 
82 | } 83 | }, 84 | "aliases": {} 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /elasticsearch/config/index_template_source_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "index_patterns": [ 3 | "${INDEX_NAME}" 4 | ], 5 | "data_stream": {}, 6 | "template": { 7 | "settings": { 8 | "index": { 9 | "lifecycle": { 10 | "name": "filebeat" 11 | }, 12 | "routing": { 13 | "allocation": { 14 | "include": { 15 | "_tier_preference": "data_hot" 16 | } 17 | } 18 | }, 19 | "mapping": { 20 | "total_fields": { 21 | "limit": "10000" 22 | } 23 | }, 24 | "refresh_interval": "30s", 25 | "number_of_shards": "1", 26 | "max_docvalue_fields_search": "200", 27 | "sort": { 28 | "field": [ 29 | "kind", 30 | "commit.operation", 31 | "commit.collection", 32 | "did", 33 | "time_us" 34 | ], 35 | "order": [ 36 | "asc", 37 | "asc", 38 | "asc", 39 | "asc", 40 | "asc" 41 | ] 42 | }, 43 | "number_of_replicas": "0" 44 | } 45 | }, 46 | "mappings": { 47 | "_source": { 48 | "enabled": true 49 | }, 50 | "dynamic_templates": [ 51 | { 52 | "strings_as_keyword": { 53 | "match_mapping_type": "string", 54 | "mapping": { 55 | "ignore_above": 1024, 56 | "type": "keyword" 57 | } 58 | } 59 | } 60 | ], 61 | "properties": { 62 | "kind": { 63 | "type": "keyword" 64 | }, 65 | "commit": { 66 | "properties": { 67 | "collection": { 68 | "type": "keyword" 69 | }, 70 | "operation": { 71 | "type": "keyword" 72 | } 73 | } 74 | }, 75 | "did": { 76 | "type": "keyword" 77 | }, 78 | "time_us": { 79 | "type": "date" 80 | } 81 | } 82 | }, 83 | "aliases": {} 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /elasticsearch/config/jvm.options: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | ## 3 | ## JVM configuration 4 | ## 5 | ################################################################ 6 | ## 7 | ## WARNING: DO NOT EDIT THIS FILE. If you want to override the 8 | ## JVM options in this file, or set any additional options, you 9 | ## should create one or more files in the jvm.options.d 10 | ## directory containing your adjustments. 11 | ## 12 | ## See https://www.elastic.co/guide/en/elasticsearch/reference/8.15/jvm-options.html 13 | ## for more information. 14 | ## 15 | ################################################################ 16 | 17 | 18 | 19 | ################################################################ 20 | ## IMPORTANT: JVM heap size 21 | ################################################################ 22 | ## 23 | ## The heap size is automatically configured by Elasticsearch 24 | ## based on the available memory in your system and the roles 25 | ## each node is configured to fulfill. If specifying heap is 26 | ## required, it should be done through a file in jvm.options.d, 27 | ## which should be named with .options suffix, and the min and 28 | ## max should be set to the same value. 
For example, to set the 29 | ## heap to 4 GB, create a new file in the jvm.options.d 30 | ## directory containing these lines: 31 | ## 32 | -Xms31g 33 | -Xmx31g 34 | ## 35 | ## See https://www.elastic.co/guide/en/elasticsearch/reference/8.15/heap-size.html 36 | ## for more information 37 | ## 38 | ################################################################ 39 | 40 | 41 | ################################################################ 42 | ## Expert settings 43 | ################################################################ 44 | ## 45 | ## All settings below here are considered expert settings. Do 46 | ## not adjust them unless you understand what you are doing. Do 47 | ## not edit them in this file; instead, create a new file in the 48 | ## jvm.options.d directory containing your adjustments. 49 | ## 50 | ################################################################ 51 | 52 | -XX:+UseG1GC 53 | 54 | ## JVM temporary directory 55 | -Djava.io.tmpdir=${ES_TMPDIR} 56 | 57 | # Leverages accelerated vector hardware instructions; removing this may 58 | # result in less optimal vector performance 59 | 20-:--add-modules=jdk.incubator.vector 60 | 61 | ## heap dumps 62 | 63 | # generate a heap dump when an allocation from the Java heap fails; heap dumps 64 | # are created in the working directory of the JVM unless an alternative path is 65 | # specified 66 | -XX:+HeapDumpOnOutOfMemoryError 67 | 68 | # exit right after heap dump on out of memory error 69 | -XX:+ExitOnOutOfMemoryError 70 | 71 | # specify an alternative path for heap dumps; ensure the directory exists and 72 | # has sufficient space 73 | -XX:HeapDumpPath=/var/lib/elasticsearch 74 | 75 | # specify an alternative path for JVM fatal error logs 76 | -XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log 77 | 78 | ## GC logging 79 | -Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,level,pid,tags:filecount=32,filesize=64m -------------------------------------------------------------------------------- /elasticsearch/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | echo $(curl -s -k -X GET "https://localhost:9200/${INDEX_NAME}/_count" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' | jq '.count') 19 | -------------------------------------------------------------------------------- /elasticsearch/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 4 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 5 | [[ ! 
-f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 6 | export $(cat .elastic_password) 7 | fi 8 | 9 | # Check if the required arguments are provided 10 | if [[ $# -lt 6 ]]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | INDEX_TEMPLATE_FILE="config/$2.json" 18 | DATA_DIRECTORY="$3" 19 | NUM_FILES="$4" 20 | SUCCESS_LOG="$5" 21 | ERROR_LOG="$6" 22 | 23 | # Validate arguments 24 | [[ ! -f "$INDEX_TEMPLATE_FILE" ]] && { echo "Error: Index template file '$INDEX_TEMPLATE_FILE' does not exist."; exit 1; } 25 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 26 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 27 | 28 | # Check ilm policy is installed, install if not 29 | # If curl return 404, means ILM policy is not installed 30 | 31 | http_code=$(curl -s -o /dev/null -k -w "%{http_code}" -X GET "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json') 32 | if [[ "$http_code" -eq 404 ]] ; then 33 | echo "Installing ILM policy" 34 | ILM_POLICY=$(cat "config/ilm.json") 35 | curl -s -k -X PUT "https://localhost:9200/_ilm/policy/filebeat" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$ILM_POLICY" 36 | fi 37 | 38 | # Install index template 39 | # Read index template file json from config/$INDEX_TEMPLATE_FILE 40 | INDEX_TEMPLATE=$(cat "$INDEX_TEMPLATE_FILE") 41 | JSON_DATA=$(cat $INDEX_TEMPLATE_FILE | sed "s/\${INDEX_NAME}/$INDEX_NAME/g") 42 | echo "Install index template" 43 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_index_template/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$JSON_DATA" 44 | 45 | # Create the data stream 46 | echo "Create the data stream" 47 | curl -s -o /dev/null -k -X PUT "https://localhost:9200/_data_stream/${INDEX_NAME}" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' 48 | 49 | # Load data 50 | ./load_data.sh "$DATA_DIRECTORY" "$INDEX_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 51 | 52 | echo "Script completed successfully." 
-------------------------------------------------------------------------------- /elasticsearch/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install elasticsearch 4 | wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg 5 | sudo apt-get install apt-transport-https 6 | echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | sudo tee /etc/apt/sources.list.d/elastic-8.x.list 7 | sudo apt-get update && sudo apt-get install elasticsearch 8 | 9 | # # Install filebeat 10 | curl -L -O https://artifacts.elastic.co/downloads/beats/filebeat/filebeat-8.17.0-amd64.deb 11 | sudo dpkg -i filebeat-8.17.0-amd64.deb 12 | 13 | # # Overwrite configuration files 14 | sudo cp config/elasticsearch.yml /etc/elasticsearch/elasticsearch.yml 15 | sudo cp config/jvm.options /etc/elasticsearch/jvm.options 16 | 17 | # # Start elasticsearch 18 | sudo systemctl start elasticsearch.service 19 | 20 | # Reset and export elastic password 21 | export ELASTIC_PASSWORD=$(sudo /usr/share/elasticsearch/bin/elasticsearch-reset-password -s -a -b -u elastic) 22 | 23 | # Save elastic password in local file 24 | echo "ELASTIC_PASSWORD=$ELASTIC_PASSWORD" > .elastic_password 25 | 26 | # Generate api key for filebeat 27 | curl -s -k -X POST "https://localhost:9200/_security/api_key" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d ' 28 | { 29 | "name": "filebeat", 30 | "role_descriptors": { 31 | "filebeat_writer": { 32 | "cluster": ["monitor", "read_ilm", "read_pipeline"], 33 | "index": [ 34 | { 35 | "names": ["bluesky-*"], 36 | "privileges": ["view_index_metadata", "create_doc", "auto_configure"] 37 | } 38 | ] 39 | } 40 | } 41 | }' | jq -r '"\(.id):\(.api_key)"' > .filebeat_api_key 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /elasticsearch/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 5 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | DIRECTORY="$1" 17 | INDEX_NAME="$2" 18 | MAX_FILES="$3" 19 | SUCCESS_LOG="$4" 20 | ERROR_LOG="$5" 21 | 22 | # Validate that MAX_FILES is a number 23 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 24 | echo "Error: must be a positive integer." 25 | exit 1 26 | fi 27 | 28 | # Ensure the log files exist 29 | touch "$SUCCESS_LOG" "$ERROR_LOG" 30 | 31 | # Create a temporary directory in /var/tmp and ensure it's accessible 32 | TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX) 33 | chmod 777 "$TEMP_DIR" # Allow access for all users 34 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 35 | 36 | # Copy all files to temp location and uncompress them 37 | # Counter to track processed files 38 | counter=0 39 | 40 | # Loop through each .json.gz file in the directory 41 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 42 | if [[ -f "$file" ]]; then 43 | echo "Processing $file..." 
44 | counter=$((counter + 1)) 45 | 46 | # Uncompress the file into the temporary directory 47 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 48 | gunzip -c "$file" > "$uncompressed_file" 49 | 50 | # Check if uncompression was successful 51 | if [[ $? -ne 0 ]]; then 52 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 53 | continue 54 | fi 55 | 56 | # Grant read permissions 57 | chmod 644 "$uncompressed_file" 58 | # Stop processing if the max number of files is reached 59 | if [[ $counter -ge $MAX_FILES ]]; then 60 | echo "Processed maximum number of files: $MAX_FILES" 61 | break 62 | fi 63 | else 64 | echo "No .json.gz files found in the directory." 65 | fi 66 | done 67 | 68 | echo "All files have been copied to temp location." 69 | 70 | echo "Prepare filebeat for ingestion" 71 | 72 | # Prepare Filebeat configuration 73 | FILEBEAT_API_KEY=$(cat .filebeat_api_key) 74 | FILEBEAT_CONFIG=$(sed -e "s||$FILEBEAT_API_KEY|g" -e "s||$INDEX_NAME|g" -e "s||"${TEMP_DIR}/*"|g" config/filebeat.yml) 75 | echo "$FILEBEAT_CONFIG" | sudo tee /etc/filebeat/filebeat.yml > /dev/null 76 | sudo rm -rf /var/lib/filebeat/registry 77 | sudo service filebeat start 78 | trap "sudo service filebeat stop" EXIT # Stop filebeat on exit 79 | 80 | # wait until all files have been ingested 81 | total_processed=0 82 | max_events=$MAX_FILES*1000000 83 | while [[ $total_processed -lt $max_events ]]; do 84 | sleep 30 85 | total_processed=$(curl -k -s -XGET 'localhost:5066/stats' | jq .filebeat.events.done) 86 | echo "Total processed files: $total_processed" 87 | done 88 | 89 | sudo service filebeat stop 90 | 91 | echo "Force merge indices" 92 | curl -k -X POST "https://localhost:9200/$INDEX_NAME/_forcemerge?max_num_segments=1" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' 93 | 94 | echo "All files have been processed." -------------------------------------------------------------------------------- /elasticsearch/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local template=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky-${template}-${size}m" "index_template_${template}" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.data_size" 43 | ./count.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.count" 44 | #./query_results.sh "bluesky-${template}-${size}m" | tee "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.query_results" 45 | ./benchmark.sh "bluesky-${template}-${size}m" "${OUTPUT_PREFIX}_bluesky-${template}-${size}m.results_runtime" 46 | } 47 | 48 | case $choice in 49 | 2) 50 | benchmark 10 no_source_best_compression 51 | benchmark 10 source_best_compression 52 | benchmark 10 source_default_compression 53 | benchmark 10 no_source_default_compression 54 | ;; 55 | 3) 56 | benchmark 100 no_source_best_compression 57 | benchmark 100 source_best_compression 58 | benchmark 100 source_default_compression 59 | benchmark 100 no_source_default_compression 60 | ;; 61 | 4) 62 | benchmark 1000 no_source_best_compression 63 | benchmark 1000 source_best_compression 64 | benchmark 1000 source_default_compression 65 | benchmark 1000 no_source_default_compression 66 | ;; 67 | 5) 68 | benchmark 1 no_source_best_compression 69 | benchmark 1 source_best_compression 70 | benchmark 1 source_default_compression 71 | benchmark 1 no_source_default_compression 72 | benchmark 10 no_source_best_compression 73 | benchmark 10 source_best_compression 74 | benchmark 10 source_default_compression 75 | benchmark 10 no_source_default_compression 76 | benchmark 100 no_source_best_compression 77 | benchmark 100 source_best_compression 78 | benchmark 100 source_default_compression 79 | benchmark 100 no_source_default_compression 80 | benchmark 1000 no_source_best_compression 81 | benchmark 1000 source_best_compression 82 | benchmark 1000 source_default_compression 83 | benchmark 1000 no_source_default_compression 84 | ;; 85 | *) 86 | benchmark 1 no_source_best_compression 87 | benchmark 1 source_best_compression 88 | benchmark 1 source_default_compression 89 | benchmark 1 no_source_default_compression 90 | ;; 91 | esac 92 | -------------------------------------------------------------------------------- /elasticsearch/queries.txt: -------------------------------------------------------------------------------- 1 | FROM ${INDEX_NAME} | STATS count = COUNT() BY commit.collection | SORT count DESC 2 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" | STATS users = COUNT_DISTINCT(did, 40000), count = COUNT() BY commit.collection | SORT count DESC 3 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection IN (\\\"app.bsky.feed.post\\\", \\\"app.bsky.feed.repost\\\", \\\"app.bsky.feed.like\\\") | STATS count = COUNT() BY commit.collection, DATE_EXTRACT(\\\"hour_of_day\\\", time_us) | SORT count, commit.collection 4 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS first_post_ts = MIN(time_us) BY did | SORT first_post_ts ASC | LIMIT 3 5 | FROM ${INDEX_NAME} | WHERE kind == \\\"commit\\\" AND commit.operation == \\\"create\\\" AND commit.collection == \\\"app.bsky.feed.post\\\" | STATS activity_span = date_diff(\\\"millisecond\\\",min(time_us), max(time_us)) BY did | SORT activity_span DESC | LIMIT 3 
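The \\\" sequences in queries.txt are consumed twice: once by the eval in run_queries.sh and query_results.sh, and once as JSON string escapes in the request body sent to the _query endpoint. A small bash sketch of that expansion (illustrative only; the index name is a placeholder):

# Sketch only: how one line of queries.txt becomes the ES|QL request body.
INDEX_NAME="bluesky-source_default_compression-1m"   # placeholder index name
QUERY='FROM ${INDEX_NAME} | STATS count = COUNT() BY commit.collection | SORT count DESC'
eval "QUERY=\"${QUERY}\""             # substitutes ${INDEX_NAME}; in the quoted queries, \\\" collapses to \"
CURL_DATA="{\"query\": \"$QUERY\"}"   # JSON body for: curl -X POST https://localhost:9200/_query?format=txt
echo "$CURL_DATA"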
-------------------------------------------------------------------------------- /elasticsearch/queries_formatted.txt: -------------------------------------------------------------------------------- 1 | -- Q1 - Top event types 2 | 3 | POST /_query?format=txt 4 | { 5 | "query": """FROM $INDEX_NAME 6 | | STATS count = COUNT() BY commit.collection 7 | | SORT count DESC""" 8 | } 9 | 10 | -- Q2 - Top event types together with unique users per event type 11 | 12 | POST /_query?format=txt 13 | { 14 | "query": """FROM $INDEX_NAME 15 | | WHERE kind == "commit" AND commit.operation == "create" 16 | | STATS users = COUNT_DISTINCT(did), count = COUNT() BY commit.collection 17 | | SORT count DESC""" 18 | } 19 | 20 | -- Q3 - When do people use BlueSky 21 | 22 | POST /_query?format=txt 23 | { 24 | "query": """FROM $INDEX_NAME 25 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection IN ("app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like") 26 | | STATS count = COUNT() BY commit.collection, DATE_EXTRACT("hour_of_day", time_us) 27 | | SORT count, commit.collection""" 28 | } 29 | 30 | -- Q4 - top 3 post veterans 31 | 32 | POST /_query?format=txt 33 | { 34 | "query": """FROM $INDEX_NAME 35 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 36 | | STATS first_post_ts = MIN(time_us) BY did 37 | | SORT first_post_ts ASC 38 | | LIMIT 3""" 39 | } 40 | 41 | -- Q5 - top 3 users with longest activity 42 | 43 | POST /_query?format=txt 44 | { 45 | "query": """FROM $INDEX_NAME 46 | | WHERE kind == "commit" AND commit.operation == "create" AND commit.collection == "app.bsky.feed.post" 47 | | STATS activity_span = date_diff("millisecond",min(time_us), max(time_us)) BY did 48 | | SORT activity_span DESC 49 | | LIMIT 3""" 50 | } -------------------------------------------------------------------------------- /elasticsearch/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | QUERY_NUM=1 19 | 20 | # File containing Elasticsearch ES|SQL queries 21 | QUERY_FILE="queries.txt" 22 | 23 | # Check if the query file exists 24 | if [[ ! -f "$QUERY_FILE" ]]; then 25 | echo "Error: Query file '$QUERY_FILE' does not exist." 
26 | exit 1 27 | fi 28 | 29 | cat 'queries.txt' | while read -r QUERY; do 30 | eval "QUERY=\"${QUERY}\"" 31 | # Print the query 32 | echo "------------------------------------------------------------------------------------------------------------------------" 33 | echo "Result for query Q$QUERY_NUM: " 34 | echo 35 | CURL_DATA="{\"query\": \"$QUERY\"}" 36 | curl -s -k -X POST "https://localhost:9200/_query?format=txt" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA" 37 | echo 38 | # Increment the query number 39 | QUERY_NUM=$((QUERY_NUM + 1)) 40 | done 41 | -------------------------------------------------------------------------------- /elasticsearch/results/_query_results/_m6i.8xlarge_bluesky-no_source_best_compression-1m.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | count | commit.collection 5 | ---------------+-------------------------- 6 | 448944 |app.bsky.feed.like 7 | 360374 |app.bsky.graph.follow 8 | 90816 |app.bsky.feed.post 9 | 58540 |app.bsky.feed.repost 10 | 14040 |app.bsky.graph.block 11 | 11762 |app.bsky.actor.profile 12 | 8103 |app.bsky.graph.listitem 13 | 5328 |null 14 | 895 |app.bsky.graph.listblock 15 | 405 |app.bsky.graph.starterpack 16 | 356 |app.bsky.graph.list 17 | 255 |app.bsky.feed.threadgate 18 | 104 |app.bsky.feed.postgate 19 | 74 |app.bsky.feed.generator 20 | 4 |app.bsky.labeler.service 21 | 22 | ------------------------------------------------------------------------------------------------------------------------ 23 | Result for query Q2: 24 | 25 | users | count | commit.collection 26 | ---------------+---------------+-------------------------- 27 | 117184 |444523 |app.bsky.feed.like 28 | 63873 |337978 |app.bsky.graph.follow 29 | 50383 |86812 |app.bsky.feed.post 30 | 26580 |56993 |app.bsky.feed.repost 31 | 5783 |13838 |app.bsky.graph.block 32 | 1078 |7568 |app.bsky.graph.listitem 33 | 5337 |5337 |app.bsky.actor.profile 34 | 449 |860 |app.bsky.graph.listblock 35 | 218 |259 |app.bsky.graph.list 36 | 196 |228 |app.bsky.feed.threadgate 37 | 101 |104 |app.bsky.graph.starterpack 38 | 82 |101 |app.bsky.feed.postgate 39 | 9 |10 |app.bsky.feed.generator 40 | 41 | ------------------------------------------------------------------------------------------------------------------------ 42 | Result for query Q3: 43 | 44 | count | commit.collection |DATE_EXTRACT("hour_of_day", time_us) 45 | ---------------+--------------------+------------------------------------ 46 | 56993 |app.bsky.feed.repost|16 47 | 86812 |app.bsky.feed.post |16 48 | 444523 |app.bsky.feed.like |16 49 | 50 | ------------------------------------------------------------------------------------------------------------------------ 51 | Result for query Q4: 52 | 53 | first_post_ts | did 54 | ------------------------+-------------------------------- 55 | 2024-11-21T16:25:49.000Z|did:plc:yj3sjq3blzpynh27cumnp5ks 56 | 2024-11-21T16:25:49.001Z|did:plc:l5o3qjrmfztir54cpwlv2eme 57 | 2024-11-21T16:25:49.003Z|did:plc:s4bwqchfzm6gjqfeb6mexgbu 58 | 59 | ------------------------------------------------------------------------------------------------------------------------ 60 | Result for query Q5: 61 | 62 | activity_span | did 63 | ---------------+-------------------------------- 64 | 813007 |did:plc:tsyymlun4eqjuw7hqrhmwagd 65 | 811602 |did:plc:3ug235sfy2pz7cawmpsftb65 66 | 811404 
|did:plc:doxhhgtxqiv47tmcovpbcqai -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1000m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999998998, 14 | "data_compression": "zstd", 15 | "total_size": 235840659266, 16 | "total_size_readable": "219.6 GB", 17 | "result": [ 18 | [5.022,5.019,5.078], 19 | [51.486,45.510,45.713], 20 | [41.789,41.359,41.608], 21 | [8.807,8.812,8.711], 22 | [9.696,9.723,9.533] 23 | ], 24 | "result_readable": [ 25 | "5.02 sec, 5.02 sec, 5.08 sec", 26 | "51.49 sec, 45.51 sec, 45.71 sec", 27 | "41.79 sec, 41.36 sec, 41.61 sec", 28 | "8.81 sec, 8.81 sec, 8.71 sec", 29 | "9.70 sec, 9.72 sec, 9.53 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1000m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999106, 14 | "data_compression": "lz4", 15 | "total_size": 238598578300, 16 | "total_size_readable": "222 GB", 17 | "result": [ 18 | [4.427,4.294,4.382], 19 | [46.690,41.966,42.483], 20 | [39.386,39.241,38.131], 21 | [8.575,8.428,8.386], 22 | [9.362,9.322,9.299] 23 | ], 24 | "result_readable": [ 25 | "4.43 sec, 4.29 sec, 4.38 sec", 26 | "46.69 sec, 41.97 sec, 42.48 sec", 27 | "39.39 sec, 39.24 sec, 38.13 sec", 28 | "8.57 sec, 8.43 sec, 8.39 sec", 29 | "9.36 sec, 9.32 sec, 9.30 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_100m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "zstd", 15 | "data_size": 21268479403, 16 | "data_size_readable": "19.8 GB", 17 | "result": [ 18 | [2.532,2.536,2.486], 19 | [23.194,22.932,23.188], 20 | [19.521,19.321,19.159], 21 | [2.867,2.791,2.884], 22 | [3.099,3.136,3.171] 23 | ], 24 | "result_readable": [ 25 | "2.53 sec, 2.54 sec, 2.49 sec", 26 | "23.19 sec, 22.93 sec, 23.19 sec", 27 | "19.52 sec, 19.32 sec, 19.16 sec", 28 | "2.87 sec, 2.79 sec, 2.88 sec", 29 | "3.10 sec, 3.14 sec, 3.17 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_100m_default_compression.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "lz4", 15 | "total_size": 21240210194, 16 | "total_size_readable": "19.7 GB", 17 | "result": [ 18 | [2.519,2.524,2.483], 19 | [23.610,23.215,23.230], 20 | [19.762,20.236,19.979], 21 | [2.868,2.809,2.841], 22 | [3.075,3.103,3.117] 23 | ], 24 | "result_readable": [ 25 | "2.52 sec, 2.52 sec, 2.48 sec", 26 | "23.61 sec, 23.21 sec, 23.23 sec", 27 | "19.76 sec, 20.24 sec, 19.98 sec", 28 | "2.87 sec, 2.81 sec, 2.84 sec", 29 | "3.08 sec, 3.10 sec, 3.12 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_10m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "zstd", 15 | "total_size": 2834172690, 16 | "total_size_readable": "2.6 GB", 17 | "result": [ 18 | [0.270,0.263,0.275], 19 | [2.942,2.683,2.655], 20 | [2.014,2.008,2.037], 21 | [0.414,0.412,0.437], 22 | [0.562,0.470,0.463] 23 | ], 24 | "result_readable": [ 25 | "270.00 msec, 263.00 msec, 275.00 msec", 26 | "2.94 sec, 2.68 sec, 2.65 sec", 27 | "2.01 sec, 2.01 sec, 2.04 sec", 28 | "414.00 msec, 412.00 msec, 437.00 msec", 29 | "562.00 msec, 470.00 msec, 463.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_10m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "lz4", 15 | "total_size": 3128506511, 16 | "total_size_readable": "2.9 GB", 17 | "result": [ 18 | [0.268,0.269,0.259], 19 | [2.631,2.638,2.557], 20 | [2.099,2.049,2.066], 21 | [0.412,0.407,0.405], 22 | [0.468,0.466,0.462] 23 | ], 24 | "result_readable": [ 25 | "268.00 msec, 269.00 msec, 259.00 msec", 26 | "2.63 sec, 2.64 sec, 2.56 sec", 27 | "2.10 sec, 2.05 sec, 2.07 sec", 28 | "412.00 msec, 407.00 msec, 405.00 msec", 29 | "468.00 msec, 466.00 msec, 462.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": 
"m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 400948257, 16 | "total_size_readable": "382.3 MB", 17 | "result": [ 18 | [0.041,0.037,0.035], 19 | [0.426,0.321,0.323], 20 | [0.192,0.186,0.213], 21 | [0.056,0.052,0.053], 22 | [0.099,0.061,0.060] 23 | ], 24 | "result_readable": [ 25 | "41.00 msec, 37.00 msec, 35.00 msec", 26 | "426.00 msec, 321.00 msec, 323.00 msec", 27 | "192.00 msec, 186.00 msec, 213.00 msec", 28 | "56.00 msec, 52.00 msec, 53.00 msec", 29 | "99.00 msec, 61.00 msec, 60.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_no_source_1m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (no source, default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 481917965, 16 | "total_size_readable": "459.4 MB", 17 | "result": [ 18 | [0.035,0.035,0.033], 19 | [0.343,0.330,0.317], 20 | [0.192,0.196,0.189], 21 | [0.049,0.049,0.049], 22 | [0.057,0.057,0.057] 23 | ], 24 | "result_readable": [ 25 | "35.00 msec, 35.00 msec, 33.00 msec", 26 | "343.00 msec, 330.00 msec, 317.00 msec", 27 | "192.00 msec, 196.00 msec, 189.00 msec", 28 | "49.00 msec, 49.00 msec, 49.00 msec", 29 | "57.00 msec, 57.00 msec, 57.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1000m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999101, 14 | "data_compression": "zstd", 15 | "total_size": 386099682721, 16 | "total_size_readable": "359.6 GB", 17 | "result": [ 18 | [3.854,3.884,4.081], 19 | [37.078,29.084,28.548], 20 | [24.382,24.279,23.570], 21 | [8.106,8.228,8.080], 22 | [9.208,8.994,9.084] 23 | ], 24 | "result_readable": [ 25 | "3.85 sec, 3.88 sec, 4.08 sec", 26 | "37.08 sec, 29.08 sec, 28.55 sec", 27 | "24.38 sec, 24.28 sec, 23.57 sec", 28 | "8.11 sec, 8.23 sec, 8.08 sec", 29 | "9.21 sec, 8.99 sec, 9.08 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1000m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 999999153, 14 | "data_compression": "lz4", 15 | 
"total_size": 488061838541, 16 | "total_size_readable": "454.5 GB", 17 | "result": [ 18 | [3.087,3.026,3.060], 19 | [35.117,27.380,27.745], 20 | [24.882,25.534,24.787], 21 | [8.899,8.912,8.731], 22 | [9.748,9.733,9.797] 23 | ], 24 | "result_readable": [ 25 | "3.09 sec, 3.03 sec, 3.06 sec", 26 | "35.12 sec, 27.38 sec, 27.75 sec", 27 | "24.88 sec, 25.53 sec, 24.79 sec", 28 | "8.90 sec, 8.91 sec, 8.73 sec", 29 | "9.75 sec, 9.73 sec, 9.80 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_100m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "zstd", 15 | "total_size": 34182479705, 16 | "total_size_readable": "31.8 GB", 17 | "result": [ 18 | [2.765,2.718,2.799], 19 | [20.788,20.822,20.270], 20 | [16.306,16.642,15.693], 21 | [2.454,2.461,2.423], 22 | [2.761,2.768,2.784] 23 | ], 24 | "result_readable": [ 25 | "2.77 sec, 2.72 sec, 2.80 sec", 26 | "20.79 sec, 20.82 sec, 20.27 sec", 27 | "16.31 sec, 16.64 sec, 15.69 sec", 28 | "2.45 sec, 2.46 sec, 2.42 sec", 29 | "2.76 sec, 2.77 sec, 2.78 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_100m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 99999947, 14 | "data_compression": "lz4", 15 | "total_size": 42432097529, 16 | "total_size_readable": "39.5 GB", 17 | "result": [ 18 | [2.520,2.507,2.592], 19 | [23.055,22.842,22.868], 20 | [19.873,20.199,19.585], 21 | [2.891,2.819,2.791], 22 | [3.144,3.183,3.171] 23 | ], 24 | "result_readable": [ 25 | "2.52 sec, 2.51 sec, 2.59 sec", 26 | "23.05 sec, 22.84 sec, 22.87 sec", 27 | "19.87 sec, 20.20 sec, 19.59 sec", 28 | "2.89 sec, 2.82 sec, 2.79 sec", 29 | "3.14 sec, 3.18 sec, 3.17 sec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_10m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "zstd", 15 | "total_size": 3785308904, 16 | "total_size_readable": "3.5 GB", 17 | "result": [ 18 | [0.286,0.290,0.287], 19 | [2.487,2.367,2.406], 20 | [1.747,1.671,1.656], 21 | [0.368,0.360,0.364], 22 | [0.423,0.424,0.422] 23 | ], 24 | "result_readable": [ 25 | "286.00 msec, 290.00 msec, 287.00 msec", 26 | 
"2.49 sec, 2.37 sec, 2.41 sec", 27 | "1.75 sec, 1.67 sec, 1.66 sec", 28 | "368.00 msec, 360.00 msec, 364.00 msec", 29 | "423.00 msec, 424.00 msec, 422.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_10m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (default)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 9999998, 14 | "data_compression": "lz4", 15 | "total_size": 4600428060, 16 | "total_size_readable": "4.2 GB", 17 | "result": [ 18 | [0.266,0.274,0.266], 19 | [2.753,2.634,2.620], 20 | [2.048,2.130,2.020], 21 | [0.417,0.399,0.402], 22 | [0.465,0.466,0.480] 23 | ], 24 | "result_readable": [ 25 | "266.00 msec, 274.00 msec, 266.00 msec", 26 | "2.75 sec, 2.63 sec, 2.62 sec", 27 | "2.05 sec, 2.13 sec, 2.02 sec", 28 | "417.00 msec, 399.00 msec, 402.00 msec", 29 | "465.00 msec, 466.00 msec, 480.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1m_best_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch (best compression)", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 400974094, 16 | "total_size_readable": "382.3 MB", 17 | "result": [ 18 | [0.039,0.036,0.036], 19 | [0.344,0.303,0.305], 20 | [0.171,0.166,0.159], 21 | [0.047,0.047,0.049], 22 | [0.056,0.056,0.056] 23 | ], 24 | "result_readable": [ 25 | "39.00 msec, 36.00 msec, 36.00 msec", 26 | "344.00 msec, 303.00 msec, 305.00 msec", 27 | "171.00 msec, 166.00 msec, 159.00 msec", 28 | "47.00 msec, 47.00 msec, 49.00 msec", 29 | "56.00 msec, 56.00 msec, 56.00 msec" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/results/m6i.8xlarge_bluesky_source_1m_default_compression.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "Elasticsearch", 3 | "version": "8.17.0", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-14", 6 | "machine": "m6i.8xlarge, 16000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 932793512, 16 | "total_size_readable": "889.5 MB", 17 | "result": [ 18 | [0.037,0.034,0.035], 19 | [0.350,0.320,0.328], 20 | [0.199,0.193,0.190], 21 | [0.048,0.050,0.049], 22 | [0.057,0.059,0.058] 23 | ], 24 | "result_readable": [ 25 | "37.00 msec, 34.00 msec, 35.00 msec", 26 | "350.00 msec, 320.00 msec, 328.00 msec", 27 | "199.00 msec, 193.00 msec, 190.00 msec", 28 | "48.00 msec, 50.00 msec, 49.00 msec", 29 | "57.00 msec, 59.00 msec, 58.00 msec" 30 | ] 31 | } -------------------------------------------------------------------------------- 
/elasticsearch/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Number of tries for each query 19 | TRIES=3 20 | 21 | # File containing Elasticsearch ES|SQL queries 22 | QUERY_FILE="queries.txt" 23 | LOG_FILE="query_log_$INDEX_NAME.log" 24 | > "$LOG_FILE" 25 | 26 | # Check if the query file exists 27 | if [[ ! -f "$QUERY_FILE" ]]; then 28 | echo "Error: Query file '$QUERY_FILE' does not exist." 29 | exit 1 30 | fi 31 | 32 | cat 'queries.txt' | while read -r QUERY; do 33 | # Clear filesystem cache between queries. 34 | sync 35 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 36 | # Clear query cache between queries. 37 | curl -k -X POST 'https://localhost:9200/hits/_cache/clear?pretty' -u "elastic:${ELASTIC_PASSWORD}" &>/dev/null 38 | eval "QUERY=\"${QUERY}\"" 39 | echo "Running query: $QUERY" 40 | for i in $(seq 1 $TRIES); do 41 | CURL_DATA="{\"query\": \"$QUERY\"}" 42 | RESPONSE=$(curl -s -k -X POST "https://localhost:9200/_query" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -d "$CURL_DATA") 43 | TOOK_MS=$(echo "$RESPONSE" | jq -r '.took' 2>/dev/null) 44 | 45 | # Convert 'took' to seconds (from ms to s) 46 | TOOK_S=$(bc <<< "scale=3; $TOOK_MS / 1000") 47 | TOOK_FORMATTED=$(printf "%.3f" "$TOOK_S") 48 | echo "$RESPONSE" >> "$LOG_FILE" 49 | echo "Response time: ${TOOK_FORMATTED} s" 50 | done 51 | done 52 | -------------------------------------------------------------------------------- /elasticsearch/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Check if ELASTIC_PASSWORD env variable is set, if set from not read from .elastic_password file 10 | if [[ -z "$ELASTIC_PASSWORD" ]]; then 11 | [[ ! -f ".elastic_password" ]] && { echo "Error: ELASTIC_PASSWORD environment variable is not set and .elastic_password file does not exist."; exit 1; } 12 | export $(cat .elastic_password) 13 | fi 14 | 15 | # Arguments 16 | INDEX_NAME="$1" 17 | 18 | # Get data size 19 | curl -k -XGET "https://localhost:9200/_data_stream/${INDEX_NAME}/_stats?human" -u "elastic:${ELASTIC_PASSWORD}" -H 'Content-Type: application/json' -------------------------------------------------------------------------------- /favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Terry7879/JSONBench/3e43960d581c4a3718ea7ec6c8812fadbceac02d/favicon.png -------------------------------------------------------------------------------- /generate-results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # This script will substitute the benchmark results into the HTML page. 4 | # Note: editing HTML with sed may look strange, but at least we avoid using node.js and npm, and that's good. 
5 | 6 | # This is needed on Mac OS. Do `brew install coreutils`. 7 | [ -n "$HOMEBREW_PREFIX" ] && PATH="${HOMEBREW_PREFIX}/opt/coreutils/libexec/gnubin:${PATH}" 8 | if command -v gsed >/dev/null 2>&1 9 | then 10 | alias sed='gsed' 11 | fi 12 | 13 | ( 14 | sed '/^const data = \[$/q' index.html 15 | 16 | FIRST=1 17 | LANG="" ls -1 */results*/*.json | while read -r file 18 | do 19 | [ "${FIRST}" = "0" ] && echo -n ',' 20 | jq --compact-output ". += {\"source\": \"${file}\"}" "${file}" || echo "Error in $file" >&2 21 | FIRST=0 22 | done 23 | 24 | echo ']; // end of data' 25 | sed '0,/^\]; \/\/ end of data$/d' index.html 26 | 27 | ) > index.html.new 28 | 29 | mv index.html index.html.bak 30 | mv index.html.new index.html 31 | -------------------------------------------------------------------------------- /mongodb/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Execution time: \d+ms' | sed -r 's/Execution time: ([0-9]+)/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /mongodb/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the document count using mongosh 14 | document_count=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const count = db.getCollection('$COLLECTION_NAME').stats().count 17 | print(count); 18 | ") 19 | 20 | # Debugging information 21 | echo "Database: $DATABASE_NAME" 22 | echo "Collection: $COLLECTION_NAME" 23 | echo "Document count: $document_count" 24 | 25 | # Print the result 26 | if [[ -z "$document_count" ]]; then 27 | echo "Error: Unable to fetch document count. Ensure the database and collection exist." 28 | exit 1 29 | else 30 | echo $document_count 31 | fi -------------------------------------------------------------------------------- /mongodb/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! 
-f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | # Create database and execute DDL file 24 | mongosh --quiet --eval " 25 | db = db.getSiblingDB('$DB_NAME'); 26 | load('$DDL_FILE'); 27 | " 28 | 29 | # Load data 30 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$COLLECTION_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 31 | 32 | echo "Script completed successfully." -------------------------------------------------------------------------------- /mongodb/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.storageSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /mongodb/ddl_snappy.js: -------------------------------------------------------------------------------- 1 | db.createCollection( 2 | "bluesky", 3 | { storageEngine: { wiredTiger: { configString: "block_compressor=snappy" } } } 4 | ); 5 | 6 | db.bluesky.createIndex({"kind": 1, "commit.operation": 1, "commit.collection": 1, "did": 1, "time_us": 1}); -------------------------------------------------------------------------------- /mongodb/ddl_zstd.js: -------------------------------------------------------------------------------- 1 | db.createCollection( 2 | "bluesky", 3 | { storageEngine: { wiredTiger: { configString: "block_compressor=zstd" } } } 4 | ); 5 | 6 | db.bluesky.createIndex({"kind": 1, "commit.operation": 1, "commit.collection": 1, "did": 1, "time_us": 1}); -------------------------------------------------------------------------------- /mongodb/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalIndexSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 
23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /mongodb/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | # File containing MongoDB queries (replace 'queries.js' with your file) 15 | QUERY_FILE="queries.js" 16 | 17 | # Check if the query file exists 18 | if [[ ! -f "$QUERY_FILE" ]]; then 19 | echo "Error: Query file '$QUERY_FILE' does not exist." 20 | exit 1 21 | fi 22 | 23 | # Set the internalQueryPlannerGenerateCoveredWholeIndexScans parameter to true 24 | echo "Setting internalQueryPlannerGenerateCoveredWholeIndexScans to true..." 25 | mongosh --quiet --eval " 26 | const result = db.adminCommand({ setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true }); 27 | if (result.ok !== 1) { 28 | print('Failed to set internalQueryPlannerGenerateCoveredWholeIndexScans: ' + JSON.stringify(result)); 29 | quit(1); 30 | } else { 31 | print('Successfully set internalQueryPlannerGenerateCoveredWholeIndexScans to true'); 32 | } 33 | " 34 | 35 | cat "$QUERY_FILE" | while read -r query; do 36 | 37 | # Print the query number 38 | echo "------------------------------------------------------------------------------------------------------------------------" 39 | echo "Index usage for query Q$QUERY_NUM:" 40 | echo 41 | 42 | # Modify the query to include the explain option inside the aggregate call 43 | MODIFIED_QUERY=$(echo "$query" | sed 's/]);$/], { explain: "queryPlanner" });/') 44 | 45 | # Escape the modified query for safe passing to mongosh 46 | ESCAPED_QUERY=$(echo "$MODIFIED_QUERY" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 47 | 48 | mongosh --quiet --eval " 49 | const db = db.getSiblingDB('$DB_NAME'); 50 | const result = eval(\"$ESCAPED_QUERY\"); 51 | printjson(result.stages[0].\$cursor.queryPlanner.winningPlan); 52 | " 53 | 54 | # Increment the query number 55 | QUERY_NUM=$((QUERY_NUM + 1)) 56 | done; -------------------------------------------------------------------------------- /mongodb/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ 4 | 5 | sudo sudo apt-get install gnupg curl 6 | curl -fsSL https://www.mongodb.org/static/pgp/server-8.0.asc | \ 7 | sudo gpg -o /usr/share/keyrings/mongodb-server-8.0.gpg \ 8 | --dearmor 9 | echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-8.0.gpg ] https://repo.mongodb.org/apt/ubuntu noble/mongodb-org/8.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-8.0.list 10 | sudo apt-get update 11 | sudo apt-get install -y mongodb-org 12 | sudo systemctl start mongod 13 | sudo systemctl status mongod -------------------------------------------------------------------------------- /mongodb/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | COLLECTION_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | MONGO_URI="mongodb://localhost:27017" # Replace with your MongoDB URI if 
necessary 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | # Create a temporary directory for uncompressed files 28 | TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX) 29 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 30 | 31 | # Counter to track processed files 32 | counter=0 33 | 34 | # Loop through each .json.gz file in the directory 35 | for file in $(ls "$DIRECTORY"/*.json.gz 2>/dev/null | sort); do 36 | if [[ -f "$file" ]]; then 37 | echo "Processing $file..." 38 | counter=$((counter + 1)) 39 | 40 | # Uncompress the file into the TEMP_DIR 41 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 42 | gunzip -c "$file" > "$uncompressed_file" 43 | 44 | # Check if uncompression was successful 45 | if [[ $? -ne 0 ]]; then 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 47 | continue 48 | fi 49 | 50 | # Import the uncompressed JSON file into MongoDB 51 | mongoimport --uri "$MONGO_URI" --db "$DB_NAME" --collection "$COLLECTION_NAME" --file "$uncompressed_file" 52 | import_status=$? 53 | 54 | # Check if the import was successful 55 | if [[ $import_status -eq 0 ]]; then 56 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $uncompressed_file into MongoDB." >> "$SUCCESS_LOG" 57 | else 58 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $uncompressed_file into MongoDB." >> "$ERROR_LOG" 59 | fi 60 | 61 | # Remove the uncompressed file after processing 62 | rm -f "$uncompressed_file" 63 | 64 | # Stop processing if the max number of files is reached 65 | if [[ $counter -ge $MAX_FILES ]]; then 66 | echo "Processed maximum number of files: $MAX_FILES" 67 | break 68 | fi 69 | fi 70 | done 71 | 72 | if [[ $counter -eq 0 ]]; then 73 | echo "No .json.gz files found in the directory." 74 | fi 75 | 76 | echo "All files have been processed." -------------------------------------------------------------------------------- /mongodb/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local compression=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 
39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky_${size}m_${compression}" bluesky "ddl_${compression}.js" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.total_size" 43 | ./data_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.data_size" 44 | ./index_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_size" 45 | ./count.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.count" 46 | #./query_results.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.query_results" 47 | ./index_usage.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_usage" 48 | ./benchmark.sh "bluesky_${size}m_${compression}" "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.results_runtime" 49 | } 50 | 51 | case $choice in 52 | 2) 53 | benchmark 10 snappy 54 | benchmark 10 zstd 55 | ;; 56 | 3) 57 | benchmark 100 snappy 58 | benchmark 100 zstd 59 | ;; 60 | 4) 61 | benchmark 1000 snappy 62 | benchmark 1000 zstd 63 | ;; 64 | 5) 65 | benchmark 1 snappy 66 | benchmark 1 zstd 67 | benchmark 10 snappy 68 | benchmark 10 zstd 69 | benchmark 100 snappy 70 | benchmark 100 zstd 71 | benchmark 1000 snappy 72 | benchmark 1000 zstd 73 | ;; 74 | *) 75 | benchmark 1 snappy 76 | benchmark 1 zstd 77 | ;; 78 | esac -------------------------------------------------------------------------------- /mongodb/queries.js: -------------------------------------------------------------------------------- 1 | db.bluesky.aggregate([ { $group: { _id: "$commit.collection", count: { $sum: 1 } } }, { $sort: { count: -1 } } ]); 2 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create" } }, { $group: { _id: "$commit.collection", count: { $sum: 1 }, users: { $addToSet: "$did" } } }, { $project: { event: "$_id", count: 1, users: { $size: "$users" } } }, { $sort: { count: -1 } } ]); 3 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": { $in: ["app.bsky.feed.post", "app.bsky.feed.repost", "app.bsky.feed.like"] } } }, { $project: { _id: 0, event: "$commit.collection", hour_of_day: { $hour: { $toDate: { $divide: ["$time_us", 1000] } } } } }, { $group: { _id: { event: "$event", hour_of_day: "$hour_of_day" }, count: { $sum: 1 } } }, { $sort: { "_id.hour_of_day": 1, "_id.event": 1 } } ]); 4 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", first_post_ts: { $min: "$timestamp" } } }, { $sort: { first_post_ts: 1 } }, { $limit: 3 } ]); 5 | db.bluesky.aggregate([ { $match: { "kind": "commit", "commit.operation": "create", "commit.collection": "app.bsky.feed.post" } }, { $project: { _id: 0, user_id: "$did", timestamp: { $toDate: { $divide: ["$time_us", 1000] } } } }, { $group: { _id: "$user_id", min_timestamp: { $min: "$timestamp" }, max_timestamp: { $max: "$timestamp" } } }, { $project: { activity_span: { $dateDiff: { startDate: "$min_timestamp", endDate: "$max_timestamp", unit: "millisecond" } } } }, { $sort: { activity_span: -1 } }, { $limit: 3 } ]); 
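For a quick manual check outside the benchmark scripts, Q1 can be run directly with mongosh against a database created by main.sh; a sketch follows (the database name assumes the 1m/snappy run):

# Sketch only: run Q1 by hand and print the result.
mongosh --quiet --eval '
  db = db.getSiblingDB("bluesky_1m_snappy");
  printjson(db.bluesky.aggregate([
    { $group: { _id: "$commit.collection", count: { $sum: 1 } } },
    { $sort: { count: -1 } }
  ]).toArray());
'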
-------------------------------------------------------------------------------- /mongodb/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | # File containing MongoDB queries (replace 'queries.js' with your file) 15 | QUERY_FILE="queries.js" 16 | 17 | # Check if the query file exists 18 | if [[ ! -f "$QUERY_FILE" ]]; then 19 | echo "Error: Query file '$QUERY_FILE' does not exist." 20 | exit 1 21 | fi 22 | 23 | # Read and execute each query 24 | cat "$QUERY_FILE" | while read -r query; do 25 | 26 | # Print the query 27 | echo "------------------------------------------------------------------------------------------------------------------------" 28 | echo "Result for query Q$QUERY_NUM:" 29 | echo 30 | 31 | # Escape the query for safe passing to mongosh 32 | ESCAPED_QUERY=$(echo "$query" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 33 | 34 | mongosh --eval " 35 | const db = db.getSiblingDB('$DB_NAME'); 36 | const result = eval(\"$ESCAPED_QUERY\"); 37 | printjson(result); 38 | " 39 | 40 | 41 | # Increment the query number 42 | QUERY_NUM=$((QUERY_NUM + 1)) 43 | 44 | done -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1000m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "snappy", 15 | "total_size": 248580009984, 16 | "total_size_readable": "248.58 GB", 17 | "data_size": 221332566016, 18 | "data_size_readable": "221.33 GB", 19 | "index_size": 27247443968, 20 | "index_size_readable": "27.25 GB", 21 | "result": [ 22 | [987.157,984.441,988.002], 23 | [21177.1,21130,21520.6], 24 | [1242.34,1229.8,1235.88], 25 | [161.845,162.1,162.285], 26 | [166.271,166.134,165.7] 27 | ], 28 | "result_readable": [ 29 | "16 min 27.16 sec, 16 min 24.44 sec, 16 min 28.00 sec", 30 | "5 hr 52 min 57.10 sec, 5 hr 52 min 10.00 sec, 5 hr 58 min 40.60 sec", 31 | "20 min 42.34 sec, 20 min 29.80 sec, 20 min 35.88 sec", 32 | "2 min 41.84 sec, 2 min 42.10 sec, 2 min 42.28 sec", 33 | "2 min 46.27 sec, 2 min 46.13 sec, 2 min 45.70 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1000m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "zstd", 15 | "total_size": 158276513792, 16 | "total_size_readable": "158.28 GB", 17 | "data_size": 130998910976, 18 | "data_size_readable": "131.00 GB", 19 | "index_size": 27277602816, 20 | "index_size_readable": "27.28 GB", 21 | "result": [ 22 | 
[992.518,977.968,983.272], 23 | [21558.6,21530.5,21379.2], 24 | [1238.16,1231,1242.69], 25 | [162.668,162.236,162.897], 26 | [165.783,166.72,165.989] 27 | ], 28 | "result_readable": [ 29 | "16 min 32.52 sec, 16 min 17.97 sec, 16 min 23.27 sec", 30 | "5 hr 59 min 18.60 sec, 5 hr 58 min 50.50 sec, 5 hr 56 min 19.20 sec", 31 | "20 min 38.16 sec, 20 min 31.00 sec, 20 min 42.69 sec", 32 | "2 min 42.67 sec, 2 min 42.24 sec, 2 min 42.90 sec", 33 | "2 min 45.78 sec, 2 min 46.72 sec, 2 min 45.99 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_100m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "snappy", 15 | "total_size": 26336047104, 16 | "total_size_readable": "26.34 GB", 17 | "data_size": 23520026624, 18 | "data_size_readable": "23.52 GB", 19 | "index_size": 2816020480, 20 | "index_size_readable": "2.82 GB", 21 | "result": [ 22 | [8.824,8.558,8.519], 23 | [32.831,34.321,34.477], 24 | [10.825,10.725,10.763], 25 | [1.723,1.762,1.736], 26 | [1.841,1.869,1.857] 27 | ], 28 | "result_readable": [ 29 | "8.82 sec, 8.56 sec, 8.52 sec", 30 | "32.83 sec, 34.32 sec, 34.48 sec", 31 | "10.82 sec, 10.72 sec, 10.76 sec", 32 | "1.72 sec, 1.76 sec, 1.74 sec", 33 | "1.84 sec, 1.87 sec, 1.86 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_100m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "zstd", 15 | "total_size": 16577900544, 16 | "total_size_readable": "16.57 GB", 17 | "data_size": 13758480384, 18 | "data_size_readable": "13.76 GB", 19 | "index_size": 2819420160, 20 | "index_size_readable": "2.82 GB", 21 | "result": [ 22 | [105.051,102.811,101.4], 23 | [1440.91,1472.03,1440.5], 24 | [123.522,123.082,123.149], 25 | [17.125,17.251,17.238], 26 | [17.962,17.913,17.852] 27 | ], 28 | "result_readable": [ 29 | "1 min 45.05 sec, 1 min 42.81 sec, 1 min 41.40 sec", 30 | "24 min 0.91 sec, 24 min 32.03 sec, 24 min 0.50 sec", 31 | "2 min 3.52 sec, 2 min 3.08 sec, 2 min 3.15 sec", 32 | "17.12 sec, 17.25 sec, 17.24 sec", 33 | "17.96 sec, 17.91 sec, 17.85 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_10m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 
7860000, 14 | "data_compression": "snappy", 15 | "total_size": 2200559616, 16 | "total_size_readable": "2.20 GB", 17 | "data_size": 1947783168, 18 | "data_size_readable": "1.95 GB", 19 | "index_size": 252776448, 20 | "index_size_readable": "252.78", 21 | "result": [ 22 | [8.824,8.558,8.519], 23 | [32.831,34.321,34.477], 24 | [10.825,10.725,10.763], 25 | [1.723,1.762,1.736], 26 | [1.841,1.869,1.857] 27 | ], 28 | "result_readable": [ 29 | "8.82 sec, 8.56 sec, 8.52 sec", 30 | "32.83 sec, 34.32 sec, 34.48 sec", 31 | "10.82 sec, 10.72 sec, 10.76 sec", 32 | "1.72 sec, 1.76 sec, 1.74 sec", 33 | "1.84 sec, 1.87 sec, 1.86 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_10m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7860000, 14 | "data_compression": "zstd", 15 | "total_size": 1465458688, 16 | "total_size_readable": "1.47 GB", 17 | "data_size": 1212796928, 18 | "data_size_readable": "1.21 GB", 19 | "index_size": 252661760, 20 | "index_size_readable": "252.66 MB", 21 | "result": [ 22 | [8.845,8.541,8.476], 23 | [30.208,31.527,33.274], 24 | [10.843,10.723,10.697], 25 | [1.721,1.714,1.741], 26 | [1.851,1.859,1.843] 27 | ], 28 | "result_readable": [ 29 | "8.85 sec, 8.54 sec, 8.48 sec", 30 | "30.21 sec, 31.53 sec, 33.27 sec", 31 | "10.84 sec, 10.72 sec, 10.70 sec", 32 | "1.72 sec, 1.71 sec, 1.74 sec", 33 | "1.85 sec, 1.86 sec, 1.84 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "snappy", 15 | "total_size": 287158272, 16 | "total_size_readable": "287.16 MB", 17 | "data_size": 253288448, 18 | "data_size_readable": "253.29 MB", 19 | "index_size": 33869824, 20 | "index_size_readable": "33.87 MB", 21 | "result": [ 22 | [1.157,1.098,1.102], 23 | [1.742,1.747,1.768], 24 | [1.344,1.335,1.326], 25 | [0.224,0.22,0.224], 26 | [0.243,0.239,0.242] 27 | ], 28 | "result_readable": [ 29 | "1.16 sec, 1.10 sec, 1.10 sec", 30 | "1.74 sec, 1.75 sec, 1.77 sec", 31 | "1.34 sec, 1.33 sec, 1.33 sec", 32 | "224.00 msec, 220.00 msec, 224.00 msec", 33 | "243.00 msec, 239.00 msec, 242.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results/m6i.8xlarge_bluesky_1m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd, covered index)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": 
"1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 189222912, 16 | "total_size_readable": "189.22 MB", 17 | "data_size": 155348992, 18 | "data_size_readable": "155.35 MB", 19 | "index_size": 33873920, 20 | "index_size_readable": "33.87 MB", 21 | "result": [ 22 | [1.159,1.108,1.1], 23 | [1.765,1.717,1.737], 24 | [1.37,1.34,1.325], 25 | [0.22,0.248,0.228], 26 | [0.241,0.242,0.244] 27 | ], 28 | "result_readable": [ 29 | "1.16 sec, 1.11 sec, 1.10 sec", 30 | "1.76 sec, 1.72 sec, 1.74 sec", 31 | "1.37 sec, 1.34 sec, 1.32 sec", 32 | "220.00 msec, 248.00 msec, 228.00 msec", 33 | "241.00 msec, 242.00 msec, 244.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1000m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "snappy", 15 | "total_size": 245411815424, 16 | "total_size_readable": "245.41 GB", 17 | "data_size": 221332566016, 18 | "data_size_readable": "221.33 GB", 19 | "index_size": 24079249408, 20 | "index_size_readable": "24.08 GB", 21 | "result": [ 22 | [1974.18,1956.12,1957.9], 23 | [45189.1,43107.3,42923.6], 24 | [6354.06,6348.55,6347.45], 25 | [2031.61,2022.65,2020.14], 26 | [2054.2,2041.93,2040.19] 27 | ], 28 | "result_readable": [ 29 | "32 min 54.18 sec, 32 min 36.12 sec, 32 min 37.90 sec", 30 | "12 hr 33 min 9.10 sec, 11 hr 58 min 27.30 sec, 11 hr 55 min 23.60 sec", 31 | "1 hr 45 min 54.06 sec, 1 hr 45 min 48.55 sec, 1 hr 45 min 47.45 sec", 32 | "33 min 51.61 sec, 33 min 42.65 sec, 33 min 40.14 sec", 33 | "34 min 14.20 sec, 34 min 1.93 sec, 34 min 0.19 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1000m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 893632990, 14 | "data_compression": "zstd", 15 | "total_size": 155395039232, 16 | "total_size_readable": "155.40 GB", 17 | "data_size": 130998910976, 18 | "data_size_readable": "131.00 GB", 19 | "index_size": 24396128256, 20 | "index_size_readable": "24.40 GB", 21 | "result": [ 22 | [1689.1,1667.27,1664.9], 23 | [41945.3,40318.9,40753.6], 24 | [5459.58,5456.72,5451.42], 25 | [1692.29,1674.86,1674.57], 26 | [1714.64,1698.16,1699.13] 27 | ], 28 | "result_readable": [ 29 | "28 min 9.10 sec, 27 min 47.27 sec, 27 min 44.90 sec", 30 | "11 hr 39 min 5.30 sec, 11 hr 11 min 58.90 sec, 11 hr 19 min 13.60 sec", 31 | "1 hr 30 min 59.58 sec, 1 hr 30 min 56.72 sec, 1 hr 30 min 51.42 sec", 32 | "28 min 12.29 sec, 27 min 54.86 sec, 27 min 54.57 sec", 33 | "28 min 34.64 sec, 28 min 18.16 sec, 28 min 19.13 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- 
/mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_100m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "snappy", 15 | "total_size": 25923018752, 16 | "total_size_readable": "25.92 GB", 17 | "data_size": 23520026624, 18 | "data_size_readable": "23.52 GB", 19 | "index_size": 2402992128, 20 | "index_size_readable": "2.40 GB", 21 | "result": [ 22 | [99.24,64.444,48.249], 23 | [4129.17,3504.34,2926.89], 24 | [188.384,164.718,154.082], 25 | [36.798,35.852,37.303], 26 | [36.978,38.413,41.326] 27 | ], 28 | "result_readable": [ 29 | "1 min 39.24 sec, 1 min 4.44 sec, 48.25 sec", 30 | "1 hr 8 min 49.17 sec, 58 min 24.34 sec, 48 min 46.89 sec", 31 | "3 min 8.38 sec, 2 min 44.72 sec, 2 min 34.08 sec", 32 | "36.80 sec, 35.85 sec, 37.30 sec", 33 | "36.98 sec, 38.41 sec, 41.33 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_100m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 94408000, 14 | "data_compression": "zstd", 15 | "total_size": 16198197248, 16 | "total_size_readable": "16.20 GB", 17 | "data_size": 13758480384, 18 | "data_size_readable": "13.76 GB", 19 | "index_size": 2439716864, 20 | "index_size_readable": "2.44 GB", 21 | "result": [ 22 | [171.13,56.34,50.349], 23 | [2673.28,2573.21,2576.39], 24 | [208.486,183.036,160.959], 25 | [45.227,41.365,43.751], 26 | [47.175,44.368,41.472] 27 | ], 28 | "result_readable": [ 29 | "2 min 51.13 sec, 56.34 sec, 50.35 sec", 30 | "44 min 33.28 sec, 42 min 53.21 sec, 42 min 56.39 sec", 31 | "3 min 28.49 sec, 3 min 3.04 sec, 2 min 40.96 sec", 32 | "45.23 sec, 41.37 sec, 43.75 sec", 33 | "47.17 sec, 44.37 sec, 41.47 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_10m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7860000, 14 | "data_compression": "snappy", 15 | "total_size": 2145968128, 16 | "total_size_readable": "2.15 GB", 17 | "data_size": 1947783168, 18 | "data_size_readable": "1.95 GB", 19 | "index_size": 198184960, 20 | "index_size_readable": "198.18", 21 | "result": [ 22 | [16.363,2.861,2.807], 23 | [33.32,33.482,33.416], 24 | [11.605,11.562,11.561], 25 | [1.88,1.932,1.899], 26 | [2.029,2.025,2.028] 27 | ], 28 | "result_readable": [ 29 | "16.36 sec, 2.86 sec, 2.81 sec", 30 | "33.32 sec, 
33.48 sec, 33.42 sec", 31 | "11.61 sec, 11.56 sec, 11.56 sec", 32 | "1.88 sec, 1.93 sec, 1.90 sec", 33 | "2.03 sec, 2.02 sec, 2.03 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_10m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7860000, 14 | "data_compression": "zstd", 15 | "total_size": 1411428352, 16 | "total_size_readable": "1.41 GB", 17 | "data_size": 1212796928, 18 | "data_size_readable": "1.21 GB", 19 | "index_size": 198631424, 20 | "index_size_readable": "198.63 MB", 21 | "result": [ 22 | [15.462,3.001,2.833], 23 | [33.838,33.59,33.529], 24 | [11.42,11.431,11.508], 25 | [1.854,1.9,1.908], 26 | [2.043,2.013,2.014] 27 | ], 28 | "result_readable": [ 29 | "15.46 sec, 3.00 sec, 2.83 sec", 30 | "33.84 sec, 33.59 sec, 33.53 sec", 31 | "11.42 sec, 11.43 sec, 11.51 sec", 32 | "1.85 sec, 1.90 sec, 1.91 sec", 33 | "2.04 sec, 2.01 sec, 2.01 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1m_snappy.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (snappy)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "snappy", 15 | "total_size": 279212032, 16 | "total_size_readable": "279.21 MB", 17 | "data_size": 253288448, 18 | "data_size_readable": "253.29 MB", 19 | "index_size": 25923584, 20 | "index_size_readable": "25.92 MB", 21 | "result": [ 22 | [0.375,0.365,0.368], 23 | [1.789,1.759,1.752], 24 | [1.431,1.421,1.436], 25 | [0.237,0.235,0.235], 26 | [0.257,0.255,0.263] 27 | ], 28 | "result_readable": [ 29 | "375.00 msec, 365.00 msec, 368.00 msec", 30 | "1.79 sec, 1.76 sec, 1.75 sec", 31 | "1.43 sec, 1.42 sec, 1.44 sec", 32 | "237.00 msec, 235.00 msec, 235.00 msec", 33 | "257.00 msec, 255.00 msec, 263.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/results_without_covered_index_scans/m6i.8xlarge_bluesky_1m_zstd.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "MongoDB (zstd)", 3 | "version": "8.0.3", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "zstd", 15 | "total_size": 179613696, 16 | "total_size_readable": "179.61 MB", 17 | "data_size": 155348992, 18 | "data_size_readable": "155.35 MB", 19 | "index_size": 24264704, 20 | "index_size_readable": "24.26 MB", 21 | "result": [ 22 | [1.017,0.377,0.365], 23 | [1.802,1.759,1.746], 24 | [1.419,1.411,1.439], 25 | [0.235,0.238,0.235], 26 | [0.254,0.266,0.257] 27 | ], 28 | 
"result_readable": [ 29 | "1.02 sec, 377.00 msec, 365.00 msec", 30 | "1.80 sec, 1.76 sec, 1.75 sec", 31 | "1.42 sec, 1.41 sec, 1.44 sec", 32 | "235.00 msec, 238.00 msec, 235.00 msec", 33 | "254.00 msec, 266.00 msec, 257.00 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /mongodb/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | # Number of tries for each query 13 | TRIES=3 14 | 15 | # File containing MongoDB queries (replace 'queries.js' with your file) 16 | QUERY_FILE="queries.js" 17 | 18 | # Check if the query file exists 19 | if [[ ! -f "$QUERY_FILE" ]]; then 20 | echo "Error: Query file '$QUERY_FILE' does not exist." 21 | exit 1 22 | fi 23 | 24 | # Set the internalQueryMaxAddToSetBytes parameter to 1 GB 25 | echo "Setting internalQueryMaxAddToSetBytes to 1 GB..." 26 | mongosh --quiet --eval " 27 | const result = db.adminCommand({ setParameter: 1, internalQueryMaxAddToSetBytes: 1073741824 }); 28 | if (result.ok !== 1) { 29 | print('Failed to set internalQueryMaxAddToSetBytes: ' + JSON.stringify(result)); 30 | quit(1); 31 | } else { 32 | print('Successfully set internalQueryMaxAddToSetBytes to 1 GB'); 33 | } 34 | " 35 | 36 | # Set the internalQueryPlannerGenerateCoveredWholeIndexScans parameter to true 37 | echo "Setting internalQueryPlannerGenerateCoveredWholeIndexScans to true..." 38 | mongosh --quiet --eval " 39 | const result = db.adminCommand({ setParameter: 1, internalQueryPlannerGenerateCoveredWholeIndexScans: true }); 40 | if (result.ok !== 1) { 41 | print('Failed to set internalQueryPlannerGenerateCoveredWholeIndexScans: ' + JSON.stringify(result)); 42 | quit(1); 43 | } else { 44 | print('Successfully set internalQueryPlannerGenerateCoveredWholeIndexScans to true'); 45 | } 46 | " 47 | 48 | # Read and execute each query 49 | cat "$QUERY_FILE" | while read -r query; do 50 | 51 | # Clear the Linux file system cache 52 | echo "Clearing file system cache..." 53 | sync 54 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 55 | echo "File system cache cleared." 56 | 57 | # Print the query 58 | echo "Running query: $query" 59 | 60 | # Escape the query for safe passing to mongosh 61 | ESCAPED_QUERY=$(echo "$query" | sed 's/\([\"\\]\)/\\\1/g' | sed 's/\$/\\$/g') 62 | 63 | # Execute the query multiple times 64 | for i in $(seq 1 $TRIES); do 65 | mongosh --quiet --eval " 66 | const db = db.getSiblingDB('$DB_NAME'); 67 | const start = new Date(); 68 | const result = eval(\"$ESCAPED_QUERY\"); 69 | // Force query execution -> When using commands like aggregate() or find(), 70 | // the query is not fully executed until the data is actually fetched or processed. 
71 | if (Array.isArray(result)) { 72 | result.length; // Access the length to force evaluation for arrays 73 | } else if (typeof result === 'object' && typeof result.toArray === 'function') { 74 | result.toArray(); // Force execution for cursors 75 | } 76 | const end = new Date(); 77 | print('Execution time: ' + (end.getTime() - start.getTime()) + 'ms'); 78 | " 79 | done 80 | done -------------------------------------------------------------------------------- /mongodb/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DATABASE_NAME="$1" 11 | COLLECTION_NAME="$2" 12 | 13 | # Fetch the totalSize using mongosh 14 | total_size=$(mongosh --quiet --eval " 15 | const db = db.getSiblingDB('$DATABASE_NAME'); 16 | const stats = db.getCollection('$COLLECTION_NAME').stats(); 17 | print(stats.totalSize); 18 | ") 19 | 20 | # Print the result 21 | if [[ -z "$total_size" ]]; then 22 | echo "Error: Unable to fetch totalSize. Ensure the database and collection exist." 23 | exit 1 24 | else 25 | echo $total_size 26 | fi -------------------------------------------------------------------------------- /postgresql/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 [RESULT_FILE]" 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | RESULT_FILE="${2:-}" 12 | 13 | # Construct the query log file name using $DB_NAME 14 | QUERY_LOG_FILE="_query_log_${DB_NAME}.txt" 15 | 16 | # Print the database name 17 | echo "Running queries on database: $DB_NAME" 18 | 19 | # Run queries and log the output 20 | ./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE" 21 | 22 | # Process the query log and prepare the result 23 | RESULT=$(cat "$QUERY_LOG_FILE" | grep -oP 'Time: \d+\.\d+ ms' | sed -r -e 's/Time: ([0-9]+\.[0-9]+) ms/\1/' | \ 24 | awk '{ if (i % 3 == 0) { printf "[" }; printf $1 / 1000; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }') 25 | 26 | # Output the result 27 | if [[ -n "$RESULT_FILE" ]]; then 28 | echo "$RESULT" > "$RESULT_FILE" 29 | echo "Result written to $RESULT_FILE" 30 | else 31 | echo "$RESULT" 32 | fi -------------------------------------------------------------------------------- /postgresql/count.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | # Corrected SQL query 14 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT count(*) from $TABLE_NAME" -------------------------------------------------------------------------------- /postgresql/create_and_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 7 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | DDL_FILE="$3" 13 | DATA_DIRECTORY="$4" 14 | NUM_FILES="$5" 15 | SUCCESS_LOG="$6" 16 | ERROR_LOG="$7" 17 | 18 | # Validate arguments 19 | [[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; } 20 | [[ ! 
-d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; } 21 | [[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; } 22 | 23 | # Create database 24 | sudo -u postgres psql -t -c "CREATE DATABASE $DB_NAME" 25 | 26 | # Execute DDL 27 | sudo -u postgres psql "$DB_NAME" -t < "$DDL_FILE" 28 | 29 | # Load data 30 | ./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG" 31 | 32 | # Vacuum analyze the table 33 | sudo -u postgres psql "$DB_NAME" -t -c "VACUUM ANALYZE $TABLE_NAME" 34 | 35 | echo "Script completed successfully." -------------------------------------------------------------------------------- /postgresql/data_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_table_size('$TABLE_NAME')" -------------------------------------------------------------------------------- /postgresql/ddl_lz4.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | data JSONB COMPRESSION lz4 NOT NULL 3 | ); 4 | 5 | CREATE INDEX idx_bluesky 6 | ON bluesky ( 7 | (data ->> 'kind'), 8 | (data -> 'commit' ->> 'operation'), 9 | (data -> 'commit' ->> 'collection'), 10 | (data ->> 'did'), 11 | (TO_TIMESTAMP((data ->> 'time_us')::BIGINT / 1000000.0)) 12 | ); -------------------------------------------------------------------------------- /postgresql/ddl_pglz.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE bluesky ( 2 | data JSONB COMPRESSION pglz NOT NULL 3 | ); 4 | 5 | CREATE INDEX idx_bluesky 6 | ON bluesky ( 7 | (data ->> 'kind'), 8 | (data -> 'commit' ->> 'operation'), 9 | (data -> 'commit' ->> 'collection'), 10 | (data ->> 'did'), 11 | (TO_TIMESTAMP((data ->> 'time_us')::BIGINT / 1000000.0)) 12 | ); -------------------------------------------------------------------------------- /postgresql/index_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_relation_size(oid) FROM pg_class WHERE relname = 'idx_bluesky'" -------------------------------------------------------------------------------- /postgresql/index_usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | 15 | cat queries.sql | while read -r query; do 16 | 17 | # Print the query number 18 | echo "------------------------------------------------------------------------------------------------------------------------" 19 | echo "Index usage for query Q$QUERY_NUM:" 20 | echo 21 | 22 | sudo -u postgres psql -d "$DB_NAME" -t -c "EXPLAIN $query" 23 | 24 | # Increment the query number 25 | QUERY_NUM=$((QUERY_NUM + 1)) 26 | 27 | done; 
-------------------------------------------------------------------------------- /postgresql/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # https://www.postgresql.org/download/linux/ubuntu/ 4 | 5 | sudo apt-get update 6 | sudo apt-get install -y postgresql-common 7 | sudo apt-get install -y postgresql-16 -------------------------------------------------------------------------------- /postgresql/load_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 6 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DIRECTORY="$1" 11 | DB_NAME="$2" 12 | TABLE_NAME="$3" 13 | MAX_FILES="$4" 14 | SUCCESS_LOG="$5" 15 | ERROR_LOG="$6" 16 | PSQL_CMD="sudo -u postgres psql -d $DB_NAME" 17 | 18 | # Validate that MAX_FILES is a number 19 | if ! [[ "$MAX_FILES" =~ ^[0-9]+$ ]]; then 20 | echo "Error: must be a positive integer." 21 | exit 1 22 | fi 23 | 24 | # Ensure the log files exist 25 | touch "$SUCCESS_LOG" "$ERROR_LOG" 26 | 27 | # Create a temporary directory in /var/tmp and ensure it's accessible 28 | TEMP_DIR=$(mktemp -d /var/tmp/cleaned_files.XXXXXX) 29 | chmod 777 "$TEMP_DIR" # Allow access for all users 30 | trap "rm -rf $TEMP_DIR" EXIT # Ensure cleanup on script exit 31 | 32 | # Counter to track processed files 33 | counter=0 34 | 35 | # Loop through each .json.gz file in the directory 36 | for file in $(ls "$DIRECTORY"/*.json.gz | sort); do 37 | if [[ -f "$file" ]]; then 38 | echo "Processing $file..." 39 | counter=$((counter + 1)) 40 | 41 | # Uncompress the file into the temporary directory 42 | uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")" 43 | gunzip -c "$file" > "$uncompressed_file" 44 | 45 | # Check if uncompression was successful 46 | if [[ $? -ne 0 ]]; then 47 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to uncompress $file." >> "$ERROR_LOG" 48 | continue 49 | fi 50 | 51 | # Preprocess the file to remove null characters 52 | cleaned_file="$TEMP_DIR/$(basename "${uncompressed_file%.json}_cleaned.json")" 53 | sed 's/\\u0000//g' "$uncompressed_file" > "$cleaned_file" 54 | 55 | # Grant read permissions for the postgres user 56 | chmod 644 "$cleaned_file" 57 | 58 | # Import the cleaned JSON file into PostgreSQL 59 | $PSQL_CMD -c "\COPY $TABLE_NAME FROM '$cleaned_file' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');" 60 | import_status=$? 61 | 62 | # Check if the import was successful 63 | if [[ $import_status -eq 0 ]]; then 64 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $cleaned_file into PostgreSQL." >> "$SUCCESS_LOG" 65 | # Delete both the uncompressed and cleaned files after successful processing 66 | rm -f "$uncompressed_file" "$cleaned_file" 67 | else 68 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed to import $cleaned_file. See errors above." >> "$ERROR_LOG" 69 | # Keep the files for debugging purposes 70 | fi 71 | 72 | # Stop processing if the max number of files is reached 73 | if [[ $counter -ge $MAX_FILES ]]; then 74 | echo "Processed maximum number of files: $MAX_FILES" 75 | break 76 | fi 77 | else 78 | echo "No .json.gz files found in the directory." 79 | fi 80 | done 81 | 82 | echo "All files have been processed." 
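Note: load_data.sh loads newline-delimited JSON into the single JSONB column by using COPY's csv format with control characters as quote, delimiter and escape (\x01 and \x02 never occur in the data), so every input line is read as exactly one field and cast to JSONB. It also strips \u0000 escapes first, because jsonb cannot store the NUL code point. Below is a minimal sketch of loading one file by hand with the same trick; the database name and file path are assumed examples.

#!/bin/bash
# Hedged sketch (not part of the repo): load a single newline-delimited JSON file
# into the bluesky table the same way load_data.sh does.
DB_NAME="${1:-bluesky_1m_lz4}"
FILE="${2:-/var/tmp/file_0001.json}"
CLEANED="/var/tmp/$(basename "$FILE" .json)_cleaned.json"

# jsonb rejects \u0000, so remove it first and make the file readable for postgres.
sed 's/\\u0000//g' "$FILE" > "$CLEANED"
chmod 644 "$CLEANED"

# Control-character quote/delimiter/escape: each line becomes one CSV field that
# PostgreSQL casts to the JSONB column of the bluesky table.
sudo -u postgres psql -d "$DB_NAME" \
  -c "\COPY bluesky FROM '$CLEANED' WITH (format csv, quote e'\x01', delimiter e'\x02', escape e'\x01');"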
-------------------------------------------------------------------------------- /postgresql/main.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Default data directory 4 | DEFAULT_DATA_DIRECTORY=~/data/bluesky 5 | 6 | # Allow the user to optionally provide the data directory as an argument 7 | DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}" 8 | 9 | # Define success and error log files 10 | SUCCESS_LOG="${2:-success.log}" 11 | ERROR_LOG="${3:-error.log}" 12 | 13 | # Define prefix for output files 14 | OUTPUT_PREFIX="${4:-_m6i.8xlarge}" 15 | 16 | # Check if the directory exists 17 | if [[ ! -d "$DATA_DIRECTORY" ]]; then 18 | echo "Error: Data directory '$DATA_DIRECTORY' does not exist." 19 | exit 1 20 | fi 21 | 22 | echo "Select the dataset size to benchmark:" 23 | echo "1) 1m (default)" 24 | echo "2) 10m" 25 | echo "3) 100m" 26 | echo "4) 1000m" 27 | echo "5) all" 28 | read -p "Enter the number corresponding to your choice: " choice 29 | 30 | ./install.sh 31 | 32 | benchmark() { 33 | local size=$1 34 | local compression=$2 35 | # Check DATA_DIRECTORY contains the required number of files to run the benchmark 36 | file_count=$(find "$DATA_DIRECTORY" -type f | wc -l) 37 | if (( file_count < size )); then 38 | echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count." 39 | exit 1 40 | fi 41 | ./create_and_load.sh "bluesky_${size}m_${compression}" bluesky "ddl_${compression}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" 42 | ./total_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.total_size" 43 | ./data_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.data_size" 44 | ./index_size.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_size" 45 | ./count.sh "bluesky_${size}m_${compression}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.count" 46 | ./index_usage.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.index_usage" 47 | #./query_results.sh "bluesky_${size}m_${compression}" | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.query_results" 48 | ./benchmark.sh "bluesky_${size}m_${compression}" "${OUTPUT_PREFIX}_bluesky_${size}m_${compression}.results_runtime" 49 | } 50 | 51 | case $choice in 52 | 2) 53 | benchmark 10 lz4 54 | benchmark 10 pglz 55 | ;; 56 | 3) 57 | benchmark 100 lz4 58 | benchmark 100 pglz 59 | ;; 60 | 4) 61 | benchmark 1000 lz4 62 | benchmark 1000 pglz 63 | ;; 64 | 5) 65 | benchmark 1 lz4 66 | benchmark 1 pglz 67 | benchmark 10 lz4 68 | benchmark 10 pglz 69 | benchmark 100 lz4 70 | benchmark 100 pglz 71 | benchmark 1000 lz4 72 | benchmark 1000 pglz 73 | ;; 74 | *) 75 | benchmark 1 lz4 76 | benchmark 1 pglz 77 | ;; 78 | esac -------------------------------------------------------------------------------- /postgresql/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count FROM bluesky GROUP BY event ORDER BY count DESC; 2 | SELECT data -> 'commit' ->> 'collection' AS event, COUNT(*) as count, COUNT(DISTINCT data ->> 'did') AS users FROM bluesky WHERE data ->> 'kind' = 'commit' AND data -> 'commit' ->> 'operation' = 'create' GROUP BY event ORDER BY count DESC; 3 | SELECT data->'commit'->>'collection' AS event, EXTRACT(HOUR FROM 
TO_TIMESTAMP((data->>'time_us')::BIGINT / 1000000)) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; 4 | SELECT data->>'did' AS user_id, MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) AS first_post_ts FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; 5 | SELECT data->>'did' AS user_id, EXTRACT(EPOCH FROM ( MAX( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) - MIN( TIMESTAMP WITH TIME ZONE 'epoch' + INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT ) )) * 1000 AS activity_span FROM bluesky WHERE data->>'kind' = 'commit' AND data->'commit'->>'operation' = 'create' AND data->'commit'->>'collection' = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; -------------------------------------------------------------------------------- /postgresql/queries_formatted.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | -- Q1 - Top event types 3 | ------------------------------------------------------------------------------------------------------------------------ 4 | SELECT 5 | data -> 'commit' ->> 'collection' AS event, 6 | COUNT(*) as count 7 | FROM bluesky 8 | GROUP BY event 9 | ORDER BY count DESC; 10 | 11 | ------------------------------------------------------------------------------------------------------------------------ 12 | -- Q2 - Top event types together with unique users per event type 13 | ------------------------------------------------------------------------------------------------------------------------ 14 | SELECT 15 | data -> 'commit' ->> 'collection' AS event, 16 | COUNT(*) as count, 17 | COUNT(DISTINCT data ->> 'did') AS users 18 | FROM bluesky 19 | WHERE data ->> 'kind' = 'commit' 20 | AND data -> 'commit' ->> 'operation' = 'create' 21 | GROUP BY event 22 | ORDER BY count DESC; 23 | 24 | ------------------------------------------------------------------------------------------------------------------------ 25 | -- Q3 - When do people use BlueSky 26 | ------------------------------------------------------------------------------------------------------------------------ 27 | SELECT 28 | data->'commit'->>'collection' AS event, 29 | EXTRACT(HOUR FROM TO_TIMESTAMP((data->>'time_us')::BIGINT / 1000000)) AS hour_of_day, 30 | COUNT(*) AS count 31 | FROM bluesky 32 | WHERE data->>'kind' = 'commit' 33 | AND data->'commit'->>'operation' = 'create' 34 | AND data->'commit'->>'collection' IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') 35 | GROUP BY event, hour_of_day 36 | ORDER BY hour_of_day, event; 37 | 38 | ------------------------------------------------------------------------------------------------------------------------ 39 | -- Q4 - top 3 post veterans 40 | ------------------------------------------------------------------------------------------------------------------------ 41 | SELECT 42 | data->>'did' AS user_id, 43 | MIN( 44 | TIMESTAMP WITH TIME ZONE 'epoch' + 45 | INTERVAL '1 microsecond' * 
(data->>'time_us')::BIGINT 46 | ) AS first_post_ts 47 | FROM bluesky 48 | WHERE data->>'kind' = 'commit' 49 | AND data->'commit'->>'operation' = 'create' 50 | AND data->'commit'->>'collection' = 'app.bsky.feed.post' 51 | GROUP BY user_id 52 | ORDER BY first_post_ts ASC 53 | LIMIT 3; 54 | 55 | ------------------------------------------------------------------------------------------------------------------------ 56 | -- Q5 - top 3 users with longest activity 57 | ------------------------------------------------------------------------------------------------------------------------ 58 | SELECT 59 | data->>'did' AS user_id, 60 | EXTRACT(EPOCH FROM ( 61 | MAX( 62 | TIMESTAMP WITH TIME ZONE 'epoch' + 63 | INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT 64 | ) - 65 | MIN( 66 | TIMESTAMP WITH TIME ZONE 'epoch' + 67 | INTERVAL '1 microsecond' * (data->>'time_us')::BIGINT 68 | ) 69 | )) * 1000 AS activity_span 70 | FROM bluesky 71 | WHERE data->>'kind' = 'commit' 72 | AND data->'commit'->>'operation' = 'create' 73 | AND data->'commit'->>'collection' = 'app.bsky.feed.post' 74 | GROUP BY user_id 75 | ORDER BY activity_span DESC 76 | LIMIT 3; -------------------------------------------------------------------------------- /postgresql/query_results.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | QUERY_NUM=1 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Print the query 17 | echo "------------------------------------------------------------------------------------------------------------------------" 18 | echo "Result for query Q$QUERY_NUM:" 19 | echo 20 | 21 | sudo -u postgres psql -d "$DB_NAME" -c "$query" 22 | 23 | # Increment the query number 24 | QUERY_NUM=$((QUERY_NUM + 1)) 25 | done; -------------------------------------------------------------------------------- /postgresql/results/_query_results/_m6i.8xlarge_bluesky_1m_lz4.query_results: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------------------------------------------------ 2 | Result for query Q1: 3 | 4 | event | count 5 | ----------------------------+-------- 6 | app.bsky.feed.like | 448944 7 | app.bsky.graph.follow | 360374 8 | app.bsky.feed.post | 90816 9 | app.bsky.feed.repost | 58540 10 | app.bsky.graph.block | 14040 11 | app.bsky.actor.profile | 11762 12 | app.bsky.graph.listitem | 8103 13 | | 5328 14 | app.bsky.graph.listblock | 895 15 | app.bsky.graph.starterpack | 405 16 | app.bsky.graph.list | 356 17 | app.bsky.feed.threadgate | 255 18 | app.bsky.feed.postgate | 104 19 | app.bsky.feed.generator | 74 20 | app.bsky.labeler.service | 4 21 | (15 rows) 22 | 23 | ------------------------------------------------------------------------------------------------------------------------ 24 | Result for query Q2: 25 | 26 | event | count | users 27 | ----------------------------+--------+-------- 28 | app.bsky.feed.like | 444523 | 117617 29 | app.bsky.graph.follow | 337978 | 63957 30 | app.bsky.feed.post | 86812 | 50464 31 | app.bsky.feed.repost | 56993 | 26581 32 | app.bsky.graph.block | 13838 | 5785 33 | app.bsky.graph.listitem | 7568 | 1078 34 | app.bsky.actor.profile | 5337 | 5337 35 | app.bsky.graph.listblock | 860 | 449 36 | app.bsky.graph.list | 259 | 218 37 | 
app.bsky.feed.threadgate | 228 | 196 38 | app.bsky.graph.starterpack | 104 | 101 39 | app.bsky.feed.postgate | 101 | 82 40 | app.bsky.feed.generator | 10 | 9 41 | (13 rows) 42 | 43 | ------------------------------------------------------------------------------------------------------------------------ 44 | Result for query Q3: 45 | 46 | event | hour_of_day | count 47 | ----------------------+-------------+-------- 48 | app.bsky.feed.like | 16 | 444523 49 | app.bsky.feed.post | 16 | 86812 50 | app.bsky.feed.repost | 16 | 56993 51 | (3 rows) 52 | 53 | ------------------------------------------------------------------------------------------------------------------------ 54 | Result for query Q4: 55 | 56 | user_id | first_post_ts 57 | ----------------------------------+------------------------------- 58 | did:plc:yj3sjq3blzpynh27cumnp5ks | 2024-11-21 16:25:49.000167+00 59 | did:plc:l5o3qjrmfztir54cpwlv2eme | 2024-11-21 16:25:49.001905+00 60 | did:plc:s4bwqchfzm6gjqfeb6mexgbu | 2024-11-21 16:25:49.003907+00 61 | (3 rows) 62 | 63 | ------------------------------------------------------------------------------------------------------------------------ 64 | Result for query Q5: 65 | 66 | user_id | activity_span 67 | ----------------------------------+--------------- 68 | did:plc:tsyymlun4eqjuw7hqrhmwagd | 813006.959000 69 | did:plc:3ug235sfy2pz7cawmpsftb65 | 811602.261000 70 | did:plc:doxhhgtxqiv47tmcovpbcqai | 811404.021000 71 | (3 rows) -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1000m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 804000000, 14 | "data_compression": "lz4", 15 | "total_size": 654739636224, 16 | "total_size_readable": "654.74 GB", 17 | "data_size": 506726694912, 18 | "data_size_readable": "506.73 GB", 19 | "index_size": 147981623296, 20 | "index_size_readable": "147.98 GB", 21 | "result": [ 22 | [3863.35,3843.05,3843.04], 23 | [32553.3,32554.5,32553.9], 24 | [4222.61,4212.91,4208.16], 25 | [4890.82,4873.02,4871.11], 26 | [4900.32,4883.62,4873.68] 27 | ], 28 | "result_readable": [ 29 | "1 hr 4 min 23.35 sec, 1 hr 4 min 3.05 sec, 1 hr 4 min 3.04 sec", 30 | "9 hr 2 min 33.30 sec, 9 hr 2 min 34.50 sec, 9 hr 2 min 33.90 sec", 31 | "1 hr 10 min 22.61 sec, 1 hr 10 min 12.91 sec, 1 hr 10 min 8.16 sec", 32 | "1 hr 21 min 30.82 sec, 1 hr 21 min 13.02 sec, 1 hr 21 min 11.11 sec", 33 | "1 hr 21 min 40.32 sec, 1 hr 21 min 23.62 sec, 1 hr 21 min 13.68 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1000m_pglz.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000000, 12 | "dataset_size_readable": "1000m", 13 | "num_loaded_documents": 804000000, 14 | "data_compression": "pglz", 15 | "total_size": 660828643328, 16 | "total_size_readable": "660.83 GB", 17 | "data_size": 512573440000, 
18 | "data_size_readable": "512.57 GB", 19 | "index_size": 148222607360, 20 | "index_size_readable": "148.22 GB", 21 | "result": [ 22 | [3907.81,3891.66,3887.55], 23 | [32598,32600.7,32598], 24 | [4267.02,4246.18,4248.89], 25 | [4902.93,4887.97,4870.85], 26 | [4919.4,4894.7,4914.49] 27 | ], 28 | "result_readable": [ 29 | "1 hr 5 min 7.81 sec, 1 hr 4 min 51.66 sec, 1 hr 4 min 47.55 sec", 30 | "9 hr 3 min 18.00 sec, 9 hr 3 min 20.70 sec, 9 hr 3 min 18.00 sec", 31 | "1 hr 11 min 7.02 sec, 1 hr 10 min 46.18 sec, 1 hr 10 min 48.89 sec", 32 | "1 hr 21 min 42.93 sec, 1 hr 21 min 27.97 sec, 1 hr 21 min 10.85 sec", 33 | "1 hr 21 min 59.40 sec, 1 hr 21 min 34.70 sec, 1 hr 21 min 54.49 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_100m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 91000000, 14 | "data_compression": "lz4", 15 | "total_size": 69069152256, 16 | "total_size_readable": "69.07 GB", 17 | "data_size": 54598787072, 18 | "data_size_readable": "54.60 GB", 19 | "index_size": 14470062080, 20 | "index_size_readable": "14.47 GB", 21 | "result": [ 22 | [416.294,10.3327,10.3401], 23 | [1865.37,1454.93,1457.57], 24 | [439.522,31.3951,31.3661], 25 | [477.483,17.4918,17.2812], 26 | [478.399,19.4407,18.4989] 27 | ], 28 | "result_readable": [ 29 | "6 min 56.29 sec, 10.33 sec, 10.34 sec", 30 | "31 min 5.37 sec, 24 min 14.93 sec, 24 min 17.57 sec", 31 | "7 min 19.52 sec, 31.40 sec, 31.37 sec", 32 | "7 min 57.48 sec, 17.49 sec, 17.28 sec", 33 | "7 min 58.40 sec, 19.44 sec, 18.50 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_100m_pglz.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 100000000, 12 | "dataset_size_readable": "100m", 13 | "num_loaded_documents": 91000000, 14 | "data_compression": "pglz", 15 | "total_size": 69085896704, 16 | "total_size_readable": "69.09 GB", 17 | "data_size": 54617038848, 18 | "data_size_readable": "54.62 GB", 19 | "index_size": 14468587520, 20 | "index_size_readable": "14.47 GB", 21 | "result": [ 22 | [416.449,10.4025,10.3982], 23 | [1865.43,1457.18,1455.51], 24 | [439.424,31.7577,31.7307], 25 | [477.52,17.5615,17.6512], 26 | [479.118,18.9199,19.5657] 27 | ], 28 | "result_readable": [ 29 | "6 min 56.45 sec, 10.40 sec, 10.40 sec", 30 | "31 min 5.43 sec, 24 min 17.18 sec, 24 min 15.51 sec", 31 | "7 min 19.42 sec, 31.76 sec, 31.73 sec", 32 | "7 min 57.52 sec, 17.56 sec, 17.65 sec", 33 | "7 min 59.12 sec, 18.92 sec, 19.57 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_10m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | 
"date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7000000, 14 | "data_compression": "lz4", 15 | "total_size": 5759737856, 16 | "total_size_readable": "5.76 GB", 17 | "data_size": 4653178880, 18 | "data_size_readable": "4.65 GB", 19 | "index_size": 1106558976, 20 | "index_size_readable": "1.11 GB", 21 | "result": [ 22 | [35.3379,0.945249,0.938868], 23 | [41.45,9.23267,9.23034], 24 | [36.1707,2.49331,2.4911], 25 | [174.534,1.97768,1.90803], 26 | [175.691,2.11244,2.10785] 27 | ], 28 | "result_readable": [ 29 | "35.34 sec, 945.25 msec, 938.87 msec", 30 | "41.45 sec, 9.23 sec, 9.23 sec", 31 | "36.17 sec, 2.49 sec, 2.49 sec", 32 | "2 min 54.53 sec, 1.98 sec, 1.91 sec", 33 | "2 min 55.69 sec, 2.11 sec, 2.11 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_10m_pglz.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 10000000, 12 | "dataset_size_readable": "10m", 13 | "num_loaded_documents": 7000000, 14 | "data_compression": "pglz", 15 | "total_size": 5778792448, 16 | "total_size_readable": "5.78 GB", 17 | "data_size": 4652179456, 18 | "data_size_readable": "4.65 GB", 19 | "index_size": 1126612992, 20 | "index_size_readable": "1.13 GB", 21 | "result": [ 22 | [35.1722,0.948157,0.947581], 23 | [48.0881,9.34658,9.3514], 24 | [35.668,2.53469,2.53258], 25 | [174.552,2.07684,1.95298], 26 | [175.98,2.18434,2.21651] 27 | ], 28 | "result_readable": [ 29 | "35.17 sec, 948.16 msec, 947.58 msec", 30 | "48.09 sec, 9.35 sec, 9.35 sec", 31 | "35.67 sec, 2.53 sec, 2.53 sec", 32 | "2 min 54.55 sec, 2.08 sec, 1.95 sec", 33 | "2 min 55.98 sec, 2.18 sec, 2.22 sec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1m_lz4.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (lz4)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "lz4", 15 | "total_size": 731111424, 16 | "total_size_readable": "731.11 MB", 17 | "data_size": 586768384, 18 | "data_size_readable": "586.77 MB", 19 | "index_size": 144343040, 20 | "index_size_readable": "144.34 MB", 21 | "result": [ 22 | [3.63966,0.135007,0.136015], 23 | [36.808,2.1238,2.13312], 24 | [4.05291,0.344643,0.344627], 25 | [15.3146,0.504157,0.224458], 26 | [15.7782,0.238114,0.247374] 27 | ], 28 | "result_readable": [ 29 | "3.64 sec, 135.01 msec, 136.01 msec", 30 | "36.81 sec, 2.12 sec, 2.13 sec", 31 | "4.05 sec, 344.64 msec, 344.63 msec", 32 | "15.31 sec, 504.16 msec, 224.46 msec", 33 | "15.78 sec, 238.11 msec, 247.37 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/results/m6i.8xlarge_bluesky_1m_pglz.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "system": "PostgreSQL (pglz)", 3 | "version": "16.6", 4 | "os": "Ubuntu 24.04", 5 | "date": "2025-01-13", 6 | "machine": "m6i.8xlarge, 10000gib gp3", 7 | "cluster_size": 1, 8 | "comment": "", 9 | "tags": [ 10 | ], 11 | "dataset_size": 1000000, 12 | "dataset_size_readable": "1m", 13 | "num_loaded_documents": 1000000, 14 | "data_compression": "pglz", 15 | "total_size": 731168768, 16 | "total_size_readable": "731.17 MB", 17 | "data_size": 586825728, 18 | "data_size_readable": "586.83 MB", 19 | "index_size": 144343040, 20 | "index_size_readable": "144.34 MB", 21 | "result": [ 22 | [4.05346,0.135037,0.13426], 23 | [30.2945,2.1009,2.07414], 24 | [4.05195,0.348947,0.347623], 25 | [15.7542,0.50897,0.224949], 26 | [16.1238,0.251958,0.251934] 27 | ], 28 | "result_readable": [ 29 | "4.05 sec, 135.04 msec, 134.26 msec", 30 | "30.29 sec, 2.10 sec, 2.07 sec", 31 | "4.05 sec, 348.95 msec, 347.62 msec", 32 | "15.75 sec, 508.97 msec, 224.95 msec", 33 | "16.12 sec, 251.96 msec, 251.93 msec" 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /postgresql/run_queries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 1 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | 12 | TRIES=3 13 | 14 | cat queries.sql | while read -r query; do 15 | 16 | # Clear the Linux file system cache 17 | echo "Clearing file system cache..." 18 | sync 19 | echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 20 | echo "File system cache cleared." 21 | 22 | # Print the query 23 | echo "Running query: $query" 24 | 25 | # Execute the query multiple times 26 | for i in $(seq 1 $TRIES); do 27 | sudo -u postgres psql -d "$DB_NAME" -t -c '\timing' -c "$query" | grep 'Time' 28 | done; 29 | done; -------------------------------------------------------------------------------- /postgresql/total_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the required arguments are provided 4 | if [[ $# -lt 2 ]]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Arguments 10 | DB_NAME="$1" 11 | TABLE_NAME="$2" 12 | 13 | sudo -u postgres psql -d "$DB_NAME" -t -c "SELECT pg_total_relation_size('$TABLE_NAME')" --------------------------------------------------------------------------------
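Note: the three PostgreSQL size scripts report complementary numbers: data_size.sh uses pg_table_size (heap plus TOAST, excluding indexes), index_size.sh uses pg_relation_size of idx_bluesky, and total_size.sh uses pg_total_relation_size, which equals pg_table_size plus pg_indexes_size. A minimal sketch for cross-checking all three in one call; the database name is only an example of the repo's naming scheme.

#!/bin/bash
# Hedged sketch (not part of the repo): report total, data and index sizes in one
# query so the outputs of total_size.sh, data_size.sh and index_size.sh can be
# compared. With a single index, pg_indexes_size should be close to the value
# reported by index_size.sh.
DB_NAME="${1:-bluesky_1m_lz4}"

sudo -u postgres psql -d "$DB_NAME" -t -c "
SELECT pg_total_relation_size('bluesky') AS total_size,
       pg_table_size('bluesky')          AS data_size,
       pg_indexes_size('bluesky')        AS index_size;
"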