├── .github └── workflows │ ├── RunBenchmark.yml │ ├── regression.yml │ └── static.yml ├── .gitignore ├── LICENSE ├── R-arrow ├── VERSION ├── groupby-R-arrow.R ├── join-R-arrow.R ├── setup-R-arrow.sh ├── upg-R-arrow.sh └── ver-R-arrow.sh ├── README.md ├── _benchplot ├── benchplot-dict.R └── benchplot.R ├── _control ├── data.csv ├── data_large.csv ├── data_medium.csv ├── data_small.csv ├── nodenames.csv ├── questions.csv ├── skipped_benchmarks.csv ├── solutions.csv └── timeout.csv ├── _data ├── groupby-datagen.R ├── groupby2014-datagen.R └── join-datagen.R ├── _docs └── maintenance.md ├── _helpers ├── helpers.R ├── helpers.jl ├── helpers.py ├── helpers.sh └── helpersds.jl ├── _launcher ├── launch.R ├── launcher.R ├── setup.sh └── solution.R ├── _report ├── blah.R ├── ga.html ├── history.Rmd ├── index.Rmd ├── publish.sh ├── report.R └── tech.Rmd ├── _run ├── download_small_medium.sh ├── partitioned_run.sh ├── run_large.sh ├── run_medium.sh └── run_small.sh ├── _setup_utils ├── .DS_Store ├── install_all_solutions.py ├── mount.sh ├── mount_and_install_solutions.sh ├── prep_solutions.py ├── repro.sh ├── setup_small.sh └── sleep_and_run.sh ├── _utils ├── answers-validation.R ├── compare-data.table.R ├── download_data.sh ├── generate-data-small.sh ├── groupby_k_factor.csv ├── maintainer.R ├── maintainer.sh ├── parse_time_logs.R ├── partitioned_run.sh ├── sql_to_check_timings │ └── timing_checks.sql ├── time.R └── validate_no_errors.sh ├── arrow └── VERSION ├── clickhouse ├── VERSION ├── ch.sh ├── clickhouse-misc.sh ├── clickhouse-mount-config.xml ├── clickhouse-parse-log.R ├── exec.sh ├── groupby-clickhouse.sh ├── join-clickhouse.sh ├── setup-clickhouse.sh ├── upg-clickhouse.sh └── ver-clickhouse.sh ├── collapse ├── VERSION ├── groupby-collapse.R ├── groupby2014-collapse.R ├── join-collapse.R ├── setup-collapse.sh ├── upg-collapse.sh └── ver-collapse.sh ├── dask ├── VERSION ├── common.py ├── groupby_dask.py ├── join_dask.py ├── setup-dask.sh ├── upg-dask.sh └── ver-dask.sh ├── datafusion ├── VERSION ├── groupby-datafusion.py ├── join-datafusion.py ├── setup-datafusion.sh ├── upg-datafusion.sh └── ver-datafusion.sh ├── datatable ├── VERSION ├── groupby-datatable.R ├── groupby2014-datatable.R ├── join-datatable.R ├── read-datatable.R ├── setup-datatable.sh ├── sort-datatable.R ├── upg-datatable.sh └── ver-datatable.sh ├── dplyr ├── VERSION ├── groupby-dplyr.R ├── groupby2014-dplyr.R ├── join-dplyr.R ├── read-dplyr.R ├── setup-dplyr.sh ├── sort-dplyr.R ├── upg-dplyr.sh └── ver-dplyr.sh ├── duckdb-latest ├── VERSION ├── groupby-duckdb-latest.R ├── join-duckdb-latest.R ├── setup-duckdb-latest.sh ├── upg-duckdb-latest.sh └── ver-duckdb-latest.sh ├── duckdb ├── VERSION ├── groupby-duckdb.R ├── join-duckdb.R ├── setup-duckdb.sh ├── upg-duckdb.sh └── ver-duckdb.sh ├── h2o ├── exec.sh ├── groupby-h2o.R ├── h2o.sh ├── join-h2o.R ├── setup-h2o.sh ├── upg-h2o.sh └── ver-h2o.sh ├── juliadf ├── VERSION ├── exec.sh ├── groupby-juliadf.jl ├── join-juliadf.jl ├── setup-juliadf.sh ├── upg-juliadf.sh └── ver-juliadf.sh ├── juliads ├── VERSION ├── exec.sh ├── groupby-juliads.jl ├── join-juliads.jl ├── setup-juliads.sh ├── upg-juliads.sh └── ver-juliads.sh ├── logs.csv ├── modin ├── groupby-modin.py ├── join-modin.py ├── setup-modin.sh ├── sort-modin.py ├── upg-modin.sh └── ver-modin.sh ├── pandas ├── VERSION ├── groupby-pandas.py ├── groupby2014-pandas.py ├── join-pandas.py ├── read-pandas.py ├── setup-pandas.sh ├── sort-pandas.py ├── upg-pandas.sh └── ver-pandas.sh ├── path.env ├── polars ├── VERSION ├── 
groupby-polars.py ├── join-polars.py ├── monitor_ram.py ├── setup-polars.sh ├── upg-polars.sh └── ver-polars.sh ├── pydatatable ├── VERSION ├── convert-pydatatable-data.py ├── groupby-pydatatable.py ├── join-pydatatable.py ├── read-pydatatable.py ├── setup-pydatatable.sh ├── sort-pydatatable.py ├── upg-pydatatable.sh └── ver-pydatatable.sh ├── run.conf ├── run.sh ├── spark ├── VERSION ├── groupby-spark.py ├── join-spark.py ├── setup-spark.sh ├── upg-spark.sh └── ver-spark.sh └── time.csv /.github/workflows/RunBenchmark.yml: -------------------------------------------------------------------------------- 1 | name: Run benchmark 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | solutions: 6 | type: string 7 | instance_id: 8 | type: string 9 | include_clickhouse: 10 | type: boolean 11 | sizes: 12 | type: string 13 | 14 | concurrency: 15 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }} 16 | cancel-in-progress: true 17 | 18 | env: 19 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | gh_issue_repo: duckdblabs/db-benchmark 21 | instance_id: ${{ inputs.instance_id }} 22 | solutions: ${{ inputs.solutions }} 23 | include_clickhouse: ${{ inputs.include_clickhouse }} 24 | 25 | 26 | jobs: 27 | start-aws-machine: 28 | name: Start aws-small-machine 29 | runs-on: ubuntu-latest 30 | environment: aws-secrets 31 | steps: 32 | - name: Start EC2 runner 33 | shell: bash 34 | env: 35 | AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}} 36 | AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} 37 | AWS_DEFAULT_REGION: us-east-1 38 | run: aws ec2 start-instances --instance-id ${{ env.instance_id }} 39 | 40 | - name: Create issue if failure 41 | shell: bash 42 | if: ${{ failure() && contains(github.ref_name, 'main') }} 43 | run: | 44 | gh issue create --repo ${{ env.gh_issue_repo }} --title "Could not start DB-benchmark machine" --body "AWS box with instance-id ${{ env.instance_id }} could not be started" 45 | 46 | run-benchmark: 47 | name: Regression Tests all solutions 48 | env: 49 | CC: gcc-10 50 | CXX: g++-10 51 | GEN: ninja 52 | runs-on: self-hosted 53 | environment: aws-secrets 54 | permissions: # Job-level permissions configuration starts here 55 | contents: write # 'write' access to repository contents 56 | pull-requests: write # 'write' access to pull requests 57 | steps: 58 | - uses: actions/checkout@v4 59 | 60 | - name: run mount 61 | shell: bash 62 | run: | 63 | ./_setup_utils/mount.sh 64 | 65 | - name: Install or Upgrade all solutions 66 | shell: bash 67 | working-directory: /var/lib/mount/db-benchmark-metal 68 | run: | 69 | python3 _setup_utils/install_all_solutions.py ${{ env.solutions }} 70 | if [ "${{ env.include_clickhouse }}" = "true" ]; then 71 | # installing/updating clickhouse needs sudo privileges 72 | sudo python3 _setup_utils/install_all_solutions.py clickhouse 73 | fi 74 | 75 | - name: Modify run.conf to only have new versions 76 | shell: bash 77 | working-directory: /var/lib/mount/db-benchmark-metal 78 | run: | 79 | git diff --name-only **/VERSION > updated_solutions.txt 80 | cat updated_solutions.txt 81 | export new_solutions="${{ env.solutions }}" 82 | echo "testing solutions: " $new_solutions 83 | sed -i "s/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"${new_solutions}\"/g" run.conf 84 | 85 | - name: Run the benchmark 86 | shell: bash 87 | working-directory: /var/lib/mount/db-benchmark-metal 88 | env: 89 | DO_REPORT: 1 90 | DO_PUBLISH: 0 91 | run: | 92 | ncores=`python3 -c 'import
multiprocessing as mp; print(mp.cpu_count())'` 93 | if [ $ncores -eq 16 ]; then export MACHINE_TYPE="c6id.4xlarge"; fi 94 | if [ $ncores -eq 32 ]; then export MACHINE_TYPE="c6id.8xlarge"; fi 95 | if [ $ncores -eq 128 ]; then export MACHINE_TYPE="c6id.metal"; fi 96 | if [[ "${{ inputs.sizes }}" == *"small"* ]]; then 97 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_small.sh 98 | fi 99 | if [[ "${{ inputs.sizes }}" == *"medium"* ]]; then 100 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_medium.sh 101 | fi 102 | if [[ "${{ inputs.sizes }}" == *"large"* ]]; then 103 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_large.sh 104 | fi 105 | 106 | - name: name new branch 107 | shell: bash 108 | run: | 109 | echo "new_branch_name=results-`date +%Y-%m-%d-%Hh%Mm`" >> $GITHUB_ENV 110 | echo ${{ env.new_branch_name }} 111 | 112 | - name: Commit updates 113 | shell: bash 114 | working-directory: /var/lib/mount/db-benchmark-metal 115 | run: | 116 | git config --global user.email "" 117 | git config --global user.name "Run Benchmark action" 118 | git remote add upstream git@github.com:duckdblabs/db-benchmark 119 | git fetch upstream 120 | git switch -c ${{ env.new_branch_name }} 121 | git add time.csv logs.csv **/VERSION 122 | git add run.conf 123 | git commit -m "new results" 124 | git push upstream ${{ env.new_branch_name }} 125 | 126 | - name: Create Archive 127 | if: always() 128 | shell: bash 129 | working-directory: /var/lib/mount/db-benchmark-metal 130 | run: | 131 | mkdir -p out 132 | echo "guarantee not empty dir" > out/guarantee.txt 133 | zip -r out-dir.zip out/ public/ 134 | 135 | - uses: actions/upload-artifact@v4 136 | if: always() 137 | with: 138 | name: out-dir.zip 139 | path: /var/lib/mount/db-benchmark-metal/out-dir.zip 140 | if-no-files-found: error 141 | 142 | shutdown: 143 | name: shut down 144 | environment: aws-secrets 145 | if: always() 146 | runs-on: ubuntu-latest 147 | needs: 148 | - start-aws-machine 149 | - run-benchmark 150 | 151 | steps: 152 | - name: shutdown 153 | shell: bash 154 | env: 155 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 156 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 157 | AWS_DEFAULT_REGION: us-east-1 158 | run: aws ec2 stop-instances --instance-id ${{ env.instance_id }} 159 | 160 | -------------------------------------------------------------------------------- /.github/workflows/regression.yml: -------------------------------------------------------------------------------- 1 | name: Regression 2 | on: 3 | workflow_dispatch: 4 | repository_dispatch: 5 | pull_request: 6 | paths-ignore: 7 | - '**.md' 8 | - '.github/workflows/**' 9 | - '!.github/workflows/regression.yml' 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | regression-test-benchmark-runner-solo-solutions: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, datafusion, dask, clickhouse] 21 | name: Solo solutions 22 | runs-on: ubuntu-latest 23 | env: 24 | CC: gcc-10 25 | CXX: g++-10 26 | GEN: ninja 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | with: 31 | fetch-depth: 0 32 | 33 | - uses: actions/setup-python@v4 34 | with: 35 | python-version: '3.10' 36 | 37 | - name: Install libraries 38 | shell: bash 39 | run: ./_setup_utils/setup_small.sh 40 | 41 | - name: Generate 500mb datasets 42 | shell: bash 43 | run:
./_utils/generate-data-small.sh 44 | 45 | - name: Remove old logs 46 | shell: bash 47 | run: rm time.csv logs.csv 48 | 49 | - name: Install all solutions 50 | shell: bash 51 | run: source path.env && python3 _setup_utils/install_all_solutions.py ${{ matrix.solution }} 52 | 53 | - name: Turn swap off 54 | shell: bash 55 | run: sudo swapoff -a 56 | 57 | # needed because clickhouse for some reason produces an error the first 58 | # time a benchmark is run. The next benchmark run will work and overwrite the 59 | # old benchmark files. 60 | - name: Run mini GroupBy benchmark if clickhouse 61 | shell: bash 62 | if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }} 63 | run: | 64 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=clickhouse 65 | source path.env 66 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 67 | sleep 60 68 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 69 | sleep 60 70 | 71 | - name: Run mini GroupBy benchmark 72 | shell: bash 73 | run: | 74 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }} 75 | source path.env 76 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 77 | sleep 60 78 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 79 | sleep 60 80 | 81 | - name: Run mini Join benchmark 82 | shell: bash 83 | run: | 84 | python3 _setup_utils/prep_solutions.py --task=join --solution=${{ matrix.solution }} 85 | source path.env 86 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 87 | sleep 60 88 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 89 | sleep 60 90 | 91 | - name: Validate benchmark results and report generation 92 | shell: bash 93 | run: ./_utils/validate_no_errors.sh 94 | 95 | - name: Create Archive 96 | if: always() 97 | shell: bash 98 | run: | 99 | cp *.csv out/ 100 | zip -r ${{ matrix.solution }}-out.zip out/ 101 | 102 | # include this step to see what the latest versions are of every solution 103 | - name: Print latest versions 104 | if: always() 105 | shell: bash 106 | run: tail -n +1 */VERSION 107 | 108 | - uses: actions/upload-artifact@v4 109 | if: always() 110 | with: 111 | name: ${{ matrix.solution }}-out.zip 112 | path: ${{ matrix.solution }}-out.zip 113 | if-no-files-found: error 114 | 115 | regression-test-benchmark-runner-all-solutions: 116 | needs: regression-test-benchmark-runner-solo-solutions 117 | name: Regression Tests all solutions 118 | runs-on: ubuntu-20.04 119 | env: 120 | CC: gcc-10 121 | CXX: g++-10 122 | GEN: ninja 123 | 124 | steps: 125 | - uses: actions/checkout@v3 126 | with: 127 | fetch-depth: 0 128 | 129 | - uses: actions/setup-python@v4 130 | with: 131 | python-version: '3.10' 132 | 133 | - name: Install libraries 134 | shell: bash 135 | run: ./_setup_utils/setup_small.sh 136 | 137 | - name: Generate 500mb datasets 138 | shell: bash 139 | run: ./_utils/generate-data-small.sh 140 | 141 | - name: Remove old logs 142 | shell: bash 143 | run: rm time.csv logs.csv 144 | 145 | - name: Install all solutions 146 | shell: bash 147 | run: source path.env && python3 _setup_utils/install_all_solutions.py all 148 | 149 | - name: Turn swap off 150 | shell: bash 151 | run: sudo swapoff -a 152 | 153 | - name: Run mini GroupBy benchmark 154 | shell: bash 155 | run: | 156 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=all 157 | source path.env 158 | 
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 159 | sleep 60 160 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 161 | 162 | - name: Run mini Join benchmark 163 | shell: bash 164 | run: | 165 | python3 _setup_utils/prep_solutions.py --task=join --solution=all 166 | source path.env 167 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 168 | sleep 60 169 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 170 | 171 | - name: Validate benchmark results and report generation 172 | shell: bash 173 | run: ./_utils/validate_no_errors.sh 174 | 175 | - name: Create Archive 176 | if: always() 177 | shell: bash 178 | run: | 179 | cp *.csv out/ 180 | zip -r all-out.zip out/ 181 | 182 | # include this step to see what the latest versions are of every solution 183 | - name: Print latest versions 184 | if: always() 185 | shell: bash 186 | run: tail -n +1 */VERSION 187 | 188 | - uses: actions/upload-artifact@v4 189 | if: always() 190 | with: 191 | name: all-out.zip 192 | path: all-out.zip 193 | if-no-files-found: error 194 | 195 | -------------------------------------------------------------------------------- /.github/workflows/static.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["gh-pages"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow one concurrent deployment 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: true 22 | 23 | jobs: 24 | # Single deploy job since we're just deploying 25 | deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v3 33 | - name: Setup Pages 34 | uses: actions/configure-pages@v2 35 | - name: Upload artifact 36 | uses: actions/upload-pages-artifact@v1 37 | with: 38 | # Upload entire repository 39 | path: '.' 
40 | - name: Deploy to GitHub Pages 41 | id: deployment 42 | uses: actions/deploy-pages@v1 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | figure/* 2 | metastore_db/* 3 | *.log 4 | *.html 5 | *.csv 6 | !time.csv 7 | !logs.csv 8 | !_control/data_small.csv 9 | !_control/data_large.csv 10 | *.md5 11 | .Rproj.user 12 | .Rhistory 13 | db-benchmark.Rproj 14 | */REVISION 15 | token 16 | .token 17 | public/ 18 | out/ 19 | data/ 20 | clickhouse/log/ 21 | clickhouse/*-clickhouse.sql 22 | clickhouse/unused/ 23 | */log/ 24 | tmp/ 25 | dask-worker-space/ 26 | GA/ 27 | utils/ 28 | */py-*/ 29 | */r-*/ 30 | duckdb-latest/duckdb 31 | report-done 32 | db-benchmark.gh-pages/ 33 | run.out 34 | clickhouse/etc_sudoers.bak 35 | workdir/ 36 | timeout-exit-codes.out 37 | */target 38 | *.lock 39 | -------------------------------------------------------------------------------- /R-arrow/VERSION: -------------------------------------------------------------------------------- 1 | 20.0.0.2 2 | -------------------------------------------------------------------------------- /R-arrow/setup-R-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable arrow 5 | mkdir -p ./R-arrow/r-arrow 6 | Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")' 7 | 8 | ./R-arrow/ver-R-arrow.sh 9 | -------------------------------------------------------------------------------- /R-arrow/upg-R-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade all packages in arrow library only if new arrow is out 5 | echo 'upgrading arrow...' 
6 | Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /R-arrow/ver-R-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /_control/data.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1 3 | groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1 4 | groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1 5 | groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1 6 | groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1 7 | groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1 8 | groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1 9 | groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1 10 | groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1 11 | groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1 12 | groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1 13 | groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1 14 | groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1 15 | groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1 16 | groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1 17 | join,J1_1e7_NA_0_0,1e7,NA,0,0,1 18 | join,J1_1e7_NA_5_0,1e7,NA,5,0,1 19 | join,J1_1e7_NA_0_1,1e7,NA,0,1,1 20 | join,J1_1e8_NA_0_0,1e8,NA,0,0,1 21 | join,J1_1e8_NA_5_0,1e8,NA,5,0,1 22 | join,J1_1e8_NA_0_1,1e8,NA,0,1,1 23 | join,J1_1e9_NA_0_0,1e9,NA,0,0,1 -------------------------------------------------------------------------------- /_control/data_large.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1 3 | groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1 4 | groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1 5 | groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1 6 | groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1 7 | join,J1_1e9_NA_0_0,1e9,NA,0,0,1 -------------------------------------------------------------------------------- /_control/data_medium.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1 3 | groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1 4 | groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1 5 | groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1 6 | groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1 7 | join,J1_1e8_NA_0_0,1e8,NA,0,0,1 8 | join,J1_1e8_NA_5_0,1e8,NA,5,0,1 9 | join,J1_1e8_NA_0_1,1e8,NA,0,1,1 -------------------------------------------------------------------------------- /_control/data_small.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1 3 | groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1 4 | groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1 5 | groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1 6 | groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1 7 | join,J1_1e7_NA_0_0,1e7,NA,0,0,1 8 | join,J1_1e7_NA_5_0,1e7,NA,5,0,1 9 | join,J1_1e7_NA_0_1,1e7,NA,0,1,1 -------------------------------------------------------------------------------- /_control/nodenames.csv: 
-------------------------------------------------------------------------------- 1 | nodename,cpu_model,cpu_cores,memory_model,memory_gb,gpu_model,gpu_num,gpu_gb 2 | mr-0xc11,Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz,20,DIMM DDR4 Synchronous 2133 MHz,125.80,,, 3 | mr-dl11,Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz,40,DIMM Synchronous 2133 MHz,125.78,GeForce GTX 1080 Ti,2,21.83 4 | m4.10xlarge,Intel(R) Xeon(R) CPU E5-2676 v3 @ 2.40GHz,40,unkown,157,None,None,None 5 | c6id.metal,Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz,128,NVMe SSD,250,None,None,None -------------------------------------------------------------------------------- /_control/questions.csv: -------------------------------------------------------------------------------- 1 | task,question,question_group 2 | groupby,sum v1 by id1,basic 3 | groupby,sum v1 by id1:id2,basic 4 | groupby,sum v1 mean v3 by id3,basic 5 | groupby,mean v1:v3 by id4,basic 6 | groupby,sum v1:v3 by id6,basic 7 | groupby,median v3 sd v3 by id4 id5,advanced 8 | groupby,max v1 - min v2 by id3,advanced 9 | groupby,largest two v3 by id6,advanced 10 | groupby,regression v1 v2 by id2 id4,advanced 11 | groupby,sum v3 count by id1:id6,advanced 12 | join,small inner on int,basic 13 | join,medium inner on int,basic 14 | join,medium outer on int,basic 15 | join,medium inner on factor,basic 16 | join,big inner on int,basic 17 | groupby2014,sum v1 by id1,basic 18 | groupby2014,sum v1 by id1:id2,basic 19 | groupby2014,sum v1 mean v3 by id3,basic 20 | groupby2014,mean v1:v3 by id4,basic 21 | groupby2014,sum v1:v3 by id6,basic 22 | -------------------------------------------------------------------------------- /_control/skipped_benchmarks.csv: -------------------------------------------------------------------------------- 1 | solution,task,data,machine_type 2 | juliads,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 3 | juliads,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 4 | juliads,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 5 | juliads,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 6 | juliads,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 7 | juliadf,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 8 | juliadf,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 9 | juliadf,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 10 | juliadf,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 11 | juliadf,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 12 | R-arrow,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 13 | R-arrow,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 14 | R-arrow,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 15 | R-arrow,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 16 | R-arrow,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 17 | dplyr,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 18 | dplyr,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 19 | dplyr,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 20 | dplyr,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 21 | dplyr,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 22 | pandas,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 23 | pandas,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 24 | pandas,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 25 | pandas,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 26 | pandas,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 27 | pydatatable,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 28 | pydatatable,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 29 | pydatatable,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 30 | pydatatable,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 31 | pydatatable,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 32 | spark,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 33 | spark,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 34 | spark,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 35 | spark,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 36 | spark,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 37 | 
datafusion,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 38 | datafusion,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 39 | datafusion,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 40 | datafusion,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 41 | datafusion,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 42 | datafusion,join,J1_1e8_NA_0_0,c6id.4xlarge 43 | datafusion,join,J1_1e8_NA_5_0,c6id.4xlarge 44 | datafusion,join,J1_1e8_NA_0_1,c6id.4xlarge 45 | datafusion,join,J1_1e9_NA_0_0,c6id.4xlarge 46 | R-arrow,join,J1_1e9_NA_0_0,c6id.4xlarge 47 | dask,join,J1_1e9_NA_0_0,c6id.4xlarge 48 | datatable,join,J1_1e9_NA_0_0,c6id.4xlarge 49 | juliadf,join,J1_1e9_NA_0_0,c6id.4xlarge 50 | juliads,join,J1_1e9_NA_0_0,c6id.4xlarge 51 | pandas,join,J1_1e9_NA_0_0,c6id.4xlarge 52 | collapse,join,J1_1e9_NA_0_0,c6id.4xlarge 53 | polars,join,J1_1e9_NA_0_0,c6id.4xlarge 54 | pydatatable,join,J1_1e9_NA_0_0,c6id.4xlarge 55 | spark,join,J1_1e9_NA_0_0,c6id.4xlarge 56 | clickhouse,join,J1_1e9_NA_0_0,c6id.4xlarge 57 | 58 | -------------------------------------------------------------------------------- /_control/solutions.csv: -------------------------------------------------------------------------------- 1 | solution,task 2 | collapse,groupby 3 | collapse,groupby2014 4 | collapse,join 5 | data.table,groupby 6 | data.table,join 7 | data.table,groupby2014 8 | dplyr,groupby 9 | dplyr,join 10 | dplyr,groupby2014 11 | pandas,groupby 12 | pandas,join 13 | pandas,groupby2014 14 | pydatatable,groupby 15 | pydatatable,join 16 | spark,groupby 17 | spark,join 18 | dask,groupby 19 | dask,join 20 | juliadf,groupby 21 | juliadf,join 22 | juliads,groupby 23 | juliads,join 24 | clickhouse,groupby 25 | clickhouse,join 26 | polars,groupby 27 | polars,join 28 | R-arrow,groupby 29 | R-arrow,join 30 | duckdb,groupby 31 | duckdb,join 32 | duckdb-latest,groupby 33 | duckdb-latest,join 34 | datafusion,groupby 35 | datafusion,join 36 | -------------------------------------------------------------------------------- /_control/timeout.csv: -------------------------------------------------------------------------------- 1 | task,in_rows,minutes 2 | groupby,1e7,10 3 | groupby,1e8,30 4 | groupby,1e9,60 5 | join,1e7,10 6 | join,1e8,30 7 | join,1e9,60 8 | groupby2014,1e7,60 9 | groupby2014,1e8,120 10 | groupby2014,1e9,180 11 | -------------------------------------------------------------------------------- /_data/groupby-datagen.R: -------------------------------------------------------------------------------- 1 | # Rscript groupby-datagen.R 1e7 1e2 0 0 ## 1e7 rows, 1e2 K, 0% NAs, random order 2 | # Rscript groupby-datagen.R 1e8 1e1 5 1 ## 1e8 rows, 10 K, 5% NAs, sorted order 3 | args = commandArgs(TRUE) 4 | 5 | pretty_sci = function(x) { 6 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 7 | if(length(tmp)==1L) { 8 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 9 | } else if(length(tmp)==2L){ 10 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 11 | } 12 | } 13 | 14 | library(data.table) 15 | N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) 16 | stopifnot(nas<=100L, nas>=0L, sort%in%c(0L,1L)) 17 | set.seed(108) 18 | cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort)) 19 | DT = list() 20 | DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char) 21 | DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # small groups (char) 22 | DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # large groups (char) 23 | DT[["id4"]] = sample(K, N, 
TRUE) # large groups (int) 24 | DT[["id5"]] = sample(K, N, TRUE) # small groups (int) 25 | DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int) 26 | DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5] 27 | DT[["v2"]] = sample(15, N, TRUE) # int in range [1,15] 28 | DT[["v3"]] = round(runif(N,max=100),6) # numeric e.g. 23.574912 29 | setDT(DT) 30 | if (nas>0L) { 31 | cat("Inputting NAs\n") 32 | for (col in paste0("id",1:6)) { 33 | ucol = unique(DT[[col]]) 34 | nna = as.integer(length(ucol) * (nas/100)) 35 | if (nna) 36 | set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) 37 | rm(ucol) 38 | } 39 | nna = as.integer(nrow(DT) * (nas/100)) 40 | if (nna) { 41 | for (col in paste0("v",1:3)) 42 | set(DT, sample(nrow(DT), nna), col, NA) 43 | } 44 | } 45 | if (sort==1L) { 46 | cat("Sorting data\n") 47 | setkeyv(DT, paste0("id", 1:6)) 48 | } 49 | file = sprintf("G1_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort) 50 | cat(sprintf("Writing data to %s\n", file)) 51 | fwrite(DT, file) 52 | cat(sprintf("Data written to %s, quitting\n", file)) 53 | if (!interactive()) quit("no", status=0) 54 | -------------------------------------------------------------------------------- /_data/groupby2014-datagen.R: -------------------------------------------------------------------------------- 1 | args = commandArgs(TRUE) 2 | 3 | pretty_sci = function(x) { 4 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 5 | if(length(tmp)==1L) { 6 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 7 | } else if(length(tmp)==2L){ 8 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 9 | } 10 | } 11 | 12 | library(data.table) 13 | N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) 14 | stopifnot(K==1e2L, nas==0L, sort==0L) ## 2014's setup 15 | set.seed(108) 16 | cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort)) 17 | DT = list() 18 | DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char) 19 | DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # small groups (char) 20 | DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # large groups (char) 21 | DT[["id4"]] = sample(K, N, TRUE) # large groups (int) 22 | DT[["id5"]] = sample(K, N, TRUE) # small groups (int) 23 | DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int) 24 | DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5] 25 | DT[["v2"]] = sample(5, N, TRUE) # int in range [1,5] 26 | DT[["v3"]] = sample(round(runif(100,max=100),4), N, TRUE)# numeric e.g. 
23.5749 27 | setDT(DT) 28 | if (nas>0L) { 29 | cat("Inputting NAs\n") 30 | for (col in paste0("id",1:6)) { 31 | ucol = unique(DT[[col]]) 32 | nna = as.integer(length(ucol) * (nas/100)) 33 | if (nna) 34 | set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) 35 | rm(ucol) 36 | } 37 | nna = as.integer(nrow(DT) * (nas/100)) 38 | if (nna) { 39 | for (col in paste0("v",1:3)) 40 | set(DT, sample(nrow(DT), nna), col, NA) 41 | } 42 | } 43 | if (sort==1L) { 44 | cat("Sorting data\n") 45 | setkeyv(DT, paste0("id", 1:6)) 46 | } 47 | file = sprintf("G0_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort) 48 | cat(sprintf("Writing data to %s\n", file)) 49 | fwrite(DT, file) 50 | cat(sprintf("Data written to %s, quitting\n", file)) 51 | if (!interactive()) quit("no", status=0) 52 | -------------------------------------------------------------------------------- /_data/join-datagen.R: -------------------------------------------------------------------------------- 1 | # Rscript join-datagen.R 1e7 0 0 0 ## 1e7 rows, 0 ignored, 0% NAs, random order 2 | # Rscript join-datagen.R 1e8 0 5 1 ## 1e8 rows, 0 ignored, 5% NAs, sorted order 3 | 4 | # see h2oai/db-benchmark#106 for a design notes of this procedure, feedback welcome in the issue 5 | 6 | # init ---- 7 | 8 | init = proc.time()[["elapsed"]] 9 | args = commandArgs(TRUE) 10 | N=as.numeric(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) 11 | stopifnot(N>=1e7, nas<=100L, nas>=0L, sort%in%c(0L,1L)) 12 | if (N > .Machine$integer.max) stop("no support for long vector in join-datagen yet") 13 | N = as.integer(N) 14 | 15 | # helper functions ---- 16 | 17 | # pretty print big numbers as 1e9, 1e8, etc 18 | pretty_sci = function(x) { 19 | stopifnot(length(x)==1L, !is.na(x)) 20 | tmp = strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 21 | if (length(tmp)==1L) { 22 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 23 | } else if (length(tmp)==2L) { 24 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 25 | } 26 | } 27 | # data_name of table to join 28 | join_to_tbls = function(data_name) { 29 | x_n = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L]) 30 | y_n = setNames(x_n/c(1e6, 1e3, 1e0), c("small","medium","big")) 31 | sapply(sapply(y_n, pretty_sci), gsub, pattern="NA", x=data_name) 32 | } 33 | # sample ensuring none is missing 34 | sample_all = function(x, size) { 35 | stopifnot(length(x) <= size) 36 | y = c(x, sample(x, size=max(size-length(x), 0), replace=TRUE)) 37 | sample(y) 38 | } 39 | # split into common (0.9) left (0.1) and right (0.1) 40 | split_xlr = function(n) { 41 | key = sample.int(n*1.1) # 1.1 = 0.9+0.1+0.1 42 | list( 43 | x = key[seq.int(1, n*0.9)], 44 | l = key[seq.int(n*0.9+1, n)], 45 | r = key[seq.int(n+1, n*1.1)] 46 | ) 47 | } 48 | # check if data name is LHS and has NAs 49 | lhs_nas = function(data_name) { 50 | tmp = strsplit(data_name, "_", fixed=TRUE)[[1L]] 51 | if (!identical(tmp[3L], "NA")) 52 | return(FALSE) ## RHS 53 | as.integer(tmp[4L])>0L ## NAs 54 | } 55 | # NA aware sprintf for single argument only 56 | sprintfId = function(fmt, id) { 57 | x = rep(NA_character_, length(id)) 58 | idx = !is.na(id) 59 | x[idx] = sprintf("id%.0f", id[idx]) 60 | x 61 | } 62 | # we need to write in batches to reduce memory footprint 63 | write_batches = function(d, name, append) { 64 | cols = names(d) 65 | if (lhs_nas(name)) sprintf = sprintfId 66 | if ("id1" %in% cols) set(d, NULL, "id4", sprintf("id%.0f", d$id1)) 67 | if ("id2" %in% cols) set(d, NULL, "id5", sprintf("id%.0f", d$id2)) 68 | 
if ("id3" %in% cols) set(d, NULL, "id6", sprintf("id%.0f", d$id3)) 69 | setcolorder(d, neworder=setdiff(names(d), c("v1","v2"))) 70 | fwrite(d, paste0(name, ".csv"), showProgress=FALSE, append=append) 71 | } 72 | handle_batches = function(d, data_name) { 73 | N = nrow(d) 74 | if (N > 1e8) { 75 | stopifnot(N==1e9) 76 | for (i in 1:10) { 77 | cat(sprintf("Writing %s data batch %s\n", pretty_sci(N), i)) 78 | write_batches(d[((i-1)*1e8+1L):(i*1e8)], data_name, append=i>1L) 79 | } 80 | } else { 81 | write_batches(d, data_name, append=FALSE) 82 | } 83 | } 84 | 85 | # exec ---- 86 | 87 | library(data.table) 88 | setDTthreads(0L) 89 | set.seed(108) 90 | data_name = sprintf("J1_%s_%s_%s_%s", pretty_sci(N), "NA", nas, sort) 91 | 92 | cat(sprintf("Generate join data of %s rows\n", pretty_sci(N))) 93 | 94 | cat("Producing keys for LHS and RHS data\n") 95 | key1 = split_xlr(N/1e6) 96 | key2 = split_xlr(N/1e3) 97 | key3 = split_xlr(N) 98 | 99 | cat(sprintf("Producing LHS %s data from keys\n", pretty_sci(N))) 100 | lhs = c("x","l") 101 | l = list( 102 | id1 = sample_all(unlist(key1[lhs], use.names=FALSE), N), 103 | id2 = sample_all(unlist(key2[lhs], use.names=FALSE), N), 104 | id3 = sample_all(unlist(key3[lhs], use.names=FALSE), N) 105 | ) 106 | setDT(l) 107 | if (sort==1L) { 108 | cat("Sorting LHS data\n") 109 | setkeyv(l, c("id1","id2","id3")) 110 | } 111 | set(l, NULL, "v1", round(runif(nrow(l), max=100), 6)) 112 | stopifnot( 113 | uniqueN(l, by="id1")==N/1e6, 114 | uniqueN(l, by="id2")==N/1e3, 115 | uniqueN(l, by="id3")==N 116 | ) 117 | if (nas>0L) { 118 | cat("Inputting NAs in LHS data\n") 119 | for (col in paste0("id",1:3)) { 120 | ucol = unique(l[[col]]) 121 | nna = as.integer(length(ucol) * (nas/100)) 122 | if (nna) 123 | set(l, l[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) 124 | rm(ucol) 125 | } 126 | nna = as.integer(nrow(l) * (nas/100)) 127 | if (nna) 128 | set(l, sample(nrow(l), nna), "v1", NA) 129 | } 130 | cat(sprintf("Writing LHS %s data %s\n", pretty_sci(N), data_name)) 131 | handle_batches(l, data_name) 132 | rm(l) 133 | 134 | rhs = c("x","r") 135 | r_data_name = join_to_tbls(data_name) 136 | n = N/1e6 137 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n))) 138 | r1 = list( 139 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n) 140 | ) 141 | setDT(r1) 142 | if (sort==1L) { 143 | cat("Sorting RHS small data\n") 144 | setkeyv(r1, "id1") 145 | } 146 | set(r1, NULL, "v2", round(runif(nrow(r1), max=100), 6)) 147 | stopifnot(uniqueN(r1, by="id1")==n) 148 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[1L])) 149 | handle_batches(r1, r_data_name[1L]) 150 | rm(r1) 151 | n = N/1e3 152 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n))) 153 | r2 = list( 154 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n), 155 | id2 = sample_all(unlist(key2[rhs], use.names=FALSE), n) 156 | ) 157 | setDT(r2) 158 | if (sort==1L) { 159 | cat("Sorting RHS medium data\n") 160 | setkeyv(r2, "id2") 161 | } 162 | set(r2, NULL, "v2", round(runif(nrow(r2), max=100), 6)) 163 | stopifnot(uniqueN(r2, by="id2")==n) 164 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[2L])) 165 | handle_batches(r2, r_data_name[2L]) 166 | rm(r2) 167 | n = N 168 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n))) 169 | r3 = list( 170 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n), 171 | id2 = sample_all(unlist(key2[rhs], use.names=FALSE), n), 172 | id3 = sample_all(unlist(key3[rhs], use.names=FALSE), n) 173 | ) 174 | rm(key1, 
key2, key3) 175 | setDT(r3) 176 | if (sort==1L) { 177 | cat("Sorting RHS big data\n") 178 | setkeyv(r3, "id3") 179 | } 180 | set(r3, NULL, "v2", round(runif(nrow(r3), max=100), 6)) 181 | stopifnot(uniqueN(r3, by="id3")==n) 182 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[3L])) 183 | handle_batches(r3, r_data_name[3L]) 184 | rm(r3) 185 | 186 | cat(sprintf("Join datagen of %s rows finished in %ss\n", pretty_sci(N), trunc(proc.time()[["elapsed"]]-init))) 187 | if (!interactive()) quit("no", status=0) 188 | -------------------------------------------------------------------------------- /_helpers/helpers.R: -------------------------------------------------------------------------------- 1 | write.log = function( 2 | timestamp=Sys.time(), # this has to be here to support timestamp provided when parsing impala or clickhouse sql logs 3 | task=NA_character_, data=NA_character_, in_rows=NA_integer_, question=NA_character_, out_rows=NA_integer_, 4 | out_cols=NA_integer_, solution=NA_character_, version=NA_character_, git=NA_character_, fun=NA_character_, 5 | run=NA_integer_, time_sec=NA_real_, mem_gb=NA_real_, cache=NA, chk=NA_character_, chk_time_sec=NA_real_, 6 | on_disk=FALSE, machine_type='' 7 | ) { 8 | stopifnot(is.character(task), is.character(data), is.character(solution), is.character(fun), is.logical(on_disk), is.character(machine_type)) 9 | log.file=Sys.getenv("CSV_TIME_FILE", "time.csv") 10 | batch=Sys.getenv("BATCH", NA) 11 | nodename=toString(Sys.info()[["nodename"]]) 12 | comment=NA_character_ # placeholder for updates to timing data 13 | time_sec=round(time_sec, 3) 14 | mem_gb=round(mem_gb, 3) 15 | chk_time_sec=round(chk_time_sec, 3) 16 | df=data.frame(nodename=nodename, batch=as.integer(batch), timestamp=as.numeric(timestamp), 17 | task=task, data=data, in_rows=trunc(in_rows), question=as.character(question), out_rows=trunc(out_rows), # trunc to support big int in double 18 | out_cols=as.integer(out_cols), solution=solution, version=as.character(version), git=as.character(git), fun=fun, 19 | run=as.integer(run), time_sec=time_sec, mem_gb=mem_gb, cache=cache, chk=chk, chk_time_sec=chk_time_sec, 20 | comment=comment, on_disk=on_disk, machine_type=machine_type) 21 | csv_verbose = Sys.getenv("CSV_VERBOSE", "false") 22 | if (as.logical(csv_verbose)) cat("# ", paste(sapply(df, format, scientific=FALSE), collapse=","), "\n", sep="") 23 | if (!file.size(log.file)) file.remove(log.file) 24 | write.table(format(df, scientific=FALSE), 25 | file=log.file, 26 | append=file.exists(log.file), 27 | col.names=!file.exists(log.file), 28 | row.names=FALSE, 29 | quote=FALSE, 30 | na="", 31 | sep=",") 32 | } 33 | 34 | # short format of 1e7, 1e8 etc. 
35 | pretty_sci = function(x) { 36 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 37 | if(length(tmp)==1L) { 38 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 39 | } else if(length(tmp)==2L){ 40 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 41 | } 42 | } 43 | 44 | # makes scalar string to store in "chk" field, check sum of arbitrary number of measures 45 | make_chk = function(values){ 46 | x = sapply(values, function(x) paste(format(x, scientific=FALSE), collapse="_")) 47 | gsub(",", "_", paste(x, collapse=";"), fixed=TRUE) 48 | } 49 | 50 | # bash 'ps -o rss' 51 | memory_usage = function() { 52 | return(NA_real_) # disabled because during #110 system() kills the scripts 53 | cmd = paste("ps -o rss", Sys.getpid(), "| tail -1") 54 | ans = tryCatch(system(cmd, intern=TRUE, ignore.stderr=TRUE), error=function(e) NA_character_) 55 | as.numeric(ans) / (1024^2) # GB units 56 | } 57 | 58 | # join task RHS tables for LHS data name 59 | join_to_tbls = function(data_name) { 60 | x_n = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L]) 61 | y_n = setNames(c(x_n/1e6, x_n/1e3, x_n), c("small","medium","big")) 62 | sapply(sapply(y_n, pretty_sci), gsub, pattern="NA", x=data_name) 63 | } 64 | -------------------------------------------------------------------------------- /_helpers/helpers.jl: -------------------------------------------------------------------------------- 1 | using Printf; # sprintf macro to print in non-scientific format 2 | using Pkg; 3 | 4 | # from https://github.com/JuliaLang/Pkg.jl/issues/793 5 | function getpkgmeta(name::AbstractString) 6 | fname = joinpath(dirname(Base.active_project()), "Manifest.toml") 7 | Pkg.TOML.parse(read(fname, String))["deps"][name][1] 8 | end; 9 | 10 | function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type) 11 | file=try 12 | ENV["CSV_TIME_FILE"] 13 | catch 14 | "time.csv" 15 | end; 16 | if (occursin("/", file) && SubString(file, 1, 1)!="/") # otherwise we assume full path 17 | file="$(pwd())/$file"; 18 | end; 19 | batch=try 20 | ENV["BATCH"] 21 | catch 22 | "" 23 | end; 24 | if (isfile(file) && filesize(file)==0) 25 | rm(file) 26 | end; 27 | nodename=gethostname() 28 | comment="" # placeholder for updates to timing data 29 | time_sec=round(time_sec, digits=3) 30 | mem_gb=round(mem_gb, digits=3) 31 | chk_time_sec=round(chk_time_sec, digits=3) 32 | timestamp=@sprintf("%0.6f", time()) 33 | csv_verbose = false # hardcoded for now, TODO ENV["CSV_VERBOSE"] and print 34 | log = DataFrame(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type) 35 | CSV.write(file, log, append=isfile(file), header=!isfile(file)) 36 | end; 37 | 38 | function make_chk(x) 39 | n = length(x) 40 | res = "" 41 | for i = 1:n 42 | res = string(res, i==1 ? 
"" : ";", @sprintf("%0.3f", x[i])) 43 | end 44 | res 45 | end; 46 | 47 | function memory_usage() 48 | pid = getpid() 49 | s = read(pipeline(`ps -o rss $pid`,`tail -1`), String) 50 | parse(Float64, replace(s, "\n" => "")) / (1024^2) 51 | end; 52 | 53 | function join_to_tbls(data_name) 54 | x_n = Int(parse(Float64, split(data_name, "_")[2])) 55 | y_n = [x_n/1e6, x_n/1e3, x_n] 56 | y_n = [replace(@sprintf("%.0e", y_n[1]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[2]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[3]), r"[+]0?"=>"")] 57 | [replace(data_name, "NA" => y_n[1]), replace(data_name, "NA" => y_n[2]), replace(data_name, "NA" => y_n[3])] 58 | end; 59 | -------------------------------------------------------------------------------- /_helpers/helpers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import csv 3 | import math 4 | import psutil 5 | import os 6 | import platform 7 | 8 | def write_log(task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type): 9 | batch = os.getenv('BATCH', "") 10 | timestamp = time.time() 11 | csv_file = os.getenv('CSV_TIME_FILE', "time.csv") 12 | nodename = platform.node() 13 | comment = "" # placeholder for updates to timing data 14 | time_sec = round(time_sec, 3) 15 | chk_time_sec = round(chk_time_sec, 3) 16 | mem_gb = round(mem_gb, 3) 17 | if math.isnan(time_sec): 18 | time_sec = "" 19 | if math.isnan(mem_gb): 20 | mem_gb = "" 21 | log_row = [nodename, batch, timestamp, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, comment, on_disk, machine_type] 22 | log_header = ["nodename","batch","timestamp","task","data","in_rows","question","out_rows","out_cols","solution","version","git","fun","run","time_sec","mem_gb","cache","chk","chk_time_sec","comment","on_disk", "machine_type"] 23 | if os.path.isfile(csv_file) and not(os.path.getsize(csv_file)): 24 | os.remove(csv_file) 25 | append = os.path.isfile(csv_file) 26 | csv_verbose = os.getenv('CSV_VERBOSE', "false") 27 | if csv_verbose.lower()=="true": 28 | print('# ' + ','.join(str(x) for x in log_row)) 29 | if append: 30 | with open(csv_file, 'a') as f: 31 | w = csv.writer(f, lineterminator='\n') 32 | w.writerow(log_row) 33 | else: 34 | with open(csv_file, 'w+') as f: 35 | w = csv.writer(f, lineterminator='\n') 36 | w.writerow(log_header) 37 | w.writerow(log_row) 38 | return True 39 | 40 | def str_round(x): 41 | if type(x).__name__ in ["float","float64"]: 42 | x = round(x,3) 43 | return str(x) 44 | 45 | flatten = lambda l: [item for sublist in l for item in sublist] 46 | 47 | def make_chk(values): 48 | s = ';'.join(str_round(x) for x in values) 49 | return s.replace(",","_") # comma is reserved for csv separator 50 | 51 | def memory_usage(): 52 | process = psutil.Process(os.getpid()) 53 | return process.memory_info().rss/(1024**3) # GB units 54 | 55 | def join_to_tbls(data_name): 56 | x_n = int(float(data_name.split("_")[1])) 57 | y_n = ["{:.0e}".format(x_n/1e6), "{:.0e}".format(x_n/1e3), "{:.0e}".format(x_n)] 58 | y_n = [y_n[0].replace('+0', ''), y_n[1].replace('+0', ''), y_n[2].replace('+0', '')] 59 | return [data_name.replace('NA', y_n[0]), data_name.replace('NA', y_n[1]), data_name.replace('NA', y_n[2])] 60 | -------------------------------------------------------------------------------- /_helpers/helpers.sh: 
-------------------------------------------------------------------------------- 1 | # join task RHS tables for LHS data name 2 | join_to_tbls() { 3 | data_name=$1 4 | x_n="$(echo $data_name | cut -d '_' -f 2)" 5 | x_n_lhs="$(echo $x_n | cut -d 'e' -f 1)" 6 | if [ "$x_n_lhs" -ne 1 ]; then 7 | echo "data_name $data_name must have '1' base in exponential notation for number of rows" >&2 && exit 1 8 | fi 9 | x_n_rhs="$(echo $x_n | cut -d "e" -f 2)" 10 | if [ "$x_n_rhs" -lt 6 ]; then 11 | echo "data_name $data_name must have exponent greater or equal to '6' in exponential notation for number of rows" >&2 && exit 1 12 | fi 13 | echo ${data_name/NA/"$x_n_lhs"e"$(($x_n_rhs-6))"} ${data_name/NA/"$x_n_lhs"e"$(($x_n_rhs-3))"} ${data_name/NA/"$x_n_lhs"e"$x_n_rhs"} 14 | } 15 | -------------------------------------------------------------------------------- /_helpers/helpersds.jl: -------------------------------------------------------------------------------- 1 | using Printf; # sprintf macro to print in non-scientific format 2 | using Pkg; 3 | 4 | # from https://github.com/JuliaLang/Pkg.jl/issues/793 5 | function getpkgmeta(name::AbstractString) 6 | fname = joinpath(dirname(Base.active_project()), "Manifest.toml") 7 | Pkg.TOML.parse(read(fname, String))["deps"][name][1] 8 | end; 9 | 10 | function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type) 11 | file=try 12 | ENV["CSV_TIME_FILE"] 13 | catch 14 | "time.csv" 15 | end; 16 | if (occursin("/", file) && SubString(file, 1, 1)!="/") # otherwise we assume full path 17 | file="$(pwd())/$file"; 18 | end; 19 | batch=try 20 | ENV["BATCH"] 21 | catch 22 | "" 23 | end; 24 | if (isfile(file) && filesize(file)==0) 25 | rm(file) 26 | end; 27 | nodename=gethostname() 28 | comment="" # placeholder for updates to timing data 29 | time_sec=round(time_sec, digits=3) 30 | mem_gb=round(mem_gb, digits=3) 31 | chk_time_sec=round(chk_time_sec, digits=3) 32 | timestamp=@sprintf("%0.6f", time()) 33 | csv_verbose = false 34 | log = Dataset(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type) 35 | filewriter(file, log, append=isfile(file), header=!isfile(file)) 36 | end; 37 | 38 | function make_chk(x) 39 | n = length(x) 40 | res = "" 41 | for i = 1:n 42 | res = string(res, i==1 ?
"" : ";", @sprintf("%0.3f", x[i])) 43 | end 44 | res 45 | end; 46 | 47 | function memory_usage() 48 | pid = getpid() 49 | s = read(pipeline(`ps -o rss $pid`,`tail -1`), String) 50 | parse(Float64, replace(s, "\n" => "")) / (1024^2) 51 | end; 52 | 53 | function join_to_tbls(data_name) 54 | x_n = Int(parse(Float64, split(data_name, "_")[2])) 55 | y_n = [x_n/1e6, x_n/1e3, x_n] 56 | y_n = [replace(@sprintf("%.0e", y_n[1]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[2]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[3]), r"[+]0?"=>"")] 57 | [replace(data_name, "NA" => y_n[1]), replace(data_name, "NA" => y_n[2]), replace(data_name, "NA" => y_n[3])] 58 | end; 59 | -------------------------------------------------------------------------------- /_launcher/launch.R: -------------------------------------------------------------------------------- 1 | library("data.table") 2 | if (!packageVersion("data.table") >= "1.13.0") 3 | stop("db-benchmark launcher script depends on recent data.table features, install at least 1.13.0.") 4 | source("./_launcher/launcher.R") 5 | 6 | .nodename = Sys.info()[["nodename"]] 7 | mockup = as.logical(Sys.getenv("MOCKUP", "false")) 8 | 9 | run_tasks = getenv("RUN_TASKS") # run_tasks = c("groupby","join") 10 | if (!length(run_tasks)) { 11 | cat("No benchmark tasks to run\n") 12 | q("no") 13 | } 14 | run_solutions = getenv("RUN_SOLUTIONS") # run_solutions = c("data.table","dplyr","pydatatable","spark","pandas") 15 | if (!length(run_solutions)) { 16 | cat("No benchmark solutions to run\n") 17 | q("no") 18 | } 19 | 20 | data = fread("./_control/data.csv", logical01=TRUE, colClasses=c("character","character","character","character","character","character","logical")) 21 | if (anyDuplicated(data[["data"]])) 22 | stop("_control/data.csv contains duplicated data cases") 23 | data[active==TRUE, # filter on active datasets 24 | ][run_tasks, on="task", nomatch=NA # filter for env var RUN_TASKS 25 | ][, c("active") := NULL # remove unused 26 | ][] -> data 27 | if (any(is.na(data$data))) stop("missing entries in ./_control/data.csv for some tasks") 28 | 29 | timeout = fread("./_control/timeout.csv", colClasses=c("character","character","numeric")) 30 | timeout[run_tasks, on="task", nomatch=NA # # filter for env var RUN_TASKS 31 | ] -> timeout 32 | if (any(is.na(timeout$minutes))) stop("missing entries in ./_control/timeout.csv for some tasks") 33 | 34 | solution = fread("./_control/solutions.csv") 35 | solution[run_solutions, on="solution", nomatch=NA # filter for env var RUN_SOLUTIONS 36 | ] -> solution 37 | if (any(is.na(solution$task))) stop("missing entries in ./_control/solutions.csv for some solutions") 38 | 39 | # what to run, log machine name, lookup timeout 40 | dt = solution[data, on="task", allow.cartesian=TRUE, nomatch=NULL] 41 | dt[, "nodename" := .nodename] 42 | dt[, "in_rows" := sapply(strsplit(data, split="_", fixed=TRUE), `[[`, 2L)] 43 | stopifnot(dt$in_rows == dt$nrow) 44 | dt[timeout, "timeout_s" := i.minutes*60, on=c("task","in_rows")] 45 | if (any(is.na(dt$timeout_s))) stop("missing entries in ./_control/timeout.csv for some tasks, detected after joining to solutions and data to run") 46 | 47 | # detect if script has been already run before for currently installed version/revision 48 | lookup_run_batch(dt) 49 | 50 | machine_type = getenv("MACHINE_TYPE") 51 | dt[,machine_type := machine_type] 52 | 53 | skipped_benchmarks = fread("./_control/skipped_benchmarks.csv", logical01=TRUE, colClasses=c("character","character","character","character")) 54 | print("skipping 
benchmarks defined in _control/skipped_benchmarks.csv") 55 | print(skipped_benchmarks) 56 | 57 | dt = dt[!skipped_benchmarks, on = c("solution", "task", "data", "machine_type")] 58 | 59 | # print list of solutions that are going to be run in this batch so we know upfront which will be skipped 60 | cat("Benchmark solutions to run: ", dt[is.na(run_batch), paste(unique(solution),collapse=", ")], "\n", sep="") 61 | 62 | is.stop() 63 | is.pause() 64 | is.stop() 65 | 66 | # launch script, if not mockup, if not already run, unless forcerun 67 | dt 68 | launch(dt, mockup=mockup) 69 | 70 | # terminates 71 | q("no") 72 | -------------------------------------------------------------------------------- /_launcher/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # dirs for datasets and output of benchmark 5 | mkdir -p data 6 | mkdir -p out 7 | 8 | sudo apt-get update 9 | 10 | # install R 11 | sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" 12 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 13 | sudo apt-get update -qq 14 | sudo apt-get install -y r-base-dev 15 | sudo apt-get install python3-dev virtualenv 16 | 17 | sudo chmod a+w /usr/local/lib/R/site-library 18 | 19 | # configure R 20 | echo 'LC_ALL=C' >> ~/.Renviron 21 | mkdir -p ~/.R 22 | echo 'CFLAGS=-O3 -mtune=native' > ~/.R/Makevars 23 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 24 | 25 | # packages used in launcher and report 26 | Rscript -e 'install.packages(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"))' 27 | Rscript -e 'sapply(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"), requireNamespace)' 28 | 29 | # install duckdb for unpacking data 30 | curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip 31 | sudo unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin 32 | 33 | 34 | # install aws client to download benchmark data 35 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 36 | unzip awscliv2.zip 37 | sudo ./aws/install 38 | 39 | # after each restart of server 40 | source clickhouse/ch.sh && ch_stop 41 | sudo service docker stop 42 | sudo swapoff -a 43 | 44 | # stop and disable 45 | sudo systemctl disable docker 46 | sudo systemctl stop docker 47 | sudo systemctl disable clickhouse-server 48 | sudo systemctl stop clickhouse-server 49 | -------------------------------------------------------------------------------- /_report/blah.R: -------------------------------------------------------------------------------- 1 | 2 | source("./_report/report.R", chdir=TRUE) 3 | source("./_helpers/helpers.R", chdir=TRUE) 4 | source("./_benchplot/benchplot.R", chdir=TRUE) 5 | source("./_benchplot/benchplot-dict.R", chdir=TRUE) 6 | ld = time_logs() 7 | lld = ld[script_recent==TRUE] 8 | # lld_nodename = as.character(unique(lld$nodename)) 9 | lld_nodename = "c6id.metal" 10 | if (length(lld_nodename)>1L) 11 | stop(sprintf("There are multiple different 'nodename' to be presented on single report '%s'", report_name)) 12 | lld_unfinished = lld[is.na(script_time_sec)] 13 | if (nrow(lld_unfinished)) { 14 | warning(sprintf("Missing solution finish timestamp in logs.csv for '%s' (still running or launcher script killed): %s", paste(unique(lld_unfinished$task), collapse=","), 
paste(unique(lld_unfinished$solution), collapse=", "))) 15 | } 16 | 17 | dt_groupby = lld[task=="groupby"][substr(data,1,2)=="G1"] 18 | dt_join = lld[task=="join"] 19 | 20 | 21 | loop_benchplot = function(dt_task, report_name, syntax.dict, exceptions, solution.dict, question.txt.fun = NULL, title.txt.fun = NULL, data_namev, q_groupv, cutoff=NULL, pending=NULL) { 22 | for (data_name in data_namev) { 23 | for (q_group in q_groupv) { 24 | message(sprintf("benchplot %s %s %s", report_name, data_name, q_group)) 25 | message(sprintf("machine type = %s", m_type)) 26 | y = dt_task[data==data_name & question_group==q_group & machine_type==m_type][,machine_type := NULL] 27 | benchplot( 28 | y, 29 | filename = file.path("public", report_name, sprintf("%s_%s_%s.png", data_name, q_group, m_type)), 30 | solution.dict = solution.dict, 31 | syntax.dict = syntax.dict, 32 | exceptions = exceptions, 33 | question.txt.fun = question.txt.fun, 34 | title.txt.fun = title.txt.fun, 35 | cutoff = cutoff, 36 | pending = pending, 37 | url.footer = "https://duckdblabs.github.io/db-benchmark", 38 | interactive = FALSE 39 | ) 40 | } 41 | } 42 | } 43 | link = function(data_name, q_group, report_name) { 44 | fnam = sprintf("%s_%s.png", data_name, q_group) 45 | paste(sprintf("[%s](%s)", q_group, file.path(report_name, fnam)), collapse=", ") 46 | } 47 | hours_took = function(lld) { 48 | lld_script_time = lld[, .(n_script_time_sec=uniqueN(script_time_sec), script_time_sec=unique(script_time_sec)), .(solution, task, data)] 49 | if (nrow(lld_script_time[n_script_time_sec>1L])) 50 | stop("There are multiple different 'script_time_sec' for single solution+task+data on report 'index'") 51 | lld_script_time[, round(sum(script_time_sec, na.rm=TRUE)/60/60, 1)] 52 | } 53 | 54 | data_name = get_data_levels()[["groupby"]] 55 | loop_benchplot(dt_groupby, report_name="groupby", syntax.dict=groupby.syntax.dict, exceptions=groupby.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv=c("basic","advanced"), title.txt.fun = header_title_fun, question.txt.fun = groupby_q_title_fun, cutoff = "spark", pending = "Modin", machine_types) -------------------------------------------------------------------------------- /_report/ga.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /_report/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit -o nounset 3 | 4 | publishGhPages(){ 5 | rm -rf db-benchmark.gh-pages 6 | mkdir -p db-benchmark.gh-pages 7 | cd db-benchmark.gh-pages 8 | 9 | ## Set up Repo parameters 10 | git init > /dev/null 11 | git config user.name "Tmonster" 12 | git config user.email "tom@ebergen.com" 13 | 14 | ## Set gh token from local file 15 | 16 | ## Reset gh-pages branch 17 | git remote add upstream "git@github.com:duckdblabs/db-benchmark.git" 18 | git fetch -q upstream gh-pages 19 | rm -f err.txt 20 | git checkout -q gh-pages 21 | git reset -q --hard "645f86716bfb3b44c53eacf1f2bf234e75ea41ec" 22 | 23 | rm -f err.txt 24 | cp -r ../public/* ./ 25 | git add -A 26 | git commit -q -m 'publish benchmark report' 27 | cp ../time.csv . 28 | cp ../logs.csv . 
29 | git add time.csv logs.csv 30 | md5sum time.csv > time.csv.md5 31 | md5sum logs.csv > logs.csv.md5 32 | git add time.csv.md5 logs.csv.md5 33 | gzip --keep time.csv 34 | gzip --keep logs.csv 35 | git add time.csv.gz logs.csv.gz 36 | git commit -q -m 'publish benchmark timings and logs' 37 | git push --force upstream gh-pages 38 | 39 | cd .. 40 | 41 | } 42 | 43 | publishGhPages 44 | -------------------------------------------------------------------------------- /_report/tech.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Technical measures of db-benchmark" 3 | output: 4 | html_document: 5 | self_contained: no 6 | toc: true 7 | includes: 8 | in_header: ga.html 9 | --- 10 | ```{r render, include=FALSE} 11 | # Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' && xdg-open public/tech.html 12 | ``` 13 | 14 | ```{r opts, echo=FALSE} 15 | knitr::opts_knit$set(root.dir="..") 16 | knitr::opts_chunk$set(echo=FALSE, cache=FALSE) 17 | ``` 18 | 19 | ```{r init} 20 | library(lattice) 21 | source("./_report/report.R") 22 | ld = time_logs() 23 | recent_nodename = ld[script_recent==TRUE, unique(nodename)] 24 | stopifnot(length(recent_nodename)==1L) 25 | ld = ld[nodename==recent_nodename] 26 | ``` 27 | 28 | ## Incomplete timings of last run 29 | 30 | ```{r completed} 31 | ll = ld[script_recent==TRUE, { 32 | n_na = is.na(c(time_sec_1, time_sec_2)) 33 | n_completed=sum(!n_na) 34 | n_failed=sum(n_na) 35 | .(n_completed=n_completed, n_failed=n_failed, q_failed=if(n_failed==0L) NA_character_ else paste(paste0("q", iquestion[is.na(time_sec_1) | is.na(time_sec_2)]), collapse=",")) 36 | }, 37 | c("nodename","batch","solution","task","data","in_rows","k","nasorted")] 38 | stopifnot(length(unique(ll$nodename))==1L) 39 | ``` 40 | 41 | ### groupby 42 | 43 | ```{r completed_groupby} 44 | kk(ll[task=="groupby" 45 | ][n_failed>0L, .(solution, data, in_rows, k, `NA, sorted`=nasorted, n_completed, n_failed, q_failed)]) 46 | ``` 47 | 48 | ## Full script executions 49 | 50 | Things to consider when looking at the plots below. 51 | 52 | - The red dotted line refers to the script timeout, which initially was not set up. Later it was set to 60 minutes; more recently, after a new set of questions was added, it was increased to 120 minutes. The up-to-date timeout value can be looked up in the `_control/timeout.csv` file. 53 | - It might happen that a script was terminated by the _out of memory killer_, an OS feature. In that case the reported script timing will be smaller than it should be in reality. 54 | 55 | Refer to the table above to see which scripts have fully completed. 56 | 57 | ### groupby 58 | 59 | ```{r logs_plot, fig.width=8, fig.height=48} 60 | #timeout = fread("./_control/timeout.csv", colClasses=c("character","character","numeric")) 61 | #timeout = timeout["groupby", on="task", nomatch=NULL] # filter for env var RUN_TASKS 62 | #stopifnot(nrow(timeout)==1L) 63 | #timeout_m = timeout[["minutes"]] 64 | p = sapply(setNames(nm=as.character(unique(ld$solution))), simplify = FALSE, function(s) 65 | lattice::xyplot(script_time_sec/60 ~ ibatch | k+in_rows, ld[task=="groupby" & substr(data,1,2)=="G1"], 66 | type="l", grid=TRUE, groups=nasorted, 67 | subset=solution==s, main=s, 68 | panel=panel.superpose, 69 | panel.groups=function(x, y, col, col.symbol, ...) { 70 | panel.lines(x, y, col=col.symbol, ...)
71 | #panel.abline(h=timeout_m, col="red", lty=3) 72 | }, 73 | xlab = "benchmark run", 74 | ylab = "minutes", 75 | scales=list(y=list( 76 | relation="free", 77 | limits=rep(ld[solution==s, list(list(c(0, max(script_time_sec)/60))), in_rows]$V1, each=3) 78 | )), 79 | auto.key=list(points=FALSE, lines=TRUE)) 80 | ) 81 | sapply(seq_along(p), function(i) print(p[[i]], split=c(1, i, 1, length(p)), more=i!=length(p))) -> nul 82 | ``` 83 | 84 | ------ 85 | 86 | Report was generated on: `r format(Sys.time(), usetz=TRUE)`. 87 | 88 | ```{r status_set_success} 89 | cat("tech\n", file=get_report_status_file(), append=TRUE) 90 | ``` 91 | -------------------------------------------------------------------------------- /_run/download_small_medium.sh: -------------------------------------------------------------------------------- 1 | # first download and expand small data 2 | 3 | # get groupby small (0.5GB and 5GB datasets) 4 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request --quiet 5 | # get join small (0.5GB and 5GB datasets) 6 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request --quiet 7 | 8 | 9 | # expand groupby-small datasets to csv 10 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'data/G1_1e7_1e2_0_0.csv' (FORMAT CSV)" 11 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'data/G1_1e7_1e1_0_0.csv' (FORMAT CSV)" 12 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'data/G1_1e7_2e0_0_0.csv' (FORMAT CSV)" 13 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'data/G1_1e7_1e2_0_1.csv' (FORMAT CSV)" 14 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'data/G1_1e7_1e2_5_0.csv' (FORMAT CSV)" 15 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'data/G1_1e8_1e2_0_0.csv' (FORMAT CSV)" 16 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'data/G1_1e8_1e1_0_0.csv' (FORMAT CSV)" 17 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'data/G1_1e8_2e0_0_0.csv' (FORMAT CSV)" 18 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'data/G1_1e8_1e2_0_1.csv' (FORMAT CSV)" 19 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'data/G1_1e8_1e2_5_0.csv' (FORMAT CSV)" 20 | 21 | # expand join-small datasets to csv 22 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_0 to 'data/J1_1e7_1e1_0_0.csv' (FORMAT CSV)" 23 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_5_0 to 'data/J1_1e7_1e4_5_0.csv' (FORMAT CSV)" 24 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_1 to 'data/J1_1e7_NA_0_1.csv' (FORMAT CSV)" 25 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_0 to 'data/J1_1e8_1e5_0_0.csv' (FORMAT CSV)" 26 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_5_0 to 'data/J1_1e8_1e8_5_0.csv' (FORMAT CSV)" 27 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_1 to 'data/J1_1e7_1e1_0_1.csv' (FORMAT CSV)" 28 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_0 to 'data/J1_1e7_1e7_0_0.csv' (FORMAT CSV)" 29 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_5_0 to 'data/J1_1e7_NA_5_0.csv' (FORMAT CSV)" 30 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_1 to 'data/J1_1e8_1e5_0_1.csv' (FORMAT CSV)" 31 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_0 to 'data/J1_1e8_NA_0_0.csv' (FORMAT CSV)" 32 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_5_0 to 'data/J1_1e7_1e1_5_0.csv' (FORMAT CSV)" 33 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_1 to 
'data/J1_1e7_1e7_0_1.csv' (FORMAT CSV)" 34 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_0 to 'data/J1_1e8_1e2_0_0.csv' (FORMAT CSV)" 35 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_5_0 to 'data/J1_1e8_1e5_5_0.csv' (FORMAT CSV)" 36 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_1 to 'data/J1_1e8_NA_0_1.csv' (FORMAT CSV)" 37 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_0 to 'data/J1_1e7_1e4_0_0.csv' (FORMAT CSV)" 38 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_5_0 to 'data/J1_1e7_1e7_5_0.csv' (FORMAT CSV)" 39 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_1 to 'data/J1_1e8_1e2_0_1.csv' (FORMAT CSV)" 40 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_0 to 'data/J1_1e8_1e8_0_0.csv' (FORMAT CSV)" 41 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_5_0 to 'data/J1_1e8_NA_5_0.csv' (FORMAT CSV)" 42 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_1 to 'data/J1_1e7_1e4_0_1.csv' (FORMAT CSV)" 43 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_0 to 'data/J1_1e7_NA_0_0.csv' (FORMAT CSV)" 44 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_5_0 to 'data/J1_1e8_1e2_5_0.csv' (FORMAT CSV)" 45 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_1 to 'data/J1_1e8_1e8_0_1.csv' (FORMAT CSV)" 46 | 47 | -------------------------------------------------------------------------------- /_run/partitioned_run.sh: -------------------------------------------------------------------------------- 1 | # set machine type 2 | ./_run/run_small_medium.sh 3 | 4 | ./_run/run_large.sh 5 | -------------------------------------------------------------------------------- /_run/run_large.sh: -------------------------------------------------------------------------------- 1 | # download and expand large data 2 | 3 | # get groupby large (50GB datasets) 4 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request --quiet 5 | # get join small (50GB datasets) 6 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request --quiet 7 | 8 | 9 | # expand groupby-large datasets to csv 10 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)" 11 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)" 12 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)" 13 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)" 14 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)" 15 | 16 | 17 | # expand join-large datasets to csv 18 | duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)" 19 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)" 20 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)" 21 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)" 22 | 23 | 24 | cp _control/data_large.csv _control/data.csv 25 | 26 | echo "Running all solutions on large (50GB) datasets" 27 | ./run.sh 28 | 29 | 30 | ### 31 | echo "done..." 
32 | echo "removing data files" 33 | rm data/*.csv 34 | rm data/*.duckdb 35 | -------------------------------------------------------------------------------- /_run/run_medium.sh: -------------------------------------------------------------------------------- 1 | ./_run/download_small_medium.sh 2 | 3 | cp _control/data_medium.csv _control/data.csv 4 | 5 | 6 | echo "Running all solutions on medium (5GB) datasets" 7 | ./run.sh 8 | 9 | 10 | ### 11 | echo "done..." 12 | echo "removing small data files" 13 | rm data/*.csv 14 | rm data/*.duckdb 15 | 16 | -------------------------------------------------------------------------------- /_run/run_small.sh: -------------------------------------------------------------------------------- 1 | ./_run/download_small_medium.sh 2 | 3 | cp _control/data_small.csv _control/data.csv 4 | 5 | 6 | echo "Running all solutions on small (0.5GB) datasets" 7 | ./run.sh 8 | 9 | 10 | ### 11 | echo "done..." 12 | echo "removing small data files" 13 | rm data/*.csv 14 | rm data/*.duckdb 15 | 16 | -------------------------------------------------------------------------------- /_setup_utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdblabs/db-benchmark/47879c51efba789ddbf973423f2c77bfa411143c/_setup_utils/.DS_Store -------------------------------------------------------------------------------- /_setup_utils/install_all_solutions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import subprocess 5 | 6 | SOLUTIONS_FILENAME = "_control/solutions.csv" 7 | 8 | 9 | INCLUDE = set() 10 | 11 | def install_solution(solution_name): 12 | min_setup_file_name = f"./{solution_name}/min-setup-{solution_name}.sh" 13 | setup_file_name = f"./{solution_name}/setup-{solution_name}.sh" 14 | upgrade_file_name = f"./{solution_name}/upg-{solution_name}.sh" 15 | get_version_filename = f"./{solution_name}/ver-{solution_name}.sh" 16 | print(f"Installing {solution_name}") 17 | do_install = False 18 | try: 19 | result = subprocess.call([get_version_filename], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) 20 | if result != 0: 21 | do_install = True 22 | except Exception as e: 23 | do_install = True 24 | 25 | if do_install: 26 | if os.path.exists(min_setup_file_name): 27 | subprocess.call([min_setup_file_name]) 28 | elif os.path.exists(setup_file_name): 29 | subprocess.call([setup_file_name]) 30 | else: 31 | # print(f"no script for {setup_file_name} or {min_setup_file_name}") 32 | raise Exception(f"No script to install {solution_name}") 33 | else: 34 | subprocess.call([upgrade_file_name]) 35 | 36 | # based on the name of the solution, run the {{solution}}/min-setup-{{solution}}.sh file. 37 | # if there is no min-setup-{{solution}}.sh, then run setup-{{solution}}.sh. 38 | # if error, exit with an error 39 | # else don't 40 | def include_all_solutions(): 41 | global INCLUDE 42 | with open(SOLUTIONS_FILENAME, newline="") as solutions_file: 43 | solutions = csv.DictReader(solutions_file, delimiter=',') 44 | for row in solutions: 45 | if row['solution'] == "data.table": 46 | INCLUDE.add("datatable") 47 | else: 48 | INCLUDE.add(row['solution']) 49 | 50 | if len(sys.argv) == 0: 51 | print(""" 52 | Usage: python3 install_all_solutions.py solution_name solution_name ... 
53 | python3 install_all_solutions.py all --exclude clickhouse polars 54 | """) 55 | exit(1) 56 | 57 | # first argument is file name 58 | 59 | def main(): 60 | global INCLUDE 61 | including = True 62 | for solution in sys.argv[1:]: 63 | if solution.strip() == "all": 64 | include_all_solutions() 65 | elif solution.strip() == "--exclude": 66 | including = False 67 | continue 68 | else: 69 | if including: 70 | if solution == "data.table": 71 | INCLUDE.add("datatable") 72 | elif solution == "clickhouse": 73 | INCLUDE.add("clickhouse") 74 | INCLUDE.add("polars") 75 | else: 76 | INCLUDE.add(solution) 77 | else: 78 | sol = solution.strip() 79 | INCLUDE.remove(sol) 80 | 81 | for solution in INCLUDE: 82 | install_solution(solution) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | 88 | -------------------------------------------------------------------------------- /_setup_utils/mount.sh: -------------------------------------------------------------------------------- 1 | # script to format mount and copy data. 2 | 3 | # remove a leftover instance mount 4 | rm -rf /var/lib/mount/db-benchmark-metal 5 | 6 | # format the mount 7 | 8 | source path.env 9 | 10 | mount_name=$(sudo lsblk | awk ' 11 | NR > 1 && $1 ~ /^nvme/ && $7 == "" { 12 | # Convert SIZE column to bytes for comparison 13 | size = $4; 14 | unit = substr(size, length(size)); 15 | value = substr(size, 1, length(size)-1); 16 | if (unit == "G") { value *= 1024^3; } 17 | else if (unit == "T") { value *= 1024^4; } 18 | else if (unit == "M") { value *= 1024^2; } 19 | else if (unit == "K") { value *= 1024; } 20 | else { value *= 1; } 21 | 22 | # Keep track of the largest size 23 | if (value > max) { 24 | max = value; 25 | largest = $1; 26 | } 27 | } 28 | END { if (largest) print largest; else print "No match found"; } 29 | ') 30 | 31 | if [ -z "${MOUNT_POINT}" ]; then 32 | echo "Error: Environment variable MOUNT_POINT is not set. Set it by running" 33 | echo "source path.env" 34 | exit 1 35 | fi 36 | 37 | sudo mkfs -t xfs /dev/$mount_name 38 | 39 | sudo rm -rf $MOUNT_POINT 40 | sudo mkdir $MOUNT_POINT 41 | sudo mount /dev/$mount_name $MOUNT_POINT 42 | 43 | # make clone of repo on mount 44 | sudo mkdir $MOUNT_POINT/db-benchmark-metal 45 | sudo chown -R ubuntu:ubuntu $MOUNT_POINT 46 | 47 | 48 | git clone $(git remote get-url origin) $MOUNT_POINT/db-benchmark-metal 49 | cd $MOUNT_POINT/db-benchmark-metal -------------------------------------------------------------------------------- /_setup_utils/mount_and_install_solutions.sh: -------------------------------------------------------------------------------- 1 | # script to format mount and copy data. 2 | # mount the data 3 | ./_setup_utils/mount.sh 4 | 5 | # setup all the solutions on db-benchmark-metal. 6 | # creates the necessary python virtual environments and creates the r-libraries 7 | # needed 8 | cd ~/db-benchmark-metal && source path.env && python3 _setup_utils/install_all_solutions.py all 9 | 10 | 11 | 12 | # setup mount for clickhouse spill 13 | # sudo mkfs -t xfs /dev/nvme1n1 14 | # sudo mkdir /var/lib/clickhouse-nvme-mount/ 15 | # sudo mount /dev/nvme1n1 /var/lib/clickhouse-nvme-mount/ 16 | # # not sure if below is necessary. 17 | # sudo cp -a /var/lib/clickhouse/. 
/var/lib/clickhouse-nvme-mount/ 18 | # # change ownership of new mount to clickhouse 19 | # sudo chown -R clickhouse:clickhouse /var/lib/clickhouse-nvme-mount/ 20 | # sudo chown -R clickhouse:clickhouse /dev/nvme1n1 21 | 22 | # # add config so clickhouse knows to use the mount to spill data 23 | # sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml 24 | 25 | echo "------------------------------------------" 26 | echo "------------------------------------------" 27 | echo "READY TO RUN BENCHMARK. ./run.sh" 28 | -------------------------------------------------------------------------------- /_setup_utils/prep_solutions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import csv 4 | 5 | SOLUTIONS_FILENAME = "_control/solutions.csv" 6 | RUN_CONF_FILENAME = "run.conf" 7 | 8 | SKIPPED_SOLUTIONS = [] 9 | 10 | 11 | def print_usage(): 12 | print("Usage: python3 _utils/prep_solutions.py --task=[groupby|join]") 13 | exit(1) 14 | 15 | def parse_task(): 16 | task = None 17 | for arg in sys.argv[1:]: 18 | if arg.startswith("--task="): 19 | task = arg.replace("--task=", "") 20 | if task == None or (task != "groupby" and task != "join"): 21 | print_usage() 22 | return task 23 | 24 | def parse_solution(): 25 | solution = None 26 | for arg in sys.argv[1:]: 27 | if arg.startswith("--solution="): 28 | solution = arg.replace("--solution=", "") 29 | return solution 30 | 31 | def main(): 32 | task = parse_task() 33 | solution = parse_solution() 34 | if solution == "all": 35 | solution = get_solutions(task) 36 | if solution == "clickhouse": 37 | solution = "clickhouse polars" 38 | update_run_conf_solutions(solution, task) 39 | 40 | def update_run_conf_solutions(solution_name_list, task): 41 | # change what solutions are run in run.conf 42 | os.system(f"sed 's/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"{solution_name_list}\"/g' run.conf > run_2.conf") 43 | os.system(f"sed 's/export RUN_TASKS=.*/export RUN_TASKS=\"{task}\"/g' run_2.conf > run_3.conf") 44 | os.system(f"sed 's/export DO_REPORT=.*/export DO_REPORT=false/g' run_3.conf > run.conf") 45 | os.remove('run_2.conf') 46 | os.remove('run_3.conf') 47 | 48 | def get_solutions(task): 49 | solutions_for_task = "" 50 | with open(SOLUTIONS_FILENAME, newline="") as solutions_file: 51 | solutions = csv.DictReader(solutions_file, delimiter=',') 52 | for row in solutions: 53 | if row['task'] == task and row['solution'] not in SKIPPED_SOLUTIONS: 54 | solutions_for_task += row['solution'] + " " 55 | return solutions_for_task.strip() 56 | 57 | 58 | if __name__ == "__main__": 59 | main() -------------------------------------------------------------------------------- /_setup_utils/repro.sh: -------------------------------------------------------------------------------- 1 | # full repro on Ubuntu 22.04 2 | 3 | cd ~/h2oai-db-benchmark 4 | 5 | sudo apt-get -qq update 6 | sudo apt upgrade 7 | 8 | sudo apt-get -qq install -y lsb-release software-properties-common wget curl vim htop git byobu libcurl4-openssl-dev libssl-dev 9 | sudo apt-get -qq install -y libfreetype6-dev 10 | sudo apt-get -qq install -y libfribidi-dev 11 | sudo apt-get -qq install -y libharfbuzz-dev 12 | sudo apt-get -qq install -y git 13 | sudo apt-get -qq install -y libxml2-dev 14 | sudo apt-get -qq install -y make 15 | sudo apt-get -qq install -y libfontconfig1-dev 16 | sudo apt-get -qq install -y libicu-dev pandoc zlib1g-dev libgit2-dev libcurl4-openssl-dev libssl-dev libjpeg-dev libpng-dev 
libtiff-dev 17 | # sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 18 | sudo add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" 19 | sudo apt-get -qq update 20 | sudo apt-get -qq install -y r-base-dev virtualenv 21 | 22 | cd /usr/local/lib/R 23 | sudo chmod o+w site-library 24 | 25 | cd ~ 26 | mkdir -p .R 27 | echo 'CFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 28 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 29 | 30 | Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependencies=TRUE, repos="https://cloud.r-project.org")' 31 | 32 | 33 | # install dplyr 34 | Rscript -e 'devtools::install_github(c("tidyverse/readr","tidyverse/dplyr"))' 35 | 36 | # install data.table 37 | Rscript -e 'install.packages("data.table", repos="https://rdatatable.gitlab.io/data.table/")' 38 | 39 | -------------------------------------------------------------------------------- /_setup_utils/setup_small.sh: -------------------------------------------------------------------------------- 1 | # full repro on Ubuntu 22.04 2 | 3 | # update the key 4 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 51716619E084DAB9 5 | ## Install libraries 6 | 7 | sudo apt-get -qq update 8 | sudo apt upgrade 9 | 10 | sudo apt-get -qq install make 11 | 12 | sudo apt-get -qq install wget curl openssl build-essential 13 | sudo apt-get -qq install -y r-base-dev virtualenv 14 | sudo apt-get -qq install openjdk-17-jdk 15 | 16 | sudo apt-get install -y zlib1g-dev 17 | sudo apt-get install -y pandoc unzip 18 | 19 | # update virtualenv 20 | python3 -m pip install virtualenv 21 | 22 | # sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 23 | # sudo add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" 24 | 25 | sudo chmod o+w /usr/local/lib/R/site-library 26 | 27 | Rscript -e 'install.packages(c("data.table", "dplyr", "knitr", "bit64"), dependencies=TRUE, repos="https://cloud.r-project.org")' 28 | 29 | mkdir -p ~/.R 30 | echo 'CFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 31 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars -------------------------------------------------------------------------------- /_setup_utils/sleep_and_run.sh: -------------------------------------------------------------------------------- 1 | while [ -f run.lock ] 2 | do 3 | sleep 1800 4 | done 5 | 6 | 7 | rm run.lock 8 | 9 | ./run.sh 10 | -------------------------------------------------------------------------------- /_utils/answers-validation.R: -------------------------------------------------------------------------------- 1 | source("report.R") 2 | d = time_logs() 3 | 4 | # this script is meant to detect inconsistencies within a single solution's results and between solutions' results 5 | # note that known exceptions have already been filtered out in report.R in the clean_time function 6 | 7 | check = list() 8 | 9 | # detect lack of consistency in query output within single benchmark runs within each solution separately 10 | grain = c("solution","task","data","iquestion") 11 | d[!is.na(out_rows), .(unqn_out_rows=uniqueN(out_rows), unq_out_rows=paste(unique(out_rows), collapse=",")), by=grain 12 | ][unqn_out_rows>1L 13 | ] -> check[["solution_out_rows"]] 14 | 15 | # detect lack of out_rows match in query output between solutions 16 | grain = c("task","data","iquestion","question") 17 | d[!is.na(out_rows), 
.(unqn_out_rows=uniqueN(out_rows), unq_out_rows=paste(unique(out_rows), collapse=",")), by=grain 18 | ][unqn_out_rows>1L 19 | ] -> check[["out_rows"]] 20 | # detect lack of chk match in query output between median chk from all solutions with tolerance=0.005 21 | chk_check = function(chk, tolerance=sqrt(.Machine$double.eps)) { 22 | len = unique(sapply(chk, length)) 23 | if (length(len)!=1L) stop("some solutions returns chk for less variables than others") 24 | med = sapply(seq.int(len), function(i) median(sapply(chk, `[[`, i))) 25 | eq_txt = sapply(chk, all.equal, med, tolerance=tolerance, simplify=FALSE) 26 | #if (any(!sapply(eq_txt, isTRUE))) browser() 27 | eq = sapply(eq_txt, isTRUE) 28 | ans = list() 29 | ans$n_match = sum(eq) 30 | ans$n_mismatch = sum(!eq) 31 | ans$med_chk = paste0(format(med, scientific=FALSE, trim=TRUE), collapse=";") 32 | ans$sol_mismatch = if (!ans$n_mismatch) NA_character_ else paste0(names(eq)[!eq], collapse=",") 33 | ans$sol_chk_mismatch = if (!ans$n_mismatch) NA_character_ else paste(paste0(names(eq)[!eq], ":", sapply(sapply(chk[names(eq)[!eq]], format, scientific=FALSE, trim=TRUE, simplify=FALSE), paste, collapse=";")), collapse=",") 34 | ans 35 | } 36 | (if (nrow(check[["solution_chk"]])) NULL else { # only proceed if chk was not mismatched within a solution 37 | d[!is.na(chk) & solution!="cudf", # cudf chk validation disabled due to issue described in model_time() in report.R 38 | .(unqn_chk=uniqueN(chk), chk=unique(chk)), by=c("solution", grain) 39 | ][, if (any(unqn_chk>1L)) stop("this check should not be performed, should be escaped in 'if' branch") else .SD # ensure chk is unique 40 | ][, .(chk, chk_l=sapply(strsplit(chk, ";", fixed=TRUE), as.numeric, simplify=FALSE)), by=c("solution", grain) 41 | ][, chk_check(setNames(chk_l, solution), tolerance=0.005), keyby=grain 42 | ][n_mismatch>0L] 43 | }) -> check[["chk"]] 44 | 45 | # detect solutions for which chk calculation timing was relatively big comparing to query timing 46 | grain = c("solution","task","data","iquestion","question") 47 | d[, .(time_sec_1, chk_time_sec_1, time_sec_2, chk_time_sec_2, time_to_chk_1=time_sec_1/chk_time_sec_1, time_to_chk_2=time_sec_2/chk_time_sec_2), by=grain 48 | ][!(time_to_chk_1>2.5 & time_to_chk_2>2.5) # spark chk is only 2.6+ times faster than query 49 | ] -> check[["chk_time_sec"]] 50 | 51 | # print results 52 | if (any(sapply(check, nrow))) { 53 | cat("db-benchmark answers consistency check failed, see details below\n") 54 | print(check) 55 | } else { 56 | cat("db-benchmark answers consistency check successfully passed\n") 57 | } 58 | -------------------------------------------------------------------------------- /_utils/compare-data.table.R: -------------------------------------------------------------------------------- 1 | source("_utils/time.R") 2 | if (system("tail -1 time.csv | cut -d',' -f2", intern=TRUE)!="1621364165") 3 | stop("time.csv and logs.csv should be as of 1621364165 batch run, filter out newer rows in those files") 4 | 5 | ## groupby ---- 6 | 7 | d = tail.time("data.table", "groupby", i=c(1L, 2L)) 8 | setnames(d, c("20210517_2f2f62d","20210518_2f2f62d"), c("th_40","th_20")) 9 | if (nrow(d[(is.na(th_40) & !is.na(th_20)) | (!is.na(th_40) & is.na(th_20))])) { 10 | stop("number of threads had an impact on completion of queries") 11 | } else { 12 | d = d[!is.na(th_40)] 13 | } 14 | d[, th_40_20:=th_40/th_20] 15 | 16 | ## improvement 17 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows)] 18 | # in_rows mean median 19 | #1: 1e7 1.0242721 
0.9609988 20 | #2: 1e8 0.9378870 0.9455267 21 | #3: 1e9 0.9506561 0.9569359 22 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(knasorted)] 23 | # knasorted mean median 24 | #1: 1e2 cardinality factor, 0% NAs, unsorted data 1.0393667 0.9538973 25 | #2: 1e1 cardinality factor, 0% NAs, unsorted data 0.9521915 0.9544223 26 | #3: 2e0 cardinality factor, 0% NAs, unsorted data 0.9604950 0.9569359 27 | #4: 1e2 cardinality factor, 0% NAs, pre-sorted data 0.9371154 0.9487804 28 | #5: 1e2 cardinality factor, 5% NAs, unsorted data 0.9678192 0.9598999 29 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(question_group)] 30 | # question_group mean median 31 | #1: basic 0.9548596 0.9301310 32 | #2: advanced 0.9897345 0.9806791 33 | 34 | ## worst case by data 35 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.max(mean)] 36 | # in_rows knasorted mean median 37 | #1: 1e7 1e2 cardinality factor, 0% NAs, unsorted data 1.239259 0.9620776 38 | ## best case by data 39 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.min(mean)] 40 | # in_rows knasorted mean median 41 | #1: 1e8 1e2 cardinality factor, 0% NAs, unsorted data 0.9235102 0.9200373 42 | 43 | ## worst case for single question 44 | d[which.max(th_40_20)] 45 | # in_rows knasorted question_group question th_40 th_20 th_40_20 46 | #1: 1e7 1e2 cardinality factor, 0% NAs, unsorted data basic sum v1 by id1:id2 0.413 0.118 3.5 47 | ## best case for single question 48 | d[which.min(th_40_20)] 49 | # in_rows knasorted question_group question th_40 th_20 th_40_20 50 | #1: 1e9 1e2 cardinality factor, 5% NAs, unsorted data basic sum v1 mean v3 by id3 15.22 21.104 0.7211903 51 | 52 | ## join ---- 53 | 54 | d = tail.time("data.table", "join", i=c(1L, 2L)) 55 | setnames(d, c("20210517_2f2f62d","20210518_2f2f62d"), c("th_40","th_20")) 56 | if (nrow(d[(is.na(th_40) & !is.na(th_20)) | (!is.na(th_40) & is.na(th_20))])) { 57 | stop("number of threads had an impact on completion of queries") 58 | } else { 59 | d = d[!is.na(th_40)] 60 | } 61 | d[, th_40_20:=th_40/th_20] 62 | 63 | ## improvement 64 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows)] 65 | # in_rows mean median 66 | #1: 1e7 1.0149302 1.0000000 67 | #2: 1e8 0.9143243 0.9008573 68 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(knasorted)] 69 | # knasorted mean median 70 | #1: 0% NAs, unsorted data 0.9385902 0.9144130 71 | #2: 5% NAs, unsorted data 0.9612286 0.9294773 72 | #3: 0% NAs, pre-sorted data 0.9940629 0.9705720 73 | 74 | ## worst case by data 75 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.max(mean)] 76 | # in_rows knasorted mean median 77 | #1: 1e7 0% NAs, pre-sorted data 1.055906 1.05 78 | ## best case by data 79 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.min(mean)] 80 | # in_rows knasorted mean median 81 | #1: 1e8 0% NAs, unsorted data 0.8983325 0.8773762 82 | 83 | ## worst case for single question 84 | d[which.max(th_40_20)] 85 | # in_rows knasorted question th_40 th_20 th_40_20 86 | #1: 1e7 5% NAs, unsorted data medium inner on factor 0.513 0.443 1.158014 87 | ## best case for single question 88 | d[which.min(th_40_20)] 89 | # in_rows knasorted question th_40 th_20 th_40_20 90 | #1: 1e8 0% NAs, unsorted data medium outer on int 8.143 9.558 0.8519565 91 | -------------------------------------------------------------------------------- /_utils/download_data.sh: 
-------------------------------------------------------------------------------- 1 | 2 | # get small data 3 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby_small.duckdb 4 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'G1_1e7_1e2_0_0.csv' (FORMAT CSV)" 5 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'G1_1e7_1e1_0_0.csv' (FORMAT CSV)" 6 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'G1_1e7_2e0_0_0.csv' (FORMAT CSV)" 7 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'G1_1e7_1e2_0_1.csv' (FORMAT CSV)" 8 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'G1_1e7_1e2_5_0.csv' (FORMAT CSV)" 9 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'G1_1e8_1e2_0_0.csv' (FORMAT CSV)" 10 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'G1_1e8_1e1_0_0.csv' (FORMAT CSV)" 11 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'G1_1e8_2e0_0_0.csv' (FORMAT CSV)" 12 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'G1_1e8_1e2_0_1.csv' (FORMAT CSV)" 13 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'G1_1e8_1e2_5_0.csv' (FORMAT CSV)" 14 | 15 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join_small.duckdb 16 | 17 | # get large data 18 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join_large.duckdb 19 | 20 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby_large.duckdb 21 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'G1_1e9_1e2_0_0.csv' (FORMAT CSV)" 22 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'G1_1e9_1e1_0_0.csv' (FORMAT CSV)" 23 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'G1_1e9_2e0_0_0.csv' (FORMAT CSV)" 24 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'G1_1e9_1e2_0_1.csv' (FORMAT CSV)" 25 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'G1_1e9_1e2_5_0.csv' (FORMAT CSV)" 26 | 27 | # get 500GB data 28 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join-500gb.duckdb 29 | 30 | # ??? 31 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby-500gb.duckdb -------------------------------------------------------------------------------- /_utils/generate-data-small.sh: -------------------------------------------------------------------------------- 1 | # Data generation for groupby 0.5GB 2 | 3 | mkdir -p data 4 | cd data/ 5 | Rscript ../_data/groupby-datagen.R 1e7 1e2 0 0 6 | Rscript ../_data/groupby-datagen.R 1e7 1e2 15 0 7 | Rscript ../_data/join-datagen.R 1e7 0 0 0 8 | 9 | cp G1_1e7_1e2_0_0.csv G1_1e9_1e2_0_0.csv 10 | cp J1_1e7_1e1_0_0.csv J1_1e9_1e3_0_0.csv 11 | cp J1_1e7_1e4_0_0.csv J1_1e9_1e6_0_0.csv 12 | cp J1_1e7_1e7_0_0.csv J1_1e9_1e9_0_0.csv 13 | cp J1_1e7_NA_0_0.csv J1_1e9_NA_0_0.csv 14 | 15 | cd .. 
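# note: the 1e9-named files above are plain copies of the 1e7 data, presumably so the 1e9 entries
# written into _control/data.csv below can be exercised as a quick smoke test without generating
# the real large datasets.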
16 | 17 | # don't publish, we dont even have the keys 18 | sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf 19 | 20 | # set sizes 21 | mv _control/data.csv _control/data.csv.original 22 | 23 | echo "task,data,nrow,k,na,sort,active" > _control/data.csv 24 | echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv 25 | echo "groupby,G1_1e7_1e2_15_0,1e7,1e2,15,0,1" >> _control/data.csv 26 | echo "groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1" >> _control/data.csv 27 | echo "join,J1_1e7_NA_0_0,1e7,NA,0,0,1" >> _control/data.csv 28 | echo "join,J1_1e9_NA_0_0,1e9,NA,0,0,1" >> _control/data.csv 29 | -------------------------------------------------------------------------------- /_utils/groupby_k_factor.csv: -------------------------------------------------------------------------------- 1 | K,in_rows,question,out_rows 2 | 1e2,10000000,q1,100 3 | 1e2,10000000,q2,10000 4 | 1e2,10000000,q3,100000 5 | 1e2,10000000,q4,100 6 | 1e2,10000000,q5,100000 7 | 1e2,100000000,q1,100 8 | 1e2,100000000,q2,10000 9 | 1e2,100000000,q3,1000000 10 | 1e2,100000000,q4,100 11 | 1e2,100000000,q5,1000000 12 | 1e2,1000000000,q1,100 13 | 1e2,1000000000,q2,10000 14 | 1e2,1000000000,q3,10000000 15 | 1e2,1000000000,q4,100 16 | 1e2,1000000000,q5,10000000 17 | 1e1,10000000,q1,10 18 | 1e1,10000000,q2,100 19 | 1e1,10000000,q3,999951 20 | 1e1,10000000,q4,10 21 | 1e1,10000000,q5,999969 22 | 1e1,100000000,q1,10 23 | 1e1,100000000,q2,100 24 | 1e1,100000000,q3,9999518 25 | 1e1,100000000,q4,10 26 | 1e1,100000000,q5,9999512 27 | 1e1,1000000000,q1,10 28 | 1e1,1000000000,q2,100 29 | 1e1,1000000000,q3,99995425 30 | 1e1,1000000000,q4,10 31 | 1e1,1000000000,q5,99995357 32 | 2e0,10000000,q1,2 33 | 2e0,10000000,q2,4 34 | 2e0,10000000,q3,4323484 35 | 2e0,10000000,q4,2 36 | 2e0,10000000,q5,4323579 37 | 2e0,100000000,q1,2 38 | 2e0,100000000,q2,4 39 | 2e0,100000000,q3,43231389 40 | 2e0,100000000,q4,2 41 | 2e0,100000000,q5,43232226 42 | 2e0,1000000000,q1,2 43 | 2e0,1000000000,q2,4 44 | 2e0,1000000000,q3,431884560 45 | 2e0,1000000000,q4,2 46 | 2e0,1000000000,q5,431876300 47 | -------------------------------------------------------------------------------- /_utils/maintainer.R: -------------------------------------------------------------------------------- 1 | timeleft = function() { 2 | l = data.table::fread("logs.csv") 3 | if (!nrow(l)) 4 | stop("logs.csv files is empty") 5 | this = l[.N] 6 | if (this$action=="finish") { 7 | this[, cat(sprintf("%s %s %s must have just finished\n", solution, task, data))] 8 | quit("no") 9 | } 10 | stopifnot(this$action=="start") 11 | l = l[-.N][action!="skip", data.table::dcast(.SD, solution+task+data+batch~action, value.var="timestamp")] 12 | took = l[this, on=.(solution, task, data), nomatch=NULL, finish[.N]-start[.N]] 13 | if (!length(took) || is.na(took)) { 14 | this[, cat(sprintf("%s %s %s is running for the first time so it is unknown how much it will run\n", solution, task, data))] 15 | quit("no") 16 | } 17 | stopifnot(took>0) 18 | now = trunc(as.numeric(Sys.time())) 19 | this[, cat(sprintf("%s %s %s should take around %ss more\n", solution, task, data, trunc(took-(now-timestamp))))] 20 | q("no") 21 | } 22 | -------------------------------------------------------------------------------- /_utils/maintainer.sh: -------------------------------------------------------------------------------- 1 | # returns time left by the currently run script, useful after touch pause|stop 2 | timeleft() { 3 | if [ ! 
-f ./run.lock ]; then 4 | echo "benchmark is not running now" >&2 && return 1 5 | fi 6 | Rscript -e 'source("_utils/maintainer.R"); timeleft()' 7 | } 8 | -------------------------------------------------------------------------------- /_utils/parse_time_logs.R: -------------------------------------------------------------------------------- 1 | 2 | source("./_report/report.R", chdir=TRUE) 3 | source("./_helpers/helpers.R", chdir=TRUE) 4 | source("./_benchplot/benchplot.R", chdir=TRUE) 5 | source("./_benchplot/benchplot-dict.R", chdir=TRUE) 6 | ld = time_logs() -------------------------------------------------------------------------------- /_utils/partitioned_run.sh: -------------------------------------------------------------------------------- 1 | ./_run/run_small_medium.sh 2 | ./_run/run_large.sh -------------------------------------------------------------------------------- /_utils/sql_to_check_timings/timing_checks.sql: -------------------------------------------------------------------------------- 1 | -- run this in duckdb 2 | 3 | create table timings as select * from read_csv_auto('reports/oct_25/time.csv'); 4 | 5 | 6 | -- check what solutions might have bad out rows 7 | select t1.question, t1.data, t1.out_rows, t1.solution, t2.out_rows, t2.solution from 8 | timings t1, timings t2 9 | where t1.out_rows != t2.out_rows 10 | and t1.question = t2.question 11 | and t1.solution != 'clickhouse' 12 | and t2.solution != 'clickhouse' 13 | and t1.task = t2.task 14 | -- and t1.task = 'groupby' 15 | -- and t1.solution != 'arrow' 16 | -- and t2.solution != 'arrow' 17 | and t2.solution != 'datafusion' 18 | and t1.question != 'sum v3 count by id1:id6' 19 | and t1.data != 'G1_1e8_1e2_5_0' 20 | and t1.data = t2.data ; 21 | 22 | 23 | -- Value of 'chk' varies for different runs for single solution+question 24 | create table timings as select * from read_csv('time.csv'); 25 | 26 | select t1.chk, t2.chk, t1.solution, t2.solution from 27 | timings t1, timings t2 28 | where t1.chk != t2.chk 29 | and t1.question = t2.question 30 | and t1.task = t2.task 31 | and t1.solution != 'datafusion' 32 | and t2.solution != 'datafusion' 33 | and t1.solution != 'arrow' 34 | and t2.solution != 'arrow' 35 | and t1.solution != 'R-arrow' 36 | and t2.solution != 'R-arrow' 37 | and t1.solution != 'collapse' 38 | and t1.solution = t2.solution 39 | and t1.data = t2.data group by all; 40 | 41 | 42 | select t1.question, t1.data, t1.out_rows, t2.solution, t2.out_rows from 43 | timings t1, timings t2 44 | where t1.out_rows != t2.out_rows 45 | and t1.question = t2.question 46 | and t1.solution != 'clickhouse' 47 | and t2.solution != 'clickhouse' 48 | and t1.question = 'medium outer on int' 49 | and t1.data = t2.data; -------------------------------------------------------------------------------- /_utils/time.R: -------------------------------------------------------------------------------- 1 | source("./_report/report.R") 2 | 3 | download.time = function(file=c("logs.csv","time.csv"), from="https://h2oai.github.io/db-benchmark") { 4 | stopifnot(is.character(file), is.character(from), length(file)>=1L, length(from)==1L, !is.na(file), !is.na(from)) 5 | if (all(file.exists(file))) { 6 | md5file = paste(file, "md5", sep=".") 7 | download.file(file.path(from, md5file), destfile=md5file) 8 | upstream = sapply(strsplit(sapply(setNames(md5file, file), readLines), split=" ", fixed=TRUE), `[[`, 1L) 9 | current = tools::md5sum(file) 10 | new = current[names(upstream)] != upstream 11 | file = names(new)[new] 12 | if (!length(file)) { 13 | 
cat("nothing to download, md5sum of local files match the upstream md5sum\n") 14 | return(invisible(NULL)) 15 | } 16 | } 17 | download.file(file.path(from, file), destfile=file) 18 | return(invisible(NULL)) 19 | } 20 | 21 | drop.data.table = function(x, cols) { 22 | ans = data.table:::shallow(x) 23 | un = sapply(cols, function(col) uniqueN(x[[col]])) 24 | rm = names(un)[un <= 1L] 25 | if (length(rm)) set(ans, NULL, rm, NULL) # Rdatatable/data.table#4086 26 | ans 27 | } 28 | 29 | tail.time = function(solution, task, n=2L, i=seq_len(n), drop=TRUE) { 30 | stopifnot(length(solution)==1L, length(task)==1L, length(n)==1L, n>0L, length(i)>=1L, all(i>=0L)) 31 | if (!missing(n) && !missing(i)) stop("only 'n' or 'i' argument should be used, not both") 32 | ld = time_logs() 33 | s = solution 34 | t = task 35 | ld = ld[solution==s & task==t] 36 | ub = unique(ld$batch) 37 | i = i[i <= length(ub)] # there might be only N unq batches but N+1 requested 38 | if (!length(i)) stop("there are not enough registered runs for this solution and requested recent timings") 39 | b = rev(ub)[i] 40 | ans = dcast( 41 | ld[batch%in%b], 42 | in_rows + knasorted + question_group + question ~ paste(format(as.POSIXct(as.numeric(batch), origin="1970-01-01"), "%Y%m%d"), substr(git, 1, 7), sep="_"), 43 | value.var = "time_sec_1" 44 | ) 45 | if (drop) ans = drop.data.table(ans, cols=c("in_rows","knasorted","question_group","question")) 46 | ans 47 | } 48 | 49 | compare.time = function(solutions, task, drop=TRUE) { 50 | stopifnot(length(solutions)>=1L, length(task)==1L) 51 | ld = time_logs() 52 | t = task 53 | ans = dcast( 54 | ld[script_recent==TRUE & solution%in%solutions & task==t], 55 | in_rows + knasorted + question_group + question ~ solution, 56 | value.var = "time_sec_1" 57 | ) 58 | if (drop) ans = drop.data.table(ans, cols=c("in_rows","knasorted","question_group","question")) 59 | ans 60 | } 61 | 62 | ## maintainer mode 63 | #scp -C mr-dl11:~/git/db-benchmark/logs.csv ~/git/db-benchmark/logs.csv && scp -C mr-dl11:~/git/db-benchmark/time.csv ~/git/db-benchmark/time.csv 64 | 65 | ## user mode 66 | #download.time() 67 | #tail.time("juliadf", "groupby", i=c(1L, 2L)) 68 | #tail.time("data.table", "groupby", i=c(1L, 2L)) 69 | #compare.time(c("data.table","spark","pydatatable"), "join") 70 | -------------------------------------------------------------------------------- /_utils/validate_no_errors.sh: -------------------------------------------------------------------------------- 1 | if [ $(grep -i 'error|exception' out/run_*.err | wc -l) = 0 ] 2 | then 3 | # no true errors found, print last line of each output script 4 | echo "No Errors found in run_*.err logs" 5 | else 6 | echo "The following errors have been found. Failing check" 7 | grep -i "error|exception" out/*.err 8 | exit 1 9 | fi 10 | 11 | 12 | 13 | # check report generation. 
If this fails, the logs.csv/time.csv 14 | # have errors 15 | Rscript _utils/parse_time_logs.R 2> report_check.txt 16 | # https://gist.github.com/jesugmz/3fda0fc7c1006cedfe039ff1459c3174 17 | output=$(wc -l report_check.txt | awk '{ print $1 }') 18 | if [ $output -ne 0 ] 19 | then 20 | echo "report check not empty" 21 | cat report_check.txt 22 | exit 1 23 | fi 24 | echo "time.csv and logs.csv can be parsed" 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /arrow/VERSION: -------------------------------------------------------------------------------- 1 | 13.0.0.1 2 | -------------------------------------------------------------------------------- /clickhouse/VERSION: -------------------------------------------------------------------------------- 1 | 25.1.3.23 -------------------------------------------------------------------------------- /clickhouse/ch.sh: -------------------------------------------------------------------------------- 1 | ch_installed() { 2 | dpkg-query -Wf'${db:Status-abbrev}' clickhouse-server 2>/dev/null | grep -q '^i' 3 | } 4 | 5 | ch_active() { 6 | clickhouse-client --user db_benchmark --query="SELECT 0;" > /dev/null 2>&1 7 | local ret=$?; 8 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi; 9 | } 10 | 11 | ch_wait() { 12 | for _ in $(seq 1 60); do if [[ $(wget -q 'localhost:8123' -O-) == 'Ok.' ]]; then break ; else sleep 1; fi ; done 13 | ch_active 14 | } 15 | 16 | ch_start() { 17 | echo '# ch_start: starting clickhouse-server' 18 | sudo service clickhouse-server start 19 | ch_wait 20 | } 21 | 22 | ch_stop() { 23 | echo '# ch_stop: stopping clickhouse-server' 24 | sudo service clickhouse-server stop && sleep 15 25 | } 26 | 27 | ch_query() { 28 | ENGINE=Memory 29 | if [ $COMPRESS -eq 1 ]; then 30 | ENGINE="Memory settings compress=1" 31 | fi 32 | if [ $ON_DISK -eq 1 ]; then 33 | ENGINE="MergeTree ORDER BY tuple()" 34 | fi 35 | sudo touch '/var/lib/clickhouse/flags/force_drop_table' && sudo chmod 666 '/var/lib/clickhouse/flags/force_drop_table' 36 | clickhouse-client --user db_benchmark --query "DROP TABLE IF EXISTS ans;" 37 | clickhouse-client --user db_benchmark --log_comment ${RUNNAME} --query "CREATE TABLE ans ENGINE = ${ENGINE} AS ${QUERY} SETTINGS max_insert_threads=${THREADS}, max_threads=${THREADS};" 38 | local ret=$?; 39 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi; 40 | clickhouse-client --user db_benchmark --query "SELECT * FROM ans LIMIT 3;" 41 | sudo touch '/var/lib/clickhouse/flags/force_drop_table' && sudo chmod 666 '/var/lib/clickhouse/flags/force_drop_table' 42 | clickhouse-client --user db_benchmark --query "DROP TABLE ans;" 43 | } 44 | 45 | ch_logrun() { 46 | clickhouse-client --user db_benchmark --query "SYSTEM FLUSH LOGS;" 47 | clickhouse-client --user db_benchmark --query "SELECT ${RUN} AS run, toUnixTimestamp(now()) AS timestamp, '${TASK}' AS task, '${SRC_DATANAME}' AS data_name, NULL AS in_rows, '${QUESTION}' AS question, result_rows AS out_rows, NULL AS out_cols, 'clickhouse' AS solution, version() AS version, NULL AS git, '${FUNCTION}' AS fun, query_duration_ms/1000 AS time_sec, memory_usage/1073741824 AS mem_gb, 1 AS cache, NULL AS chk, NULL AS chk_time_sec, 1 AS on_disk FROM system.query_log WHERE type='QueryFinish' AND log_comment='${RUNNAME}' ORDER BY 
query_start_time DESC LIMIT 1 FORMAT CSVWithNames;" > clickhouse/log/${RUNNAME}.csv 48 | local ret=$?; 49 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi; 50 | } 51 | 52 | ch_make_2_runs() { 53 | RUN=1 54 | RUNNAME="${TASK}_${SRC_DATANAME}_q${Q}_r${RUN}" 55 | ch_query 56 | ch_logrun 57 | 58 | if [ $COMPRESS -eq 1 ]; then 59 | # It will take some time for memory freed by Memory engine to be returned back to the system. 60 | # Without a sleep we might get a MEMORY_LIMIT exception during the second run of the query. 61 | # It is done only when $COMPRESS=1 because this variable is set to true only for the largest dataset. 62 | sleep 60 63 | fi 64 | 65 | RUN=2 66 | RUNNAME="${TASK}_${SRC_DATANAME}_q${Q}_r${RUN}" 67 | ch_query 68 | ch_logrun 69 | } 70 | -------------------------------------------------------------------------------- /clickhouse/clickhouse-misc.sh: -------------------------------------------------------------------------------- 1 | CREATE USER IF NOT EXISTS db_benchmark IDENTIFIED WITH no_password SETTINGS max_memory_usage = 28000000000 WRITABLE; 2 | GRANT select, insert, create, alter, alter user, drop on *.* to db_benchmark; 3 | 4 | ALTER USER db_benchmark SETTINGS max_memory_usage_for_user = 28000000000; 5 | 6 | 7 | CREATE TABLE G1_1e9_1e1_0_0 (id1 LowCardinality(Nullable(String)), id2 LowCardinality(Nullable(String)), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple(); 8 | 9 | INSERT INTO G1_1e9_1e1_0_0 FROM INFILE 'data/G1_1e9_1e1_0_0.csv'; 10 | 11 | # q1 12 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, sum(v1) AS v1 FROM G1_1e9_1e1_0_0 GROUP BY id1 SETTINGS max_insert_threads=32, max_threads=32; 13 | 14 | drop table if exists ans; 15 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, id2, sum(v1) AS v1 FROM G1_1e9_1e1_0_0 GROUP BY id1, id2 SETTINGS max_insert_threads=32, max_threads=32; 16 | 17 | drop table if exists ans; 18 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id3 SETTINGS max_insert_threads=16, max_threads=16; 19 | 20 | drop table if exists ans; 21 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id4, avg(v1) AS v1, avg(v2) AS v2, avg(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id4 SETTINGS max_insert_threads=32, max_threads=32; 22 | 23 | drop table if exists ans; 24 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id6 SETTINGS max_insert_threads=32, max_threads=32; 25 | 26 | drop table if exists ans; 27 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id4, id5, medianExact(v3) AS median_v3, stddevPop(v3) AS sd_v3 FROM G1_1e9_1e1_0_0 GROUP BY id4, id5 SETTINGS max_insert_threads=32, max_threads=32; 28 | 29 | drop table if exists ans; 30 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id3, max(v1) - min(v2) AS range_v1_v2 FROM G1_1e9_1e1_0_0 GROUP BY id3 SETTINGS max_insert_threads=32, max_threads=32; 31 | 32 | drop table if exists ans; 33 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM G1_1e9_1e1_0_0 WHERE v3 IS NOT NULL) AS subq GROUP BY id6 
SETTINGS max_insert_threads=32, max_threads=32; 34 | 35 | drop table if exists ans; 36 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM G1_1e9_1e1_0_0 GROUP BY id2, id4 SETTINGS max_insert_threads=32, max_threads=32; 37 | 38 | drop table if exists ans; 39 | 40 | #q10 41 | 42 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM G1_1e9_1e1_0_0 GROUP BY id1, id2, id3, id4, id5, id6 SETTINGS max_insert_threads=32, max_threads=32; -------------------------------------------------------------------------------- /clickhouse/clickhouse-mount-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | /var/lib/mount/clickhouse-nvme-mount/ 4 | 0 5 | 0 6 | 7 | -------------------------------------------------------------------------------- /clickhouse/clickhouse-parse-log.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# clickhouse-parse-log.R: starting to parse timings from clickhouse/log/.\n") 4 | 5 | source("./_helpers/helpers.R") 6 | args = commandArgs(TRUE) # args = c("groupby","G1_1e6_1e2_0_0") 7 | stopifnot(length(args)==2L) 8 | task = args[1L] 9 | data_name = args[2L] 10 | machine_type = Sys.getenv("MACHINE_TYPE") 11 | 12 | library(data.table) 13 | # sort files according to question and run 14 | sort_q_r = function(f) { 15 | tmp = strsplit(tools::file_path_sans_ext(basename(f)), "_", fixed=TRUE) 16 | if (length(len<-unique(lengths(tmp)))!=1L) 17 | stop("files names for some of logs differs in number of underscores, it should be clickhouse/log/[task]_[data_name]_q[i]_r[j].csv") 18 | stopifnot(len>1L) 19 | qr = rbindlist(lapply(lapply(tmp, `[`, c(len-1L,len)), function(x) { 20 | stopifnot(substr(x[1L], 1L, 1L)=="q", substr(x[2L], 1L, 1L)=="r") 21 | list(q=as.integer(substr(x[1L], 2L, nchar(x[1L]))), r=as.integer(substr(x[2L], 2L, nchar(x[2L])))) 22 | })) 23 | o = data.table:::forderv(qr) ## https://github.com/Rdatatable/data.table/issues/3447 24 | if (!length(o)) f else f[o] 25 | } 26 | fcsv = list.files("clickhouse/log", sprintf("^%s_%s_q.*\\.csv$", task, data_name), full.names=TRUE) 27 | if (!length(fcsv)) 28 | stop("no log files produced, did you run clickhouse sql script that will output such to clickhouse/log/[task]_[data_name]_q[i]_r[j].csv") 29 | fcsv = sort_q_r(fcsv) 30 | d = rbindlist(lapply(fcsv, fread, na.strings="\\N")) # fill=TRUE for debugging type column in some queries 31 | if (!nrow(d)) 32 | stop("timing log files empty") 33 | stopifnot(all(d$task==task), all(d$data_name==data_name)) 34 | .in_rows = strsplit(data_name, "_", fixed=TRUE)[[1L]][[2L]] ## taken from data_name because for join CH will sum in rows from both tables 35 | d[, 36 | write.log(run=as.integer(run), timestamp=as.numeric(timestamp), task=as.character(task), data=as.character(data_name), in_rows=as.numeric(.in_rows), question=as.character(question), 37 | out_rows=as.numeric(NA), out_cols=as.integer(NA), solution=as.character(solution), version=as.character(version), git=as.character(NA), fun=as.character(fun), 38 | time_sec=as.numeric(time_sec), mem_gb=as.numeric(NA), cache=as.logical(cache), chk=as.character(NA), chk_time_sec=as.numeric(NA), on_disk=as.logical(on_disk), machine_type=as.character(machine_type)), 39 | by = seq_len(nrow(d))] -> nul 40 | 41 | cat("# clickhouse-parse-log.R: parsing timings to time.csv finished\n") 42 | 43 | if (!interactive()) 
q("no") 44 | -------------------------------------------------------------------------------- /clickhouse/groupby-clickhouse.sh: -------------------------------------------------------------------------------- 1 | source ./clickhouse/ch.sh 2 | 3 | SOLUTION=clickhouse 4 | TASK=groupby 5 | 6 | # /* q1: question='sum v1 by id1' */ 7 | 8 | Q=1 9 | QUESTION="sum v1 by id1" 10 | QUERY="SELECT id1, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1" 11 | 12 | ch_make_2_runs 13 | 14 | # /* q2: question='sum v1 by id1:id2' */ 15 | Q=2 16 | QUESTION="sum v1 by id1:id2" 17 | QUERY="SELECT id1, id2, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1, id2" 18 | 19 | ch_make_2_runs 20 | 21 | # /* q3: question='sum v1 mean v3 by id3' */ 22 | Q=3 23 | QUESTION="sum v1 mean v3 by id3" 24 | QUERY="SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id3" 25 | 26 | ch_make_2_runs 27 | 28 | # /* q4: question='mean v1:v3 by id4' */ 29 | Q=4 30 | QUESTION="mean v1:v3 by id4" 31 | QUERY="SELECT id4, avg(v1) AS v1, avg(v2) AS v2, avg(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id4" 32 | 33 | ch_make_2_runs 34 | 35 | # /* q5: question='sum v1:v3 by id6' */ 36 | Q=5 37 | QUESTION="sum v1:v3 by id6" 38 | QUERY="SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id6" 39 | 40 | ch_make_2_runs 41 | 42 | # /* q6: question='median v3 sd v3 by id4 id5' */ 43 | Q=6 44 | QUESTION="median v3 sd v3 by id4 id5" 45 | QUERY="SELECT id4, id5, medianExact(v3) AS median_v3, stddevPop(v3) AS sd_v3 FROM ${SRC_DATANAME} GROUP BY id4, id5" 46 | 47 | ch_make_2_runs 48 | 49 | # /* q7: question='max v1 - min v2 by id3' */ 50 | Q=7 51 | QUESTION="max v1 - min v2 by id3" 52 | QUERY="SELECT id3, max(v1) - min(v2) AS range_v1_v2 FROM ${SRC_DATANAME} GROUP BY id3" 53 | 54 | ch_make_2_runs 55 | 56 | # /* q8: question='largest two v3 by id6' */ 57 | Q=8 58 | QUESTION="largest two v3 by id6" 59 | QUERY="SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM ${SRC_DATANAME} WHERE v3 IS NOT NULL) AS subq GROUP BY id6" 60 | 61 | ch_make_2_runs 62 | 63 | # /* q9: question='regression v1 v2 by id2 id4' */ 64 | Q=9 65 | QUESTION="regression v1 v2 by id2 id4" 66 | QUERY="SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM ${SRC_DATANAME} GROUP BY id2, id4" 67 | 68 | ch_make_2_runs 69 | 70 | # /* q10: question='sum v3 count by id1:id6' */ 71 | Q=10 72 | QUESTION="sum v3 count by id1:id6" 73 | QUERY="SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM ${SRC_DATANAME} GROUP BY id1, id2, id3, id4, id5, id6" 74 | 75 | ch_make_2_runs 76 | -------------------------------------------------------------------------------- /clickhouse/join-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source ./clickhouse/ch.sh 4 | 5 | SOLUTION=clickhouse 6 | TASK=join 7 | 8 | echo SRC ${SRC_DATANAME} RHS1 ${RHS1} RHS2 ${RHS2} RHS3 ${RHS3} COMPRESS ${COMPRESS} THREADS ${THREADS} 9 | 10 | # /* q1: question='small inner on int' */ 11 | Q=1 12 | QUESTION="small inner on int" 13 | QUERY="SELECT id1, x.id2, x.id3, x.id4, y.id4, x.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS1} AS y USING (id1)" 14 | ch_make_2_runs 15 | 16 | # /* q2: question='medium inner on int' */ 17 | Q=2 18 | QUESTION="medium inner on int" 19 | QUERY="SELECT x.id1, y.id1, id2, x.id3, x.id4, y.id4, x.id5, y.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS2} AS y USING (id2)" 20 | 
ch_make_2_runs 21 | 22 | # /* q3: question='medium outer on int' */ 23 | Q=3 24 | QUESTION="medium outer on int" 25 | QUERY="SELECT x.id1, y.id1, id2, x.id3, x.id4, y.id4, x.id5, y.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x LEFT JOIN ${RHS2} AS y USING (id2)" 26 | ch_make_2_runs 27 | 28 | # /* q4: question='medium inner on factor' */ 29 | Q=4 30 | QUESTION="medium inner on factor" 31 | QUERY="SELECT x.id1, y.id1, x.id2, y.id2, x.id3, x.id4, y.id4, id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS2} AS y USING (id5)" 32 | ch_make_2_runs 33 | 34 | # /* q5: question='big inner on int' */ 35 | Q=5 36 | QUESTION="big inner on int" 37 | QUERY="SELECT x.id1, y.id1, x.id2, y.id2, id3, x.id4, y.id4, x.id5, y.id5, x.id6, y.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS3} AS y USING (id3)" 38 | ch_make_2_runs 39 | -------------------------------------------------------------------------------- /clickhouse/setup-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install 3 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg 4 | curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg 5 | 6 | ARCH=$(dpkg --print-architecture) 7 | echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg arch=${ARCH}] https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list 8 | sudo apt-get update 9 | 10 | sudo apt-get install -y clickhouse-server clickhouse-client 11 | 12 | # stop server if service was already running 13 | sudo service clickhouse-server start ||: 14 | 15 | 16 | # modify clickhouse settings so data is stored on the mount. 17 | sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/ 18 | sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount 19 | 20 | # copy clickhouse config 21 | sudo cp -a /var/lib/clickhouse/. /var/lib/mount/clickhouse-nvme-mount/ 22 | sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml 23 | 24 | 25 | # start server 26 | sudo rm -rf /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse-server/clickhouse-server.log 27 | sudo service clickhouse-server start 28 | 29 | 30 | MEMORY_LIMIT=0 31 | BYTES_BEFORE_EXTERNAL_GROUP_BY=0 32 | if [[ $MACHINE_TYPE == "c6id.4xlarge" ]]; then 33 | MEMORY_LIMIT=28000000000 34 | BYTES_BEFORE_EXTERNAL_GROUP_BY=20000000000 35 | fi 36 | 37 | clickhouse-client --query "CREATE USER IF NOT EXISTS db_benchmark IDENTIFIED WITH no_password SETTINGS max_memory_usage = $MEMORY_LIMIT, max_bytes_before_external_group_by = $BYTES_BEFORE_EXTERNAL_GROUP_BY WRITABLE;" 38 | clickhouse-client --query "GRANT select, insert, create, alter, alter user, create table, truncate, drop, system flush logs on *.* to db_benchmark;" 39 | 40 | ./clickhouse/ver-clickhouse.sh 41 | -------------------------------------------------------------------------------- /clickhouse/upg-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest released 5 | echo 'upgrading clickhouse-server clickhouse-client...' 
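# Editor's sketch (not one of the repository scripts): after the ClickHouse setup
# or upgrade steps above, the db_benchmark user and its memory limits can be
# sanity-checked before launching a benchmark run, e.g.:
#   clickhouse-client --query "SHOW CREATE USER db_benchmark"
#   clickhouse-client --user db_benchmark --query "SELECT name, value FROM system.settings WHERE name IN ('max_memory_usage', 'max_bytes_before_external_group_by')"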
6 | sudo apt-get install --only-upgrade clickhouse-server clickhouse-client 7 | 8 | if [[ $TEST_RUN != "true" ]]; then 9 | sudo chown ubuntu:ubuntu clickhouse/VERSION 10 | sudo chown ubuntu:ubuntu clickhouse/REVISION 11 | fi 12 | 13 | 14 | # modify clickhouse settings so data is stored on the mount. 15 | # This is necessary when clickhouse is installed on a machine but the mount loses all data 16 | sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/ 17 | sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount 18 | 19 | # copy clickhouse config 20 | sudo cp -a /var/lib/clickhouse/. /var/lib/mount/clickhouse-nvme-mount/ 21 | sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml 22 | 23 | 24 | # start server 25 | sudo rm -rf /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse-server/clickhouse-server.log 26 | sudo service clickhouse-server start -------------------------------------------------------------------------------- /clickhouse/ver-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./clickhouse/ch.sh # clickhouse helper scripts 5 | 6 | ch_installed && clickhouse-client --version-clean > clickhouse/VERSION && echo "" > clickhouse/REVISION 7 | 8 | if [[ $TEST_RUN != "true" ]]; then 9 | sudo chown ubuntu:ubuntu clickhouse/VERSION 10 | sudo chown ubuntu:ubuntu clickhouse/REVISION 11 | fi -------------------------------------------------------------------------------- /collapse/VERSION: -------------------------------------------------------------------------------- 1 | 2.1.2 2 | -------------------------------------------------------------------------------- /collapse/groupby2014-collapse.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# groupby2014-collapse.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace("data.table", quietly=TRUE)) # collapse does not support integer64. Oversized ints will be summed to double.
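# Editorial aside, illustrative only: why the data.table dependency and the
# integer64 caveat above matter for checksums. Base R sum() on an integer vector
# overflows past .Machine$integer.max and returns NA, so checksums are taken
# either as double (exact up to 2^53) or, in the data.table/dplyr scripts, via
# bit64::as.integer64:
#   sum(rep(1000000000L, 3L))                        # NA plus an integer-overflow warning
#   sum(as.numeric(rep(1000000000L, 3L)))            # 3e+09 (double)
#   sum(bit64::as.integer64(rep(1000000000L, 3L)))   # 3000000000 (integer64)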
8 | .libPaths("./collapse/r-collapse") # tidyverse/collapse#4641 9 | suppressPackageStartupMessages(library("collapse", lib.loc="./collapse/r-collapse", warn.conflicts=FALSE)) 10 | ver = packageVersion("collapse") 11 | git = "" # uses stable version now #124 12 | task = "groupby2014" 13 | solution = "collapse" 14 | fun = "group_by" 15 | cache = TRUE 16 | on_disk = FALSE 17 | 18 | data_name = Sys.getenv("SRC_DATANAME") 19 | src_grp = file.path("data", paste(data_name, "csv", sep=".")) 20 | cat(sprintf("loading dataset %s\n", data_name)) 21 | 22 | x = data.table::fread(src_grp, showProgress=FALSE, data.table=FALSE) 23 | print(nrow(x)) 24 | gc() 25 | 26 | # Setting collapse options: namespace masking and performance 27 | oldopts <- set_collapse(nthreads = data.table::getDTthreads(), 28 | mask = "all", 29 | sort = endsWith(data_name, "_1"), 30 | na.rm = anyNA(num_vars(x)), 31 | stable.algo = FALSE) 32 | 33 | task_init = proc.time()[["elapsed"]] 34 | cat("grouping...\n") 35 | 36 | question = "sum v1 by id1" # q1 37 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1, sum))))[["elapsed"]] 38 | m = memory_usage() 39 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 40 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 41 | rm(ans) 42 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1, sum))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 45 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 46 | print(head(ans, 3)) 47 | print(tail(ans, 3)) 48 | rm(ans) 49 | 50 | question = "sum v1 by id1:id2" # q2 51 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1 + id2, sum))))[["elapsed"]] 52 | m = memory_usage() 53 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 54 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 55 | rm(ans) 56 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1 + id2, sum))))[["elapsed"]] 57 | m = memory_usage() 58 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 59 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 60 | print(head(ans, 3)) 61 | print(tail(ans, 3)) 62 | rm(ans) 63 | 64 | question = "sum v1 mean v3 by id3" # q3 65 | t = system.time(print(dim(ans<-collap(x, ~ id3, custom = list(sum = "v1", mean = "v3")))))[["elapsed"]] 66 | m = memory_usage() 67 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v3=sum(v3)))[["elapsed"]] 68 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 69 | rm(ans) 70 | t = 
system.time(print(dim(ans<-collap(x, ~ id3, custom = list(sum = "v1", mean = "v3")))))[["elapsed"]] 71 | m = memory_usage() 72 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v3=sum(v3)))[["elapsed"]] 73 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 74 | print(head(ans, 3)) 75 | print(tail(ans, 3)) 76 | rm(ans) 77 | 78 | question = "mean v1:v3 by id4" # q4 79 | t = system.time(print(dim(ans<-x |> group_by(id4) |> select(v1:v3) |> mean())))[["elapsed"]] 80 | m = memory_usage() 81 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 82 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 83 | rm(ans) 84 | t = system.time(print(dim(ans<-x |> group_by(id4) |> select(v1:v3) |> mean())))[["elapsed"]] 85 | m = memory_usage() 86 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 87 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 88 | print(head(ans, 3)) 89 | print(tail(ans, 3)) 90 | rm(ans) 91 | 92 | question = "sum v1:v3 by id6" # q5 93 | t = system.time(print(dim(ans<-x |> group_by(id6) |> select(v1:v3) |> sum())))[["elapsed"]] 94 | m = memory_usage() 95 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 96 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 97 | rm(ans) 98 | t = system.time(print(dim(ans<-x |> group_by(id6) |> select(v1:v3) |> sum())))[["elapsed"]] 99 | m = memory_usage() 100 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 101 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 102 | print(head(ans, 3)) 103 | print(tail(ans, 3)) 104 | rm(ans) 105 | 106 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 107 | 108 | set_collapse(oldopts) 109 | 110 | if( !interactive() ) q("no", status=0) 111 | -------------------------------------------------------------------------------- /collapse/setup-collapse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable collapse 5 | mkdir -p ./collapse/r-collapse 6 | Rscript -e 'install.packages(c("Rcpp", "collapse"), lib="./collapse/r-collapse", repos = "http://cloud.r-project.org")' 7 | 8 | ./collapse/ver-collapse.sh -------------------------------------------------------------------------------- /collapse/upg-collapse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set 
-e 3 | 4 | # upgrade all packages in collapse library only if new collapse is out 5 | echo 'upgrading collapse...' 6 | Rscript -e 'ap=available.packages(); if (ap["collapse","Version"]!=packageVersion("collapse", lib.loc="./collapse/r-collapse")) update.packages(lib.loc="./collapse/r-collapse", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /collapse/ver-collapse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="collapse", lib.loc="./collapse/r-collapse", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("collapse", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /dask/VERSION: -------------------------------------------------------------------------------- 1 | 2024.9.0 -------------------------------------------------------------------------------- /dask/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gc 3 | import os 4 | import logging 5 | import timeit 6 | from abc import ABC, abstractmethod 7 | from typing import Iterable, Any 8 | 9 | import dask.dataframe as dd 10 | from dask import distributed 11 | 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format='{ %(name)s:%(lineno)d @ %(asctime)s } - %(message)s' 15 | ) 16 | logger = logging.getLogger(__name__) 17 | 18 | THIS_DIR = os.path.abspath( 19 | os.path.dirname(__file__) 20 | ) 21 | HELPERS_DIR = os.path.abspath( 22 | os.path.join( 23 | THIS_DIR, '../_helpers' 24 | ) 25 | ) 26 | sys.path.extend((THIS_DIR, HELPERS_DIR)) 27 | from helpers import * 28 | 29 | class Query(ABC): 30 | question: str = None 31 | 32 | @staticmethod 33 | @abstractmethod 34 | def query(*args) -> dd.DataFrame: 35 | pass 36 | 37 | @staticmethod 38 | @abstractmethod 39 | def check(ans: dd.DataFrame) -> Any: 40 | pass 41 | 42 | @classmethod 43 | def name(cls) -> str: 44 | return f"{cls.__name__}: {cls.question}" 45 | 46 | class QueryRunner: 47 | def __init__( 48 | self, 49 | task: str, 50 | solution: str, 51 | solution_version: str, 52 | solution_revision: str, 53 | fun: str, 54 | cache: str, 55 | on_disk: bool 56 | ): 57 | self.task = task 58 | self.solution = solution 59 | self.solution_version = solution_version 60 | self.solution_revision = solution_revision 61 | self.fun = fun 62 | self.cache = cache 63 | self.on_disk = on_disk 64 | 65 | def run_query( 66 | self, 67 | data_name: str, 68 | in_rows: int, 69 | args: Iterable[Any], 70 | query: Query, 71 | machine_type: str, 72 | runs: int = 2, 73 | raise_exception: bool = False, 74 | ): 75 | logger.info("Running '%s'" % query.name()) 76 | 77 | try: 78 | for run in range(1, runs+1): 79 | gc.collect() # TODO: Able to do this in worker processes? Want to? 
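# Note on the protocol implemented below: each question is executed `runs`
# times (2 by default) and every run is timed and logged as its own row via
# write_log(). query.check(ans) computes a checksum of the answer whose cost is
# recorded separately as chk_time_sec, so validation does not inflate time_sec;
# the head/tail preview is printed only on the last run.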
80 | 81 | # Calculate ans 82 | t_start = timeit.default_timer() 83 | ans = query.query(*args) 84 | logger.debug("Answer shape: %s" % (ans.shape, )) 85 | t = timeit.default_timer() - t_start 86 | m = memory_usage() 87 | 88 | logger.info("\tRun #%s: %0.3fs" % (run, t)) 89 | 90 | # Calculate chk 91 | t_start = timeit.default_timer() 92 | chk = query.check(ans) 93 | chkt = timeit.default_timer() - t_start 94 | 95 | 96 | write_log( 97 | task=self.task, 98 | data=data_name, 99 | in_rows=in_rows, 100 | question=query.question, 101 | out_rows=ans.shape[0], 102 | out_cols=ans.shape[1], 103 | solution=self.solution, 104 | version=self.solution_version, 105 | git=self.solution_revision, 106 | fun=self.fun, 107 | run=run, 108 | time_sec=t, 109 | mem_gb=m, 110 | cache=self.cache, 111 | chk=make_chk(chk), 112 | chk_time_sec=chkt, 113 | on_disk=self.on_disk, 114 | machine_type=machine_type 115 | ) 116 | if run == runs: 117 | # Print head / tail on last run 118 | logger.debug("Answer head:\n%s" % ans.head(3)) 119 | logger.debug("Answer tail:\n%s" % ans.tail(3)) 120 | del ans 121 | except Exception as err: 122 | logger.error("Query '%s' failed!" % query.name()) 123 | print(err) 124 | 125 | # Re-raise if instructed 126 | if raise_exception: 127 | raise err 128 | 129 | def dask_client() -> distributed.Client: 130 | # we use process-pool instead of thread-pool due to GIL cost 131 | return distributed.Client(processes=True, silence_logs=logging.ERROR) 132 | -------------------------------------------------------------------------------- /dask/setup-dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | virtualenv dask/py-dask --python=python3 5 | source dask/py-dask/bin/activate 6 | 7 | # install binaries 8 | python3 -m pip install "dask[complete]" 9 | 10 | # check 11 | # python3 12 | # import dask as dk 13 | # dk.__version__ 14 | # dk.__git_revision__ 15 | # quit() 16 | 17 | deactivate 18 | 19 | ./dask/ver-dask.sh 20 | -------------------------------------------------------------------------------- /dask/upg-dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading dask...' 
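# Illustrative sketch, not a file from the repository: how a dask solution
# script is expected to use the Query/QueryRunner helpers from dask/common.py
# above. The class name SumV1ById1, the CSV path and the parameter values below
# are assumptions for the example; the real question classes live in
# dask/groupby_dask.py.
import dask.dataframe as dd
from common import Query, QueryRunner, dask_client

class SumV1ById1(Query):
    question = "sum v1 by id1"  # q1

    @staticmethod
    def query(x):
        # build the lazy groupby graph, then materialise the answer
        return x.groupby("id1").agg({"v1": "sum"}).compute()

    @staticmethod
    def check(ans):
        # checksum used for answer validation
        return ans["v1"].sum()

client = dask_client()  # process-based cluster, see dask_client() above
x = dd.read_csv("data/G1_1e7_1e2_0_0.csv").persist()
runner = QueryRunner(task="groupby", solution="dask", solution_version="2024.9.0",
                     solution_revision="", fun="groupby", cache="TRUE", on_disk=False)
runner.run_query(data_name="G1_1e7_1e2_0_0", in_rows=10_000_000, args=[x],
                 query=SumV1ById1, machine_type="c6id.4xlarge", runs=2)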
5 | 6 | source ./dask/py-dask/bin/activate 7 | 8 | python3 -m pip install --upgrade dask[complete] > /dev/null 9 | -------------------------------------------------------------------------------- /dask/ver-dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./dask/py-dask/bin/activate 5 | python3 -c 'import dask as dk; open("dask/VERSION","w").write(dk.__version__); open("dask/REVISION","w").write(dk.__git_revision__);' > /dev/null 6 | -------------------------------------------------------------------------------- /datafusion/VERSION: -------------------------------------------------------------------------------- 1 | 47.0.0 -------------------------------------------------------------------------------- /datafusion/setup-datafusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | virtualenv datafusion/py-datafusion --python=python3 5 | source datafusion/py-datafusion/bin/activate 6 | 7 | python3 -m pip install --upgrade psutil datafusion pandas 8 | 9 | # build 10 | deactivate 11 | ./datafusion/upg-datafusion.sh 12 | 13 | ./datafusion/ver-datafusion.sh 14 | 15 | # check 16 | # source datafusion/py-datafusion/bin/activate 17 | # python3 18 | # import datafusion as df 19 | # df.__version__ 20 | # quit() 21 | # deactivate 22 | 23 | # fix: print(ans.head(3), flush=True): UnicodeEncodeError: 'ascii' codec can't encode characters in position 14-31: ordinal not in range(128) 24 | # vim datafusion/py-datafusion/bin/activate 25 | #deactivate () { 26 | # unset PYTHONIOENCODING 27 | # ... 28 | #} 29 | #... 30 | #PYTHONIOENCODING="utf-8" 31 | #export PYTHONIOENCODING 32 | #... 33 | -------------------------------------------------------------------------------- /datafusion/upg-datafusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading datafusion...' 
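# Editor's sketch related to the UnicodeEncodeError workaround documented in
# setup-datafusion.sh above: rather than hand-editing the virtualenv's activate
# script, the same effect can be obtained per invocation by exporting the
# variable before launching a solution script (paths illustrative):
#   export PYTHONIOENCODING=utf-8
#   source datafusion/py-datafusion/bin/activate
#   python3 datafusion/groupby-datafusion.py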
5 | 6 | source ./datafusion/py-datafusion/bin/activate 7 | 8 | python -m pip install --upgrade datafusion > /dev/null 9 | 10 | deactivate -------------------------------------------------------------------------------- /datafusion/ver-datafusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./datafusion/py-datafusion/bin/activate 3 | python3 -c 'import datafusion as df; open("datafusion/VERSION","w").write(df.__version__); open("datafusion/REVISION","w").write("");' > /dev/null 4 | -------------------------------------------------------------------------------- /datatable/VERSION: -------------------------------------------------------------------------------- 1 | 1.16.99 2 | -------------------------------------------------------------------------------- /datatable/groupby2014-datatable.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# groupby2014-datatable.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace(c("bit64"), quietly=TRUE)) # used in chk to sum numeric columns 8 | suppressPackageStartupMessages(library("data.table", lib.loc="./datatable/r-datatable")) 9 | setDTthreads(0L) 10 | ver = packageVersion("data.table") 11 | git = data.table:::.git(quiet=TRUE) 12 | task = "groupby2014" 13 | solution = "data.table" 14 | fun = "[.data.table" 15 | cache = TRUE 16 | on_disk = FALSE 17 | 18 | data_name = Sys.getenv("SRC_DATANAME") 19 | src_grp = file.path("data", paste(data_name, "csv", sep=".")) 20 | cat(sprintf("loading dataset %s\n", data_name)) 21 | 22 | x = fread(src_grp, showProgress=FALSE) 23 | print(nrow(x)) 24 | 25 | task_init = proc.time()[["elapsed"]] 26 | cat("grouping...\n") 27 | 28 | question = "sum v1 by id1" # q1 29 | t = system.time(print(dim(ans<-x[, sum(v1), keyby=id1])))[["elapsed"]] 30 | m = memory_usage() 31 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 32 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 33 | rm(ans) 34 | t = system.time(print(dim(ans<-x[, sum(v1), keyby=id1])))[["elapsed"]] 35 | m = memory_usage() 36 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 37 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 38 | print(head(ans, 3)) 39 | print(tail(ans, 3)) 40 | rm(ans) 41 | 42 | question = "sum v1 by id1:id2" # q2 43 | t = system.time(print(dim(ans<-x[, sum(v1), keyby='id1,id2'])))[["elapsed"]] 44 | m = memory_usage() 45 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 46 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 47 | rm(ans) 48 | t = system.time(print(dim(ans<-x[, sum(v1), keyby='id1,id2'])))[["elapsed"]] 49 | m = memory_usage() 50 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 51 | write.log(run=2L, task=task, data=data_name, 
in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 52 | print(head(ans, 3)) 53 | print(tail(ans, 3)) 54 | rm(ans) 55 | 56 | question = "sum v1 mean v3 by id3" # q3 57 | t = system.time(print(dim(ans<-x[, list(sum(v1), mean(v3)), keyby=id3])))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)), sum(V2))])[["elapsed"]] 60 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 61 | rm(ans) 62 | t = system.time(print(dim(ans<-x[, list(sum(v1), mean(v3)), keyby=id3])))[["elapsed"]] 63 | m = memory_usage() 64 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)), sum(V2))])[["elapsed"]] 65 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 66 | print(head(ans, 3)) 67 | print(tail(ans, 3)) 68 | rm(ans) 69 | 70 | question = "mean v1:v3 by id4" # q4 71 | t = system.time(print(dim(ans<-x[, lapply(.SD, mean), keyby=id4, .SDcols=7:9])))[["elapsed"]] 72 | m = memory_usage() 73 | chkt = system.time(chk<-ans[, .(sum(v1), sum(v2), sum(v3))])[["elapsed"]] 74 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 75 | rm(ans) 76 | t = system.time(print(dim(ans<-x[, lapply(.SD, mean), keyby=id4, .SDcols=7:9])))[["elapsed"]] 77 | m = memory_usage() 78 | chkt = system.time(chk<-ans[, .(sum(v1), sum(v2), sum(v3))])[["elapsed"]] 79 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 80 | print(head(ans, 3)) 81 | print(tail(ans, 3)) 82 | rm(ans) 83 | 84 | question = "sum v1:v3 by id6" # q5 85 | t = system.time(print(dim(ans<-x[, lapply(.SD, sum), keyby=id6, .SDcols=7:9])))[["elapsed"]] 86 | m = memory_usage() 87 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(v1)), sum(bit64::as.integer64(v2)), sum(v3))])[["elapsed"]] 88 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 89 | rm(ans) 90 | t = system.time(print(dim(ans<-x[, lapply(.SD, sum), keyby=id6, .SDcols=7:9])))[["elapsed"]] 91 | m = memory_usage() 92 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(v1)), sum(bit64::as.integer64(v2)), sum(v3))])[["elapsed"]] 93 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 94 | print(head(ans, 3)) 95 | print(tail(ans, 3)) 96 | rm(ans) 97 | 
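# Editorial aside, not part of the script: q4/q5 above use the
# x[, lapply(.SD, fun), keyby=..., .SDcols=7:9] idiom, where .SD holds the
# columns selected by .SDcols (here 7:9, i.e. v1:v3) within each group and
# lapply() applies the aggregate to each of them. Toy illustration:
#   dt = data.table::data.table(id = c("a","a","b"), v1 = 1:3, v2 = 4:6)
#   dt[, lapply(.SD, sum), keyby = id, .SDcols = c("v1","v2")]
#   # group "a": v1 = 3, v2 = 9; group "b": v1 = 3, v2 = 6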
98 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 99 | 100 | if( !interactive() ) q("no", status=0) 101 | -------------------------------------------------------------------------------- /datatable/read-datatable.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# read-datatable.R\n") 4 | 5 | source("./helpers.R") 6 | source("./datatable/helpers-datatable.R") 7 | 8 | suppressPackageStartupMessages(library(data.table)) 9 | ver = packageVersion("data.table") 10 | git = datatable.git() 11 | task = "read" 12 | solution = "data.table" 13 | fun = "fread" 14 | cache = TRUE 15 | 16 | src_grp = Sys.getenv("SRC_GRP_LOCAL") 17 | data_name = basename(src_grp) 18 | options("datatable.showProgress"=FALSE) 19 | 20 | in_rows = as.numeric(strsplit(system(sprintf("wc -l %s", data_name), intern=TRUE), " ", fixed=TRUE)[[1L]][1L])-1 21 | 22 | cat("reading...\n") 23 | 24 | question = "all rows" #1 25 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]] 26 | m = memory_usage() 27 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 28 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 29 | rm(ans) 30 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]] 31 | m = memory_usage() 32 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 33 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 34 | rm(ans) 35 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]] 36 | m = memory_usage() 37 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 38 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 39 | rm(ans) 40 | 41 | question = "top 100 rows" #2 42 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 45 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 46 | rm(ans) 47 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]] 48 | m = memory_usage() 49 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 50 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 51 | rm(ans) 52 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]] 53 | m = memory_usage() 54 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 55 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), 
chk_time_sec=chkt) 56 | rm(ans) 57 | 58 | if( !interactive() ) q("no", status=0) 59 | -------------------------------------------------------------------------------- /datatable/setup-datatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install devel data.table 3 | mkdir -p ./datatable/r-datatable 4 | Rscript -e 'install.packages("data.table", repos="https://Rdatatable.gitlab.io/data.table", method="curl", lib="./datatable/r-datatable")' 5 | 6 | ./datatable/ver-datatable.sh 7 | -------------------------------------------------------------------------------- /datatable/sort-datatable.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# sort-datatable.R\n") 4 | 5 | source("./helpers.R") 6 | source("./datatable/helpers-datatable.R") 7 | 8 | src_x = Sys.getenv("SRC_X", NA_character_) 9 | 10 | # if (get.nrow(src_x) > 1e9L) { 11 | # cat("# sort with data.table skipped due data volume cap for single machine set to total 1e9 rows") 12 | # quit("no", status=0) # datasets > 1e9 too big to try load on single machine 13 | # } 14 | 15 | stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns 16 | suppressPackageStartupMessages(library(data.table)) 17 | ver = packageVersion("data.table") 18 | git = datatable.git() 19 | data_name = basename(src_x) 20 | task = "sort" 21 | solution = "data.table" 22 | fun = "[.data.table" 23 | question = "by int KEY" 24 | cache = TRUE 25 | 26 | cat("loading dataset...\n") 27 | X = fread(if(file.exists(basename(src_x))) basename(src_x) else sprintf("hadoop fs -cat %s", src_x)) # csv can be provided in local dir for faster import 28 | 29 | cat("sorting...\n") 30 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]] 31 | m = memory_usage() 32 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]] 33 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 34 | rm(ans) 35 | 36 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]] 37 | m = memory_usage() 38 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]] 39 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 40 | rm(ans) 41 | 42 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]] 45 | write.log(run=3L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 46 | rm(ans) 47 | 48 | if( !interactive() ) q("no", status=0) 49 | -------------------------------------------------------------------------------- /datatable/upg-datatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest devel 5 | echo 'upgrading data.table...' 
6 | # Rscript -e 'data.table::update.dev.pkg(quiet=TRUE, method="curl", lib="./datatable/r-datatable")' 7 | Rscript -e 'update.packages(lib.loc = "./datatable/r-datatable", repos="https://rdatatable.gitlab.io/data.table", method="curl")' 8 | 9 | -------------------------------------------------------------------------------- /datatable/ver-datatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="data.table", lib.loc="./datatable/r-datatable", "DESCRIPTION"), fields=c("Version","Revision")); cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("datatable", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /dplyr/VERSION: -------------------------------------------------------------------------------- 1 | 1.1.4 2 | -------------------------------------------------------------------------------- /dplyr/groupby2014-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# groupby2014-dplyr.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace(c("bit64","data.table"), quietly=TRUE)) # used in chk to sum numeric columns and data loading 8 | .libPaths("./dplyr/r-dplyr") # tidyverse/dplyr#4641 9 | suppressPackageStartupMessages(library("dplyr", lib.loc="./dplyr/r-dplyr", warn.conflicts=FALSE)) 10 | ver = packageVersion("dplyr") 11 | git = "" # uses stable version now #124 12 | task = "groupby2014" 13 | solution = "dplyr" 14 | fun = "group_by" 15 | cache = TRUE 16 | on_disk = FALSE 17 | 18 | data_name = Sys.getenv("SRC_DATANAME") 19 | src_grp = file.path("data", paste(data_name, "csv", sep=".")) 20 | cat(sprintf("loading dataset %s\n", data_name)) 21 | 22 | x = as_tibble(data.table::fread(src_grp, showProgress=FALSE, data.table=FALSE)) 23 | print(nrow(x)) 24 | 25 | task_init = proc.time()[["elapsed"]] 26 | cat("grouping...\n") 27 | 28 | question = "sum v1 by id1" # q1 29 | t = system.time(print(dim(ans<-x %>% group_by(id1) %>% summarise(sum(v1)))))[["elapsed"]] 30 | m = memory_usage() 31 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 32 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 33 | rm(ans) 34 | t = system.time(print(dim(ans<-x %>% group_by(id1) %>% summarise(sum(v1)))))[["elapsed"]] 35 | m = memory_usage() 36 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 37 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 38 | print(head(ans, 3)) 39 | print(tail(ans, 3)) 40 | rm(ans) 41 | 42 | question = "sum v1 by id1:id2" # q2 43 | t = system.time(print(dim(ans<-x %>% group_by(id1,id2) %>% summarise(sum(v1)))))[["elapsed"]] 44 | m = memory_usage() 45 | chkt = system.time(chk<-summarise(ungroup(ans), v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 46 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, 
out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 47 | rm(ans) 48 | t = system.time(print(dim(ans<-x %>% group_by(id1,id2) %>% summarise(sum(v1)))))[["elapsed"]] 49 | m = memory_usage() 50 | chkt = system.time(chk<-summarise(ungroup(ans), v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 51 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 52 | print(head(ans, 3)) 53 | print(tail(ans, 3)) 54 | rm(ans) 55 | 56 | question = "sum v1 mean v3 by id3" # q3 57 | t = system.time(print(dim(ans<-x %>% group_by(id3) %>% summarise(sum(v1), mean(v3)))))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`)), v3=sum(`mean(v3)`)))[["elapsed"]] 60 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 61 | rm(ans) 62 | t = system.time(print(dim(ans<-x %>% group_by(id3) %>% summarise(sum(v1), mean(v3)))))[["elapsed"]] 63 | m = memory_usage() 64 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`)), v3=sum(`mean(v3)`)))[["elapsed"]] 65 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 66 | print(head(ans, 3)) 67 | print(tail(ans, 3)) 68 | rm(ans) 69 | 70 | question = "mean v1:v3 by id4" # q4 71 | t = system.time(print(dim(ans<-x %>% group_by(id4) %>% summarise(across(v1:v3, mean)))))[["elapsed"]] 72 | m = memory_usage() 73 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 74 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 75 | rm(ans) 76 | t = system.time(print(dim(ans<-x %>% group_by(id4) %>% summarise(across(v1:v3, mean)))))[["elapsed"]] 77 | m = memory_usage() 78 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 79 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 80 | print(head(ans, 3)) 81 | print(tail(ans, 3)) 82 | rm(ans) 83 | 84 | question = "sum v1:v3 by id6" # q5 85 | t = system.time(print(dim(ans<-x %>% group_by(id6) %>% summarise(across(v1:v3, sum)))))[["elapsed"]] 86 | m = memory_usage() 87 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(v1)), v2=sum(bit64::as.integer64(v2)), v3=sum(v3)))[["elapsed"]] 88 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), 
chk_time_sec=chkt, on_disk=on_disk) 89 | rm(ans) 90 | t = system.time(print(dim(ans<-x %>% group_by(id6) %>% summarise(across(v1:v3, sum)))))[["elapsed"]] 91 | m = memory_usage() 92 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(v1)), v2=sum(bit64::as.integer64(v2)), v3=sum(v3)))[["elapsed"]] 93 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 94 | print(head(ans, 3)) 95 | print(tail(ans, 3)) 96 | rm(ans) 97 | 98 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 99 | 100 | if( !interactive() ) q("no", status=0) 101 | -------------------------------------------------------------------------------- /dplyr/join-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# join-dplyr.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace(c("data.table"), quietly=TRUE)) # used for data loading 8 | .libPaths("./dplyr/r-dplyr") # tidyverse/dplyr#4641 9 | suppressPackageStartupMessages(library("dplyr", lib.loc="./dplyr/r-dplyr", warn.conflicts=FALSE)) 10 | ver = packageVersion("dplyr") 11 | git = "" # uses stable version now #124 12 | task = "join" 13 | solution = "dplyr" 14 | cache = TRUE 15 | on_disk = FALSE 16 | 17 | data_name = Sys.getenv("SRC_DATANAME") 18 | src_jn_x = file.path("data", paste(data_name, "csv", sep=".")) 19 | y_data_name = join_to_tbls(data_name) 20 | src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name)) 21 | stopifnot(length(src_jn_y)==3L) 22 | cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", "))) 23 | 24 | x = as_tibble(data.table::fread(src_jn_x, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings="")) 25 | JN = lapply(sapply(simplify=FALSE, src_jn_y, data.table::fread, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings=""), as_tibble) 26 | print(nrow(x)) 27 | sapply(sapply(JN, nrow), print) -> nul 28 | small = JN$small 29 | medium = JN$medium 30 | big = JN$big 31 | 32 | task_init = proc.time()[["elapsed"]] 33 | cat("joining...\n") 34 | 35 | question = "small inner on int" # q1 36 | fun = "inner_join" 37 | t = system.time(print(dim(ans<-inner_join(x, small, by="id1"))))[["elapsed"]] 38 | m = memory_usage() 39 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 40 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 41 | rm(ans) 42 | t = system.time(print(dim(ans<-inner_join(x, small, by="id1"))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 45 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 46 | print(head(ans, 3)) 47 | print(tail(ans, 3)) 48 | rm(ans) 49 | 50 | question = "medium inner on int" # q2 51 | fun = "inner_join" 52 | t = 
system.time(print(dim(ans<-inner_join(x, medium, by="id2"))))[["elapsed"]] 53 | m = memory_usage() 54 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 55 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 56 | rm(ans) 57 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id2"))))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 60 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 61 | print(head(ans, 3)) 62 | print(tail(ans, 3)) 63 | rm(ans) 64 | 65 | question = "medium outer on int" # q3 66 | fun = "left_join" 67 | t = system.time(print(dim(ans<-left_join(x, medium, by="id2"))))[["elapsed"]] 68 | m = memory_usage() 69 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 70 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 71 | rm(ans) 72 | t = system.time(print(dim(ans<-left_join(x, medium, by="id2"))))[["elapsed"]] 73 | m = memory_usage() 74 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 75 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 76 | print(head(ans, 3)) 77 | print(tail(ans, 3)) 78 | rm(ans) 79 | 80 | question = "medium inner on factor" # q4 81 | fun = "inner_join" 82 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id5"))))[["elapsed"]] 83 | m = memory_usage() 84 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 85 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 86 | rm(ans) 87 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id5"))))[["elapsed"]] 88 | m = memory_usage() 89 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 90 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 91 | print(head(ans, 3)) 92 | print(tail(ans, 3)) 93 | rm(ans) 94 | 95 | question = "big inner on int" # q5 96 | fun = "inner_join" 97 | t = system.time(print(dim(ans<-inner_join(x, big, by="id3"))))[["elapsed"]] 98 | m = memory_usage() 99 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 100 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), 
question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 101 | rm(ans) 102 | t = system.time(print(dim(ans<-inner_join(x, big, by="id3"))))[["elapsed"]] 103 | m = memory_usage() 104 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 105 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 106 | print(head(ans, 3)) 107 | print(tail(ans, 3)) 108 | rm(ans) 109 | 110 | cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 111 | 112 | if( !interactive() ) q("no", status=0) 113 | -------------------------------------------------------------------------------- /dplyr/read-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# read-dplyr\n") 4 | 5 | source("./helpers.R") 6 | source("./dplyr/helpers-dplyr.R") 7 | 8 | suppressPackageStartupMessages({ 9 | library(readr, warn.conflicts=FALSE) 10 | library(dplyr, warn.conflicts=FALSE) 11 | }) 12 | ver = NA_character_ #packageVersion("dplyr") 13 | git = NA_character_ #dplyr.git() 14 | task = "read" 15 | solution = "dplyr" 16 | fun = "readr::read_csv" 17 | cache = TRUE 18 | 19 | src_grp = Sys.getenv("SRC_GRP_LOCAL") 20 | data_name = basename(src_grp) 21 | options("readr.show_progress"=FALSE) 22 | 23 | in_rows = as.numeric(strsplit(system(sprintf("wc -l %s", data_name), intern=TRUE), " ", fixed=TRUE)[[1L]][1L])-1 24 | 25 | cat("reading...\n") 26 | 27 | question = "all rows" #1 28 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]] 29 | m = memory_usage() 30 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 31 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 32 | rm(ans) 33 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]] 34 | m = memory_usage() 35 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 36 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 37 | rm(ans) 38 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]] 39 | m = memory_usage() 40 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 41 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 42 | rm(ans) 43 | 44 | question = "top 100 rows" #2 45 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]] 46 | m = memory_usage() 47 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 48 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, 
out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 49 | rm(ans) 50 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]] 51 | m = memory_usage() 52 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 53 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 54 | rm(ans) 55 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]] 56 | m = memory_usage() 57 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 58 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 59 | rm(ans) 60 | 61 | if( !interactive() ) q("no", status=0) 62 | -------------------------------------------------------------------------------- /dplyr/setup-dplyr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable dplyr 5 | mkdir -p ./dplyr/r-dplyr 6 | Rscript -e 'install.packages("dplyr", lib="./dplyr/r-dplyr", repos = "http://cloud.r-project.org")' 7 | 8 | ./dplyr/ver-dplyr.sh 9 | -------------------------------------------------------------------------------- /dplyr/sort-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# sort-dplyr\n") 4 | 5 | source("./helpers.R") 6 | source("./dplyr/helpers-dplyr.R") 7 | 8 | src_x = Sys.getenv("SRC_X", NA_character_) 9 | 10 | # if (get.nrow(src_x) > 1e9L) { 11 | # cat("# sort with dplyr skipped due data volume cap for single machine set to total 1e9 rows") 12 | # quit("no", status=0) # datasets > 1e9 too big to try load on single machine 13 | # } 14 | 15 | stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns 16 | suppressPackageStartupMessages(library(dplyr, warn.conflicts=FALSE)) 17 | ver = packageVersion("dplyr") 18 | git = dplyr.git() 19 | data_name = basename(src_x) 20 | task = "sort" 21 | solution = "dplyr" 22 | fun = "arrange" 23 | question = "by int KEY" 24 | cache = TRUE 25 | 26 | cat("loading dataset...\n") 27 | X = data.table::fread(if(file.exists(basename(src_x))) basename(src_x) else sprintf("hadoop fs -cat %s", src_x), data.table=FALSE) # csv can be provided in local dir for faster import 28 | 29 | cat("sorting...\n") 30 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]] 31 | m = memory_usage() 32 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]] 33 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 34 | rm(ans) 35 | 36 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]] 37 | m = memory_usage() 38 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]] 39 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), 
out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 40 | rm(ans) 41 | 42 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]] 45 | write.log(run=3L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 46 | rm(ans) 47 | 48 | if( !interactive() ) q("no", status=0) 49 | -------------------------------------------------------------------------------- /dplyr/upg-dplyr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade all packages in dplyr library only if new dplyr is out 5 | echo 'upgrading dplyr...' 6 | Rscript -e 'ap=available.packages(); if (ap["dplyr","Version"]!=packageVersion("dplyr", lib.loc="./dplyr/r-dplyr")) update.packages(lib.loc="./dplyr/r-dplyr", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /dplyr/ver-dplyr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="dplyr", lib.loc="./dplyr/r-dplyr", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("dplyr", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /duckdb-latest/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.99.9000 2 | -------------------------------------------------------------------------------- /duckdb-latest/setup-duckdb-latest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable duckdb-latest 5 | rm -rf ./duckdb-latest/r-duckdb-latest 6 | mkdir -p ./duckdb-latest/r-duckdb-latest 7 | # Rscript -e 'withr::with_libpaths(new = "./duckdb-latest/r-duckdb-latest", devtools::install_github("duckdb-latest/duckdb-latest/tools/rpkg"))' 8 | # prevent errors when running 'ver-duckdb-latest.sh' 9 | Rscript -e 'install.packages("DBI", lib="./duckdb-latest/r-duckdb-latest", repos = "http://cloud.r-project.org")' 10 | 11 | 12 | cd duckdb-latest 13 | rm -rf duckdb-r 14 | git clone https://github.com/duckdb/duckdb-r.git 15 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 16 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r 17 | rm -rf duckdb-r 18 | cd .. 
19 | 20 | ./duckdb-latest/ver-duckdb-latest.sh 21 | -------------------------------------------------------------------------------- /duckdb-latest/upg-duckdb-latest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade duckdb-latest by rebuilding the duckdb-r package from the latest GitHub sources 5 | echo 'upgrading duckdb-latest, installing latest duckdb-r from GitHub' 6 | 7 | rm -rf ./duckdb-latest/r-duckdb-latest 8 | mkdir -p ./duckdb-latest/r-duckdb-latest 9 | Rscript -e 'install.packages("DBI", lib="./duckdb-latest/r-duckdb-latest", repos = "http://cloud.r-project.org")' 10 | 11 | 12 | cd duckdb-latest 13 | rm -rf duckdb-r 14 | git clone https://github.com/duckdb/duckdb-r 15 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 16 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r 17 | rm -rf duckdb-r 18 | cd .. 19 | 20 | ./duckdb-latest/ver-duckdb-latest.sh -------------------------------------------------------------------------------- /duckdb-latest/ver-duckdb-latest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="duckdb", lib.loc="./duckdb-latest/r-duckdb-latest", "DESCRIPTION"), fields=c("Version","Revision")); if (is.na(v[,"Revision"])) { suppressPackageStartupMessages({ requireNamespace("DBI", lib.loc="./duckdb-latest/r-duckdb-latest"); requireNamespace("duckdb", lib.loc="./duckdb-latest/r-duckdb-latest") }); v[,"Revision"] = DBI::dbGetQuery(con<-DBI::dbConnect(duckdb::duckdb()), "SELECT source_id FROM pragma_version()")[[1L]]; invisible(DBI::dbDisconnect(con, shutdown=TRUE)) }; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(gsub("-", ".", v), file.path("duckdb-latest", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | 6 | -------------------------------------------------------------------------------- /duckdb/VERSION: -------------------------------------------------------------------------------- 1 | 1.3.0 2 | -------------------------------------------------------------------------------- /duckdb/setup-duckdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable duckdb 5 | rm -rf ./duckdb/r-duckdb 6 | mkdir -p ./duckdb/r-duckdb 7 | # Rscript -e 'withr::with_libpaths(new = "./duckdb/r-duckdb", devtools::install_github("duckdb/duckdb/tools/rpkg"))' 8 | # prevent errors when running 'ver-duckdb.sh' 9 | Rscript -e 'install.packages("DBI", lib="./duckdb/r-duckdb", repos = "http://cloud.r-project.org")' 10 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 11 | MAKE="make -j$ncores" Rscript -e 'install.packages("duckdb", lib="./duckdb/r-duckdb", repos = "http://cloud.r-project.org")' 12 | 13 | ./duckdb/ver-duckdb.sh 14 | -------------------------------------------------------------------------------- /duckdb/upg-duckdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | rm -rf ./duckdb/r-duckdb 5 | mkdir -p ./duckdb/r-duckdb 6 | 7 | 8 | cd duckdb 9 | rm -rf duckdb-r 10 | git clone https://github.com/duckdb/duckdb-r 11 | cd duckdb-r 12 | git checkout v1.2.0 13 | cd .. 14 | ncores=$(nproc --all) 15 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r 16 | rm -rf duckdb-r 17 | cd ..
18 | 19 | 20 | # Rscript -e 'ap=available.packages(repos="https://cloud.r-project.org/"); if (ap["duckdb","Version"]!=packageVersion("duckdb", lib.loc="./duckdb/r-duckdb")) update.packages(lib.loc="./duckdb/r-duckdb", ask=FALSE, checkBuilt=TRUE, quiet=TRUE, repos="https://cloud.r-project.org/")' 21 | -------------------------------------------------------------------------------- /duckdb/ver-duckdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="duckdb", lib.loc="./duckdb/r-duckdb", "DESCRIPTION"), fields=c("Version","Revision")); if (is.na(v[,"Revision"])) { suppressPackageStartupMessages({ requireNamespace("DBI", lib.loc="./duckdb/r-duckdb"); requireNamespace("duckdb", lib.loc="./duckdb/r-duckdb") }); v[,"Revision"] = DBI::dbGetQuery(DBI::dbConnect(duckdb::duckdb()), "SELECT source_id FROM pragma_version()")[[1L]] }; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(gsub("-", ".", v), file.path("duckdb", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /h2o/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo 'usage: ./h2o/exec.sh groupby'; 6 | exit 1 7 | fi; 8 | 9 | source ./h2o/h2o.sh 10 | 11 | h2o_active && echo 'h2o instance should not be already running, investigate' >&2 12 | h2o_active && exit 1 13 | 14 | # start h2o 15 | h2o_start "h2o_$1_""$SRC_DATANAME" 16 | 17 | # confirm h2o working 18 | h2o_active || sleep 30 19 | h2o_active || echo 'h2o instance should be already running, investigate' >&2 20 | h2o_active || exit 1 21 | 22 | # execute benchmark script 23 | ./h2o/$1-h2o.R || echo "# h2o/exec.sh: benchmark script for $SRC_DATANAME terminated with error" >&2 24 | 25 | # stop h2o instance 26 | h2o_stop && echo '# h2o/exec.sh: stopping h2o instance finished' || echo '# h2o/exec.sh: stopping h2o instance failed' >&2 27 | h2o_active || exit 1 28 | -------------------------------------------------------------------------------- /h2o/h2o.sh: -------------------------------------------------------------------------------- 1 | java_active() { 2 | pgrep -U $UID java > /dev/null 2>&1 3 | } 4 | h2o_active() { 5 | java_active && curl -X GET "localhost:55888/3/About" -H "accept: application/json" > /dev/null 2>&1 6 | } 7 | h2o_start() { 8 | ((!$#)) && echo "h2o_start require h2o instance name as a parameter" >&2 && return 1 9 | echo '# h2o_start: starting h2o instance' 10 | java_active && echo "h2o instance is running already" >&2 && return 1 11 | nohup java -Xmx100G -Xms100G -cp ./h2o/r-h2o/h2o/java/h2o.jar water.H2OApp -name "$1" -baseport 55888 > ./h2o/log/$1.out 2> ./h2o/log/$1.err < /dev/null & 12 | sleep 10 13 | } 14 | h2o_stop() { 15 | echo '# h2o_stop: stopping h2o instance' 16 | java_active || echo "h2o instance was not running already" >&2 17 | java_active || return 0 18 | java_active && echo "sigint h2o instance" && killall -2 -u $USER java > /dev/null 2>&1 19 | sleep 1 && java_active && sleep 15 20 | java_active && echo "sigterm h2o instance" && killall -15 -u $USER java > /dev/null 2>&1 21 | sleep 1 && java_active && sleep 30 22 | java_active && echo "sigkill h2o instance" && killall -9 -u $USER java > /dev/null 2>&1 23 | sleep 1 && java_active && sleep 120 && java_active && echo "h2o instance could not be stopped" >&2 && return 1 24 | 
return 0 25 | } 26 | 27 | -------------------------------------------------------------------------------- /h2o/join-h2o.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# join-h2o.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | suppressPackageStartupMessages(library("h2o", lib.loc="./h2o/r-h2o", warn.conflicts=FALSE, quietly=TRUE)) 8 | ver = packageVersion("h2o") 9 | git = "" 10 | task = "join" 11 | solution = "h2o" 12 | fun = "h2o.merge" 13 | cache = TRUE 14 | on_disk = FALSE 15 | 16 | h = h2o.init(startH2O=FALSE, port=55888) 17 | h2o.no_progress() 18 | 19 | data_name = Sys.getenv("SRC_DATANAME") 20 | src_jn_x = file.path("data", paste(data_name, "csv", sep=".")) 21 | y_data_name = join_to_tbls(data_name) 22 | src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name)) 23 | stopifnot(length(src_jn_y)==3L) 24 | cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", "))) 25 | 26 | x = h2o.importFile(src_jn_x, col.types=c("int","int","int","enum","enum","string","real")) 27 | print(nrow(x)) 28 | small = h2o.importFile(src_jn_y[1L], col.types=c("int","enum","real")) 29 | medium = h2o.importFile(src_jn_y[2L], col.types=c("int","int","enum","enum","real")) 30 | big = h2o.importFile(src_jn_y[3L], col.types=c("int","int","int","enum","enum","string","real")) 31 | sapply(sapply(list(small, medium, big), nrow), print) -> nul 32 | 33 | task_init = proc.time()[["elapsed"]] 34 | cat("joining...\n") 35 | 36 | question = "small inner on int" # q1 37 | 38 | t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]] 39 | m = memory_usage() 40 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 41 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 42 | h2o.rm(ans) 43 | t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]] 44 | m = memory_usage() 45 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 46 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 47 | print(head(ans, 3)) 48 | print(tail(ans, 3)) 49 | h2o.rm(ans) 50 | 51 | question = "medium inner on int" # q2 52 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]] 53 | m = memory_usage() 54 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 55 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 56 | h2o.rm(ans) 57 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 60 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), 
chk_time_sec=chkt, on_disk=on_disk) 61 | print(head(ans, 3)) 62 | print(tail(ans, 3)) 63 | h2o.rm(ans) 64 | 65 | question = "medium outer on int" # q3 66 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]] 67 | m = memory_usage() 68 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]] 69 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 70 | h2o.rm(ans) 71 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]] 72 | m = memory_usage() 73 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]] 74 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 75 | print(head(ans, 3)) 76 | print(tail(ans, 3)) 77 | h2o.rm(ans) 78 | 79 | question = "medium inner on factor" # q4 80 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]] 81 | m = memory_usage() 82 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 83 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 84 | h2o.rm(ans) 85 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]] 86 | m = memory_usage() 87 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 88 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 89 | print(head(ans, 3)) 90 | print(tail(ans, 3)) 91 | h2o.rm(ans) 92 | 93 | question = "big inner on int" # q5 94 | t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]] 95 | m = memory_usage() 96 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 97 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 98 | h2o.rm(ans) 99 | t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]] 100 | m = memory_usage() 101 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 102 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 103 | print(head(ans, 3)) 104 | print(tail(ans, 3)) 105 | h2o.rm(ans) 106 | 107 | h2o.removeAll() 108 | 109 | cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 110 | 111 | if (!interactive()) q("no", status=0) 112 | -------------------------------------------------------------------------------- 
/h2o/setup-h2o.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./h2o/log 2 | # install h2o 3 | mkdir -p ./h2o/r-h2o 4 | Rscript -e 'install.packages(c("RCurl","jsonlite"), repos="https://cloud.r-project.org", lib="./h2o/r-h2o"); install.packages("h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", lib="./h2o/r-h2o")' 5 | 6 | -------------------------------------------------------------------------------- /h2o/upg-h2o.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest stable from h2o repo 5 | echo 'upgrading h2o...' 6 | Rscript -e 'ap=available.packages(repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl"); if (ap["h2o","Version"]!=packageVersion("h2o", lib.loc="./h2o/r-h2o")) update.packages(lib.loc="./h2o/r-h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /h2o/ver-h2o.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="h2o", lib.loc="./h2o/r-h2o", "DESCRIPTION"), fields=c("Version","Revision")); cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("h2o", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /juliadf/VERSION: -------------------------------------------------------------------------------- 1 | 1.6.1 -------------------------------------------------------------------------------- /juliadf/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo 'usage: ./juliadf/exec.sh groupby'; 6 | exit 1 7 | fi; 8 | 9 | source ./path.env 10 | 11 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 12 | 13 | # execute benchmark script 14 | julia -t $ncores ./juliadf/$1-juliadf.jl 15 | -------------------------------------------------------------------------------- /juliadf/setup-juliadf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install julia 3 | 4 | wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz 5 | tar -xvf julia-1.10.5-linux-x86_64.tar.gz > tar.out 2> tar.err 6 | sudo mv julia-1.10.5 /opt 7 | rm julia-1.10.5-linux-x86_64.tar.gz 8 | # put to paths 9 | echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env 10 | echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env 11 | # note that cron job must have path updated as well 12 | 13 | source path.env 14 | 15 | # install julia dataframes and csv packages 16 | julia -q -e 'using Pkg; Pkg.add(["DataFrames","CSV"])' 17 | julia -q -e 'include("$(pwd())/_helpers/helpers.jl"); pkgmeta = getpkgmeta("DataFrames"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("CSV"); println(string(pkgmeta["version"]))' 18 | 19 | ./juliadf/ver-juliadf.sh -------------------------------------------------------------------------------- /juliadf/upg-juliadf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest devel 5 | echo 'upgrading juliadf...' 
6 | julia -q -e 'using Pkg; Pkg.update();' > /dev/null 2>&1 7 | 8 | -------------------------------------------------------------------------------- /juliadf/ver-juliadf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source path.env 4 | 5 | julia -q -e 'include("$(pwd())/_helpers/helpers.jl"); pkgmeta = getpkgmeta("DataFrames"); f=open("juliadf/VERSION","w"); write(f, string(pkgmeta["version"])); f=open("juliadf/REVISION","w"); write(f, string(pkgmeta["git-tree-sha1"]));' > /dev/null 6 | -------------------------------------------------------------------------------- /juliads/VERSION: -------------------------------------------------------------------------------- 1 | 0.7.21 -------------------------------------------------------------------------------- /juliads/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo 'usage: ./juliads/exec.sh groupby'; 6 | exit 1 7 | fi; 8 | 9 | source ./path.env 10 | 11 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 12 | 13 | # execute benchmark script 14 | julia -t $ncores ./juliads/$1-juliads.jl 15 | -------------------------------------------------------------------------------- /juliads/join-juliads.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | print("# join-juliads.jl\n"); flush(stdout); 4 | 5 | using InMemoryDatasets; 6 | using Printf; 7 | using DLMReader 8 | using PooledArrays 9 | using Arrow 10 | 11 | # Force Julia to precompile methods for common patterns 12 | IMD.warmup() 13 | 14 | include("$(pwd())/_helpers/helpersds.jl"); 15 | 16 | pkgmeta = getpkgmeta("InMemoryDatasets"); 17 | ver = pkgmeta["version"]; 18 | git = ""; 19 | task = "join"; 20 | solution = "juliads"; 21 | fun = "join"; 22 | cache = true; 23 | on_disk = false; 24 | machine_type = ENV["MACHINE_TYPE"] 25 | isondisk(indata) = (parse(Float64, split(indata, "_")[2])>=10^10) || (parse(Float64, split(indata, "_")[2]) >= 10^9 && machine_type == "c6id.4xlarge") 26 | 27 | data_name = ENV["SRC_DATANAME"]; 28 | src_jn_x = string("data/", data_name, ".csv"); 29 | y_data_name = join_to_tbls(data_name); 30 | src_jn_y = [string("data/", y_data_name[1], ".csv"), string("data/", y_data_name[2], ".csv"), string("data/", y_data_name[3], ".csv")]; 31 | if length(src_jn_y) != 3 32 | error("Something went wrong in preparing files used for join") 33 | end; 34 | 35 | on_disk = isondisk(data_name) 36 | 37 | println(string("loading datasets ", data_name, ", ", y_data_name[1], ", ", y_data_name[2], ", ", y_data_name[3])); flush(stdout); 38 | 39 | # temporary file which will be deleted after the run - usually located at /tmp/ 40 | _tmp_storage = tempname() 41 | if isondisk(data_name) 42 | on_disk = true 43 | big_df = filereader(src_jn_y[3], types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]); 44 | modify!(big_df, [:id4, :id5]=>PooledArray) 45 | Arrow.write(_tmp_storage, big_df[!, :], ntasks=1) 46 | big_df = 0 47 | GC.gc(true) 48 | end 49 | x_df = filereader(src_jn_x, types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]); 50 | small_df = filereader(src_jn_y[1], types=[Int32, Characters{6}, Float64]); 51 | medium_df = filereader(src_jn_y[2], types=[Int32, Int32, Characters{6}, Characters{9}, Float64]); 52 | if isondisk(data_name) 53 | big_df = Dataset(Arrow.Table(_tmp_storage)) 54 |
else 55 | big_df = filereader(src_jn_y[3], types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]); 56 | modify!(big_df, [:id4, :id5]=>PooledArray) 57 | end 58 | 59 | modify!(x_df, [:id4, :id5]=>PooledArray) 60 | modify!(small_df, :id4=>PooledArray) 61 | modify!(medium_df, [:id4, :id5]=>PooledArray) 62 | 63 | in_rows = size(x_df, 1); 64 | println(in_rows); flush(stdout); 65 | println(size(small_df, 1)); flush(stdout); 66 | println(size(medium_df, 1)); flush(stdout); 67 | println(size(big_df, 1)); flush(stdout); 68 | 69 | task_init = time(); 70 | print("joining...\n"); flush(stdout); 71 | 72 | question = "small inner on int"; # q1 73 | GC.gc(); 74 | t = @elapsed (ANS = innerjoin(x_df, small_df, on = :id1, makeunique=true); println(size(ANS)); flush(stdout)); 75 | m = memory_usage(); 76 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 77 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 78 | ANS = 0; 79 | GC.gc(); 80 | t = @elapsed (ANS = innerjoin(x_df, small_df, on = :id1, makeunique=true); println(size(ANS)); flush(stdout)); 81 | m = memory_usage(); 82 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 83 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 84 | println(first(ANS, 3)); 85 | println(last(ANS, 3)); 86 | ANS = 0; 87 | 88 | question = "medium inner on int"; # q2 89 | GC.gc(); 90 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 91 | m = memory_usage(); 92 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 93 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 94 | ANS = 0; 95 | GC.gc(); 96 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 97 | m = memory_usage(); 98 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 99 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 100 | println(first(ANS, 3)); 101 | println(last(ANS, 3)); 102 | ANS = 0; 103 | 104 | question = "medium outer on int"; # q3 105 | GC.gc(); 106 | t = @elapsed (ANS = leftjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 107 | m = memory_usage(); 108 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 109 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 110 | ANS = 0; 111 | GC.gc(); 112 | t = @elapsed (ANS = leftjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 113 | m = memory_usage(); 114 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 115 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 116 | println(first(ANS, 3)); 117 | println(last(ANS, 3)); 118 | ANS = 0; 119 | 120 | question = "medium inner on factor"; # q4 121 | GC.gc(); 122 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id5, makeunique=true); println(size(ANS)); flush(stdout)); 123 | m = memory_usage(); 124 | t_start = time_ns(); 125 | chkt = @elapsed 
chk = [sum(ANS.v1), sum(ANS.v2)]; 126 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 127 | ANS = 0; 128 | GC.gc(); 129 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id5, makeunique=true); println(size(ANS)); flush(stdout)); 130 | m = memory_usage(); 131 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 132 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 133 | println(first(ANS, 3)); 134 | println(last(ANS, 3)); 135 | ANS = 0; 136 | 137 | question = "big inner on int"; # q5 138 | GC.gc(); 139 | t = @elapsed (ANS = innerjoin(x_df, big_df, on = :id3, makeunique=true); println(size(ANS)); flush(stdout)); 140 | m = memory_usage(); 141 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 142 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 143 | ANS = 0; 144 | GC.gc(); 145 | t = @elapsed (ANS = innerjoin(x_df, big_df, on = :id3, makeunique=true); println(size(ANS)); flush(stdout)); 146 | m = memory_usage(); 147 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 148 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 149 | println(first(ANS, 3)); 150 | println(last(ANS, 3)); 151 | ANS = 0; 152 | 153 | print(@sprintf "joining finished, took %.0fs\n" (time()-task_init)); flush(stdout); 154 | 155 | exit(); 156 | -------------------------------------------------------------------------------- /juliads/setup-juliads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install julia 3 | 4 | wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz 5 | tar -xvf julia-1.10.5-linux-x86_64.tar.gz > tar_out.out 2> tar_err.err 6 | sudo mv julia-1.10.5 /opt 7 | rm julia-1.10.5-linux-x86_64.tar.gz 8 | # put to paths 9 | echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env 10 | echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env 11 | # note that cron job must have path updated as well 12 | 13 | source path.env 14 | 15 | # install julia InMemoryDatasets and csv packages 16 | julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow", "CSV"])' 17 | julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("DLMReader"); println(string(pkgmeta["version"]))' 18 | 19 | ./juliads/ver-juliads.sh 20 | -------------------------------------------------------------------------------- /juliads/upg-juliads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest devel 5 | echo 'upgrading juliads...'
6 | julia -q -e 'using Pkg; Pkg.update();' > /dev/null 2>&1 7 | 8 | -------------------------------------------------------------------------------- /juliads/ver-juliads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source path.env 5 | 6 | julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); f=open("juliads/VERSION","w"); write(f, string(pkgmeta["version"])); f=open("juliads/REVISION","w"); write(f, string(" "));' > /dev/null 7 | -------------------------------------------------------------------------------- /modin/join-modin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | print("# join-modin.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import modin.pandas as pd 9 | 10 | exec(open("./helpers.py").read()) 11 | 12 | src_x = os.environ['SRC_X_LOCAL'] 13 | src_y = os.environ['SRC_Y_LOCAL'] 14 | 15 | ver = "" #pd.__version__ 16 | git = "" 17 | task = "join" 18 | question = "inner join" 19 | l = [os.path.basename(src_x), os.path.basename(src_y)] 20 | data_name = '-'.join(l) 21 | solution = "modin" 22 | fun = "merge" 23 | cache = "TRUE" 24 | 25 | print("loading datasets...") 26 | 27 | x = pd.read_csv(os.path.basename(src_x)) 28 | y = pd.read_csv(os.path.basename(src_y)) 29 | 30 | print("joining...") 31 | 32 | # NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin 33 | gc.collect() 34 | t_start = timeit.default_timer() 35 | ans = x.merge(y, how='inner', on='KEY') 36 | print(ans.shape) 37 | t = timeit.default_timer() - t_start 38 | m = memory_usage() 39 | t_start = timeit.default_timer() 40 | chk = [ans['X2'].sum(), ans['Y2'].sum()] 41 | chkt = timeit.default_timer() - t_start 42 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 43 | del ans 44 | 45 | gc.collect() 46 | t_start = timeit.default_timer() 47 | ans = x.merge(y, how='inner', on='KEY') 48 | print(ans.shape) 49 | t = timeit.default_timer() - t_start 50 | m = memory_usage() 51 | t_start = timeit.default_timer() 52 | chk = [ans['X2'].sum(), ans['Y2'].sum()] 53 | chkt = timeit.default_timer() - t_start 54 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 55 | del ans 56 | 57 | gc.collect() 58 | t_start = timeit.default_timer() 59 | ans = x.merge(y, how='inner', on='KEY') 60 | print(ans.shape) 61 | t = timeit.default_timer() - t_start 62 | m = memory_usage() 63 | t_start = timeit.default_timer() 64 | chk = [ans['X2'].sum(), ans['Y2'].sum()] 65 | chkt = timeit.default_timer() - t_start 66 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 67 | del ans 68 | 69 | exit(0) 70 | -------------------------------------------------------------------------------- /modin/setup-modin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | virtualenv 
modin/py-modin --python=python3 5 | source modin/py-modin/bin/activate 6 | 7 | # install binaries 8 | python3 -m pip install --upgrade modin[all] 9 | 10 | # # check 11 | # python3 12 | # import modin 13 | # modin.__version__ 14 | # quit() 15 | 16 | deactivate 17 | -------------------------------------------------------------------------------- /modin/sort-modin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | print("# sort-modin.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import modin as modin 9 | import modin.pandas as pd 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_x = os.environ['SRC_X_LOCAL'] 14 | 15 | ver = modin.__version__ 16 | git = modin.__git_revision__ 17 | task = "sort" 18 | question = "by int KEY" 19 | data_name = os.path.basename(src_x) 20 | solution = "modin" 21 | fun = ".sort" 22 | cache = "TRUE" 23 | 24 | print("loading dataset...") 25 | 26 | x = pd.read_csv(data_name) 27 | 28 | print("sorting...") 29 | 30 | gc.collect() 31 | t_start = timeit.default_timer() 32 | ans = x.sort_values('KEY') 33 | print(ans.shape) 34 | t = timeit.default_timer() - t_start 35 | m = memory_usage() 36 | t_start = timeit.default_timer() 37 | chk = [ans['X2'].sum()] 38 | chkt = timeit.default_timer() - t_start 39 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 40 | del ans 41 | 42 | gc.collect() 43 | t_start = timeit.default_timer() 44 | ans = x.sort_values('KEY') 45 | print(ans.shape) 46 | t = timeit.default_timer() - t_start 47 | m = memory_usage() 48 | t_start = timeit.default_timer() 49 | chk = [ans['X2'].sum()] 50 | chkt = timeit.default_timer() - t_start 51 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 52 | del ans 53 | 54 | gc.collect() 55 | t_start = timeit.default_timer() 56 | ans = x.sort_values('KEY') 57 | print(ans.shape) 58 | t = timeit.default_timer() - t_start 59 | m = memory_usage() 60 | t_start = timeit.default_timer() 61 | chk = [ans['X2'].sum()] 62 | chkt = timeit.default_timer() - t_start 63 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 64 | del ans 65 | 66 | exit(0) 67 | -------------------------------------------------------------------------------- /modin/upg-modin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading modin...'
5 | 6 | source ./modin/py-modin/bin/activate 7 | 8 | python -m pip install --upgrade modin[all] > /dev/null 9 | -------------------------------------------------------------------------------- /modin/ver-modin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./modin/py-modin/bin/activate 5 | python -c 'import modin as modin; open("modin/VERSION","w").write(modin.__version__); open("modin/REVISION","w").write("");' > /dev/null 6 | -------------------------------------------------------------------------------- /pandas/VERSION: -------------------------------------------------------------------------------- 1 | 2.2.2 -------------------------------------------------------------------------------- /pandas/read-pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# read-pandas.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import subprocess 9 | import pandas as pd 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_grp = os.environ['SRC_GRP_LOCAL'] 14 | 15 | ver = pd.__version__ 16 | git = "" 17 | task = "read" 18 | data_name = os.path.basename(src_grp) 19 | solution = "pandas" 20 | fun = "read_csv" 21 | cache = "TRUE" 22 | 23 | wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0] 24 | in_rows = int(wc_lines)-1 25 | 26 | print("reading...") 27 | 28 | question = "all rows" #1 29 | gc.collect() 30 | t_start = timeit.default_timer() 31 | ans = pd.read_csv(data_name) 32 | print(ans.shape) 33 | t = timeit.default_timer() - t_start 34 | m = memory_usage() 35 | t_start = timeit.default_timer() 36 | chk = [ans['v3'].sum()] 37 | chkt = timeit.default_timer() - t_start 38 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 39 | del ans 40 | gc.collect() 41 | t_start = timeit.default_timer() 42 | ans = pd.read_csv(data_name) 43 | print(ans.shape) 44 | t = timeit.default_timer() - t_start 45 | m = memory_usage() 46 | t_start = timeit.default_timer() 47 | chk = [ans['v3'].sum()] 48 | chkt = timeit.default_timer() - t_start 49 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 50 | del ans 51 | gc.collect() 52 | t_start = timeit.default_timer() 53 | ans = pd.read_csv(data_name) 54 | print(ans.shape) 55 | t = timeit.default_timer() - t_start 56 | m = memory_usage() 57 | t_start = timeit.default_timer() 58 | chk = [ans['v3'].sum()] 59 | chkt = timeit.default_timer() - t_start 60 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 61 | del ans 62 | 63 | question = "top 100 rows" #2 64 | gc.collect() 65 | t_start = timeit.default_timer() 66 | ans = pd.read_csv(data_name, nrows=100) 67 | print(ans.shape) 68 | t = timeit.default_timer() - t_start 69 | m = memory_usage() 70 | t_start = timeit.default_timer() 71 | chk = [ans['v3'].sum()] 72 | chkt = timeit.default_timer() - t_start 73 | 
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 74 | del ans 75 | gc.collect() 76 | t_start = timeit.default_timer() 77 | ans = pd.read_csv(data_name, nrows=100) 78 | print(ans.shape) 79 | t = timeit.default_timer() - t_start 80 | m = memory_usage() 81 | t_start = timeit.default_timer() 82 | chk = [ans['v3'].sum()] 83 | chkt = timeit.default_timer() - t_start 84 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 85 | del ans 86 | gc.collect() 87 | t_start = timeit.default_timer() 88 | ans = pd.read_csv(data_name, nrows=100) 89 | print(ans.shape) 90 | t = timeit.default_timer() - t_start 91 | m = memory_usage() 92 | t_start = timeit.default_timer() 93 | chk = [ans['v3'].sum()] 94 | chkt = timeit.default_timer() - t_start 95 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 96 | del ans 97 | 98 | exit(0) 99 | -------------------------------------------------------------------------------- /pandas/setup-pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install all dependencies 5 | # sudo apt-get update 6 | # sudo apt-get install build-essential python3-dev python3-pip 7 | 8 | virtualenv pandas/py-pandas --python=python3 9 | source pandas/py-pandas/bin/activate 10 | 11 | # install binaries 12 | python3 -m pip install --upgrade psutil 13 | python3 -m pip install --upgrade pandas 14 | python3 -m pip install --upgrade pyarrow 15 | 16 | deactivate 17 | 18 | ./pandas/ver-pandas.sh 19 | 20 | # # check 21 | # source pandas/py-pandas/bin/activate 22 | # python3 23 | # import pandas as pd 24 | # pd.__version__ 25 | # quit() 26 | # deactivate 27 | -------------------------------------------------------------------------------- /pandas/sort-pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# sort-pandas.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import pandas as pd 9 | #import pydoop.hdfs as hd 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_x = os.environ['SRC_X_LOCAL'] 14 | 15 | ver = pd.__version__ 16 | git = "" 17 | task = "sort" 18 | question = "by int KEY" 19 | data_name = os.path.basename(src_x) 20 | solution = "pandas" 21 | fun = ".sort" 22 | cache = "TRUE" 23 | 24 | print("loading dataset...") 25 | 26 | # with hd.open(src_x) as f: 27 | # x = pd.read_csv(f) 28 | x = pd.read_csv(data_name) 29 | 30 | print("sorting...") 31 | 32 | gc.collect() 33 | t_start = timeit.default_timer() 34 | ans = x.sort_values('KEY') 35 | print(ans.shape) 36 | t = timeit.default_timer() - t_start 37 | m = memory_usage() 38 | t_start = timeit.default_timer() 39 | chk = [ans['X2'].sum()] 40 | chkt = timeit.default_timer() - t_start 41 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, 
chk=make_chk(chk), chk_time_sec=chkt) 42 | del ans 43 | 44 | gc.collect() 45 | t_start = timeit.default_timer() 46 | ans = x.sort_values('KEY') 47 | print(ans.shape) 48 | t = timeit.default_timer() - t_start 49 | m = memory_usage() 50 | t_start = timeit.default_timer() 51 | chk = [ans['X2'].sum()] 52 | chkt = timeit.default_timer() - t_start 53 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 54 | del ans 55 | 56 | gc.collect() 57 | t_start = timeit.default_timer() 58 | ans = x.sort_values('KEY') 59 | print(ans.shape) 60 | t = timeit.default_timer() - t_start 61 | m = memory_usage() 62 | t_start = timeit.default_timer() 63 | chk = [ans['X2'].sum()] 64 | chkt = timeit.default_timer() - t_start 65 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 66 | del ans 67 | 68 | exit(0) 69 | -------------------------------------------------------------------------------- /pandas/upg-pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading pandas...' 5 | 6 | source ./pandas/py-pandas/bin/activate 7 | 8 | python3 -m pip install --upgrade pandas > /dev/null 9 | 10 | deactivate -------------------------------------------------------------------------------- /pandas/ver-pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./pandas/py-pandas/bin/activate 5 | python3 -c 'import pandas as pd; open("pandas/VERSION","w").write(pd.__version__); open("pandas/REVISION","w").write(pd.__git_version__);' > /dev/null 6 | deactivate-------------------------------------------------------------------------------- /path.env: -------------------------------------------------------------------------------- 1 | export JULIA_HOME=/opt/julia-1.9.2 2 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 3 | export PATH=$PATH:$JULIA_HOME/bin 4 | export MOUNT_POINT=/var/lib/mount 5 | export SPILL_DIR=/var/lib/mount/db-benchmark-metal/spill 6 | -------------------------------------------------------------------------------- /polars/VERSION: -------------------------------------------------------------------------------- 1 | 1.30.0 -------------------------------------------------------------------------------- /polars/monitor_ram.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import time 3 | import sys 4 | 5 | solution = str(sys.argv[1]) 6 | data_name = str(sys.argv[2]) 7 | pid_of_parent = int(sys.argv[3]) 8 | 9 | max_loops = 720 10 | file_name = f"{solution}-ram-{data_name}.txt" 11 | i = 0 12 | f = open(file_name, "w") 13 | f.close() 14 | while i < max_loops: 15 | # Get the currently available RAM and the RSS of the monitored process 16 | process = psutil.Process(pid_of_parent) 17 | rss_usage = process.memory_info().rss >> 30 18 | ram_usage = psutil.virtual_memory().available >> 30 19 | 20 | # Append the readings to the log file 21 | f = open(file_name, "a") 22 | f.write(f"RAM usage: {ram_usage} GB \n") 23 | f.write(f"RSS usage: {rss_usage} GB \n \n") 24 | f.close() 25 | 26 | # Wait for 5 seconds before polling again 27 | time.sleep(5) 28 | i += 1 29 |
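30 | # Usage sketch (an assumed invocation inferred from the sys.argv parsing above, not a documented interface): start the monitor in the background alongside a benchmark run, passing the solution name, the dataset name and the PID of the benchmark process, e.g. 31 | #   python3 polars/monitor_ram.py polars G1_1e7_1e2_0_0 12345 & 32 | # It then appends the available system RAM and that process's RSS to polars-ram-<data_name>.txt every 5 seconds, for at most max_loops iterations (about an hour).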
-------------------------------------------------------------------------------- /polars/setup-polars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install dependencies 5 | # sudo apt-get update -qq 6 | 7 | virtualenv polars/py-polars --python=python3 8 | source polars/py-polars/bin/activate 9 | 10 | python3 -m pip install --upgrade psutil polars numpy 11 | 12 | # build 13 | deactivate 14 | 15 | ./polars/upg-polars.sh 16 | 17 | ./polars/ver-polars.sh 18 | 19 | # check 20 | # source polars/py-polars/bin/activate 21 | # python3 22 | # import polars as pl 23 | # pl.__version__ 24 | # quit() 25 | # deactivate 26 | 27 | # fix: print(ans.head(3), flush=True): UnicodeEncodeError: 'ascii' codec can't encode characters in position 14-31: ordinal not in range(128) 28 | # vim polars/py-polars/bin/activate 29 | #deactivate () { 30 | # unset PYTHONIOENCODING 31 | # ... 32 | #} 33 | #... 34 | #PYTHONIOENCODING="utf-8" 35 | #export PYTHONIOENCODING 36 | #... 37 | -------------------------------------------------------------------------------- /polars/upg-polars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading polars...' 5 | 6 | source ./polars/py-polars/bin/activate 7 | 8 | python3 -m pip install --upgrade polars > /dev/null 9 | 10 | deactivate -------------------------------------------------------------------------------- /polars/ver-polars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./polars/py-polars/bin/activate 5 | python3 -c 'import polars as pl; open("polars/VERSION","w").write(pl.__version__); open("polars/REVISION","w").write("");' > /dev/null 6 | -------------------------------------------------------------------------------- /pydatatable/VERSION: -------------------------------------------------------------------------------- 1 | 1.2.0a0 -------------------------------------------------------------------------------- /pydatatable/convert-pydatatable-data.py: -------------------------------------------------------------------------------- 1 | print("pydatatable: converting 50GB join data") 2 | import os 3 | import datatable as dt 4 | 5 | if os.path.isfile('data/J1_1e9_NA_0_0.csv'): 6 | dt.fread('data/J1_1e9_NA_0_0.csv').to_jay('data/J1_1e9_NA_0_0.jay') 7 | if os.path.isfile('data/J1_1e9_1e9_0_0.csv'): 8 | dt.fread('data/J1_1e9_1e9_0_0.csv').to_jay('data/J1_1e9_1e9_0_0.jay') 9 | if os.path.isfile('data/J1_1e9_1e6_0_0.csv'): 10 | dt.fread('data/J1_1e9_1e6_0_0.csv').to_jay('data/J1_1e9_1e6_0_0.jay') 11 | if os.path.isfile('data/J1_1e9_1e3_0_0.csv'): 12 | dt.fread('data/J1_1e9_1e3_0_0.csv').to_jay('data/J1_1e9_1e3_0_0.jay') 13 | if os.path.isfile('data/J1_1e9_NA_0_1.csv'): 14 | dt.fread('data/J1_1e9_NA_0_1.csv').to_jay('data/J1_1e9_NA_0_1.jay') 15 | if os.path.isfile('data/J1_1e9_1e9_0_1.csv'): 16 | dt.fread('data/J1_1e9_1e9_0_1.csv').to_jay('data/J1_1e9_1e9_0_1.jay') 17 | if os.path.isfile('data/J1_1e9_1e6_0_1.csv'): 18 | dt.fread('data/J1_1e9_1e6_0_1.csv').to_jay('data/J1_1e9_1e6_0_1.jay') 19 | if os.path.isfile('data/J1_1e9_1e3_0_1.csv'): 20 | dt.fread('data/J1_1e9_1e3_0_1.csv').to_jay('data/J1_1e9_1e3_0_1.jay') 21 | if os.path.isfile('data/J1_1e9_NA_5_0.csv'): 22 | dt.fread('data/J1_1e9_NA_5_0.csv').to_jay('data/J1_1e9_NA_5_0.jay') 23 | if os.path.isfile('data/J1_1e9_1e9_5_0.csv'): 24 | 
dt.fread('data/J1_1e9_1e9_5_0.csv').to_jay('data/J1_1e9_1e9_5_0.jay') 25 | if os.path.isfile('data/J1_1e9_1e6_5_0.csv'): 26 | dt.fread('data/J1_1e9_1e6_5_0.csv').to_jay('data/J1_1e9_1e6_5_0.jay') 27 | if os.path.isfile('data/J1_1e9_1e3_5_0.csv'): 28 | dt.fread('data/J1_1e9_1e3_5_0.csv').to_jay('data/J1_1e9_1e3_5_0.jay') 29 | 30 | print("pydatatable: done converting 50GB join data") -------------------------------------------------------------------------------- /pydatatable/read-pydatatable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# read-pydatatable.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import subprocess 9 | import datatable as dt 10 | from datatable import f, sum 11 | 12 | exec(open("./helpers.py").read()) 13 | 14 | src_grp = os.environ['SRC_GRP_LOCAL'] 15 | 16 | ver = dt.__version__ 17 | git = dt.__git_revision__ 18 | task = "read" 19 | data_name = os.path.basename(src_grp) 20 | solution = "pydatatable" 21 | fun = "fread" 22 | cache = "TRUE" 23 | 24 | wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0] 25 | in_rows = int(wc_lines)-1 26 | 27 | print("reading...") 28 | 29 | question = "all rows" #1 30 | gc.collect() 31 | t_start = timeit.default_timer() 32 | ans = dt.fread(data_name, show_progress=False) 33 | print(ans.shape) 34 | t = timeit.default_timer() - t_start 35 | m = memory_usage() 36 | t_start = timeit.default_timer() 37 | chk = ans[:, sum(f.v3)] 38 | chkt = timeit.default_timer() - t_start 39 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 40 | del ans 41 | gc.collect() 42 | t_start = timeit.default_timer() 43 | ans = dt.fread(data_name, show_progress=False) 44 | print(ans.shape) 45 | t = timeit.default_timer() - t_start 46 | m = memory_usage() 47 | t_start = timeit.default_timer() 48 | chk = ans[:, sum(f.v3)] 49 | chkt = timeit.default_timer() - t_start 50 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 51 | del ans 52 | gc.collect() 53 | t_start = timeit.default_timer() 54 | ans = dt.fread(data_name, show_progress=False) 55 | print(ans.shape) 56 | t = timeit.default_timer() - t_start 57 | m = memory_usage() 58 | t_start = timeit.default_timer() 59 | chk = ans[:, sum(f.v3)] 60 | chkt = timeit.default_timer() - t_start 61 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 62 | del ans 63 | 64 | question = "top 100 rows" #2 65 | gc.collect() 66 | t_start = timeit.default_timer() 67 | ans = dt.fread(data_name, max_nrows=100, show_progress=False) 68 | print(ans.shape) 69 | t = timeit.default_timer() - t_start 70 | m = memory_usage() 71 | t_start = timeit.default_timer() 72 | chk = ans[:, sum(f.v3)] 73 | chkt = timeit.default_timer() - t_start 74 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], 
solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 75 | del ans 76 | gc.collect() 77 | t_start = timeit.default_timer() 78 | ans = dt.fread(data_name, max_nrows=100, show_progress=False) 79 | print(ans.shape) 80 | t = timeit.default_timer() - t_start 81 | m = memory_usage() 82 | t_start = timeit.default_timer() 83 | chk = ans[:, sum(f.v3)] 84 | chkt = timeit.default_timer() - t_start 85 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 86 | del ans 87 | gc.collect() 88 | t_start = timeit.default_timer() 89 | ans = dt.fread(data_name, max_nrows=100, show_progress=False) 90 | print(ans.shape) 91 | t = timeit.default_timer() - t_start 92 | m = memory_usage() 93 | t_start = timeit.default_timer() 94 | chk = ans[:, sum(f.v3)] 95 | chkt = timeit.default_timer() - t_start 96 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 97 | del ans 98 | 99 | exit(0) 100 | -------------------------------------------------------------------------------- /pydatatable/setup-pydatatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install dependencies 5 | virtualenv pydatatable/py-pydatatable --python=python3 6 | source pydatatable/py-pydatatable/bin/activate 7 | 8 | python -m pip install --upgrade psutil 9 | 10 | # # build 11 | deactivate 12 | ./pydatatable/upg-pydatatable.sh 13 | 14 | # # check 15 | # source pydatatable/py-pydatatable/bin/activate 16 | # python 17 | # import datatable as dt 18 | # dt.__version__ 19 | # quit() 20 | # deactivate 21 | 22 | # resave 1e9 join data from csv to jay format so pydt can try out-of-memory processing 23 | source pydatatable/py-pydatatable/bin/activate 24 | python3 pydatatable/convert-pydatatable-data.py 25 | 26 | ./pydatatable/ver-pydatatable.sh -------------------------------------------------------------------------------- /pydatatable/sort-pydatatable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# sort-pydatatable.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import datatable as dt 9 | from datatable import f, sum 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_x = os.environ['SRC_X_LOCAL'] 14 | 15 | ver = dt.__version__ 16 | git = dt.__git_revision__ 17 | task = "sort" 18 | question = "by int KEY" 19 | data_name = os.path.basename(src_x) 20 | solution = "pydatatable" 21 | fun = ".sort" 22 | cache = "TRUE" 23 | 24 | print("loading dataset...") 25 | 26 | x = dt.fread(data_name) 27 | 28 | print("sorting...") 29 | 30 | gc.collect() 31 | t_start = timeit.default_timer() 32 | ans = x.sort('KEY') 33 | print(ans.shape) 34 | t = timeit.default_timer() - t_start 35 | m = memory_usage() 36 | t_start = timeit.default_timer() 37 | chk = ans[:, sum(f.X2)] 38 | chkt = timeit.default_timer() - t_start 39 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, 
run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

exit(0)
--------------------------------------------------------------------------------
/pydatatable/upg-pydatatable.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

echo 'upgrading pydatatable...'

source ./pydatatable/py-pydatatable/bin/activate
python3 -m pip install --upgrade git+https://github.com/h2oai/datatable > /dev/null 2>&1
deactivate

echo 'done upgrading'
--------------------------------------------------------------------------------
/pydatatable/ver-pydatatable.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

source ./pydatatable/py-pydatatable/bin/activate
python3 -c 'import datatable as dt; open("pydatatable/VERSION","w").write(dt.__version__.split("+", 1)[0]); open("pydatatable/REVISION","w").write(dt.build_info.git_revision);' > /dev/null
--------------------------------------------------------------------------------
/run.conf:
--------------------------------------------------------------------------------
# tasks to run, used in init-setup-iteration.R
export RUN_TASKS="groupby join"
# solutions to run, used in init-setup-iteration.R
export RUN_SOLUTIONS="R-arrow collapse datafusion duckdb polars spark "

# flag to upgrade tools, used in run.sh on init
export DO_UPGRADE=false
# force run, ignore if the same version was run already
export FORCE_RUN=true
# do not run benchmarks, only print what would run and what would be skipped
export MOCKUP=false

# flag to build reports, used in run.sh before publish
export DO_REPORT=true
# flag to publish, used in run.sh before exit
export DO_PUBLISH=false

# logging and timing files
export CSV_LOGS_FILE="logs.csv"
export CSV_TIME_FILE="time.csv"
--------------------------------------------------------------------------------
/spark/VERSION:
--------------------------------------------------------------------------------
4.0.0
--------------------------------------------------------------------------------
/spark/setup-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

# install java (the JAVA_HOME exported below points at OpenJDK 17)
# sudo apt-get install openjdk-17-jdk

virtualenv spark/py-spark --python=python3

# add JAVA_HOME to path.env
echo 'export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64' >> path.env

source path.env

source spark/py-spark/bin/activate
# install binaries
python3 -m pip install --upgrade psutil
python3 -m pip install --upgrade pyspark

# check
# python3
# import pyspark
# pyspark.__version__
# quit()

deactivate

./spark/ver-spark.sh
--------------------------------------------------------------------------------
/spark/upg-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

echo 'upgrading spark...'

source ./spark/py-spark/bin/activate

python3 -m pip install --upgrade pyspark > /dev/null

deactivate
--------------------------------------------------------------------------------
/spark/ver-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

source ./spark/py-spark/bin/activate
python3 -c 'import pyspark; open("spark/VERSION","w").write(pyspark.__version__); open("spark/REVISION","w").write("");' > /dev/null
--------------------------------------------------------------------------------
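For orientation, below is a minimal sketch (not part of the repository) of how the run.conf settings shown above could be previewed from a shell before a benchmark run; the variable names and the run.conf file name come from run.conf itself, while the preview script is purely illustrative and hypothetical.

#!/bin/bash
# hypothetical helper: print what a benchmark invocation would pick up from run.conf
set -e
source ./run.conf
echo "tasks:         ${RUN_TASKS}"
echo "solutions:     ${RUN_SOLUTIONS}"
echo "upgrade tools: ${DO_UPGRADE}"
echo "force re-run:  ${FORCE_RUN}"
echo "mockup only:   ${MOCKUP}"
echo "build report:  ${DO_REPORT}"
echo "publish:       ${DO_PUBLISH}"
echo "timings file:  ${CSV_TIME_FILE}"
echo "logs file:     ${CSV_LOGS_FILE}"
if [ "${MOCKUP}" = "true" ]; then
  echo "MOCKUP is enabled: nothing would actually be benchmarked"
fi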