├── .github └── workflows │ ├── RunBenchmark.yml │ ├── regression.yml │ └── static.yml ├── .gitignore ├── LICENSE ├── R-arrow ├── VERSION ├── groupby-R-arrow.R ├── join-R-arrow.R ├── setup-R-arrow.sh ├── upg-R-arrow.sh └── ver-R-arrow.sh ├── README.md ├── _benchplot ├── benchplot-dict.R └── benchplot.R ├── _control ├── data.csv ├── data_large.csv ├── data_medium.csv ├── data_small.csv ├── nodenames.csv ├── questions.csv ├── skipped_benchmarks.csv ├── solutions.csv └── timeout.csv ├── _data ├── groupby-datagen.R ├── groupby2014-datagen.R └── join-datagen.R ├── _docs └── maintenance.md ├── _helpers ├── helpers.R ├── helpers.jl ├── helpers.py ├── helpers.sh └── helpersds.jl ├── _launcher ├── launch.R ├── launcher.R ├── setup.sh └── solution.R ├── _report ├── blah.R ├── ga.html ├── history.Rmd ├── index.Rmd ├── publish.sh ├── report.R └── tech.Rmd ├── _run ├── download_small_medium.sh ├── partitioned_run.sh ├── run_large.sh ├── run_medium.sh └── run_small.sh ├── _setup_utils ├── .DS_Store ├── install_all_solutions.py ├── mount.sh ├── mount_and_install_solutions.sh ├── prep_solutions.py ├── repro.sh ├── setup_small.sh └── sleep_and_run.sh ├── _utils ├── answers-validation.R ├── compare-data.table.R ├── download_data.sh ├── generate-data-small.sh ├── groupby_k_factor.csv ├── maintainer.R ├── maintainer.sh ├── parse_time_logs.R ├── partitioned_run.sh ├── sql_to_check_timings │ └── timing_checks.sql ├── time.R └── validate_no_errors.sh ├── arrow └── VERSION ├── clickhouse ├── VERSION ├── ch.sh ├── clickhouse-misc.sh ├── clickhouse-mount-config.xml ├── clickhouse-parse-log.R ├── exec.sh ├── groupby-clickhouse.sh ├── join-clickhouse.sh ├── setup-clickhouse.sh ├── upg-clickhouse.sh └── ver-clickhouse.sh ├── collapse ├── VERSION ├── groupby-collapse.R ├── groupby2014-collapse.R ├── join-collapse.R ├── setup-collapse.sh ├── upg-collapse.sh └── ver-collapse.sh ├── dask ├── VERSION ├── common.py ├── groupby_dask.py ├── join_dask.py ├── setup-dask.sh ├── upg-dask.sh └── ver-dask.sh ├── datafusion ├── VERSION ├── groupby-datafusion.py ├── join-datafusion.py ├── setup-datafusion.sh ├── upg-datafusion.sh └── ver-datafusion.sh ├── datatable ├── VERSION ├── groupby-datatable.R ├── groupby2014-datatable.R ├── join-datatable.R ├── read-datatable.R ├── setup-datatable.sh ├── sort-datatable.R ├── upg-datatable.sh └── ver-datatable.sh ├── dplyr ├── VERSION ├── groupby-dplyr.R ├── groupby2014-dplyr.R ├── join-dplyr.R ├── read-dplyr.R ├── setup-dplyr.sh ├── sort-dplyr.R ├── upg-dplyr.sh └── ver-dplyr.sh ├── duckdb-latest ├── VERSION ├── groupby-duckdb-latest.R ├── join-duckdb-latest.R ├── setup-duckdb-latest.sh ├── upg-duckdb-latest.sh └── ver-duckdb-latest.sh ├── duckdb ├── VERSION ├── groupby-duckdb.R ├── join-duckdb.R ├── setup-duckdb.sh ├── upg-duckdb.sh └── ver-duckdb.sh ├── h2o ├── exec.sh ├── groupby-h2o.R ├── h2o.sh ├── join-h2o.R ├── setup-h2o.sh ├── upg-h2o.sh └── ver-h2o.sh ├── juliadf ├── VERSION ├── exec.sh ├── groupby-juliadf.jl ├── join-juliadf.jl ├── setup-juliadf.sh ├── upg-juliadf.sh └── ver-juliadf.sh ├── juliads ├── VERSION ├── exec.sh ├── groupby-juliads.jl ├── join-juliads.jl ├── setup-juliads.sh ├── upg-juliads.sh └── ver-juliads.sh ├── logs.csv ├── modin ├── groupby-modin.py ├── join-modin.py ├── setup-modin.sh ├── sort-modin.py ├── upg-modin.sh └── ver-modin.sh ├── pandas ├── VERSION ├── groupby-pandas.py ├── groupby2014-pandas.py ├── join-pandas.py ├── read-pandas.py ├── setup-pandas.sh ├── sort-pandas.py ├── upg-pandas.sh └── ver-pandas.sh ├── path.env ├── polars ├── VERSION ├── 
groupby-polars.py ├── join-polars.py ├── monitor_ram.py ├── setup-polars.sh ├── upg-polars.sh └── ver-polars.sh ├── pydatatable ├── VERSION ├── convert-pydatatable-data.py ├── groupby-pydatatable.py ├── join-pydatatable.py ├── read-pydatatable.py ├── setup-pydatatable.sh ├── sort-pydatatable.py ├── upg-pydatatable.sh └── ver-pydatatable.sh ├── run.conf ├── run.sh ├── spark ├── VERSION ├── groupby-spark.py ├── join-spark.py ├── setup-spark.sh ├── upg-spark.sh └── ver-spark.sh └── time.csv /.github/workflows/RunBenchmark.yml: -------------------------------------------------------------------------------- 1 | name: Run benchmark 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | solutions: 6 | type: string 7 | instance_id: 8 | type: string 9 | include_clickhouse: 10 | type: boolean 11 | sizes: 12 | type: string 13 | 14 | concurrency: 15 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }} 16 | cancel-in-progress: true 17 | 18 | env: 19 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | gh_issue_repo: duckdblabs/db-benchmark 21 | instance_id: ${{ inputs.instance_id }} 22 | solutions: ${{ inputs.solutions }} 23 | include_clickhouse: ${{ inputs.include_clickhouse }} 24 | 25 | 26 | jobs: 27 | start-aws-machine: 28 | name: Start aws-small-machine 29 | runs-on: ubuntu-latest 30 | environment: aws-secrets 31 | steps: 32 | - name: Start EC2 runner 33 | shell: bash 34 | env: 35 | AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}} 36 | AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}} 37 | AWS_DEFAULT_REGION: us-east-1 38 | run: aws ec2 start-instances --instance-id ${{ env.instance_id }} 39 | 40 | - name: Create issue if failure 41 | shell: bash 42 | if: ${{ failure() && contains(github.ref_name, 'main') }} 43 | run: | 44 | gh issue create --repo ${{ env.gh_issue_repo }} --title "Could not start DB-benchmark machine" --body "AWS box with instance-id ${{ env.instance_id }} could not be started" 45 | 46 | run-benchmark: 47 | name: Regression Tests all solutions 48 | env: 49 | CC: gcc-10 50 | CXX: g++-10 51 | GEN: ninja 52 | runs-on: self-hosted 53 | environment: aws-secrets 54 | permissions: # Job-level permissions configuration starts here 55 | contents: write # 'write' access to repository contents 56 | pull-requests: write # 'write' access to pull requests 57 | steps: 58 | - uses: actions/checkout@v4 59 | 60 | - name: run mount 61 | shell: bash 62 | run: | 63 | ./_setup_utils/mount.sh 64 | 65 | - name: Install or Upgrade all solutions 66 | shell: bash 67 | working-directory: /var/lib/mount/db-benchmark-metal 68 | run: | 69 | python3 _setup_utils/install_all_solutions.py ${{ env.solutions }} 70 | if [ "${{ env.include_clickhouse }}" = "true" ]; then 71 | # installing/updating clickhouse needs sudo privileges 72 | sudo python3 _setup_utils/install_all_solutions.py clickhouse 73 | fi 74 | 75 | - name: Modify run.conf to only have new versions 76 | shell: bash 77 | working-directory: /var/lib/mount/db-benchmark-metal 78 | run: | 79 | git diff --name-only **/VERSION > updated_solutions.txt 80 | cat updated_solutions.txt 81 | export new_solutions="${{ env.solutions }}" 82 | echo "testing solutions: " $new_solutions 83 | sed -i "s/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"${new_solutions}\"/g" run.conf 84 | 85 | - name: Run the benchmark 86 | shell: bash 87 | working-directory: /var/lib/mount/db-benchmark-metal 88 | env: 89 | DO_REPORT: 1 90 | DO_PUBLISH: 0 91 | run: | 92 | ncores=`python3 -c 'import
multiprocessing as mp; print(mp.cpu_count())'` 93 | if [ $ncores -eq 16 ]; then export MACHINE_TYPE="c6id.4xlarge"; fi 94 | if [ $ncores -eq 32 ]; then export MACHINE_TYPE="c6id.8xlarge"; fi 95 | if [ $ncores -eq 128 ]; then export MACHINE_TYPE="c6id.metal"; fi 96 | if [[ "${{ inputs.sizes }}" == *"small"* ]]; then 97 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_small.sh 98 | fi 99 | if [[ "${{ inputs.sizes }}" == *"medium"* ]]; then 100 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_medium.sh 101 | fi 102 | if [[ "${{ inputs.sizes }}" == *"large"* ]]; then 103 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_large.sh 104 | fi 105 | 106 | - name: name new branch 107 | shell: bash 108 | run: | 109 | echo "new_branch_name=results-`date +%Y-%m-%d-%Hh%Mm`" >> $GITHUB_ENV 110 | echo ${{ env.new_branch_name }} 111 | 112 | - name: Commit updates 113 | shell: bash 114 | working-directory: /var/lib/mount/db-benchmark-metal 115 | run: | 116 | git config --global user.email "" 117 | git config --global user.name "Run Benchmark action" 118 | git remote add upstream git@github.com:duckdblabs/db-benchmark 119 | git fetch upstream 120 | git switch -c ${{ env.new_branch_name }} 121 | git add time.csv logs.csv **/VERSION 122 | git add run.conf 123 | git commit -m "new results" 124 | git push upstream ${{ env.new_branch_name }} 125 | 126 | - name: Create Archive 127 | if: always() 128 | shell: bash 129 | working-directory: /var/lib/mount/db-benchmark-metal 130 | run: | 131 | mkdir -p out 132 | echo "guarantee not empty dir" > out/guarantee.txt 133 | zip -r out-dir.zip out/ public/ 134 | 135 | - uses: actions/upload-artifact@v4 136 | if: always() 137 | with: 138 | name: out-dir.zip 139 | path: /var/lib/mount/db-benchmark-metal/out-dir.zip 140 | if-no-files-found: error 141 | 142 | shutdown: 143 | name: shut down 144 | environment: aws-secrets 145 | if: always() 146 | runs-on: ubuntu-latest 147 | needs: 148 | - start-aws-machine 149 | - run-benchmark 150 | 151 | steps: 152 | - name: shutdown 153 | shell: bash 154 | env: 155 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 156 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 157 | AWS_DEFAULT_REGION: us-east-1 158 | run: aws ec2 stop-instances --instance-id ${{ env.instance_id }} 159 | 160 | -------------------------------------------------------------------------------- /.github/workflows/regression.yml: -------------------------------------------------------------------------------- 1 | name: Regression 2 | on: 3 | workflow_dispatch: 4 | repository_dispatch: 5 | pull_request: 6 | paths-ignore: 7 | - '**.md' 8 | - '.github/workflows/**' 9 | - '!.github/workflows/regression.yml' 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | regression-test-benchmark-runner-solo-solutions: 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, datafusion, dask, clickhouse] 21 | name: Solo solutions 22 | runs-on: ubuntu-latest 23 | env: 24 | CC: gcc-10 25 | CXX: g++-10 26 | GEN: ninja 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | with: 31 | fetch-depth: 0 32 | 33 | - uses: actions/setup-python@v4 34 | with: 35 | python-version: '3.10' 36 | 37 | - name: Install libraries 38 | shell: bash 39 | run: ./_setup_utils/setup_small.sh 40 | 41 | - name: Generate 500mb datasets 42 | shell: bash 43 | run:
./_utils/generate-data-small.sh 44 | 45 | - name: Remove old logs 46 | shell: bash 47 | run: rm time.csv logs.csv 48 | 49 | - name: Install all solutions 50 | shell: bash 51 | run: source path.env && python3 _setup_utils/install_all_solutions.py ${{ matrix.solution }} 52 | 53 | - name: Turn swap off 54 | shell: bash 55 | run: sudo swapoff -a 56 | 57 | # needed because clickhouse for some reason produces an error the first 58 | # time a benchmark is run. The next benchmark run will work and overwrite the 59 | # old benchmark files. 60 | - name: Run mini GroupBy benchmark if clickhouse 61 | shell: bash 62 | if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }} 63 | run: | 64 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=clickhouse 65 | source path.env 66 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 67 | sleep 60 68 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 69 | sleep 60 70 | 71 | - name: Run mini GroupBy benchmark 72 | shell: bash 73 | run: | 74 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }} 75 | source path.env 76 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 77 | sleep 60 78 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 79 | sleep 60 80 | 81 | - name: Run mini Join benchmark 82 | shell: bash 83 | run: | 84 | python3 _setup_utils/prep_solutions.py --task=join --solution=${{ matrix.solution }} 85 | source path.env 86 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 87 | sleep 60 88 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 89 | sleep 60 90 | 91 | - name: Validate benchmark results and report generation 92 | shell: bash 93 | run: ./_utils/validate_no_errors.sh 94 | 95 | - name: Create Archive 96 | if: always() 97 | shell: bash 98 | run: | 99 | cp *.csv out/ 100 | zip -r ${{ matrix.solution }}-out.zip out/ 101 | 102 | # include this step to see what the latest versions are of every solution 103 | - name: Print latest versions 104 | if: always() 105 | shell: bash 106 | run: tail -n +1 */VERSION 107 | 108 | - uses: actions/upload-artifact@v4 109 | if: always() 110 | with: 111 | name: ${{ matrix.solution }}-out.zip 112 | path: ${{ matrix.solution }}-out.zip 113 | if-no-files-found: error 114 | 115 | regression-test-benchmark-runner-all-solutions: 116 | needs: regression-test-benchmark-runner-solo-solutions 117 | name: Regression Tests all solutions 118 | runs-on: ubuntu-20.04 119 | env: 120 | CC: gcc-10 121 | CXX: g++-10 122 | GEN: ninja 123 | 124 | steps: 125 | - uses: actions/checkout@v3 126 | with: 127 | fetch-depth: 0 128 | 129 | - uses: actions/setup-python@v4 130 | with: 131 | python-version: '3.10' 132 | 133 | - name: Install libraries 134 | shell: bash 135 | run: ./_setup_utils/setup_small.sh 136 | 137 | - name: Generate 500mb datasets 138 | shell: bash 139 | run: ./_utils/generate-data-small.sh 140 | 141 | - name: Remove old logs 142 | shell: bash 143 | run: rm time.csv logs.csv 144 | 145 | - name: Install all solutions 146 | shell: bash 147 | run: source path.env && python3 _setup_utils/install_all_solutions.py all 148 | 149 | - name: Turn swap off 150 | shell: bash 151 | run: sudo swapoff -a 152 | 153 | - name: Run mini GroupBy benchmark 154 | shell: bash 155 | run: | 156 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=all 157 | source path.env 158 | 
MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 159 | sleep 60 160 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 161 | 162 | - name: Run mini Join benchmark 163 | shell: bash 164 | run: | 165 | python3 _setup_utils/prep_solutions.py --task=join --solution=all 166 | source path.env 167 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 168 | sleep 60 169 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh 170 | 171 | - name: Validate benchmark results and report generation 172 | shell: bash 173 | run: ./_utils/validate_no_errors.sh 174 | 175 | - name: Create Archive 176 | if: always() 177 | shell: bash 178 | run: | 179 | cp *.csv out/ 180 | zip -r all-out.zip out/ 181 | 182 | # include this step to see what the latest versions are of every solution 183 | - name: Print latest versions 184 | if: always() 185 | shell: bash 186 | run: tail -n +1 */VERSION 187 | 188 | - uses: actions/upload-artifact@v4 189 | if: always() 190 | with: 191 | name: all-out.zip 192 | path: all-out.zip 193 | if-no-files-found: error 194 | 195 | -------------------------------------------------------------------------------- /.github/workflows/static.yml: -------------------------------------------------------------------------------- 1 | # Simple workflow for deploying static content to GitHub Pages 2 | name: Deploy static content to Pages 3 | 4 | on: 5 | # Runs on pushes targeting the default branch 6 | push: 7 | branches: ["gh-pages"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow one concurrent deployment 19 | concurrency: 20 | group: "pages" 21 | cancel-in-progress: true 22 | 23 | jobs: 24 | # Single deploy job since we're just deploying 25 | deploy: 26 | environment: 27 | name: github-pages 28 | url: ${{ steps.deployment.outputs.page_url }} 29 | runs-on: ubuntu-latest 30 | steps: 31 | - name: Checkout 32 | uses: actions/checkout@v3 33 | - name: Setup Pages 34 | uses: actions/configure-pages@v2 35 | - name: Upload artifact 36 | uses: actions/upload-pages-artifact@v1 37 | with: 38 | # Upload entire repository 39 | path: '.' 
40 | - name: Deploy to GitHub Pages 41 | id: deployment 42 | uses: actions/deploy-pages@v1 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | figure/* 2 | metastore_db/* 3 | *.log 4 | *.html 5 | *.csv 6 | !time.csv 7 | !logs.csv 8 | !_control/data_small.csv 9 | !_control/data_large.csv 10 | *.md5 11 | .Rproj.user 12 | .Rhistory 13 | db-benchmark.Rproj 14 | */REVISION 15 | token 16 | .token 17 | public/ 18 | out/ 19 | data/ 20 | clickhouse/log/ 21 | clickhouse/*-clickhouse.sql 22 | clickhouse/unused/ 23 | */log/ 24 | tmp/ 25 | dask-worker-space/ 26 | GA/ 27 | utils/ 28 | */py-*/ 29 | */r-*/ 30 | duckdb-latest/duckdb 31 | report-done 32 | db-benchmark.gh-pages/ 33 | run.out 34 | clickhouse/etc_sudoers.bak 35 | workdir/ 36 | timeout-exit-codes.out 37 | */target 38 | *.lock 39 | -------------------------------------------------------------------------------- /R-arrow/VERSION: -------------------------------------------------------------------------------- 1 | 20.0.0.2 2 | -------------------------------------------------------------------------------- /R-arrow/setup-R-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable arrow 5 | mkdir -p ./R-arrow/r-arrow 6 | Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")' 7 | 8 | ./R-arrow/ver-R-arrow.sh 9 | -------------------------------------------------------------------------------- /R-arrow/upg-R-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade all packages in arrow library only if new arrow is out 5 | echo 'upgrading arrow...' 
6 | Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /R-arrow/ver-R-arrow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /_control/data.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1 3 | groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1 4 | groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1 5 | groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1 6 | groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1 7 | groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1 8 | groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1 9 | groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1 10 | groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1 11 | groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1 12 | groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1 13 | groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1 14 | groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1 15 | groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1 16 | groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1 17 | join,J1_1e7_NA_0_0,1e7,NA,0,0,1 18 | join,J1_1e7_NA_5_0,1e7,NA,5,0,1 19 | join,J1_1e7_NA_0_1,1e7,NA,0,1,1 20 | join,J1_1e8_NA_0_0,1e8,NA,0,0,1 21 | join,J1_1e8_NA_5_0,1e8,NA,5,0,1 22 | join,J1_1e8_NA_0_1,1e8,NA,0,1,1 23 | join,J1_1e9_NA_0_0,1e9,NA,0,0,1 -------------------------------------------------------------------------------- /_control/data_large.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1 3 | groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1 4 | groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1 5 | groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1 6 | groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1 7 | join,J1_1e9_NA_0_0,1e9,NA,0,0,1 -------------------------------------------------------------------------------- /_control/data_medium.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1 3 | groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1 4 | groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1 5 | groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1 6 | groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1 7 | join,J1_1e8_NA_0_0,1e8,NA,0,0,1 8 | join,J1_1e8_NA_5_0,1e8,NA,5,0,1 9 | join,J1_1e8_NA_0_1,1e8,NA,0,1,1 -------------------------------------------------------------------------------- /_control/data_small.csv: -------------------------------------------------------------------------------- 1 | task,data,nrow,k,na,sort,active 2 | groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1 3 | groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1 4 | groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1 5 | groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1 6 | groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1 7 | join,J1_1e7_NA_0_0,1e7,NA,0,0,1 8 | join,J1_1e7_NA_5_0,1e7,NA,5,0,1 9 | join,J1_1e7_NA_0_1,1e7,NA,0,1,1 -------------------------------------------------------------------------------- /_control/nodenames.csv: 
-------------------------------------------------------------------------------- 1 | nodename,cpu_model,cpu_cores,memory_model,memory_gb,gpu_model,gpu_num,gpu_gb 2 | mr-0xc11,Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz,20,DIMM DDR4 Synchronous 2133 MHz,125.80,,, 3 | mr-dl11,Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz,40,DIMM Synchronous 2133 MHz,125.78,GeForce GTX 1080 Ti,2,21.83 4 | m4.10xlarge,Intel(R) Xeon(R) CPU E5-2676 v3 @ 2.40GHz,40,unkown,157,None,None,None 5 | c6id.metal,Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz,128,NVMe SSD,250,None,None,None -------------------------------------------------------------------------------- /_control/questions.csv: -------------------------------------------------------------------------------- 1 | task,question,question_group 2 | groupby,sum v1 by id1,basic 3 | groupby,sum v1 by id1:id2,basic 4 | groupby,sum v1 mean v3 by id3,basic 5 | groupby,mean v1:v3 by id4,basic 6 | groupby,sum v1:v3 by id6,basic 7 | groupby,median v3 sd v3 by id4 id5,advanced 8 | groupby,max v1 - min v2 by id3,advanced 9 | groupby,largest two v3 by id6,advanced 10 | groupby,regression v1 v2 by id2 id4,advanced 11 | groupby,sum v3 count by id1:id6,advanced 12 | join,small inner on int,basic 13 | join,medium inner on int,basic 14 | join,medium outer on int,basic 15 | join,medium inner on factor,basic 16 | join,big inner on int,basic 17 | groupby2014,sum v1 by id1,basic 18 | groupby2014,sum v1 by id1:id2,basic 19 | groupby2014,sum v1 mean v3 by id3,basic 20 | groupby2014,mean v1:v3 by id4,basic 21 | groupby2014,sum v1:v3 by id6,basic 22 | -------------------------------------------------------------------------------- /_control/skipped_benchmarks.csv: -------------------------------------------------------------------------------- 1 | solution,task,data,machine_type 2 | juliads,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 3 | juliads,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 4 | juliads,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 5 | juliads,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 6 | juliads,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 7 | juliadf,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 8 | juliadf,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 9 | juliadf,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 10 | juliadf,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 11 | juliadf,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 12 | R-arrow,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 13 | R-arrow,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 14 | R-arrow,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 15 | R-arrow,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 16 | R-arrow,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 17 | dplyr,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 18 | dplyr,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 19 | dplyr,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 20 | dplyr,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 21 | dplyr,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 22 | pandas,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 23 | pandas,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 24 | pandas,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 25 | pandas,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 26 | pandas,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 27 | pydatatable,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 28 | pydatatable,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 29 | pydatatable,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 30 | pydatatable,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 31 | pydatatable,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 32 | spark,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 33 | spark,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 34 | spark,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 35 | spark,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 36 | spark,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 37 | 
datafusion,groupby,G1_1e9_1e2_0_0,c6id.4xlarge 38 | datafusion,groupby,G1_1e9_1e1_0_0,c6id.4xlarge 39 | datafusion,groupby,G1_1e9_2e0_0_0,c6id.4xlarge 40 | datafusion,groupby,G1_1e9_1e2_0_1,c6id.4xlarge 41 | datafusion,groupby,G1_1e9_1e2_5_0,c6id.4xlarge 42 | datafusion,join,J1_1e8_NA_0_0,c6id.4xlarge 43 | datafusion,join,J1_1e8_NA_5_0,c6id.4xlarge 44 | datafusion,join,J1_1e8_NA_0_1,c6id.4xlarge 45 | datafusion,join,J1_1e9_NA_0_0,c6id.4xlarge 46 | R-arrow,join,J1_1e9_NA_0_0,c6id.4xlarge 47 | dask,join,J1_1e9_NA_0_0,c6id.4xlarge 48 | datatable,join,J1_1e9_NA_0_0,c6id.4xlarge 49 | juliadf,join,J1_1e9_NA_0_0,c6id.4xlarge 50 | juliads,join,J1_1e9_NA_0_0,c6id.4xlarge 51 | pandas,join,J1_1e9_NA_0_0,c6id.4xlarge 52 | collapse,join,J1_1e9_NA_0_0,c6id.4xlarge 53 | polars,join,J1_1e9_NA_0_0,c6id.4xlarge 54 | pydatatable,join,J1_1e9_NA_0_0,c6id.4xlarge 55 | spark,join,J1_1e9_NA_0_0,c6id.4xlarge 56 | clickhouse,join,J1_1e9_NA_0_0,c6id.4xlarge 57 | 58 | -------------------------------------------------------------------------------- /_control/solutions.csv: -------------------------------------------------------------------------------- 1 | solution,task 2 | collapse,groupby 3 | collapse,groupby2014 4 | collapse,join 5 | data.table,groupby 6 | data.table,join 7 | data.table,groupby2014 8 | dplyr,groupby 9 | dplyr,join 10 | dplyr,groupby2014 11 | pandas,groupby 12 | pandas,join 13 | pandas,groupby2014 14 | pydatatable,groupby 15 | pydatatable,join 16 | spark,groupby 17 | spark,join 18 | dask,groupby 19 | dask,join 20 | juliadf,groupby 21 | juliadf,join 22 | juliads,groupby 23 | juliads,join 24 | clickhouse,groupby 25 | clickhouse,join 26 | polars,groupby 27 | polars,join 28 | R-arrow,groupby 29 | R-arrow,join 30 | duckdb,groupby 31 | duckdb,join 32 | duckdb-latest,groupby 33 | duckdb-latest,join 34 | datafusion,groupby 35 | datafusion,join 36 | -------------------------------------------------------------------------------- /_control/timeout.csv: -------------------------------------------------------------------------------- 1 | task,in_rows,minutes 2 | groupby,1e7,10 3 | groupby,1e8,30 4 | groupby,1e9,60 5 | join,1e7,10 6 | join,1e8,30 7 | join,1e9,60 8 | groupby2014,1e7,60 9 | groupby2014,1e8,120 10 | groupby2014,1e9,180 11 | -------------------------------------------------------------------------------- /_data/groupby-datagen.R: -------------------------------------------------------------------------------- 1 | # Rscript groupby-datagen.R 1e7 1e2 0 0 ## 1e7 rows, 1e2 K, 0% NAs, random order 2 | # Rscript groupby-datagen.R 1e8 1e1 5 1 ## 1e8 rows, 10 K, 5% NAs, sorted order 3 | args = commandArgs(TRUE) 4 | 5 | pretty_sci = function(x) { 6 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 7 | if(length(tmp)==1L) { 8 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 9 | } else if(length(tmp)==2L){ 10 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 11 | } 12 | } 13 | 14 | library(data.table) 15 | N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) 16 | stopifnot(nas<=100L, nas>=0L, sort%in%c(0L,1L)) 17 | set.seed(108) 18 | cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort)) 19 | DT = list() 20 | DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char) 21 | DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # small groups (char) 22 | DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # large groups (char) 23 | DT[["id4"]] = sample(K, N, 
TRUE) # large groups (int) 24 | DT[["id5"]] = sample(K, N, TRUE) # small groups (int) 25 | DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int) 26 | DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5] 27 | DT[["v2"]] = sample(15, N, TRUE) # int in range [1,15] 28 | DT[["v3"]] = round(runif(N,max=100),6) # numeric e.g. 23.574912 29 | setDT(DT) 30 | if (nas>0L) { 31 | cat("Inputting NAs\n") 32 | for (col in paste0("id",1:6)) { 33 | ucol = unique(DT[[col]]) 34 | nna = as.integer(length(ucol) * (nas/100)) 35 | if (nna) 36 | set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) 37 | rm(ucol) 38 | } 39 | nna = as.integer(nrow(DT) * (nas/100)) 40 | if (nna) { 41 | for (col in paste0("v",1:3)) 42 | set(DT, sample(nrow(DT), nna), col, NA) 43 | } 44 | } 45 | if (sort==1L) { 46 | cat("Sorting data\n") 47 | setkeyv(DT, paste0("id", 1:6)) 48 | } 49 | file = sprintf("G1_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort) 50 | cat(sprintf("Writing data to %s\n", file)) 51 | fwrite(DT, file) 52 | cat(sprintf("Data written to %s, quitting\n", file)) 53 | if (!interactive()) quit("no", status=0) 54 | -------------------------------------------------------------------------------- /_data/groupby2014-datagen.R: -------------------------------------------------------------------------------- 1 | args = commandArgs(TRUE) 2 | 3 | pretty_sci = function(x) { 4 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 5 | if(length(tmp)==1L) { 6 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 7 | } else if(length(tmp)==2L){ 8 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 9 | } 10 | } 11 | 12 | library(data.table) 13 | N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) 14 | stopifnot(K==1e2L, nas==0L, sort==0L) ## 2014's setup 15 | set.seed(108) 16 | cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort)) 17 | DT = list() 18 | DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char) 19 | DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # small groups (char) 20 | DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # large groups (char) 21 | DT[["id4"]] = sample(K, N, TRUE) # large groups (int) 22 | DT[["id5"]] = sample(K, N, TRUE) # small groups (int) 23 | DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int) 24 | DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5] 25 | DT[["v2"]] = sample(5, N, TRUE) # int in range [1,5] 26 | DT[["v3"]] = sample(round(runif(100,max=100),4), N, TRUE)# numeric e.g. 
23.5749 27 | setDT(DT) 28 | if (nas>0L) { 29 | cat("Inputting NAs\n") 30 | for (col in paste0("id",1:6)) { 31 | ucol = unique(DT[[col]]) 32 | nna = as.integer(length(ucol) * (nas/100)) 33 | if (nna) 34 | set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) 35 | rm(ucol) 36 | } 37 | nna = as.integer(nrow(DT) * (nas/100)) 38 | if (nna) { 39 | for (col in paste0("v",1:3)) 40 | set(DT, sample(nrow(DT), nna), col, NA) 41 | } 42 | } 43 | if (sort==1L) { 44 | cat("Sorting data\n") 45 | setkeyv(DT, paste0("id", 1:6)) 46 | } 47 | file = sprintf("G0_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort) 48 | cat(sprintf("Writing data to %s\n", file)) 49 | fwrite(DT, file) 50 | cat(sprintf("Data written to %s, quitting\n", file)) 51 | if (!interactive()) quit("no", status=0) 52 | -------------------------------------------------------------------------------- /_data/join-datagen.R: -------------------------------------------------------------------------------- 1 | # Rscript join-datagen.R 1e7 0 0 0 ## 1e7 rows, 0 ignored, 0% NAs, random order 2 | # Rscript join-datagen.R 1e8 0 5 1 ## 1e8 rows, 0 ignored, 5% NAs, sorted order 3 | 4 | # see h2oai/db-benchmark#106 for a design notes of this procedure, feedback welcome in the issue 5 | 6 | # init ---- 7 | 8 | init = proc.time()[["elapsed"]] 9 | args = commandArgs(TRUE) 10 | N=as.numeric(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L]) 11 | stopifnot(N>=1e7, nas<=100L, nas>=0L, sort%in%c(0L,1L)) 12 | if (N > .Machine$integer.max) stop("no support for long vector in join-datagen yet") 13 | N = as.integer(N) 14 | 15 | # helper functions ---- 16 | 17 | # pretty print big numbers as 1e9, 1e8, etc 18 | pretty_sci = function(x) { 19 | stopifnot(length(x)==1L, !is.na(x)) 20 | tmp = strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 21 | if (length(tmp)==1L) { 22 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 23 | } else if (length(tmp)==2L) { 24 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 25 | } 26 | } 27 | # data_name of table to join 28 | join_to_tbls = function(data_name) { 29 | x_n = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L]) 30 | y_n = setNames(x_n/c(1e6, 1e3, 1e0), c("small","medium","big")) 31 | sapply(sapply(y_n, pretty_sci), gsub, pattern="NA", x=data_name) 32 | } 33 | # sample ensuring none is missing 34 | sample_all = function(x, size) { 35 | stopifnot(length(x) <= size) 36 | y = c(x, sample(x, size=max(size-length(x), 0), replace=TRUE)) 37 | sample(y) 38 | } 39 | # split into common (0.9) left (0.1) and right (0.1) 40 | split_xlr = function(n) { 41 | key = sample.int(n*1.1) # 1.1 = 0.9+0.1+0.1 42 | list( 43 | x = key[seq.int(1, n*0.9)], 44 | l = key[seq.int(n*0.9+1, n)], 45 | r = key[seq.int(n+1, n*1.1)] 46 | ) 47 | } 48 | # check if data name is LHS and has NAs 49 | lhs_nas = function(data_name) { 50 | tmp = strsplit(data_name, "_", fixed=TRUE)[[1L]] 51 | if (!identical(tmp[3L], "NA")) 52 | return(FALSE) ## RHS 53 | as.integer(tmp[4L])>0L ## NAs 54 | } 55 | # NA aware sprintf for single argument only 56 | sprintfId = function(fmt, id) { 57 | x = rep(NA_character_, length(id)) 58 | idx = !is.na(id) 59 | x[idx] = sprintf("id%.0f", id[idx]) 60 | x 61 | } 62 | # we need to write in batches to reduce memory footprint 63 | write_batches = function(d, name, append) { 64 | cols = names(d) 65 | if (lhs_nas(name)) sprintf = sprintfId 66 | if ("id1" %in% cols) set(d, NULL, "id4", sprintf("id%.0f", d$id1)) 67 | if ("id2" %in% cols) set(d, NULL, "id5", sprintf("id%.0f", d$id2)) 68 | 
if ("id3" %in% cols) set(d, NULL, "id6", sprintf("id%.0f", d$id3)) 69 | setcolorder(d, neworder=setdiff(names(d), c("v1","v2"))) 70 | fwrite(d, paste0(name, ".csv"), showProgress=FALSE, append=append) 71 | } 72 | handle_batches = function(d, data_name) { 73 | N = nrow(d) 74 | if (N > 1e8) { 75 | stopifnot(N==1e9) 76 | for (i in 1:10) { 77 | cat(sprintf("Writing %s data batch %s\n", pretty_sci(N), i)) 78 | write_batches(d[((i-1)*1e8+1L):(i*1e8)], data_name, append=i>1L) 79 | } 80 | } else { 81 | write_batches(d, data_name, append=FALSE) 82 | } 83 | } 84 | 85 | # exec ---- 86 | 87 | library(data.table) 88 | setDTthreads(0L) 89 | set.seed(108) 90 | data_name = sprintf("J1_%s_%s_%s_%s", pretty_sci(N), "NA", nas, sort) 91 | 92 | cat(sprintf("Generate join data of %s rows\n", pretty_sci(N))) 93 | 94 | cat("Producing keys for LHS and RHS data\n") 95 | key1 = split_xlr(N/1e6) 96 | key2 = split_xlr(N/1e3) 97 | key3 = split_xlr(N) 98 | 99 | cat(sprintf("Producing LHS %s data from keys\n", pretty_sci(N))) 100 | lhs = c("x","l") 101 | l = list( 102 | id1 = sample_all(unlist(key1[lhs], use.names=FALSE), N), 103 | id2 = sample_all(unlist(key2[lhs], use.names=FALSE), N), 104 | id3 = sample_all(unlist(key3[lhs], use.names=FALSE), N) 105 | ) 106 | setDT(l) 107 | if (sort==1L) { 108 | cat("Sorting LHS data\n") 109 | setkeyv(l, c("id1","id2","id3")) 110 | } 111 | set(l, NULL, "v1", round(runif(nrow(l), max=100), 6)) 112 | stopifnot( 113 | uniqueN(l, by="id1")==N/1e6, 114 | uniqueN(l, by="id2")==N/1e3, 115 | uniqueN(l, by="id3")==N 116 | ) 117 | if (nas>0L) { 118 | cat("Inputting NAs in LHS data\n") 119 | for (col in paste0("id",1:3)) { 120 | ucol = unique(l[[col]]) 121 | nna = as.integer(length(ucol) * (nas/100)) 122 | if (nna) 123 | set(l, l[.(sample(ucol, nna)), on=col, which=TRUE], col, NA) 124 | rm(ucol) 125 | } 126 | nna = as.integer(nrow(l) * (nas/100)) 127 | if (nna) 128 | set(l, sample(nrow(l), nna), "v1", NA) 129 | } 130 | cat(sprintf("Writing LHS %s data %s\n", pretty_sci(N), data_name)) 131 | handle_batches(l, data_name) 132 | rm(l) 133 | 134 | rhs = c("x","r") 135 | r_data_name = join_to_tbls(data_name) 136 | n = N/1e6 137 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n))) 138 | r1 = list( 139 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n) 140 | ) 141 | setDT(r1) 142 | if (sort==1L) { 143 | cat("Sorting RHS small data\n") 144 | setkeyv(r1, "id1") 145 | } 146 | set(r1, NULL, "v2", round(runif(nrow(r1), max=100), 6)) 147 | stopifnot(uniqueN(r1, by="id1")==n) 148 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[1L])) 149 | handle_batches(r1, r_data_name[1L]) 150 | rm(r1) 151 | n = N/1e3 152 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n))) 153 | r2 = list( 154 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n), 155 | id2 = sample_all(unlist(key2[rhs], use.names=FALSE), n) 156 | ) 157 | setDT(r2) 158 | if (sort==1L) { 159 | cat("Sorting RHS medium data\n") 160 | setkeyv(r2, "id2") 161 | } 162 | set(r2, NULL, "v2", round(runif(nrow(r2), max=100), 6)) 163 | stopifnot(uniqueN(r2, by="id2")==n) 164 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[2L])) 165 | handle_batches(r2, r_data_name[2L]) 166 | rm(r2) 167 | n = N 168 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n))) 169 | r3 = list( 170 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n), 171 | id2 = sample_all(unlist(key2[rhs], use.names=FALSE), n), 172 | id3 = sample_all(unlist(key3[rhs], use.names=FALSE), n) 173 | ) 174 | rm(key1, 
key2, key3) 175 | setDT(r3) 176 | if (sort==1L) { 177 | cat("Sorting RHS big data\n") 178 | setkeyv(r3, "id3") 179 | } 180 | set(r3, NULL, "v2", round(runif(nrow(r3), max=100), 6)) 181 | stopifnot(uniqueN(r3, by="id3")==n) 182 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[3L])) 183 | handle_batches(r3, r_data_name[3L]) 184 | rm(r3) 185 | 186 | cat(sprintf("Join datagen of %s rows finished in %ss\n", pretty_sci(N), trunc(proc.time()[["elapsed"]]-init))) 187 | if (!interactive()) quit("no", status=0) 188 | -------------------------------------------------------------------------------- /_helpers/helpers.R: -------------------------------------------------------------------------------- 1 | write.log = function( 2 | timestamp=Sys.time(), # this has to be here to support timestamp provided when parsing impala or clickhouse sql logs 3 | task=NA_character_, data=NA_character_, in_rows=NA_integer_, question=NA_character_, out_rows=NA_integer_, 4 | out_cols=NA_integer_, solution=NA_character_, version=NA_character_, git=NA_character_, fun=NA_character_, 5 | run=NA_integer_, time_sec=NA_real_, mem_gb=NA_real_, cache=NA, chk=NA_character_, chk_time_sec=NA_real_, 6 | on_disk=FALSE, machine_type='' 7 | ) { 8 | stopifnot(is.character(task), is.character(data), is.character(solution), is.character(fun), is.logical(on_disk), is.character(machine_type)) 9 | log.file=Sys.getenv("CSV_TIME_FILE", "time.csv") 10 | batch=Sys.getenv("BATCH", NA) 11 | nodename=toString(Sys.info()[["nodename"]]) 12 | comment=NA_character_ # placeholder for updates to timing data 13 | time_sec=round(time_sec, 3) 14 | mem_gb=round(mem_gb, 3) 15 | chk_time_sec=round(chk_time_sec, 3) 16 | df=data.frame(nodename=nodename, batch=as.integer(batch), timestamp=as.numeric(timestamp), 17 | task=task, data=data, in_rows=trunc(in_rows), question=as.character(question), out_rows=trunc(out_rows), # trunc to support big int in double 18 | out_cols=as.integer(out_cols), solution=solution, version=as.character(version), git=as.character(git), fun=fun, 19 | run=as.integer(run), time_sec=time_sec, mem_gb=mem_gb, cache=cache, chk=chk, chk_time_sec=chk_time_sec, 20 | comment=comment, on_disk=on_disk, machine_type=machine_type) 21 | csv_verbose = Sys.getenv("CSV_VERBOSE", "false") 22 | if (as.logical(csv_verbose)) cat("# ", paste(sapply(df, format, scientific=FALSE), collapse=","), "\n", sep="") 23 | if (!file.size(log.file)) file.remove(log.file) 24 | write.table(format(df, scientific=FALSE), 25 | file=log.file, 26 | append=file.exists(log.file), 27 | col.names=!file.exists(log.file), 28 | row.names=FALSE, 29 | quote=FALSE, 30 | na="", 31 | sep=",") 32 | } 33 | 34 | # short format of 1e7, 1e8 etc. 
35 | pretty_sci = function(x) { 36 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]] 37 | if(length(tmp)==1L) { 38 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L) 39 | } else if(length(tmp)==2L){ 40 | paste0(tmp[1L], as.character(as.integer(tmp[2L]))) 41 | } 42 | } 43 | 44 | # makes scalar string to store in "chk" field, check sum of arbitrary number of measures 45 | make_chk = function(values){ 46 | x = sapply(values, function(x) paste(format(x, scientific=FALSE), collapse="_")) 47 | gsub(",", "_", paste(x, collapse=";"), fixed=TRUE) 48 | } 49 | 50 | # bash 'ps -o rss' 51 | memory_usage = function() { 52 | return(NA_real_) # disabled because during #110 system() kills the scripts 53 | cmd = paste("ps -o rss", Sys.getpid(), "| tail -1") 54 | ans = tryCatch(system(cmd, intern=TRUE, ignore.stderr=TRUE), error=function(e) NA_character_) 55 | as.numeric(ans) / (1024^2) # GB units 56 | } 57 | 58 | # join task RHS tables for LHS data name 59 | join_to_tbls = function(data_name) { 60 | x_n = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L]) 61 | y_n = setNames(c(x_n/1e6, x_n/1e3, x_n), c("small","medium","big")) 62 | sapply(sapply(y_n, pretty_sci), gsub, pattern="NA", x=data_name) 63 | } 64 | -------------------------------------------------------------------------------- /_helpers/helpers.jl: -------------------------------------------------------------------------------- 1 | using Printf; # sprintf macro to print in non-scientific format 2 | using Pkg; 3 | 4 | # from https://github.com/JuliaLang/Pkg.jl/issues/793 5 | function getpkgmeta(name::AbstractString) 6 | fname = joinpath(dirname(Base.active_project()), "Manifest.toml") 7 | Pkg.TOML.parse(read(fname, String))["deps"][name][1] 8 | end; 9 | 10 | function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type) 11 | file=try 12 | ENV["CSV_TIME_FILE"] 13 | catch 14 | "time.csv" 15 | end; 16 | if (occursin("/", file) && SubString(file, 1, 1)!="/") # otherwise we assume full path 17 | file="$(pwd())/$file"; 18 | end; 19 | batch=try 20 | ENV["BATCH"] 21 | catch 22 | "" 23 | end; 24 | if (isfile(file) && filesize(file)==0) 25 | rm(file) 26 | end; 27 | nodename=gethostname() 28 | comment="" # placeholder for updates to timing data 29 | time_sec=round(time_sec, digits=3) 30 | mem_gb=round(mem_gb, digits=3) 31 | chk_time_sec=round(chk_time_sec, digits=3) 32 | timestamp=@sprintf("%0.6f", time()) 33 | csv_verbose = false # hardcoded for now, TODO ENV["CSV_VERBOSE"] and print 34 | log = DataFrame(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type) 35 | CSV.write(file, log, append=isfile(file), header=!isfile(file)) 36 | end; 37 | 38 | function make_chk(x) 39 | n = length(x) 40 | res = "" 41 | for i = 1:n 42 | res = string(res, i==1 ? 
"" : ";", @sprintf("%0.3f", x[i])) 43 | end 44 | res 45 | end; 46 | 47 | function memory_usage() 48 | pid = getpid() 49 | s = read(pipeline(`ps -o rss $pid`,`tail -1`), String) 50 | parse(Float64, replace(s, "\n" => "")) / (1024^2) 51 | end; 52 | 53 | function join_to_tbls(data_name) 54 | x_n = Int(parse(Float64, split(data_name, "_")[2])) 55 | y_n = [x_n/1e6, x_n/1e3, x_n] 56 | y_n = [replace(@sprintf("%.0e", y_n[1]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[2]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[3]), r"[+]0?"=>"")] 57 | [replace(data_name, "NA" => y_n[1]), replace(data_name, "NA" => y_n[2]), replace(data_name, "NA" => y_n[3])] 58 | end; 59 | -------------------------------------------------------------------------------- /_helpers/helpers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import csv 3 | import math 4 | import psutil 5 | import os 6 | import platform 7 | 8 | def write_log(task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type): 9 | batch = os.getenv('BATCH', "") 10 | timestamp = time.time() 11 | csv_file = os.getenv('CSV_TIME_FILE', "time.csv") 12 | nodename = platform.node() 13 | comment = "" # placeholder for updates to timing data 14 | time_sec = round(time_sec, 3) 15 | chk_time_sec = round(chk_time_sec, 3) 16 | mem_gb = round(mem_gb, 3) 17 | if math.isnan(time_sec): 18 | time_sec = "" 19 | if math.isnan(mem_gb): 20 | mem_gb = "" 21 | log_row = [nodename, batch, timestamp, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, comment, on_disk, machine_type] 22 | log_header = ["nodename","batch","timestamp","task","data","in_rows","question","out_rows","out_cols","solution","version","git","fun","run","time_sec","mem_gb","cache","chk","chk_time_sec","comment","on_disk", "machine_type"] 23 | if os.path.isfile(csv_file) and not(os.path.getsize(csv_file)): 24 | os.remove(csv_file) 25 | append = os.path.isfile(csv_file) 26 | csv_verbose = os.getenv('CSV_VERBOSE', "false") 27 | if csv_verbose.lower()=="true": 28 | print('# ' + ','.join(str(x) for x in log_row)) 29 | if append: 30 | with open(csv_file, 'a') as f: 31 | w = csv.writer(f, lineterminator='\n') 32 | w.writerow(log_row) 33 | else: 34 | with open(csv_file, 'w+') as f: 35 | w = csv.writer(f, lineterminator='\n') 36 | w.writerow(log_header) 37 | w.writerow(log_row) 38 | return True 39 | 40 | def str_round(x): 41 | if type(x).__name__ in ["float","float64"]: 42 | x = round(x,3) 43 | return str(x) 44 | 45 | flatten = lambda l: [item for sublist in l for item in sublist] 46 | 47 | def make_chk(values): 48 | s = ';'.join(str_round(x) for x in values) 49 | return s.replace(",","_") # comma is reserved for csv separator 50 | 51 | def memory_usage(): 52 | process = psutil.Process(os.getpid()) 53 | return process.memory_info().rss/(1024**3) # GB units 54 | 55 | def join_to_tbls(data_name): 56 | x_n = int(float(data_name.split("_")[1])) 57 | y_n = ["{:.0e}".format(x_n/1e6), "{:.0e}".format(x_n/1e3), "{:.0e}".format(x_n)] 58 | y_n = [y_n[0].replace('+0', ''), y_n[1].replace('+0', ''), y_n[2].replace('+0', '')] 59 | return [data_name.replace('NA', y_n[0]), data_name.replace('NA', y_n[1]), data_name.replace('NA', y_n[2])] 60 | -------------------------------------------------------------------------------- /_helpers/helpers.sh: 
-------------------------------------------------------------------------------- 1 | # join task RHS tables for LHS data name 2 | join_to_tbls() { 3 | data_name=$1 4 | x_n="$(echo $data_name | cut -d '_' -f 2)" 5 | x_n_lhs="$(echo $x_n | cut -d 'e' -f 1)" 6 | if [ "$x_n_lhs" -ne 1 ]; then 7 | echo "data_name $data_name must have '1' base in exponential notation for number of rows" >&2 && exit 1 8 | fi 9 | x_n_rhs="$(echo $x_n | cut -d "e" -f 2)" 10 | if [ "$x_n_rhs" -lt 6 ]; then 11 | echo "data_name $data_name must have exponent greater or equal to '6' in exponential notation for number of rows" >&2 && exit 1 12 | fi 13 | echo ${data_name/NA/"$x_n_lhs"e"$(($x_n_rhs-6))"} ${data_name/NA/"$x_n_lhs"e"$(($x_n_rhs-3))"} ${data_name/NA/"$x_n_lhs"e"$x_n_rhs"} 14 | } 15 | -------------------------------------------------------------------------------- /_helpers/helpersds.jl: -------------------------------------------------------------------------------- 1 | using Printf; # sprintf macro to print in non-scientific format 2 | using Pkg; 3 | 4 | # from https://github.com/JuliaLang/Pkg.jl/issues/793 5 | function getpkgmeta(name::AbstractString) 6 | fname = joinpath(dirname(Base.active_project()), "Manifest.toml") 7 | Pkg.TOML.parse(read(fname, String))["deps"][name][1] 8 | end; 9 | 10 | function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type) 11 | file=try 12 | ENV["CSV_TIME_FILE"] 13 | catch 14 | "time.csv" 15 | end; 16 | if (occursin("/", file) && SubString(file, 1, 1)!="/") # otherwise we assume full path 17 | file="$(pwd())/$file"; 18 | end; 19 | batch=try 20 | ENV["BATCH"] 21 | catch 22 | "" 23 | end; 24 | if (isfile(file) && filesize(file)==0) 25 | rm(file) 26 | end; 27 | nodename=gethostname() 28 | comment="" # placeholder for updates to timing data 29 | time_sec=round(time_sec, digits=3) 30 | mem_gb=round(mem_gb, digits=3) 31 | chk_time_sec=round(chk_time_sec, digits=3) 32 | timestamp=@sprintf("%0.6f", time()) 33 | csv_verbose = false 34 | log = Dataset(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type) 35 | filewriter(file, log, append=isfile(file), header=!isfile(file)) 36 | end; 37 | 38 | function make_chk(x) 39 | n = length(x) 40 | res = "" 41 | for i = 1:n 42 | res = string(res, i==1 ?
"" : ";", @sprintf("%0.3f", x[i])) 43 | end 44 | res 45 | end; 46 | 47 | function memory_usage() 48 | pid = getpid() 49 | s = read(pipeline(`ps -o rss $pid`,`tail -1`), String) 50 | parse(Float64, replace(s, "\n" => "")) / (1024^2) 51 | end; 52 | 53 | function join_to_tbls(data_name) 54 | x_n = Int(parse(Float64, split(data_name, "_")[2])) 55 | y_n = [x_n/1e6, x_n/1e3, x_n] 56 | y_n = [replace(@sprintf("%.0e", y_n[1]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[2]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[3]), r"[+]0?"=>"")] 57 | [replace(data_name, "NA" => y_n[1]), replace(data_name, "NA" => y_n[2]), replace(data_name, "NA" => y_n[3])] 58 | end; 59 | -------------------------------------------------------------------------------- /_launcher/launch.R: -------------------------------------------------------------------------------- 1 | library("data.table") 2 | if (!packageVersion("data.table") >= "1.13.0") 3 | stop("db-benchmark launcher script depends on recent data.table features, install at least 1.13.0.") 4 | source("./_launcher/launcher.R") 5 | 6 | .nodename = Sys.info()[["nodename"]] 7 | mockup = as.logical(Sys.getenv("MOCKUP", "false")) 8 | 9 | run_tasks = getenv("RUN_TASKS") # run_tasks = c("groupby","join") 10 | if (!length(run_tasks)) { 11 | cat("No benchmark tasks to run\n") 12 | q("no") 13 | } 14 | run_solutions = getenv("RUN_SOLUTIONS") # run_solutions = c("data.table","dplyr","pydatatable","spark","pandas") 15 | if (!length(run_solutions)) { 16 | cat("No benchmark solutions to run\n") 17 | q("no") 18 | } 19 | 20 | data = fread("./_control/data.csv", logical01=TRUE, colClasses=c("character","character","character","character","character","character","logical")) 21 | if (anyDuplicated(data[["data"]])) 22 | stop("_control/data.csv contains duplicated data cases") 23 | data[active==TRUE, # filter on active datasets 24 | ][run_tasks, on="task", nomatch=NA # filter for env var RUN_TASKS 25 | ][, c("active") := NULL # remove unused 26 | ][] -> data 27 | if (any(is.na(data$data))) stop("missing entries in ./_control/data.csv for some tasks") 28 | 29 | timeout = fread("./_control/timeout.csv", colClasses=c("character","character","numeric")) 30 | timeout[run_tasks, on="task", nomatch=NA # # filter for env var RUN_TASKS 31 | ] -> timeout 32 | if (any(is.na(timeout$minutes))) stop("missing entries in ./_control/timeout.csv for some tasks") 33 | 34 | solution = fread("./_control/solutions.csv") 35 | solution[run_solutions, on="solution", nomatch=NA # filter for env var RUN_SOLUTIONS 36 | ] -> solution 37 | if (any(is.na(solution$task))) stop("missing entries in ./_control/solutions.csv for some solutions") 38 | 39 | # what to run, log machine name, lookup timeout 40 | dt = solution[data, on="task", allow.cartesian=TRUE, nomatch=NULL] 41 | dt[, "nodename" := .nodename] 42 | dt[, "in_rows" := sapply(strsplit(data, split="_", fixed=TRUE), `[[`, 2L)] 43 | stopifnot(dt$in_rows == dt$nrow) 44 | dt[timeout, "timeout_s" := i.minutes*60, on=c("task","in_rows")] 45 | if (any(is.na(dt$timeout_s))) stop("missing entries in ./_control/timeout.csv for some tasks, detected after joining to solutions and data to run") 46 | 47 | # detect if script has been already run before for currently installed version/revision 48 | lookup_run_batch(dt) 49 | 50 | machine_type = getenv("MACHINE_TYPE") 51 | dt[,machine_type := machine_type] 52 | 53 | skipped_benchmarks = fread("./_control/skipped_benchmarks.csv", logical01=TRUE, colClasses=c("character","character","character","character")) 54 | print("skipping 
benchmarks defined in _control/skipped_benchmarks.csv") 55 | print(skipped_benchmarks) 56 | 57 | dt = dt[!skipped_benchmarks, on = c("solution", "task", "data", "machine_type")] 58 | 59 | # print list of solutions that are going to be run in this batch so we know upfront which will be skipped 60 | cat("Benchmark solutions to run: ", dt[is.na(run_batch), paste(unique(solution),collapse=", ")], "\n", sep="") 61 | 62 | is.stop() 63 | is.pause() 64 | is.stop() 65 | 66 | # launch script, if not mockup, if not already run, unless forcerun 67 | dt 68 | launch(dt, mockup=mockup) 69 | 70 | # terminates 71 | q("no") 72 | -------------------------------------------------------------------------------- /_launcher/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # dirs for datasets and output of benchmark 5 | mkdir -p data 6 | mkdir -p out 7 | 8 | sudo apt-get update 9 | 10 | # install R 11 | sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" 12 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 13 | sudo apt-get update -qq 14 | sudo apt-get install -y r-base-dev 15 | sudo apt-get install python3-dev virtualenv 16 | 17 | sudo chmod a+w /usr/local/lib/R/site-library 18 | 19 | # configure R 20 | echo 'LC_ALL=C' >> ~/.Renviron 21 | mkdir -p ~/.R 22 | echo 'CFLAGS=-O3 -mtune=native' > ~/.R/Makevars 23 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 24 | 25 | # packages used in launcher and report 26 | Rscript -e 'install.packages(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"))' 27 | Rscript -e 'sapply(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"), requireNamespace)' 28 | 29 | # install duckdb for unpacking data 30 | curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip 31 | sudo unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin 32 | 33 | 34 | # install aws client to download benchmark data 35 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 36 | unzip awscliv2.zip 37 | sudo ./aws/install 38 | 39 | # after each restart of server 40 | source clickhouse/ch.sh && ch_stop 41 | sudo service docker stop 42 | sudo swapoff -a 43 | 44 | # stop and disable 45 | sudo systemctl disable docker 46 | sudo systemctl stop docker 47 | sudo systemctl disable clickhouse-server 48 | sudo systemctl stop clickhouse-server 49 | -------------------------------------------------------------------------------- /_report/blah.R: -------------------------------------------------------------------------------- 1 | 2 | source("./_report/report.R", chdir=TRUE) 3 | source("./_helpers/helpers.R", chdir=TRUE) 4 | source("./_benchplot/benchplot.R", chdir=TRUE) 5 | source("./_benchplot/benchplot-dict.R", chdir=TRUE) 6 | ld = time_logs() 7 | lld = ld[script_recent==TRUE] 8 | # lld_nodename = as.character(unique(lld$nodename)) 9 | lld_nodename = "c6id.metal" 10 | if (length(lld_nodename)>1L) 11 | stop(sprintf("There are multiple different 'nodename' to be presented on single report '%s'", report_name)) 12 | lld_unfinished = lld[is.na(script_time_sec)] 13 | if (nrow(lld_unfinished)) { 14 | warning(sprintf("Missing solution finish timestamp in logs.csv for '%s' (still running or launcher script killed): %s", paste(unique(lld_unfinished$task), collapse=","), 
paste(unique(lld_unfinished$solution), collapse=", "))) 15 | } 16 | 17 | dt_groupby = lld[task=="groupby"][substr(data,1,2)=="G1"] 18 | dt_join = lld[task=="join"] 19 | 20 | 21 | loop_benchplot = function(dt_task, report_name, syntax.dict, exceptions, solution.dict, question.txt.fun = NULL, title.txt.fun = NULL, data_namev, q_groupv, cutoff=NULL, pending=NULL) { 22 | for (data_name in data_namev) { 23 | for (q_group in q_groupv) { 24 | message(sprintf("benchplot %s %s %s", report_name, data_name, q_group)) 25 | message(sprintf("machine type = %s", m_type)) 26 | y = dt_task[data==data_name & question_group==q_group & machine_type==m_type][,machine_type := NULL] 27 | benchplot( 28 | y, 29 | filename = file.path("public", report_name, sprintf("%s_%s_%s.png", data_name, q_group, m_type)), 30 | solution.dict = solution.dict, 31 | syntax.dict = syntax.dict, 32 | exceptions = exceptions, 33 | question.txt.fun = question.txt.fun, 34 | title.txt.fun = title.txt.fun, 35 | cutoff = cutoff, 36 | pending = pending, 37 | url.footer = "https://duckdblabs.github.io/db-benchmark", 38 | interactive = FALSE 39 | ) 40 | } 41 | } 42 | } 43 | link = function(data_name, q_group, report_name) { 44 | fnam = sprintf("%s_%s.png", data_name, q_group) 45 | paste(sprintf("[%s](%s)", q_group, file.path(report_name, fnam)), collapse=", ") 46 | } 47 | hours_took = function(lld) { 48 | lld_script_time = lld[, .(n_script_time_sec=uniqueN(script_time_sec), script_time_sec=unique(script_time_sec)), .(solution, task, data)] 49 | if (nrow(lld_script_time[n_script_time_sec>1L])) 50 | stop("There are multiple different 'script_time_sec' for single solution+task+data on report 'index'") 51 | lld_script_time[, round(sum(script_time_sec, na.rm=TRUE)/60/60, 1)] 52 | } 53 | 54 | data_name = get_data_levels()[["groupby"]] 55 | loop_benchplot(dt_groupby, report_name="groupby", syntax.dict=groupby.syntax.dict, exceptions=groupby.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv=c("basic","advanced"), title.txt.fun = header_title_fun, question.txt.fun = groupby_q_title_fun, cutoff = "spark", pending = "Modin", machine_types) -------------------------------------------------------------------------------- /_report/ga.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /_report/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit -o nounset 3 | 4 | publishGhPages(){ 5 | rm -rf db-benchmark.gh-pages 6 | mkdir -p db-benchmark.gh-pages 7 | cd db-benchmark.gh-pages 8 | 9 | ## Set up Repo parameters 10 | git init > /dev/null 11 | git config user.name "Tmonster" 12 | git config user.email "tom@ebergen.com" 13 | 14 | ## Set gh token from local file 15 | 16 | ## Reset gh-pages branch 17 | git remote add upstream "git@github.com:duckdblabs/db-benchmark.git" 18 | git fetch -q upstream gh-pages 19 | rm -f err.txt 20 | git checkout -q gh-pages 21 | git reset -q --hard "645f86716bfb3b44c53eacf1f2bf234e75ea41ec" 22 | 23 | rm -f err.txt 24 | cp -r ../public/* ./ 25 | git add -A 26 | git commit -q -m 'publish benchmark report' 27 | cp ../time.csv . 28 | cp ../logs.csv . 
29 | git add time.csv logs.csv 30 | md5sum time.csv > time.csv.md5 31 | md5sum logs.csv > logs.csv.md5 32 | git add time.csv.md5 logs.csv.md5 33 | gzip --keep time.csv 34 | gzip --keep logs.csv 35 | git add time.csv.gz logs.csv.gz 36 | git commit -q -m 'publish benchmark timings and logs' 37 | git push --force upstream gh-pages 38 | 39 | cd .. 40 | 41 | } 42 | 43 | publishGhPages 44 | -------------------------------------------------------------------------------- /_report/tech.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Technical measures of db-benchmark" 3 | output: 4 | html_document: 5 | self_contained: no 6 | toc: true 7 | includes: 8 | in_header: ga.html 9 | --- 10 | ```{r render, include=FALSE} 11 | # Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' && xdg-open public/tech.html 12 | ``` 13 | 14 | ```{r opts, echo=FALSE} 15 | knitr::opts_knit$set(root.dir="..") 16 | knitr::opts_chunk$set(echo=FALSE, cache=FALSE) 17 | ``` 18 | 19 | ```{r init} 20 | library(lattice) 21 | source("./_report/report.R") 22 | ld = time_logs() 23 | recent_nodename = ld[script_recent==TRUE, unique(nodename)] 24 | stopifnot(length(recent_nodename)==1L) 25 | ld = ld[nodename==recent_nodename] 26 | ``` 27 | 28 | ## Incomplete timings of last run 29 | 30 | ```{r completed} 31 | ll = ld[script_recent==TRUE, { 32 | n_na = is.na(c(time_sec_1, time_sec_2)) 33 | n_completed=sum(!n_na) 34 | n_failed=sum(n_na) 35 | .(n_completed=n_completed, n_failed=n_failed, q_failed=if(n_failed==0L) NA_character_ else paste(paste0("q", iquestion[is.na(time_sec_1) | is.na(time_sec_2)]), collapse=",")) 36 | }, 37 | c("nodename","batch","solution","task","data","in_rows","k","nasorted")] 38 | stopifnot(length(unique(ll$nodename))==1L) 39 | ``` 40 | 41 | ### groupby 42 | 43 | ```{r completed_groupby} 44 | kk(ll[task=="groupby" 45 | ][n_failed>0L, .(solution, data, in_rows, k, `NA, sorted`=nasorted, n_completed, n_failed, q_failed)]) 46 | ``` 47 | 48 | ## Full script executions 49 | 50 | Things to consider when looking at the plots below. 51 | 52 | - The red dotted line refers to the script timeout, which initially was not set up. Later it was set to 60 minutes; more recently, after a new set of questions was added, it was increased to 120 minutes. The up-to-date timeout value can be looked up in the `_control/timeout.csv` file. 53 | - It might happen that a script was terminated by the _out of memory killer_, an OS feature. In that case the reported script timing will be smaller than it should be in reality. 54 | 55 | Refer to the table above to see which scripts have fully completed. 56 | 57 | ### groupby 58 | 59 | ```{r logs_plot, fig.width=8, fig.height=48} 60 | #timeout = fread("./_control/timeout.csv", colClasses=c("character","character","numeric")) 61 | #timeout = timeout["groupby", on="task", nomatch=NULL] # filter for env var RUN_TASKS 62 | #stopifnot(nrow(timeout)==1L) 63 | #timeout_m = timeout[["minutes"]] 64 | p = sapply(setNames(nm=as.character(unique(ld$solution))), simplify = FALSE, function(s) 65 | lattice::xyplot(script_time_sec/60 ~ ibatch | k+in_rows, ld[task=="groupby" & substr(data,1,2)=="G1"], 66 | type="l", grid=TRUE, groups=nasorted, 67 | subset=solution==s, main=s, 68 | panel=panel.superpose, 69 | panel.groups=function(x, y, col, col.symbol, ...) { 70 | panel.lines(x, y, col=col.symbol, ...)
71 | #panel.abline(h=timeout_m, col="red", lty=3) 72 | }, 73 | xlab = "benchmark run", 74 | ylab = "minutes", 75 | scales=list(y=list( 76 | relation="free", 77 | limits=rep(ld[solution==s, list(list(c(0, max(script_time_sec)/60))), in_rows]$V1, each=3) 78 | )), 79 | auto.key=list(points=FALSE, lines=TRUE)) 80 | ) 81 | sapply(seq_along(p), function(i) print(p[[i]], split=c(1, i, 1, length(p)), more=i!=length(p))) -> nul 82 | ``` 83 | 84 | ------ 85 | 86 | Report was generated on: `r format(Sys.time(), usetz=TRUE)`. 87 | 88 | ```{r status_set_success} 89 | cat("tech\n", file=get_report_status_file(), append=TRUE) 90 | ``` 91 | -------------------------------------------------------------------------------- /_run/download_small_medium.sh: -------------------------------------------------------------------------------- 1 | # first download and expand small data 2 | 3 | # get groupby small (0.5GB and 5GB datasets) 4 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request --quiet 5 | # get join small (0.5GB and 5GB datasets) 6 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request --quiet 7 | 8 | 9 | # expand groupby-small datasets to csv 10 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'data/G1_1e7_1e2_0_0.csv' (FORMAT CSV)" 11 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'data/G1_1e7_1e1_0_0.csv' (FORMAT CSV)" 12 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'data/G1_1e7_2e0_0_0.csv' (FORMAT CSV)" 13 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'data/G1_1e7_1e2_0_1.csv' (FORMAT CSV)" 14 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'data/G1_1e7_1e2_5_0.csv' (FORMAT CSV)" 15 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'data/G1_1e8_1e2_0_0.csv' (FORMAT CSV)" 16 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'data/G1_1e8_1e1_0_0.csv' (FORMAT CSV)" 17 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'data/G1_1e8_2e0_0_0.csv' (FORMAT CSV)" 18 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'data/G1_1e8_1e2_0_1.csv' (FORMAT CSV)" 19 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'data/G1_1e8_1e2_5_0.csv' (FORMAT CSV)" 20 | 21 | # expand join-small datasets to csv 22 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_0 to 'data/J1_1e7_1e1_0_0.csv' (FORMAT CSV)" 23 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_5_0 to 'data/J1_1e7_1e4_5_0.csv' (FORMAT CSV)" 24 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_1 to 'data/J1_1e7_NA_0_1.csv' (FORMAT CSV)" 25 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_0 to 'data/J1_1e8_1e5_0_0.csv' (FORMAT CSV)" 26 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_5_0 to 'data/J1_1e8_1e8_5_0.csv' (FORMAT CSV)" 27 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_1 to 'data/J1_1e7_1e1_0_1.csv' (FORMAT CSV)" 28 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_0 to 'data/J1_1e7_1e7_0_0.csv' (FORMAT CSV)" 29 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_5_0 to 'data/J1_1e7_NA_5_0.csv' (FORMAT CSV)" 30 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_1 to 'data/J1_1e8_1e5_0_1.csv' (FORMAT CSV)" 31 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_0 to 'data/J1_1e8_NA_0_0.csv' (FORMAT CSV)" 32 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_5_0 to 'data/J1_1e7_1e1_5_0.csv' (FORMAT CSV)" 33 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_1 to 
'data/J1_1e7_1e7_0_1.csv' (FORMAT CSV)" 34 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_0 to 'data/J1_1e8_1e2_0_0.csv' (FORMAT CSV)" 35 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_5_0 to 'data/J1_1e8_1e5_5_0.csv' (FORMAT CSV)" 36 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_1 to 'data/J1_1e8_NA_0_1.csv' (FORMAT CSV)" 37 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_0 to 'data/J1_1e7_1e4_0_0.csv' (FORMAT CSV)" 38 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_5_0 to 'data/J1_1e7_1e7_5_0.csv' (FORMAT CSV)" 39 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_1 to 'data/J1_1e8_1e2_0_1.csv' (FORMAT CSV)" 40 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_0 to 'data/J1_1e8_1e8_0_0.csv' (FORMAT CSV)" 41 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_5_0 to 'data/J1_1e8_NA_5_0.csv' (FORMAT CSV)" 42 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_1 to 'data/J1_1e7_1e4_0_1.csv' (FORMAT CSV)" 43 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_0 to 'data/J1_1e7_NA_0_0.csv' (FORMAT CSV)" 44 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_5_0 to 'data/J1_1e8_1e2_5_0.csv' (FORMAT CSV)" 45 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_1 to 'data/J1_1e8_1e8_0_1.csv' (FORMAT CSV)" 46 | 47 | -------------------------------------------------------------------------------- /_run/partitioned_run.sh: -------------------------------------------------------------------------------- 1 | # set machine type 2 | ./_run/run_small_medium.sh 3 | 4 | ./_run/run_large.sh 5 | -------------------------------------------------------------------------------- /_run/run_large.sh: -------------------------------------------------------------------------------- 1 | # download and expand large data 2 | 3 | # get groupby large (50GB datasets) 4 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request --quiet 5 | # get join small (50GB datasets) 6 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request --quiet 7 | 8 | 9 | # expand groupby-large datasets to csv 10 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)" 11 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)" 12 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)" 13 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)" 14 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)" 15 | 16 | 17 | # expand join-large datasets to csv 18 | duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)" 19 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)" 20 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)" 21 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)" 22 | 23 | 24 | cp _control/data_large.csv _control/data.csv 25 | 26 | echo "Running all solutions on large (50GB) datasets" 27 | ./run.sh 28 | 29 | 30 | ### 31 | echo "done..." 
32 | echo "removing data files" 33 | rm data/*.csv 34 | rm data/*.duckdb 35 | -------------------------------------------------------------------------------- /_run/run_medium.sh: -------------------------------------------------------------------------------- 1 | ./_run/download_small_medium.sh 2 | 3 | cp _control/data_medium.csv _control/data.csv 4 | 5 | 6 | echo "Running all solutions on medium (5GB) datasets" 7 | ./run.sh 8 | 9 | 10 | ### 11 | echo "done..." 12 | echo "removing small data files" 13 | rm data/*.csv 14 | rm data/*.duckdb 15 | 16 | -------------------------------------------------------------------------------- /_run/run_small.sh: -------------------------------------------------------------------------------- 1 | ./_run/download_small_medium.sh 2 | 3 | cp _control/data_small.csv _control/data.csv 4 | 5 | 6 | echo "Running all solutions on small (0.5GB) datasets" 7 | ./run.sh 8 | 9 | 10 | ### 11 | echo "done..." 12 | echo "removing small data files" 13 | rm data/*.csv 14 | rm data/*.duckdb 15 | 16 | -------------------------------------------------------------------------------- /_setup_utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duckdblabs/db-benchmark/47879c51efba789ddbf973423f2c77bfa411143c/_setup_utils/.DS_Store -------------------------------------------------------------------------------- /_setup_utils/install_all_solutions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | import subprocess 5 | 6 | SOLUTIONS_FILENAME = "_control/solutions.csv" 7 | 8 | 9 | INCLUDE = set() 10 | 11 | def install_solution(solution_name): 12 | min_setup_file_name = f"./{solution_name}/min-setup-{solution_name}.sh" 13 | setup_file_name = f"./{solution_name}/setup-{solution_name}.sh" 14 | upgrade_file_name = f"./{solution_name}/upg-{solution_name}.sh" 15 | get_version_filename = f"./{solution_name}/ver-{solution_name}.sh" 16 | print(f"Installing {solution_name}") 17 | do_install = False 18 | try: 19 | result = subprocess.call([get_version_filename], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) 20 | if result != 0: 21 | do_install = True 22 | except Exception as e: 23 | do_install = True 24 | 25 | if do_install: 26 | if os.path.exists(min_setup_file_name): 27 | subprocess.call([min_setup_file_name]) 28 | elif os.path.exists(setup_file_name): 29 | subprocess.call([setup_file_name]) 30 | else: 31 | # print(f"no script for {setup_file_name} or {min_setup_file_name}") 32 | raise Exception(f"No script to install {solution_name}") 33 | else: 34 | subprocess.call([upgrade_file_name]) 35 | 36 | # based on the name of the solution, run the {{solution}}/min-setup-{{solution}}.sh file. 37 | # if there is no min-setup-{{solution}}.sh, then run setup-{{solution}}.sh. 38 | # if error, exit with an error 39 | # else don't 40 | def include_all_solutions(): 41 | global INCLUDE 42 | with open(SOLUTIONS_FILENAME, newline="") as solutions_file: 43 | solutions = csv.DictReader(solutions_file, delimiter=',') 44 | for row in solutions: 45 | if row['solution'] == "data.table": 46 | INCLUDE.add("datatable") 47 | else: 48 | INCLUDE.add(row['solution']) 49 | 50 | if len(sys.argv) == 0: 51 | print(""" 52 | Usage: python3 install_all_solutions.py solution_name solution_name ... 
53 | python3 install_all_solutions.py all --exclude clickhouse polars 54 | """) 55 | exit(1) 56 | 57 | # first argument is file name 58 | 59 | def main(): 60 | global INCLUDE 61 | including = True 62 | for solution in sys.argv[1:]: 63 | if solution.strip() == "all": 64 | include_all_solutions() 65 | elif solution.strip() == "--exclude": 66 | including = False 67 | continue 68 | else: 69 | if including: 70 | if solution == "data.table": 71 | INCLUDE.add("datatable") 72 | elif solution == "clickhouse": 73 | INCLUDE.add("clickhouse") 74 | INCLUDE.add("polars") 75 | else: 76 | INCLUDE.add(solution) 77 | else: 78 | sol = solution.strip() 79 | INCLUDE.remove(sol) 80 | 81 | for solution in INCLUDE: 82 | install_solution(solution) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | 88 | -------------------------------------------------------------------------------- /_setup_utils/mount.sh: -------------------------------------------------------------------------------- 1 | # script to format mount and copy data. 2 | 3 | # remove a leftover instance mount 4 | rm -rf /var/lib/mount/db-benchmark-metal 5 | 6 | # format the mount 7 | 8 | source path.env 9 | 10 | mount_name=$(sudo lsblk | awk ' 11 | NR > 1 && $1 ~ /^nvme/ && $7 == "" { 12 | # Convert SIZE column to bytes for comparison 13 | size = $4; 14 | unit = substr(size, length(size)); 15 | value = substr(size, 1, length(size)-1); 16 | if (unit == "G") { value *= 1024^3; } 17 | else if (unit == "T") { value *= 1024^4; } 18 | else if (unit == "M") { value *= 1024^2; } 19 | else if (unit == "K") { value *= 1024; } 20 | else { value *= 1; } 21 | 22 | # Keep track of the largest size 23 | if (value > max) { 24 | max = value; 25 | largest = $1; 26 | } 27 | } 28 | END { if (largest) print largest; else print "No match found"; } 29 | ') 30 | 31 | if [ -z "${MOUNT_POINT}" ]; then 32 | echo "Error: Environment variable MOUNT_POINT is not set. Set it by running" 33 | echo "source path.env" 34 | exit 1 35 | fi 36 | 37 | sudo mkfs -t xfs /dev/$mount_name 38 | 39 | sudo rm -rf $MOUNT_POINT 40 | sudo mkdir $MOUNT_POINT 41 | sudo mount /dev/$mount_name $MOUNT_POINT 42 | 43 | # make clone of repo on mount 44 | sudo mkdir $MOUNT_POINT/db-benchmark-metal 45 | sudo chown -R ubuntu:ubuntu $MOUNT_POINT 46 | 47 | 48 | git clone $(git remote get-url origin) $MOUNT_POINT/db-benchmark-metal 49 | cd $MOUNT_POINT/db-benchmark-metal -------------------------------------------------------------------------------- /_setup_utils/mount_and_install_solutions.sh: -------------------------------------------------------------------------------- 1 | # script to format mount and copy data. 2 | # mount the data 3 | ./_setup_utils/mount.sh 4 | 5 | # setup all the solutions on db-benchmark-metal. 6 | # creates the necessary python virtual environments and creates the r-libraries 7 | # needed 8 | cd ~/db-benchmark-metal && source path.env && python3 _setup_utils/install_all_solutions.py all 9 | 10 | 11 | 12 | # setup mount for clickhouse spill 13 | # sudo mkfs -t xfs /dev/nvme1n1 14 | # sudo mkdir /var/lib/clickhouse-nvme-mount/ 15 | # sudo mount /dev/nvme1n1 /var/lib/clickhouse-nvme-mount/ 16 | # # not sure if below is necessary. 17 | # sudo cp -a /var/lib/clickhouse/. 
/var/lib/clickhouse-nvme-mount/ 18 | # # change ownership of new mount to clickhouse 19 | # sudo chown -R clickhouse:clickhouse /var/lib/clickhouse-nvme-mount/ 20 | # sudo chown -R clickhouse:clickhouse /dev/nvme1n1 21 | 22 | # # add config so clickhouse knows to use the mount to spill data 23 | # sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml 24 | 25 | echo "------------------------------------------" 26 | echo "------------------------------------------" 27 | echo "READY TO RUN BENCHMARK. ./run.sh" 28 | -------------------------------------------------------------------------------- /_setup_utils/prep_solutions.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import csv 4 | 5 | SOLUTIONS_FILENAME = "_control/solutions.csv" 6 | RUN_CONF_FILENAME = "run.conf" 7 | 8 | SKIPPED_SOLUTIONS = [] 9 | 10 | 11 | def print_usage(): 12 | print("Usage: python3 _utils/prep_solutions.py --task=[groupby|join]") 13 | exit(1) 14 | 15 | def parse_task(): 16 | task = None 17 | for arg in sys.argv[1:]: 18 | if arg.startswith("--task="): 19 | task = arg.replace("--task=", "") 20 | if task == None or (task != "groupby" and task != "join"): 21 | print_usage() 22 | return task 23 | 24 | def parse_solution(): 25 | solution = None 26 | for arg in sys.argv[1:]: 27 | if arg.startswith("--solution="): 28 | solution = arg.replace("--solution=", "") 29 | return solution 30 | 31 | def main(): 32 | task = parse_task() 33 | solution = parse_solution() 34 | if solution == "all": 35 | solution = get_solutions(task) 36 | if solution == "clickhouse": 37 | solution = "clickhouse polars" 38 | update_run_conf_solutions(solution, task) 39 | 40 | def update_run_conf_solutions(solution_name_list, task): 41 | # change what solutions are run in run.conf 42 | os.system(f"sed 's/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"{solution_name_list}\"/g' run.conf > run_2.conf") 43 | os.system(f"sed 's/export RUN_TASKS=.*/export RUN_TASKS=\"{task}\"/g' run_2.conf > run_3.conf") 44 | os.system(f"sed 's/export DO_REPORT=.*/export DO_REPORT=false/g' run_3.conf > run.conf") 45 | os.remove('run_2.conf') 46 | os.remove('run_3.conf') 47 | 48 | def get_solutions(task): 49 | solutions_for_task = "" 50 | with open(SOLUTIONS_FILENAME, newline="") as solutions_file: 51 | solutions = csv.DictReader(solutions_file, delimiter=',') 52 | for row in solutions: 53 | if row['task'] == task and row['solution'] not in SKIPPED_SOLUTIONS: 54 | solutions_for_task += row['solution'] + " " 55 | return solutions_for_task.strip() 56 | 57 | 58 | if __name__ == "__main__": 59 | main() -------------------------------------------------------------------------------- /_setup_utils/repro.sh: -------------------------------------------------------------------------------- 1 | # full repro on Ubuntu 22.04 2 | 3 | cd ~/h2oai-db-benchmark 4 | 5 | sudo apt-get -qq update 6 | sudo apt upgrade 7 | 8 | sudo apt-get -qq install -y lsb-release software-properties-common wget curl vim htop git byobu libcurl4-openssl-dev libssl-dev 9 | sudo apt-get -qq install -y libfreetype6-dev 10 | sudo apt-get -qq install -y libfribidi-dev 11 | sudo apt-get -qq install -y libharfbuzz-dev 12 | sudo apt-get -qq install -y git 13 | sudo apt-get -qq install -y libxml2-dev 14 | sudo apt-get -qq install -y make 15 | sudo apt-get -qq install -y libfontconfig1-dev 16 | sudo apt-get -qq install -y libicu-dev pandoc zlib1g-dev libgit2-dev libcurl4-openssl-dev libssl-dev libjpeg-dev libpng-dev 
libtiff-dev 17 | # sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 18 | sudo add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" 19 | sudo apt-get -qq update 20 | sudo apt-get -qq install -y r-base-dev virtualenv 21 | 22 | cd /usr/local/lib/R 23 | sudo chmod o+w site-library 24 | 25 | cd ~ 26 | mkdir -p .R 27 | echo 'CFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 28 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 29 | 30 | Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependencies=TRUE, repos="https://cloud.r-project.org")' 31 | 32 | 33 | # install dplyr 34 | Rscript -e 'devtools::install_github(c("tidyverse/readr","tidyverse/dplyr"))' 35 | 36 | # install data.table 37 | Rscript -e 'install.packages("data.table", repos="https://rdatatable.gitlab.io/data.table/")' 38 | 39 | -------------------------------------------------------------------------------- /_setup_utils/setup_small.sh: -------------------------------------------------------------------------------- 1 | # full repro on Ubuntu 22.04 2 | 3 | # update the key 4 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 51716619E084DAB9 5 | ## Install libraries 6 | 7 | sudo apt-get -qq update 8 | sudo apt upgrade 9 | 10 | sudo apt-get -qq install make 11 | 12 | sudo apt-get -qq install wget curl openssl build-essential 13 | sudo apt-get -qq install -y r-base-dev virtualenv 14 | sudo apt-get -qq install openjdk-17-jdk 15 | 16 | sudo apt-get install -y zlib1g-dev 17 | sudo apt-get install -y pandoc unzip 18 | 19 | # update virtualenv 20 | python3 -m pip install virtualenv 21 | 22 | # sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 23 | # sudo add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" 24 | 25 | sudo chmod o+w /usr/local/lib/R/site-library 26 | 27 | Rscript -e 'install.packages(c("data.table", "dplyr", "knitr", "bit64"), dependencies=TRUE, repos="https://cloud.r-project.org")' 28 | 29 | mkdir -p ~/.R 30 | echo 'CFLAGS=-O3 -mtune=native' >> ~/.R/Makevars 31 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars -------------------------------------------------------------------------------- /_setup_utils/sleep_and_run.sh: -------------------------------------------------------------------------------- 1 | while [ -f run.lock ] 2 | do 3 | sleep 1800 4 | done 5 | 6 | 7 | rm run.lock 8 | 9 | ./run.sh 10 | -------------------------------------------------------------------------------- /_utils/answers-validation.R: -------------------------------------------------------------------------------- 1 | source("report.R") 2 | d = time_logs() 3 | 4 | # this script is meant to detect inconsistencies within a single solution's results and between solutions' results 5 | # note that known exceptions have already been filtered out in report.R in the clean_time function 6 | 7 | check = list() 8 | 9 | # detect lack of consistency in query output within single benchmark runs within each solution separately 10 | grain = c("solution","task","data","iquestion") 11 | d[!is.na(out_rows), .(unqn_out_rows=uniqueN(out_rows), unq_out_rows=paste(unique(out_rows), collapse=",")), by=grain 12 | ][unqn_out_rows>1L 13 | ] -> check[["solution_out_rows"]] 14 | 15 | # detect lack of out_rows match in query output between solutions 16 | grain = c("task","data","iquestion","question") 17 | d[!is.na(out_rows), 
.(unqn_out_rows=uniqueN(out_rows), unq_out_rows=paste(unique(out_rows), collapse=",")), by=grain 18 | ][unqn_out_rows>1L 19 | ] -> check[["out_rows"]] 20 | # detect lack of chk match in query output between median chk from all solutions with tolerance=0.005 21 | chk_check = function(chk, tolerance=sqrt(.Machine$double.eps)) { 22 | len = unique(sapply(chk, length)) 23 | if (length(len)!=1L) stop("some solutions returns chk for less variables than others") 24 | med = sapply(seq.int(len), function(i) median(sapply(chk, `[[`, i))) 25 | eq_txt = sapply(chk, all.equal, med, tolerance=tolerance, simplify=FALSE) 26 | #if (any(!sapply(eq_txt, isTRUE))) browser() 27 | eq = sapply(eq_txt, isTRUE) 28 | ans = list() 29 | ans$n_match = sum(eq) 30 | ans$n_mismatch = sum(!eq) 31 | ans$med_chk = paste0(format(med, scientific=FALSE, trim=TRUE), collapse=";") 32 | ans$sol_mismatch = if (!ans$n_mismatch) NA_character_ else paste0(names(eq)[!eq], collapse=",") 33 | ans$sol_chk_mismatch = if (!ans$n_mismatch) NA_character_ else paste(paste0(names(eq)[!eq], ":", sapply(sapply(chk[names(eq)[!eq]], format, scientific=FALSE, trim=TRUE, simplify=FALSE), paste, collapse=";")), collapse=",") 34 | ans 35 | } 36 | (if (nrow(check[["solution_chk"]])) NULL else { # only proceed if chk was not mismatched within a solution 37 | d[!is.na(chk) & solution!="cudf", # cudf chk validation disabled due to issue described in model_time() in report.R 38 | .(unqn_chk=uniqueN(chk), chk=unique(chk)), by=c("solution", grain) 39 | ][, if (any(unqn_chk>1L)) stop("this check should not be performed, should be escaped in 'if' branch") else .SD # ensure chk is unique 40 | ][, .(chk, chk_l=sapply(strsplit(chk, ";", fixed=TRUE), as.numeric, simplify=FALSE)), by=c("solution", grain) 41 | ][, chk_check(setNames(chk_l, solution), tolerance=0.005), keyby=grain 42 | ][n_mismatch>0L] 43 | }) -> check[["chk"]] 44 | 45 | # detect solutions for which chk calculation timing was relatively big comparing to query timing 46 | grain = c("solution","task","data","iquestion","question") 47 | d[, .(time_sec_1, chk_time_sec_1, time_sec_2, chk_time_sec_2, time_to_chk_1=time_sec_1/chk_time_sec_1, time_to_chk_2=time_sec_2/chk_time_sec_2), by=grain 48 | ][!(time_to_chk_1>2.5 & time_to_chk_2>2.5) # spark chk is only 2.6+ times faster than query 49 | ] -> check[["chk_time_sec"]] 50 | 51 | # print results 52 | if (any(sapply(check, nrow))) { 53 | cat("db-benchmark answers consistency check failed, see details below\n") 54 | print(check) 55 | } else { 56 | cat("db-benchmark answers consistency check successfully passed\n") 57 | } 58 | -------------------------------------------------------------------------------- /_utils/compare-data.table.R: -------------------------------------------------------------------------------- 1 | source("_utils/time.R") 2 | if (system("tail -1 time.csv | cut -d',' -f2", intern=TRUE)!="1621364165") 3 | stop("time.csv and logs.csv should be as of 1621364165 batch run, filter out newer rows in those files") 4 | 5 | ## groupby ---- 6 | 7 | d = tail.time("data.table", "groupby", i=c(1L, 2L)) 8 | setnames(d, c("20210517_2f2f62d","20210518_2f2f62d"), c("th_40","th_20")) 9 | if (nrow(d[(is.na(th_40) & !is.na(th_20)) | (!is.na(th_40) & is.na(th_20))])) { 10 | stop("number of threads had an impact on completion of queries") 11 | } else { 12 | d = d[!is.na(th_40)] 13 | } 14 | d[, th_40_20:=th_40/th_20] 15 | 16 | ## improvement 17 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows)] 18 | # in_rows mean median 19 | #1: 1e7 1.0242721 
0.9609988 20 | #2: 1e8 0.9378870 0.9455267 21 | #3: 1e9 0.9506561 0.9569359 22 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(knasorted)] 23 | # knasorted mean median 24 | #1: 1e2 cardinality factor, 0% NAs, unsorted data 1.0393667 0.9538973 25 | #2: 1e1 cardinality factor, 0% NAs, unsorted data 0.9521915 0.9544223 26 | #3: 2e0 cardinality factor, 0% NAs, unsorted data 0.9604950 0.9569359 27 | #4: 1e2 cardinality factor, 0% NAs, pre-sorted data 0.9371154 0.9487804 28 | #5: 1e2 cardinality factor, 5% NAs, unsorted data 0.9678192 0.9598999 29 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(question_group)] 30 | # question_group mean median 31 | #1: basic 0.9548596 0.9301310 32 | #2: advanced 0.9897345 0.9806791 33 | 34 | ## worst case by data 35 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.max(mean)] 36 | # in_rows knasorted mean median 37 | #1: 1e7 1e2 cardinality factor, 0% NAs, unsorted data 1.239259 0.9620776 38 | ## best case by data 39 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.min(mean)] 40 | # in_rows knasorted mean median 41 | #1: 1e8 1e2 cardinality factor, 0% NAs, unsorted data 0.9235102 0.9200373 42 | 43 | ## worst case for single question 44 | d[which.max(th_40_20)] 45 | # in_rows knasorted question_group question th_40 th_20 th_40_20 46 | #1: 1e7 1e2 cardinality factor, 0% NAs, unsorted data basic sum v1 by id1:id2 0.413 0.118 3.5 47 | ## best case for single question 48 | d[which.min(th_40_20)] 49 | # in_rows knasorted question_group question th_40 th_20 th_40_20 50 | #1: 1e9 1e2 cardinality factor, 5% NAs, unsorted data basic sum v1 mean v3 by id3 15.22 21.104 0.7211903 51 | 52 | ## join ---- 53 | 54 | d = tail.time("data.table", "join", i=c(1L, 2L)) 55 | setnames(d, c("20210517_2f2f62d","20210518_2f2f62d"), c("th_40","th_20")) 56 | if (nrow(d[(is.na(th_40) & !is.na(th_20)) | (!is.na(th_40) & is.na(th_20))])) { 57 | stop("number of threads had an impact on completion of queries") 58 | } else { 59 | d = d[!is.na(th_40)] 60 | } 61 | d[, th_40_20:=th_40/th_20] 62 | 63 | ## improvement 64 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows)] 65 | # in_rows mean median 66 | #1: 1e7 1.0149302 1.0000000 67 | #2: 1e8 0.9143243 0.9008573 68 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(knasorted)] 69 | # knasorted mean median 70 | #1: 0% NAs, unsorted data 0.9385902 0.9144130 71 | #2: 5% NAs, unsorted data 0.9612286 0.9294773 72 | #3: 0% NAs, pre-sorted data 0.9940629 0.9705720 73 | 74 | ## worst case by data 75 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.max(mean)] 76 | # in_rows knasorted mean median 77 | #1: 1e7 0% NAs, pre-sorted data 1.055906 1.05 78 | ## best case by data 79 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.min(mean)] 80 | # in_rows knasorted mean median 81 | #1: 1e8 0% NAs, unsorted data 0.8983325 0.8773762 82 | 83 | ## worst case for single question 84 | d[which.max(th_40_20)] 85 | # in_rows knasorted question th_40 th_20 th_40_20 86 | #1: 1e7 5% NAs, unsorted data medium inner on factor 0.513 0.443 1.158014 87 | ## best case for single question 88 | d[which.min(th_40_20)] 89 | # in_rows knasorted question th_40 th_20 th_40_20 90 | #1: 1e8 0% NAs, unsorted data medium outer on int 8.143 9.558 0.8519565 91 | -------------------------------------------------------------------------------- /_utils/download_data.sh: 
-------------------------------------------------------------------------------- 1 | 2 | # get small data 3 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby_small.duckdb 4 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'G1_1e7_1e2_0_0.csv' (FORMAT CSV)" 5 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'G1_1e7_1e1_0_0.csv' (FORMAT CSV)" 6 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'G1_1e7_2e0_0_0.csv' (FORMAT CSV)" 7 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'G1_1e7_1e2_0_1.csv' (FORMAT CSV)" 8 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'G1_1e7_1e2_5_0.csv' (FORMAT CSV)" 9 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'G1_1e8_1e2_0_0.csv' (FORMAT CSV)" 10 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'G1_1e8_1e1_0_0.csv' (FORMAT CSV)" 11 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'G1_1e8_2e0_0_0.csv' (FORMAT CSV)" 12 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'G1_1e8_1e2_0_1.csv' (FORMAT CSV)" 13 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'G1_1e8_1e2_5_0.csv' (FORMAT CSV)" 14 | 15 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join_small.duckdb 16 | 17 | # get large data 18 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join_large.duckdb 19 | 20 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby_large.duckdb 21 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'G1_1e9_1e2_0_0.csv' (FORMAT CSV)" 22 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'G1_1e9_1e1_0_0.csv' (FORMAT CSV)" 23 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'G1_1e9_2e0_0_0.csv' (FORMAT CSV)" 24 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'G1_1e9_1e2_0_1.csv' (FORMAT CSV)" 25 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'G1_1e9_1e2_5_0.csv' (FORMAT CSV)" 26 | 27 | # get 500GB data 28 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join-500gb.duckdb 29 | 30 | # ??? 31 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby-500gb.duckdb -------------------------------------------------------------------------------- /_utils/generate-data-small.sh: -------------------------------------------------------------------------------- 1 | # Data generation for groupby 0.5GB 2 | 3 | mkdir -p data 4 | cd data/ 5 | Rscript ../_data/groupby-datagen.R 1e7 1e2 0 0 6 | Rscript ../_data/groupby-datagen.R 1e7 1e2 15 0 7 | Rscript ../_data/join-datagen.R 1e7 0 0 0 8 | 9 | cp G1_1e7_1e2_0_0.csv G1_1e9_1e2_0_0.csv 10 | cp J1_1e7_1e1_0_0.csv J1_1e9_1e3_0_0.csv 11 | cp J1_1e7_1e4_0_0.csv J1_1e9_1e6_0_0.csv 12 | cp J1_1e7_1e7_0_0.csv J1_1e9_1e9_0_0.csv 13 | cp J1_1e7_NA_0_0.csv J1_1e9_NA_0_0.csv 14 | 15 | cd .. 
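# note: the 1e9-named files above are plain copies of the 1e7 data, presumably so the 1e9 entries
# written into _control/data.csv below can be exercised as a quick smoke test without generating
# the real large datasets.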
16 | 17 | # don't publish, we dont even have the keys 18 | sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf 19 | 20 | # set sizes 21 | mv _control/data.csv _control/data.csv.original 22 | 23 | echo "task,data,nrow,k,na,sort,active" > _control/data.csv 24 | echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv 25 | echo "groupby,G1_1e7_1e2_15_0,1e7,1e2,15,0,1" >> _control/data.csv 26 | echo "groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1" >> _control/data.csv 27 | echo "join,J1_1e7_NA_0_0,1e7,NA,0,0,1" >> _control/data.csv 28 | echo "join,J1_1e9_NA_0_0,1e9,NA,0,0,1" >> _control/data.csv 29 | -------------------------------------------------------------------------------- /_utils/groupby_k_factor.csv: -------------------------------------------------------------------------------- 1 | K,in_rows,question,out_rows 2 | 1e2,10000000,q1,100 3 | 1e2,10000000,q2,10000 4 | 1e2,10000000,q3,100000 5 | 1e2,10000000,q4,100 6 | 1e2,10000000,q5,100000 7 | 1e2,100000000,q1,100 8 | 1e2,100000000,q2,10000 9 | 1e2,100000000,q3,1000000 10 | 1e2,100000000,q4,100 11 | 1e2,100000000,q5,1000000 12 | 1e2,1000000000,q1,100 13 | 1e2,1000000000,q2,10000 14 | 1e2,1000000000,q3,10000000 15 | 1e2,1000000000,q4,100 16 | 1e2,1000000000,q5,10000000 17 | 1e1,10000000,q1,10 18 | 1e1,10000000,q2,100 19 | 1e1,10000000,q3,999951 20 | 1e1,10000000,q4,10 21 | 1e1,10000000,q5,999969 22 | 1e1,100000000,q1,10 23 | 1e1,100000000,q2,100 24 | 1e1,100000000,q3,9999518 25 | 1e1,100000000,q4,10 26 | 1e1,100000000,q5,9999512 27 | 1e1,1000000000,q1,10 28 | 1e1,1000000000,q2,100 29 | 1e1,1000000000,q3,99995425 30 | 1e1,1000000000,q4,10 31 | 1e1,1000000000,q5,99995357 32 | 2e0,10000000,q1,2 33 | 2e0,10000000,q2,4 34 | 2e0,10000000,q3,4323484 35 | 2e0,10000000,q4,2 36 | 2e0,10000000,q5,4323579 37 | 2e0,100000000,q1,2 38 | 2e0,100000000,q2,4 39 | 2e0,100000000,q3,43231389 40 | 2e0,100000000,q4,2 41 | 2e0,100000000,q5,43232226 42 | 2e0,1000000000,q1,2 43 | 2e0,1000000000,q2,4 44 | 2e0,1000000000,q3,431884560 45 | 2e0,1000000000,q4,2 46 | 2e0,1000000000,q5,431876300 47 | -------------------------------------------------------------------------------- /_utils/maintainer.R: -------------------------------------------------------------------------------- 1 | timeleft = function() { 2 | l = data.table::fread("logs.csv") 3 | if (!nrow(l)) 4 | stop("logs.csv files is empty") 5 | this = l[.N] 6 | if (this$action=="finish") { 7 | this[, cat(sprintf("%s %s %s must have just finished\n", solution, task, data))] 8 | quit("no") 9 | } 10 | stopifnot(this$action=="start") 11 | l = l[-.N][action!="skip", data.table::dcast(.SD, solution+task+data+batch~action, value.var="timestamp")] 12 | took = l[this, on=.(solution, task, data), nomatch=NULL, finish[.N]-start[.N]] 13 | if (!length(took) || is.na(took)) { 14 | this[, cat(sprintf("%s %s %s is running for the first time so it is unknown how much it will run\n", solution, task, data))] 15 | quit("no") 16 | } 17 | stopifnot(took>0) 18 | now = trunc(as.numeric(Sys.time())) 19 | this[, cat(sprintf("%s %s %s should take around %ss more\n", solution, task, data, trunc(took-(now-timestamp))))] 20 | q("no") 21 | } 22 | -------------------------------------------------------------------------------- /_utils/maintainer.sh: -------------------------------------------------------------------------------- 1 | # returns time left by the currently run script, useful after touch pause|stop 2 | timeleft() { 3 | if [ ! 
-f ./run.lock ]; then 4 | echo "benchmark is not running now" >&2 && return 1 5 | fi 6 | Rscript -e 'source("_utils/maintainer.R"); timeleft()' 7 | } 8 | -------------------------------------------------------------------------------- /_utils/parse_time_logs.R: -------------------------------------------------------------------------------- 1 | 2 | source("./_report/report.R", chdir=TRUE) 3 | source("./_helpers/helpers.R", chdir=TRUE) 4 | source("./_benchplot/benchplot.R", chdir=TRUE) 5 | source("./_benchplot/benchplot-dict.R", chdir=TRUE) 6 | ld = time_logs() -------------------------------------------------------------------------------- /_utils/partitioned_run.sh: -------------------------------------------------------------------------------- 1 | ./_run/run_small_medium.sh 2 | ./_run/run_large.sh -------------------------------------------------------------------------------- /_utils/sql_to_check_timings/timing_checks.sql: -------------------------------------------------------------------------------- 1 | -- run this in duckdb 2 | 3 | create table timings as select * from read_csv_auto('reports/oct_25/time.csv'); 4 | 5 | 6 | -- check what solutions might have bad out rows 7 | select t1.question, t1.data, t1.out_rows, t1.solution, t2.out_rows, t2.solution from 8 | timings t1, timings t2 9 | where t1.out_rows != t2.out_rows 10 | and t1.question = t2.question 11 | and t1.solution != 'clickhouse' 12 | and t2.solution != 'clickhouse' 13 | and t1.task = t2.task 14 | -- and t1.task = 'groupby' 15 | -- and t1.solution != 'arrow' 16 | -- and t2.solution != 'arrow' 17 | and t2.solution != 'datafusion' 18 | and t1.question != 'sum v3 count by id1:id6' 19 | and t1.data != 'G1_1e8_1e2_5_0' 20 | and t1.data = t2.data ; 21 | 22 | 23 | -- Value of 'chk' varies for different runs for single solution+question 24 | create table timings as select * from read_csv('time.csv'); 25 | 26 | select t1.chk, t2.chk, t1.solution, t2.solution from 27 | timings t1, timings t2 28 | where t1.chk != t2.chk 29 | and t1.question = t2.question 30 | and t1.task = t2.task 31 | and t1.solution != 'datafusion' 32 | and t2.solution != 'datafusion' 33 | and t1.solution != 'arrow' 34 | and t2.solution != 'arrow' 35 | and t1.solution != 'R-arrow' 36 | and t2.solution != 'R-arrow' 37 | and t1.solution != 'collapse' 38 | and t1.solution = t2.solution 39 | and t1.data = t2.data group by all; 40 | 41 | 42 | select t1.question, t1.data, t1.out_rows, t2.solution, t2.out_rows from 43 | timings t1, timings t2 44 | where t1.out_rows != t2.out_rows 45 | and t1.question = t2.question 46 | and t1.solution != 'clickhouse' 47 | and t2.solution != 'clickhouse' 48 | and t1.question = 'medium outer on int' 49 | and t1.data = t2.data; -------------------------------------------------------------------------------- /_utils/time.R: -------------------------------------------------------------------------------- 1 | source("./_report/report.R") 2 | 3 | download.time = function(file=c("logs.csv","time.csv"), from="https://h2oai.github.io/db-benchmark") { 4 | stopifnot(is.character(file), is.character(from), length(file)>=1L, length(from)==1L, !is.na(file), !is.na(from)) 5 | if (all(file.exists(file))) { 6 | md5file = paste(file, "md5", sep=".") 7 | download.file(file.path(from, md5file), destfile=md5file) 8 | upstream = sapply(strsplit(sapply(setNames(md5file, file), readLines), split=" ", fixed=TRUE), `[[`, 1L) 9 | current = tools::md5sum(file) 10 | new = current[names(upstream)] != upstream 11 | file = names(new)[new] 12 | if (!length(file)) { 13 | 
cat("nothing to download, md5sum of local files match the upstream md5sum\n") 14 | return(invisible(NULL)) 15 | } 16 | } 17 | download.file(file.path(from, file), destfile=file) 18 | return(invisible(NULL)) 19 | } 20 | 21 | drop.data.table = function(x, cols) { 22 | ans = data.table:::shallow(x) 23 | un = sapply(cols, function(col) uniqueN(x[[col]])) 24 | rm = names(un)[un <= 1L] 25 | if (length(rm)) set(ans, NULL, rm, NULL) # Rdatatable/data.table#4086 26 | ans 27 | } 28 | 29 | tail.time = function(solution, task, n=2L, i=seq_len(n), drop=TRUE) { 30 | stopifnot(length(solution)==1L, length(task)==1L, length(n)==1L, n>0L, length(i)>=1L, all(i>=0L)) 31 | if (!missing(n) && !missing(i)) stop("only 'n' or 'i' argument should be used, not both") 32 | ld = time_logs() 33 | s = solution 34 | t = task 35 | ld = ld[solution==s & task==t] 36 | ub = unique(ld$batch) 37 | i = i[i <= length(ub)] # there might be only N unq batches but N+1 requested 38 | if (!length(i)) stop("there are not enough registered runs for this solution and requested recent timings") 39 | b = rev(ub)[i] 40 | ans = dcast( 41 | ld[batch%in%b], 42 | in_rows + knasorted + question_group + question ~ paste(format(as.POSIXct(as.numeric(batch), origin="1970-01-01"), "%Y%m%d"), substr(git, 1, 7), sep="_"), 43 | value.var = "time_sec_1" 44 | ) 45 | if (drop) ans = drop.data.table(ans, cols=c("in_rows","knasorted","question_group","question")) 46 | ans 47 | } 48 | 49 | compare.time = function(solutions, task, drop=TRUE) { 50 | stopifnot(length(solutions)>=1L, length(task)==1L) 51 | ld = time_logs() 52 | t = task 53 | ans = dcast( 54 | ld[script_recent==TRUE & solution%in%solutions & task==t], 55 | in_rows + knasorted + question_group + question ~ solution, 56 | value.var = "time_sec_1" 57 | ) 58 | if (drop) ans = drop.data.table(ans, cols=c("in_rows","knasorted","question_group","question")) 59 | ans 60 | } 61 | 62 | ## maintainer mode 63 | #scp -C mr-dl11:~/git/db-benchmark/logs.csv ~/git/db-benchmark/logs.csv && scp -C mr-dl11:~/git/db-benchmark/time.csv ~/git/db-benchmark/time.csv 64 | 65 | ## user mode 66 | #download.time() 67 | #tail.time("juliadf", "groupby", i=c(1L, 2L)) 68 | #tail.time("data.table", "groupby", i=c(1L, 2L)) 69 | #compare.time(c("data.table","spark","pydatatable"), "join") 70 | -------------------------------------------------------------------------------- /_utils/validate_no_errors.sh: -------------------------------------------------------------------------------- 1 | if [ $(grep -i 'error|exception' out/run_*.err | wc -l) = 0 ] 2 | then 3 | # no true errors found, print last line of each output script 4 | echo "No Errors found in run_*.err logs" 5 | else 6 | echo "The following errors have been found. Failing check" 7 | grep -i "error|exception" out/*.err 8 | exit 1 9 | fi 10 | 11 | 12 | 13 | # check report generation. 
If this fails, the logs.csv/time.csv 14 | # have errors 15 | Rscript _utils/parse_time_logs.R 2> report_check.txt 16 | # https://gist.github.com/jesugmz/3fda0fc7c1006cedfe039ff1459c3174 17 | output=$(wc -l report_check.txt | awk '{ print $1 }') 18 | if [ $output -ne 0 ] 19 | then 20 | echo "report check not empty" 21 | cat report_check.txt 22 | exit 1 23 | fi 24 | echo "time.csv and logs.csv can be parsed" 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /arrow/VERSION: -------------------------------------------------------------------------------- 1 | 13.0.0.1 2 | -------------------------------------------------------------------------------- /clickhouse/VERSION: -------------------------------------------------------------------------------- 1 | 25.1.3.23 -------------------------------------------------------------------------------- /clickhouse/ch.sh: -------------------------------------------------------------------------------- 1 | ch_installed() { 2 | dpkg-query -Wf'${db:Status-abbrev}' clickhouse-server 2>/dev/null | grep -q '^i' 3 | } 4 | 5 | ch_active() { 6 | clickhouse-client --user db_benchmark --query="SELECT 0;" > /dev/null 2>&1 7 | local ret=$?; 8 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi; 9 | } 10 | 11 | ch_wait() { 12 | for _ in $(seq 1 60); do if [[ $(wget -q 'localhost:8123' -O-) == 'Ok.' ]]; then break ; else sleep 1; fi ; done 13 | ch_active 14 | } 15 | 16 | ch_start() { 17 | echo '# ch_start: starting clickhouse-server' 18 | sudo service clickhouse-server start 19 | ch_wait 20 | } 21 | 22 | ch_stop() { 23 | echo '# ch_stop: stopping clickhouse-server' 24 | sudo service clickhouse-server stop && sleep 15 25 | } 26 | 27 | ch_query() { 28 | ENGINE=Memory 29 | if [ $COMPRESS -eq 1 ]; then 30 | ENGINE="Memory settings compress=1" 31 | fi 32 | if [ $ON_DISK -eq 1 ]; then 33 | ENGINE="MergeTree ORDER BY tuple()" 34 | fi 35 | sudo touch '/var/lib/clickhouse/flags/force_drop_table' && sudo chmod 666 '/var/lib/clickhouse/flags/force_drop_table' 36 | clickhouse-client --user db_benchmark --query "DROP TABLE IF EXISTS ans;" 37 | clickhouse-client --user db_benchmark --log_comment ${RUNNAME} --query "CREATE TABLE ans ENGINE = ${ENGINE} AS ${QUERY} SETTINGS max_insert_threads=${THREADS}, max_threads=${THREADS};" 38 | local ret=$?; 39 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi; 40 | clickhouse-client --user db_benchmark --query "SELECT * FROM ans LIMIT 3;" 41 | sudo touch '/var/lib/clickhouse/flags/force_drop_table' && sudo chmod 666 '/var/lib/clickhouse/flags/force_drop_table' 42 | clickhouse-client --user db_benchmark --query "DROP TABLE ans;" 43 | } 44 | 45 | ch_logrun() { 46 | clickhouse-client --user db_benchmark --query "SYSTEM FLUSH LOGS;" 47 | clickhouse-client --user db_benchmark --query "SELECT ${RUN} AS run, toUnixTimestamp(now()) AS timestamp, '${TASK}' AS task, '${SRC_DATANAME}' AS data_name, NULL AS in_rows, '${QUESTION}' AS question, result_rows AS out_rows, NULL AS out_cols, 'clickhouse' AS solution, version() AS version, NULL AS git, '${FUNCTION}' AS fun, query_duration_ms/1000 AS time_sec, memory_usage/1073741824 AS mem_gb, 1 AS cache, NULL AS chk, NULL AS chk_time_sec, 1 AS on_disk FROM system.query_log WHERE type='QueryFinish' AND log_comment='${RUNNAME}' ORDER BY 
query_start_time DESC LIMIT 1 FORMAT CSVWithNames;" > clickhouse/log/${RUNNAME}.csv 48 | local ret=$?; 49 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi; 50 | } 51 | 52 | ch_make_2_runs() { 53 | RUN=1 54 | RUNNAME="${TASK}_${SRC_DATANAME}_q${Q}_r${RUN}" 55 | ch_query 56 | ch_logrun 57 | 58 | if [ $COMPRESS -eq 1 ]; then 59 | # It will take some time for memory freed by Memory engine to be returned back to the system. 60 | # Without a sleep we might get a MEMORY_LIMIT exception during the second run of the query. 61 | # It is done only when $COMPRESS=1 because this variable is set to true only for the largest dataset. 62 | sleep 60 63 | fi 64 | 65 | RUN=2 66 | RUNNAME="${TASK}_${SRC_DATANAME}_q${Q}_r${RUN}" 67 | ch_query 68 | ch_logrun 69 | } 70 | -------------------------------------------------------------------------------- /clickhouse/clickhouse-misc.sh: -------------------------------------------------------------------------------- 1 | CREATE USER IF NOT EXISTS db_benchmark IDENTIFIED WITH no_password SETTINGS max_memory_usage = 28000000000 WRITABLE; 2 | GRANT select, insert, create, alter, alter user, drop on *.* to db_benchmark; 3 | 4 | ALTER USER db_benchmark SETTINGS max_memory_usage_for_user = 28000000000; 5 | 6 | 7 | CREATE TABLE G1_1e9_1e1_0_0 (id1 LowCardinality(Nullable(String)), id2 LowCardinality(Nullable(String)), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple(); 8 | 9 | INSERT INTO G1_1e9_1e1_0_0 FROM INFILE 'data/G1_1e9_1e1_0_0.csv'; 10 | 11 | # q1 12 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, sum(v1) AS v1 FROM G1_1e9_1e1_0_0 GROUP BY id1 SETTINGS max_insert_threads=32, max_threads=32; 13 | 14 | drop table if exists ans; 15 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, id2, sum(v1) AS v1 FROM G1_1e9_1e1_0_0 GROUP BY id1, id2 SETTINGS max_insert_threads=32, max_threads=32; 16 | 17 | drop table if exists ans; 18 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id3 SETTINGS max_insert_threads=16, max_threads=16; 19 | 20 | drop table if exists ans; 21 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id4, avg(v1) AS v1, avg(v2) AS v2, avg(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id4 SETTINGS max_insert_threads=32, max_threads=32; 22 | 23 | drop table if exists ans; 24 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id6 SETTINGS max_insert_threads=32, max_threads=32; 25 | 26 | drop table if exists ans; 27 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id4, id5, medianExact(v3) AS median_v3, stddevPop(v3) AS sd_v3 FROM G1_1e9_1e1_0_0 GROUP BY id4, id5 SETTINGS max_insert_threads=32, max_threads=32; 28 | 29 | drop table if exists ans; 30 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id3, max(v1) - min(v2) AS range_v1_v2 FROM G1_1e9_1e1_0_0 GROUP BY id3 SETTINGS max_insert_threads=32, max_threads=32; 31 | 32 | drop table if exists ans; 33 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM G1_1e9_1e1_0_0 WHERE v3 IS NOT NULL) AS subq GROUP BY id6 
SETTINGS max_insert_threads=32, max_threads=32; 34 | 35 | drop table if exists ans; 36 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM G1_1e9_1e1_0_0 GROUP BY id2, id4 SETTINGS max_insert_threads=32, max_threads=32; 37 | 38 | drop table if exists ans; 39 | 40 | #q10 41 | 42 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM G1_1e9_1e1_0_0 GROUP BY id1, id2, id3, id4, id5, id6 SETTINGS max_insert_threads=32, max_threads=32; -------------------------------------------------------------------------------- /clickhouse/clickhouse-mount-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | /var/lib/mount/clickhouse-nvme-mount/ 4 | 0 5 | 0 6 | 7 | -------------------------------------------------------------------------------- /clickhouse/clickhouse-parse-log.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# clickhouse-parse-log.R: starting to parse timings from clickhouse/log/.\n") 4 | 5 | source("./_helpers/helpers.R") 6 | args = commandArgs(TRUE) # args = c("groupby","G1_1e6_1e2_0_0") 7 | stopifnot(length(args)==2L) 8 | task = args[1L] 9 | data_name = args[2L] 10 | machine_type = Sys.getenv("MACHINE_TYPE") 11 | 12 | library(data.table) 13 | # sort files according to question and run 14 | sort_q_r = function(f) { 15 | tmp = strsplit(tools::file_path_sans_ext(basename(f)), "_", fixed=TRUE) 16 | if (length(len<-unique(lengths(tmp)))!=1L) 17 | stop("files names for some of logs differs in number of underscores, it should be clickhouse/log/[task]_[data_name]_q[i]_r[j].csv") 18 | stopifnot(len>1L) 19 | qr = rbindlist(lapply(lapply(tmp, `[`, c(len-1L,len)), function(x) { 20 | stopifnot(substr(x[1L], 1L, 1L)=="q", substr(x[2L], 1L, 1L)=="r") 21 | list(q=as.integer(substr(x[1L], 2L, nchar(x[1L]))), r=as.integer(substr(x[2L], 2L, nchar(x[2L])))) 22 | })) 23 | o = data.table:::forderv(qr) ## https://github.com/Rdatatable/data.table/issues/3447 24 | if (!length(o)) f else f[o] 25 | } 26 | fcsv = list.files("clickhouse/log", sprintf("^%s_%s_q.*\\.csv$", task, data_name), full.names=TRUE) 27 | if (!length(fcsv)) 28 | stop("no log files produced, did you run clickhouse sql script that will output such to clickhouse/log/[task]_[data_name]_q[i]_r[j].csv") 29 | fcsv = sort_q_r(fcsv) 30 | d = rbindlist(lapply(fcsv, fread, na.strings="\\N")) # fill=TRUE for debugging type column in some queries 31 | if (!nrow(d)) 32 | stop("timing log files empty") 33 | stopifnot(all(d$task==task), all(d$data_name==data_name)) 34 | .in_rows = strsplit(data_name, "_", fixed=TRUE)[[1L]][[2L]] ## taken from data_name because for join CH will sum in rows from both tables 35 | d[, 36 | write.log(run=as.integer(run), timestamp=as.numeric(timestamp), task=as.character(task), data=as.character(data_name), in_rows=as.numeric(.in_rows), question=as.character(question), 37 | out_rows=as.numeric(NA), out_cols=as.integer(NA), solution=as.character(solution), version=as.character(version), git=as.character(NA), fun=as.character(fun), 38 | time_sec=as.numeric(time_sec), mem_gb=as.numeric(NA), cache=as.logical(cache), chk=as.character(NA), chk_time_sec=as.numeric(NA), on_disk=as.logical(on_disk), machine_type=as.character(machine_type)), 39 | by = seq_len(nrow(d))] -> nul 40 | 41 | cat("# clickhouse-parse-log.R: parsing timings to time.csv finished\n") 42 | 43 | if (!interactive()) 
q("no") 44 | -------------------------------------------------------------------------------- /clickhouse/groupby-clickhouse.sh: -------------------------------------------------------------------------------- 1 | source ./clickhouse/ch.sh 2 | 3 | SOLUTION=clickhouse 4 | TASK=groupby 5 | 6 | # /* q1: question='sum v1 by id1' */ 7 | 8 | Q=1 9 | QUESTION="sum v1 by id1" 10 | QUERY="SELECT id1, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1" 11 | 12 | ch_make_2_runs 13 | 14 | # /* q2: question='sum v1 by id1:id2' */ 15 | Q=2 16 | QUESTION="sum v1 by id1:id2" 17 | QUERY="SELECT id1, id2, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1, id2" 18 | 19 | ch_make_2_runs 20 | 21 | # /* q3: question='sum v1 mean v3 by id3' */ 22 | Q=3 23 | QUESTION="sum v1 mean v3 by id3" 24 | QUERY="SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id3" 25 | 26 | ch_make_2_runs 27 | 28 | # /* q4: question='mean v1:v3 by id4' */ 29 | Q=4 30 | QUESTION="mean v1:v3 by id4" 31 | QUERY="SELECT id4, avg(v1) AS v1, avg(v2) AS v2, avg(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id4" 32 | 33 | ch_make_2_runs 34 | 35 | # /* q5: question='sum v1:v3 by id6' */ 36 | Q=5 37 | QUESTION="sum v1:v3 by id6" 38 | QUERY="SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id6" 39 | 40 | ch_make_2_runs 41 | 42 | # /* q6: question='median v3 sd v3 by id4 id5' */ 43 | Q=6 44 | QUESTION="median v3 sd v3 by id4 id5" 45 | QUERY="SELECT id4, id5, medianExact(v3) AS median_v3, stddevPop(v3) AS sd_v3 FROM ${SRC_DATANAME} GROUP BY id4, id5" 46 | 47 | ch_make_2_runs 48 | 49 | # /* q7: question='max v1 - min v2 by id3' */ 50 | Q=7 51 | QUESTION="max v1 - min v2 by id3" 52 | QUERY="SELECT id3, max(v1) - min(v2) AS range_v1_v2 FROM ${SRC_DATANAME} GROUP BY id3" 53 | 54 | ch_make_2_runs 55 | 56 | # /* q8: question='largest two v3 by id6' */ 57 | Q=8 58 | QUESTION="largest two v3 by id6" 59 | QUERY="SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM ${SRC_DATANAME} WHERE v3 IS NOT NULL) AS subq GROUP BY id6" 60 | 61 | ch_make_2_runs 62 | 63 | # /* q9: question='regression v1 v2 by id2 id4' */ 64 | Q=9 65 | QUESTION="regression v1 v2 by id2 id4" 66 | QUERY="SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM ${SRC_DATANAME} GROUP BY id2, id4" 67 | 68 | ch_make_2_runs 69 | 70 | # /* q10: question='sum v3 count by id1:id6' */ 71 | Q=10 72 | QUESTION="sum v3 count by id1:id6" 73 | QUERY="SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM ${SRC_DATANAME} GROUP BY id1, id2, id3, id4, id5, id6" 74 | 75 | ch_make_2_runs 76 | -------------------------------------------------------------------------------- /clickhouse/join-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source ./clickhouse/ch.sh 4 | 5 | SOLUTION=clickhouse 6 | TASK=join 7 | 8 | echo SRC ${SRC_DATANAME} RHS1 ${RHS1} RHS2 ${RHS2} RHS3 ${RHS3} COMPRESS ${COMPRESS} THREADS ${THREADS} 9 | 10 | # /* q1: question='small inner on int' */ 11 | Q=1 12 | QUESTION="small inner on int" 13 | QUERY="SELECT id1, x.id2, x.id3, x.id4, y.id4, x.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS1} AS y USING (id1)" 14 | ch_make_2_runs 15 | 16 | # /* q2: question='medium inner on int' */ 17 | Q=2 18 | QUESTION="medium inner on int" 19 | QUERY="SELECT x.id1, y.id1, id2, x.id3, x.id4, y.id4, x.id5, y.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS2} AS y USING (id2)" 20 | 
ch_make_2_runs 21 | 22 | # /* q3: question='medium outer on int' */ 23 | Q=3 24 | QUESTION="medium outer on int" 25 | QUERY="SELECT x.id1, y.id1, id2, x.id3, x.id4, y.id4, x.id5, y.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x LEFT JOIN ${RHS2} AS y USING (id2)" 26 | ch_make_2_runs 27 | 28 | # /* q4: question='medium inner on factor' */ 29 | Q=4 30 | QUESTION="medium inner on factor" 31 | QUERY="SELECT x.id1, y.id1, x.id2, y.id2, x.id3, x.id4, y.id4, id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS2} AS y USING (id5)" 32 | ch_make_2_runs 33 | 34 | # /* q5: question='big inner on int' */ 35 | Q=5 36 | QUESTION="big inner on int" 37 | QUERY="SELECT x.id1, y.id1, x.id2, y.id2, id3, x.id4, y.id4, x.id5, y.id5, x.id6, y.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS3} AS y USING (id3)" 38 | ch_make_2_runs 39 | -------------------------------------------------------------------------------- /clickhouse/setup-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install 3 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg 4 | curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg 5 | 6 | ARCH=$(dpkg --print-architecture) 7 | echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg arch=${ARCH}] https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list 8 | sudo apt-get update 9 | 10 | sudo apt-get install -y clickhouse-server clickhouse-client 11 | 12 | # stop server if service was already running 13 | sudo service clickhouse-server start ||: 14 | 15 | 16 | # modify clickhouse settings so data is stored on the mount. 17 | sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/ 18 | sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount 19 | 20 | # copy clickhouse config 21 | sudo cp -a /var/lib/clickhouse/. /var/lib/mount/clickhouse-nvme-mount/ 22 | sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml 23 | 24 | 25 | # start server 26 | sudo rm -rf /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse-server/clickhouse-server.log 27 | sudo service clickhouse-server start 28 | 29 | 30 | MEMORY_LIMIT=0 31 | BYTES_BEFORE_EXTERNAL_GROUP_BY=0 32 | if [[ $MACHINE_TYPE == "c6id.4xlarge" ]]; then 33 | MEMORY_LIMIT=28000000000 34 | BYTES_BEFORE_EXTERNAL_GROUP_BY=20000000000 35 | fi 36 | 37 | clickhouse-client --query "CREATE USER IF NOT EXISTS db_benchmark IDENTIFIED WITH no_password SETTINGS max_memory_usage = $MEMORY_LIMIT, max_bytes_before_external_group_by = $BYTES_BEFORE_EXTERNAL_GROUP_BY WRITABLE;" 38 | clickhouse-client --query "GRANT select, insert, create, alter, alter user, create table, truncate, drop, system flush logs on *.* to db_benchmark;" 39 | 40 | ./clickhouse/ver-clickhouse.sh 41 | -------------------------------------------------------------------------------- /clickhouse/upg-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest released 5 | echo 'upgrading clickhouse-server clickhouse-client...' 
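# Editor's sketch (not one of the repository scripts): after the ClickHouse setup
# or upgrade steps above, the db_benchmark user and its memory limits can be
# sanity-checked before launching a benchmark run, e.g.:
#   clickhouse-client --query "SHOW CREATE USER db_benchmark"
#   clickhouse-client --user db_benchmark --query "SELECT name, value FROM system.settings WHERE name IN ('max_memory_usage', 'max_bytes_before_external_group_by')"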
6 | sudo apt-get install --only-upgrade clickhouse-server clickhouse-client 7 | 8 | if [[ $TEST_RUN != "true" ]]; then 9 | sudo chown ubuntu:ubuntu clickhouse/VERSION 10 | sudo chown ubuntu:ubuntu clickhouse/REVISION 11 | fi 12 | 13 | 14 | # modify clickhouse settings so data is stored on the mount. 15 | # This is necessary when clickhouse is installed on a machine but the mount loses all data 16 | sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/ 17 | sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount 18 | 19 | # copy clickhouse config 20 | sudo cp -a /var/lib/clickhouse/. /var/lib/mount/clickhouse-nvme-mount/ 21 | sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml 22 | 23 | 24 | # start server 25 | sudo rm -rf /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse-server/clickhouse-server.log 26 | sudo service clickhouse-server start -------------------------------------------------------------------------------- /clickhouse/ver-clickhouse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./clickhouse/ch.sh # clickhouse helper scripts 5 | 6 | ch_installed && clickhouse-client --version-clean > clickhouse/VERSION && echo "" > clickhouse/REVISION 7 | 8 | if [[ $TEST_RUN != "true" ]]; then 9 | sudo chown ubuntu:ubuntu clickhouse/VERSION 10 | sudo chown ubuntu:ubuntu clickhouse/REVISION 11 | fi -------------------------------------------------------------------------------- /collapse/VERSION: -------------------------------------------------------------------------------- 1 | 2.1.2 2 | -------------------------------------------------------------------------------- /collapse/groupby2014-collapse.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# groupby2014-collapse.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace("data.table", quietly=TRUE)) # collapse does not support integer64. Oversized ints will be summed to double.
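# Editorial aside, illustrative only: why the data.table dependency and the
# integer64 caveat above matter for checksums. Base R sum() on an integer vector
# overflows past .Machine$integer.max and returns NA, so checksums are taken
# either as double (exact up to 2^53) or, in the data.table/dplyr scripts, via
# bit64::as.integer64:
#   sum(rep(1000000000L, 3L))                        # NA plus an integer-overflow warning
#   sum(as.numeric(rep(1000000000L, 3L)))            # 3e+09 (double)
#   sum(bit64::as.integer64(rep(1000000000L, 3L)))   # 3000000000 (integer64)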
8 | .libPaths("./collapse/r-collapse") # tidyverse/collapse#4641 9 | suppressPackageStartupMessages(library("collapse", lib.loc="./collapse/r-collapse", warn.conflicts=FALSE)) 10 | ver = packageVersion("collapse") 11 | git = "" # uses stable version now #124 12 | task = "groupby2014" 13 | solution = "collapse" 14 | fun = "group_by" 15 | cache = TRUE 16 | on_disk = FALSE 17 | 18 | data_name = Sys.getenv("SRC_DATANAME") 19 | src_grp = file.path("data", paste(data_name, "csv", sep=".")) 20 | cat(sprintf("loading dataset %s\n", data_name)) 21 | 22 | x = data.table::fread(src_grp, showProgress=FALSE, data.table=FALSE) 23 | print(nrow(x)) 24 | gc() 25 | 26 | # Setting collapse options: namespace masking and performance 27 | oldopts <- set_collapse(nthreads = data.table::getDTthreads(), 28 | mask = "all", 29 | sort = endsWith(data_name, "_1"), 30 | na.rm = anyNA(num_vars(x)), 31 | stable.algo = FALSE) 32 | 33 | task_init = proc.time()[["elapsed"]] 34 | cat("grouping...\n") 35 | 36 | question = "sum v1 by id1" # q1 37 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1, sum))))[["elapsed"]] 38 | m = memory_usage() 39 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 40 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 41 | rm(ans) 42 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1, sum))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 45 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 46 | print(head(ans, 3)) 47 | print(tail(ans, 3)) 48 | rm(ans) 49 | 50 | question = "sum v1 by id1:id2" # q2 51 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1 + id2, sum))))[["elapsed"]] 52 | m = memory_usage() 53 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 54 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 55 | rm(ans) 56 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1 + id2, sum))))[["elapsed"]] 57 | m = memory_usage() 58 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]] 59 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 60 | print(head(ans, 3)) 61 | print(tail(ans, 3)) 62 | rm(ans) 63 | 64 | question = "sum v1 mean v3 by id3" # q3 65 | t = system.time(print(dim(ans<-collap(x, ~ id3, custom = list(sum = "v1", mean = "v3")))))[["elapsed"]] 66 | m = memory_usage() 67 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v3=sum(v3)))[["elapsed"]] 68 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 69 | rm(ans) 70 | t = 
system.time(print(dim(ans<-collap(x, ~ id3, custom = list(sum = "v1", mean = "v3")))))[["elapsed"]] 71 | m = memory_usage() 72 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v3=sum(v3)))[["elapsed"]] 73 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 74 | print(head(ans, 3)) 75 | print(tail(ans, 3)) 76 | rm(ans) 77 | 78 | question = "mean v1:v3 by id4" # q4 79 | t = system.time(print(dim(ans<-x |> group_by(id4) |> select(v1:v3) |> mean())))[["elapsed"]] 80 | m = memory_usage() 81 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 82 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 83 | rm(ans) 84 | t = system.time(print(dim(ans<-x |> group_by(id4) |> select(v1:v3) |> mean())))[["elapsed"]] 85 | m = memory_usage() 86 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 87 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 88 | print(head(ans, 3)) 89 | print(tail(ans, 3)) 90 | rm(ans) 91 | 92 | question = "sum v1:v3 by id6" # q5 93 | t = system.time(print(dim(ans<-x |> group_by(id6) |> select(v1:v3) |> sum())))[["elapsed"]] 94 | m = memory_usage() 95 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 96 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 97 | rm(ans) 98 | t = system.time(print(dim(ans<-x |> group_by(id6) |> select(v1:v3) |> sum())))[["elapsed"]] 99 | m = memory_usage() 100 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 101 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 102 | print(head(ans, 3)) 103 | print(tail(ans, 3)) 104 | rm(ans) 105 | 106 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 107 | 108 | set_collapse(oldopts) 109 | 110 | if( !interactive() ) q("no", status=0) 111 | -------------------------------------------------------------------------------- /collapse/setup-collapse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable collapse 5 | mkdir -p ./collapse/r-collapse 6 | Rscript -e 'install.packages(c("Rcpp", "collapse"), lib="./collapse/r-collapse", repos = "http://cloud.r-project.org")' 7 | 8 | ./collapse/ver-collapse.sh -------------------------------------------------------------------------------- /collapse/upg-collapse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set 
-e 3 | 4 | # upgrade all packages in collapse library only if new collapse is out 5 | echo 'upgrading collapse...' 6 | Rscript -e 'ap=available.packages(); if (ap["collapse","Version"]!=packageVersion("collapse", lib.loc="./collapse/r-collapse")) update.packages(lib.loc="./collapse/r-collapse", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /collapse/ver-collapse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="collapse", lib.loc="./collapse/r-collapse", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("collapse", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /dask/VERSION: -------------------------------------------------------------------------------- 1 | 2024.9.0 -------------------------------------------------------------------------------- /dask/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gc 3 | import os 4 | import logging 5 | import timeit 6 | from abc import ABC, abstractmethod 7 | from typing import Iterable, Any 8 | 9 | import dask.dataframe as dd 10 | from dask import distributed 11 | 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format='{ %(name)s:%(lineno)d @ %(asctime)s } - %(message)s' 15 | ) 16 | logger = logging.getLogger(__name__) 17 | 18 | THIS_DIR = os.path.abspath( 19 | os.path.dirname(__file__) 20 | ) 21 | HELPERS_DIR = os.path.abspath( 22 | os.path.join( 23 | THIS_DIR, '../_helpers' 24 | ) 25 | ) 26 | sys.path.extend((THIS_DIR, HELPERS_DIR)) 27 | from helpers import * 28 | 29 | class Query(ABC): 30 | question: str = None 31 | 32 | @staticmethod 33 | @abstractmethod 34 | def query(*args) -> dd.DataFrame: 35 | pass 36 | 37 | @staticmethod 38 | @abstractmethod 39 | def check(ans: dd.DataFrame) -> Any: 40 | pass 41 | 42 | @classmethod 43 | def name(cls) -> str: 44 | return f"{cls.__name__}: {cls.question}" 45 | 46 | class QueryRunner: 47 | def __init__( 48 | self, 49 | task: str, 50 | solution: str, 51 | solution_version: str, 52 | solution_revision: str, 53 | fun: str, 54 | cache: str, 55 | on_disk: bool 56 | ): 57 | self.task = task 58 | self.solution = solution 59 | self.solution_version = solution_version 60 | self.solution_revision = solution_revision 61 | self.fun = fun 62 | self.cache = cache 63 | self.on_disk = on_disk 64 | 65 | def run_query( 66 | self, 67 | data_name: str, 68 | in_rows: int, 69 | args: Iterable[Any], 70 | query: Query, 71 | machine_type: str, 72 | runs: int = 2, 73 | raise_exception: bool = False, 74 | ): 75 | logger.info("Running '%s'" % query.name()) 76 | 77 | try: 78 | for run in range(1, runs+1): 79 | gc.collect() # TODO: Able to do this in worker processes? Want to? 
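# Note on the protocol implemented below: each question is executed `runs`
# times (2 by default) and every run is timed and logged as its own row via
# write_log(). query.check(ans) computes a checksum of the answer whose cost is
# recorded separately as chk_time_sec, so validation does not inflate time_sec;
# the head/tail preview is printed only on the last run.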
80 | 81 | # Calculate ans 82 | t_start = timeit.default_timer() 83 | ans = query.query(*args) 84 | logger.debug("Answer shape: %s" % (ans.shape, )) 85 | t = timeit.default_timer() - t_start 86 | m = memory_usage() 87 | 88 | logger.info("\tRun #%s: %0.3fs" % (run, t)) 89 | 90 | # Calculate chk 91 | t_start = timeit.default_timer() 92 | chk = query.check(ans) 93 | chkt = timeit.default_timer() - t_start 94 | 95 | 96 | write_log( 97 | task=self.task, 98 | data=data_name, 99 | in_rows=in_rows, 100 | question=query.question, 101 | out_rows=ans.shape[0], 102 | out_cols=ans.shape[1], 103 | solution=self.solution, 104 | version=self.solution_version, 105 | git=self.solution_revision, 106 | fun=self.fun, 107 | run=run, 108 | time_sec=t, 109 | mem_gb=m, 110 | cache=self.cache, 111 | chk=make_chk(chk), 112 | chk_time_sec=chkt, 113 | on_disk=self.on_disk, 114 | machine_type=machine_type 115 | ) 116 | if run == runs: 117 | # Print head / tail on last run 118 | logger.debug("Answer head:\n%s" % ans.head(3)) 119 | logger.debug("Answer tail:\n%s" % ans.tail(3)) 120 | del ans 121 | except Exception as err: 122 | logger.error("Query '%s' failed!" % query.name()) 123 | print(err) 124 | 125 | # Re-raise if instructed 126 | if raise_exception: 127 | raise err 128 | 129 | def dask_client() -> distributed.Client: 130 | # we use process-pool instead of thread-pool due to GIL cost 131 | return distributed.Client(processes=True, silence_logs=logging.ERROR) 132 | -------------------------------------------------------------------------------- /dask/setup-dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | virtualenv dask/py-dask --python=python3 5 | source dask/py-dask/bin/activate 6 | 7 | # install binaries 8 | python3 -m pip install "dask[complete]" 9 | 10 | # check 11 | # python3 12 | # import dask as dk 13 | # dk.__version__ 14 | # dk.__git_revision__ 15 | # quit() 16 | 17 | deactivate 18 | 19 | ./dask/ver-dask.sh 20 | -------------------------------------------------------------------------------- /dask/upg-dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading dask...' 
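# Illustrative sketch, not a file from the repository: how a dask solution
# script is expected to use the Query/QueryRunner helpers from dask/common.py
# above. The class name SumV1ById1, the CSV path and the parameter values below
# are assumptions for the example; the real question classes live in
# dask/groupby_dask.py.
import dask.dataframe as dd
from common import Query, QueryRunner, dask_client

class SumV1ById1(Query):
    question = "sum v1 by id1"  # q1

    @staticmethod
    def query(x):
        # build the lazy groupby graph, then materialise the answer
        return x.groupby("id1").agg({"v1": "sum"}).compute()

    @staticmethod
    def check(ans):
        # checksum used for answer validation
        return ans["v1"].sum()

client = dask_client()  # process-based cluster, see dask_client() above
x = dd.read_csv("data/G1_1e7_1e2_0_0.csv").persist()
runner = QueryRunner(task="groupby", solution="dask", solution_version="2024.9.0",
                     solution_revision="", fun="groupby", cache="TRUE", on_disk=False)
runner.run_query(data_name="G1_1e7_1e2_0_0", in_rows=10_000_000, args=[x],
                 query=SumV1ById1, machine_type="c6id.4xlarge", runs=2)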
5 | 6 | source ./dask/py-dask/bin/activate 7 | 8 | python3 -m pip install --upgrade dask[complete] > /dev/null 9 | -------------------------------------------------------------------------------- /dask/ver-dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./dask/py-dask/bin/activate 5 | python3 -c 'import dask as dk; open("dask/VERSION","w").write(dk.__version__); open("dask/REVISION","w").write(dk.__git_revision__);' > /dev/null 6 | -------------------------------------------------------------------------------- /datafusion/VERSION: -------------------------------------------------------------------------------- 1 | 47.0.0 -------------------------------------------------------------------------------- /datafusion/setup-datafusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | virtualenv datafusion/py-datafusion --python=python3 5 | source datafusion/py-datafusion/bin/activate 6 | 7 | python3 -m pip install --upgrade psutil datafusion pandas 8 | 9 | # build 10 | deactivate 11 | ./datafusion/upg-datafusion.sh 12 | 13 | ./datafusion/ver-datafusion.sh 14 | 15 | # check 16 | # source datafusion/py-datafusion/bin/activate 17 | # python3 18 | # import datafusion as df 19 | # df.__version__ 20 | # quit() 21 | # deactivate 22 | 23 | # fix: print(ans.head(3), flush=True): UnicodeEncodeError: 'ascii' codec can't encode characters in position 14-31: ordinal not in range(128) 24 | # vim datafusion/py-datafusion/bin/activate 25 | #deactivate () { 26 | # unset PYTHONIOENCODING 27 | # ... 28 | #} 29 | #... 30 | #PYTHONIOENCODING="utf-8" 31 | #export PYTHONIOENCODING 32 | #... 33 | -------------------------------------------------------------------------------- /datafusion/upg-datafusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading datafusion...' 
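# Editor's sketch related to the UnicodeEncodeError workaround documented in
# setup-datafusion.sh above: rather than hand-editing the virtualenv's activate
# script, the same effect can be obtained per invocation by exporting the
# variable before launching a solution script (paths illustrative):
#   export PYTHONIOENCODING=utf-8
#   source datafusion/py-datafusion/bin/activate
#   python3 datafusion/groupby-datafusion.py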
5 | 6 | source ./datafusion/py-datafusion/bin/activate 7 | 8 | python -m pip install --upgrade datafusion > /dev/null 9 | 10 | deactivate -------------------------------------------------------------------------------- /datafusion/ver-datafusion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source ./datafusion/py-datafusion/bin/activate 3 | python3 -c 'import datafusion as df; open("datafusion/VERSION","w").write(df.__version__); open("datafusion/REVISION","w").write("");' > /dev/null 4 | -------------------------------------------------------------------------------- /datatable/VERSION: -------------------------------------------------------------------------------- 1 | 1.16.99 2 | -------------------------------------------------------------------------------- /datatable/groupby2014-datatable.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# groupby2014-datatable.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace(c("bit64"), quietly=TRUE)) # used in chk to sum numeric columns 8 | suppressPackageStartupMessages(library("data.table", lib.loc="./datatable/r-datatable")) 9 | setDTthreads(0L) 10 | ver = packageVersion("data.table") 11 | git = data.table:::.git(quiet=TRUE) 12 | task = "groupby2014" 13 | solution = "data.table" 14 | fun = "[.data.table" 15 | cache = TRUE 16 | on_disk = FALSE 17 | 18 | data_name = Sys.getenv("SRC_DATANAME") 19 | src_grp = file.path("data", paste(data_name, "csv", sep=".")) 20 | cat(sprintf("loading dataset %s\n", data_name)) 21 | 22 | x = fread(src_grp, showProgress=FALSE) 23 | print(nrow(x)) 24 | 25 | task_init = proc.time()[["elapsed"]] 26 | cat("grouping...\n") 27 | 28 | question = "sum v1 by id1" # q1 29 | t = system.time(print(dim(ans<-x[, sum(v1), keyby=id1])))[["elapsed"]] 30 | m = memory_usage() 31 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 32 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 33 | rm(ans) 34 | t = system.time(print(dim(ans<-x[, sum(v1), keyby=id1])))[["elapsed"]] 35 | m = memory_usage() 36 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 37 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 38 | print(head(ans, 3)) 39 | print(tail(ans, 3)) 40 | rm(ans) 41 | 42 | question = "sum v1 by id1:id2" # q2 43 | t = system.time(print(dim(ans<-x[, sum(v1), keyby='id1,id2'])))[["elapsed"]] 44 | m = memory_usage() 45 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 46 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 47 | rm(ans) 48 | t = system.time(print(dim(ans<-x[, sum(v1), keyby='id1,id2'])))[["elapsed"]] 49 | m = memory_usage() 50 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]] 51 | write.log(run=2L, task=task, data=data_name, 
in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 52 | print(head(ans, 3)) 53 | print(tail(ans, 3)) 54 | rm(ans) 55 | 56 | question = "sum v1 mean v3 by id3" # q3 57 | t = system.time(print(dim(ans<-x[, list(sum(v1), mean(v3)), keyby=id3])))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)), sum(V2))])[["elapsed"]] 60 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 61 | rm(ans) 62 | t = system.time(print(dim(ans<-x[, list(sum(v1), mean(v3)), keyby=id3])))[["elapsed"]] 63 | m = memory_usage() 64 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)), sum(V2))])[["elapsed"]] 65 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 66 | print(head(ans, 3)) 67 | print(tail(ans, 3)) 68 | rm(ans) 69 | 70 | question = "mean v1:v3 by id4" # q4 71 | t = system.time(print(dim(ans<-x[, lapply(.SD, mean), keyby=id4, .SDcols=7:9])))[["elapsed"]] 72 | m = memory_usage() 73 | chkt = system.time(chk<-ans[, .(sum(v1), sum(v2), sum(v3))])[["elapsed"]] 74 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 75 | rm(ans) 76 | t = system.time(print(dim(ans<-x[, lapply(.SD, mean), keyby=id4, .SDcols=7:9])))[["elapsed"]] 77 | m = memory_usage() 78 | chkt = system.time(chk<-ans[, .(sum(v1), sum(v2), sum(v3))])[["elapsed"]] 79 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 80 | print(head(ans, 3)) 81 | print(tail(ans, 3)) 82 | rm(ans) 83 | 84 | question = "sum v1:v3 by id6" # q5 85 | t = system.time(print(dim(ans<-x[, lapply(.SD, sum), keyby=id6, .SDcols=7:9])))[["elapsed"]] 86 | m = memory_usage() 87 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(v1)), sum(bit64::as.integer64(v2)), sum(v3))])[["elapsed"]] 88 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 89 | rm(ans) 90 | t = system.time(print(dim(ans<-x[, lapply(.SD, sum), keyby=id6, .SDcols=7:9])))[["elapsed"]] 91 | m = memory_usage() 92 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(v1)), sum(bit64::as.integer64(v2)), sum(v3))])[["elapsed"]] 93 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 94 | print(head(ans, 3)) 95 | print(tail(ans, 3)) 96 | rm(ans) 97 | 
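# Editorial aside, not part of the script: q4/q5 above use the
# x[, lapply(.SD, fun), keyby=..., .SDcols=7:9] idiom, where .SD holds the
# columns selected by .SDcols (here 7:9, i.e. v1:v3) within each group and
# lapply() applies the aggregate to each of them. Toy illustration:
#   dt = data.table::data.table(id = c("a","a","b"), v1 = 1:3, v2 = 4:6)
#   dt[, lapply(.SD, sum), keyby = id, .SDcols = c("v1","v2")]
#   # group "a": v1 = 3, v2 = 9; group "b": v1 = 3, v2 = 6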
98 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 99 | 100 | if( !interactive() ) q("no", status=0) 101 | -------------------------------------------------------------------------------- /datatable/read-datatable.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# read-datatable.R\n") 4 | 5 | source("./helpers.R") 6 | source("./datatable/helpers-datatable.R") 7 | 8 | suppressPackageStartupMessages(library(data.table)) 9 | ver = packageVersion("data.table") 10 | git = datatable.git() 11 | task = "read" 12 | solution = "data.table" 13 | fun = "fread" 14 | cache = TRUE 15 | 16 | src_grp = Sys.getenv("SRC_GRP_LOCAL") 17 | data_name = basename(src_grp) 18 | options("datatable.showProgress"=FALSE) 19 | 20 | in_rows = as.numeric(strsplit(system(sprintf("wc -l %s", data_name), intern=TRUE), " ", fixed=TRUE)[[1L]][1L])-1 21 | 22 | cat("reading...\n") 23 | 24 | question = "all rows" #1 25 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]] 26 | m = memory_usage() 27 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 28 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 29 | rm(ans) 30 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]] 31 | m = memory_usage() 32 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 33 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 34 | rm(ans) 35 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]] 36 | m = memory_usage() 37 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 38 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 39 | rm(ans) 40 | 41 | question = "top 100 rows" #2 42 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 45 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 46 | rm(ans) 47 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]] 48 | m = memory_usage() 49 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 50 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 51 | rm(ans) 52 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]] 53 | m = memory_usage() 54 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]] 55 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), 
chk_time_sec=chkt) 56 | rm(ans) 57 | 58 | if( !interactive() ) q("no", status=0) 59 | -------------------------------------------------------------------------------- /datatable/setup-datatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install devel data.table 3 | mkdir -p ./datatable/r-datatable 4 | Rscript -e 'install.packages("data.table", repos="https://Rdatatable.gitlab.io/data.table", method="curl", lib="./datatable/r-datatable")' 5 | 6 | ./datatable/ver-datatable.sh 7 | -------------------------------------------------------------------------------- /datatable/sort-datatable.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# sort-datatable.R\n") 4 | 5 | source("./helpers.R") 6 | source("./datatable/helpers-datatable.R") 7 | 8 | src_x = Sys.getenv("SRC_X", NA_character_) 9 | 10 | # if (get.nrow(src_x) > 1e9L) { 11 | # cat("# sort with data.table skipped due data volume cap for single machine set to total 1e9 rows") 12 | # quit("no", status=0) # datasets > 1e9 too big to try load on single machine 13 | # } 14 | 15 | stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns 16 | suppressPackageStartupMessages(library(data.table)) 17 | ver = packageVersion("data.table") 18 | git = datatable.git() 19 | data_name = basename(src_x) 20 | task = "sort" 21 | solution = "data.table" 22 | fun = "[.data.table" 23 | question = "by int KEY" 24 | cache = TRUE 25 | 26 | cat("loading dataset...\n") 27 | X = fread(if(file.exists(basename(src_x))) basename(src_x) else sprintf("hadoop fs -cat %s", src_x)) # csv can be provided in local dir for faster import 28 | 29 | cat("sorting...\n") 30 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]] 31 | m = memory_usage() 32 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]] 33 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 34 | rm(ans) 35 | 36 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]] 37 | m = memory_usage() 38 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]] 39 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 40 | rm(ans) 41 | 42 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]] 45 | write.log(run=3L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 46 | rm(ans) 47 | 48 | if( !interactive() ) q("no", status=0) 49 | -------------------------------------------------------------------------------- /datatable/upg-datatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest devel 5 | echo 'upgrading data.table...' 
6 | # Rscript -e 'data.table::update.dev.pkg(quiet=TRUE, method="curl", lib="./datatable/r-datatable")' 7 | Rscript -e 'update.packages(lib.loc = "./datatable/r-datatable", repos="https://rdatatable.gitlab.io/data.table", method="curl")' 8 | 9 | -------------------------------------------------------------------------------- /datatable/ver-datatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="data.table", lib.loc="./datatable/r-datatable", "DESCRIPTION"), fields=c("Version","Revision")); cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("datatable", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /dplyr/VERSION: -------------------------------------------------------------------------------- 1 | 1.1.4 2 | -------------------------------------------------------------------------------- /dplyr/groupby2014-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# groupby2014-dplyr.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace(c("bit64","data.table"), quietly=TRUE)) # used in chk to sum numeric columns and data loading 8 | .libPaths("./dplyr/r-dplyr") # tidyverse/dplyr#4641 9 | suppressPackageStartupMessages(library("dplyr", lib.loc="./dplyr/r-dplyr", warn.conflicts=FALSE)) 10 | ver = packageVersion("dplyr") 11 | git = "" # uses stable version now #124 12 | task = "groupby2014" 13 | solution = "dplyr" 14 | fun = "group_by" 15 | cache = TRUE 16 | on_disk = FALSE 17 | 18 | data_name = Sys.getenv("SRC_DATANAME") 19 | src_grp = file.path("data", paste(data_name, "csv", sep=".")) 20 | cat(sprintf("loading dataset %s\n", data_name)) 21 | 22 | x = as_tibble(data.table::fread(src_grp, showProgress=FALSE, data.table=FALSE)) 23 | print(nrow(x)) 24 | 25 | task_init = proc.time()[["elapsed"]] 26 | cat("grouping...\n") 27 | 28 | question = "sum v1 by id1" # q1 29 | t = system.time(print(dim(ans<-x %>% group_by(id1) %>% summarise(sum(v1)))))[["elapsed"]] 30 | m = memory_usage() 31 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 32 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 33 | rm(ans) 34 | t = system.time(print(dim(ans<-x %>% group_by(id1) %>% summarise(sum(v1)))))[["elapsed"]] 35 | m = memory_usage() 36 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 37 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 38 | print(head(ans, 3)) 39 | print(tail(ans, 3)) 40 | rm(ans) 41 | 42 | question = "sum v1 by id1:id2" # q2 43 | t = system.time(print(dim(ans<-x %>% group_by(id1,id2) %>% summarise(sum(v1)))))[["elapsed"]] 44 | m = memory_usage() 45 | chkt = system.time(chk<-summarise(ungroup(ans), v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 46 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, 
out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 47 | rm(ans) 48 | t = system.time(print(dim(ans<-x %>% group_by(id1,id2) %>% summarise(sum(v1)))))[["elapsed"]] 49 | m = memory_usage() 50 | chkt = system.time(chk<-summarise(ungroup(ans), v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]] 51 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 52 | print(head(ans, 3)) 53 | print(tail(ans, 3)) 54 | rm(ans) 55 | 56 | question = "sum v1 mean v3 by id3" # q3 57 | t = system.time(print(dim(ans<-x %>% group_by(id3) %>% summarise(sum(v1), mean(v3)))))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`)), v3=sum(`mean(v3)`)))[["elapsed"]] 60 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 61 | rm(ans) 62 | t = system.time(print(dim(ans<-x %>% group_by(id3) %>% summarise(sum(v1), mean(v3)))))[["elapsed"]] 63 | m = memory_usage() 64 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`)), v3=sum(`mean(v3)`)))[["elapsed"]] 65 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 66 | print(head(ans, 3)) 67 | print(tail(ans, 3)) 68 | rm(ans) 69 | 70 | question = "mean v1:v3 by id4" # q4 71 | t = system.time(print(dim(ans<-x %>% group_by(id4) %>% summarise(across(v1:v3, mean)))))[["elapsed"]] 72 | m = memory_usage() 73 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 74 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 75 | rm(ans) 76 | t = system.time(print(dim(ans<-x %>% group_by(id4) %>% summarise(across(v1:v3, mean)))))[["elapsed"]] 77 | m = memory_usage() 78 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]] 79 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 80 | print(head(ans, 3)) 81 | print(tail(ans, 3)) 82 | rm(ans) 83 | 84 | question = "sum v1:v3 by id6" # q5 85 | t = system.time(print(dim(ans<-x %>% group_by(id6) %>% summarise(across(v1:v3, sum)))))[["elapsed"]] 86 | m = memory_usage() 87 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(v1)), v2=sum(bit64::as.integer64(v2)), v3=sum(v3)))[["elapsed"]] 88 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), 
chk_time_sec=chkt, on_disk=on_disk) 89 | rm(ans) 90 | t = system.time(print(dim(ans<-x %>% group_by(id6) %>% summarise(across(v1:v3, sum)))))[["elapsed"]] 91 | m = memory_usage() 92 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(v1)), v2=sum(bit64::as.integer64(v2)), v3=sum(v3)))[["elapsed"]] 93 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 94 | print(head(ans, 3)) 95 | print(tail(ans, 3)) 96 | rm(ans) 97 | 98 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 99 | 100 | if( !interactive() ) q("no", status=0) 101 | -------------------------------------------------------------------------------- /dplyr/join-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# join-dplyr.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | stopifnot(requireNamespace(c("data.table"), quietly=TRUE)) # used for data loading 8 | .libPaths("./dplyr/r-dplyr") # tidyverse/dplyr#4641 9 | suppressPackageStartupMessages(library("dplyr", lib.loc="./dplyr/r-dplyr", warn.conflicts=FALSE)) 10 | ver = packageVersion("dplyr") 11 | git = "" # uses stable version now #124 12 | task = "join" 13 | solution = "dplyr" 14 | cache = TRUE 15 | on_disk = FALSE 16 | 17 | data_name = Sys.getenv("SRC_DATANAME") 18 | src_jn_x = file.path("data", paste(data_name, "csv", sep=".")) 19 | y_data_name = join_to_tbls(data_name) 20 | src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name)) 21 | stopifnot(length(src_jn_y)==3L) 22 | cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", "))) 23 | 24 | x = as_tibble(data.table::fread(src_jn_x, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings="")) 25 | JN = lapply(sapply(simplify=FALSE, src_jn_y, data.table::fread, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings=""), as_tibble) 26 | print(nrow(x)) 27 | sapply(sapply(JN, nrow), print) -> nul 28 | small = JN$small 29 | medium = JN$medium 30 | big = JN$big 31 | 32 | task_init = proc.time()[["elapsed"]] 33 | cat("joining...\n") 34 | 35 | question = "small inner on int" # q1 36 | fun = "inner_join" 37 | t = system.time(print(dim(ans<-inner_join(x, small, by="id1"))))[["elapsed"]] 38 | m = memory_usage() 39 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 40 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 41 | rm(ans) 42 | t = system.time(print(dim(ans<-inner_join(x, small, by="id1"))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 45 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 46 | print(head(ans, 3)) 47 | print(tail(ans, 3)) 48 | rm(ans) 49 | 50 | question = "medium inner on int" # q2 51 | fun = "inner_join" 52 | t = 
system.time(print(dim(ans<-inner_join(x, medium, by="id2"))))[["elapsed"]] 53 | m = memory_usage() 54 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 55 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 56 | rm(ans) 57 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id2"))))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 60 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 61 | print(head(ans, 3)) 62 | print(tail(ans, 3)) 63 | rm(ans) 64 | 65 | question = "medium outer on int" # q3 66 | fun = "left_join" 67 | t = system.time(print(dim(ans<-left_join(x, medium, by="id2"))))[["elapsed"]] 68 | m = memory_usage() 69 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 70 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 71 | rm(ans) 72 | t = system.time(print(dim(ans<-left_join(x, medium, by="id2"))))[["elapsed"]] 73 | m = memory_usage() 74 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 75 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 76 | print(head(ans, 3)) 77 | print(tail(ans, 3)) 78 | rm(ans) 79 | 80 | question = "medium inner on factor" # q4 81 | fun = "inner_join" 82 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id5"))))[["elapsed"]] 83 | m = memory_usage() 84 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 85 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 86 | rm(ans) 87 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id5"))))[["elapsed"]] 88 | m = memory_usage() 89 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 90 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 91 | print(head(ans, 3)) 92 | print(tail(ans, 3)) 93 | rm(ans) 94 | 95 | question = "big inner on int" # q5 96 | fun = "inner_join" 97 | t = system.time(print(dim(ans<-inner_join(x, big, by="id3"))))[["elapsed"]] 98 | m = memory_usage() 99 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 100 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), 
question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 101 | rm(ans) 102 | t = system.time(print(dim(ans<-inner_join(x, big, by="id3"))))[["elapsed"]] 103 | m = memory_usage() 104 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]] 105 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 106 | print(head(ans, 3)) 107 | print(tail(ans, 3)) 108 | rm(ans) 109 | 110 | cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 111 | 112 | if( !interactive() ) q("no", status=0) 113 | -------------------------------------------------------------------------------- /dplyr/read-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# read-dplyr\n") 4 | 5 | source("./helpers.R") 6 | source("./dplyr/helpers-dplyr.R") 7 | 8 | suppressPackageStartupMessages({ 9 | library(readr, warn.conflicts=FALSE) 10 | library(dplyr, warn.conflicts=FALSE) 11 | }) 12 | ver = NA_character_ #packageVersion("dplyr") 13 | git = NA_character_ #dplyr.git() 14 | task = "read" 15 | solution = "dplyr" 16 | fun = "readr::read_csv" 17 | cache = TRUE 18 | 19 | src_grp = Sys.getenv("SRC_GRP_LOCAL") 20 | data_name = basename(src_grp) 21 | options("readr.show_progress"=FALSE) 22 | 23 | in_rows = as.numeric(strsplit(system(sprintf("wc -l %s", data_name), intern=TRUE), " ", fixed=TRUE)[[1L]][1L])-1 24 | 25 | cat("reading...\n") 26 | 27 | question = "all rows" #1 28 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]] 29 | m = memory_usage() 30 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 31 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 32 | rm(ans) 33 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]] 34 | m = memory_usage() 35 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 36 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 37 | rm(ans) 38 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]] 39 | m = memory_usage() 40 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 41 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 42 | rm(ans) 43 | 44 | question = "top 100 rows" #2 45 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]] 46 | m = memory_usage() 47 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 48 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, 
out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 49 | rm(ans) 50 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]] 51 | m = memory_usage() 52 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 53 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 54 | rm(ans) 55 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]] 56 | m = memory_usage() 57 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]] 58 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 59 | rm(ans) 60 | 61 | if( !interactive() ) q("no", status=0) 62 | -------------------------------------------------------------------------------- /dplyr/setup-dplyr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable dplyr 5 | mkdir -p ./dplyr/r-dplyr 6 | Rscript -e 'install.packages("dplyr", lib="./dplyr/r-dplyr", repos = "http://cloud.r-project.org")' 7 | 8 | ./dplyr/ver-dplyr.sh 9 | -------------------------------------------------------------------------------- /dplyr/sort-dplyr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# sort-dplyr\n") 4 | 5 | source("./helpers.R") 6 | source("./dplyr/helpers-dplyr.R") 7 | 8 | src_x = Sys.getenv("SRC_X", NA_character_) 9 | 10 | # if (get.nrow(src_x) > 1e9L) { 11 | # cat("# sort with dplyr skipped due data volume cap for single machine set to total 1e9 rows") 12 | # quit("no", status=0) # datasets > 1e9 too big to try load on single machine 13 | # } 14 | 15 | stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns 16 | suppressPackageStartupMessages(library(dplyr, warn.conflicts=FALSE)) 17 | ver = packageVersion("dplyr") 18 | git = dplyr.git() 19 | data_name = basename(src_x) 20 | task = "sort" 21 | solution = "dplyr" 22 | fun = "arrange" 23 | question = "by int KEY" 24 | cache = TRUE 25 | 26 | cat("loading dataset...\n") 27 | X = data.table::fread(if(file.exists(basename(src_x))) basename(src_x) else sprintf("hadoop fs -cat %s", src_x), data.table=FALSE) # csv can be provided in local dir for faster import 28 | 29 | cat("sorting...\n") 30 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]] 31 | m = memory_usage() 32 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]] 33 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 34 | rm(ans) 35 | 36 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]] 37 | m = memory_usage() 38 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]] 39 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), 
out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 40 | rm(ans) 41 | 42 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]] 43 | m = memory_usage() 44 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]] 45 | write.log(run=3L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 46 | rm(ans) 47 | 48 | if( !interactive() ) q("no", status=0) 49 | -------------------------------------------------------------------------------- /dplyr/upg-dplyr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade all packages in dplyr library only if new dplyr is out 5 | echo 'upgrading dplyr...' 6 | Rscript -e 'ap=available.packages(); if (ap["dplyr","Version"]!=packageVersion("dplyr", lib.loc="./dplyr/r-dplyr")) update.packages(lib.loc="./dplyr/r-dplyr", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /dplyr/ver-dplyr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="dplyr", lib.loc="./dplyr/r-dplyr", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("dplyr", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /duckdb-latest/VERSION: -------------------------------------------------------------------------------- 1 | 1.0.99.9000 2 | -------------------------------------------------------------------------------- /duckdb-latest/setup-duckdb-latest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable duckdb-latest 5 | rm -rf ./duckdb-latest/r-duckdb-latest 6 | mkdir -p ./duckdb-latest/r-duckdb-latest 7 | # Rscript -e 'withr::with_libpaths(new = "./duckdb-latest/r-duckdb-latest", devtools::install_github("duckdb-latest/duckdb-latest/tools/rpkg"))' 8 | # prevent errors when running 'ver-duckdb-latest.sh' 9 | Rscript -e 'install.packages("DBI", lib="./duckdb-latest/r-duckdb-latest", repos = "http://cloud.r-project.org")' 10 | 11 | 12 | cd duckdb-latest 13 | rm -rf duckdb-r 14 | git clone https://github.com/duckdb/duckdb-r.git 15 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 16 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r 17 | rm -rf duckdb-r 18 | cd .. 
19 | 20 | ./duckdb-latest/ver-duckdb-latest.sh 21 | -------------------------------------------------------------------------------- /duckdb-latest/upg-duckdb-latest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade duckdb-latest by rebuilding the duckdb-r package from the latest GitHub sources 5 | echo 'upgrading duckdb-latest, installing latest duckdb-r from GitHub' 6 | 7 | rm -rf ./duckdb-latest/r-duckdb-latest 8 | mkdir -p ./duckdb-latest/r-duckdb-latest 9 | Rscript -e 'install.packages("DBI", lib="./duckdb-latest/r-duckdb-latest", repos = "http://cloud.r-project.org")' 10 | 11 | 12 | cd duckdb-latest 13 | rm -rf duckdb-r 14 | git clone https://github.com/duckdb/duckdb-r 15 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 16 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r 17 | rm -rf duckdb-r 18 | cd .. 19 | 20 | ./duckdb-latest/ver-duckdb-latest.sh -------------------------------------------------------------------------------- /duckdb-latest/ver-duckdb-latest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="duckdb", lib.loc="./duckdb-latest/r-duckdb-latest", "DESCRIPTION"), fields=c("Version","Revision")); if (is.na(v[,"Revision"])) { suppressPackageStartupMessages({ requireNamespace("DBI", lib.loc="./duckdb-latest/r-duckdb-latest"); requireNamespace("duckdb", lib.loc="./duckdb-latest/r-duckdb-latest") }); v[,"Revision"] = DBI::dbGetQuery(con<-DBI::dbConnect(duckdb::duckdb()), "SELECT source_id FROM pragma_version()")[[1L]]; invisible(DBI::dbDisconnect(con, shutdown=TRUE)) }; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(gsub("-", ".", v), file.path("duckdb-latest", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | 6 | -------------------------------------------------------------------------------- /duckdb/VERSION: -------------------------------------------------------------------------------- 1 | 1.3.0 2 | -------------------------------------------------------------------------------- /duckdb/setup-duckdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install stable duckdb 5 | rm -rf ./duckdb/r-duckdb 6 | mkdir -p ./duckdb/r-duckdb 7 | # Rscript -e 'withr::with_libpaths(new = "./duckdb/r-duckdb", devtools::install_github("duckdb/duckdb/tools/rpkg"))' 8 | # prevent errors when running 'ver-duckdb.sh' 9 | Rscript -e 'install.packages("DBI", lib="./duckdb/r-duckdb", repos = "http://cloud.r-project.org")' 10 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 11 | MAKE="make -j$ncores" Rscript -e 'install.packages("duckdb", lib="./duckdb/r-duckdb", repos = "http://cloud.r-project.org")' 12 | 13 | ./duckdb/ver-duckdb.sh 14 | -------------------------------------------------------------------------------- /duckdb/upg-duckdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | rm -rf ./duckdb/r-duckdb 5 | mkdir -p ./duckdb/r-duckdb 6 | 7 | 8 | cd duckdb 9 | rm -rf duckdb-r 10 | git clone https://github.com/duckdb/duckdb-r 11 | cd duckdb-r 12 | git checkout v1.2.0 13 | cd .. 14 | ncores=$(nproc --all) 15 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r 16 | rm -rf duckdb-r 17 | cd ..
18 | 19 | 20 | # Rscript -e 'ap=available.packages(repos="https://cloud.r-project.org/"); if (ap["duckdb","Version"]!=packageVersion("duckdb", lib.loc="./duckdb/r-duckdb")) update.packages(lib.loc="./duckdb/r-duckdb", ask=FALSE, checkBuilt=TRUE, quiet=TRUE, repos="https://cloud.r-project.org/")' 21 | -------------------------------------------------------------------------------- /duckdb/ver-duckdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="duckdb", lib.loc="./duckdb/r-duckdb", "DESCRIPTION"), fields=c("Version","Revision")); if (is.na(v[,"Revision"])) { suppressPackageStartupMessages({ requireNamespace("DBI", lib.loc="./duckdb/r-duckdb"); requireNamespace("duckdb", lib.loc="./duckdb/r-duckdb") }); v[,"Revision"] = DBI::dbGetQuery(DBI::dbConnect(duckdb::duckdb()), "SELECT source_id FROM pragma_version()")[[1L]] }; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(gsub("-", ".", v), file.path("duckdb", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /h2o/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo 'usage: ./h2o/exec.sh groupby'; 6 | exit 1 7 | fi; 8 | 9 | source ./h2o/h2o.sh 10 | 11 | h2o_active && echo 'h2o instance should not be already running, investigate' >&2 12 | h2o_active && exit 1 13 | 14 | # start h2o 15 | h2o_start "h2o_$1_""$SRC_DATANAME" 16 | 17 | # confirm h2o working 18 | h2o_active || sleep 30 19 | h2o_active || echo 'h2o instance should be already running, investigate' >&2 20 | h2o_active || exit 1 21 | 22 | # execute benchmark script 23 | ./h2o/$1-h2o.R || echo "# h2o/exec.sh: benchmark script for $SRC_DATANAME terminated with error" >&2 24 | 25 | # stop h2o instance 26 | h2o_stop && echo '# h2o/exec.sh: stopping h2o instance finished' || echo '# h2o/exec.sh: stopping h2o instance failed' >&2 27 | h2o_active || exit 1 28 | -------------------------------------------------------------------------------- /h2o/h2o.sh: -------------------------------------------------------------------------------- 1 | java_active() { 2 | pgrep -U $UID java > /dev/null 2>&1 3 | } 4 | h2o_active() { 5 | java_active && curl -X GET "localhost:55888/3/About" -H "accept: application/json" > /dev/null 2>&1 6 | } 7 | h2o_start() { 8 | ((!$#)) && echo "h2o_start require h2o instance name as a parameter" >&2 && return 1 9 | echo '# h2o_start: starting h2o instance' 10 | java_active && echo "h2o instance is running already" >&2 && return 1 11 | nohup java -Xmx100G -Xms100G -cp ./h2o/r-h2o/h2o/java/h2o.jar water.H2OApp -name "$1" -baseport 55888 > ./h2o/log/$1.out 2> ./h2o/log/$1.err < /dev/null & 12 | sleep 10 13 | } 14 | h2o_stop() { 15 | echo '# h2o_stop: stopping h2o instance' 16 | java_active || echo "h2o instance was not running already" >&2 17 | java_active || return 0 18 | java_active && echo "sigint h2o instance" && killall -2 -u $USER java > /dev/null 2>&1 19 | sleep 1 && java_active && sleep 15 20 | java_active && echo "sigterm h2o instance" && killall -15 -u $USER java > /dev/null 2>&1 21 | sleep 1 && java_active && sleep 30 22 | java_active && echo "sigkill h2o instance" && killall -9 -u $USER java > /dev/null 2>&1 23 | sleep 1 && java_active && sleep 120 && java_active && echo "h2o instance could not be stopped" >&2 && return 1 24 | 
return 0 25 | } 26 | 27 | -------------------------------------------------------------------------------- /h2o/join-h2o.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("# join-h2o.R\n") 4 | 5 | source("./_helpers/helpers.R") 6 | 7 | suppressPackageStartupMessages(library("h2o", lib.loc="./h2o/r-h2o", warn.conflicts=FALSE, quietly=TRUE)) 8 | ver = packageVersion("h2o") 9 | git = "" 10 | task = "join" 11 | solution = "h2o" 12 | fun = "h2o.merge" 13 | cache = TRUE 14 | on_disk = FALSE 15 | 16 | h = h2o.init(startH2O=FALSE, port=55888) 17 | h2o.no_progress() 18 | 19 | data_name = Sys.getenv("SRC_DATANAME") 20 | src_jn_x = file.path("data", paste(data_name, "csv", sep=".")) 21 | y_data_name = join_to_tbls(data_name) 22 | src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name)) 23 | stopifnot(length(src_jn_y)==3L) 24 | cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", "))) 25 | 26 | x = h2o.importFile(src_jn_x, col.types=c("int","int","int","enum","enum","string","real")) 27 | print(nrow(x)) 28 | small = h2o.importFile(src_jn_y[1L], col.types=c("int","enum","real")) 29 | medium = h2o.importFile(src_jn_y[2L], col.types=c("int","int","enum","enum","real")) 30 | big = h2o.importFile(src_jn_y[3L], col.types=c("int","int","int","enum","enum","string","real")) 31 | sapply(sapply(list(small, medium, big), nrow), print) -> nul 32 | 33 | task_init = proc.time()[["elapsed"]] 34 | cat("joining...\n") 35 | 36 | question = "small inner on int" # q1 37 | 38 | t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]] 39 | m = memory_usage() 40 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 41 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 42 | h2o.rm(ans) 43 | t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]] 44 | m = memory_usage() 45 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 46 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 47 | print(head(ans, 3)) 48 | print(tail(ans, 3)) 49 | h2o.rm(ans) 50 | 51 | question = "medium inner on int" # q2 52 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]] 53 | m = memory_usage() 54 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 55 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 56 | h2o.rm(ans) 57 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]] 58 | m = memory_usage() 59 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 60 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), 
chk_time_sec=chkt, on_disk=on_disk) 61 | print(head(ans, 3)) 62 | print(tail(ans, 3)) 63 | h2o.rm(ans) 64 | 65 | question = "medium outer on int" # q3 66 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]] 67 | m = memory_usage() 68 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]] 69 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 70 | h2o.rm(ans) 71 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]] 72 | m = memory_usage() 73 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]] 74 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 75 | print(head(ans, 3)) 76 | print(tail(ans, 3)) 77 | h2o.rm(ans) 78 | 79 | question = "medium inner on factor" # q4 80 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]] 81 | m = memory_usage() 82 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 83 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 84 | h2o.rm(ans) 85 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]] 86 | m = memory_usage() 87 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 88 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 89 | print(head(ans, 3)) 90 | print(tail(ans, 3)) 91 | h2o.rm(ans) 92 | 93 | question = "big inner on int" # q5 94 | t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]] 95 | m = memory_usage() 96 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 97 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 98 | h2o.rm(ans) 99 | t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]] 100 | m = memory_usage() 101 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]] 102 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) 103 | print(head(ans, 3)) 104 | print(tail(ans, 3)) 105 | h2o.rm(ans) 106 | 107 | h2o.removeAll() 108 | 109 | cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) 110 | 111 | if (!interactive()) q("no", status=0) 112 | -------------------------------------------------------------------------------- 
/h2o/setup-h2o.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ./h2o/log 2 | # install h2o 3 | mkdir -p ./h2o/r-h2o 4 | Rscript -e 'install.packages(c("RCurl","jsonlite"), repos="https://cloud.r-project.org", lib="./h2o/r-h2o"); install.packages("h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", lib="./h2o/r-h2o")' 5 | 6 | -------------------------------------------------------------------------------- /h2o/upg-h2o.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest stable from h2o repo 5 | echo 'upgrading h2o...' 6 | Rscript -e 'ap=available.packages(repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl"); if (ap["h2o","Version"]!=packageVersion("h2o", lib.loc="./h2o/r-h2o")) update.packages(lib.loc="./h2o/r-h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)' 7 | -------------------------------------------------------------------------------- /h2o/ver-h2o.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | Rscript -e 'v=read.dcf(system.file(package="h2o", lib.loc="./h2o/r-h2o", "DESCRIPTION"), fields=c("Version","Revision")); cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("h2o", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))' 5 | -------------------------------------------------------------------------------- /juliadf/VERSION: -------------------------------------------------------------------------------- 1 | 1.6.1 -------------------------------------------------------------------------------- /juliadf/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo 'usage: ./juliadf/exec.sh groupby'; 6 | exit 1 7 | fi; 8 | 9 | source ./path.env 10 | 11 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 12 | 13 | # execute benchmark script 14 | julia -t $ncores ./juliadf/$1-juliadf.jl 15 | -------------------------------------------------------------------------------- /juliadf/setup-juliadf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install julia 3 | 4 | wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz 5 | tar -xvf julia-1.10.5-linux-x86_64.tar.gz > tar.out 2> tar.err 6 | sudo mv julia-1.10.5 /opt 7 | rm julia-1.10.5-linux-x86_64.tar.gz 8 | # put to paths 9 | echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env 10 | echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env 11 | # note that cron job must have path updated as well 12 | 13 | source path.env 14 | 15 | # install julia dataframes and csv packages 16 | julia -q -e 'using Pkg; Pkg.add(["DataFrames","CSV"])' 17 | julia -q -e 'include("$(pwd())/_helpers/helpers.jl"); pkgmeta = getpkgmeta("DataFrames"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("CSV"); println(string(pkgmeta["version"]))' 18 | 19 | ./juliadf/ver-juliadf.sh -------------------------------------------------------------------------------- /juliadf/upg-juliadf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest devel 5 | echo 'upgrading juliadf...' 
6 | julia -q -e 'using Pkg; Pkg.update();' > /dev/null 2>&1 7 | 8 | -------------------------------------------------------------------------------- /juliadf/ver-juliadf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | source path.env 4 | 5 | julia -q -e 'include("$(pwd())/_helpers/helpers.jl"); pkgmeta = getpkgmeta("DataFrames"); f=open("juliadf/VERSION","w"); write(f, string(pkgmeta["version"])); f=open("juliadf/REVISION","w"); write(f, string(pkgmeta["git-tree-sha1"]));' > /dev/null 6 | -------------------------------------------------------------------------------- /juliads/VERSION: -------------------------------------------------------------------------------- 1 | 0.7.21 -------------------------------------------------------------------------------- /juliads/exec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ "$#" -ne 1 ]; then 5 | echo 'usage: ./juliads/exec.sh groupby'; 6 | exit 1 7 | fi; 8 | 9 | source ./path.env 10 | 11 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'` 12 | 13 | # execute benchmark script 14 | julia -t $ncores ./juliads/$1-juliads.jl 15 | -------------------------------------------------------------------------------- /juliads/join-juliads.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env julia 2 | 3 | print("# join-juliads.jl\n"); flush(stdout); 4 | 5 | using InMemoryDatasets; 6 | using Printf; 7 | using DLMReader 8 | using PooledArrays 9 | using Arrow 10 | 11 | # Force Julia to precompile methods for common patterns 12 | IMD.warmup() 13 | 14 | include("$(pwd())/_helpers/helpersds.jl"); 15 | 16 | pkgmeta = getpkgmeta("InMemoryDatasets"); 17 | ver = pkgmeta["version"]; 18 | git = ""; 19 | task = "join"; 20 | solution = "juliads"; 21 | fun = "join"; 22 | cache = true; 23 | on_disk = false; 24 | machine_type = ENV["MACHINE_TYPE"] 25 | isondisk(indata) = (parse(Float64, split(indata, "_")[2])>=10^10) || (parse(Float64, split(indata, "_")[2]) >= 10^9 && machine_type == "c6id.4xlarge") 26 | 27 | data_name = ENV["SRC_DATANAME"]; 28 | src_jn_x = string("data/", data_name, ".csv"); 29 | y_data_name = join_to_tbls(data_name); 30 | src_jn_y = [string("data/", y_data_name[1], ".csv"), string("data/", y_data_name[2], ".csv"), string("data/", y_data_name[3], ".csv")]; 31 | if length(src_jn_y) != 3 32 | error("Something went wrong in preparing files used for join") 33 | end; 34 | 35 | on_disk = isondisk(data_name) 36 | 37 | println(string("loading datasets ", data_name, ", ", y_data_name[1], ", ", y_data_name[2], ", ", y_data_name[3])); flush(stdout); 38 | 39 | # temporary file which will be deleted after the run - usually located at /tmp/ 40 | _tmp_storage = tempname() 41 | if isondisk(data_name) 42 | on_disk = true 43 | big_df = filereader(src_jn_y[3], types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]); 44 | modify!(big_df, [:id4, :id5]=>PooledArray) 45 | Arrow.write(_tmp_storage, big_df[!, :], ntasks=1) 46 | big_df = 0 47 | GC.gc(true) 48 | end 49 | x_df = filereader(src_jn_x, types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]); 50 | small_df = filereader(src_jn_y[1], types=[Int32, Characters{6}, Float64]); 51 | medium_df = filereader(src_jn_y[2], types=[Int32, Int32, Characters{6}, Characters{9}, Float64]); 52 | if isondisk(data_name) 53 | big_df = Dataset(Arrow.Table(_tmp_storage)) 54 |
else 55 | big_df = filereader(src_jn_y[3], types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]); 56 | modify!(big_df, [:id4, :id5]=>PooledArray) 57 | end 58 | 59 | modify!(x_df, [:id4, :id5]=>PooledArray) 60 | modify!(small_df, :id4=>PooledArray) 61 | modify!(medium_df, [:id4, :id5]=>PooledArray) 62 | 63 | in_rows = size(x_df, 1); 64 | println(in_rows); flush(stdout); 65 | println(size(small_df, 1)); flush(stdout); 66 | println(size(medium_df, 1)); flush(stdout); 67 | println(size(big_df, 1)); flush(stdout); 68 | 69 | task_init = time(); 70 | print("joining...\n"); flush(stdout); 71 | 72 | question = "small inner on int"; # q1 73 | GC.gc(); 74 | t = @elapsed (ANS = innerjoin(x_df, small_df, on = :id1, makeunique=true); println(size(ANS)); flush(stdout)); 75 | m = memory_usage(); 76 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 77 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 78 | ANS = 0; 79 | GC.gc(); 80 | t = @elapsed (ANS = innerjoin(x_df, small_df, on = :id1, makeunique=true); println(size(ANS)); flush(stdout)); 81 | m = memory_usage(); 82 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 83 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 84 | println(first(ANS, 3)); 85 | println(last(ANS, 3)); 86 | ANS = 0; 87 | 88 | question = "medium inner on int"; # q2 89 | GC.gc(); 90 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 91 | m = memory_usage(); 92 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 93 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 94 | ANS = 0; 95 | GC.gc(); 96 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 97 | m = memory_usage(); 98 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 99 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 100 | println(first(ANS, 3)); 101 | println(last(ANS, 3)); 102 | ANS = 0; 103 | 104 | question = "medium outer on int"; # q3 105 | GC.gc(); 106 | t = @elapsed (ANS = leftjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 107 | m = memory_usage(); 108 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 109 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 110 | ANS = 0; 111 | GC.gc(); 112 | t = @elapsed (ANS = leftjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout)); 113 | m = memory_usage(); 114 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 115 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 116 | println(first(ANS, 3)); 117 | println(last(ANS, 3)); 118 | ANS = 0; 119 | 120 | question = "medium inner on factor"; # q4 121 | GC.gc(); 122 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id5, makeunique=true); println(size(ANS)); flush(stdout)); 123 | m = memory_usage(); 124 | t_start = time_ns(); 125 | chkt = @elapsed 
chk = [sum(ANS.v1), sum(ANS.v2)]; 126 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 127 | ANS = 0; 128 | GC.gc(); 129 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id5, makeunique=true); println(size(ANS)); flush(stdout)); 130 | m = memory_usage(); 131 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 132 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 133 | println(first(ANS, 3)); 134 | println(last(ANS, 3)); 135 | ANS = 0; 136 | 137 | question = "big inner on int"; # q5 138 | GC.gc(); 139 | t = @elapsed (ANS = innerjoin(x_df, big_df, on = :id3, makeunique=true); println(size(ANS)); flush(stdout)); 140 | m = memory_usage(); 141 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 142 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 143 | ANS = 0; 144 | GC.gc(); 145 | t = @elapsed (ANS = innerjoin(x_df, big_df, on = :id3, makeunique=true); println(size(ANS)); flush(stdout)); 146 | m = memory_usage(); 147 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)]; 148 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type); 149 | println(first(ANS, 3)); 150 | println(last(ANS, 3)); 151 | ANS = 0; 152 | 153 | print(@sprintf "joining finished, took %.0fs\n" (time()-task_init)); flush(stdout); 154 | 155 | exit(); 156 | -------------------------------------------------------------------------------- /juliads/setup-juliads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install julia 3 | 4 | wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz 5 | tar -xvf julia-1.10.5-linux-x86_64.tar.gz > tar_out.out 2> tar_err.err 6 | sudo mv julia-1.10.5 /opt 7 | rm julia-1.10.5-linux-x86_64.tar.gz 8 | # put to paths 9 | echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env 10 | echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env 11 | # note that cron job must have path updated as well 12 | 13 | source path.env 14 | 15 | # install julia InMemoryDatasets and csv packages 16 | julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow", "CSV"])' 17 | julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("DLMReader"); println(string(pkgmeta["version"]))' 18 | 19 | ./juliads/ver-juliads.sh 20 | -------------------------------------------------------------------------------- /juliads/upg-juliads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # upgrade to latest devel 5 | echo 'upgrading juliads...'
6 | julia -q -e 'using Pkg; Pkg.update();' > /dev/null 2>&1 7 | 8 | -------------------------------------------------------------------------------- /juliads/ver-juliads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source path.env 5 | 6 | julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); f=open("juliads/VERSION","w"); write(f, string(pkgmeta["version"])); f=open("juliads/REVISION","w"); write(f, string(" "));' > /dev/null 7 | -------------------------------------------------------------------------------- /modin/join-modin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | print("# join-modin.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import modin.pandas as pd 9 | 10 | exec(open("./helpers.py").read()) 11 | 12 | src_x = os.environ['SRC_X_LOCAL'] 13 | src_y = os.environ['SRC_Y_LOCAL'] 14 | 15 | ver = "" #pd.__version__ 16 | git = "" 17 | task = "join" 18 | question = "inner join" 19 | l = [os.path.basename(src_x), os.path.basename(src_y)] 20 | data_name = '-'.join(l) 21 | solution = "modin" 22 | fun = "merge" 23 | cache = "TRUE" 24 | 25 | print("loading datasets...") 26 | 27 | x = pd.read_csv(os.path.basename(src_x)) 28 | y = pd.read_csv(os.path.basename(src_y)) 29 | 30 | print("joining...") 31 | 32 | # NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin 33 | gc.collect() 34 | t_start = timeit.default_timer() 35 | ans = x.merge(y, how='inner', on='KEY') 36 | print(ans.shape) 37 | t = timeit.default_timer() - t_start 38 | m = memory_usage() 39 | t_start = timeit.default_timer() 40 | chk = [ans['X2'].sum(), ans['Y2'].sum()] 41 | chkt = timeit.default_timer() - t_start 42 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 43 | del ans 44 | 45 | gc.collect() 46 | t_start = timeit.default_timer() 47 | ans = x.merge(y, how='inner', on='KEY') 48 | print(ans.shape) 49 | t = timeit.default_timer() - t_start 50 | m = memory_usage() 51 | t_start = timeit.default_timer() 52 | chk = [ans['X2'].sum(), ans['Y2'].sum()] 53 | chkt = timeit.default_timer() - t_start 54 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 55 | del ans 56 | 57 | gc.collect() 58 | t_start = timeit.default_timer() 59 | ans = x.merge(y, how='inner', on='KEY') 60 | print(ans.shape) 61 | t = timeit.default_timer() - t_start 62 | m = memory_usage() 63 | t_start = timeit.default_timer() 64 | chk = [ans['X2'].sum(), ans['Y2'].sum()] 65 | chkt = timeit.default_timer() - t_start 66 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 67 | del ans 68 | 69 | exit(0) 70 | -------------------------------------------------------------------------------- /modin/setup-modin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | virtualenv 
modin/py-modin --python=python3 5 | source modin/py-modin/bin/activate 6 | 7 | # install binaries 8 | python3 -m pip install --upgrade modin[all] 9 | 10 | # # check 11 | # python3 12 | # import modin 13 | # modin.__version__ 14 | # quit() 15 | 16 | deactivate 17 | -------------------------------------------------------------------------------- /modin/sort-modin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | print("# sort-modin.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import modin as modin 9 | import modin.pandas as pd 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_x = os.environ['SRC_X_LOCAL'] 14 | 15 | ver = modin.__version__ 16 | git = modin.__git_revision__ 17 | task = "sort" 18 | question = "by int KEY" 19 | data_name = os.path.basename(src_x) 20 | solution = "modin" 21 | fun = ".sort" 22 | cache = "TRUE" 23 | 24 | print("loading dataset...") 25 | 26 | x = pd.read_csv(data_name) 27 | 28 | print("sorting...") 29 | 30 | gc.collect() 31 | t_start = timeit.default_timer() 32 | ans = x.sort_values('KEY') 33 | print(ans.shape) 34 | t = timeit.default_timer() - t_start 35 | m = memory_usage() 36 | t_start = timeit.default_timer() 37 | chk = [ans['X2'].sum()] 38 | chkt = timeit.default_timer() - t_start 39 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 40 | del ans 41 | 42 | gc.collect() 43 | t_start = timeit.default_timer() 44 | ans = x.sort_values('KEY') 45 | print(ans.shape) 46 | t = timeit.default_timer() - t_start 47 | m = memory_usage() 48 | t_start = timeit.default_timer() 49 | chk = [ans['X2'].sum()] 50 | chkt = timeit.default_timer() - t_start 51 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 52 | del ans 53 | 54 | gc.collect() 55 | t_start = timeit.default_timer() 56 | ans = x.sort_values('KEY') 57 | print(ans.shape) 58 | t = timeit.default_timer() - t_start 59 | m = memory_usage() 60 | t_start = timeit.default_timer() 61 | chk = [ans['X2'].sum()] 62 | chkt = timeit.default_timer() - t_start 63 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 64 | del ans 65 | 66 | exit(0) 67 | -------------------------------------------------------------------------------- /modin/upg-modin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading modin...'
5 | 6 | source ./modin/py-modin/bin/activate 7 | 8 | python -m pip install --upgrade modin[all] > /dev/null 9 | -------------------------------------------------------------------------------- /modin/ver-modin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./modin/py-modin/bin/activate 5 | python -c 'import modin as modin; open("modin/VERSION","w").write(modin.__version__); open("modin/REVISION","w").write("");' > /dev/null 6 | -------------------------------------------------------------------------------- /pandas/VERSION: -------------------------------------------------------------------------------- 1 | 2.2.2 -------------------------------------------------------------------------------- /pandas/read-pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# read-pandas.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import subprocess 9 | import pandas as pd 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_grp = os.environ['SRC_GRP_LOCAL'] 14 | 15 | ver = pd.__version__ 16 | git = "" 17 | task = "read" 18 | data_name = os.path.basename(src_grp) 19 | solution = "pandas" 20 | fun = "read_csv" 21 | cache = "TRUE" 22 | 23 | wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0] 24 | in_rows = int(wc_lines)-1 25 | 26 | print("reading...") 27 | 28 | question = "all rows" #1 29 | gc.collect() 30 | t_start = timeit.default_timer() 31 | ans = pd.read_csv(data_name) 32 | print(ans.shape) 33 | t = timeit.default_timer() - t_start 34 | m = memory_usage() 35 | t_start = timeit.default_timer() 36 | chk = [ans['v3'].sum()] 37 | chkt = timeit.default_timer() - t_start 38 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 39 | del ans 40 | gc.collect() 41 | t_start = timeit.default_timer() 42 | ans = pd.read_csv(data_name) 43 | print(ans.shape) 44 | t = timeit.default_timer() - t_start 45 | m = memory_usage() 46 | t_start = timeit.default_timer() 47 | chk = [ans['v3'].sum()] 48 | chkt = timeit.default_timer() - t_start 49 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 50 | del ans 51 | gc.collect() 52 | t_start = timeit.default_timer() 53 | ans = pd.read_csv(data_name) 54 | print(ans.shape) 55 | t = timeit.default_timer() - t_start 56 | m = memory_usage() 57 | t_start = timeit.default_timer() 58 | chk = [ans['v3'].sum()] 59 | chkt = timeit.default_timer() - t_start 60 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 61 | del ans 62 | 63 | question = "top 100 rows" #2 64 | gc.collect() 65 | t_start = timeit.default_timer() 66 | ans = pd.read_csv(data_name, nrows=100) 67 | print(ans.shape) 68 | t = timeit.default_timer() - t_start 69 | m = memory_usage() 70 | t_start = timeit.default_timer() 71 | chk = [ans['v3'].sum()] 72 | chkt = timeit.default_timer() - t_start 73 | 
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 74 | del ans 75 | gc.collect() 76 | t_start = timeit.default_timer() 77 | ans = pd.read_csv(data_name, nrows=100) 78 | print(ans.shape) 79 | t = timeit.default_timer() - t_start 80 | m = memory_usage() 81 | t_start = timeit.default_timer() 82 | chk = [ans['v3'].sum()] 83 | chkt = timeit.default_timer() - t_start 84 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 85 | del ans 86 | gc.collect() 87 | t_start = timeit.default_timer() 88 | ans = pd.read_csv(data_name, nrows=100) 89 | print(ans.shape) 90 | t = timeit.default_timer() - t_start 91 | m = memory_usage() 92 | t_start = timeit.default_timer() 93 | chk = [ans['v3'].sum()] 94 | chkt = timeit.default_timer() - t_start 95 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 96 | del ans 97 | 98 | exit(0) 99 | -------------------------------------------------------------------------------- /pandas/setup-pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install all dependencies 5 | # sudo apt-get update 6 | # sudo apt-get install build-essential python3-dev python3-pip 7 | 8 | virtualenv pandas/py-pandas --python=python3 9 | source pandas/py-pandas/bin/activate 10 | 11 | # install binaries 12 | python3 -m pip install --upgrade psutil 13 | python3 -m pip install --upgrade pandas 14 | python3 -m pip install --upgrade pyarrow 15 | 16 | deactivate 17 | 18 | ./pandas/ver-pandas.sh 19 | 20 | # # check 21 | # source pandas/py-pandas/bin/activate 22 | # python3 23 | # import pandas as pd 24 | # pd.__version__ 25 | # quit() 26 | # deactivate 27 | -------------------------------------------------------------------------------- /pandas/sort-pandas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# sort-pandas.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import pandas as pd 9 | #import pydoop.hdfs as hd 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_x = os.environ['SRC_X_LOCAL'] 14 | 15 | ver = pd.__version__ 16 | git = "" 17 | task = "sort" 18 | question = "by int KEY" 19 | data_name = os.path.basename(src_x) 20 | solution = "pandas" 21 | fun = ".sort" 22 | cache = "TRUE" 23 | 24 | print("loading dataset...") 25 | 26 | # with hd.open(src_x) as f: 27 | # x = pd.read_csv(f) 28 | x = pd.read_csv(data_name) 29 | 30 | print("sorting...") 31 | 32 | gc.collect() 33 | t_start = timeit.default_timer() 34 | ans = x.sort_values('KEY') 35 | print(ans.shape) 36 | t = timeit.default_timer() - t_start 37 | m = memory_usage() 38 | t_start = timeit.default_timer() 39 | chk = [ans['X2'].sum()] 40 | chkt = timeit.default_timer() - t_start 41 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, 
chk=make_chk(chk), chk_time_sec=chkt) 42 | del ans 43 | 44 | gc.collect() 45 | t_start = timeit.default_timer() 46 | ans = x.sort_values('KEY') 47 | print(ans.shape) 48 | t = timeit.default_timer() - t_start 49 | m = memory_usage() 50 | t_start = timeit.default_timer() 51 | chk = [ans['X2'].sum()] 52 | chkt = timeit.default_timer() - t_start 53 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 54 | del ans 55 | 56 | gc.collect() 57 | t_start = timeit.default_timer() 58 | ans = x.sort_values('KEY') 59 | print(ans.shape) 60 | t = timeit.default_timer() - t_start 61 | m = memory_usage() 62 | t_start = timeit.default_timer() 63 | chk = [ans['X2'].sum()] 64 | chkt = timeit.default_timer() - t_start 65 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt) 66 | del ans 67 | 68 | exit(0) 69 | -------------------------------------------------------------------------------- /pandas/upg-pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading pandas...' 5 | 6 | source ./pandas/py-pandas/bin/activate 7 | 8 | python3 -m pip install --upgrade pandas > /dev/null 9 | 10 | deactivate -------------------------------------------------------------------------------- /pandas/ver-pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./pandas/py-pandas/bin/activate 5 | python3 -c 'import pandas as pd; open("pandas/VERSION","w").write(pd.__version__); open("pandas/REVISION","w").write(pd.__git_version__);' > /dev/null 6 | deactivate-------------------------------------------------------------------------------- /path.env: -------------------------------------------------------------------------------- 1 | export JULIA_HOME=/opt/julia-1.9.2 2 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 3 | export PATH=$PATH:$JULIA_HOME/bin 4 | export MOUNT_POINT=/var/lib/mount 5 | export SPILL_DIR=/var/lib/mount/db-benchmark-metal/spill 6 | -------------------------------------------------------------------------------- /polars/VERSION: -------------------------------------------------------------------------------- 1 | 1.30.0 -------------------------------------------------------------------------------- /polars/monitor_ram.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import time 3 | import sys 4 | 5 | solution = str(sys.argv[1]) 6 | data_name = str(sys.argv[2]) 7 | pid_of_parent = int(sys.argv[3]) 8 | 9 | max_loops = 720 10 | file_name = f"{solution}-ram-{data_name}.txt" 11 | i = 0 12 | f = open(file_name, "w") 13 | f.close() 14 | while i < max_loops: 15 | # Get the currently available RAM and the RSS of the monitored process 16 | process = psutil.Process(pid_of_parent) 17 | rss_usage = process.memory_info().rss >> 30 18 | ram_usage = psutil.virtual_memory().available >> 30 19 | 20 | # Append the readings to the log file 21 | f = open(file_name, "a") 22 | f.write(f"RAM usage: {ram_usage} GB \n") 23 | f.write(f"RSS usage: {rss_usage} GB \n \n") 24 | f.close() 25 | 26 | # Wait for 5 seconds before polling again 27 | time.sleep(5) 28 | i += 1 29 |
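30 | # Usage sketch (an assumed invocation inferred from the sys.argv parsing above, not a documented interface): start the monitor in the background alongside a benchmark run, passing the solution name, the dataset name and the PID of the benchmark process, e.g. 31 | #   python3 polars/monitor_ram.py polars G1_1e7_1e2_0_0 12345 & 32 | # It then appends the available system RAM and that process's RSS to polars-ram-<data_name>.txt every 5 seconds, for at most max_loops iterations (about an hour).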
-------------------------------------------------------------------------------- /polars/setup-polars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install dependencies 5 | # sudo apt-get update -qq 6 | 7 | virtualenv polars/py-polars --python=python3 8 | source polars/py-polars/bin/activate 9 | 10 | python3 -m pip install --upgrade psutil polars numpy 11 | 12 | # build 13 | deactivate 14 | 15 | ./polars/upg-polars.sh 16 | 17 | ./polars/ver-polars.sh 18 | 19 | # check 20 | # source polars/py-polars/bin/activate 21 | # python3 22 | # import polars as pl 23 | # pl.__version__ 24 | # quit() 25 | # deactivate 26 | 27 | # fix: print(ans.head(3), flush=True): UnicodeEncodeError: 'ascii' codec can't encode characters in position 14-31: ordinal not in range(128) 28 | # vim polars/py-polars/bin/activate 29 | #deactivate () { 30 | # unset PYTHONIOENCODING 31 | # ... 32 | #} 33 | #... 34 | #PYTHONIOENCODING="utf-8" 35 | #export PYTHONIOENCODING 36 | #... 37 | -------------------------------------------------------------------------------- /polars/upg-polars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | echo 'upgrading polars...' 5 | 6 | source ./polars/py-polars/bin/activate 7 | 8 | python3 -m pip install --upgrade polars > /dev/null 9 | 10 | deactivate -------------------------------------------------------------------------------- /polars/ver-polars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | source ./polars/py-polars/bin/activate 5 | python3 -c 'import polars as pl; open("polars/VERSION","w").write(pl.__version__); open("polars/REVISION","w").write("");' > /dev/null 6 | -------------------------------------------------------------------------------- /pydatatable/VERSION: -------------------------------------------------------------------------------- 1 | 1.2.0a0 -------------------------------------------------------------------------------- /pydatatable/convert-pydatatable-data.py: -------------------------------------------------------------------------------- 1 | print("pydatatable: converting 50GB join data") 2 | import os 3 | import datatable as dt 4 | 5 | if os.path.isfile('data/J1_1e9_NA_0_0.csv'): 6 | dt.fread('data/J1_1e9_NA_0_0.csv').to_jay('data/J1_1e9_NA_0_0.jay') 7 | if os.path.isfile('data/J1_1e9_1e9_0_0.csv'): 8 | dt.fread('data/J1_1e9_1e9_0_0.csv').to_jay('data/J1_1e9_1e9_0_0.jay') 9 | if os.path.isfile('data/J1_1e9_1e6_0_0.csv'): 10 | dt.fread('data/J1_1e9_1e6_0_0.csv').to_jay('data/J1_1e9_1e6_0_0.jay') 11 | if os.path.isfile('data/J1_1e9_1e3_0_0.csv'): 12 | dt.fread('data/J1_1e9_1e3_0_0.csv').to_jay('data/J1_1e9_1e3_0_0.jay') 13 | if os.path.isfile('data/J1_1e9_NA_0_1.csv'): 14 | dt.fread('data/J1_1e9_NA_0_1.csv').to_jay('data/J1_1e9_NA_0_1.jay') 15 | if os.path.isfile('data/J1_1e9_1e9_0_1.csv'): 16 | dt.fread('data/J1_1e9_1e9_0_1.csv').to_jay('data/J1_1e9_1e9_0_1.jay') 17 | if os.path.isfile('data/J1_1e9_1e6_0_1.csv'): 18 | dt.fread('data/J1_1e9_1e6_0_1.csv').to_jay('data/J1_1e9_1e6_0_1.jay') 19 | if os.path.isfile('data/J1_1e9_1e3_0_1.csv'): 20 | dt.fread('data/J1_1e9_1e3_0_1.csv').to_jay('data/J1_1e9_1e3_0_1.jay') 21 | if os.path.isfile('data/J1_1e9_NA_5_0.csv'): 22 | dt.fread('data/J1_1e9_NA_5_0.csv').to_jay('data/J1_1e9_NA_5_0.jay') 23 | if os.path.isfile('data/J1_1e9_1e9_5_0.csv'): 24 | 
dt.fread('data/J1_1e9_1e9_5_0.csv').to_jay('data/J1_1e9_1e9_5_0.jay') 25 | if os.path.isfile('data/J1_1e9_1e6_5_0.csv'): 26 | dt.fread('data/J1_1e9_1e6_5_0.csv').to_jay('data/J1_1e9_1e6_5_0.jay') 27 | if os.path.isfile('data/J1_1e9_1e3_5_0.csv'): 28 | dt.fread('data/J1_1e9_1e3_5_0.csv').to_jay('data/J1_1e9_1e3_5_0.jay') 29 | 30 | print("pydatatable: done converting 50GB join data") -------------------------------------------------------------------------------- /pydatatable/read-pydatatable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# read-pydatatable.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import subprocess 9 | import datatable as dt 10 | from datatable import f, sum 11 | 12 | exec(open("./helpers.py").read()) 13 | 14 | src_grp = os.environ['SRC_GRP_LOCAL'] 15 | 16 | ver = dt.__version__ 17 | git = dt.__git_revision__ 18 | task = "read" 19 | data_name = os.path.basename(src_grp) 20 | solution = "pydatatable" 21 | fun = "fread" 22 | cache = "TRUE" 23 | 24 | wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0] 25 | in_rows = int(wc_lines)-1 26 | 27 | print("reading...") 28 | 29 | question = "all rows" #1 30 | gc.collect() 31 | t_start = timeit.default_timer() 32 | ans = dt.fread(data_name, show_progress=False) 33 | print(ans.shape) 34 | t = timeit.default_timer() - t_start 35 | m = memory_usage() 36 | t_start = timeit.default_timer() 37 | chk = ans[:, sum(f.v3)] 38 | chkt = timeit.default_timer() - t_start 39 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 40 | del ans 41 | gc.collect() 42 | t_start = timeit.default_timer() 43 | ans = dt.fread(data_name, show_progress=False) 44 | print(ans.shape) 45 | t = timeit.default_timer() - t_start 46 | m = memory_usage() 47 | t_start = timeit.default_timer() 48 | chk = ans[:, sum(f.v3)] 49 | chkt = timeit.default_timer() - t_start 50 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 51 | del ans 52 | gc.collect() 53 | t_start = timeit.default_timer() 54 | ans = dt.fread(data_name, show_progress=False) 55 | print(ans.shape) 56 | t = timeit.default_timer() - t_start 57 | m = memory_usage() 58 | t_start = timeit.default_timer() 59 | chk = ans[:, sum(f.v3)] 60 | chkt = timeit.default_timer() - t_start 61 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 62 | del ans 63 | 64 | question = "top 100 rows" #2 65 | gc.collect() 66 | t_start = timeit.default_timer() 67 | ans = dt.fread(data_name, max_nrows=100, show_progress=False) 68 | print(ans.shape) 69 | t = timeit.default_timer() - t_start 70 | m = memory_usage() 71 | t_start = timeit.default_timer() 72 | chk = ans[:, sum(f.v3)] 73 | chkt = timeit.default_timer() - t_start 74 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], 
solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 75 | del ans 76 | gc.collect() 77 | t_start = timeit.default_timer() 78 | ans = dt.fread(data_name, max_nrows=100, show_progress=False) 79 | print(ans.shape) 80 | t = timeit.default_timer() - t_start 81 | m = memory_usage() 82 | t_start = timeit.default_timer() 83 | chk = ans[:, sum(f.v3)] 84 | chkt = timeit.default_timer() - t_start 85 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 86 | del ans 87 | gc.collect() 88 | t_start = timeit.default_timer() 89 | ans = dt.fread(data_name, max_nrows=100, show_progress=False) 90 | print(ans.shape) 91 | t = timeit.default_timer() - t_start 92 | m = memory_usage() 93 | t_start = timeit.default_timer() 94 | chk = ans[:, sum(f.v3)] 95 | chkt = timeit.default_timer() - t_start 96 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) 97 | del ans 98 | 99 | exit(0) 100 | -------------------------------------------------------------------------------- /pydatatable/setup-pydatatable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # install dependencies 5 | virtualenv pydatatable/py-pydatatable --python=python3 6 | source pydatatable/py-pydatatable/bin/activate 7 | 8 | python -m pip install --upgrade psutil 9 | 10 | # # build 11 | deactivate 12 | ./pydatatable/upg-pydatatable.sh 13 | 14 | # # check 15 | # source pydatatable/py-pydatatable/bin/activate 16 | # python 17 | # import datatable as dt 18 | # dt.__version__ 19 | # quit() 20 | # deactivate 21 | 22 | # resave 1e9 join data from csv to jay format so pydt can try out-of-memory processing 23 | source pydatatable/py-pydatatable/bin/activate 24 | python3 pydatatable/convert-pydatatable-data.py 25 | 26 | ./pydatatable/ver-pydatatable.sh -------------------------------------------------------------------------------- /pydatatable/sort-pydatatable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | print("# sort-pydatatable.py") 4 | 5 | import os 6 | import gc 7 | import timeit 8 | import datatable as dt 9 | from datatable import f, sum 10 | 11 | exec(open("./helpers.py").read()) 12 | 13 | src_x = os.environ['SRC_X_LOCAL'] 14 | 15 | ver = dt.__version__ 16 | git = dt.__git_revision__ 17 | task = "sort" 18 | question = "by int KEY" 19 | data_name = os.path.basename(src_x) 20 | solution = "pydatatable" 21 | fun = ".sort" 22 | cache = "TRUE" 23 | 24 | print("loading dataset...") 25 | 26 | x = dt.fread(data_name) 27 | 28 | print("sorting...") 29 | 30 | gc.collect() 31 | t_start = timeit.default_timer() 32 | ans = x.sort('KEY') 33 | print(ans.shape) 34 | t = timeit.default_timer() - t_start 35 | m = memory_usage() 36 | t_start = timeit.default_timer() 37 | chk = ans[:, sum(f.X2)] 38 | chkt = timeit.default_timer() - t_start 39 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, 
run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

exit(0)
--------------------------------------------------------------------------------
/pydatatable/upg-pydatatable.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

echo 'upgrading pydatatable...'

source ./pydatatable/py-pydatatable/bin/activate
python3 -m pip install --upgrade git+https://github.com/h2oai/datatable > /dev/null 2>&1
deactivate

echo 'done upgrading'
--------------------------------------------------------------------------------
/pydatatable/ver-pydatatable.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

source ./pydatatable/py-pydatatable/bin/activate
python3 -c 'import datatable as dt; open("pydatatable/VERSION","w").write(dt.__version__.split("+", 1)[0]); open("pydatatable/REVISION","w").write(dt.build_info.git_revision);' > /dev/null
--------------------------------------------------------------------------------
/run.conf:
--------------------------------------------------------------------------------
# tasks to run, used in init-setup-iteration.R
export RUN_TASKS="groupby join"
# solutions to run, used in init-setup-iteration.R
export RUN_SOLUTIONS="R-arrow collapse datafusion duckdb polars spark "

# flag to upgrade tools, used in run.sh on init
export DO_UPGRADE=false
# force run, ignore if the same version was run already
export FORCE_RUN=true
# do not run benchmarks, only print what would run and what would be skipped
export MOCKUP=false

# flag to build reports, used in run.sh before publish
export DO_REPORT=true
# flag to publish, used in run.sh before exit
export DO_PUBLISH=false

# logging and timing files
export CSV_LOGS_FILE="logs.csv"
export CSV_TIME_FILE="time.csv"
--------------------------------------------------------------------------------
/spark/VERSION:
--------------------------------------------------------------------------------
4.0.0
--------------------------------------------------------------------------------
/spark/setup-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

# install java (the JAVA_HOME exported below points at OpenJDK 17)
# sudo apt-get install openjdk-17-jdk

virtualenv spark/py-spark --python=python3

# add JAVA_HOME to path.env
echo 'export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64' >> path.env

source path.env

source spark/py-spark/bin/activate
# install binaries
python3 -m pip install --upgrade psutil
python3 -m pip install --upgrade pyspark

# check
# python3
# import pyspark
# pyspark.__version__
# quit()

deactivate

./spark/ver-spark.sh
--------------------------------------------------------------------------------
/spark/upg-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

echo 'upgrading spark...'

source ./spark/py-spark/bin/activate

python3 -m pip install --upgrade pyspark > /dev/null

deactivate
--------------------------------------------------------------------------------
/spark/ver-spark.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

source ./spark/py-spark/bin/activate
python3 -c 'import pyspark; open("spark/VERSION","w").write(pyspark.__version__); open("spark/REVISION","w").write("");' > /dev/null
--------------------------------------------------------------------------------
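For orientation, below is a minimal sketch (not part of the repository) of how the run.conf settings shown above could be previewed from a shell before a benchmark run; the variable names and the run.conf file name come from run.conf itself, while the preview script is purely illustrative and hypothetical.

#!/bin/bash
# hypothetical helper: print what a benchmark invocation would pick up from run.conf
set -e
source ./run.conf
echo "tasks:         ${RUN_TASKS}"
echo "solutions:     ${RUN_SOLUTIONS}"
echo "upgrade tools: ${DO_UPGRADE}"
echo "force re-run:  ${FORCE_RUN}"
echo "mockup only:   ${MOCKUP}"
echo "build report:  ${DO_REPORT}"
echo "publish:       ${DO_PUBLISH}"
echo "timings file:  ${CSV_TIME_FILE}"
echo "logs file:     ${CSV_LOGS_FILE}"
if [ "${MOCKUP}" = "true" ]; then
  echo "MOCKUP is enabled: nothing would actually be benchmarked"
fi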