├── .github
│   └── workflows
│       ├── RunBenchmark.yml
│       ├── regression.yml
│       └── static.yml
├── .gitignore
├── LICENSE
├── R-arrow
│   ├── VERSION
│   ├── groupby-R-arrow.R
│   ├── join-R-arrow.R
│   ├── setup-R-arrow.sh
│   ├── upg-R-arrow.sh
│   └── ver-R-arrow.sh
├── README.md
├── _benchplot
│   ├── benchplot-dict.R
│   └── benchplot.R
├── _control
│   ├── data.csv
│   ├── data_large.csv
│   ├── data_medium.csv
│   ├── data_small.csv
│   ├── nodenames.csv
│   ├── questions.csv
│   ├── skipped_benchmarks.csv
│   ├── solutions.csv
│   └── timeout.csv
├── _data
│   ├── groupby-datagen.R
│   ├── groupby2014-datagen.R
│   └── join-datagen.R
├── _docs
│   └── maintenance.md
├── _helpers
│   ├── helpers.R
│   ├── helpers.jl
│   ├── helpers.py
│   ├── helpers.sh
│   └── helpersds.jl
├── _launcher
│   ├── launch.R
│   ├── launcher.R
│   ├── setup.sh
│   └── solution.R
├── _report
│   ├── blah.R
│   ├── ga.html
│   ├── history.Rmd
│   ├── index.Rmd
│   ├── publish.sh
│   ├── report.R
│   └── tech.Rmd
├── _run
│   ├── download_small_medium.sh
│   ├── partitioned_run.sh
│   ├── run_large.sh
│   ├── run_medium.sh
│   └── run_small.sh
├── _setup_utils
│   ├── .DS_Store
│   ├── install_all_solutions.py
│   ├── mount.sh
│   ├── mount_and_install_solutions.sh
│   ├── prep_solutions.py
│   ├── repro.sh
│   ├── setup_small.sh
│   └── sleep_and_run.sh
├── _utils
│   ├── answers-validation.R
│   ├── compare-data.table.R
│   ├── download_data.sh
│   ├── generate-data-small.sh
│   ├── groupby_k_factor.csv
│   ├── maintainer.R
│   ├── maintainer.sh
│   ├── parse_time_logs.R
│   ├── partitioned_run.sh
│   ├── sql_to_check_timings
│   │   └── timing_checks.sql
│   ├── time.R
│   └── validate_no_errors.sh
├── arrow
│   └── VERSION
├── clickhouse
│   ├── VERSION
│   ├── ch.sh
│   ├── clickhouse-misc.sh
│   ├── clickhouse-mount-config.xml
│   ├── clickhouse-parse-log.R
│   ├── exec.sh
│   ├── groupby-clickhouse.sh
│   ├── join-clickhouse.sh
│   ├── setup-clickhouse.sh
│   ├── upg-clickhouse.sh
│   └── ver-clickhouse.sh
├── collapse
│   ├── VERSION
│   ├── groupby-collapse.R
│   ├── groupby2014-collapse.R
│   ├── join-collapse.R
│   ├── setup-collapse.sh
│   ├── upg-collapse.sh
│   └── ver-collapse.sh
├── dask
│   ├── VERSION
│   ├── common.py
│   ├── groupby_dask.py
│   ├── join_dask.py
│   ├── setup-dask.sh
│   ├── upg-dask.sh
│   └── ver-dask.sh
├── datafusion
│   ├── VERSION
│   ├── groupby-datafusion.py
│   ├── join-datafusion.py
│   ├── setup-datafusion.sh
│   ├── upg-datafusion.sh
│   └── ver-datafusion.sh
├── datatable
│   ├── VERSION
│   ├── groupby-datatable.R
│   ├── groupby2014-datatable.R
│   ├── join-datatable.R
│   ├── read-datatable.R
│   ├── setup-datatable.sh
│   ├── sort-datatable.R
│   ├── upg-datatable.sh
│   └── ver-datatable.sh
├── dplyr
│   ├── VERSION
│   ├── groupby-dplyr.R
│   ├── groupby2014-dplyr.R
│   ├── join-dplyr.R
│   ├── read-dplyr.R
│   ├── setup-dplyr.sh
│   ├── sort-dplyr.R
│   ├── upg-dplyr.sh
│   └── ver-dplyr.sh
├── duckdb-latest
│   ├── VERSION
│   ├── groupby-duckdb-latest.R
│   ├── join-duckdb-latest.R
│   ├── setup-duckdb-latest.sh
│   ├── upg-duckdb-latest.sh
│   └── ver-duckdb-latest.sh
├── duckdb
│   ├── VERSION
│   ├── groupby-duckdb.R
│   ├── join-duckdb.R
│   ├── setup-duckdb.sh
│   ├── upg-duckdb.sh
│   └── ver-duckdb.sh
├── h2o
│   ├── exec.sh
│   ├── groupby-h2o.R
│   ├── h2o.sh
│   ├── join-h2o.R
│   ├── setup-h2o.sh
│   ├── upg-h2o.sh
│   └── ver-h2o.sh
├── juliadf
│   ├── VERSION
│   ├── exec.sh
│   ├── groupby-juliadf.jl
│   ├── join-juliadf.jl
│   ├── setup-juliadf.sh
│   ├── upg-juliadf.sh
│   └── ver-juliadf.sh
├── juliads
│   ├── VERSION
│   ├── exec.sh
│   ├── groupby-juliads.jl
│   ├── join-juliads.jl
│   ├── setup-juliads.sh
│   ├── upg-juliads.sh
│   └── ver-juliads.sh
├── logs.csv
├── modin
│   ├── groupby-modin.py
│   ├── join-modin.py
│   ├── setup-modin.sh
│   ├── sort-modin.py
│   ├── upg-modin.sh
│   └── ver-modin.sh
├── pandas
│   ├── VERSION
│   ├── groupby-pandas.py
│   ├── groupby2014-pandas.py
│   ├── join-pandas.py
│   ├── read-pandas.py
│   ├── setup-pandas.sh
│   ├── sort-pandas.py
│   ├── upg-pandas.sh
│   └── ver-pandas.sh
├── path.env
├── polars
│   ├── VERSION
│   ├── groupby-polars.py
│   ├── join-polars.py
│   ├── monitor_ram.py
│   ├── setup-polars.sh
│   ├── upg-polars.sh
│   └── ver-polars.sh
├── pydatatable
│   ├── VERSION
│   ├── convert-pydatatable-data.py
│   ├── groupby-pydatatable.py
│   ├── join-pydatatable.py
│   ├── read-pydatatable.py
│   ├── setup-pydatatable.sh
│   ├── sort-pydatatable.py
│   ├── upg-pydatatable.sh
│   └── ver-pydatatable.sh
├── run.conf
├── run.sh
├── spark
│   ├── VERSION
│   ├── groupby-spark.py
│   ├── join-spark.py
│   ├── setup-spark.sh
│   ├── upg-spark.sh
│   └── ver-spark.sh
└── time.csv
/.github/workflows/RunBenchmark.yml:
--------------------------------------------------------------------------------
1 | name: Run benchmark
2 | on:
3 | workflow_dispatch:
4 | inputs:
5 | solutions:
6 | type: string
7 | instance_id:
8 | type: string
9 | include_clickhouse:
10 | type: boolean
11 | sizes:
12 | type: string
13 |
14 | concurrency:
15 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }}
16 | cancel-in-progress: true
17 |
18 | env:
19 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
20 | gh_issue_repo: duckdblabs/db-benchmark
21 | instance_id: ${{ inputs.instance_id }}
22 | solutions: ${{ inputs.solutions }}
23 | include_clickhouse: ${{ inputs.include_clickhouse }}
24 |
25 |
26 | jobs:
27 | start-aws-machine:
28 | name: Start aws-small-machine
29 | runs-on: ubuntu-latest
30 | environment: aws-secrets
31 | steps:
32 | - name: Start EC2 runner
33 | shell: bash
34 | env:
35 | AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}}
36 | AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}}
37 | AWS_DEFAULT_REGION: us-east-1
38 | run: aws ec2 start-instances --instance-id ${{ env.instance_id }}
39 |
40 | - name: Create issue if failure
41 | shell: bash
42 | if: ${{ failure() && contains(github.ref_name, 'main') }}
43 | run: |
44 | gh issue create --repo ${{ env.gh_issue_repo }} --title "Could not start DB-benchmark machine" --body "AWS box with instance-id ${{ env.instance_id }} could not be started"
45 |
46 | run-benchmark:
47 | name: Regression Tests all solutions
48 | env:
49 | CC: gcc-10
50 | CXX: g++-10
51 | GEN: ninja
52 | runs-on: self-hosted
53 | environment: aws-secrets
54 | permissions: # Job-level permissions configuration starts here
55 | contents: write # 'write' access to repository contents
56 | pull-requests: write # 'write' access to pull requests
57 | steps:
58 | - uses: actions/checkout@v4
59 |
60 | - name: run mount
61 | shell: bash
62 | run: |
63 | ./_setup_utils/mount.sh
64 |
65 | - name: Install or Upgrade all solutions
66 | shell: bash
67 | working-directory: /var/lib/mount/db-benchmark-metal
68 | run: |
69 | python3 _setup_utils/install_all_solutions.py ${{ env.solutions }}
70 | if [ "${{ env.include_clickhouse }}" == "true" ]; then
71 | # installing/updating clickhouse needs sudo privileges
72 | sudo python3 _setup_utils/install_all_solutions.py clickhouse
73 | fi
74 |
75 | - name: Modify run.conf to only have new versions
76 | shell: bash
77 | working-directory: /var/lib/mount/db-benchmark-metal
78 | run: |
79 | git diff --name-only **/VERSION > updated_solutions.txt
80 | cat updated_solutions.txt
81 | export new_solutions="${{ env.solutions }}"
82 | echo "testing solutions: " $new_solutions
83 | sed -i "s/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"${new_solutions}\"/g" run.conf
84 |
85 | - name: Run the benchmark
86 | shell: bash
87 | working-directory: /var/lib/mount/db-benchmark-metal
88 | env:
89 | DO_REPORT: 1
90 | DO_PUBLISH: 0
91 | run: |
92 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
93 | if [ $ncores -eq 16 ]; then export MACHINE_TYPE="c6id.4xlarge"; fi
94 | if [ $ncores -eq 32 ]; then export MACHINE_TYPE="c6id.8xlarge"; fi
95 | if [ $ncores -eq 128 ]; then export MACHINE_TYPE="c6id.metal"; fi
96 | if [[ ${{ inputs.sizes }} == *"small"* ]]; then
97 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_small.sh
98 | fi
99 | if [[ ${{ inputs.sizes }} == *"medium"* ]]; then
100 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_medium.sh
101 | fi
102 | if [[ ${{ inputs.sizes }} == *"large"* ]]; then
103 | MACHINE_TYPE=$MACHINE_TYPE ./_run/run_large.sh
104 | fi
105 |
106 | - name: name new branch
107 | shell: bash
108 | run: |
109 | echo "new_branch_name=results-`date +%Y-%m-%d-%Hh%Mm`" >> $GITHUB_ENV
110 | echo ${{ env.new_branch_name }}
111 |
112 | - name: Commit updates
113 | shell: bash
114 | working-directory: /var/lib/mount/db-benchmark-metal
115 | run: |
116 | git config --global user.email ""
117 | git config --global user.name "Run Benchmark action"
118 | git remote add upstream git@github.com:duckdblabs/db-benchmark
119 | git fetch upstream
120 | git switch -c ${{ env.new_branch_name }}
121 | git add time.csv logs.csv **/VERSION
122 | git add run.conf
123 | git commit -m "new results"
124 | git push upstream ${{ env.new_branch_name }}
125 |
126 | - name: Create Archive
127 | if: always()
128 | shell: bash
129 | working-directory: /var/lib/mount/db-benchmark-metal
130 | run: |
131 | mkdir -p out
132 | echo "guarantee not empty dir" > out/guarantee.txt
133 | zip -r out-dir.zip out/ public/
134 |
135 | - uses: actions/upload-artifact@v4
136 | if: always()
137 | with:
138 | name: out-dir.zip
139 | path: /var/lib/mount/db-benchmark-metal/out-dir.zip
140 | if-no-files-found: error
141 |
142 | shutdown:
143 | name: shut down
144 | environment: aws-secrets
145 | if: always()
146 | runs-on: ubuntu-latest
147 | needs:
148 | - start-aws-machine
149 | - run-benchmark
150 |
151 | steps:
152 | - name: shutdown
153 | shell: bash
154 | env:
155 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
156 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
157 | AWS_DEFAULT_REGION: us-east-1
158 | run: aws ec2 stop-instances --instance-id ${{ env.instance_id }}
159 |
160 |
--------------------------------------------------------------------------------
/.github/workflows/regression.yml:
--------------------------------------------------------------------------------
1 | name: Regression
2 | on:
3 | workflow_dispatch:
4 | repository_dispatch:
5 | pull_request:
6 | paths-ignore:
7 | - '**.md'
8 | - '.github/workflows/**'
9 | - '!.github/workflows/regression.yml'
10 |
11 | concurrency:
12 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }}
13 | cancel-in-progress: true
14 |
15 | jobs:
16 | regression-test-benchmark-runner-solo-solutions:
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | solution: [data.table, collapse, dplyr, pandas, pydatatable, spark, juliadf, juliads, polars, R-arrow, duckdb, datafusion, dask, clickhouse]
21 | name: Solo solutions
22 | runs-on: ubuntu-latest
23 | env:
24 | CC: gcc-10
25 | CXX: g++-10
26 | GEN: ninja
27 |
28 | steps:
29 | - uses: actions/checkout@v3
30 | with:
31 | fetch-depth: 0
32 |
33 | - uses: actions/setup-python@v4
34 | with:
35 | python-version: '3.10'
36 |
37 | - name: Install libraries
38 | shell: bash
39 | run: ./_setup_utils/setup_small.sh
40 |
41 | - name: Generate 500mb datasets
42 | shell: bash
43 | run: ./_utils/generate-data-small.sh
44 |
45 | - name: Remove old logs
46 | shell: bash
47 | run: rm time.csv logs.csv
48 |
49 | - name: Install all solutions
50 | shell: bash
51 | run: source path.env && python3 _setup_utils/install_all_solutions.py ${{ matrix.solution }}
52 |
53 | - name: Turn swap off
54 | shell: bash
55 | run: sudo swapoff -a
56 |
57 | # needed because clickhouse for some reason produces an error the first
58 | # time a benchmark is run. The next benchmark run will work and overwrite the
59 | # old benchmark files.
60 | - name: Run mini GroupBy benchmark if clickhouse
61 | shell: bash
62 | if: ${{ matrix.solution == 'clickhouse' || matrix.solution == 'all' }}
63 | run: |
64 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=clickhouse
65 | source path.env
66 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
67 | sleep 60
68 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
69 | sleep 60
70 |
71 | - name: Run mini GroupBy benchmark
72 | shell: bash
73 | run: |
74 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=${{ matrix.solution }}
75 | source path.env
76 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
77 | sleep 60
78 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
79 | sleep 60
80 |
81 | - name: Run mini Join benchmark
82 | shell: bash
83 | run: |
84 | python3 _setup_utils/prep_solutions.py --task=join --solution=${{ matrix.solution }}
85 | source path.env
86 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
87 | sleep 60
88 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
89 | sleep 60
90 |
91 | - name: Validate benchmark results and report generation
92 | shell: bash
93 | run: ./_utils/validate_no_errors.sh
94 |
95 | - name: Create Archive
96 | if: always()
97 | shell: bash
98 | run: |
99 | cp *.csv out/
100 | zip -r ${{ matrix.solution }}-out.zip out/
101 |
102 | # include this step to see what the latest versions are of every solution
103 | - name: Print latest versions
104 | if: always()
105 | shell: bash
106 | run: tail -n +1 */VERSION
107 |
108 | - uses: actions/upload-artifact@v4
109 | if: always()
110 | with:
111 | name: ${{ matrix.solution }}-out.zip
112 | path: ${{ matrix.solution }}-out.zip
113 | if-no-files-found: error
114 |
115 | regression-test-benchmark-runner-all-solutions:
116 | needs: regression-test-benchmark-runner-solo-solutions
117 | name: Regression Tests all solutions
118 | runs-on: ubuntu-20.04
119 | env:
120 | CC: gcc-10
121 | CXX: g++-10
122 | GEN: ninja
123 |
124 | steps:
125 | - uses: actions/checkout@v3
126 | with:
127 | fetch-depth: 0
128 |
129 | - uses: actions/setup-python@v4
130 | with:
131 | python-version: '3.10'
132 |
133 | - name: Install libraries
134 | shell: bash
135 | run: ./_setup_utils/setup_small.sh
136 |
137 | - name: Generate 500mb datasets
138 | shell: bash
139 | run: ./_utils/generate-data-small.sh
140 |
141 | - name: Remove old logs
142 | shell: bash
143 | run: rm time.csv logs.csv
144 |
145 | - name: Install all solutions
146 | shell: bash
147 | run: source path.env && python3 _setup_utils/install_all_solutions.py all
148 |
149 | - name: Turn swap off
150 | shell: bash
151 | run: sudo swapoff -a
152 |
153 | - name: Run mini GroupBy benchmark
154 | shell: bash
155 | run: |
156 | python3 _setup_utils/prep_solutions.py --task=groupby --solution=all
157 | source path.env
158 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
159 | sleep 60
160 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
161 |
162 | - name: Run mini Join benchmark
163 | shell: bash
164 | run: |
165 | python3 _setup_utils/prep_solutions.py --task=join --solution=all
166 | source path.env
167 | MACHINE_TYPE="c6id.4xlarge" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
168 | sleep 60
169 | MACHINE_TYPE="c6id.metal" TEST_RUN=true TEST_MOUNT_DIR=$GITHUB_WORKSPACE ./run.sh
170 |
171 | - name: Validate benchmark results and report generation
172 | shell: bash
173 | run: ./_utils/validate_no_errors.sh
174 |
175 | - name: Create Archive
176 | if: always()
177 | shell: bash
178 | run: |
179 | cp *.csv out/
180 | zip -r all-out.zip out/
181 |
182 | # include this step to see what the latest versions are of every solution
183 | - name: Print latest versions
184 | if: always()
185 | shell: bash
186 | run: tail -n +1 */VERSION
187 |
188 | - uses: actions/upload-artifact@v4
189 | if: always()
190 | with:
191 | name: all-out.zip
192 | path: all-out.zip
193 | if-no-files-found: error
194 |
195 |
--------------------------------------------------------------------------------
/.github/workflows/static.yml:
--------------------------------------------------------------------------------
1 | # Simple workflow for deploying static content to GitHub Pages
2 | name: Deploy static content to Pages
3 |
4 | on:
5 | # Runs on pushes targeting the default branch
6 | push:
7 | branches: ["gh-pages"]
8 |
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 | contents: read
15 | pages: write
16 | id-token: write
17 |
18 | # Allow one concurrent deployment
19 | concurrency:
20 | group: "pages"
21 | cancel-in-progress: true
22 |
23 | jobs:
24 | # Single deploy job since we're just deploying
25 | deploy:
26 | environment:
27 | name: github-pages
28 | url: ${{ steps.deployment.outputs.page_url }}
29 | runs-on: ubuntu-latest
30 | steps:
31 | - name: Checkout
32 | uses: actions/checkout@v3
33 | - name: Setup Pages
34 | uses: actions/configure-pages@v2
35 | - name: Upload artifact
36 | uses: actions/upload-pages-artifact@v1
37 | with:
38 | # Upload entire repository
39 | path: '.'
40 | - name: Deploy to GitHub Pages
41 | id: deployment
42 | uses: actions/deploy-pages@v1
43 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | figure/*
2 | metastore_db/*
3 | *.log
4 | *.html
5 | *.csv
6 | !time.csv
7 | !logs.csv
8 | !_control/data_small.csv
9 | !_control/data_large.csv
10 | *.md5
11 | .Rproj.user
12 | .Rhistory
13 | db-benchmark.Rproj
14 | */REVISION
15 | token
16 | .token
17 | public/
18 | out/
19 | data/
20 | clickhouse/log/
21 | clickhouse/*-clickhouse.sql
22 | clickhouse/unused/
23 | */log/
24 | tmp/
25 | dask-worker-space/
26 | GA/
27 | utils/
28 | */py-*/
29 | */r-*/
30 | duckdb-latest/duckdb
31 | report-done
32 | db-benchmark.gh-pages/
33 | run.out
34 | clickhouse/etc_sudoers.bak
35 | workdir/
36 | timeout-exit-codes.out
37 | */target
38 | *.lock
39 |
--------------------------------------------------------------------------------
/R-arrow/VERSION:
--------------------------------------------------------------------------------
1 | 20.0.0.2
2 |
--------------------------------------------------------------------------------
/R-arrow/setup-R-arrow.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install stable arrow
5 | mkdir -p ./R-arrow/r-arrow
6 | Rscript -e 'install.packages(c("arrow","dplyr"), lib="./R-arrow/r-arrow")'
7 |
8 | ./R-arrow/ver-R-arrow.sh
9 |
--------------------------------------------------------------------------------
/R-arrow/upg-R-arrow.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade all packages in arrow library only if new arrow is out
5 | echo 'upgrading arrow...'
6 | Rscript -e 'ap=available.packages(); if (ap["arrow","Version"]!=packageVersion("arrow", lib.loc="./R-arrow/r-arrow")) update.packages(lib.loc="./R-arrow/r-arrow", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
7 |
--------------------------------------------------------------------------------
/R-arrow/ver-R-arrow.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | Rscript -e 'v=read.dcf(system.file(package="arrow", lib.loc="./R-arrow/r-arrow", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("R-arrow", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
5 |
--------------------------------------------------------------------------------
/_control/data.csv:
--------------------------------------------------------------------------------
1 | task,data,nrow,k,na,sort,active
2 | groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1
3 | groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1
4 | groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1
5 | groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1
6 | groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1
7 | groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1
8 | groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1
9 | groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1
10 | groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1
11 | groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1
12 | groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1
13 | groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1
14 | groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1
15 | groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1
16 | groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1
17 | join,J1_1e7_NA_0_0,1e7,NA,0,0,1
18 | join,J1_1e7_NA_5_0,1e7,NA,5,0,1
19 | join,J1_1e7_NA_0_1,1e7,NA,0,1,1
20 | join,J1_1e8_NA_0_0,1e8,NA,0,0,1
21 | join,J1_1e8_NA_5_0,1e8,NA,5,0,1
22 | join,J1_1e8_NA_0_1,1e8,NA,0,1,1
23 | join,J1_1e9_NA_0_0,1e9,NA,0,0,1
--------------------------------------------------------------------------------
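
[editor note -- not part of the repository] The data-case names above encode the
generator parameters that are repeated in the remaining columns of each row:
number of rows, group cardinality factor k, NA percentage and sort flag. A minimal
R sketch with a hypothetical helper, decode_data_name, to illustrate the encoding:

decode_data_name = function(x) {
  p = strsplit(x, "_", fixed=TRUE)[[1L]]
  list(nrow=p[2L], k=p[3L], na=as.integer(p[4L]), sort=as.integer(p[5L]))
}
decode_data_name("G1_1e7_1e2_5_0")  # groupby case: 1e7 rows, k=1e2, 5% NAs, unsorted
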
/_control/data_large.csv:
--------------------------------------------------------------------------------
1 | task,data,nrow,k,na,sort,active
2 | groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1
3 | groupby,G1_1e9_1e1_0_0,1e9,1e1,0,0,1
4 | groupby,G1_1e9_2e0_0_0,1e9,2e0,0,0,1
5 | groupby,G1_1e9_1e2_0_1,1e9,1e2,0,1,1
6 | groupby,G1_1e9_1e2_5_0,1e9,1e2,5,0,1
7 | join,J1_1e9_NA_0_0,1e9,NA,0,0,1
--------------------------------------------------------------------------------
/_control/data_medium.csv:
--------------------------------------------------------------------------------
1 | task,data,nrow,k,na,sort,active
2 | groupby,G1_1e8_1e2_0_0,1e8,1e2,0,0,1
3 | groupby,G1_1e8_1e1_0_0,1e8,1e1,0,0,1
4 | groupby,G1_1e8_2e0_0_0,1e8,2e0,0,0,1
5 | groupby,G1_1e8_1e2_0_1,1e8,1e2,0,1,1
6 | groupby,G1_1e8_1e2_5_0,1e8,1e2,5,0,1
7 | join,J1_1e8_NA_0_0,1e8,NA,0,0,1
8 | join,J1_1e8_NA_5_0,1e8,NA,5,0,1
9 | join,J1_1e8_NA_0_1,1e8,NA,0,1,1
--------------------------------------------------------------------------------
/_control/data_small.csv:
--------------------------------------------------------------------------------
1 | task,data,nrow,k,na,sort,active
2 | groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1
3 | groupby,G1_1e7_1e1_0_0,1e7,1e1,0,0,1
4 | groupby,G1_1e7_2e0_0_0,1e7,2e0,0,0,1
5 | groupby,G1_1e7_1e2_0_1,1e7,1e2,0,1,1
6 | groupby,G1_1e7_1e2_5_0,1e7,1e2,5,0,1
7 | join,J1_1e7_NA_0_0,1e7,NA,0,0,1
8 | join,J1_1e7_NA_5_0,1e7,NA,5,0,1
9 | join,J1_1e7_NA_0_1,1e7,NA,0,1,1
--------------------------------------------------------------------------------
/_control/nodenames.csv:
--------------------------------------------------------------------------------
1 | nodename,cpu_model,cpu_cores,memory_model,memory_gb,gpu_model,gpu_num,gpu_gb
2 | mr-0xc11,Intel(R) Xeon(R) CPU E5-2660 v3 @ 2.60GHz,20,DIMM DDR4 Synchronous 2133 MHz,125.80,,,
3 | mr-dl11,Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz,40,DIMM Synchronous 2133 MHz,125.78,GeForce GTX 1080 Ti,2,21.83
4 | m4.10xlarge,Intel(R) Xeon(R) CPU E5-2676 v3 @ 2.40GHz,40,unknown,157,None,None,None
5 | c6id.metal,Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz,128,NVMe SSD,250,None,None,None
--------------------------------------------------------------------------------
/_control/questions.csv:
--------------------------------------------------------------------------------
1 | task,question,question_group
2 | groupby,sum v1 by id1,basic
3 | groupby,sum v1 by id1:id2,basic
4 | groupby,sum v1 mean v3 by id3,basic
5 | groupby,mean v1:v3 by id4,basic
6 | groupby,sum v1:v3 by id6,basic
7 | groupby,median v3 sd v3 by id4 id5,advanced
8 | groupby,max v1 - min v2 by id3,advanced
9 | groupby,largest two v3 by id6,advanced
10 | groupby,regression v1 v2 by id2 id4,advanced
11 | groupby,sum v3 count by id1:id6,advanced
12 | join,small inner on int,basic
13 | join,medium inner on int,basic
14 | join,medium outer on int,basic
15 | join,medium inner on factor,basic
16 | join,big inner on int,basic
17 | groupby2014,sum v1 by id1,basic
18 | groupby2014,sum v1 by id1:id2,basic
19 | groupby2014,sum v1 mean v3 by id3,basic
20 | groupby2014,mean v1:v3 by id4,basic
21 | groupby2014,sum v1:v3 by id6,basic
22 |
--------------------------------------------------------------------------------
/_control/skipped_benchmarks.csv:
--------------------------------------------------------------------------------
1 | solution,task,data,machine_type
2 | juliads,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
3 | juliads,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
4 | juliads,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
5 | juliads,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
6 | juliads,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
7 | juliadf,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
8 | juliadf,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
9 | juliadf,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
10 | juliadf,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
11 | juliadf,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
12 | R-arrow,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
13 | R-arrow,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
14 | R-arrow,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
15 | R-arrow,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
16 | R-arrow,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
17 | dplyr,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
18 | dplyr,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
19 | dplyr,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
20 | dplyr,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
21 | dplyr,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
22 | pandas,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
23 | pandas,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
24 | pandas,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
25 | pandas,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
26 | pandas,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
27 | pydatatable,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
28 | pydatatable,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
29 | pydatatable,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
30 | pydatatable,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
31 | pydatatable,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
32 | spark,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
33 | spark,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
34 | spark,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
35 | spark,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
36 | spark,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
37 | datafusion,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
38 | datafusion,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
39 | datafusion,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
40 | datafusion,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
41 | datafusion,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
42 | datafusion,join,J1_1e8_NA_0_0,c6id.4xlarge
43 | datafusion,join,J1_1e8_NA_5_0,c6id.4xlarge
44 | datafusion,join,J1_1e8_NA_0_1,c6id.4xlarge
45 | datafusion,join,J1_1e9_NA_0_0,c6id.4xlarge
46 | R-arrow,join,J1_1e9_NA_0_0,c6id.4xlarge
47 | dask,join,J1_1e9_NA_0_0,c6id.4xlarge
48 | datatable,join,J1_1e9_NA_0_0,c6id.4xlarge
49 | juliadf,join,J1_1e9_NA_0_0,c6id.4xlarge
50 | juliads,join,J1_1e9_NA_0_0,c6id.4xlarge
51 | pandas,join,J1_1e9_NA_0_0,c6id.4xlarge
52 | collapse,join,J1_1e9_NA_0_0,c6id.4xlarge
53 | polars,join,J1_1e9_NA_0_0,c6id.4xlarge
54 | pydatatable,join,J1_1e9_NA_0_0,c6id.4xlarge
55 | spark,join,J1_1e9_NA_0_0,c6id.4xlarge
56 | clickhouse,join,J1_1e9_NA_0_0,c6id.4xlarge
57 |
58 |
--------------------------------------------------------------------------------
/_control/solutions.csv:
--------------------------------------------------------------------------------
1 | solution,task
2 | collapse,groupby
3 | collapse,groupby2014
4 | collapse,join
5 | data.table,groupby
6 | data.table,join
7 | data.table,groupby2014
8 | dplyr,groupby
9 | dplyr,join
10 | dplyr,groupby2014
11 | pandas,groupby
12 | pandas,join
13 | pandas,groupby2014
14 | pydatatable,groupby
15 | pydatatable,join
16 | spark,groupby
17 | spark,join
18 | dask,groupby
19 | dask,join
20 | juliadf,groupby
21 | juliadf,join
22 | juliads,groupby
23 | juliads,join
24 | clickhouse,groupby
25 | clickhouse,join
26 | polars,groupby
27 | polars,join
28 | R-arrow,groupby
29 | R-arrow,join
30 | duckdb,groupby
31 | duckdb,join
32 | duckdb-latest,groupby
33 | duckdb-latest,join
34 | datafusion,groupby
35 | datafusion,join
36 |
--------------------------------------------------------------------------------
/_control/timeout.csv:
--------------------------------------------------------------------------------
1 | task,in_rows,minutes
2 | groupby,1e7,10
3 | groupby,1e8,30
4 | groupby,1e9,60
5 | join,1e7,10
6 | join,1e8,30
7 | join,1e9,60
8 | groupby2014,1e7,60
9 | groupby2014,1e8,120
10 | groupby2014,1e9,180
11 |
--------------------------------------------------------------------------------
/_data/groupby-datagen.R:
--------------------------------------------------------------------------------
1 | # Rscript groupby-datagen.R 1e7 1e2 0 0 ## 1e7 rows, 1e2 K, 0% NAs, random order
2 | # Rscript groupby-datagen.R 1e8 1e1 5 1 ## 1e8 rows, 10 K, 5% NAs, sorted order
3 | args = commandArgs(TRUE)
4 |
5 | pretty_sci = function(x) {
6 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]]
7 | if(length(tmp)==1L) {
8 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L)
9 | } else if(length(tmp)==2L){
10 | paste0(tmp[1L], as.character(as.integer(tmp[2L])))
11 | }
12 | }
13 |
14 | library(data.table)
15 | N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])
16 | stopifnot(nas<=100L, nas>=0L, sort%in%c(0L,1L))
17 | set.seed(108)
18 | cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort))
19 | DT = list()
20 | DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char)
21 | DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char)
22 | DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # small groups (char)
23 | DT[["id4"]] = sample(K, N, TRUE) # large groups (int)
24 | DT[["id5"]] = sample(K, N, TRUE) # large groups (int)
25 | DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int)
26 | DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5]
27 | DT[["v2"]] = sample(15, N, TRUE) # int in range [1,15]
28 | DT[["v3"]] = round(runif(N,max=100),6) # numeric e.g. 23.574912
29 | setDT(DT)
30 | if (nas>0L) {
31 | cat("Inputting NAs\n")
32 | for (col in paste0("id",1:6)) {
33 | ucol = unique(DT[[col]])
34 | nna = as.integer(length(ucol) * (nas/100))
35 | if (nna)
36 | set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA)
37 | rm(ucol)
38 | }
39 | nna = as.integer(nrow(DT) * (nas/100))
40 | if (nna) {
41 | for (col in paste0("v",1:3))
42 | set(DT, sample(nrow(DT), nna), col, NA)
43 | }
44 | }
45 | if (sort==1L) {
46 | cat("Sorting data\n")
47 | setkeyv(DT, paste0("id", 1:6))
48 | }
49 | file = sprintf("G1_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort)
50 | cat(sprintf("Writing data to %s\n", file))
51 | fwrite(DT, file)
52 | cat(sprintf("Data written to %s, quitting\n", file))
53 | if (!interactive()) quit("no", status=0)
54 |
--------------------------------------------------------------------------------
/_data/groupby2014-datagen.R:
--------------------------------------------------------------------------------
1 | args = commandArgs(TRUE)
2 |
3 | pretty_sci = function(x) {
4 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]]
5 | if(length(tmp)==1L) {
6 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L)
7 | } else if(length(tmp)==2L){
8 | paste0(tmp[1L], as.character(as.integer(tmp[2L])))
9 | }
10 | }
11 |
12 | library(data.table)
13 | N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])
14 | stopifnot(K==1e2L, nas==0L, sort==0L) ## 2014's setup
15 | set.seed(108)
16 | cat(sprintf("Producing data of %s rows, %s K groups factors, %s NAs ratio, %s sort flag\n", pretty_sci(N), pretty_sci(K), nas, sort))
17 | DT = list()
18 | DT[["id1"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char)
19 | DT[["id2"]] = sample(sprintf("id%03d",1:K), N, TRUE) # large groups (char)
20 | DT[["id3"]] = sample(sprintf("id%010d",1:(N/K)), N, TRUE) # small groups (char)
21 | DT[["id4"]] = sample(K, N, TRUE) # large groups (int)
22 | DT[["id5"]] = sample(K, N, TRUE) # large groups (int)
23 | DT[["id6"]] = sample(N/K, N, TRUE) # small groups (int)
24 | DT[["v1"]] = sample(5, N, TRUE) # int in range [1,5]
25 | DT[["v2"]] = sample(5, N, TRUE) # int in range [1,5]
26 | DT[["v3"]] = sample(round(runif(100,max=100),4), N, TRUE)# numeric e.g. 23.5749
27 | setDT(DT)
28 | if (nas>0L) {
29 | cat("Inputting NAs\n")
30 | for (col in paste0("id",1:6)) {
31 | ucol = unique(DT[[col]])
32 | nna = as.integer(length(ucol) * (nas/100))
33 | if (nna)
34 | set(DT, DT[.(sample(ucol, nna)), on=col, which=TRUE], col, NA)
35 | rm(ucol)
36 | }
37 | nna = as.integer(nrow(DT) * (nas/100))
38 | if (nna) {
39 | for (col in paste0("v",1:3))
40 | set(DT, sample(nrow(DT), nna), col, NA)
41 | }
42 | }
43 | if (sort==1L) {
44 | cat("Sorting data\n")
45 | setkeyv(DT, paste0("id", 1:6))
46 | }
47 | file = sprintf("G0_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort)
48 | cat(sprintf("Writing data to %s\n", file))
49 | fwrite(DT, file)
50 | cat(sprintf("Data written to %s, quitting\n", file))
51 | if (!interactive()) quit("no", status=0)
52 |
--------------------------------------------------------------------------------
/_data/join-datagen.R:
--------------------------------------------------------------------------------
1 | # Rscript join-datagen.R 1e7 0 0 0 ## 1e7 rows, 0 ignored, 0% NAs, random order
2 | # Rscript join-datagen.R 1e8 0 5 1 ## 1e8 rows, 0 ignored, 5% NAs, sorted order
3 |
4 | # see h2oai/db-benchmark#106 for design notes on this procedure, feedback welcome in the issue
5 |
6 | # init ----
7 |
8 | init = proc.time()[["elapsed"]]
9 | args = commandArgs(TRUE)
10 | N=as.numeric(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])
11 | stopifnot(N>=1e7, nas<=100L, nas>=0L, sort%in%c(0L,1L))
12 | if (N > .Machine$integer.max) stop("no support for long vector in join-datagen yet")
13 | N = as.integer(N)
14 |
15 | # helper functions ----
16 |
17 | # pretty print big numbers as 1e9, 1e8, etc
18 | pretty_sci = function(x) {
19 | stopifnot(length(x)==1L, !is.na(x))
20 | tmp = strsplit(as.character(x), "+", fixed=TRUE)[[1L]]
21 | if (length(tmp)==1L) {
22 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L)
23 | } else if (length(tmp)==2L) {
24 | paste0(tmp[1L], as.character(as.integer(tmp[2L])))
25 | }
26 | }
27 | # data_name of table to join
28 | join_to_tbls = function(data_name) {
29 | x_n = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])
30 | y_n = setNames(x_n/c(1e6, 1e3, 1e0), c("small","medium","big"))
31 | sapply(sapply(y_n, pretty_sci), gsub, pattern="NA", x=data_name)
32 | }
33 | # sample ensuring none is missing
34 | sample_all = function(x, size) {
35 | stopifnot(length(x) <= size)
36 | y = c(x, sample(x, size=max(size-length(x), 0), replace=TRUE))
37 | sample(y)
38 | }
39 | # split into common (0.9) left (0.1) and right (0.1)
40 | split_xlr = function(n) {
41 | key = sample.int(n*1.1) # 1.1 = 0.9+0.1+0.1
42 | list(
43 | x = key[seq.int(1, n*0.9)],
44 | l = key[seq.int(n*0.9+1, n)],
45 | r = key[seq.int(n+1, n*1.1)]
46 | )
47 | }
48 | # check if data name is LHS and has NAs
49 | lhs_nas = function(data_name) {
50 | tmp = strsplit(data_name, "_", fixed=TRUE)[[1L]]
51 | if (!identical(tmp[3L], "NA"))
52 | return(FALSE) ## RHS
53 | as.integer(tmp[4L])>0L ## NAs
54 | }
55 | # NA aware sprintf for single argument only
56 | sprintfId = function(fmt, id) {
57 | x = rep(NA_character_, length(id))
58 | idx = !is.na(id)
59 | x[idx] = sprintf("id%.0f", id[idx])
60 | x
61 | }
62 | # we need to write in batches to reduce memory footprint
63 | write_batches = function(d, name, append) {
64 | cols = names(d)
65 | if (lhs_nas(name)) sprintf = sprintfId
66 | if ("id1" %in% cols) set(d, NULL, "id4", sprintf("id%.0f", d$id1))
67 | if ("id2" %in% cols) set(d, NULL, "id5", sprintf("id%.0f", d$id2))
68 | if ("id3" %in% cols) set(d, NULL, "id6", sprintf("id%.0f", d$id3))
69 | setcolorder(d, neworder=setdiff(names(d), c("v1","v2")))
70 | fwrite(d, paste0(name, ".csv"), showProgress=FALSE, append=append)
71 | }
72 | handle_batches = function(d, data_name) {
73 | N = nrow(d)
74 | if (N > 1e8) {
75 | stopifnot(N==1e9)
76 | for (i in 1:10) {
77 | cat(sprintf("Writing %s data batch %s\n", pretty_sci(N), i))
78 | write_batches(d[((i-1)*1e8+1L):(i*1e8)], data_name, append=i>1L)
79 | }
80 | } else {
81 | write_batches(d, data_name, append=FALSE)
82 | }
83 | }
84 |
85 | # exec ----
86 |
87 | library(data.table)
88 | setDTthreads(0L)
89 | set.seed(108)
90 | data_name = sprintf("J1_%s_%s_%s_%s", pretty_sci(N), "NA", nas, sort)
91 |
92 | cat(sprintf("Generate join data of %s rows\n", pretty_sci(N)))
93 |
94 | cat("Producing keys for LHS and RHS data\n")
95 | key1 = split_xlr(N/1e6)
96 | key2 = split_xlr(N/1e3)
97 | key3 = split_xlr(N)
98 |
99 | cat(sprintf("Producing LHS %s data from keys\n", pretty_sci(N)))
100 | lhs = c("x","l")
101 | l = list(
102 | id1 = sample_all(unlist(key1[lhs], use.names=FALSE), N),
103 | id2 = sample_all(unlist(key2[lhs], use.names=FALSE), N),
104 | id3 = sample_all(unlist(key3[lhs], use.names=FALSE), N)
105 | )
106 | setDT(l)
107 | if (sort==1L) {
108 | cat("Sorting LHS data\n")
109 | setkeyv(l, c("id1","id2","id3"))
110 | }
111 | set(l, NULL, "v1", round(runif(nrow(l), max=100), 6))
112 | stopifnot(
113 | uniqueN(l, by="id1")==N/1e6,
114 | uniqueN(l, by="id2")==N/1e3,
115 | uniqueN(l, by="id3")==N
116 | )
117 | if (nas>0L) {
118 | cat("Inputting NAs in LHS data\n")
119 | for (col in paste0("id",1:3)) {
120 | ucol = unique(l[[col]])
121 | nna = as.integer(length(ucol) * (nas/100))
122 | if (nna)
123 | set(l, l[.(sample(ucol, nna)), on=col, which=TRUE], col, NA)
124 | rm(ucol)
125 | }
126 | nna = as.integer(nrow(l) * (nas/100))
127 | if (nna)
128 | set(l, sample(nrow(l), nna), "v1", NA)
129 | }
130 | cat(sprintf("Writing LHS %s data %s\n", pretty_sci(N), data_name))
131 | handle_batches(l, data_name)
132 | rm(l)
133 |
134 | rhs = c("x","r")
135 | r_data_name = join_to_tbls(data_name)
136 | n = N/1e6
137 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n)))
138 | r1 = list(
139 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n)
140 | )
141 | setDT(r1)
142 | if (sort==1L) {
143 | cat("Sorting RHS small data\n")
144 | setkeyv(r1, "id1")
145 | }
146 | set(r1, NULL, "v2", round(runif(nrow(r1), max=100), 6))
147 | stopifnot(uniqueN(r1, by="id1")==n)
148 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[1L]))
149 | handle_batches(r1, r_data_name[1L])
150 | rm(r1)
151 | n = N/1e3
152 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n)))
153 | r2 = list(
154 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n),
155 | id2 = sample_all(unlist(key2[rhs], use.names=FALSE), n)
156 | )
157 | setDT(r2)
158 | if (sort==1L) {
159 | cat("Sorting RHS medium data\n")
160 | setkeyv(r2, "id2")
161 | }
162 | set(r2, NULL, "v2", round(runif(nrow(r2), max=100), 6))
163 | stopifnot(uniqueN(r2, by="id2")==n)
164 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[2L]))
165 | handle_batches(r2, r_data_name[2L])
166 | rm(r2)
167 | n = N
168 | cat(sprintf("Producing RHS %s data from keys\n", pretty_sci(n)))
169 | r3 = list(
170 | id1 = sample_all(unlist(key1[rhs], use.names=FALSE), n),
171 | id2 = sample_all(unlist(key2[rhs], use.names=FALSE), n),
172 | id3 = sample_all(unlist(key3[rhs], use.names=FALSE), n)
173 | )
174 | rm(key1, key2, key3)
175 | setDT(r3)
176 | if (sort==1L) {
177 | cat("Sorting RHS big data\n")
178 | setkeyv(r3, "id3")
179 | }
180 | set(r3, NULL, "v2", round(runif(nrow(r3), max=100), 6))
181 | stopifnot(uniqueN(r3, by="id3")==n)
182 | cat(sprintf("Writing RHS %s data %s\n", pretty_sci(n), r_data_name[3L]))
183 | handle_batches(r3, r_data_name[3L])
184 | rm(r3)
185 |
186 | cat(sprintf("Join datagen of %s rows finished in %ss\n", pretty_sci(N), trunc(proc.time()[["elapsed"]]-init)))
187 | if (!interactive()) quit("no", status=0)
188 |
--------------------------------------------------------------------------------
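
[editor note -- not part of the repository] split_xlr() above divides each key pool
0.9/0.1/0.1 into keys shared by both sides, LHS-only keys and RHS-only keys, so by
construction 90% of LHS join keys find a match in the RHS tables. A minimal sketch,
assuming split_xlr() from _data/join-datagen.R has been defined in the session:

k = split_xlr(1e4)
length(intersect(c(k$x, k$l), c(k$x, k$r))) / length(c(k$x, k$l))  # exactly 0.9
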
/_helpers/helpers.R:
--------------------------------------------------------------------------------
1 | write.log = function(
2 | timestamp=Sys.time(), # this has to be here to support timestamp provided when parsing impala or clickhouse sql logs
3 | task=NA_character_, data=NA_character_, in_rows=NA_integer_, question=NA_character_, out_rows=NA_integer_,
4 | out_cols=NA_integer_, solution=NA_character_, version=NA_character_, git=NA_character_, fun=NA_character_,
5 | run=NA_integer_, time_sec=NA_real_, mem_gb=NA_real_, cache=NA, chk=NA_character_, chk_time_sec=NA_real_,
6 | on_disk=FALSE, machine_type=''
7 | ) {
8 | stopifnot(is.character(task), is.character(data), is.character(solution), is.character(fun), is.logical(on_disk), is.character(machine_type))
9 | log.file=Sys.getenv("CSV_TIME_FILE", "time.csv")
10 | batch=Sys.getenv("BATCH", NA)
11 | nodename=toString(Sys.info()[["nodename"]])
12 | comment=NA_character_ # placeholder for updates to timing data
13 | time_sec=round(time_sec, 3)
14 | mem_gb=round(mem_gb, 3)
15 | chk_time_sec=round(chk_time_sec, 3)
16 | df=data.frame(nodename=nodename, batch=as.integer(batch), timestamp=as.numeric(timestamp),
17 | task=task, data=data, in_rows=trunc(in_rows), question=as.character(question), out_rows=trunc(out_rows), # trunc to support big int in double
18 | out_cols=as.integer(out_cols), solution=solution, version=as.character(version), git=as.character(git), fun=fun,
19 | run=as.integer(run), time_sec=time_sec, mem_gb=mem_gb, cache=cache, chk=chk, chk_time_sec=chk_time_sec,
20 | comment=comment, on_disk=on_disk, machine_type=machine_type)
21 | csv_verbose = Sys.getenv("CSV_VERBOSE", "false")
22 | if (as.logical(csv_verbose)) cat("# ", paste(sapply(df, format, scientific=FALSE), collapse=","), "\n", sep="")
23 | if (file.exists(log.file) && !file.size(log.file)) file.remove(log.file)
24 | write.table(format(df, scientific=FALSE),
25 | file=log.file,
26 | append=file.exists(log.file),
27 | col.names=!file.exists(log.file),
28 | row.names=FALSE,
29 | quote=FALSE,
30 | na="",
31 | sep=",")
32 | }
33 |
34 | # short format of 1e7, 1e8 etc.
35 | pretty_sci = function(x) {
36 | tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]]
37 | if(length(tmp)==1L) {
38 | paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L)
39 | } else if(length(tmp)==2L){
40 | paste0(tmp[1L], as.character(as.integer(tmp[2L])))
41 | }
42 | }
43 |
44 | # makes scalar string to store in "chk" field, check sum of arbitrary number of measures
45 | make_chk = function(values){
46 | x = sapply(values, function(x) paste(format(x, scientific=FALSE), collapse="_"))
47 | gsub(",", "_", paste(x, collapse=";"), fixed=TRUE)
48 | }
49 |
50 | # bash 'ps -o rss'
51 | memory_usage = function() {
52 | return(NA_real_) # disabled because during #110 system() kills the scripts
53 | cmd = paste("ps -o rss", Sys.getpid(), "| tail -1")
54 | ans = tryCatch(system(cmd, intern=TRUE, ignore.stderr=TRUE), error=function(e) NA_character_)
55 | as.numeric(ans) / (1024^2) # GB units
56 | }
57 |
58 | # join task RHS tables for LHS data name
59 | join_to_tbls = function(data_name) {
60 | x_n = as.numeric(strsplit(data_name, "_", fixed=TRUE)[[1L]][2L])
61 | y_n = setNames(c(x_n/1e6, x_n/1e3, x_n), c("small","medium","big"))
62 | sapply(sapply(y_n, pretty_sci), gsub, pattern="NA", x=data_name)
63 | }
64 |
--------------------------------------------------------------------------------
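
[editor note -- not part of the repository] A minimal usage sketch for two of the
helpers above: join_to_tbls() maps an LHS join data name to the three RHS table
names (small, medium, big), and make_chk() builds the semicolon-separated checksum
string stored in the "chk" field of time.csv:

source("./_helpers/helpers.R")
join_to_tbls("J1_1e8_NA_0_0")
#            small           medium              big
# "J1_1e8_1e2_0_0" "J1_1e8_1e5_0_0" "J1_1e8_1e8_0_0"
make_chk(c(55, 3.14159))  # "55;3.14159"
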
/_helpers/helpers.jl:
--------------------------------------------------------------------------------
1 | using Printf; # sprintf macro to print in non-scientific format
2 | using Pkg;
3 |
4 | # from https://github.com/JuliaLang/Pkg.jl/issues/793
5 | function getpkgmeta(name::AbstractString)
6 | fname = joinpath(dirname(Base.active_project()), "Manifest.toml")
7 | Pkg.TOML.parse(read(fname, String))["deps"][name][1]
8 | end;
9 |
10 | function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type)
11 | file=try
12 | ENV["CSV_TIME_FILE"]
13 | catch
14 | "time.csv"
15 | end;
16 | if (occursin("/", file) && SubString(file, 1, 1)!="/") # otherwise we assume full path
17 | file="$(pwd())/$file";
18 | end;
19 | batch=try
20 | ENV["BATCH"]
21 | catch
22 | ""
23 | end;
24 | if (isfile(file) && filesize(file)==0)
25 | rm(file)
26 | end;
27 | nodename=gethostname()
28 | comment="" # placeholder for updates to timing data
29 | time_sec=round(time_sec, digits=3)
30 | mem_gb=round(mem_gb, digits=3)
31 | chk_time_sec=round(chk_time_sec, digits=3)
32 | timestamp=@sprintf("%0.6f", time())
33 | csv_verbose = false # hardcoded for now, TODO ENV["CSV_VERBOSE"] and print
34 | log = DataFrame(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type)
35 | CSV.write(file, log, append=isfile(file), header=!isfile(file))
36 | end;
37 |
38 | function make_chk(x)
39 | n = length(x)
40 | res = ""
41 | for i = 1:n
42 | res = string(res, i==1 ? "" : ";", @sprintf("%0.3f", x[i]))
43 | end
44 | res
45 | end;
46 |
47 | function memory_usage()
48 | pid = getpid()
49 | s = read(pipeline(`ps -o rss $pid`,`tail -1`), String)
50 | parse(Float64, replace(s, "\n" => "")) / (1024^2)
51 | end;
52 |
53 | function join_to_tbls(data_name)
54 | x_n = Int(parse(Float64, split(data_name, "_")[2]))
55 | y_n = [x_n/1e6, x_n/1e3, x_n]
56 | y_n = [replace(@sprintf("%.0e", y_n[1]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[2]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[3]), r"[+]0?"=>"")]
57 | [replace(data_name, "NA" => y_n[1]), replace(data_name, "NA" => y_n[2]), replace(data_name, "NA" => y_n[3])]
58 | end;
59 |
--------------------------------------------------------------------------------
/_helpers/helpers.py:
--------------------------------------------------------------------------------
1 | import time
2 | import csv
3 | import math
4 | import psutil
5 | import os
6 | import platform
7 |
8 | def write_log(task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type):
9 | batch = os.getenv('BATCH', "")
10 | timestamp = time.time()
11 | csv_file = os.getenv('CSV_TIME_FILE', "time.csv")
12 | nodename = platform.node()
13 | comment = "" # placeholder for updates to timing data
14 | time_sec = round(time_sec, 3)
15 | chk_time_sec = round(chk_time_sec, 3)
16 | mem_gb = round(mem_gb, 3)
17 | if math.isnan(time_sec):
18 | time_sec = ""
19 | if math.isnan(mem_gb):
20 | mem_gb = ""
21 | log_row = [nodename, batch, timestamp, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, comment, on_disk, machine_type]
22 | log_header = ["nodename","batch","timestamp","task","data","in_rows","question","out_rows","out_cols","solution","version","git","fun","run","time_sec","mem_gb","cache","chk","chk_time_sec","comment","on_disk", "machine_type"]
23 | if os.path.isfile(csv_file) and not(os.path.getsize(csv_file)):
24 | os.remove(csv_file)
25 | append = os.path.isfile(csv_file)
26 | csv_verbose = os.getenv('CSV_VERBOSE', "false")
27 | if csv_verbose.lower()=="true":
28 | print('# ' + ','.join(str(x) for x in log_row))
29 | if append:
30 | with open(csv_file, 'a') as f:
31 | w = csv.writer(f, lineterminator='\n')
32 | w.writerow(log_row)
33 | else:
34 | with open(csv_file, 'w+') as f:
35 | w = csv.writer(f, lineterminator='\n')
36 | w.writerow(log_header)
37 | w.writerow(log_row)
38 | return True
39 |
40 | def str_round(x):
41 | if type(x).__name__ in ["float","float64"]:
42 | x = round(x,3)
43 | return str(x)
44 |
45 | flatten = lambda l: [item for sublist in l for item in sublist]
46 |
47 | def make_chk(values):
48 | s = ';'.join(str_round(x) for x in values)
49 | return s.replace(",","_") # comma is reserved for csv separator
50 |
51 | def memory_usage():
52 | process = psutil.Process(os.getpid())
53 | return process.memory_info().rss/(1024**3) # GB units
54 |
55 | def join_to_tbls(data_name):
56 | x_n = int(float(data_name.split("_")[1]))
57 | y_n = ["{:.0e}".format(x_n/1e6), "{:.0e}".format(x_n/1e3), "{:.0e}".format(x_n)]
58 | y_n = [y_n[0].replace('+0', ''), y_n[1].replace('+0', ''), y_n[2].replace('+0', '')]
59 | return [data_name.replace('NA', y_n[0]), data_name.replace('NA', y_n[1]), data_name.replace('NA', y_n[2])]
60 |
--------------------------------------------------------------------------------
/_helpers/helpers.sh:
--------------------------------------------------------------------------------
1 | # join task RHS tables for LHS data name
2 | join_to_tbls() {
3 | data_name=$1
4 | x_n="$(echo $data_name | cut -d '_' -f 2)"
5 | x_n_lhs="$(echo $x_n | cut -d 'e' -f 1)"
6 | if [ "$x_n_lhs" -ne 1 ]; then
7 | echo "data_name $data_name must have '1' base in exponential notation for number of rows" >&2 && exit 1
8 | fi
9 | x_n_rhs="$(echo $x_n | cut -d "e" -f 2)"
10 | if [ "$x_n_rhs" -lt 6 ]; then
11 | echo "data_name $data_name must have exponent greater or equal to '6' in exponential notation for number of rows" >&2 && exit 1
12 | fi
13 | echo ${data_name/NA/"$x_n_lhs"e"$(($x_n_rhs-6))"} ${data_name/NA/"$x_n_lhs"e"$(($x_n_rhs-3))"} ${data_name/NA/"$x_n_lhs"e"$x_n_rhs"}
14 | }
15 |
--------------------------------------------------------------------------------
/_helpers/helpersds.jl:
--------------------------------------------------------------------------------
1 | using Printf; # sprintf macro to print in non-scientific format
2 | using Pkg;
3 |
4 | # from https://github.com/JuliaLang/Pkg.jl/issues/793
5 | function getpkgmeta(name::AbstractString)
6 | fname = joinpath(dirname(Base.active_project()), "Manifest.toml")
7 | Pkg.TOML.parse(read(fname, String))["deps"][name][1]
8 | end;
9 |
10 | function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type)
11 | file=try
12 | ENV["CSV_TIME_FILE"]
13 | catch
14 | "time.csv"
15 | end;
16 | if (occursin("/", file) && SubString(file, 1, 1)!="/") # otherwise we assume full path
17 | file="$(pwd())/$file";
18 | end;
19 | batch=try
20 | ENV["BATCH"]
21 | catch
22 | ""
23 | end;
24 | if (isfile(file) && filesize(file)==0)
25 | rm(file)
26 | end;
27 | nodename=gethostname()
28 | comment="" # placeholder for updates to timing data
29 | time_sec=round(time_sec, digits=3)
30 | mem_gb=round(mem_gb, digits=3)
31 | chk_time_sec=round(chk_time_sec, digits=3)
32 | timestamp=@sprintf("%0.6f", time())
33 | csv_verbose = false
34 | log = Dataset(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type)
35 | filewriter(file, log, append=isfile(file), header=!isfile(file))
36 | end;
37 |
38 | function make_chk(x)
39 | n = length(x)
40 | res = ""
41 | for i = 1:n
42 | res = string(res, i==1 ? "" : ";", @sprintf("%0.3f", x[i]))
43 | end
44 | res
45 | end;
46 |
47 | function memory_usage()
48 | pid = getpid()
49 | s = read(pipeline(`ps -o rss $pid`,`tail -1`), String)
50 | parse(Float64, replace(s, "\n" => "")) / (1024^2)
51 | end;
52 |
53 | function join_to_tbls(data_name)
54 | x_n = Int(parse(Float64, split(data_name, "_")[2]))
55 | y_n = [x_n/1e6, x_n/1e3, x_n]
56 | y_n = [replace(@sprintf("%.0e", y_n[1]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[2]), r"[+]0?"=>""), replace(@sprintf("%.0e", y_n[3]), r"[+]0?"=>"")]
57 | [replace(data_name, "NA" => y_n[1]), replace(data_name, "NA" => y_n[2]), replace(data_name, "NA" => y_n[3])]
58 | end;
59 |
--------------------------------------------------------------------------------
/_launcher/launch.R:
--------------------------------------------------------------------------------
1 | library("data.table")
2 | if (!packageVersion("data.table") >= "1.13.0")
3 | stop("db-benchmark launcher script depends on recent data.table features, install at least 1.13.0.")
4 | source("./_launcher/launcher.R")
5 |
6 | .nodename = Sys.info()[["nodename"]]
7 | mockup = as.logical(Sys.getenv("MOCKUP", "false"))
8 |
9 | run_tasks = getenv("RUN_TASKS") # run_tasks = c("groupby","join")
10 | if (!length(run_tasks)) {
11 | cat("No benchmark tasks to run\n")
12 | q("no")
13 | }
14 | run_solutions = getenv("RUN_SOLUTIONS") # run_solutions = c("data.table","dplyr","pydatatable","spark","pandas")
15 | if (!length(run_solutions)) {
16 | cat("No benchmark solutions to run\n")
17 | q("no")
18 | }
19 |
20 | data = fread("./_control/data.csv", logical01=TRUE, colClasses=c("character","character","character","character","character","character","logical"))
21 | if (anyDuplicated(data[["data"]]))
22 | stop("_control/data.csv contains duplicated data cases")
23 | data[active==TRUE, # filter on active datasets
24 | ][run_tasks, on="task", nomatch=NA # filter for env var RUN_TASKS
25 | ][, c("active") := NULL # remove unused
26 | ][] -> data
27 | if (any(is.na(data$data))) stop("missing entries in ./_control/data.csv for some tasks")
28 |
29 | timeout = fread("./_control/timeout.csv", colClasses=c("character","character","numeric"))
30 | timeout[run_tasks, on="task", nomatch=NA # filter for env var RUN_TASKS
31 | ] -> timeout
32 | if (any(is.na(timeout$minutes))) stop("missing entries in ./_control/timeout.csv for some tasks")
33 |
34 | solution = fread("./_control/solutions.csv")
35 | solution[run_solutions, on="solution", nomatch=NA # filter for env var RUN_SOLUTIONS
36 | ] -> solution
37 | if (any(is.na(solution$task))) stop("missing entries in ./_control/solutions.csv for some solutions")
38 |
39 | # what to run, log machine name, lookup timeout
40 | dt = solution[data, on="task", allow.cartesian=TRUE, nomatch=NULL]
41 | dt[, "nodename" := .nodename]
42 | dt[, "in_rows" := sapply(strsplit(data, split="_", fixed=TRUE), `[[`, 2L)]
43 | stopifnot(dt$in_rows == dt$nrow)
44 | dt[timeout, "timeout_s" := i.minutes*60, on=c("task","in_rows")]
45 | if (any(is.na(dt$timeout_s))) stop("missing entries in ./_control/timeout.csv for some tasks, detected after joining to solutions and data to run")
46 |
47 | # detect if script has been already run before for currently installed version/revision
48 | lookup_run_batch(dt)
49 |
50 | machine_type = getenv("MACHINE_TYPE")
51 | dt[,machine_type := machine_type]
52 |
53 | skipped_benchmarks = fread("./_control/skipped_benchmarks.csv", logical01=TRUE, colClasses=c("character","character","character","character"))
54 | print("skipping benchmarks defined in _control/skipped_benchmarks.csv")
55 | print(skipped_benchmarks)
56 |
57 | dt = dt[!skipped_benchmarks, on = c("solution", "task", "data", "machine_type")]
58 |
59 | # print list of solutions that are going to be run in this batch so we know upfront which will be skipped
60 | cat("Benchmark solutions to run: ", dt[is.na(run_batch), paste(unique(solution),collapse=", ")], "\n", sep="")
61 |
62 | is.stop()
63 | is.pause()
64 | is.stop()
65 |
66 | # launch script, if not mockup, if not already run, unless forcerun
67 | dt
68 | launch(dt, mockup=mockup)
69 |
70 | # terminates
71 | q("no")
72 |
--------------------------------------------------------------------------------
/_launcher/setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # dirs for datasets and output of benchmark
5 | mkdir -p data
6 | mkdir -p out
7 |
8 | sudo apt-get update
9 |
10 | # install R
11 | sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
12 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
13 | sudo apt-get update -qq
14 | sudo apt-get install -y r-base-dev
15 | sudo apt-get install python3-dev virtualenv
16 |
17 | sudo chmod a+w /usr/local/lib/R/site-library
18 |
19 | # configure R
20 | echo 'LC_ALL=C' >> ~/.Renviron
21 | mkdir -p ~/.R
22 | echo 'CFLAGS=-O3 -mtune=native' > ~/.R/Makevars
23 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars
24 |
25 | # packages used in launcher and report
26 | Rscript -e 'install.packages(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"))'
27 | Rscript -e 'sapply(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"), requireNamespace)'
28 |
29 | # install duckdb for unpacking data
30 | curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip
31 | sudo unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin
32 |
33 |
34 | # install aws client to download benchmark data
35 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
36 | unzip awscliv2.zip
37 | sudo ./aws/install
38 |
39 | # after each restart of server
40 | source clickhouse/ch.sh && ch_stop
41 | sudo service docker stop
42 | sudo swapoff -a
43 |
44 | # stop and disable
45 | sudo systemctl disable docker
46 | sudo systemctl stop docker
47 | sudo systemctl disable clickhouse-server
48 | sudo systemctl stop clickhouse-server
49 |
--------------------------------------------------------------------------------
/_report/blah.R:
--------------------------------------------------------------------------------
1 |
2 | source("./_report/report.R", chdir=TRUE)
3 | source("./_helpers/helpers.R", chdir=TRUE)
4 | source("./_benchplot/benchplot.R", chdir=TRUE)
5 | source("./_benchplot/benchplot-dict.R", chdir=TRUE)
6 | ld = time_logs()
7 | lld = ld[script_recent==TRUE]
8 | # lld_nodename = as.character(unique(lld$nodename))
9 | lld_nodename = "c6id.metal"
10 | if (length(lld_nodename)>1L)
11 | stop(sprintf("There are multiple different 'nodename' to be presented on single report '%s'", report_name))
12 | lld_unfinished = lld[is.na(script_time_sec)]
13 | if (nrow(lld_unfinished)) {
14 | warning(sprintf("Missing solution finish timestamp in logs.csv for '%s' (still running or launcher script killed): %s", paste(unique(lld_unfinished$task), collapse=","), paste(unique(lld_unfinished$solution), collapse=", ")))
15 | }
16 |
17 | dt_groupby = lld[task=="groupby"][substr(data,1,2)=="G1"]
18 | dt_join = lld[task=="join"]
19 |
20 |
21 | loop_benchplot = function(dt_task, report_name, syntax.dict, exceptions, solution.dict, question.txt.fun = NULL, title.txt.fun = NULL, data_namev, q_groupv, cutoff=NULL, pending=NULL) {
22 | for (data_name in data_namev) {
23 | for (q_group in q_groupv) {
24 | message(sprintf("benchplot %s %s %s", report_name, data_name, q_group))
25 | message(sprintf("machine type = %s", m_type))
26 | y = dt_task[data==data_name & question_group==q_group & machine_type==m_type][,machine_type := NULL]
27 | benchplot(
28 | y,
29 | filename = file.path("public", report_name, sprintf("%s_%s_%s.png", data_name, q_group, m_type)),
30 | solution.dict = solution.dict,
31 | syntax.dict = syntax.dict,
32 | exceptions = exceptions,
33 | question.txt.fun = question.txt.fun,
34 | title.txt.fun = title.txt.fun,
35 | cutoff = cutoff,
36 | pending = pending,
37 | url.footer = "https://duckdblabs.github.io/db-benchmark",
38 | interactive = FALSE
39 | )
40 | }
41 | }
42 | }
43 | link = function(data_name, q_group, report_name) {
44 | fnam = sprintf("%s_%s.png", data_name, q_group)
45 | paste(sprintf("[%s](%s)", q_group, file.path(report_name, fnam)), collapse=", ")
46 | }
47 | hours_took = function(lld) {
48 | lld_script_time = lld[, .(n_script_time_sec=uniqueN(script_time_sec), script_time_sec=unique(script_time_sec)), .(solution, task, data)]
49 | if (nrow(lld_script_time[n_script_time_sec>1L]))
50 | stop("There are multiple different 'script_time_sec' for single solution+task+data on report 'index'")
51 | lld_script_time[, round(sum(script_time_sec, na.rm=TRUE)/60/60, 1)]
52 | }
53 |
54 | data_name = get_data_levels()[["groupby"]]
55 | loop_benchplot(dt_groupby, report_name="groupby", syntax.dict=groupby.syntax.dict, exceptions=groupby.exceptions, solution.dict=solution.dict, data_namev=data_name, q_groupv=c("basic","advanced"), title.txt.fun = header_title_fun, question.txt.fun = groupby_q_title_fun, cutoff = "spark", pending = "Modin", machine_types)
--------------------------------------------------------------------------------
/_report/ga.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/_report/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit -o nounset
3 |
4 | publishGhPages(){
5 | rm -rf db-benchmark.gh-pages
6 | mkdir -p db-benchmark.gh-pages
7 | cd db-benchmark.gh-pages
8 |
9 | ## Set up Repo parameters
10 | git init > /dev/null
11 | git config user.name "Tmonster"
12 | git config user.email "tom@ebergen.com"
13 |
14 | ## Set gh token from local file
15 |
16 | ## Reset gh-pages branch
17 | git remote add upstream "git@github.com:duckdblabs/db-benchmark.git"
18 | git fetch -q upstream gh-pages
19 | rm -f err.txt
20 | git checkout -q gh-pages
21 | git reset -q --hard "645f86716bfb3b44c53eacf1f2bf234e75ea41ec"
22 |
23 | rm -f err.txt
24 | cp -r ../public/* ./
25 | git add -A
26 | git commit -q -m 'publish benchmark report'
27 | cp ../time.csv .
28 | cp ../logs.csv .
29 | git add time.csv logs.csv
30 | md5sum time.csv > time.csv.md5
31 | md5sum logs.csv > logs.csv.md5
32 | git add time.csv.md5 logs.csv.md5
33 | gzip --keep time.csv
34 | gzip --keep logs.csv
35 | git add time.csv.gz logs.csv.gz
36 | git commit -q -m 'publish benchmark timings and logs'
37 | git push --force upstream gh-pages
38 |
39 | cd ..
40 |
41 | }
42 |
43 | publishGhPages
44 |
--------------------------------------------------------------------------------
/_report/tech.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Technical measures of db-benchmark"
3 | output:
4 | html_document:
5 | self_contained: no
6 | toc: true
7 | includes:
8 | in_header: ga.html
9 | ---
10 | ```{r render, include=FALSE}
11 | # Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' && xdg-open public/tech.html
12 | ```
13 |
14 | ```{r opts, echo=FALSE}
15 | knitr::opts_knit$set(root.dir="..")
16 | knitr::opts_chunk$set(echo=FALSE, cache=FALSE)
17 | ```
18 |
19 | ```{r init}
20 | library(lattice)
21 | source("./_report/report.R")
22 | ld = time_logs()
23 | recent_nodename = ld[script_recent==TRUE, unique(nodename)]
24 | stopifnot(length(recent_nodename)==1L)
25 | ld = ld[nodename==recent_nodename]
26 | ```
27 |
28 | ## Incomplete timings of last run
29 |
30 | ```{r completed}
31 | ll = ld[script_recent==TRUE, {
32 | n_na = is.na(c(time_sec_1, time_sec_2))
33 | n_completed=sum(!n_na)
34 | n_failed=sum(n_na)
35 | .(n_completed=n_completed, n_failed=n_failed, q_failed=if(n_failed==0L) NA_character_ else paste(paste0("q", iquestion[is.na(time_sec_1) | is.na(time_sec_2)]), collapse=","))
36 | },
37 | c("nodename","batch","solution","task","data","in_rows","k","nasorted")]
38 | stopifnot(length(unique(ll$nodename))==1L)
39 | ```
40 |
41 | ### groupby
42 |
43 | ```{r completed_groupby}
44 | kk(ll[task=="groupby"
45 | ][n_failed>0L, .(solution, data, in_rows, k, `NA, sorted`=nasorted, n_completed, n_failed, q_failed)])
46 | ```
47 |
48 | ## Full scripts executions
49 |
50 | Things to consider when looking at the plots below.
51 |
52 | - The red dotted line marks the script timeout, which initially was not set. It was later set to 60 minutes and, more recently, after a new set of questions was added, it was increased to 120 minutes. The up-to-date timeout value can be looked up in the `_control/timeout.csv` file.
53 | - A script may have been terminated by the _out of memory killer_, an OS feature. In that case the reported script timing will be smaller than it would have been in reality.
54 |
55 | Refer to the table above to see which scripts have fully completed.
56 |
57 | ### groupby
58 |
59 | ```{r logs_plot, fig.width=8, fig.height=48}
60 | #timeout = fread("./_control/timeout.csv", colClasses=c("character","character","numeric"))
61 | #timeout = timeout["groupby", on="task", nomatch=NULL] # filter for env var RUN_TASKS
62 | #stopifnot(nrow(timeout)==1L)
63 | #timeout_m = timeout[["minutes"]]
64 | p = sapply(setNames(nm=as.character(unique(ld$solution))), simplify = FALSE, function(s)
65 | lattice::xyplot(script_time_sec/60 ~ ibatch | k+in_rows, ld[task=="groupby" & substr(data,1,2)=="G1"],
66 | type="l", grid=TRUE, groups=nasorted,
67 | subset=solution==s, main=s,
68 | panel=panel.superpose,
69 | panel.groups=function(x, y, col, col.symbol, ...) {
70 | panel.lines(x, y, col=col.symbol, ...)
71 | #panel.abline(h=timeout_m, col="red", lty=3)
72 | },
73 | xlab = "benchmark run",
74 | ylab = "minutes",
75 | scales=list(y=list(
76 | relation="free",
77 | limits=rep(ld[solution==s, list(list(c(0, max(script_time_sec)/60))), in_rows]$V1, each=3)
78 | )),
79 | auto.key=list(points=FALSE, lines=TRUE))
80 | )
81 | sapply(seq_along(p), function(i) print(p[[i]], split=c(1, i, 1, length(p)), more=i!=length(p))) -> nul
82 | ```
83 |
84 | ------
85 |
86 | Report was generated on: `r format(Sys.time(), usetz=TRUE)`.
87 |
88 | ```{r status_set_success}
89 | cat("tech\n", file=get_report_status_file(), append=TRUE)
90 | ```
91 |
--------------------------------------------------------------------------------
/_run/download_small_medium.sh:
--------------------------------------------------------------------------------
1 | # first download and expand the small and medium data
2 |
3 | # get groupby small (0.5GB and 5GB datasets)
4 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request --quiet
5 | # get join small (0.5GB and 5GB datasets)
6 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request --quiet
7 |
8 |
9 | # expand groupby-small datasets to csv
10 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'data/G1_1e7_1e2_0_0.csv' (FORMAT CSV)"
11 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'data/G1_1e7_1e1_0_0.csv' (FORMAT CSV)"
12 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'data/G1_1e7_2e0_0_0.csv' (FORMAT CSV)"
13 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'data/G1_1e7_1e2_0_1.csv' (FORMAT CSV)"
14 | duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'data/G1_1e7_1e2_5_0.csv' (FORMAT CSV)"
15 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'data/G1_1e8_1e2_0_0.csv' (FORMAT CSV)"
16 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'data/G1_1e8_1e1_0_0.csv' (FORMAT CSV)"
17 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'data/G1_1e8_2e0_0_0.csv' (FORMAT CSV)"
18 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'data/G1_1e8_1e2_0_1.csv' (FORMAT CSV)"
19 | duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'data/G1_1e8_1e2_5_0.csv' (FORMAT CSV)"
20 |
21 | # expand join-small datasets to csv
22 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_0 to 'data/J1_1e7_1e1_0_0.csv' (FORMAT CSV)"
23 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_5_0 to 'data/J1_1e7_1e4_5_0.csv' (FORMAT CSV)"
24 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_1 to 'data/J1_1e7_NA_0_1.csv' (FORMAT CSV)"
25 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_0 to 'data/J1_1e8_1e5_0_0.csv' (FORMAT CSV)"
26 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_5_0 to 'data/J1_1e8_1e8_5_0.csv' (FORMAT CSV)"
27 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_1 to 'data/J1_1e7_1e1_0_1.csv' (FORMAT CSV)"
28 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_0 to 'data/J1_1e7_1e7_0_0.csv' (FORMAT CSV)"
29 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_5_0 to 'data/J1_1e7_NA_5_0.csv' (FORMAT CSV)"
30 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_1 to 'data/J1_1e8_1e5_0_1.csv' (FORMAT CSV)"
31 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_0 to 'data/J1_1e8_NA_0_0.csv' (FORMAT CSV)"
32 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_5_0 to 'data/J1_1e7_1e1_5_0.csv' (FORMAT CSV)"
33 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_1 to 'data/J1_1e7_1e7_0_1.csv' (FORMAT CSV)"
34 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_0 to 'data/J1_1e8_1e2_0_0.csv' (FORMAT CSV)"
35 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_5_0 to 'data/J1_1e8_1e5_5_0.csv' (FORMAT CSV)"
36 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_1 to 'data/J1_1e8_NA_0_1.csv' (FORMAT CSV)"
37 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_0 to 'data/J1_1e7_1e4_0_0.csv' (FORMAT CSV)"
38 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_5_0 to 'data/J1_1e7_1e7_5_0.csv' (FORMAT CSV)"
39 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_1 to 'data/J1_1e8_1e2_0_1.csv' (FORMAT CSV)"
40 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_0 to 'data/J1_1e8_1e8_0_0.csv' (FORMAT CSV)"
41 | duckdb data/join_small.duckdb -c "copy J1_1e8_NA_5_0 to 'data/J1_1e8_NA_5_0.csv' (FORMAT CSV)"
42 | duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_1 to 'data/J1_1e7_1e4_0_1.csv' (FORMAT CSV)"
43 | duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_0 to 'data/J1_1e7_NA_0_0.csv' (FORMAT CSV)"
44 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_5_0 to 'data/J1_1e8_1e2_5_0.csv' (FORMAT CSV)"
45 | duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_1 to 'data/J1_1e8_1e8_0_1.csv' (FORMAT CSV)"
46 |
47 |
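48 | # Note on naming: table/file names encode the dataset parameters listed in _control/data.csv
49 | # (columns nrow, k, na, sort), e.g. G1_1e8_1e2_5_0 is groupby data with 1e8 rows, a 1e2 group
50 | # cardinality factor, 5% NAs and unsorted input; the J1_* join tables follow the same pattern.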
--------------------------------------------------------------------------------
/_run/partitioned_run.sh:
--------------------------------------------------------------------------------
1 | # set machine type
2 | ./_run/run_small_medium.sh
3 |
4 | ./_run/run_large.sh
5 |
--------------------------------------------------------------------------------
/_run/run_large.sh:
--------------------------------------------------------------------------------
1 | # download and expand large data
2 |
3 | # get groupby large (50GB datasets)
4 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request --quiet
5 | # get join large (50GB datasets)
6 | aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request --quiet
7 |
8 |
9 | # expand groupby-large datasets to csv
10 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
11 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
12 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
13 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
14 | duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"
15 |
16 |
17 | # expand join-large datasets to csv
18 | duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
19 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
20 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
21 | duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"
22 |
23 |
24 | cp _control/data_large.csv _control/data.csv
25 |
26 | echo "Running all solutions on large (50GB) datasets"
27 | ./run.sh
28 |
29 |
30 | ###
31 | echo "done..."
32 | echo "removing data files"
33 | rm data/*.csv
34 | rm data/*.duckdb
35 |
--------------------------------------------------------------------------------
/_run/run_medium.sh:
--------------------------------------------------------------------------------
1 | ./_run/download_small_medium.sh
2 |
3 | cp _control/data_medium.csv _control/data.csv
4 |
5 |
6 | echo "Running all solutions on medium (5GB) datasets"
7 | ./run.sh
8 |
9 |
10 | ###
11 | echo "done..."
12 | echo "removing small data files"
13 | rm data/*.csv
14 | rm data/*.duckdb
15 |
16 |
--------------------------------------------------------------------------------
/_run/run_small.sh:
--------------------------------------------------------------------------------
1 | ./_run/download_small_medium.sh
2 |
3 | cp _control/data_small.csv _control/data.csv
4 |
5 |
6 | echo "Running all solutions on small (0.5GB) datasets"
7 | ./run.sh
8 |
9 |
10 | ###
11 | echo "done..."
12 | echo "removing small data files"
13 | rm data/*.csv
14 | rm data/*.duckdb
15 |
16 |
--------------------------------------------------------------------------------
/_setup_utils/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duckdblabs/db-benchmark/47879c51efba789ddbf973423f2c77bfa411143c/_setup_utils/.DS_Store
--------------------------------------------------------------------------------
/_setup_utils/install_all_solutions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import sys
4 | import subprocess
5 |
6 | SOLUTIONS_FILENAME = "_control/solutions.csv"
7 |
8 |
9 | INCLUDE = set()
10 |
11 | def install_solution(solution_name):
12 | min_setup_file_name = f"./{solution_name}/min-setup-{solution_name}.sh"
13 | setup_file_name = f"./{solution_name}/setup-{solution_name}.sh"
14 | upgrade_file_name = f"./{solution_name}/upg-{solution_name}.sh"
15 | get_version_filename = f"./{solution_name}/ver-{solution_name}.sh"
16 | print(f"Installing {solution_name}")
17 | do_install = False
18 | try:
19 | result = subprocess.call([get_version_filename], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
20 | if result != 0:
21 | do_install = True
22 | except Exception as e:
23 | do_install = True
24 |
25 | if do_install:
26 | if os.path.exists(min_setup_file_name):
27 | subprocess.call([min_setup_file_name])
28 | elif os.path.exists(setup_file_name):
29 | subprocess.call([setup_file_name])
30 | else:
31 | # print(f"no script for {setup_file_name} or {min_setup_file_name}")
32 | raise Exception(f"No script to install {solution_name}")
33 | else:
34 | subprocess.call([upgrade_file_name])
35 |
36 | # based on the name of the solution, run the {{solution}}/min-setup-{{solution}}.sh file.
37 | # if there is no min-setup-{{solution}}.sh, then run setup-{{solution}}.sh.
38 | # if error, exit with an error
39 | # else don't
40 | def include_all_solutions():
41 | global INCLUDE
42 | with open(SOLUTIONS_FILENAME, newline="") as solutions_file:
43 | solutions = csv.DictReader(solutions_file, delimiter=',')
44 | for row in solutions:
45 | if row['solution'] == "data.table":
46 | INCLUDE.add("datatable")
47 | else:
48 | INCLUDE.add(row['solution'])
49 |
50 | if len(sys.argv) <= 1:  # sys.argv always contains the script name, so no solutions were given
51 | print("""
52 | Usage: python3 install_all_solutions.py solution_name solution_name ...
53 | python3 install_all_solutions.py all --exclude clickhouse polars
54 | """)
55 | exit(1)
56 |
57 | # first argument is file name
58 |
59 | def main():
60 | global INCLUDE
61 | including = True
62 | for solution in sys.argv[1:]:
63 | if solution.strip() == "all":
64 | include_all_solutions()
65 | elif solution.strip() == "--exclude":
66 | including = False
67 | continue
68 | else:
69 | if including:
70 | if solution == "data.table":
71 | INCLUDE.add("datatable")
72 | elif solution == "clickhouse":
73 | INCLUDE.add("clickhouse")
74 | INCLUDE.add("polars")
75 | else:
76 | INCLUDE.add(solution)
77 | else:
78 | sol = solution.strip()
79 | INCLUDE.discard(sol)  # discard() does not raise if the solution was never included
80 |
81 | for solution in INCLUDE:
82 | install_solution(solution)
83 |
84 |
85 | if __name__ == "__main__":
86 | main()
87 |
88 |
--------------------------------------------------------------------------------
/_setup_utils/mount.sh:
--------------------------------------------------------------------------------
1 | # script to format mount and copy data.
2 |
3 | # remove a leftover instance mount
4 | rm -rf /var/lib/mount/db-benchmark-metal
5 |
6 | # format the mount
7 |
8 | source path.env
9 |
10 | mount_name=$(sudo lsblk | awk '
11 | NR > 1 && $1 ~ /^nvme/ && $7 == "" {
12 | # Convert SIZE column to bytes for comparison
13 | size = $4;
14 | unit = substr(size, length(size));
15 | value = substr(size, 1, length(size)-1);
16 | if (unit == "G") { value *= 1024^3; }
17 | else if (unit == "T") { value *= 1024^4; }
18 | else if (unit == "M") { value *= 1024^2; }
19 | else if (unit == "K") { value *= 1024; }
20 | else { value *= 1; }
21 |
22 | # Keep track of the largest size
23 | if (value > max) {
24 | max = value;
25 | largest = $1;
26 | }
27 | }
28 | END { if (largest) print largest; else print "No match found"; }
29 | ')
30 |
31 | if [ -z "${MOUNT_POINT}" ]; then
32 | echo "Error: Environment variable MOUNT_POINT is not set. Set it by running"
33 | echo "source path.env"
34 | exit 1
35 | fi
36 |
37 | sudo mkfs -t xfs /dev/$mount_name
38 |
39 | sudo rm -rf $MOUNT_POINT
40 | sudo mkdir $MOUNT_POINT
41 | sudo mount /dev/$mount_name $MOUNT_POINT
42 |
43 | # make clone of repo on mount
44 | sudo mkdir $MOUNT_POINT/db-benchmark-metal
45 | sudo chown -R ubuntu:ubuntu $MOUNT_POINT
46 |
47 |
48 | git clone $(git remote get-url origin) $MOUNT_POINT/db-benchmark-metal
49 | cd $MOUNT_POINT/db-benchmark-metal
--------------------------------------------------------------------------------
/_setup_utils/mount_and_install_solutions.sh:
--------------------------------------------------------------------------------
1 | # script to format mount and copy data.
2 | # mount the data
3 | ./_setup_utils/mount.sh
4 |
5 | # setup all the solutions on db-benchmark-metal.
6 | # creates the necessary python virtual environments and creates the r-libraries
7 | # needed
8 | cd ~/db-benchmark-metal && source path.env && python3 _setup_utils/install_all_solutions.py all
9 |
10 |
11 |
12 | # setup mount for clickhouse spill
13 | # sudo mkfs -t xfs /dev/nvme1n1
14 | # sudo mkdir /var/lib/clickhouse-nvme-mount/
15 | # sudo mount /dev/nvme1n1 /var/lib/clickhouse-nvme-mount/
16 | # # not sure if below is necessary.
17 | # sudo cp -a /var/lib/clickhouse/. /var/lib/clickhouse-nvme-mount/
18 | # # change ownership of new mount to clickhouse
19 | # sudo chown -R clickhouse:clickhouse /var/lib/clickhouse-nvme-mount/
20 | # sudo chown -R clickhouse:clickhouse /dev/nvme1n1
21 |
22 | # # add config so clickhouse knows to use the mount to spill data
23 | # sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml
24 |
25 | echo "------------------------------------------"
26 | echo "------------------------------------------"
27 | echo "READY TO RUN BENCHMARK. ./run.sh"
28 |
--------------------------------------------------------------------------------
/_setup_utils/prep_solutions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import csv
4 |
5 | SOLUTIONS_FILENAME = "_control/solutions.csv"
6 | RUN_CONF_FILENAME = "run.conf"
7 |
8 | SKIPPED_SOLUTIONS = []
9 |
10 |
11 | def print_usage():
12 | print("Usage: python3 _utils/prep_solutions.py --task=[groupby|join]")
13 | exit(1)
14 |
15 | def parse_task():
16 | task = None
17 | for arg in sys.argv[1:]:
18 | if arg.startswith("--task="):
19 | task = arg.replace("--task=", "")
20 | if task is None or task not in ("groupby", "join"):
21 | print_usage()
22 | return task
23 |
24 | def parse_solution():
25 | solution = None
26 | for arg in sys.argv[1:]:
27 | if arg.startswith("--solution="):
28 | solution = arg.replace("--solution=", "")
29 | return solution
30 |
31 | def main():
32 | task = parse_task()
33 | solution = parse_solution()
34 | if solution == "all":
35 | solution = get_solutions(task)
36 | if solution == "clickhouse":
37 | solution = "clickhouse polars"
38 | update_run_conf_solutions(solution, task)
39 |
40 | def update_run_conf_solutions(solution_name_list, task):
41 | # change what solutions are run in run.conf
42 | os.system(f"sed 's/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"{solution_name_list}\"/g' run.conf > run_2.conf")
43 | os.system(f"sed 's/export RUN_TASKS=.*/export RUN_TASKS=\"{task}\"/g' run_2.conf > run_3.conf")
44 | os.system(f"sed 's/export DO_REPORT=.*/export DO_REPORT=false/g' run_3.conf > run.conf")
45 | os.remove('run_2.conf')
46 | os.remove('run_3.conf')
47 |
48 | def get_solutions(task):
49 | solutions_for_task = ""
50 | with open(SOLUTIONS_FILENAME, newline="") as solutions_file:
51 | solutions = csv.DictReader(solutions_file, delimiter=',')
52 | for row in solutions:
53 | if row['task'] == task and row['solution'] not in SKIPPED_SOLUTIONS:
54 | solutions_for_task += row['solution'] + " "
55 | return solutions_for_task.strip()
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
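60 |
61 | # A hypothetical example of the effect on run.conf (the actual solution list depends on
62 | # _control/solutions.csv). Running
63 | #   python3 _setup_utils/prep_solutions.py --task=groupby --solution=all
64 | # rewrites the relevant run.conf lines roughly to:
65 | #   export RUN_SOLUTIONS="data.table dplyr duckdb ..."
66 | #   export RUN_TASKS="groupby"
67 | #   export DO_REPORT=false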
--------------------------------------------------------------------------------
/_setup_utils/repro.sh:
--------------------------------------------------------------------------------
1 | # full repro on Ubuntu 22.04
2 |
3 | cd ~/h2oai-db-benchmark
4 |
5 | sudo apt-get -qq update
6 | sudo apt upgrade
7 |
8 | sudo apt-get -qq install -y lsb-release software-properties-common wget curl vim htop git byobu libcurl4-openssl-dev libssl-dev
9 | sudo apt-get -qq install -y libfreetype6-dev
10 | sudo apt-get -qq install -y libfribidi-dev
11 | sudo apt-get -qq install -y libharfbuzz-dev
12 | sudo apt-get -qq install -y git
13 | sudo apt-get -qq install -y libxml2-dev
14 | sudo apt-get -qq install -y make
15 | sudo apt-get -qq install -y libfontconfig1-dev
16 | sudo apt-get -qq install -y libicu-dev pandoc zlib1g-dev libgit2-dev libcurl4-openssl-dev libssl-dev libjpeg-dev libpng-dev libtiff-dev
17 | # sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
18 | sudo add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
19 | sudo apt-get -qq update
20 | sudo apt-get -qq install -y r-base-dev virtualenv
21 |
22 | cd /usr/local/lib/R
23 | sudo chmod o+w site-library
24 |
25 | cd ~
26 | mkdir -p .R
27 | echo 'CFLAGS=-O3 -mtune=native' >> ~/.R/Makevars
28 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars
29 |
30 | Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependencies=TRUE, repos="https://cloud.r-project.org")'
31 |
32 |
33 | # install dplyr
34 | Rscript -e 'devtools::install_github(c("tidyverse/readr","tidyverse/dplyr"))'
35 |
36 | # install data.table
37 | Rscript -e 'install.packages("data.table", repos="https://rdatatable.gitlab.io/data.table/")'
38 |
39 |
--------------------------------------------------------------------------------
/_setup_utils/setup_small.sh:
--------------------------------------------------------------------------------
1 | # full repro on Ubuntu 22.04
2 |
3 | # update the key
4 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 51716619E084DAB9
5 | ## Install libraries
6 |
7 | sudo apt-get -qq update
8 | sudo apt upgrade
9 |
10 | sudo apt-get -qq install make
11 |
12 | sudo apt-get -qq install wget curl openssl build-essential
13 | sudo apt-get -qq install -y r-base-dev virtualenv
14 | sudo apt-get -qq install openjdk-17-jdk
15 |
16 | sudo apt-get install -y zlib1g-dev
17 | sudo apt-get install -y pandoc unzip
18 |
19 | # update virtualenv
20 | python3 -m pip install virtualenv
21 |
22 | # sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
23 | # sudo add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
24 |
25 | sudo chmod o+w /usr/local/lib/R/site-library
26 |
27 | Rscript -e 'install.packages(c("data.table", "dplyr", "knitr", "bit64"), dependencies=TRUE, repos="https://cloud.r-project.org")'
28 |
29 | mkdir -p ~/.R
30 | echo 'CFLAGS=-O3 -mtune=native' >> ~/.R/Makevars
31 | echo 'CXXFLAGS=-O3 -mtune=native' >> ~/.R/Makevars
--------------------------------------------------------------------------------
/_setup_utils/sleep_and_run.sh:
--------------------------------------------------------------------------------
1 | while [ -f run.lock ]
2 | do
3 | sleep 1800
4 | done
5 |
6 |
7 | rm -f run.lock
8 |
9 | ./run.sh
10 |
--------------------------------------------------------------------------------
/_utils/answers-validation.R:
--------------------------------------------------------------------------------
1 | source("report.R")
2 | d = time_logs()
3 |
4 | # this script is meant to detect inconsistencies within a single solution's results and between solutions' results
5 | # note that known exceptions have already been filtered out in report.R in the clean_time function
6 |
7 | check = list()
8 |
9 | # detect lack of consistency in query output within single benchmark runs within each solution separately
10 | grain = c("solution","task","data","iquestion")
11 | d[!is.na(out_rows), .(unqn_out_rows=uniqueN(out_rows), unq_out_rows=paste(unique(out_rows), collapse=",")), by=grain
12 | ][unqn_out_rows>1L
13 | ] -> check[["solution_out_rows"]]
14 |
15 | # detect lack of out_rows match in query output between solutions
16 | grain = c("task","data","iquestion","question")
17 | d[!is.na(out_rows), .(unqn_out_rows=uniqueN(out_rows), unq_out_rows=paste(unique(out_rows), collapse=",")), by=grain
18 | ][unqn_out_rows>1L
19 | ] -> check[["out_rows"]]
20 | # detect lack of chk match in query output between median chk from all solutions with tolerance=0.005
21 | chk_check = function(chk, tolerance=sqrt(.Machine$double.eps)) {
22 | len = unique(sapply(chk, length))
23 | if (length(len)!=1L) stop("some solutions return chk for fewer variables than others")
24 | med = sapply(seq.int(len), function(i) median(sapply(chk, `[[`, i)))
25 | eq_txt = sapply(chk, all.equal, med, tolerance=tolerance, simplify=FALSE)
26 | #if (any(!sapply(eq_txt, isTRUE))) browser()
27 | eq = sapply(eq_txt, isTRUE)
28 | ans = list()
29 | ans$n_match = sum(eq)
30 | ans$n_mismatch = sum(!eq)
31 | ans$med_chk = paste0(format(med, scientific=FALSE, trim=TRUE), collapse=";")
32 | ans$sol_mismatch = if (!ans$n_mismatch) NA_character_ else paste0(names(eq)[!eq], collapse=",")
33 | ans$sol_chk_mismatch = if (!ans$n_mismatch) NA_character_ else paste(paste0(names(eq)[!eq], ":", sapply(sapply(chk[names(eq)[!eq]], format, scientific=FALSE, trim=TRUE, simplify=FALSE), paste, collapse=";")), collapse=",")
34 | ans
35 | }
36 | (if (nrow(check[["solution_chk"]])) NULL else { # only proceed if chk was not mismatched within a solution
37 | d[!is.na(chk) & solution!="cudf", # cudf chk validation disabled due to issue described in model_time() in report.R
38 | .(unqn_chk=uniqueN(chk), chk=unique(chk)), by=c("solution", grain)
39 | ][, if (any(unqn_chk>1L)) stop("this check should not be performed, should be escaped in 'if' branch") else .SD # ensure chk is unique
40 | ][, .(chk, chk_l=sapply(strsplit(chk, ";", fixed=TRUE), as.numeric, simplify=FALSE)), by=c("solution", grain)
41 | ][, chk_check(setNames(chk_l, solution), tolerance=0.005), keyby=grain
42 | ][n_mismatch>0L]
43 | }) -> check[["chk"]]
44 |
45 | # detect solutions for which chk calculation timing was relatively big comparing to query timing
46 | grain = c("solution","task","data","iquestion","question")
47 | d[, .(time_sec_1, chk_time_sec_1, time_sec_2, chk_time_sec_2, time_to_chk_1=time_sec_1/chk_time_sec_1, time_to_chk_2=time_sec_2/chk_time_sec_2), by=grain
48 | ][!(time_to_chk_1>2.5 & time_to_chk_2>2.5) # spark chk is only 2.6+ times faster than query
49 | ] -> check[["chk_time_sec"]]
50 |
51 | # print results
52 | if (any(sapply(check, nrow))) {
53 | cat("db-benchmark answers consistency check failed, see details below\n")
54 | print(check)
55 | } else {
56 | cat("db-benchmark answers consistency check successfully passed\n")
57 | }
58 |
--------------------------------------------------------------------------------
/_utils/compare-data.table.R:
--------------------------------------------------------------------------------
1 | source("_utils/time.R")
2 | if (system("tail -1 time.csv | cut -d',' -f2", intern=TRUE)!="1621364165")
3 | stop("time.csv and logs.csv should be as of 1621364165 batch run, filter out newer rows in those files")
4 |
5 | ## groupby ----
6 |
7 | d = tail.time("data.table", "groupby", i=c(1L, 2L))
8 | setnames(d, c("20210517_2f2f62d","20210518_2f2f62d"), c("th_40","th_20"))
9 | if (nrow(d[(is.na(th_40) & !is.na(th_20)) | (!is.na(th_40) & is.na(th_20))])) {
10 | stop("number of threads had an impact on completion of queries")
11 | } else {
12 | d = d[!is.na(th_40)]
13 | }
14 | d[, th_40_20:=th_40/th_20]
15 |
16 | ## improvement
17 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows)]
18 | # in_rows mean median
19 | #1: 1e7 1.0242721 0.9609988
20 | #2: 1e8 0.9378870 0.9455267
21 | #3: 1e9 0.9506561 0.9569359
22 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(knasorted)]
23 | # knasorted mean median
24 | #1: 1e2 cardinality factor, 0% NAs, unsorted data 1.0393667 0.9538973
25 | #2: 1e1 cardinality factor, 0% NAs, unsorted data 0.9521915 0.9544223
26 | #3: 2e0 cardinality factor, 0% NAs, unsorted data 0.9604950 0.9569359
27 | #4: 1e2 cardinality factor, 0% NAs, pre-sorted data 0.9371154 0.9487804
28 | #5: 1e2 cardinality factor, 5% NAs, unsorted data 0.9678192 0.9598999
29 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(question_group)]
30 | # question_group mean median
31 | #1: basic 0.9548596 0.9301310
32 | #2: advanced 0.9897345 0.9806791
33 |
34 | ## worst case by data
35 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.max(mean)]
36 | # in_rows knasorted mean median
37 | #1: 1e7 1e2 cardinality factor, 0% NAs, unsorted data 1.239259 0.9620776
38 | ## best case by data
39 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.min(mean)]
40 | # in_rows knasorted mean median
41 | #1: 1e8 1e2 cardinality factor, 0% NAs, unsorted data 0.9235102 0.9200373
42 |
43 | ## worst case for single question
44 | d[which.max(th_40_20)]
45 | # in_rows knasorted question_group question th_40 th_20 th_40_20
46 | #1: 1e7 1e2 cardinality factor, 0% NAs, unsorted data basic sum v1 by id1:id2 0.413 0.118 3.5
47 | ## best case for single question
48 | d[which.min(th_40_20)]
49 | # in_rows knasorted question_group question th_40 th_20 th_40_20
50 | #1: 1e9 1e2 cardinality factor, 5% NAs, unsorted data basic sum v1 mean v3 by id3 15.22 21.104 0.7211903
51 |
52 | ## join ----
53 |
54 | d = tail.time("data.table", "join", i=c(1L, 2L))
55 | setnames(d, c("20210517_2f2f62d","20210518_2f2f62d"), c("th_40","th_20"))
56 | if (nrow(d[(is.na(th_40) & !is.na(th_20)) | (!is.na(th_40) & is.na(th_20))])) {
57 | stop("number of threads had an impact on completion of queries")
58 | } else {
59 | d = d[!is.na(th_40)]
60 | }
61 | d[, th_40_20:=th_40/th_20]
62 |
63 | ## improvement
64 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows)]
65 | # in_rows mean median
66 | #1: 1e7 1.0149302 1.0000000
67 | #2: 1e8 0.9143243 0.9008573
68 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(knasorted)]
69 | # knasorted mean median
70 | #1: 0% NAs, unsorted data 0.9385902 0.9144130
71 | #2: 5% NAs, unsorted data 0.9612286 0.9294773
72 | #3: 0% NAs, pre-sorted data 0.9940629 0.9705720
73 |
74 | ## worst case by data
75 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.max(mean)]
76 | # in_rows knasorted mean median
77 | #1: 1e7 0% NAs, pre-sorted data 1.055906 1.05
78 | ## best case by data
79 | d[, .(mean=mean(th_40_20), median=median(th_40_20)), .(in_rows, knasorted)][which.min(mean)]
80 | # in_rows knasorted mean median
81 | #1: 1e8 0% NAs, unsorted data 0.8983325 0.8773762
82 |
83 | ## worst case for single question
84 | d[which.max(th_40_20)]
85 | # in_rows knasorted question th_40 th_20 th_40_20
86 | #1: 1e7 5% NAs, unsorted data medium inner on factor 0.513 0.443 1.158014
87 | ## best case for single question
88 | d[which.min(th_40_20)]
89 | # in_rows knasorted question th_40 th_20 th_40_20
90 | #1: 1e8 0% NAs, unsorted data medium outer on int 8.143 9.558 0.8519565
91 |
--------------------------------------------------------------------------------
/_utils/download_data.sh:
--------------------------------------------------------------------------------
1 |
2 | # get small data
3 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby_small.duckdb
4 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'G1_1e7_1e2_0_0.csv' (FORMAT CSV)"
5 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'G1_1e7_1e1_0_0.csv' (FORMAT CSV)"
6 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'G1_1e7_2e0_0_0.csv' (FORMAT CSV)"
7 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'G1_1e7_1e2_0_1.csv' (FORMAT CSV)"
8 | ~/duckdb groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'G1_1e7_1e2_5_0.csv' (FORMAT CSV)"
9 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'G1_1e8_1e2_0_0.csv' (FORMAT CSV)"
10 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'G1_1e8_1e1_0_0.csv' (FORMAT CSV)"
11 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'G1_1e8_2e0_0_0.csv' (FORMAT CSV)"
12 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'G1_1e8_1e2_0_1.csv' (FORMAT CSV)"
13 | ~/duckdb groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'G1_1e8_1e2_5_0.csv' (FORMAT CSV)"
14 |
15 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join_small.duckdb
16 |
17 | # get large data
18 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join_large.duckdb
19 |
20 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby_large.duckdb
21 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
22 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
23 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
24 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
25 | ~/duckdb groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'G1_1e9_1e2_5_0.csv' (FORMAT CSV)"
26 |
27 | # get 500GB data
28 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/join-500gb.duckdb
29 |
30 | # ???
31 | wget https://duckdb-blobs.s3.amazonaws.com/data/db-benchmark-data/groupby-500gb.duckdb
--------------------------------------------------------------------------------
/_utils/generate-data-small.sh:
--------------------------------------------------------------------------------
1 | # Generate the small (0.5GB) groupby and join datasets
2 |
3 | mkdir -p data
4 | cd data/
5 | Rscript ../_data/groupby-datagen.R 1e7 1e2 0 0
6 | Rscript ../_data/groupby-datagen.R 1e7 1e2 15 0
7 | Rscript ../_data/join-datagen.R 1e7 0 0 0
8 |
9 | cp G1_1e7_1e2_0_0.csv G1_1e9_1e2_0_0.csv
10 | cp J1_1e7_1e1_0_0.csv J1_1e9_1e3_0_0.csv
11 | cp J1_1e7_1e4_0_0.csv J1_1e9_1e6_0_0.csv
12 | cp J1_1e7_1e7_0_0.csv J1_1e9_1e9_0_0.csv
13 | cp J1_1e7_NA_0_0.csv J1_1e9_NA_0_0.csv
14 |
15 | cd ..
16 |
17 | # don't publish, we don't even have the keys
18 | sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf
19 |
20 | # set sizes
21 | mv _control/data.csv _control/data.csv.original
22 |
23 | echo "task,data,nrow,k,na,sort,active" > _control/data.csv
24 | echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv
25 | echo "groupby,G1_1e7_1e2_15_0,1e7,1e2,15,0,1" >> _control/data.csv
26 | echo "groupby,G1_1e9_1e2_0_0,1e9,1e2,0,0,1" >> _control/data.csv
27 | echo "join,J1_1e7_NA_0_0,1e7,NA,0,0,1" >> _control/data.csv
28 | echo "join,J1_1e9_NA_0_0,1e9,NA,0,0,1" >> _control/data.csv
29 |
--------------------------------------------------------------------------------
/_utils/groupby_k_factor.csv:
--------------------------------------------------------------------------------
1 | K,in_rows,question,out_rows
2 | 1e2,10000000,q1,100
3 | 1e2,10000000,q2,10000
4 | 1e2,10000000,q3,100000
5 | 1e2,10000000,q4,100
6 | 1e2,10000000,q5,100000
7 | 1e2,100000000,q1,100
8 | 1e2,100000000,q2,10000
9 | 1e2,100000000,q3,1000000
10 | 1e2,100000000,q4,100
11 | 1e2,100000000,q5,1000000
12 | 1e2,1000000000,q1,100
13 | 1e2,1000000000,q2,10000
14 | 1e2,1000000000,q3,10000000
15 | 1e2,1000000000,q4,100
16 | 1e2,1000000000,q5,10000000
17 | 1e1,10000000,q1,10
18 | 1e1,10000000,q2,100
19 | 1e1,10000000,q3,999951
20 | 1e1,10000000,q4,10
21 | 1e1,10000000,q5,999969
22 | 1e1,100000000,q1,10
23 | 1e1,100000000,q2,100
24 | 1e1,100000000,q3,9999518
25 | 1e1,100000000,q4,10
26 | 1e1,100000000,q5,9999512
27 | 1e1,1000000000,q1,10
28 | 1e1,1000000000,q2,100
29 | 1e1,1000000000,q3,99995425
30 | 1e1,1000000000,q4,10
31 | 1e1,1000000000,q5,99995357
32 | 2e0,10000000,q1,2
33 | 2e0,10000000,q2,4
34 | 2e0,10000000,q3,4323484
35 | 2e0,10000000,q4,2
36 | 2e0,10000000,q5,4323579
37 | 2e0,100000000,q1,2
38 | 2e0,100000000,q2,4
39 | 2e0,100000000,q3,43231389
40 | 2e0,100000000,q4,2
41 | 2e0,100000000,q5,43232226
42 | 2e0,1000000000,q1,2
43 | 2e0,1000000000,q2,4
44 | 2e0,1000000000,q3,431884560
45 | 2e0,1000000000,q4,2
46 | 2e0,1000000000,q5,431876300
47 |
--------------------------------------------------------------------------------
/_utils/maintainer.R:
--------------------------------------------------------------------------------
1 | timeleft = function() {
2 | l = data.table::fread("logs.csv")
3 | if (!nrow(l))
4 | stop("logs.csv files is empty")
5 | this = l[.N]
6 | if (this$action=="finish") {
7 | this[, cat(sprintf("%s %s %s must have just finished\n", solution, task, data))]
8 | quit("no")
9 | }
10 | stopifnot(this$action=="start")
11 | l = l[-.N][action!="skip", data.table::dcast(.SD, solution+task+data+batch~action, value.var="timestamp")]
12 | took = l[this, on=.(solution, task, data), nomatch=NULL, finish[.N]-start[.N]]
13 | if (!length(took) || is.na(took)) {
14 | this[, cat(sprintf("%s %s %s is running for the first time so it is unknown how much it will run\n", solution, task, data))]
15 | quit("no")
16 | }
17 | stopifnot(took>0)
18 | now = trunc(as.numeric(Sys.time()))
19 | this[, cat(sprintf("%s %s %s should take around %ss more\n", solution, task, data, trunc(took-(now-timestamp))))]
20 | q("no")
21 | }
22 |
--------------------------------------------------------------------------------
/_utils/maintainer.sh:
--------------------------------------------------------------------------------
1 | # returns the time left for the currently running script, useful after 'touch pause' or 'touch stop'
2 | timeleft() {
3 | if [ ! -f ./run.lock ]; then
4 | echo "benchmark is not running now" >&2 && return 1
5 | fi
6 | Rscript -e 'source("_utils/maintainer.R"); timeleft()'
7 | }
8 |
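9 | # Hypothetical usage sketch, assuming the benchmark was started via ./run.sh (which keeps run.lock
10 | # while running) and that the launcher honours 'pause'/'stop' control files:
11 | #   touch stop                                # ask the launcher to stop after the current script
12 | #   source _utils/maintainer.sh && timeleft   # estimate how long the current script will still run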
--------------------------------------------------------------------------------
/_utils/parse_time_logs.R:
--------------------------------------------------------------------------------
1 |
2 | source("./_report/report.R", chdir=TRUE)
3 | source("./_helpers/helpers.R", chdir=TRUE)
4 | source("./_benchplot/benchplot.R", chdir=TRUE)
5 | source("./_benchplot/benchplot-dict.R", chdir=TRUE)
6 | ld = time_logs()
--------------------------------------------------------------------------------
/_utils/partitioned_run.sh:
--------------------------------------------------------------------------------
1 | ./_run/run_small_medium.sh
2 | ./_run/run_large.sh
--------------------------------------------------------------------------------
/_utils/sql_to_check_timings/timing_checks.sql:
--------------------------------------------------------------------------------
1 | -- run this in duckdb
2 |
3 | create table timings as select * from read_csv_auto('reports/oct_25/time.csv');
4 |
5 |
6 | -- check what solutions might have bad out rows
7 | select t1.question, t1.data, t1.out_rows, t1.solution, t2.out_rows, t2.solution from
8 | timings t1, timings t2
9 | where t1.out_rows != t2.out_rows
10 | and t1.question = t2.question
11 | and t1.solution != 'clickhouse'
12 | and t2.solution != 'clickhouse'
13 | and t1.task = t2.task
14 | -- and t1.task = 'groupby'
15 | -- and t1.solution != 'arrow'
16 | -- and t2.solution != 'arrow'
17 | and t2.solution != 'datafusion'
18 | and t1.question != 'sum v3 count by id1:id6'
19 | and t1.data != 'G1_1e8_1e2_5_0'
20 | and t1.data = t2.data ;
21 |
22 |
23 | -- Value of 'chk' varies for different runs for single solution+question
24 | create table timings as select * from read_csv('time.csv');
25 |
26 | select t1.chk, t2.chk, t1.solution, t2.solution from
27 | timings t1, timings t2
28 | where t1.chk != t2.chk
29 | and t1.question = t2.question
30 | and t1.task = t2.task
31 | and t1.solution != 'datafusion'
32 | and t2.solution != 'datafusion'
33 | and t1.solution != 'arrow'
34 | and t2.solution != 'arrow'
35 | and t1.solution != 'R-arrow'
36 | and t2.solution != 'R-arrow'
37 | and t1.solution != 'collapse'
38 | and t1.solution = t2.solution
39 | and t1.data = t2.data group by all;
40 |
41 |
42 | select t1.question, t1.data, t1.out_rows, t2.solution, t2.out_rows from
43 | timings t1, timings t2
44 | where t1.out_rows != t2.out_rows
45 | and t1.question = t2.question
46 | and t1.solution != 'clickhouse'
47 | and t2.solution != 'clickhouse'
48 | and t1.question = 'medium outer on int'
49 | and t1.data = t2.data;
--------------------------------------------------------------------------------
/_utils/time.R:
--------------------------------------------------------------------------------
1 | source("./_report/report.R")
2 |
3 | download.time = function(file=c("logs.csv","time.csv"), from="https://h2oai.github.io/db-benchmark") {
4 | stopifnot(is.character(file), is.character(from), length(file)>=1L, length(from)==1L, !is.na(file), !is.na(from))
5 | if (all(file.exists(file))) {
6 | md5file = paste(file, "md5", sep=".")
7 | download.file(file.path(from, md5file), destfile=md5file)
8 | upstream = sapply(strsplit(sapply(setNames(md5file, file), readLines), split=" ", fixed=TRUE), `[[`, 1L)
9 | current = tools::md5sum(file)
10 | new = current[names(upstream)] != upstream
11 | file = names(new)[new]
12 | if (!length(file)) {
13 | cat("nothing to download, md5sum of local files match the upstream md5sum\n")
14 | return(invisible(NULL))
15 | }
16 | }
17 | download.file(file.path(from, file), destfile=file)
18 | return(invisible(NULL))
19 | }
20 |
21 | drop.data.table = function(x, cols) {
22 | ans = data.table:::shallow(x)
23 | un = sapply(cols, function(col) uniqueN(x[[col]]))
24 | rm = names(un)[un <= 1L]
25 | if (length(rm)) set(ans, NULL, rm, NULL) # Rdatatable/data.table#4086
26 | ans
27 | }
28 |
29 | tail.time = function(solution, task, n=2L, i=seq_len(n), drop=TRUE) {
30 | stopifnot(length(solution)==1L, length(task)==1L, length(n)==1L, n>0L, length(i)>=1L, all(i>=0L))
31 | if (!missing(n) && !missing(i)) stop("only 'n' or 'i' argument should be used, not both")
32 | ld = time_logs()
33 | s = solution
34 | t = task
35 | ld = ld[solution==s & task==t]
36 | ub = unique(ld$batch)
37 | i = i[i <= length(ub)] # there might be only N unq batches but N+1 requested
38 | if (!length(i)) stop("there are not enough registered runs for this solution and requested recent timings")
39 | b = rev(ub)[i]
40 | ans = dcast(
41 | ld[batch%in%b],
42 | in_rows + knasorted + question_group + question ~ paste(format(as.POSIXct(as.numeric(batch), origin="1970-01-01"), "%Y%m%d"), substr(git, 1, 7), sep="_"),
43 | value.var = "time_sec_1"
44 | )
45 | if (drop) ans = drop.data.table(ans, cols=c("in_rows","knasorted","question_group","question"))
46 | ans
47 | }
48 |
49 | compare.time = function(solutions, task, drop=TRUE) {
50 | stopifnot(length(solutions)>=1L, length(task)==1L)
51 | ld = time_logs()
52 | t = task
53 | ans = dcast(
54 | ld[script_recent==TRUE & solution%in%solutions & task==t],
55 | in_rows + knasorted + question_group + question ~ solution,
56 | value.var = "time_sec_1"
57 | )
58 | if (drop) ans = drop.data.table(ans, cols=c("in_rows","knasorted","question_group","question"))
59 | ans
60 | }
61 |
62 | ## maintainer mode
63 | #scp -C mr-dl11:~/git/db-benchmark/logs.csv ~/git/db-benchmark/logs.csv && scp -C mr-dl11:~/git/db-benchmark/time.csv ~/git/db-benchmark/time.csv
64 |
65 | ## user mode
66 | #download.time()
67 | #tail.time("juliadf", "groupby", i=c(1L, 2L))
68 | #tail.time("data.table", "groupby", i=c(1L, 2L))
69 | #compare.time(c("data.table","spark","pydatatable"), "join")
70 |
--------------------------------------------------------------------------------
/_utils/validate_no_errors.sh:
--------------------------------------------------------------------------------
1 | if [ $(grep -iE 'error|exception' out/run_*.err | wc -l) = 0 ]
2 | then
3 | # no lines matching 'error' or 'exception' found in the run logs
4 | echo "No Errors found in run_*.err logs"
5 | else
6 | echo "The following errors have been found. Failing check"
7 | grep -i "error|exception" out/*.err
8 | exit 1
9 | fi
10 |
11 |
12 |
13 | # check report generation. If this fails, the logs.csv/time.csv
14 | # have errors
15 | Rscript _utils/parse_time_logs.R 2> report_check.txt
16 | # https://gist.github.com/jesugmz/3fda0fc7c1006cedfe039ff1459c3174
17 | output=$(wc -l report_check.txt | awk '{ print $1 }')
18 | if [ $output -ne 0 ]
19 | then
20 | echo "report check not empty"
21 | cat report_check.txt
22 | exit 1
23 | fi
24 | echo "time.csv and logs.csv can be parsed"
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/arrow/VERSION:
--------------------------------------------------------------------------------
1 | 13.0.0.1
2 |
--------------------------------------------------------------------------------
/clickhouse/VERSION:
--------------------------------------------------------------------------------
1 | 25.1.3.23
--------------------------------------------------------------------------------
/clickhouse/ch.sh:
--------------------------------------------------------------------------------
1 | ch_installed() {
2 | dpkg-query -Wf'${db:Status-abbrev}' clickhouse-server 2>/dev/null | grep -q '^i'
3 | }
4 |
5 | ch_active() {
6 | clickhouse-client --user db_benchmark --query="SELECT 0;" > /dev/null 2>&1
7 | local ret=$?;
8 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi;
9 | }
10 |
11 | ch_wait() {
12 | for _ in $(seq 1 60); do if [[ $(wget -q 'localhost:8123' -O-) == 'Ok.' ]]; then break ; else sleep 1; fi ; done
13 | ch_active
14 | }
15 |
16 | ch_start() {
17 | echo '# ch_start: starting clickhouse-server'
18 | sudo service clickhouse-server start
19 | ch_wait
20 | }
21 |
22 | ch_stop() {
23 | echo '# ch_stop: stopping clickhouse-server'
24 | sudo service clickhouse-server stop && sleep 15
25 | }
26 |
27 | ch_query() {
28 | ENGINE=Memory
29 | if [ $COMPRESS -eq 1 ]; then
30 | ENGINE="Memory settings compress=1"
31 | fi
32 | if [ $ON_DISK -eq 1 ]; then
33 | ENGINE="MergeTree ORDER BY tuple()"
34 | fi
35 | sudo touch '/var/lib/clickhouse/flags/force_drop_table' && sudo chmod 666 '/var/lib/clickhouse/flags/force_drop_table'
36 | clickhouse-client --user db_benchmark --query "DROP TABLE IF EXISTS ans;"
37 | clickhouse-client --user db_benchmark --log_comment ${RUNNAME} --query "CREATE TABLE ans ENGINE = ${ENGINE} AS ${QUERY} SETTINGS max_insert_threads=${THREADS}, max_threads=${THREADS};"
38 | local ret=$?;
39 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi;
40 | clickhouse-client --user db_benchmark --query "SELECT * FROM ans LIMIT 3;"
41 | sudo touch '/var/lib/clickhouse/flags/force_drop_table' && sudo chmod 666 '/var/lib/clickhouse/flags/force_drop_table'
42 | clickhouse-client --user db_benchmark --query "DROP TABLE ans;"
43 | }
44 |
45 | ch_logrun() {
46 | clickhouse-client --user db_benchmark --query "SYSTEM FLUSH LOGS;"
47 | clickhouse-client --user db_benchmark --query "SELECT ${RUN} AS run, toUnixTimestamp(now()) AS timestamp, '${TASK}' AS task, '${SRC_DATANAME}' AS data_name, NULL AS in_rows, '${QUESTION}' AS question, result_rows AS out_rows, NULL AS out_cols, 'clickhouse' AS solution, version() AS version, NULL AS git, '${FUNCTION}' AS fun, query_duration_ms/1000 AS time_sec, memory_usage/1073741824 AS mem_gb, 1 AS cache, NULL AS chk, NULL AS chk_time_sec, 1 AS on_disk FROM system.query_log WHERE type='QueryFinish' AND log_comment='${RUNNAME}' ORDER BY query_start_time DESC LIMIT 1 FORMAT CSVWithNames;" > clickhouse/log/${RUNNAME}.csv
48 | local ret=$?;
49 | if [[ $ret -eq 0 ]]; then return 0; elif [[ $ret -eq 210 ]]; then return 1; else echo "Unexpected return code from clickhouse-client: $ret" >&2 && return 1; fi;
50 | }
51 |
52 | ch_make_2_runs() {
53 | RUN=1
54 | RUNNAME="${TASK}_${SRC_DATANAME}_q${Q}_r${RUN}"
55 | ch_query
56 | ch_logrun
57 |
58 | if [ $COMPRESS -eq 1 ]; then
59 | # It will take some time for memory freed by Memory engine to be returned back to the system.
60 | # Without a sleep we might get a MEMORY_LIMIT exception during the second run of the query.
61 | # It is done only when $COMPRESS=1 because this variable is set to true only for the largest dataset.
62 | sleep 60
63 | fi
64 |
65 | RUN=2
66 | RUNNAME="${TASK}_${SRC_DATANAME}_q${Q}_r${RUN}"
67 | ch_query
68 | ch_logrun
69 | }
70 |
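71 | # Hypothetical standalone usage of the helpers above; in the benchmark these variables are
72 | # presumably exported by the exec/launcher scripts before groupby-clickhouse.sh is sourced:
73 | #   export SRC_DATANAME=G1_1e7_1e2_0_0 TASK=groupby THREADS=32 COMPRESS=0 ON_DISK=0
74 | #   source clickhouse/ch.sh && ch_start
75 | #   Q=1 QUESTION="sum v1 by id1" QUERY="SELECT id1, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1"
76 | #   ch_make_2_runs    # runs the query twice and writes a timing csv per run to clickhouse/log/
77 | #   ch_stop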
--------------------------------------------------------------------------------
/clickhouse/clickhouse-misc.sh:
--------------------------------------------------------------------------------
1 | CREATE USER IF NOT EXISTS db_benchmark IDENTIFIED WITH no_password SETTINGS max_memory_usage = 28000000000 WRITABLE;
2 | GRANT select, insert, create, alter, alter user, drop on *.* to db_benchmark;
3 |
4 | ALTER USER db_benchmark SETTINGS max_memory_usage_for_user = 28000000000;
5 |
6 |
7 | CREATE TABLE G1_1e9_1e1_0_0 (id1 LowCardinality(Nullable(String)), id2 LowCardinality(Nullable(String)), id3 Nullable(String), id4 Nullable(Int32), id5 Nullable(Int32), id6 Nullable(Int32), v1 Nullable(Int32), v2 Nullable(Int32), v3 Nullable(Float64)) ENGINE = MergeTree() ORDER BY tuple();
8 |
9 | INSERT INTO G1_1e9_1e1_0_0 FROM INFILE 'data/G1_1e9_1e1_0_0.csv';
10 |
11 | # q1
12 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, sum(v1) AS v1 FROM G1_1e9_1e1_0_0 GROUP BY id1 SETTINGS max_insert_threads=32, max_threads=32;
13 |
14 | drop table if exists ans;
15 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, id2, sum(v1) AS v1 FROM G1_1e9_1e1_0_0 GROUP BY id1, id2 SETTINGS max_insert_threads=32, max_threads=32;
16 |
17 | drop table if exists ans;
18 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id3 SETTINGS max_insert_threads=16, max_threads=16;
19 |
20 | drop table if exists ans;
21 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id4, avg(v1) AS v1, avg(v2) AS v2, avg(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id4 SETTINGS max_insert_threads=32, max_threads=32;
22 |
23 | drop table if exists ans;
24 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM G1_1e9_1e1_0_0 GROUP BY id6 SETTINGS max_insert_threads=32, max_threads=32;
25 |
26 | drop table if exists ans;
27 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id4, id5, medianExact(v3) AS median_v3, stddevPop(v3) AS sd_v3 FROM G1_1e9_1e1_0_0 GROUP BY id4, id5 SETTINGS max_insert_threads=32, max_threads=32;
28 |
29 | drop table if exists ans;
30 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id3, max(v1) - min(v2) AS range_v1_v2 FROM G1_1e9_1e1_0_0 GROUP BY id3 SETTINGS max_insert_threads=32, max_threads=32;
31 |
32 | drop table if exists ans;
33 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM G1_1e9_1e1_0_0 WHERE v3 IS NOT NULL) AS subq GROUP BY id6 SETTINGS max_insert_threads=32, max_threads=32;
34 |
35 | drop table if exists ans;
36 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM G1_1e9_1e1_0_0 GROUP BY id2, id4 SETTINGS max_insert_threads=32, max_threads=32;
37 |
38 | drop table if exists ans;
39 |
40 | #q10
41 |
42 | CREATE TABLE ans ENGINE = MergeTree ORDER BY tuple() AS SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM G1_1e9_1e1_0_0 GROUP BY id1, id2, id3, id4, id5, id6 SETTINGS max_insert_threads=32, max_threads=32;
--------------------------------------------------------------------------------
/clickhouse/clickhouse-mount-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | /var/lib/mount/clickhouse-nvme-mount/
4 | 0
5 | 0
6 |
7 |
--------------------------------------------------------------------------------
/clickhouse/clickhouse-parse-log.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# clickhouse-parse-log.R: starting to parse timings from clickhouse/log/.\n")
4 |
5 | source("./_helpers/helpers.R")
6 | args = commandArgs(TRUE) # args = c("groupby","G1_1e6_1e2_0_0")
7 | stopifnot(length(args)==2L)
8 | task = args[1L]
9 | data_name = args[2L]
10 | machine_type = Sys.getenv("MACHINE_TYPE")
11 |
12 | library(data.table)
13 | # sort files according to question and run
14 | sort_q_r = function(f) {
15 | tmp = strsplit(tools::file_path_sans_ext(basename(f)), "_", fixed=TRUE)
16 | if (length(len<-unique(lengths(tmp)))!=1L)
17 | stop("files names for some of logs differs in number of underscores, it should be clickhouse/log/[task]_[data_name]_q[i]_r[j].csv")
18 | stopifnot(len>1L)
19 | qr = rbindlist(lapply(lapply(tmp, `[`, c(len-1L,len)), function(x) {
20 | stopifnot(substr(x[1L], 1L, 1L)=="q", substr(x[2L], 1L, 1L)=="r")
21 | list(q=as.integer(substr(x[1L], 2L, nchar(x[1L]))), r=as.integer(substr(x[2L], 2L, nchar(x[2L]))))
22 | }))
23 | o = data.table:::forderv(qr) ## https://github.com/Rdatatable/data.table/issues/3447
24 | if (!length(o)) f else f[o]
25 | }
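   | # illustrative example (not taken from an actual run): given files such as
   | #   groupby_G1_1e9_1e1_0_0_q10_r2.csv and groupby_G1_1e9_1e1_0_0_q2_r1.csv,
   | # sort_q_r() orders them numerically by question then run, so q2 sorts before q10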
26 | fcsv = list.files("clickhouse/log", sprintf("^%s_%s_q.*\\.csv$", task, data_name), full.names=TRUE)
27 | if (!length(fcsv))
28 | stop("no log files produced, did you run clickhouse sql script that will output such to clickhouse/log/[task]_[data_name]_q[i]_r[j].csv")
29 | fcsv = sort_q_r(fcsv)
30 | d = rbindlist(lapply(fcsv, fread, na.strings="\\N")) # fill=TRUE for debugging type column in some queries
31 | if (!nrow(d))
32 | stop("timing log files empty")
33 | stopifnot(all(d$task==task), all(d$data_name==data_name))
34 | .in_rows = strsplit(data_name, "_", fixed=TRUE)[[1L]][[2L]] ## taken from data_name because for the join task CH would sum in_rows from both tables
35 | d[,
36 | write.log(run=as.integer(run), timestamp=as.numeric(timestamp), task=as.character(task), data=as.character(data_name), in_rows=as.numeric(.in_rows), question=as.character(question),
37 | out_rows=as.numeric(NA), out_cols=as.integer(NA), solution=as.character(solution), version=as.character(version), git=as.character(NA), fun=as.character(fun),
38 | time_sec=as.numeric(time_sec), mem_gb=as.numeric(NA), cache=as.logical(cache), chk=as.character(NA), chk_time_sec=as.numeric(NA), on_disk=as.logical(on_disk), machine_type=as.character(machine_type)),
39 | by = seq_len(nrow(d))] -> nul
40 |
41 | cat("# clickhouse-parse-log.R: parsing timings to time.csv finished\n")
42 |
43 | if (!interactive()) q("no")
44 |
--------------------------------------------------------------------------------
/clickhouse/groupby-clickhouse.sh:
--------------------------------------------------------------------------------
1 | source ./clickhouse/ch.sh
2 |
3 | SOLUTION=clickhouse
4 | TASK=groupby
5 |
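   | # each question below sets Q, QUESTION and QUERY and then calls ch_make_2_runs, a helper
   | # defined in clickhouse/ch.sh (not shown here); presumably it runs the query twice against
   | # ${SRC_DATANAME} via clickhouse-client and writes timing logs to clickhouse/log/ for
   | # clickhouse-parse-log.R to pick up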
6 | # /* q1: question='sum v1 by id1' */
7 |
8 | Q=1
9 | QUESTION="sum v1 by id1"
10 | QUERY="SELECT id1, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1"
11 |
12 | ch_make_2_runs
13 |
14 | # /* q2: question='sum v1 by id1:id2' */
15 | Q=2
16 | QUESTION="sum v1 by id1:id2"
17 | QUERY="SELECT id1, id2, sum(v1) AS v1 FROM ${SRC_DATANAME} GROUP BY id1, id2"
18 |
19 | ch_make_2_runs
20 |
21 | # /* q3: question='sum v1 mean v3 by id3' */
22 | Q=3
23 | QUESTION="sum v1 mean v3 by id3"
24 | QUERY="SELECT id3, sum(v1) AS v1, avg(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id3"
25 |
26 | ch_make_2_runs
27 |
28 | # /* q4: question='mean v1:v3 by id4' */
29 | Q=4
30 | QUESTION="mean v1:v3 by id4"
31 | QUERY="SELECT id4, avg(v1) AS v1, avg(v2) AS v2, avg(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id4"
32 |
33 | ch_make_2_runs
34 |
35 | # /* q5: question='sum v1:v3 by id6' */
36 | Q=5
37 | QUESTION="sum v1:v3 by id6"
38 | QUERY="SELECT id6, sum(v1) AS v1, sum(v2) AS v2, sum(v3) AS v3 FROM ${SRC_DATANAME} GROUP BY id6"
39 |
40 | ch_make_2_runs
41 |
42 | # /* q6: question='median v3 sd v3 by id4 id5' */
43 | Q=6
44 | QUESTION="median v3 sd v3 by id4 id5"
45 | QUERY="SELECT id4, id5, medianExact(v3) AS median_v3, stddevPop(v3) AS sd_v3 FROM ${SRC_DATANAME} GROUP BY id4, id5"
46 |
47 | ch_make_2_runs
48 |
49 | # /* q7: question='max v1 - min v2 by id3' */
50 | Q=7
51 | QUESTION="max v1 - min v2 by id3"
52 | QUERY="SELECT id3, max(v1) - min(v2) AS range_v1_v2 FROM ${SRC_DATANAME} GROUP BY id3"
53 |
54 | ch_make_2_runs
55 |
56 | # /* q8: question='largest two v3 by id6' */
57 | Q=8
58 | QUESTION="largest two v3 by id6"
59 | QUERY="SELECT id6, arrayJoin(arraySlice(arrayReverseSort(groupArray(v3)), 1, 2)) AS v3 FROM (SELECT id6, v3 FROM ${SRC_DATANAME} WHERE v3 IS NOT NULL) AS subq GROUP BY id6"
60 |
61 | ch_make_2_runs
62 |
63 | # /* q9: question='regression v1 v2 by id2 id4' */
64 | Q=9
65 | QUESTION="regression v1 v2 by id2 id4"
66 | QUERY="SELECT id2, id4, pow(corr(v1, v2), 2) AS r2 FROM ${SRC_DATANAME} GROUP BY id2, id4"
67 |
68 | ch_make_2_runs
69 |
70 | # /* q10: question='sum v3 count by id1:id6' */
71 | Q=10
72 | QUESTION="sum v3 count by id1:id6"
73 | QUERY="SELECT id1, id2, id3, id4, id5, id6, sum(v3) AS v3, count() AS cnt FROM ${SRC_DATANAME} GROUP BY id1, id2, id3, id4, id5, id6"
74 |
75 | ch_make_2_runs
76 |
--------------------------------------------------------------------------------
/clickhouse/join-clickhouse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | source ./clickhouse/ch.sh
4 |
5 | SOLUTION=clickhouse
6 | TASK=join
7 |
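   | # SRC_DATANAME is the left-hand-side table; RHS1/RHS2/RHS3 are presumably the small,
   | # medium and big right-hand-side tables derived from it (exported by ch.sh / the launcher),
   | # mirroring the x/small/medium/big naming used by the R and Python join scripts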
8 | echo SRC ${SRC_DATANAME} RHS1 ${RHS1} RHS2 ${RHS2} RHS3 ${RHS3} COMPRESS ${COMPRESS} THREADS ${THREADS}
9 |
10 | # /* q1: question='small inner on int' */
11 | Q=1
12 | QUESTION="small inner on int"
13 | QUERY="SELECT id1, x.id2, x.id3, x.id4, y.id4, x.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS1} AS y USING (id1)"
14 | ch_make_2_runs
15 |
16 | # /* q2: question='medium inner on int' */
17 | Q=2
18 | QUESTION="medium inner on int"
19 | QUERY="SELECT x.id1, y.id1, id2, x.id3, x.id4, y.id4, x.id5, y.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS2} AS y USING (id2)"
20 | ch_make_2_runs
21 |
22 | # /* q3: question='medium outer on int' */
23 | Q=3
24 | QUESTION="medium outer on int"
25 | QUERY="SELECT x.id1, y.id1, id2, x.id3, x.id4, y.id4, x.id5, y.id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x LEFT JOIN ${RHS2} AS y USING (id2)"
26 | ch_make_2_runs
27 |
28 | # /* q4: question='medium inner on factor' */
29 | Q=4
30 | QUESTION="medium inner on factor"
31 | QUERY="SELECT x.id1, y.id1, x.id2, y.id2, x.id3, x.id4, y.id4, id5, x.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS2} AS y USING (id5)"
32 | ch_make_2_runs
33 |
34 | # /* q5: question='big inner on int' */
35 | Q=5
36 | QUESTION="big inner on int"
37 | QUERY="SELECT x.id1, y.id1, x.id2, y.id2, id3, x.id4, y.id4, x.id5, y.id5, x.id6, y.id6, x.v1, y.v2 FROM ${SRC_DATANAME} AS x INNER JOIN ${RHS3} AS y USING (id3)"
38 | ch_make_2_runs
39 |
--------------------------------------------------------------------------------
/clickhouse/setup-clickhouse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # install
3 | sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
4 | curl -fsSL 'https://packages.clickhouse.com/rpm/lts/repodata/repomd.xml.key' | sudo gpg --dearmor -o /usr/share/keyrings/clickhouse-keyring.gpg
5 |
6 | ARCH=$(dpkg --print-architecture)
7 | echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg arch=${ARCH}] https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
8 | sudo apt-get update
9 |
10 | sudo apt-get install -y clickhouse-server clickhouse-client
11 |
12 | # stop server if the service was already running
13 | sudo service clickhouse-server stop ||:
14 |
15 |
16 | # modify clickhouse settings so data is stored on the mount.
17 | sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/
18 | sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount
19 |
20 | # copy clickhouse config
21 | sudo cp -a /var/lib/clickhouse/. /var/lib/mount/clickhouse-nvme-mount/
22 | sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml
23 |
24 |
25 | # start server
26 | sudo rm -rf /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse-server/clickhouse-server.log
27 | sudo service clickhouse-server start
28 |
29 |
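   | # 0 leaves the corresponding ClickHouse setting unlimited; on the 32 GiB c6id.4xlarge
   | # machine, per-query memory is capped at ~28 GB and the group-by spills to disk once
   | # it uses ~20 GB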
30 | MEMORY_LIMIT=0
31 | BYTES_BEFORE_EXTERNAL_GROUP_BY=0
32 | if [[ $MACHINE_TYPE == "c6id.4xlarge" ]]; then
33 | MEMORY_LIMIT=28000000000
34 | BYTES_BEFORE_EXTERNAL_GROUP_BY=20000000000
35 | fi
36 |
37 | clickhouse-client --query "CREATE USER IF NOT EXISTS db_benchmark IDENTIFIED WITH no_password SETTINGS max_memory_usage = $MEMORY_LIMIT, max_bytes_before_external_group_by = $BYTES_BEFORE_EXTERNAL_GROUP_BY WRITABLE;"
38 | clickhouse-client --query "GRANT select, insert, create, alter, alter user, create table, truncate, drop, system flush logs on *.* to db_benchmark;"
39 |
40 | ./clickhouse/ver-clickhouse.sh
41 |
--------------------------------------------------------------------------------
/clickhouse/upg-clickhouse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade to the latest released version
5 | echo 'upgrading clickhouse-server clickhouse-client...'
6 | sudo apt-get install --only-upgrade clickhouse-server clickhouse-client
7 |
8 | if [[ $TEST_RUN != "true" ]]; then
9 | sudo chown ubuntu:ubuntu clickhouse/VERSION
10 | sudo chown ubuntu:ubuntu clickhouse/REVISION
11 | fi
12 |
13 |
14 | # modify clickhouse settings so data is stored on the mount.
15 | # This is necessary when clickhouse is already installed on a machine but the mount has lost all its data
16 | sudo mkdir -p /var/lib/mount/clickhouse-nvme-mount/
17 | sudo chown clickhouse:clickhouse /var/lib/mount/clickhouse-nvme-mount
18 |
19 | # copy clickhouse config
20 | sudo cp -a /var/lib/clickhouse/. /var/lib/mount/clickhouse-nvme-mount/
21 | sudo cp clickhouse/clickhouse-mount-config.xml /etc/clickhouse-server/config.d/data-paths.xml
22 |
23 |
24 | # start server
25 | sudo rm -rf /var/log/clickhouse-server/clickhouse-server.err.log /var/log/clickhouse-server/clickhouse-server.log
26 | sudo service clickhouse-server start
--------------------------------------------------------------------------------
/clickhouse/ver-clickhouse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./clickhouse/ch.sh # clickhouse helper scripts
5 |
6 | ch_installed && clickhouse-client --version-clean > clickhouse/VERSION && echo "" > clickhouse/REVISION
7 |
8 | if [[ $TEST_RUN != "true" ]]; then
9 | sudo chown ubuntu:ubuntu clickhouse/VERSION
10 | sudo chown ubuntu:ubuntu clickhouse/REVISION
11 | fi
--------------------------------------------------------------------------------
/collapse/VERSION:
--------------------------------------------------------------------------------
1 | 2.1.2
2 |
--------------------------------------------------------------------------------
/collapse/groupby2014-collapse.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# groupby2014-collapse.R\n")
4 |
5 | source("./_helpers/helpers.R")
6 |
7 | stopifnot(requireNamespace("data.table", quietly=TRUE)) # collapse does not support integer64. Oversized ints will be summed to double.
8 | .libPaths("./collapse/r-collapse") # tidyverse/collapse#4641
9 | suppressPackageStartupMessages(library("collapse", lib.loc="./collapse/r-collapse", warn.conflicts=FALSE))
10 | ver = packageVersion("collapse")
11 | git = "" # uses stable version now #124
12 | task = "groupby2014"
13 | solution = "collapse"
14 | fun = "group_by"
15 | cache = TRUE
16 | on_disk = FALSE
17 |
18 | data_name = Sys.getenv("SRC_DATANAME")
19 | src_grp = file.path("data", paste(data_name, "csv", sep="."))
20 | cat(sprintf("loading dataset %s\n", data_name))
21 |
22 | x = data.table::fread(src_grp, showProgress=FALSE, data.table=FALSE)
23 | print(nrow(x))
24 | gc()
25 |
26 | # Setting collapse options: namespace masking and performance
27 | oldopts <- set_collapse(nthreads = data.table::getDTthreads(),
28 | mask = "all",
29 | sort = endsWith(data_name, "_1"),
30 | na.rm = anyNA(num_vars(x)),
31 | stable.algo = FALSE)
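   | # notes on the options above: mask = "all" lets collapse's fast verbs mask their
   | # base R / dplyr counterparts (sum, mean, group_by, ...); sort is only requested for
   | # the pre-sorted "_1" datasets; na.rm is enabled only when the numeric columns
   | # actually contain NAs, so complete datasets skip the NA handling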
32 |
33 | task_init = proc.time()[["elapsed"]]
34 | cat("grouping...\n")
35 |
36 | question = "sum v1 by id1" # q1
37 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1, sum))))[["elapsed"]]
38 | m = memory_usage()
39 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]]
40 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
41 | rm(ans)
42 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1, sum))))[["elapsed"]]
43 | m = memory_usage()
44 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]]
45 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
46 | print(head(ans, 3))
47 | print(tail(ans, 3))
48 | rm(ans)
49 |
50 | question = "sum v1 by id1:id2" # q2
51 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1 + id2, sum))))[["elapsed"]]
52 | m = memory_usage()
53 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]]
54 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
55 | rm(ans)
56 | t = system.time(print(dim(ans<-collap(x, v1 ~ id1 + id2, sum))))[["elapsed"]]
57 | m = memory_usage()
58 | chkt = system.time(chk<-summarise(ans, v1=sum(v1)))[["elapsed"]]
59 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
60 | print(head(ans, 3))
61 | print(tail(ans, 3))
62 | rm(ans)
63 |
64 | question = "sum v1 mean v3 by id3" # q3
65 | t = system.time(print(dim(ans<-collap(x, ~ id3, custom = list(sum = "v1", mean = "v3")))))[["elapsed"]]
66 | m = memory_usage()
67 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v3=sum(v3)))[["elapsed"]]
68 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
69 | rm(ans)
70 | t = system.time(print(dim(ans<-collap(x, ~ id3, custom = list(sum = "v1", mean = "v3")))))[["elapsed"]]
71 | m = memory_usage()
72 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v3=sum(v3)))[["elapsed"]]
73 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
74 | print(head(ans, 3))
75 | print(tail(ans, 3))
76 | rm(ans)
77 |
78 | question = "mean v1:v3 by id4" # q4
79 | t = system.time(print(dim(ans<-x |> group_by(id4) |> select(v1:v3) |> mean())))[["elapsed"]]
80 | m = memory_usage()
81 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]]
82 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
83 | rm(ans)
84 | t = system.time(print(dim(ans<-x |> group_by(id4) |> select(v1:v3) |> mean())))[["elapsed"]]
85 | m = memory_usage()
86 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]]
87 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
88 | print(head(ans, 3))
89 | print(tail(ans, 3))
90 | rm(ans)
91 |
92 | question = "sum v1:v3 by id6" # q5
93 | t = system.time(print(dim(ans<-x |> group_by(id6) |> select(v1:v3) |> sum())))[["elapsed"]]
94 | m = memory_usage()
95 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]]
96 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
97 | rm(ans)
98 | t = system.time(print(dim(ans<-x |> group_by(id6) |> select(v1:v3) |> sum())))[["elapsed"]]
99 | m = memory_usage()
100 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]]
101 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
102 | print(head(ans, 3))
103 | print(tail(ans, 3))
104 | rm(ans)
105 |
106 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))
107 |
108 | set_collapse(oldopts)
109 |
110 | if( !interactive() ) q("no", status=0)
111 |
--------------------------------------------------------------------------------
/collapse/setup-collapse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install stable collapse
5 | mkdir -p ./collapse/r-collapse
6 | Rscript -e 'install.packages(c("Rcpp", "collapse"), lib="./collapse/r-collapse", repos = "http://cloud.r-project.org")'
7 |
8 | ./collapse/ver-collapse.sh
--------------------------------------------------------------------------------
/collapse/upg-collapse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade all packages in the collapse library, but only if a new collapse release is out
5 | echo 'upgrading collapse...'
6 | Rscript -e 'ap=available.packages(); if (ap["collapse","Version"]!=packageVersion("collapse", lib.loc="./collapse/r-collapse")) update.packages(lib.loc="./collapse/r-collapse", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
7 |
--------------------------------------------------------------------------------
/collapse/ver-collapse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | Rscript -e 'v=read.dcf(system.file(package="collapse", lib.loc="./collapse/r-collapse", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("collapse", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
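   | # the one-liner above reads Version and RemoteSha from the DESCRIPTION of the privately
   | # installed collapse library and writes them to collapse/VERSION and collapse/REVISION
   | # (REVISION is empty for CRAN installs, which carry no RemoteSha)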
5 |
--------------------------------------------------------------------------------
/dask/VERSION:
--------------------------------------------------------------------------------
1 | 2024.9.0
--------------------------------------------------------------------------------
/dask/common.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gc
3 | import os
4 | import logging
5 | import timeit
6 | from abc import ABC, abstractmethod
7 | from typing import Iterable, Any
8 |
9 | import dask.dataframe as dd
10 | from dask import distributed
11 |
12 | logging.basicConfig(
13 | level=logging.INFO,
14 | format='{ %(name)s:%(lineno)d @ %(asctime)s } - %(message)s'
15 | )
16 | logger = logging.getLogger(__name__)
17 |
18 | THIS_DIR = os.path.abspath(
19 | os.path.dirname(__file__)
20 | )
21 | HELPERS_DIR = os.path.abspath(
22 | os.path.join(
23 | THIS_DIR, '../_helpers'
24 | )
25 | )
26 | sys.path.extend((THIS_DIR, HELPERS_DIR))
27 | from helpers import *
28 |
29 | class Query(ABC):
30 | question: str = None
31 |
32 | @staticmethod
33 | @abstractmethod
34 | def query(*args) -> dd.DataFrame:
35 | pass
36 |
37 | @staticmethod
38 | @abstractmethod
39 | def check(ans: dd.DataFrame) -> Any:
40 | pass
41 |
42 | @classmethod
43 | def name(cls) -> str:
44 | return f"{cls.__name__}: {cls.question}"
45 |
46 | class QueryRunner:
47 | def __init__(
48 | self,
49 | task: str,
50 | solution: str,
51 | solution_version: str,
52 | solution_revision: str,
53 | fun: str,
54 | cache: str,
55 | on_disk: bool
56 | ):
57 | self.task = task
58 | self.solution = solution
59 | self.solution_version = solution_version
60 | self.solution_revision = solution_revision
61 | self.fun = fun
62 | self.cache = cache
63 | self.on_disk = on_disk
64 |
65 | def run_query(
66 | self,
67 | data_name: str,
68 | in_rows: int,
69 | args: Iterable[Any],
70 | query: Query,
71 | machine_type: str,
72 | runs: int = 2,
73 | raise_exception: bool = False,
74 | ):
75 | logger.info("Running '%s'" % query.name())
76 |
77 | try:
78 | for run in range(1, runs+1):
79 | gc.collect() # TODO: Able to do this in worker processes? Want to?
80 |
81 | # Calculate ans
82 | t_start = timeit.default_timer()
83 | ans = query.query(*args)
84 | logger.debug("Answer shape: %s" % (ans.shape, ))
85 | t = timeit.default_timer() - t_start
86 | m = memory_usage()
87 |
88 | logger.info("\tRun #%s: %0.3fs" % (run, t))
89 |
90 | # Calculate chk
91 | t_start = timeit.default_timer()
92 | chk = query.check(ans)
93 | chkt = timeit.default_timer() - t_start
94 |
95 |
96 | write_log(
97 | task=self.task,
98 | data=data_name,
99 | in_rows=in_rows,
100 | question=query.question,
101 | out_rows=ans.shape[0],
102 | out_cols=ans.shape[1],
103 | solution=self.solution,
104 | version=self.solution_version,
105 | git=self.solution_revision,
106 | fun=self.fun,
107 | run=run,
108 | time_sec=t,
109 | mem_gb=m,
110 | cache=self.cache,
111 | chk=make_chk(chk),
112 | chk_time_sec=chkt,
113 | on_disk=self.on_disk,
114 | machine_type=machine_type
115 | )
116 | if run == runs:
117 | # Print head / tail on last run
118 | logger.debug("Answer head:\n%s" % ans.head(3))
119 | logger.debug("Answer tail:\n%s" % ans.tail(3))
120 | del ans
121 | except Exception as err:
122 | logger.error("Query '%s' failed!" % query.name())
123 | print(err)
124 |
125 | # Re-raise if instructed
126 | if raise_exception:
127 | raise err
128 |
129 | def dask_client() -> distributed.Client:
130 | # we use a process pool instead of a thread pool because of the cost of the GIL
131 | return distributed.Client(processes=True, silence_logs=logging.ERROR)
132 |
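   | # Illustrative sketch (not part of the original module): how a task script such as
   | # groupby_dask.py is expected to use these helpers. Column names, the dataset name
   | # and parameter values below are assumptions for the example only.
   | #
   | # class Q1(Query):
   | #     question = "sum v1 by id1"  # q1
   | #
   | #     @staticmethod
   | #     def query(x):
   | #         # .compute() so the timed work includes materialising the result
   | #         return x.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute()
   | #
   | #     @staticmethod
   | #     def check(ans):
   | #         return [ans["v1"].sum()]
   | #
   | # runner = QueryRunner(task="groupby", solution="dask", solution_version=dask.__version__,
   | #                      solution_revision=dask.__git_revision__, fun=".groupby",
   | #                      cache="TRUE", on_disk=False)
   | # runner.run_query(data_name="G1_1e7_1e2_0_0", in_rows=10_000_000, args=[x], query=Q1,
   | #                  machine_type=os.environ.get("MACHINE_TYPE", ""))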
--------------------------------------------------------------------------------
/dask/setup-dask.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | virtualenv dask/py-dask --python=python3
5 | source dask/py-dask/bin/activate
6 |
7 | # install binaries
8 | python3 -m pip install "dask[complete]"
9 |
10 | # check
11 | # python3
12 | # import dask as dk
13 | # dk.__version__
14 | # dk.__git_revision__
15 | # quit()
16 |
17 | deactivate
18 |
19 | ./dask/ver-dask.sh
20 |
--------------------------------------------------------------------------------
/dask/upg-dask.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading dask...'
5 |
6 | source ./dask/py-dask/bin/activate
7 |
8 | python3 -m pip install --upgrade dask[complete] > /dev/null
9 |
--------------------------------------------------------------------------------
/dask/ver-dask.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./dask/py-dask/bin/activate
5 | python3 -c 'import dask as dk; open("dask/VERSION","w").write(dk.__version__); open("dask/REVISION","w").write(dk.__git_revision__);' > /dev/null
6 |
--------------------------------------------------------------------------------
/datafusion/VERSION:
--------------------------------------------------------------------------------
1 | 47.0.0
--------------------------------------------------------------------------------
/datafusion/setup-datafusion.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | virtualenv datafusion/py-datafusion --python=python3
5 | source datafusion/py-datafusion/bin/activate
6 |
7 | python3 -m pip install --upgrade psutil datafusion pandas
8 |
9 | # build
10 | deactivate
11 | ./datafusion/upg-datafusion.sh
12 |
13 | ./datafusion/ver-datafusion.sh
14 |
15 | # check
16 | # source datafusion/py-datafusion/bin/activate
17 | # python3
18 | # import datafusion as df
19 | # df.__version__
20 | # quit()
21 | # deactivate
22 |
23 | # fix: print(ans.head(3), flush=True): UnicodeEncodeError: 'ascii' codec can't encode characters in position 14-31: ordinal not in range(128)
24 | # vim datafusion/py-datafusion/bin/activate
25 | #deactivate () {
26 | # unset PYTHONIOENCODING
27 | # ...
28 | #}
29 | #...
30 | #PYTHONIOENCODING="utf-8"
31 | #export PYTHONIOENCODING
32 | #...
33 |
--------------------------------------------------------------------------------
/datafusion/upg-datafusion.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading datafusion...'
5 |
6 | source ./datafusion/py-datafusion/bin/activate
7 |
8 | python -m pip install --upgrade datafusion > /dev/null
9 |
10 | deactivate
--------------------------------------------------------------------------------
/datafusion/ver-datafusion.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | source ./datafusion/py-datafusion/bin/activate
3 | python3 -c 'import datafusion as df; open("datafusion/VERSION","w").write(df.__version__); open("datafusion/REVISION","w").write("");' > /dev/null
4 |
--------------------------------------------------------------------------------
/datatable/VERSION:
--------------------------------------------------------------------------------
1 | 1.16.99
2 |
--------------------------------------------------------------------------------
/datatable/groupby2014-datatable.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# groupby2014-datatable.R\n")
4 |
5 | source("./_helpers/helpers.R")
6 |
7 | stopifnot(requireNamespace(c("bit64"), quietly=TRUE)) # used in chk to sum numeric columns
8 | suppressPackageStartupMessages(library("data.table", lib.loc="./datatable/r-datatable"))
9 | setDTthreads(0L)
10 | ver = packageVersion("data.table")
11 | git = data.table:::.git(quiet=TRUE)
12 | task = "groupby2014"
13 | solution = "data.table"
14 | fun = "[.data.table"
15 | cache = TRUE
16 | on_disk = FALSE
17 |
18 | data_name = Sys.getenv("SRC_DATANAME")
19 | src_grp = file.path("data", paste(data_name, "csv", sep="."))
20 | cat(sprintf("loading dataset %s\n", data_name))
21 |
22 | x = fread(src_grp, showProgress=FALSE)
23 | print(nrow(x))
24 |
25 | task_init = proc.time()[["elapsed"]]
26 | cat("grouping...\n")
27 |
28 | question = "sum v1 by id1" # q1
29 | t = system.time(print(dim(ans<-x[, sum(v1), keyby=id1])))[["elapsed"]]
30 | m = memory_usage()
31 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]]
32 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
33 | rm(ans)
34 | t = system.time(print(dim(ans<-x[, sum(v1), keyby=id1])))[["elapsed"]]
35 | m = memory_usage()
36 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]]
37 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
38 | print(head(ans, 3))
39 | print(tail(ans, 3))
40 | rm(ans)
41 |
42 | question = "sum v1 by id1:id2" # q2
43 | t = system.time(print(dim(ans<-x[, sum(v1), keyby='id1,id2'])))[["elapsed"]]
44 | m = memory_usage()
45 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]]
46 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
47 | rm(ans)
48 | t = system.time(print(dim(ans<-x[, sum(v1), keyby='id1,id2'])))[["elapsed"]]
49 | m = memory_usage()
50 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)))])[["elapsed"]]
51 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
52 | print(head(ans, 3))
53 | print(tail(ans, 3))
54 | rm(ans)
55 |
56 | question = "sum v1 mean v3 by id3" # q3
57 | t = system.time(print(dim(ans<-x[, list(sum(v1), mean(v3)), keyby=id3])))[["elapsed"]]
58 | m = memory_usage()
59 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)), sum(V2))])[["elapsed"]]
60 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
61 | rm(ans)
62 | t = system.time(print(dim(ans<-x[, list(sum(v1), mean(v3)), keyby=id3])))[["elapsed"]]
63 | m = memory_usage()
64 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(V1)), sum(V2))])[["elapsed"]]
65 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
66 | print(head(ans, 3))
67 | print(tail(ans, 3))
68 | rm(ans)
69 |
70 | question = "mean v1:v3 by id4" # q4
71 | t = system.time(print(dim(ans<-x[, lapply(.SD, mean), keyby=id4, .SDcols=7:9])))[["elapsed"]]
72 | m = memory_usage()
73 | chkt = system.time(chk<-ans[, .(sum(v1), sum(v2), sum(v3))])[["elapsed"]]
74 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
75 | rm(ans)
76 | t = system.time(print(dim(ans<-x[, lapply(.SD, mean), keyby=id4, .SDcols=7:9])))[["elapsed"]]
77 | m = memory_usage()
78 | chkt = system.time(chk<-ans[, .(sum(v1), sum(v2), sum(v3))])[["elapsed"]]
79 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
80 | print(head(ans, 3))
81 | print(tail(ans, 3))
82 | rm(ans)
83 |
84 | question = "sum v1:v3 by id6" # q5
85 | t = system.time(print(dim(ans<-x[, lapply(.SD, sum), keyby=id6, .SDcols=7:9])))[["elapsed"]]
86 | m = memory_usage()
87 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(v1)), sum(bit64::as.integer64(v2)), sum(v3))])[["elapsed"]]
88 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
89 | rm(ans)
90 | t = system.time(print(dim(ans<-x[, lapply(.SD, sum), keyby=id6, .SDcols=7:9])))[["elapsed"]]
91 | m = memory_usage()
92 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(v1)), sum(bit64::as.integer64(v2)), sum(v3))])[["elapsed"]]
93 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
94 | print(head(ans, 3))
95 | print(tail(ans, 3))
96 | rm(ans)
97 |
98 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))
99 |
100 | if( !interactive() ) q("no", status=0)
101 |
--------------------------------------------------------------------------------
/datatable/read-datatable.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# read-datatable.R\n")
4 |
5 | source("./helpers.R")
6 | source("./datatable/helpers-datatable.R")
7 |
8 | suppressPackageStartupMessages(library(data.table))
9 | ver = packageVersion("data.table")
10 | git = datatable.git()
11 | task = "read"
12 | solution = "data.table"
13 | fun = "fread"
14 | cache = TRUE
15 |
16 | src_grp = Sys.getenv("SRC_GRP_LOCAL")
17 | data_name = basename(src_grp)
18 | options("datatable.showProgress"=FALSE)
19 |
20 | in_rows = as.numeric(strsplit(system(sprintf("wc -l %s", data_name), intern=TRUE), " ", fixed=TRUE)[[1L]][1L])-1
21 |
22 | cat("reading...\n")
23 |
24 | question = "all rows" #1
25 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]]
26 | m = memory_usage()
27 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]]
28 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
29 | rm(ans)
30 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]]
31 | m = memory_usage()
32 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]]
33 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
34 | rm(ans)
35 | t = system.time(print(dim(ans<-fread(data_name))))[["elapsed"]]
36 | m = memory_usage()
37 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]]
38 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
39 | rm(ans)
40 |
41 | question = "top 100 rows" #2
42 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]]
43 | m = memory_usage()
44 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]]
45 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
46 | rm(ans)
47 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]]
48 | m = memory_usage()
49 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]]
50 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
51 | rm(ans)
52 | t = system.time(print(dim(ans<-fread(data_name, nrows=100))))[["elapsed"]]
53 | m = memory_usage()
54 | chkt = system.time(chk<-ans[, .(sum(v3))])[["elapsed"]]
55 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
56 | rm(ans)
57 |
58 | if( !interactive() ) q("no", status=0)
59 |
--------------------------------------------------------------------------------
/datatable/setup-datatable.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # install devel data.table
3 | mkdir -p ./datatable/r-datatable
4 | Rscript -e 'install.packages("data.table", repos="https://Rdatatable.gitlab.io/data.table", method="curl", lib="./datatable/r-datatable")'
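   | # installs the development build of data.table from the project's own CRAN-like
   | # repository (Rdatatable.gitlab.io) into a private library that the datatable
   | # benchmark scripts load via lib.loc="./datatable/r-datatable"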
5 |
6 | ./datatable/ver-datatable.sh
7 |
--------------------------------------------------------------------------------
/datatable/sort-datatable.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# sort-datatable.R\n")
4 |
5 | source("./helpers.R")
6 | source("./datatable/helpers-datatable.R")
7 |
8 | src_x = Sys.getenv("SRC_X", NA_character_)
9 |
10 | # if (get.nrow(src_x) > 1e9L) {
11 | #   cat("# sort with data.table skipped due to the data volume cap for a single machine, set to 1e9 total rows")
12 | #   quit("no", status=0) # datasets > 1e9 rows are too big to load on a single machine
13 | # }
14 |
15 | stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns
16 | suppressPackageStartupMessages(library(data.table))
17 | ver = packageVersion("data.table")
18 | git = datatable.git()
19 | data_name = basename(src_x)
20 | task = "sort"
21 | solution = "data.table"
22 | fun = "[.data.table"
23 | question = "by int KEY"
24 | cache = TRUE
25 |
26 | cat("loading dataset...\n")
27 | X = fread(if(file.exists(basename(src_x))) basename(src_x) else sprintf("hadoop fs -cat %s", src_x)) # csv can be provided in local dir for faster import
28 |
29 | cat("sorting...\n")
30 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]]
31 | m = memory_usage()
32 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]]
33 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
34 | rm(ans)
35 |
36 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]]
37 | m = memory_usage()
38 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]]
39 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
40 | rm(ans)
41 |
42 | t = system.time(print(dim(ans<-X[order(KEY)])))[["elapsed"]]
43 | m = memory_usage()
44 | chkt = system.time(chk<-ans[, .(sum(bit64::as.integer64(X2)))])[["elapsed"]]
45 | write.log(run=3L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
46 | rm(ans)
47 |
48 | if( !interactive() ) q("no", status=0)
49 |
--------------------------------------------------------------------------------
/datatable/upg-datatable.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade to latest devel
5 | echo 'upgrading data.table...'
6 | # Rscript -e 'data.table::update.dev.pkg(quiet=TRUE, method="curl", lib="./datatable/r-datatable")'
7 | Rscript -e 'update.packages(lib.loc = "./datatable/r-datatable", repos="https://rdatatable.gitlab.io/data.table", method="curl")'
8 |
9 |
--------------------------------------------------------------------------------
/datatable/ver-datatable.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | Rscript -e 'v=read.dcf(system.file(package="data.table", lib.loc="./datatable/r-datatable", "DESCRIPTION"), fields=c("Version","Revision")); cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("datatable", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
5 |
--------------------------------------------------------------------------------
/dplyr/VERSION:
--------------------------------------------------------------------------------
1 | 1.1.4
2 |
--------------------------------------------------------------------------------
/dplyr/groupby2014-dplyr.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# groupby2014-dplyr.R\n")
4 |
5 | source("./_helpers/helpers.R")
6 |
7 | stopifnot(requireNamespace(c("bit64","data.table"), quietly=TRUE)) # used in chk to sum numeric columns and data loading
8 | .libPaths("./dplyr/r-dplyr") # tidyverse/dplyr#4641
9 | suppressPackageStartupMessages(library("dplyr", lib.loc="./dplyr/r-dplyr", warn.conflicts=FALSE))
10 | ver = packageVersion("dplyr")
11 | git = "" # uses stable version now #124
12 | task = "groupby2014"
13 | solution = "dplyr"
14 | fun = "group_by"
15 | cache = TRUE
16 | on_disk = FALSE
17 |
18 | data_name = Sys.getenv("SRC_DATANAME")
19 | src_grp = file.path("data", paste(data_name, "csv", sep="."))
20 | cat(sprintf("loading dataset %s\n", data_name))
21 |
22 | x = as_tibble(data.table::fread(src_grp, showProgress=FALSE, data.table=FALSE))
23 | print(nrow(x))
24 |
25 | task_init = proc.time()[["elapsed"]]
26 | cat("grouping...\n")
27 |
28 | question = "sum v1 by id1" # q1
29 | t = system.time(print(dim(ans<-x %>% group_by(id1) %>% summarise(sum(v1)))))[["elapsed"]]
30 | m = memory_usage()
31 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]]
32 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
33 | rm(ans)
34 | t = system.time(print(dim(ans<-x %>% group_by(id1) %>% summarise(sum(v1)))))[["elapsed"]]
35 | m = memory_usage()
36 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]]
37 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
38 | print(head(ans, 3))
39 | print(tail(ans, 3))
40 | rm(ans)
41 |
42 | question = "sum v1 by id1:id2" # q2
43 | t = system.time(print(dim(ans<-x %>% group_by(id1,id2) %>% summarise(sum(v1)))))[["elapsed"]]
44 | m = memory_usage()
45 | chkt = system.time(chk<-summarise(ungroup(ans), v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]]
46 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
47 | rm(ans)
48 | t = system.time(print(dim(ans<-x %>% group_by(id1,id2) %>% summarise(sum(v1)))))[["elapsed"]]
49 | m = memory_usage()
50 | chkt = system.time(chk<-summarise(ungroup(ans), v1=sum(bit64::as.integer64(`sum(v1)`))))[["elapsed"]]
51 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
52 | print(head(ans, 3))
53 | print(tail(ans, 3))
54 | rm(ans)
55 |
56 | question = "sum v1 mean v3 by id3" # q3
57 | t = system.time(print(dim(ans<-x %>% group_by(id3) %>% summarise(sum(v1), mean(v3)))))[["elapsed"]]
58 | m = memory_usage()
59 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`)), v3=sum(`mean(v3)`)))[["elapsed"]]
60 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
61 | rm(ans)
62 | t = system.time(print(dim(ans<-x %>% group_by(id3) %>% summarise(sum(v1), mean(v3)))))[["elapsed"]]
63 | m = memory_usage()
64 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(`sum(v1)`)), v3=sum(`mean(v3)`)))[["elapsed"]]
65 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
66 | print(head(ans, 3))
67 | print(tail(ans, 3))
68 | rm(ans)
69 |
70 | question = "mean v1:v3 by id4" # q4
71 | t = system.time(print(dim(ans<-x %>% group_by(id4) %>% summarise(across(v1:v3, mean)))))[["elapsed"]]
72 | m = memory_usage()
73 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]]
74 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
75 | rm(ans)
76 | t = system.time(print(dim(ans<-x %>% group_by(id4) %>% summarise(across(v1:v3, mean)))))[["elapsed"]]
77 | m = memory_usage()
78 | chkt = system.time(chk<-summarise(ans, v1=sum(v1), v2=sum(v2), v3=sum(v3)))[["elapsed"]]
79 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
80 | print(head(ans, 3))
81 | print(tail(ans, 3))
82 | rm(ans)
83 |
84 | question = "sum v1:v3 by id6" # q5
85 | t = system.time(print(dim(ans<-x %>% group_by(id6) %>% summarise(across(v1:v3, sum)))))[["elapsed"]]
86 | m = memory_usage()
87 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(v1)), v2=sum(bit64::as.integer64(v2)), v3=sum(v3)))[["elapsed"]]
88 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
89 | rm(ans)
90 | t = system.time(print(dim(ans<-x %>% group_by(id6) %>% summarise(across(v1:v3, sum)))))[["elapsed"]]
91 | m = memory_usage()
92 | chkt = system.time(chk<-summarise(ans, v1=sum(bit64::as.integer64(v1)), v2=sum(bit64::as.integer64(v2)), v3=sum(v3)))[["elapsed"]]
93 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
94 | print(head(ans, 3))
95 | print(tail(ans, 3))
96 | rm(ans)
97 |
98 | cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))
99 |
100 | if( !interactive() ) q("no", status=0)
101 |
--------------------------------------------------------------------------------
/dplyr/join-dplyr.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# join-dplyr.R\n")
4 |
5 | source("./_helpers/helpers.R")
6 |
7 | stopifnot(requireNamespace(c("data.table"), quietly=TRUE)) # used for data loading
8 | .libPaths("./dplyr/r-dplyr") # tidyverse/dplyr#4641
9 | suppressPackageStartupMessages(library("dplyr", lib.loc="./dplyr/r-dplyr", warn.conflicts=FALSE))
10 | ver = packageVersion("dplyr")
11 | git = "" # uses stable version now #124
12 | task = "join"
13 | solution = "dplyr"
14 | cache = TRUE
15 | on_disk = FALSE
16 |
17 | data_name = Sys.getenv("SRC_DATANAME")
18 | src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
19 | y_data_name = join_to_tbls(data_name)
20 | src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
21 | stopifnot(length(src_jn_y)==3L)
22 | cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", ")))
23 |
24 | x = as_tibble(data.table::fread(src_jn_x, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings=""))
25 | JN = lapply(sapply(simplify=FALSE, src_jn_y, data.table::fread, showProgress=FALSE, stringsAsFactors=TRUE, data.table=FALSE, na.strings=""), as_tibble)
26 | print(nrow(x))
27 | sapply(sapply(JN, nrow), print) -> nul
28 | small = JN$small
29 | medium = JN$medium
30 | big = JN$big
31 |
32 | task_init = proc.time()[["elapsed"]]
33 | cat("joining...\n")
34 |
35 | question = "small inner on int" # q1
36 | fun = "inner_join"
37 | t = system.time(print(dim(ans<-inner_join(x, small, by="id1"))))[["elapsed"]]
38 | m = memory_usage()
39 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
40 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
41 | rm(ans)
42 | t = system.time(print(dim(ans<-inner_join(x, small, by="id1"))))[["elapsed"]]
43 | m = memory_usage()
44 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
45 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
46 | print(head(ans, 3))
47 | print(tail(ans, 3))
48 | rm(ans)
49 |
50 | question = "medium inner on int" # q2
51 | fun = "inner_join"
52 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id2"))))[["elapsed"]]
53 | m = memory_usage()
54 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
55 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
56 | rm(ans)
57 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id2"))))[["elapsed"]]
58 | m = memory_usage()
59 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
60 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
61 | print(head(ans, 3))
62 | print(tail(ans, 3))
63 | rm(ans)
64 |
65 | question = "medium outer on int" # q3
66 | fun = "left_join"
67 | t = system.time(print(dim(ans<-left_join(x, medium, by="id2"))))[["elapsed"]]
68 | m = memory_usage()
69 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
70 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
71 | rm(ans)
72 | t = system.time(print(dim(ans<-left_join(x, medium, by="id2"))))[["elapsed"]]
73 | m = memory_usage()
74 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
75 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
76 | print(head(ans, 3))
77 | print(tail(ans, 3))
78 | rm(ans)
79 |
80 | question = "medium inner on factor" # q4
81 | fun = "inner_join"
82 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id5"))))[["elapsed"]]
83 | m = memory_usage()
84 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
85 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
86 | rm(ans)
87 | t = system.time(print(dim(ans<-inner_join(x, medium, by="id5"))))[["elapsed"]]
88 | m = memory_usage()
89 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
90 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
91 | print(head(ans, 3))
92 | print(tail(ans, 3))
93 | rm(ans)
94 |
95 | question = "big inner on int" # q5
96 | fun = "inner_join"
97 | t = system.time(print(dim(ans<-inner_join(x, big, by="id3"))))[["elapsed"]]
98 | m = memory_usage()
99 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
100 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
101 | rm(ans)
102 | t = system.time(print(dim(ans<-inner_join(x, big, by="id3"))))[["elapsed"]]
103 | m = memory_usage()
104 | chkt = system.time(chk<-summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE)))[["elapsed"]]
105 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
106 | print(head(ans, 3))
107 | print(tail(ans, 3))
108 | rm(ans)
109 |
110 | cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))
111 |
112 | if( !interactive() ) q("no", status=0)
113 |
--------------------------------------------------------------------------------
/dplyr/read-dplyr.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# read-dplyr\n")
4 |
5 | source("./helpers.R")
6 | source("./dplyr/helpers-dplyr.R")
7 |
8 | suppressPackageStartupMessages({
9 | library(readr, warn.conflicts=FALSE)
10 | library(dplyr, warn.conflicts=FALSE)
11 | })
12 | ver = NA_character_ #packageVersion("dplyr")
13 | git = NA_character_ #dplyr.git()
14 | task = "read"
15 | solution = "dplyr"
16 | fun = "readr::read_csv"
17 | cache = TRUE
18 |
19 | src_grp = Sys.getenv("SRC_GRP_LOCAL")
20 | data_name = basename(src_grp)
21 | options("readr.show_progress"=FALSE)
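   | # col_types "ccciiiiid" used below: id1:id3 as character, id4:id6 and v1:v2 as integer, v3 as double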
22 |
23 | in_rows = as.numeric(strsplit(system(sprintf("wc -l %s", data_name), intern=TRUE), " ", fixed=TRUE)[[1L]][1L])-1
24 |
25 | cat("reading...\n")
26 |
27 | question = "all rows" #1
28 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]]
29 | m = memory_usage()
30 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]]
31 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
32 | rm(ans)
33 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]]
34 | m = memory_usage()
35 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]]
36 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
37 | rm(ans)
38 | t = system.time(print(dim(ans<-read_csv(data_name, col_types="ccciiiiid"))))[["elapsed"]]
39 | m = memory_usage()
40 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]]
41 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
42 | rm(ans)
43 |
44 | question = "top 100 rows" #2
45 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]]
46 | m = memory_usage()
47 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]]
48 | write.log(run=1L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
49 | rm(ans)
50 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]]
51 | m = memory_usage()
52 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]]
53 | write.log(run=2L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
54 | rm(ans)
55 | t = system.time(print(dim(ans<-read_csv(data_name, n_max=100, col_types="ccciiiiid"))))[["elapsed"]]
56 | m = memory_usage()
57 | chkt = system.time(chk<-summarise(ungroup(ans), v3=sum(v1)))[["elapsed"]]
58 | write.log(run=3L, task=task, data=data_name, in_rows=in_rows, question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
59 | rm(ans)
60 |
61 | if( !interactive() ) q("no", status=0)
62 |
--------------------------------------------------------------------------------
/dplyr/setup-dplyr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install stable dplyr
5 | mkdir -p ./dplyr/r-dplyr
6 | Rscript -e 'install.packages("dplyr", lib="./dplyr/r-dplyr", repos = "http://cloud.r-project.org")'
7 |
8 | ./dplyr/ver-dplyr.sh
9 |
--------------------------------------------------------------------------------
/dplyr/sort-dplyr.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# sort-dplyr\n")
4 |
5 | source("./helpers.R")
6 | source("./dplyr/helpers-dplyr.R")
7 |
8 | src_x = Sys.getenv("SRC_X", NA_character_)
9 |
10 | # if (get.nrow(src_x) > 1e9L) {
11 | # cat("# sort with dplyr skipped due data volume cap for single machine set to total 1e9 rows")
12 | # quit("no", status=0) # datasets > 1e9 too big to try load on single machine
13 | # }
14 |
15 | stopifnot(requireNamespace("bit64", quietly=TRUE)) # used in chk to sum numeric columns
16 | suppressPackageStartupMessages(library(dplyr, warn.conflicts=FALSE))
17 | ver = packageVersion("dplyr")
18 | git = dplyr.git()
19 | data_name = basename(src_x)
20 | task = "sort"
21 | solution = "dplyr"
22 | fun = "arrange"
23 | question = "by int KEY"
24 | cache = TRUE
25 |
26 | cat("loading dataset...\n")
27 | X = data.table::fread(if(file.exists(basename(src_x))) basename(src_x) else sprintf("hadoop fs -cat %s", src_x), data.table=FALSE) # csv can be provided in local dir for faster import
28 |
29 | cat("sorting...\n")
30 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]]
31 | m = memory_usage()
32 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]]
33 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
34 | rm(ans)
35 |
36 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]]
37 | m = memory_usage()
38 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]]
39 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
40 | rm(ans)
41 |
42 | t = system.time(print(dim(ans<-arrange(X, KEY))))[["elapsed"]]
43 | m = memory_usage()
44 | chkt = system.time(chk<-summarise(ans, sum(bit64::as.integer64(X2))))[["elapsed"]]
45 | write.log(run=3L, task=task, data=data_name, in_rows=nrow(X), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
46 | rm(ans)
47 |
48 | if( !interactive() ) q("no", status=0)
49 |
--------------------------------------------------------------------------------
/dplyr/upg-dplyr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade all packages in dplyr library only if new dplyr is out
5 | echo 'upgrading dplyr...'
6 | Rscript -e 'ap=available.packages(); if (ap["dplyr","Version"]!=packageVersion("dplyr", lib.loc="./dplyr/r-dplyr")) update.packages(lib.loc="./dplyr/r-dplyr", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
7 |
--------------------------------------------------------------------------------
/dplyr/ver-dplyr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
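   | # write the installed dplyr Version and RemoteSha (renamed to REVISION) from its DESCRIPTION file into dplyr/VERSION and dplyr/REVISION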
4 | Rscript -e 'v=read.dcf(system.file(package="dplyr", lib.loc="./dplyr/r-dplyr", "DESCRIPTION"), fields=c("Version","RemoteSha")); colnames(v)[colnames(v)=="RemoteSha"]="Revision"; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("dplyr", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
5 |
--------------------------------------------------------------------------------
/duckdb-latest/VERSION:
--------------------------------------------------------------------------------
1 | 1.0.99.9000
2 |
--------------------------------------------------------------------------------
/duckdb-latest/setup-duckdb-latest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install duckdb-latest (duckdb-r built from git)
5 | rm -rf ./duckdb-latest/r-duckdb-latest
6 | mkdir -p ./duckdb-latest/r-duckdb-latest
7 | # Rscript -e 'withr::with_libpaths(new = "./duckdb-latest/r-duckdb-latest", devtools::install_github("duckdb-latest/duckdb-latest/tools/rpkg"))'
8 | # prevent errors when running 'ver-duckdb-latest.sh'
9 | Rscript -e 'install.packages("DBI", lib="./duckdb-latest/r-duckdb-latest", repos = "http://cloud.r-project.org")'
10 |
11 |
12 | cd duckdb-latest
13 | rm -rf duckdb-r
14 | git clone https://github.com/duckdb/duckdb-r.git
15 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
16 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r
17 | rm -rf duckdb-r
18 | cd ..
19 |
20 | ./duckdb-latest/ver-duckdb-latest.sh
21 |
--------------------------------------------------------------------------------
/duckdb-latest/upg-duckdb-latest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade duckdb-latest by rebuilding the duckdb-r R package from its git repository
5 | echo 'upgrading duckdb-latest, building duckdb-r from git'
6 |
7 | rm -rf ./duckdb-latest/r-duckdb-latest
8 | mkdir -p ./duckdb-latest/r-duckdb-latest
9 | Rscript -e 'install.packages("DBI", lib="./duckdb-latest/r-duckdb-latest", repos = "http://cloud.r-project.org")'
10 |
11 |
12 | cd duckdb-latest
13 | rm -rf duckdb-r
14 | git clone https://github.com/duckdb/duckdb-r
15 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
16 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb-latest" duckdb-r
17 | rm -rf duckdb-r
18 | cd ..
19 |
20 | ./duckdb-latest/ver-duckdb-latest.sh
--------------------------------------------------------------------------------
/duckdb-latest/ver-duckdb-latest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
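   | # record the installed duckdb Version and Revision (falling back to the pragma_version() source_id from a live connection) into duckdb-latest/VERSION and duckdb-latest/REVISION, replacing '-' with '.'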
4 | Rscript -e 'v=read.dcf(system.file(package="duckdb", lib.loc="./duckdb-latest/r-duckdb-latest", "DESCRIPTION"), fields=c("Version","Revision")); if (is.na(v[,"Revision"])) { suppressPackageStartupMessages({ requireNamespace("DBI", lib.loc="./duckdb-latest/r-duckdb-latest"); requireNamespace("duckdb", lib.loc="./duckdb-latest/r-duckdb-latest") }); v[,"Revision"] = DBI::dbGetQuery(con<-DBI::dbConnect(duckdb::duckdb()), "SELECT source_id FROM pragma_version()")[[1L]]; invisible(DBI::dbDisconnect(con, shutdown=TRUE)) }; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(gsub("-", ".", v), file.path("duckdb-latest", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
5 |
6 |
--------------------------------------------------------------------------------
/duckdb/VERSION:
--------------------------------------------------------------------------------
1 | 1.3.0
2 |
--------------------------------------------------------------------------------
/duckdb/setup-duckdb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install stable duckdb
5 | rm -rf ./duckdb/r-duckdb
6 | mkdir -p ./duckdb/r-duckdb
7 | # Rscript -e 'withr::with_libpaths(new = "./duckdb/r-duckdb", devtools::install_github("duckdb/duckdb/tools/rpkg"))'
8 | # prevent errors when running 'ver-duckdb.sh'
9 | Rscript -e 'install.packages("DBI", lib="./duckdb/r-duckdb", repos = "http://cloud.r-project.org")'
10 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
11 | MAKE="make -j$ncores" Rscript -e 'install.packages("duckdb", lib="./duckdb/r-duckdb", repos = "http://cloud.r-project.org")'
12 |
13 | ./duckdb/ver-duckdb.sh
14 |
--------------------------------------------------------------------------------
/duckdb/upg-duckdb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | rm -rf ./duckdb/r-duckdb
5 | mkdir -p ./duckdb/r-duckdb
6 |
7 |
8 | cd duckdb
9 | rm -rf duckdb-r
10 | git clone https://github.com/duckdb/duckdb-r
11 | cd duckdb-r
12 | git checkout v1.2.0
13 | cd ..
14 | ncores=$(nproc --all)
15 | MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r
16 | rm -rf duckdb-r
17 | cd ..
18 |
19 |
20 | # Rscript -e 'ap=available.packages(repos="https://cloud.r-project.org/"); if (ap["duckdb","Version"]!=packageVersion("duckdb", lib.loc="./duckdb/r-duckdb")) update.packages(lib.loc="./duckdb/r-duckdb", ask=FALSE, checkBuilt=TRUE, quiet=TRUE, repos="https://cloud.r-project.org/")'
21 |
--------------------------------------------------------------------------------
/duckdb/ver-duckdb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | Rscript -e 'v=read.dcf(system.file(package="duckdb", lib.loc="./duckdb/r-duckdb", "DESCRIPTION"), fields=c("Version","Revision")); if (is.na(v[,"Revision"])) { suppressPackageStartupMessages({ requireNamespace("DBI", lib.loc="./duckdb/r-duckdb"); requireNamespace("duckdb", lib.loc="./duckdb/r-duckdb") }); v[,"Revision"] = DBI::dbGetQuery(DBI::dbConnect(duckdb::duckdb()), "SELECT source_id FROM pragma_version()")[[1L]] }; cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(gsub("-", ".", v), file.path("duckdb", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
5 |
--------------------------------------------------------------------------------
/h2o/exec.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | if [ "$#" -ne 1 ]; then
5 | echo 'usage: ./h2o/exec.sh groupby';
6 | exit 1
7 | fi;
8 |
9 | source ./h2o/h2o.sh
10 |
11 | h2o_active && echo 'h2o instance should not be already running, investigate' >&2
12 | h2o_active && exit 1
13 |
14 | # start h2o
15 | h2o_start "h2o_$1_""$SRC_DATANAME"
16 |
17 | # confirm h2o working
18 | h2o_active || sleep 30
19 | h2o_active || echo 'h2o instance should be already running, investigate' >&2
20 | h2o_active || exit 1
21 |
22 | # execute benchmark script
23 | ./h2o/$1-h2o.R || echo "# h2o/exec.sh: benchmark script for $SRC_DATANAME terminated with error" >&2
24 |
25 | # stop h2o instance
26 | h2o_stop && echo '# h2o/exec.sh: stopping h2o instance finished' || echo '# h2o/exec.sh: stopping h2o instance failed' >&2
27 | h2o_active && exit 1 || true # fail if the h2o instance is still running after stop
28 |
--------------------------------------------------------------------------------
/h2o/h2o.sh:
--------------------------------------------------------------------------------
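   | # helper functions sourced by ./h2o/exec.sh to start, probe and stop a local h2o instance on port 55888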
1 | java_active() {
2 | pgrep -U $UID java > /dev/null 2>&1
3 | }
4 | h2o_active() {
5 | java_active && curl -X GET "localhost:55888/3/About" -H "accept: application/json" > /dev/null 2>&1
6 | }
7 | h2o_start() {
8 | ((!$#)) && echo "h2o_start require h2o instance name as a parameter" >&2 && return 1
9 | echo '# h2o_start: starting h2o instance'
10 | java_active && echo "h2o instance is running already" >&2 && return 1
11 | nohup java -Xmx100G -Xms100G -cp ./h2o/r-h2o/h2o/java/h2o.jar water.H2OApp -name "$1" -baseport 55888 > ./h2o/log/$1.out 2> ./h2o/log/$1.err < /dev/null &
12 | sleep 10
13 | }
14 | h2o_stop() {
15 | echo '# h2o_stop: stopping h2o instance'
16 | java_active || echo "h2o instance was not running already" >&2
17 | java_active || return 0
18 | java_active && echo "sigint h2o instance" && killall -2 -u $USER java > /dev/null 2>&1
19 | sleep 1 && java_active && sleep 15
20 | java_active && echo "sigterm h2o instance" && killall -15 -u $USER java > /dev/null 2>&1
21 | sleep 1 && java_active && sleep 30
22 | java_active && echo "sigkill h2o instance" && killall -9 -u $USER java > /dev/null 2>&1
23 | sleep 1 && java_active && sleep 120 && java_active && echo "h2o instance could not be stopped" >&2 && return 1
24 | return 0
25 | }
26 |
27 |
--------------------------------------------------------------------------------
/h2o/join-h2o.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | cat("# join-h2o.R\n")
4 |
5 | source("./_helpers/helpers.R")
6 |
7 | suppressPackageStartupMessages(library("h2o", lib.loc="./h2o/r-h2o", warn.conflicts=FALSE, quietly=TRUE))
8 | ver = packageVersion("h2o")
9 | git = ""
10 | task = "join"
11 | solution = "h2o"
12 | fun = "h2o.merge"
13 | cache = TRUE
14 | on_disk = FALSE
15 |
16 | h = h2o.init(startH2O=FALSE, port=55888)
17 | h2o.no_progress()
18 |
19 | data_name = Sys.getenv("SRC_DATANAME")
20 | src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
21 | y_data_name = join_to_tbls(data_name)
22 | src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
23 | stopifnot(length(src_jn_y)==3L)
24 | cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", ")))
25 |
26 | x = h2o.importFile(src_jn_x, col.types=c("int","int","int","enum","enum","string","real"))
27 | print(nrow(x))
28 | small = h2o.importFile(src_jn_y[1L], col.types=c("int","enum","real"))
29 | medium = h2o.importFile(src_jn_y[2L], col.types=c("int","int","enum","enum","real"))
30 | big = h2o.importFile(src_jn_y[3L], col.types=c("int","int","int","enum","enum","string","real"))
31 | sapply(sapply(list(small, medium, big), nrow), print) -> nul
32 |
33 | task_init = proc.time()[["elapsed"]]
34 | cat("joining...\n")
35 |
36 | question = "small inner on int" # q1
37 |
38 | t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]]
39 | m = memory_usage()
40 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
41 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
42 | h2o.rm(ans)
43 | t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]]
44 | m = memory_usage()
45 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
46 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
47 | print(head(ans, 3))
48 | print(tail(ans, 3))
49 | h2o.rm(ans)
50 |
51 | question = "medium inner on int" # q2
52 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]]
53 | m = memory_usage()
54 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
55 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
56 | h2o.rm(ans)
57 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]]
58 | m = memory_usage()
59 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
60 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
61 | print(head(ans, 3))
62 | print(tail(ans, 3))
63 | h2o.rm(ans)
64 |
65 | question = "medium outer on int" # q3
66 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]]
67 | m = memory_usage()
68 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]]
69 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
70 | h2o.rm(ans)
71 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]]
72 | m = memory_usage()
73 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]]
74 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
75 | print(head(ans, 3))
76 | print(tail(ans, 3))
77 | h2o.rm(ans)
78 |
79 | question = "medium inner on factor" # q4
80 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]]
81 | m = memory_usage()
82 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
83 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
84 | h2o.rm(ans)
85 | t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]]
86 | m = memory_usage()
87 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
88 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
89 | print(head(ans, 3))
90 | print(tail(ans, 3))
91 | h2o.rm(ans)
92 |
93 | question = "big inner on int" # q5
94 | t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]]
95 | m = memory_usage()
96 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
97 | write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
98 | h2o.rm(ans)
99 | t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]]
100 | m = memory_usage()
101 | chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
102 | write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
103 | print(head(ans, 3))
104 | print(tail(ans, 3))
105 | h2o.rm(ans)
106 |
107 | h2o.removeAll()
108 |
109 | cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))
110 |
111 | if (!interactive()) q("no", status=0)
112 |
--------------------------------------------------------------------------------
/h2o/setup-h2o.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | mkdir -p ./h2o/log
5 |
6 | # install h2o
7 | mkdir -p ./h2o/r-h2o
8 | Rscript -e 'install.packages(c("RCurl","jsonlite"), repos="https://cloud.r-project.org", lib="./h2o/r-h2o"); install.packages("h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", lib="./h2o/r-h2o")'
9 |
--------------------------------------------------------------------------------
/h2o/upg-h2o.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade to latest stable from h2o repo
5 | echo 'upgrading h2o...'
6 | Rscript -e 'ap=available.packages(repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl"); if (ap["h2o","Version"]!=packageVersion("h2o", lib.loc="./h2o/r-h2o")) update.packages(lib.loc="./h2o/r-h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)'
7 |
--------------------------------------------------------------------------------
/h2o/ver-h2o.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | Rscript -e 'v=read.dcf(system.file(package="h2o", lib.loc="./h2o/r-h2o", "DESCRIPTION"), fields=c("Version","Revision")); cnafill=function(x) {x=c(x); x[is.na(x)]=""; x}; fw=function(f, v) writeLines(v, file.path("h2o", f)); invisible(mapply(fw, toupper(colnames(v)), cnafill(v)))'
5 |
--------------------------------------------------------------------------------
/juliadf/VERSION:
--------------------------------------------------------------------------------
1 | 1.6.1
--------------------------------------------------------------------------------
/juliadf/exec.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | if [ "$#" -ne 1 ]; then
5 | echo 'usage: ./juliadf/exec.sh groupby';
6 | exit 1
7 | fi;
8 |
9 | source ./path.env
10 |
11 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
12 |
13 | # execute benchmark script
14 | julia -t $ncores ./juliadf/$1-juliadf.jl
15 |
--------------------------------------------------------------------------------
/juliadf/setup-juliadf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # install julia
3 |
4 | wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz
5 | tar -xvf julia-1.10.5-linux-x86_64.tar.gz > tar.out 2> tar.err
6 | sudo mv julia-1.10.5 /opt
7 | rm julia-1.10.5-linux-x86_64.tar.gz
8 | # put to paths
9 | echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env
10 | echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
11 | # note that cron job must have path updated as well
12 |
13 | source path.env
14 |
15 | # install julia dataframes and csv packages
16 | julia -q -e 'using Pkg; Pkg.add(["DataFrames","CSV"])'
17 | julia -q -e 'include("$(pwd())/_helpers/helpers.jl"); pkgmeta = getpkgmeta("DataFrames"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("CSV"); println(string(pkgmeta["version"]))'
18 |
19 | ./juliadf/ver-juliadf.sh
--------------------------------------------------------------------------------
/juliadf/upg-juliadf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade to latest registered release
5 | echo 'upgrading juliadf...'
6 | julia -q -e 'using Pkg; Pkg.update();' > /dev/null 2>&1
7 |
8 |
--------------------------------------------------------------------------------
/juliadf/ver-juliadf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | source path.env
4 |
5 | julia -q -e 'include("$(pwd())/_helpers/helpers.jl"); pkgmeta = getpkgmeta("DataFrames"); f=open("juliadf/VERSION","w"); write(f, string(pkgmeta["version"])); f=open("juliadf/REVISION","w"); write(f, string(pkgmeta["git-tree-sha1"]));' > /dev/null
6 |
--------------------------------------------------------------------------------
/juliads/VERSION:
--------------------------------------------------------------------------------
1 | 0.7.21
--------------------------------------------------------------------------------
/juliads/exec.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | if [ "$#" -ne 1 ]; then
5 | echo 'usage: ./juliads/exec.sh groupby';
6 | exit 1
7 | fi;
8 |
9 | source ./path.env
10 |
11 | ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
12 |
13 | # execute benchmark script
14 | julia -t $ncores ./juliads/$1-juliads.jl
15 |
--------------------------------------------------------------------------------
/juliads/join-juliads.jl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env julia
2 |
3 | print("# join-juliads.jl\n"); flush(stdout);
4 |
5 | using InMemoryDatasets;
6 | using Printf;
7 | using DLMReader
8 | using PooledArrays
9 | using Arrow
10 |
11 | # Force Julia to precompile methods for common patterns
12 | IMD.warmup()
13 |
14 | include("$(pwd())/_helpers/helpersds.jl");
15 |
16 | pkgmeta = getpkgmeta("InMemoryDatasets");
17 | ver = pkgmeta["version"];
18 | git = "";
19 | task = "join";
20 | solution = "juliads";
21 | fun = "join";
22 | cache = true;
23 | on_disk = false;
24 | machine_type = ENV["MACHINE_TYPE"]
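   | # decide whether the big join table should spill to disk: always for 1e10-row data, and for 1e9-row data on the smaller c6id.4xlarge machine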
25 | isondisk(indata) = (parse(Float64, split(indata, "_")[2]) >= 10^10) || (parse(Float64, split(indata, "_")[2]) >= 10^9 && machine_type == "c6id.4xlarge")
26 |
27 | data_name = ENV["SRC_DATANAME"];
28 | src_jn_x = string("data/", data_name, ".csv");
29 | y_data_name = join_to_tbls(data_name);
30 | src_jn_y = [string("data/", y_data_name[1], ".csv"), string("data/", y_data_name[2], ".csv"), string("data/", y_data_name[3], ".csv")];
31 | if length(src_jn_y) != 3
32 | error("Something went wrong in preparing files used for join")
33 | end;
34 |
35 | on_disk = isondisk(data_name)
36 |
37 | println(string("loading datasets ", data_name, ", ", y_data_name[1], ", ", y_data_name[2], ", ", y_data_name[3])); flush(stdout);
38 |
39 | # temporary file which will be deleted after the run - usually located at /tmp/
40 | _tmp_storage = tempname()
41 | if isondisk(data_name)
42 | on_disk = true
43 | big_df = filereader(src_jn_y[3], types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]);
44 | modify!(big_df, [:id4, :id5]=>PooledArray)
45 | Arrow.write(_tmp_storage, big_df[!, :], ntasks=1)
46 | big_df = 0
47 | GC.gc(true)
48 | end
49 | x_df = filereader(src_jn_x, types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]);
50 | small_df = filereader(src_jn_y[1], types=[Int32, Characters{6}, Float64]);
51 | medium_df = filereader(src_jn_y[2], types=[Int32, Int32, Characters{6}, Characters{9}, Float64]);
52 | if isondisk(data_name)
53 | big_df = Dataset(Arrow.Table(_tmp_storage))
54 | else
55 | big_df = filereader(src_jn_y[3], types=[Int32, Int32, Int32, Characters{6}, Characters{9}, Characters{12}, Float64]);
56 | modify!(big_df, [:id4, :id5]=>PooledArray)
57 | end
58 |
59 | modify!(x_df, [:id4, :id5]=>PooledArray)
60 | modify!(small_df, :id4=>PooledArray)
61 | modify!(medium_df, [:id4, :id5]=>PooledArray)
62 |
63 | in_rows = size(x_df, 1);
64 | println(in_rows); flush(stdout);
65 | println(size(small_df, 1)); flush(stdout);
66 | println(size(medium_df, 1)); flush(stdout);
67 | println(size(big_df, 1)); flush(stdout);
68 |
69 | task_init = time();
70 | print("joining...\n"); flush(stdout);
71 |
72 | question = "small inner on int"; # q1
73 | GC.gc();
74 | t = @elapsed (ANS = innerjoin(x_df, small_df, on = :id1, makeunique=true); println(size(ANS)); flush(stdout));
75 | m = memory_usage();
76 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
77 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
78 | ANS = 0;
79 | GC.gc();
80 | t = @elapsed (ANS = innerjoin(x_df, small_df, on = :id1, makeunique=true); println(size(ANS)); flush(stdout));
81 | m = memory_usage();
82 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
83 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
84 | println(first(ANS, 3));
85 | println(last(ANS, 3));
86 | ANS = 0;
87 |
88 | question = "medium inner on int"; # q2
89 | GC.gc();
90 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout));
91 | m = memory_usage();
92 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
93 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
94 | ANS = 0;
95 | GC.gc();
96 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout));
97 | m = memory_usage();
98 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
99 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
100 | println(first(ANS, 3));
101 | println(last(ANS, 3));
102 | ANS = 0;
103 |
104 | question = "medium outer on int"; # q3
105 | GC.gc();
106 | t = @elapsed (ANS = leftjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout));
107 | m = memory_usage();
108 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
109 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
110 | ANS = 0;
111 | GC.gc();
112 | t = @elapsed (ANS = leftjoin(x_df, medium_df, on = :id2, makeunique=true); println(size(ANS)); flush(stdout));
113 | m = memory_usage();
114 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
115 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
116 | println(first(ANS, 3));
117 | println(last(ANS, 3));
118 | ANS = 0;
119 |
120 | question = "medium inner on factor"; # q4
121 | GC.gc();
122 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id5, makeunique=true); println(size(ANS)); flush(stdout));
123 | m = memory_usage();
124 | t_start = time_ns();
125 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
126 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
127 | ANS = 0;
128 | GC.gc();
129 | t = @elapsed (ANS = innerjoin(x_df, medium_df, on = :id5, makeunique=true); println(size(ANS)); flush(stdout));
130 | m = memory_usage();
131 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
132 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
133 | println(first(ANS, 3));
134 | println(last(ANS, 3));
135 | ANS = 0;
136 |
137 | question = "big inner on int"; # q5
138 | GC.gc();
139 | t = @elapsed (ANS = innerjoin(x_df, big_df, on = :id3, makeunique=true); println(size(ANS)); flush(stdout));
140 | m = memory_usage();
141 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
142 | write_log(1, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
143 | ANS = 0;
144 | GC.gc();
145 | t = @elapsed (ANS = innerjoin(x_df, big_df, on = :id3, makeunique=true); println(size(ANS)); flush(stdout));
146 | m = memory_usage();
147 | chkt = @elapsed chk = [sum(ANS.v1), sum(ANS.v2)];
148 | write_log(2, task, data_name, in_rows, question, size(ANS, 1), size(ANS, 2), solution, ver, git, fun, t, m, cache, make_chk(chk), chkt, on_disk, machine_type);
149 | println(first(ANS, 3));
150 | println(last(ANS, 3));
151 | ANS = 0;
152 |
153 | print(@sprintf "joining finished, took %.0fs\n" (time()-task_init)); flush(stdout);
154 |
155 | exit();
156 |
--------------------------------------------------------------------------------
/juliads/setup-juliads.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # install julia
3 |
4 | wget -q https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz
5 | tar -xvf julia-1.10.5-linux-x86_64.tar.gz > tar_out.out 2> tar_err.err
6 | sudo mv julia-1.10.5 /opt
7 | rm julia-1.10.5-linux-x86_64.tar.gz
8 | # put to paths
9 | echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env
10 | echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
11 | # note that cron job must have path updated as well
12 |
13 | source path.env
14 |
15 | # install julia InMemoryDatasets and csv packages
16 | julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow", "CSV"])'
17 | julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("DLMReader"); println(string(pkgmeta["version"]))'
18 |
19 | ./juliads/ver-juliads.sh
20 |
--------------------------------------------------------------------------------
/juliads/upg-juliads.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # upgrade to latest registered release
5 | echo 'upgrading juliads...'
6 | julia -q -e 'using Pkg; Pkg.update();' > /dev/null 2>&1
7 |
8 |
--------------------------------------------------------------------------------
/juliads/ver-juliads.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source path.env
5 |
6 | julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); f=open("juliads/VERSION","w"); write(f, string(pkgmeta["version"])); f=open("juliads/REVISION","w"); write(f, string(" "));' > /dev/null
7 |
--------------------------------------------------------------------------------
/modin/join-modin.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | print("# join-modin.py")
4 |
5 | import os
6 | import gc
7 | import timeit
8 | import modin.pandas as pd
9 |
10 | exec(open("./helpers.py").read())
11 |
12 | src_x = os.environ['SRC_X_LOCAL']
13 | src_y = os.environ['SRC_Y_LOCAL']
14 |
15 | ver = "" #pd.__version__
16 | git = ""
17 | task = "join"
18 | question = "inner join"
19 | l = [os.path.basename(src_x), os.path.basename(src_y)]
20 | data_name = '-'.join(l)
21 | solution = "modin"
22 | fun = "merge"
23 | cache = "TRUE"
24 |
25 | print("loading datasets...")
26 |
27 | x = pd.read_csv(os.path.basename(src_x))
28 | y = pd.read_csv(os.path.basename(src_y))
29 |
30 | print("joining...")
31 |
32 | # NotImplementedError: To contribute to Pandas on Ray, please visit github.com/modin-project/modin
33 | gc.collect()
34 | t_start = timeit.default_timer()
35 | ans = x.merge(y, how='inner', on='KEY')
36 | print(ans.shape)
37 | t = timeit.default_timer() - t_start
38 | m = memory_usage()
39 | t_start = timeit.default_timer()
40 | chk = [ans['X2'].sum(), ans['Y2'].sum()]
41 | chkt = timeit.default_timer() - t_start
42 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
43 | del ans
44 |
45 | gc.collect()
46 | t_start = timeit.default_timer()
47 | ans = x.merge(y, how='inner', on='KEY')
48 | print(ans.shape)
49 | t = timeit.default_timer() - t_start
50 | m = memory_usage()
51 | t_start = timeit.default_timer()
52 | chk = [ans['X2'].sum(), ans['Y2'].sum()]
53 | chkt = timeit.default_timer() - t_start
54 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
55 | del ans
56 |
57 | gc.collect()
58 | t_start = timeit.default_timer()
59 | ans = x.merge(y, how='inner', on='KEY')
60 | print(ans.shape)
61 | t = timeit.default_timer() - t_start
62 | m = memory_usage()
63 | t_start = timeit.default_timer()
64 | chk = [ans['X2'].sum(), ans['Y2'].sum()]
65 | chkt = timeit.default_timer() - t_start
66 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
67 | del ans
68 |
69 | exit(0)
70 |
--------------------------------------------------------------------------------
/modin/setup-modin.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | virtualenv modin/py-modin --python=python3
5 | source modin/py-modin/bin/activate
6 |
7 | # install binaries
8 | python3 -m pip install --upgrade modin[all]
9 |
10 | # check
11 | # python3
12 | # import modin
13 | # modin.__version__
14 | # quit()
15 |
16 | deactivate
17 |
--------------------------------------------------------------------------------
/modin/sort-modin.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | print("# sort-modin.py")
4 |
5 | import os
6 | import gc
7 | import timeit
8 | import modin as modin
9 | import modin.pandas as pd
10 |
11 | exec(open("./helpers.py").read())
12 |
13 | src_x = os.environ['SRC_X_LOCAL']
14 |
15 | ver = modin.__version__
16 | git = modin.__git_revision__
17 | task = "sort"
18 | question = "by int KEY"
19 | data_name = os.path.basename(src_x)
20 | solution = "modin"
21 | fun = ".sort"
22 | cache = "TRUE"
23 |
24 | print("loading dataset...")
25 |
26 | x = pd.read_csv(data_name)
27 |
28 | print("sorting...")
29 |
30 | gc.collect()
31 | t_start = timeit.default_timer()
32 | ans = x.sort_values('KEY')
33 | print(ans.shape)
34 | t = timeit.default_timer() - t_start
35 | m = memory_usage()
36 | t_start = timeit.default_timer()
37 | chk = [ans['X2'].sum()]
38 | chkt = timeit.default_timer() - t_start
39 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
40 | del ans
41 |
42 | gc.collect()
43 | t_start = timeit.default_timer()
44 | ans = x.sort_values('KEY')
45 | print(ans.shape)
46 | t = timeit.default_timer() - t_start
47 | m = memory_usage()
48 | t_start = timeit.default_timer()
49 | chk = [ans['X2'].sum()]
50 | chkt = timeit.default_timer() - t_start
51 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
52 | del ans
53 |
54 | gc.collect()
55 | t_start = timeit.default_timer()
56 | ans = x.sort_values('KEY')
57 | print(ans.shape)
58 | t = timeit.default_timer() - t_start
59 | m = memory_usage()
60 | t_start = timeit.default_timer()
61 | chk = [ans['X2'].sum()]
62 | chkt = timeit.default_timer() - t_start
63 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
64 | del ans
65 |
66 | exit(0)
67 |
--------------------------------------------------------------------------------
/modin/upg-modin.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading modin...'
5 |
6 | source ./modin/py-modin/bin/activate
7 |
8 | python -m pip install --upgrade modin[all] > /dev/null
9 |
--------------------------------------------------------------------------------
/modin/ver-modin.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./modin/py-modin/bin/activate
5 | python -c 'import modin as modin; open("modin/VERSION","w").write(modin.__version__); open("modin/REVISION","w").write("");' > /dev/null
6 |
--------------------------------------------------------------------------------
/pandas/VERSION:
--------------------------------------------------------------------------------
1 | 2.2.2
--------------------------------------------------------------------------------
/pandas/read-pandas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | print("# read-pandas.py")
4 |
5 | import os
6 | import gc
7 | import timeit
8 | import subprocess
9 | import pandas as pd
10 |
11 | exec(open("./helpers.py").read())
12 |
13 | src_grp = os.environ['SRC_GRP_LOCAL']
14 |
15 | ver = pd.__version__
16 | git = ""
17 | task = "read"
18 | data_name = os.path.basename(src_grp)
19 | solution = "pandas"
20 | fun = "read_csv"
21 | cache = "TRUE"
22 |
23 | wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0]
24 | in_rows = int(wc_lines)-1
25 |
26 | print("reading...")
27 |
28 | question = "all rows" #1
29 | gc.collect()
30 | t_start = timeit.default_timer()
31 | ans = pd.read_csv(data_name)
32 | print(ans.shape)
33 | t = timeit.default_timer() - t_start
34 | m = memory_usage()
35 | t_start = timeit.default_timer()
36 | chk = [ans['v3'].sum()]
37 | chkt = timeit.default_timer() - t_start
38 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
39 | del ans
40 | gc.collect()
41 | t_start = timeit.default_timer()
42 | ans = pd.read_csv(data_name)
43 | print(ans.shape)
44 | t = timeit.default_timer() - t_start
45 | m = memory_usage()
46 | t_start = timeit.default_timer()
47 | chk = [ans['v3'].sum()]
48 | chkt = timeit.default_timer() - t_start
49 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
50 | del ans
51 | gc.collect()
52 | t_start = timeit.default_timer()
53 | ans = pd.read_csv(data_name)
54 | print(ans.shape)
55 | t = timeit.default_timer() - t_start
56 | m = memory_usage()
57 | t_start = timeit.default_timer()
58 | chk = [ans['v3'].sum()]
59 | chkt = timeit.default_timer() - t_start
60 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
61 | del ans
62 |
63 | question = "top 100 rows" #2
64 | gc.collect()
65 | t_start = timeit.default_timer()
66 | ans = pd.read_csv(data_name, nrows=100)
67 | print(ans.shape)
68 | t = timeit.default_timer() - t_start
69 | m = memory_usage()
70 | t_start = timeit.default_timer()
71 | chk = [ans['v3'].sum()]
72 | chkt = timeit.default_timer() - t_start
73 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
74 | del ans
75 | gc.collect()
76 | t_start = timeit.default_timer()
77 | ans = pd.read_csv(data_name, nrows=100)
78 | print(ans.shape)
79 | t = timeit.default_timer() - t_start
80 | m = memory_usage()
81 | t_start = timeit.default_timer()
82 | chk = [ans['v3'].sum()]
83 | chkt = timeit.default_timer() - t_start
84 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
85 | del ans
86 | gc.collect()
87 | t_start = timeit.default_timer()
88 | ans = pd.read_csv(data_name, nrows=100)
89 | print(ans.shape)
90 | t = timeit.default_timer() - t_start
91 | m = memory_usage()
92 | t_start = timeit.default_timer()
93 | chk = [ans['v3'].sum()]
94 | chkt = timeit.default_timer() - t_start
95 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
96 | del ans
97 |
98 | exit(0)
99 |
--------------------------------------------------------------------------------
/pandas/setup-pandas.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install all dependencies
5 | # sudo apt-get update
6 | # sudo apt-get install build-essential python3-dev python3-pip
7 |
8 | virtualenv pandas/py-pandas --python=python3
9 | source pandas/py-pandas/bin/activate
10 |
11 | # install binaries
12 | python3 -m pip install --upgrade psutil
13 | python3 -m pip install --upgrade pandas
14 | python3 -m pip install --upgrade pyarrow
15 |
16 | deactivate
17 |
18 | ./pandas/ver-pandas.sh
19 |
20 | # # check
21 | # source pandas/py-pandas/bin/activate
22 | # python3
23 | # import pandas as pd
24 | # pd.__version__
25 | # quit()
26 | # deactivate
27 |
--------------------------------------------------------------------------------
/pandas/sort-pandas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | print("# sort-pandas.py")
4 |
5 | import os
6 | import gc
7 | import timeit
8 | import pandas as pd
9 | #import pydoop.hdfs as hd
10 |
11 | exec(open("./helpers.py").read())
12 |
13 | src_x = os.environ['SRC_X_LOCAL']
14 |
15 | ver = pd.__version__
16 | git = ""
17 | task = "sort"
18 | question = "by int KEY"
19 | data_name = os.path.basename(src_x)
20 | solution = "pandas"
21 | fun = ".sort"
22 | cache = "TRUE"
23 |
24 | print("loading dataset...")
25 |
26 | # with hd.open(src_x) as f:
27 | # x = pd.read_csv(f)
28 | x = pd.read_csv(data_name)
29 |
30 | print("sorting...")
31 |
32 | gc.collect()
33 | t_start = timeit.default_timer()
34 | ans = x.sort_values('KEY')
35 | print(ans.shape)
36 | t = timeit.default_timer() - t_start
37 | m = memory_usage()
38 | t_start = timeit.default_timer()
39 | chk = [ans['X2'].sum()]
40 | chkt = timeit.default_timer() - t_start
41 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
42 | del ans
43 |
44 | gc.collect()
45 | t_start = timeit.default_timer()
46 | ans = x.sort_values('KEY')
47 | print(ans.shape)
48 | t = timeit.default_timer() - t_start
49 | m = memory_usage()
50 | t_start = timeit.default_timer()
51 | chk = [ans['X2'].sum()]
52 | chkt = timeit.default_timer() - t_start
53 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
54 | del ans
55 |
56 | gc.collect()
57 | t_start = timeit.default_timer()
58 | ans = x.sort_values('KEY')
59 | print(ans.shape)
60 | t = timeit.default_timer() - t_start
61 | m = memory_usage()
62 | t_start = timeit.default_timer()
63 | chk = [ans['X2'].sum()]
64 | chkt = timeit.default_timer() - t_start
65 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt)
66 | del ans
67 |
68 | exit(0)
69 |
--------------------------------------------------------------------------------
/pandas/upg-pandas.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading pandas...'
5 |
6 | source ./pandas/py-pandas/bin/activate
7 |
8 | python3 -m pip install --upgrade pandas > /dev/null
9 |
10 | deactivate
--------------------------------------------------------------------------------
/pandas/ver-pandas.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./pandas/py-pandas/bin/activate
5 | python3 -c 'import pandas as pd; open("pandas/VERSION","w").write(pd.__version__); open("pandas/REVISION","w").write(pd.__git_version__);' > /dev/null
6 | deactivate
--------------------------------------------------------------------------------
/path.env:
--------------------------------------------------------------------------------
1 | export JULIA_HOME=/opt/julia-1.9.2
2 | export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
3 | export PATH=$PATH:$JULIA_HOME/bin
4 | export MOUNT_POINT=/var/lib/mount
5 | export SPILL_DIR=/var/lib/mount/db-benchmark-metal/spill
6 |
--------------------------------------------------------------------------------
/polars/VERSION:
--------------------------------------------------------------------------------
1 | 1.30.0
--------------------------------------------------------------------------------
/polars/monitor_ram.py:
--------------------------------------------------------------------------------
1 | import psutil
2 | import time
3 | import sys
4 |
5 | solution = str(sys.argv[1])
6 | data_name = str(sys.argv[2])
7 | pid_of_parent = int(sys.argv[3])
8 |
9 | max_loops = 720
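   | # sample at most max_loops times, sleeping 5 seconds between samples (roughly one hour in total)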
10 | file_name = f"{solution}-ram-{data_name}.txt"
11 | i = 0
12 | f = open(file_name, "w")
13 | f.close()
14 | while i < max_loops:
15 | # Sample available system RAM and this process's resident set size (RSS), both in GiB
16 | process = psutil.Process(pid_of_parent)
17 | rss_usage = process.memory_info().rss >> 30
18 | ram_usage = psutil.virtual_memory().available >> 30
19 |
20 | # Append the readings to the log file
21 | f = open(file_name, "a")
22 | f.write(f"RAM usage: {ram_usage} GB \n")
23 | f.write(f"RSS usage: {rss_usage} GB \n \n")
24 | f.close()
25 |
26 | # Wait for 5 seconds before polling again
27 | time.sleep(5)
28 | i += 1
29 |
--------------------------------------------------------------------------------
/polars/setup-polars.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install dependencies
5 | # sudo apt-get update -qq
6 |
7 | virtualenv polars/py-polars --python=python3
8 | source polars/py-polars/bin/activate
9 |
10 | python3 -m pip install --upgrade psutil polars numpy
11 |
12 | # build
13 | deactivate
14 |
15 | ./polars/upg-polars.sh
16 |
17 | ./polars/ver-polars.sh
18 |
19 | # check
20 | # source polars/py-polars/bin/activate
21 | # python3
22 | # import polars as pl
23 | # pl.__version__
24 | # quit()
25 | # deactivate
26 |
27 | # fix: print(ans.head(3), flush=True): UnicodeEncodeError: 'ascii' codec can't encode characters in position 14-31: ordinal not in range(128)
28 | # vim polars/py-polars/bin/activate
29 | #deactivate () {
30 | # unset PYTHONIOENCODING
31 | # ...
32 | #}
33 | #...
34 | #PYTHONIOENCODING="utf-8"
35 | #export PYTHONIOENCODING
36 | #...
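   | # a minimal sketch (untested; assumes the default virtualenv activate script layout) of applying that fix non-interactively:
   | # printf '\nPYTHONIOENCODING="utf-8"\nexport PYTHONIOENCODING\n' >> ./polars/py-polars/bin/activate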
37 |
--------------------------------------------------------------------------------
/polars/upg-polars.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading polars...'
5 |
6 | source ./polars/py-polars/bin/activate
7 |
8 | python3 -m pip install --upgrade polars > /dev/null
9 |
10 | deactivate
--------------------------------------------------------------------------------
/polars/ver-polars.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./polars/py-polars/bin/activate
5 | python3 -c 'import polars as pl; open("polars/VERSION","w").write(pl.__version__); open("polars/REVISION","w").write("");' > /dev/null
6 |
--------------------------------------------------------------------------------
/pydatatable/VERSION:
--------------------------------------------------------------------------------
1 | 1.2.0a0
--------------------------------------------------------------------------------
/pydatatable/convert-pydatatable-data.py:
--------------------------------------------------------------------------------
1 | print("pydatatable: converting 50GB join data")
2 | import os
3 | import datatable as dt
4 |
5 | if os.path.isfile('data/J1_1e9_NA_0_0.csv'):
6 | dt.fread('data/J1_1e9_NA_0_0.csv').to_jay('data/J1_1e9_NA_0_0.jay')
7 | if os.path.isfile('data/J1_1e9_1e9_0_0.csv'):
8 | dt.fread('data/J1_1e9_1e9_0_0.csv').to_jay('data/J1_1e9_1e9_0_0.jay')
9 | if os.path.isfile('data/J1_1e9_1e6_0_0.csv'):
10 | dt.fread('data/J1_1e9_1e6_0_0.csv').to_jay('data/J1_1e9_1e6_0_0.jay')
11 | if os.path.isfile('data/J1_1e9_1e3_0_0.csv'):
12 | dt.fread('data/J1_1e9_1e3_0_0.csv').to_jay('data/J1_1e9_1e3_0_0.jay')
13 | if os.path.isfile('data/J1_1e9_NA_0_1.csv'):
14 | dt.fread('data/J1_1e9_NA_0_1.csv').to_jay('data/J1_1e9_NA_0_1.jay')
15 | if os.path.isfile('data/J1_1e9_1e9_0_1.csv'):
16 | dt.fread('data/J1_1e9_1e9_0_1.csv').to_jay('data/J1_1e9_1e9_0_1.jay')
17 | if os.path.isfile('data/J1_1e9_1e6_0_1.csv'):
18 | dt.fread('data/J1_1e9_1e6_0_1.csv').to_jay('data/J1_1e9_1e6_0_1.jay')
19 | if os.path.isfile('data/J1_1e9_1e3_0_1.csv'):
20 | dt.fread('data/J1_1e9_1e3_0_1.csv').to_jay('data/J1_1e9_1e3_0_1.jay')
21 | if os.path.isfile('data/J1_1e9_NA_5_0.csv'):
22 | dt.fread('data/J1_1e9_NA_5_0.csv').to_jay('data/J1_1e9_NA_5_0.jay')
23 | if os.path.isfile('data/J1_1e9_1e9_5_0.csv'):
24 | dt.fread('data/J1_1e9_1e9_5_0.csv').to_jay('data/J1_1e9_1e9_5_0.jay')
25 | if os.path.isfile('data/J1_1e9_1e6_5_0.csv'):
26 | dt.fread('data/J1_1e9_1e6_5_0.csv').to_jay('data/J1_1e9_1e6_5_0.jay')
27 | if os.path.isfile('data/J1_1e9_1e3_5_0.csv'):
28 | dt.fread('data/J1_1e9_1e3_5_0.csv').to_jay('data/J1_1e9_1e3_5_0.jay')
29 |
30 | print("pydatatable: done converting 50GB join data")
--------------------------------------------------------------------------------
/pydatatable/read-pydatatable.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | print("# read-pydatatable.py")
4 |
5 | import os
6 | import gc
7 | import timeit
8 | import subprocess
9 | import datatable as dt
10 | from datatable import f, sum
11 |
12 | exec(open("./helpers.py").read())
13 |
14 | src_grp = os.environ['SRC_GRP_LOCAL']
15 |
16 | ver = dt.__version__
17 | git = dt.__git_revision__
18 | task = "read"
19 | data_name = os.path.basename(src_grp)
20 | solution = "pydatatable"
21 | fun = "fread"
22 | cache = "TRUE"
23 |
24 | wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0]
25 | in_rows = int(wc_lines)-1
26 |
27 | print("reading...")
28 |
29 | question = "all rows" #1
30 | gc.collect()
31 | t_start = timeit.default_timer()
32 | ans = dt.fread(data_name, show_progress=False)
33 | print(ans.shape)
34 | t = timeit.default_timer() - t_start
35 | m = memory_usage()
36 | t_start = timeit.default_timer()
37 | chk = ans[:, sum(f.v3)]
38 | chkt = timeit.default_timer() - t_start
39 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
40 | del ans
41 | gc.collect()
42 | t_start = timeit.default_timer()
43 | ans = dt.fread(data_name, show_progress=False)
44 | print(ans.shape)
45 | t = timeit.default_timer() - t_start
46 | m = memory_usage()
47 | t_start = timeit.default_timer()
48 | chk = ans[:, sum(f.v3)]
49 | chkt = timeit.default_timer() - t_start
50 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
51 | del ans
52 | gc.collect()
53 | t_start = timeit.default_timer()
54 | ans = dt.fread(data_name, show_progress=False)
55 | print(ans.shape)
56 | t = timeit.default_timer() - t_start
57 | m = memory_usage()
58 | t_start = timeit.default_timer()
59 | chk = ans[:, sum(f.v3)]
60 | chkt = timeit.default_timer() - t_start
61 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
62 | del ans
63 |
64 | question = "top 100 rows" #2
65 | gc.collect()
66 | t_start = timeit.default_timer()
67 | ans = dt.fread(data_name, max_nrows=100, show_progress=False)
68 | print(ans.shape)
69 | t = timeit.default_timer() - t_start
70 | m = memory_usage()
71 | t_start = timeit.default_timer()
72 | chk = ans[:, sum(f.v3)]
73 | chkt = timeit.default_timer() - t_start
74 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
75 | del ans
76 | gc.collect()
77 | t_start = timeit.default_timer()
78 | ans = dt.fread(data_name, max_nrows=100, show_progress=False)
79 | print(ans.shape)
80 | t = timeit.default_timer() - t_start
81 | m = memory_usage()
82 | t_start = timeit.default_timer()
83 | chk = ans[:, sum(f.v3)]
84 | chkt = timeit.default_timer() - t_start
85 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
86 | del ans
87 | gc.collect()
88 | t_start = timeit.default_timer()
89 | ans = dt.fread(data_name, max_nrows=100, show_progress=False)
90 | print(ans.shape)
91 | t = timeit.default_timer() - t_start
92 | m = memory_usage()
93 | t_start = timeit.default_timer()
94 | chk = ans[:, sum(f.v3)]
95 | chkt = timeit.default_timer() - t_start
96 | write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
97 | del ans
98 |
99 | exit(0)
100 |
--------------------------------------------------------------------------------
/pydatatable/setup-pydatatable.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install dependencies
5 | virtualenv pydatatable/py-pydatatable --python=python3
6 | source pydatatable/py-pydatatable/bin/activate
7 |
8 | python -m pip install --upgrade psutil
9 |
10 | # build
11 | deactivate
12 | ./pydatatable/upg-pydatatable.sh
13 |
14 | # # check
15 | # source pydatatable/py-pydatatable/bin/activate
16 | # python
17 | # import datatable as dt
18 | # dt.__version__
19 | # quit()
20 | # deactivate
21 |
22 | # resave 1e9 join data from csv to jay format so pydt can try out-of-memory processing
23 | source pydatatable/py-pydatatable/bin/activate
24 | python3 pydatatable/convert-pydatatable-data.py
25 |
26 | ./pydatatable/ver-pydatatable.sh
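The Jay re-save in the last step is what lets pydatatable attempt the 1e9 joins out of memory: a .jay file is memory-mapped when opened rather than parsed into RAM. A minimal sketch of what a later benchmark script can then do (file name taken from the conversion step above):

    import datatable as dt

    # opening a .jay file memory-maps it, so the 50GB table is not loaded eagerly
    x = dt.fread("data/J1_1e9_NA_0_0.jay")
    print(x.shape)  # shape is available without materializing the data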
--------------------------------------------------------------------------------
/pydatatable/sort-pydatatable.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | print("# sort-pydatatable.py")
4 |
5 | import os
6 | import gc
7 | import timeit
8 | import datatable as dt
9 | from datatable import f, sum
10 |
11 | exec(open("./helpers.py").read())
12 |
13 | src_x = os.environ['SRC_X_LOCAL']
14 |
15 | ver = dt.__version__
16 | git = dt.__git_revision__
17 | task = "sort"
18 | question = "by int KEY"
19 | data_name = os.path.basename(src_x)
20 | solution = "pydatatable"
21 | fun = ".sort"
22 | cache = "TRUE"
23 |
24 | print("loading dataset...")
25 |
26 | x = dt.fread(data_name)
27 |
28 | print("sorting...")
29 |
30 | gc.collect()
31 | t_start = timeit.default_timer()
32 | ans = x.sort('KEY')
33 | print(ans.shape)
34 | t = timeit.default_timer() - t_start
35 | m = memory_usage()
36 | t_start = timeit.default_timer()
37 | chk = ans[:, sum(f.X2)]
38 | chkt = timeit.default_timer() - t_start
39 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
40 | del ans
41 |
42 | gc.collect()
43 | t_start = timeit.default_timer()
44 | ans = x.sort('KEY')
45 | print(ans.shape)
46 | t = timeit.default_timer() - t_start
47 | m = memory_usage()
48 | t_start = timeit.default_timer()
49 | chk = ans[:, sum(f.X2)]
50 | chkt = timeit.default_timer() - t_start
51 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
52 | del ans
53 |
54 | gc.collect()
55 | t_start = timeit.default_timer()
56 | ans = x.sort('KEY')
57 | print(ans.shape)
58 | t = timeit.default_timer() - t_start
59 | m = memory_usage()
60 | t_start = timeit.default_timer()
61 | chk = ans[:, sum(f.X2)]
62 | chkt = timeit.default_timer() - t_start
63 | write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=3, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
64 | del ans
65 |
66 | exit(0)
67 |
--------------------------------------------------------------------------------
/pydatatable/upg-pydatatable.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading pydatatable...'
5 |
6 | source ./pydatatable/py-pydatatable/bin/activate
7 | python3 -m pip install --upgrade git+https://github.com/h2oai/datatable > /dev/null 2>&1
8 | deactivate
9 |
10 | echo 'done upgrading'
11 |
--------------------------------------------------------------------------------
/pydatatable/ver-pydatatable.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./pydatatable/py-pydatatable/bin/activate
5 | python3 -c 'import datatable as dt; open("pydatatable/VERSION","w").write(dt.__version__.split("+", 1)[0]); open("pydatatable/REVISION","w").write(dt.build_info.git_revision);' > /dev/null
6 |
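The one-liner above records both the installed version (with any local build suffix stripped) and the git revision; unpacked for readability, it is equivalent to this sketch (same attributes as used in the script):

    import datatable as dt

    # drop a local build suffix such as "+master.123" from the version string
    version = dt.__version__.split("+", 1)[0]

    with open("pydatatable/VERSION", "w") as f:
        f.write(version)
    with open("pydatatable/REVISION", "w") as f:
        f.write(dt.build_info.git_revision)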
--------------------------------------------------------------------------------
/run.conf:
--------------------------------------------------------------------------------
1 | # task, used in init-setup-iteration.R
2 | export RUN_TASKS="groupby join"
3 | # solution, used in init-setup-iteration.R
4 | export RUN_SOLUTIONS="R-arrow collapse datafusion duckdb polars spark "
5 |
6 | # flag to upgrade tools, used in run.sh on init
7 | export DO_UPGRADE=false
8 | # force run, ignore if same version was run already
9 | export FORCE_RUN=true
10 | # do not run benchmarks, only print what would be run and what would be skipped
11 | export MOCKUP=false
12 |
13 | # flag to build reports, used in run.sh before publish
14 | export DO_REPORT=true
15 | # flag to publish, used in run.sh before exit
16 | export DO_PUBLISH=false
17 |
18 | # logging and timing files
19 | export CSV_LOGS_FILE="logs.csv"
20 | export CSV_TIME_FILE="time.csv"
21 |
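Each comment above names where the flag is consumed (run.sh and init-setup-iteration.R, not shown here). The two lists combine as a cross product: every task in RUN_TASKS is attempted for every solution in RUN_SOLUTIONS, and MOCKUP only prints the plan. A hypothetical Python sketch of that loop (the real launcher is the R/shell code the comments point to):

    import os

    tasks = os.environ.get("RUN_TASKS", "").split()
    solutions = os.environ.get("RUN_SOLUTIONS", "").split()
    mockup = os.environ.get("MOCKUP", "false") == "true"

    for task in tasks:
        for solution in solutions:
            if mockup:
                print(f"would run {solution} / {task}")  # MOCKUP=true: report only
            else:
                print(f"run {solution} / {task}")        # placeholder for the real benchmark call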
--------------------------------------------------------------------------------
/spark/VERSION:
--------------------------------------------------------------------------------
1 | 4.0.0
--------------------------------------------------------------------------------
/spark/setup-spark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # install java (Spark 4.x requires Java 17+; JAVA_HOME below points at openjdk-17)
5 | # sudo apt-get install openjdk-17-jdk
6 |
7 | virtualenv spark/py-spark --python=python3
8 |
9 |
10 | # add JAVA_HOME to path.env
11 | echo 'export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64' >> path.env
12 |
13 | source path.env
14 |
15 | source spark/py-spark/bin/activate
16 | # install binaries
17 | python3 -m pip install --upgrade psutil
18 | python3 -m pip install --upgrade pyspark
19 |
20 | # check
21 | # python3
22 | # import pyspark
23 | # pyspark.__version__
24 | # quit()
25 |
26 | deactivate
27 |
28 |
29 | ./spark/ver-spark.sh
--------------------------------------------------------------------------------
/spark/upg-spark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | echo 'upgrading spark...'
5 |
6 | source ./spark/py-spark/bin/activate
7 |
8 | python3 -m pip install --upgrade pyspark > /dev/null
9 |
10 | deactivate
--------------------------------------------------------------------------------
/spark/ver-spark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | source ./spark/py-spark/bin/activate
5 | python3 -c 'import pyspark; open("spark/VERSION","w").write(pyspark.__version__); open("spark/REVISION","w").write("");' > /dev/null
6 |
--------------------------------------------------------------------------------