├── .Rbuildignore ├── .github └── workflows │ └── check-standard.yaml ├── .gitignore ├── .gitmodules ├── DESCRIPTION ├── LICENSE ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── Q.r ├── Q_rows.r ├── RcppExports.R ├── check_args.r ├── chunk.r ├── clustermq-package.r ├── foreach.r ├── master.r ├── pool.r ├── qsys.r ├── qsys_local.r ├── qsys_lsf.r ├── qsys_multicore.r ├── qsys_multiprocess.r ├── qsys_sge.r ├── qsys_slurm.r ├── qsys_ssh.r ├── ssh_proxy.r ├── summarize_result.r ├── util.r ├── work_chunk.r ├── worker.r ├── workers.r └── zzz.r ├── README.md ├── _pkgdown.yml ├── cleanup ├── configure ├── configure.win ├── inst ├── CITATION ├── LSF.tmpl ├── PBS.tmpl ├── SGE.tmpl ├── SLURM.tmpl ├── SSH.tmpl └── TORQUE.tmpl ├── man ├── LOCAL.Rd ├── LSF.Rd ├── MULTICORE.Rd ├── MULTIPROCESS.Rd ├── Pool.Rd ├── Q.Rd ├── QSys.Rd ├── Q_rows.Rd ├── SGE.Rd ├── SLURM.Rd ├── SSH.Rd ├── check_args.Rd ├── chunk.Rd ├── clustermq-package.Rd ├── cmq_foreach.Rd ├── dot-onAttach.Rd ├── dot-onLoad.Rd ├── fill_template.Rd ├── host.Rd ├── master.Rd ├── msg_fmt.Rd ├── register_dopar_cmq.Rd ├── ssh_proxy.Rd ├── summarize_result.Rd ├── vec_lookup.Rd ├── work_chunk.Rd ├── worker.Rd ├── workers.Rd └── wrap_error.Rd ├── src ├── CMQMaster.cpp ├── CMQMaster.h ├── CMQProxy.cpp ├── CMQProxy.h ├── CMQWorker.cpp ├── CMQWorker.h ├── Makevars.in ├── Makevars.win ├── RcppExports.cpp ├── common.cpp ├── common.h ├── util.cpp └── util │ ├── build_libzmq.sh │ ├── patch_libzmq.sh │ ├── test_cpp11.cpp │ └── test_libzmq.c ├── tests ├── bin │ ├── bkill │ ├── bsub │ ├── fake_scheduler.sh │ ├── qdel │ ├── qsub │ ├── sbatch │ └── scancel ├── testthat.R └── testthat │ ├── helper-util.r │ ├── test-0-util.r │ ├── test-1-check_args.r │ ├── test-2-worker.r │ ├── test-3-work_chunk.r │ ├── test-4-pool.r │ ├── test-5-queue.r │ ├── test-6-queue_impl.r │ ├── test-7-ssh_proxy.r │ └── test-8-foreach.r ├── tools └── winlibs.R └── vignettes ├── faq.Rmd ├── technicaldocs.Rmd └── userguide.Rmd /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^doc$ 2 | ^configure.backup$ 3 | ^Meta$ 4 | ^doc(s)?$ 5 | \.gitignore 6 | backup 7 | ^CHANGES\.md$ 8 | ^Makefile$$ 9 | ^\.travis\.yml$ 10 | ^\.travis-ssh\.sh$ 11 | ^CITATION$ 12 | ^_pkgdown\.yml$ 13 | clustermq_[0-9.]+\.tar\.gz 14 | ^\.github$ 15 | ^src/libzmq/\. 16 | ^src/libzmq/config/.*\.m4$ 17 | ^src/libzmq/build_qnx/.*Makefile$ 18 | ^src/libzmq/builds/openwrt/Makefile$ 19 | ^src/libzmq/Makefile$ 20 | ^src/libzmq/CMakeFiles$ 21 | ^src/libzmq/external/wepoll$ 22 | ^src/libzmq/src/tweetnacl.c$ 23 | ^src/cppzmq/\. 24 | ^windows$ 25 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches-ignore: gh-pages 6 | pull_request: 7 | branches-ignore: gh-pages 8 | schedule: 9 | - cron: "0 0 * * 2" 10 | 11 | name: R-check 12 | 13 | jobs: 14 | R-CMD-check: 15 | runs-on: ${{ matrix.config.os }} 16 | 17 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | config: 23 | - {os: windows-latest, r: 'release'} 24 | - {os: windows-latest, r: 'devel'} 25 | - {os: macOS-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 27 | - {os: ubuntu-latest, r: 'release'} 28 | - {os: ubuntu-latest, r: 'oldrel-1'} 29 | 30 | env: 31 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 32 | R_KEEP_PKG_SOURCE: yes 33 | 34 | steps: 35 | - uses: actions/checkout@v3 36 | with: 37 | submodules: recursive 38 | 39 | - uses: r-lib/actions/setup-pandoc@v2 40 | 41 | - uses: r-lib/actions/setup-r@v2 42 | with: 43 | r-version: ${{ matrix.config.r }} 44 | http-user-agent: ${{ 
matrix.config.http-user-agent }} 45 | use-public-rspm: true 46 | 47 | - uses: r-lib/actions/setup-r-dependencies@v2 48 | with: 49 | extra-packages: any::rcmdcheck 50 | needs: check 51 | 52 | - name: Install system dependencies (macOS) 53 | if: runner.os == 'macOS' 54 | run: | 55 | brew update 56 | brew install coreutils automake libtool zeromq 57 | 58 | - name: Set up local key-based SSH 59 | if: runner.os != 'Windows' # GHA does not allow empty passphrase on Windows 60 | run: | 61 | ssh-keygen -t rsa -f ~/.ssh/id_rsa -N "" -q 62 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys 63 | ssh-keyscan -t rsa 127.0.0.1 >> ~/.ssh/known_hosts 64 | echo "Host 127.0.0.1" >> ~/.ssh/config 65 | echo " IdentityFile ~/.ssh/id_rsa" >> ~/.ssh/config 66 | echo "$(hostname) 127.0.0.1" >> ~/.hosts 67 | chmod og-rw ~ 68 | 69 | - name: Install R package and add paths 70 | if: runner.os != 'Windows' 71 | run: | 72 | R CMD INSTALL . 73 | echo '.libPaths("~/work/_temp/Library")' >> ~/.Rprofile # cmq package in R 74 | echo 'options(clustermq.scheduler="multicore")' >> ~/.Rprofile 75 | echo "$(pwd)/tests/bin" >> $GITHUB_PATH # local cmq 76 | sed -i "1iexport PATH=$(pwd)/tests/bin:\$PATH" ~/.bashrc || true # ssh cmq 77 | 78 | - name: Query capabilities 79 | if: runner.os != 'Windows' # does not recognize -e 80 | run: | 81 | set -x 82 | which R 83 | which sbatch || echo "sbatch not found" 84 | ssh 127.0.0.1 'which R; which sbatch; echo $PATH' || true 85 | ssh 127.0.0.1 'R --slave --no-restore -e ".libPaths()"' || true 86 | R --slave --no-restore -e "message(clustermq:::qsys_default)" || true 87 | ssh 127.0.0.1 'R --slave --no-restore -e "message(clustermq:::qsys_default)"' || true 88 | 89 | - name: make test 90 | if: runner.os != 'Windows' 91 | run: | 92 | timeout 300 make test 93 | 94 | - uses: r-lib/actions/check-r-package@v2 95 | with: 96 | upload-snapshots: true 97 | 98 | - name: Print logs if failure 99 | if: failure() && runner.os != 'Windows' 100 | run: | 101 | set -x 102 | cat ~/*.log 
|| true 103 | cat ~/worker.log || true 104 | cat ~/ssh_proxy.log || true 105 | cat clustermq.Rcheck/tests/* || true 106 | 107 | - name: Upload check results 108 | if: failure() 109 | uses: actions/upload-artifact@main 110 | with: 111 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 112 | path: check 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Meta 2 | doc 3 | docs 4 | backup 5 | inst/doc 6 | vignettes/*.R 7 | vignettes/*.md 8 | vignettes/.build.timestamp 9 | src/*.o 10 | src/*.so 11 | *.swp 12 | *.tar.gz 13 | *.log 14 | *.html 15 | clustermq.Rcheck 16 | src/Makevars 17 | .github/*.html 18 | windows 19 | /doc/ 20 | /Meta/ 21 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/libzmq"] 2 | path = src/libzmq 3 | url = https://github.com/zeromq/libzmq.git 4 | [submodule "src/cppzmq"] 5 | path = src/cppzmq 6 | url = https://github.com/zeromq/cppzmq.git 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: clustermq 2 | Title: Evaluate Function Calls on HPC Schedulers (LSF, SGE, SLURM, PBS/Torque) 3 | Version: 0.9.9 4 | Authors@R: c( 5 | person('Michael', 'Schubert', email='mschu.dev@gmail.com', 6 | role = c('aut', 'cre', 'cph'), 7 | comment = c(ORCID='0000-0002-6862-5221')), 8 | person('ZeroMQ authors', 9 | role = c('aut', 'cph'), 10 | comment = "source files in 'src/libzmq' and 'src/cppzmq'")) 11 | Maintainer: Michael Schubert 12 | Description: Evaluate arbitrary function calls using workers on HPC schedulers 13 | in single line of code. All processing is done on the network without 14 | accessing the file system. 
Remote schedulers are supported via SSH. 15 | URL: https://mschubert.github.io/clustermq/ 16 | BugReports: https://github.com/mschubert/clustermq/issues 17 | SystemRequirements: ZeroMQ (libzmq) >= 4.3.0 (optional; otherwise bundled) 18 | Depends: 19 | R (>= 3.6.2) 20 | LinkingTo: Rcpp 21 | Imports: 22 | methods, 23 | globals, 24 | progress, 25 | R6, 26 | Rcpp, 27 | utils 28 | License: Apache License (== 2.0) | file LICENSE 29 | Encoding: UTF-8 30 | Suggests: 31 | BiocParallel, 32 | callr, 33 | devtools, 34 | foreach, 35 | iterators, 36 | knitr, 37 | parallel, 38 | rmarkdown, 39 | roxygen2 (>= 5.0.0), 40 | testthat, 41 | tools 42 | VignetteBuilder: knitr 43 | Roxygen: list(r6 = FALSE) 44 | RoxygenNote: 7.3.2 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: rcpp doc vignettes 3 | 4 | R = R --no-save --no-restore -e 5 | BIN = $(abspath $(lastword $(MAKEFILE_LIST))/../tests/bin) 6 | PKGVER = $(shell grep Version: < DESCRIPTION | sed "s/Version: //") 7 | 8 | .PHONY: test 9 | test: 10 | PATH=$(BIN):$$PATH $(R) "devtools::test()" 11 | 12 | .PHONY: check 13 | check: 14 | PATH=$(BIN):$$PATH $(R) "devtools::check()" 15 | 16 | .PHONY: rcpp 17 | rcpp: 18 | $(R) "Rcpp::compileAttributes()" 19 | 20 | rmd_files=$(wildcard vignettes/*.rmd) 21 | knit_results=$(patsubst vignettes/%.rmd,inst/doc/%.md,$(rmd_files)) 22 | 23 | .PHONY: vignettes 24 | vignettes: inst/doc ${knit_results} 25 | $(R) "library(knitr); library(devtools); build_vignettes()" 26 | 27 | inst/doc: 28 | mkdir -p $@ 29 | 30 | inst/doc/%.md: vignettes/%.rmd 31 | $(R) "knitr::knit('$<', '$@')" 32 | 33 | .PHONY: 
doc 34 | doc: 35 | $(R) "devtools::document()" 36 | 37 | .PHONY: package 38 | package: rcpp doc vignettes 39 | ./src/util/patch_libzmq.sh 40 | PATH=$(BIN):$$PATH R CMD build . 41 | R CMD check --as-cran clustermq_$(PKGVER).tar.gz 42 | 43 | .PHONY: deploy 44 | deploy: 45 | $(R) "pkgdown::deploy_to_branch()" 46 | 47 | .PHONY: clean 48 | clean: 49 | ${RM} -r inst/doc 50 | ${RM} -r man 51 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(Q) 4 | export(Q_rows) 5 | export(register_dopar_cmq) 6 | export(workers) 7 | import(Rcpp) 8 | useDynLib(clustermq) 9 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # clustermq 0.9.9 2 | 3 | * The Windows binary no longer includes the disconnect monitor 4 | * Fix more CRAN warnings and test timeouts 5 | 6 | # clustermq 0.9.8 7 | 8 | * Suppress R6 clonable message 9 | * Fix CRAN warning about `cppzmq` deprecated declaration 10 | 11 | # clustermq 0.9.7 12 | 13 | * Fix a bug where `BiocGenerics` could break template filling (#337) 14 | * Remove deprecated automatic array splitting in `Q` 15 | 16 | # clustermq 0.9.6 17 | 18 | * Large common data size is now reported correctly (#336) 19 | * Template filling will no longer convert large numbers to scientific format 20 | * Common data will no longer be duplicated when sending to workers 21 | 22 | # clustermq 0.9.5 23 | 24 | * Fix a bug where an outdated system `libzmq` led to compilation errors (#327) 25 | * New option `clustermq.ports` specifies eligible port range (#328) @michaelmayer2 26 | 27 | # clustermq 0.9.4 28 | 29 | * Fix a bug where worker stats were shown as `NA` (#325) 30 | * Worker API: `env()` now visibly lists environment if called without arguments 31 | 32 | # 
clustermq 0.9.3 33 | 34 | * Fix a bug where `BiocParallel` did not export required objects (#302) 35 | * Fix a bug where already finished workers were killed (#307) 36 | * Fix a bug where worker results and stats could be garbage collected (#324) 37 | * There is now an FAQ vignette with answers to frequently asked questions 38 | * Worker API: `send()` now reports a call identifier that `current()` tracks 39 | 40 | # clustermq 0.9.2 41 | 42 | * Fix a bug where SSH proxy would not cache data properly (#320) 43 | * Fix a bug where `max_calls_worker` was not respected (#322) 44 | * Local parallelism (`multicore`, `multiprocess`) again uses local IP (#321) 45 | * Worker API: `info()` now also returns current worker and number of calls 46 | 47 | # clustermq 0.9.1 48 | 49 | * Disconnect monitor (libzmq with `-DZMQ_BUILD_DRAFT_API=1`) is now optional (#317) 50 | * Fix a bug where worker shutdown notifications can cause a crash (#306, #308, #310) 51 | * Fix a bug where template values were not filled correctly (#309) 52 | * Fix a bug where using `Rf_error` lead to improper cleanup of resources (#311) 53 | * Fix a bug where maximum worker timeout was multiplied and led to undefined behavior 54 | * Fix a bug where ZeroMQ's `-Werror` flag led to compilation issues on M1 Mac 55 | * Fix a bug where SSH tests could error with timeout on high load 56 | * Worker API: `CMQMaster` now needs to know `add_pending_workers(n)` 57 | * Worker API: status report `info()` now displays properly 58 | 59 | # clustermq 0.9.0 60 | 61 | #### Features 62 | 63 | * Reuse of common data is now supported (#154) 64 | * Jobs now error instead of stalling upon unexpected worker disconnect (#150) 65 | * Workers now error if they can not establish a connection within a time limit 66 | * Error if `n_jobs` and `max_calls_worker` provide insufficient call slots (#258) 67 | * Request 1 GB by default in SGE template (#298) @nickholway 68 | * Error and warning summary now orders by index and severity (#304) 69 | 
* A call can have multiple warnings forwarded, not only last 70 | 71 | #### Bugfix 72 | 73 | * Fix bug where max memory reporting by `gc()` may be in different column (#240) 74 | * Fix passing numerical `job_id` to `qdel` in PBS (#265) 75 | * The job port/id pool is now used properly upon binding failure (#270) @luwidmer 76 | * Common data size warning is now only displayed when exceeding limits (#287) 77 | 78 | #### Internal 79 | 80 | * Complete rewrite of the worker API 81 | * We no longer depend on the `purrr` package 82 | 83 | # clustermq 0.8.95 84 | 85 | * We are now using _ZeroMQ_ via `Rcpp` in preparation for `v0.9` (#151) 86 | * New `multiprocess` backend via `callr` instead of forking (#142, #197) 87 | * Sending data on sockets is now blocking to avoid excess memory usage (#161) 88 | * `multicore`, `multiprocess` schedulers now support logging (#169) 89 | * New option `clustermq.host` can specify host IP or network interface name (#170) 90 | * Template filling will now raise error for missing keys (#174, #198) 91 | * Workers failing with large common data is improved (fixed?) 
(#146, #179, #191) 92 | * Local connections are now routed via `127.0.0.1` instead of `localhost` (#192) 93 | * Submit messages are different between local, multicore and HPC (#196) 94 | * Functions exported by `foreach` now have their environment stripped (#200) 95 | * Deprecation of `log_worker=T/F` argument is rescinded 96 | 97 | # clustermq 0.8.9 98 | 99 | * New option `clustermq.ssh.timeout` for SSH proxy startup (#157) @brendanf 100 | * New option `clustermq.worker.timeout` for delay before worker shutdown (#188) 101 | * Fixed PBS/Torque docs, template and cleanup (#184, #186) @mstr3336 102 | * Warning if common data is very large, set by `clustermq.data.warning` (#189) 103 | 104 | # clustermq 0.8.8 105 | 106 | * `Q`, `Q_rows` have new arguments `verbose` (#111) and `pkgs` (#144) 107 | * `foreach` backend now uses its dedicated API where possible (#143, #144) 108 | * Number and size of objects common to all calls now work properly 109 | * Templates are filled internally and no longer depend on `infuser` package 110 | 111 | # clustermq 0.8.7 112 | 113 | * `Q` now has `max_calls_worker` argument to avoid walltime (#110) 114 | * Submission messages now list size of common data (drake#800) 115 | * All default templates now have an optional `cores` per job field (#123) 116 | * `foreach` now treats `.export` (#124) and `.combine` (#126) correctly 117 | * New option `clustermq.error.timeout` to not wait for clean shutdown (#134) 118 | * SSH command is now specified via a template file (#122) 119 | * SSH will now forward errors to the local process (#135) 120 | * The Wiki is deprecated, use https://mschubert.github.io/clustermq/ instead 121 | 122 | # clustermq 0.8.6 123 | 124 | * Progress bar is now shown before any workers start (#107) 125 | * Socket connections are now authenticated using a session password (#125) 126 | * Marked internal functions with `@keywords internal` 127 | * Added vignettes for the _User Guide_ and _Technical Documentation_ 128 | 129 | # 
clustermq 0.8.5 130 | 131 | * Added experimental support as parallel foreach backend (#83) 132 | * Moved templates to package `inst/` directory (#85) 133 | * Added `send_call` to worker to evaluate arbitrary expressions (drake#501; #86) 134 | * Option `clustermq.scheduler` is now respected if set after package load (#88) 135 | * System interrupts are now handled correctly (rzmq#44; #73, #93, #97) 136 | * Number of workers running/total is now shown in progress bar (#98) 137 | * Unqualified (short) host names are now resolved by default (#104) 138 | 139 | # clustermq 0.8.4 140 | 141 | * Fix error for `qsys$reusable` when using `n_jobs=0`/local processing (#75) 142 | * Scheduler-specific templates are deprecated. Use `clustermq.template` instead 143 | * Allow option `clustermq.defaults` to fill default template values (#71) 144 | * Errors in worker processing are now shut down cleanly (#67) 145 | * Progress bar now shows estimated time remaining (#66) 146 | * Progress bar now also shown when processing locally 147 | * Memory summary now adds estimated memory of R session (#69) 148 | 149 | # clustermq 0.8.3 150 | 151 | * Support `rettype` for function calls where return type is known (#59) 152 | * Reduce memory requirements by processing results when we receive them 153 | * Fix a bug where cleanup, `log_worker` flag were not working for SGE/SLURM 154 | 155 | # clustermq 0.8.2 156 | 157 | * Fix a bug where never-started jobs are not cleaned up 158 | * Fix a bug where tests leave processes if port binding fails (#60) 159 | * Multicore no longer prints worker debug messages (#61) 160 | 161 | # clustermq 0.8.1 162 | 163 | * Fix performance issues for a high number of function calls (#56) 164 | * Fix bug where multicore workers were not shut down properly (#58) 165 | * Fix default templates for SGE, LSF and SLURM (misplaced quote) 166 | 167 | # clustermq 0.8.0 168 | 169 | #### Features 170 | 171 | * Templates changed: `clustermq:::worker` now takes only master as argument 
172 | * Creating `workers` is now separated from `Q`, enabling worker reuse (#45) 173 | * Objects in the function environment must now be `export`ed explicitly (#47) 174 | * Added `multicore` qsys using the `parallel` package (#49) 175 | * New function `Q_rows` using data.frame rows as iterated arguments (#43) 176 | * Job summary will now report max memory as reported by `gc` (#18) 177 | 178 | #### Bugfix 179 | 180 | * Fix a bug where copies of `common_data` are collected by gc too slowly (#19) 181 | 182 | #### Internal 183 | 184 | * Messages on the master are now processed in threads (#42) 185 | * Jobs will now be submitted as array if possible 186 | 187 | # clustermq 0.7.0 188 | 189 | * Initial release on CRAN 190 | -------------------------------------------------------------------------------- /R/Q.r: -------------------------------------------------------------------------------- 1 | #' Queue function calls on the cluster 2 | #' 3 | #' @param fun A function to call 4 | #' @param ... Objects to be iterated in each function call 5 | #' @param const A list of constant arguments passed to each function call 6 | #' @param export List of objects to be exported to the worker 7 | #' @param pkgs Character vector of packages to load on the worker 8 | #' @param seed A seed to set for each function call 9 | #' @param memory Short for `template=list(memory=value)` 10 | #' @param template A named list of values to fill in the scheduler template 11 | #' @param n_jobs The number of jobs to submit; upper limit of jobs if job_size 12 | #' is given as well 13 | #' @param job_size The number of function calls per job 14 | #' @param rettype Return type of function call (vector type or 'list') 15 | #' @param fail_on_error If an error occurs on the workers, continue or fail? 
16 | #' @param workers Optional instance of QSys representing a worker pool 17 | #' @param log_worker Write a log file for each worker 18 | #' @param chunk_size Number of function calls to chunk together 19 | #' defaults to 100 chunks per worker or max. 10 kb per chunk 20 | #' @param timeout Maximum time in seconds to wait for worker (default: Inf) 21 | #' @param max_calls_worker Maxmimum number of chunks that will be sent to one worker 22 | #' @param verbose Print status messages and progress bar (default: TRUE) 23 | #' @return A list of whatever `fun` returned 24 | #' @export 25 | #' 26 | #' @examples 27 | #' \dontrun{ 28 | #' # Run a simple multiplication for numbers 1 to 3 on a worker node 29 | #' fx = function(x) x * 2 30 | #' Q(fx, x=1:3, n_jobs=1) 31 | #' # list(2,4,6) 32 | #' 33 | #' # Run a mutate() call in dplyr on a worker node 34 | #' iris %>% 35 | #' mutate(area = Q(`*`, e1=Sepal.Length, e2=Sepal.Width, n_jobs=1)) 36 | #' # iris with an additional column 'area' 37 | #' } 38 | Q = function(fun, ..., const=list(), export=list(), pkgs=c(), seed=128965, 39 | memory=NULL, template=list(), n_jobs=NULL, job_size=NULL, 40 | rettype="list", fail_on_error=TRUE, workers=NULL, log_worker=FALSE, 41 | chunk_size=NA, timeout=Inf, max_calls_worker=Inf, verbose=TRUE) { 42 | 43 | df = check_args(fun, list(...), const) 44 | 45 | Q_rows(fun = fun, 46 | df = df, 47 | const = const, 48 | export = export, 49 | pkgs = pkgs, 50 | seed = seed, 51 | memory = memory, 52 | template = template, 53 | n_jobs = n_jobs, 54 | job_size = job_size, 55 | rettype = rettype, 56 | fail_on_error = fail_on_error, 57 | workers = workers, 58 | log_worker = log_worker, 59 | chunk_size = chunk_size, 60 | timeout = timeout, 61 | max_calls_worker = max_calls_worker, 62 | verbose = verbose) 63 | } 64 | -------------------------------------------------------------------------------- /R/Q_rows.r: -------------------------------------------------------------------------------- 1 | #' Queue function calls 
defined by rows in a data.frame 2 | #' 3 | #' @param df data.frame with iterated arguments 4 | #' @inheritParams Q 5 | #' @export 6 | #' 7 | #' @examples 8 | #' \dontrun{ 9 | #' # Run a simple multiplication for data frame columns x and y on a worker node 10 | #' fx = function (x, y) x * y 11 | #' df = data.frame(x = 5, y = 10) 12 | #' Q_rows(df, fx, job_size = 1) 13 | #' # [1] 50 14 | #' 15 | #' # Q_rows also matches the names of a data frame with the function arguments 16 | #' fx = function (x, y) x - y 17 | #' df = data.frame(y = 5, x = 10) 18 | #' Q_rows(df, fx, job_size = 1) 19 | #' # [1] 5 20 | #' } 21 | Q_rows = function(df, fun, const=list(), export=list(), pkgs=c(), seed=128965, 22 | memory=NULL, template=list(), n_jobs=NULL, job_size=NULL, 23 | rettype="list", fail_on_error=TRUE, workers=NULL, log_worker=FALSE, 24 | chunk_size=NA, timeout=Inf, max_calls_worker=Inf, verbose=TRUE) { 25 | 26 | # check if call args make sense 27 | if (!is.null(memory)) 28 | template$memory = memory 29 | if (!is.null(template$memory) && template$memory < 50) 30 | stop("Worker needs about 23 MB overhead, set memory>=50") 31 | if (is.na(seed) || length(seed) != 1) 32 | stop("'seed' needs to be a length-1 integer") 33 | 34 | fun = match.fun(fun) 35 | df = as.data.frame(df, check.names=FALSE, stringsAsFactors=FALSE) 36 | n_calls = nrow(df) 37 | seed = as.integer(seed) 38 | check_args(fun, df, const) 39 | 40 | # set up workers if none provided 41 | if (is.null(workers)) { 42 | qsys_id = toupper(getOption("clustermq.scheduler", qsys_default)) 43 | if (!is.null(n_jobs) && n_jobs == 0) 44 | qsys_id = "LOCAL" 45 | if (qsys_id != "LOCAL" && is.null(n_jobs) && is.null(job_size)) 46 | stop("n_jobs or job_size is required") 47 | n_jobs = Reduce(min, c(ceiling(n_calls / job_size), n_jobs, n_calls)) 48 | } else { 49 | qsys_id = class(workers$workers)[1] 50 | n_jobs = Inf #todo: number of workers 51 | } 52 | if (qsys_id != "LOCAL" && n_calls > n_jobs*max_calls_worker) 53 | stop("n_jobs and 
max_calls_worker provide fewer call slots than required") 54 | if (is.null(workers)) 55 | workers = workers(n_jobs, reuse=FALSE, template=template, 56 | log_worker=log_worker, verbose=verbose) 57 | workers$env(fun=fun, rettype=rettype, common_seed=seed, const=const) 58 | workers$pkg(pkgs) 59 | objs = do.call(workers$env, export) 60 | if (!is.null(template$memory) && 2*sum(objs$size)/1024^2 > template$memory) 61 | stop("Not enough memory requested to unserialize data on workers") 62 | 63 | # heuristic for chunk size 64 | if (is.na(chunk_size)) 65 | chunk_size = round(Reduce(min, c( 66 | 500, # never more than 500 67 | n_calls / n_jobs / 100, # each worker reports back 100 times 68 | n_calls / 2000, # at most 2000 reports total 69 | 1e4 * n_calls / utils::object.size(df)[[1]] # no more than 10 kb 70 | ))) 71 | chunk_size = max(chunk_size, 1) 72 | 73 | # process calls 74 | if (inherits(workers$workers, "LOCAL")) { 75 | list2env(export, envir=environment(fun)) 76 | for (pkg in pkgs) # is it possible to attach the package to fun's env? 
77 | library(pkg, character.only=TRUE) 78 | re = work_chunk(df=df, fun=fun, const=const, rettype=rettype, 79 | common_seed=seed, progress=TRUE) 80 | summarize_result(re$result, length(re$errors), length(re$warnings), 81 | re[c("errors", "warnings")], fail_on_error=fail_on_error) 82 | } else { 83 | master(pool=workers, iter=df, rettype=rettype, 84 | fail_on_error=fail_on_error, chunk_size=chunk_size, 85 | timeout=timeout, max_calls_worker=max_calls_worker, 86 | verbose=verbose) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | has_connectivity <- function(host) { 5 | .Call('_clustermq_has_connectivity', PACKAGE = 'clustermq', host) 6 | } 7 | 8 | libzmq_has_draft <- function() { 9 | .Call('_clustermq_libzmq_has_draft', PACKAGE = 'clustermq') 10 | } 11 | 12 | -------------------------------------------------------------------------------- /R/check_args.r: -------------------------------------------------------------------------------- 1 | #' Function to check arguments with which Q() is called 2 | #' 3 | #' @param fun A function to call 4 | #' @param iter Objects to be iterated in each function call 5 | #' @param const A list of constant arguments passed to each function call 6 | #' @return Processed iterated argument list if 'iter' is a list 7 | #' @keywords internal 8 | check_args = function(fun, iter, const=list()) { 9 | if (!is.list(iter) || length(iter) == 0) 10 | stop("'iter' needs to be a list with at least one element") 11 | 12 | # check function and arguments provided 13 | funargs = formals(fun) 14 | required = names(funargs)[unlist(lapply(funargs, function(f) class(f)=='name'))] 15 | required = setdiff(required, "...") 16 | 17 | if (length(iter) == 1 && length(required) == 
1 && is.null(names(iter))) 18 | names(iter) = required 19 | 20 | provided = names(c(iter, const)) 21 | 22 | sdiff = unlist(setdiff(required, provided)) 23 | if (length(sdiff) > 1) 24 | stop(paste("If more than one argument, all must be named:", 25 | paste(sdiff, collapse=" "))) 26 | 27 | sdiff = unlist(setdiff(provided, names(funargs))) 28 | if (length(sdiff) > 0 && ! '...' %in% names(funargs)) 29 | stop(paste("Argument provided but not accepted by function:", 30 | paste(sdiff, collapse=" "))) 31 | 32 | dups = duplicated(provided) 33 | if (any(dups)) 34 | stop(paste("Argument duplicated:", paste(provided[[dups]], collapse=" "))) 35 | 36 | if (!is.data.frame(iter)) { 37 | df = data.frame(..placeholder.. = seq_along(iter[[1]])) 38 | for (field in names(iter)) 39 | df[[field]] = iter[[field]] 40 | df$..placeholder.. = NULL 41 | df 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /R/chunk.r: -------------------------------------------------------------------------------- 1 | #' Subset index chunk for processing 2 | #' 3 | #' 'attr' in `[.data.frame` takes too much CPU time 4 | #' 5 | #' @param x Index data.frame 6 | #' @param i Rows to subset 7 | #' @return x[i,] 8 | #' @keywords internal 9 | chunk = function(x, i) { 10 | re = lapply(x, `[`, i=i) 11 | re$` id ` = i 12 | re 13 | } 14 | -------------------------------------------------------------------------------- /R/clustermq-package.r: -------------------------------------------------------------------------------- 1 | #' Evaluate Function Calls on HPC Schedulers (LSF, SGE, SLURM) 2 | #' 3 | #' Provides the \code{Q} function to send arbitrary function calls to 4 | #' workers on HPC schedulers without relying on network-mounted storage. 5 | #' Allows using remote schedulers via SSH. 
6 | #' 7 | #' Under the hood, this will submit a cluster job that connects to the master 8 | #' via TCP the master will then send the function and argument chunks to the 9 | #' worker and the worker will return the results to the master until everything 10 | #' is done and you get back your result 11 | #' 12 | #' Computations are done entirely on the network and without any temporary 13 | #' files on network-mounted storage, so there is no strain on the file system 14 | #' apart from starting up R once per job. This removes the biggest bottleneck 15 | #' in distributed computing. 16 | #' 17 | #' Using this approach, we can easily do load-balancing, i.e. workers that get 18 | #' their jobs done faster will also receive more function calls to work on. This 19 | #' is especially useful if not all calls return after the same time, or one 20 | #' worker has a high load. 21 | #' 22 | #' For more detailed usage instructions, see the documentation of the \code{Q} 23 | #' function. 24 | #' 25 | #' @docType package 26 | #' @keywords internal 27 | #' @useDynLib clustermq 28 | #' @import Rcpp 29 | "_PACKAGE" 30 | -------------------------------------------------------------------------------- /R/foreach.r: -------------------------------------------------------------------------------- 1 | #' Register clustermq as `foreach` parallel handler 2 | #' 3 | #' @param ... List of arguments passed to the `Q` function, e.g. n_jobs 4 | #' @export 5 | register_dopar_cmq = function(...) { 6 | dots = list(...) 
7 | workers = NA 8 | if ("n_jobs" %in% names(dots)) 9 | workers = dots$n_jobs 10 | info = function(data, item) { 11 | switch(item, 12 | name = "clustermq", 13 | version = utils::packageVersion("clustermq"), 14 | workers = workers) 15 | } 16 | foreach::setDoPar(cmq_foreach, data=dots, info=info) 17 | } 18 | 19 | #' clustermq foreach handler 20 | #' 21 | #' @param obj Returned from foreach::foreach, containing the following variables: 22 | #' args : Arguments passed, each as a call 23 | #' argnames: character vector of arguments passed 24 | #' evalenv : Environment where to evaluate the arguments 25 | #' export : character vector of variable names to export to nodes 26 | #' packages: character vector of required packages 27 | #' verbose : whether to print status messages [logical] 28 | #' errorHandling: string of function name to call error with, e.g. "stop" 29 | #' @param expr An R expression in curly braces 30 | #' @param envir Environment where to evaluate the arguments 31 | #' @param data Common arguments passed by register_dopcar_cmq(), e.g. n_jobs 32 | #' @keywords internal 33 | cmq_foreach = function(obj, expr, envir, data) { 34 | stopifnot(inherits(obj, "foreach")) 35 | stopifnot(inherits(envir, "environment")) 36 | 37 | it = iterators::iter(obj) 38 | args_df = do.call(rbind, as.list(it)) 39 | 40 | # if we call a function by name, add it to the export list 41 | if (is.call(expr) && as.character(expr[[1]]) != "{") 42 | obj$export = c(as.character(expr[[1]]), obj$export) 43 | 44 | # wrap whatever we call in a function for use with Q(...) 45 | fun = function(...) NULL 46 | add = stats::setNames(replicate(ncol(args_df), substitute()), obj$argnames) 47 | formals(fun) = c(add, formals(fun)) 48 | body(fun) = expr 49 | 50 | # scan 'expr' for exports, eval and add objects ref'd in '.export' 51 | globs = globals::globalsOf(expr, envir=envir, mustExist=FALSE) 52 | globs = globs[! 
names(globs) %in% c(names(formals(fun)), ls(baseenv()))] 53 | data$export = utils::modifyList(as.list(data$export), globs, keep.null=TRUE) 54 | 55 | # make sure packages are loaded on the dopar target 56 | if (length(obj$packages) > 0) 57 | data$pkgs = unique(c(data$pkgs, obj$packages)) 58 | 59 | result = do.call(Q_rows, c(list(df=args_df, fun=fun), data)) 60 | 61 | accum = foreach::makeAccum(it) 62 | accum(result, tags=seq_along(result)) 63 | foreach::getResult(it) 64 | } 65 | -------------------------------------------------------------------------------- /R/master.r: -------------------------------------------------------------------------------- 1 | #' Master controlling the workers 2 | #' 3 | #' exchanging messages between the master and workers works the following way: 4 | #' * we have submitted a job where we don't know when it will start up 5 | #' * it starts, sends is a message list(id=0) indicating it is ready 6 | #' * we send it the function definition and common data 7 | #' * we also send it the first data set to work on 8 | #' * when we get any id > 0, it is a result that we store 9 | #' * and send the next data set/index to work on 10 | #' * when computatons are complete, we send id=0 to the worker 11 | #' * it responds with id=-1 (and usage stats) and shuts down 12 | #' 13 | #' @param pool Instance of Pool object 14 | #' @param iter Objects to be iterated in each function call 15 | #' @param rettype Return type of function 16 | #' @param fail_on_error If an error occurs on the workers, continue or fail? 17 | #' @param chunk_size Number of function calls to chunk together 18 | #' defaults to 100 chunks per worker or max. 
500 kb per chunk 19 | #' @param timeout Maximum time in seconds to wait for worker (default: Inf) 20 | #' @param max_calls_worker Maxmimum number of function calls that will be sent to one worker 21 | #' @param verbose Print progress messages 22 | #' @return A list of whatever `fun` returned 23 | #' @keywords internal 24 | master = function(pool, iter, rettype="list", fail_on_error=TRUE, 25 | chunk_size=NA, timeout=Inf, max_calls_worker=Inf, verbose=TRUE) { 26 | # prepare empty variables for managing results 27 | n_calls = nrow(iter) 28 | job_result = rep(vec_lookup[[rettype]], n_calls) 29 | submit_index = 1:chunk_size 30 | jobs_running = 0 31 | cond_msgs = list(warnings=list(), errors=list()) 32 | n_errors = 0 33 | n_warnings = 0 34 | shutdown = FALSE 35 | kill_workers = FALSE 36 | penv = pool$env(work_chunk=work_chunk) 37 | obj_size = structure(sum(penv$size), class="object_size") 38 | obj_size_fmt = format(obj_size, big.mark=",", units="auto") 39 | if (is.infinite(timeout)) { 40 | timeout = -1L 41 | } else { 42 | timeout = timeout * 1000 # Rcpp API uses msec 43 | } 44 | 45 | #TODO: warn before serialization, create pool+env & then submit 46 | if (obj_size/1e6 > getOption("clustermq.data.warning", 500)) 47 | warning("Common data is ", obj_size_fmt, ". 
Recommended limit is ", 48 | getOption("clustermq.data.warning", 500), 49 | " Mb (set by clustermq.data.warning option)", immediate.=TRUE) 50 | 51 | if (!pool$reusable) 52 | on.exit(pool$cleanup()) 53 | 54 | if (verbose) { 55 | message("Running ", format(n_calls, big.mark=",", scientific=FALSE), 56 | " calculations (", nrow(penv), " objs/", obj_size_fmt, 57 | " common; ", chunk_size, " calls/chunk) ...") 58 | pb = progress::progress_bar$new(total = n_calls, 59 | format = "[:bar] :percent (:wup/:wtot wrk) eta: :eta") 60 | pb$tick(0, tokens=list(wtot=pool$workers_total, wup=pool$workers_running)) 61 | } 62 | 63 | # main event loop 64 | while((!shutdown && submit_index[1] <= n_calls) || jobs_running > 0) { 65 | msg = pool$recv(timeout) 66 | if (inherits(msg, "worker_error")) 67 | stop("Worker Error: ", msg) 68 | 69 | if (verbose) 70 | pb$tick(length(msg$result), 71 | tokens=list(wtot=pool$workers_total, wup=pool$workers_running)) 72 | 73 | # process the result data if we got some 74 | if (!is.null(msg$result)) { 75 | call_id = names(msg$result) 76 | jobs_running = jobs_running - length(call_id) 77 | job_result[as.integer(call_id)] = msg$result 78 | 79 | n_warnings = n_warnings + length(msg$warnings) 80 | n_errors = n_errors + length(msg$errors) 81 | if (n_errors > 0 && fail_on_error == TRUE) 82 | shutdown = TRUE 83 | if (length(cond_msgs$warnings) < 50) 84 | cond_msgs$warnings = c(cond_msgs$warnings, msg$warnings) 85 | if (length(cond_msgs$errors) < 50) 86 | cond_msgs$errors = c(cond_msgs$errors, msg$errors) 87 | } 88 | 89 | if (shutdown || pool$current()$calls >= max_calls_worker) { 90 | pool$send_shutdown() 91 | next 92 | } 93 | 94 | if (submit_index[1] <= n_calls) { 95 | # if we have work, send it to the worker 96 | submit_index = submit_index[submit_index <= n_calls] 97 | pool$send(work_chunk(chunk, fun=fun, const=const, rettype=rettype, 98 | common_seed=common_seed), chunk=chunk(iter, submit_index)) 99 | jobs_running = jobs_running + length(submit_index) 100 | 
submit_index = submit_index + chunk_size 101 | 102 | # adapt chunk size towards end of processing 103 | cs = ceiling((n_calls - submit_index[1]) / pool$workers_running) 104 | if (cs < chunk_size) { 105 | chunk_size = max(cs, 1) 106 | submit_index = submit_index[1:chunk_size] 107 | } 108 | 109 | } else if (pool$reusable) { 110 | pool$send_wait() 111 | } else { # or else shut it down 112 | pool$send_shutdown() 113 | } 114 | } 115 | 116 | summarize_result(job_result, n_errors, n_warnings, cond_msgs, 117 | min(submit_index)-1, fail_on_error) 118 | } 119 | -------------------------------------------------------------------------------- /R/pool.r: -------------------------------------------------------------------------------- 1 | loadModule("cmq_master", TRUE) # CMQMaster C++ class 2 | 3 | #' Class for basic queuing system functions 4 | #' 5 | #' Provides the basic functions needed to communicate between machines 6 | #' This should abstract most functions of rZMQ so the scheduler 7 | #' implementations can rely on the higher level functionality 8 | #' 9 | #' @keywords internal 10 | Pool = R6::R6Class("Pool", 11 | public = list( 12 | initialize = function(addr=sample(host()), reuse=TRUE) { 13 | private$master = methods::new(CMQMaster) 14 | # ZeroMQ allows connecting by node name, but binding must be either 15 | # a numerical IP or an interfacet name. 
This is a bit of a hack to 16 | # seem to allow node-name bindings 17 | nodename = Sys.info()["nodename"] 18 | addr = sub(nodename, "*", addr, fixed=TRUE) 19 | bound = private$master$listen(addr) 20 | private$addr = sub("0.0.0.0", nodename, bound, fixed=TRUE) 21 | private$timer = proc.time() 22 | private$reuse = reuse 23 | }, 24 | 25 | print = function() { 26 | cat(sprintf(" worker pool with %i member(s)\n", self$workers$n())) 27 | }, 28 | 29 | info = function() { 30 | info = private$master$list_workers() 31 | times = do.call(rbind, info$time)[,1:3,drop=FALSE] 32 | mem = function(field) sapply(info$mem, function(m) sum(m[,field] * c(56,1))) 33 | do.call(data.frame, c(info[c("worker", "status")], 34 | current=list(info$worker==info$cur), 35 | info["calls"], as.data.frame(times), 36 | list(mem.used=mem("used"), mem.max=mem("max used")))) 37 | }, 38 | current = function() { 39 | private$master$current() 40 | }, 41 | 42 | add = function(qsys, n, ...) { 43 | self$workers = qsys$new(addr=private$addr, master=private$master, n_jobs=n, ...) 44 | }, 45 | 46 | env = function(...) { 47 | args = list(...) 48 | for (name in names(args)) 49 | private$master$add_env(name, args[[name]]) 50 | if (length(args) == 0) 51 | private$master$list_env() 52 | else 53 | invisible(private$master$list_env()) 54 | }, 55 | 56 | pkg = function(...) { 57 | args = as.list(...) 
58 | for (elm in args) 59 | private$master$add_pkg(elm) 60 | }, 61 | 62 | ### START pre-0.9 compatibility functions (deprecated) 63 | set_common_data = function(..., export=list(), pkgs=c(), token="") { 64 | .Deprecated("env") 65 | do.call(self$env, c(list(...), export)) 66 | if (length(pkgs) > 0) 67 | do.call(self$pkg, as.list(pkgs)) 68 | private$token = token 69 | }, 70 | send_common_data = function() { 71 | .Deprecated("handled implicitly") 72 | self$send() 73 | }, 74 | send_shutdown_worker = function() { 75 | .Deprecated("send_shutdown") 76 | self$send_shutdown() 77 | }, 78 | send_call = function(expr, env=list(), ref=substitute(expr)) { 79 | .Deprecated("send") 80 | pcall = quote(substitute(expr)) 81 | do.call(self$send, c(list(cmd=eval(pcall)), env)) 82 | }, 83 | receive_data = function() { 84 | .Deprecated("recv") 85 | rd = self$recv() 86 | list(result=rd, warnings=c(), errors=c(), token=private$token) 87 | }, 88 | ### END pre-0.9 compatibility functions (deprecated) 89 | 90 | send = function(cmd, ...) { 91 | pcall = quote(substitute(cmd)) 92 | cmd = as.expression(do.call(substitute, list(eval(pcall), env=list(...)))) 93 | invisible(private$master$send(cmd)) 94 | }, 95 | send_shutdown = function() { 96 | private$master$send_shutdown() 97 | }, 98 | send_wait = function(wait=50) { 99 | private$master$send(Sys.sleep(wait/1000)) 100 | }, 101 | 102 | recv = function(timeout=-1L) { 103 | private$master$recv(timeout) 104 | }, 105 | 106 | cleanup = function(timeout=5) { 107 | success = private$master$close(as.integer(timeout*1000)) 108 | success = self$workers$cleanup(success, timeout) # timeout left? 
109 | 110 | info = self$info() 111 | max_mem = max(c(info$mem.max+2e8, 0), na.rm=TRUE) # add 200 Mb 112 | max_mem_str = format(structure(max_mem, class="object_size"), units="auto") 113 | 114 | if (nrow(info) > 0) { 115 | wt = lapply(info[c("user.self", "sys.self", "elapsed")], mean, na.rm=TRUE) 116 | } else { 117 | wt = rep(NA, 3) 118 | } 119 | rt = proc.time() - private$timer 120 | rt3_fmt = difftime(as.POSIXct(rt[[3]], origin="1970-01-01"), 121 | as.POSIXct(0, origin="1970-01-01"), units="auto") 122 | rt3_str = sprintf("%.1f %s", rt3_fmt, attr(rt3_fmt, "units")) 123 | 124 | fmt = "Master: [%s %.1f%% CPU]; Worker: [avg %.1f%% CPU, max %s]" 125 | message(sprintf(fmt, rt3_str, 100*(rt[[1]]+rt[[2]])/rt[[3]], 126 | 100*(wt[[1]]+wt[[2]])/wt[[3]], max_mem_str)) 127 | 128 | invisible(success) 129 | }, 130 | 131 | workers = NULL 132 | ), 133 | 134 | active = list( 135 | workers_total = function() private$master$workers_total(), 136 | workers_running = function() private$master$workers_running(), 137 | reusable = function() private$reuse 138 | ), 139 | 140 | private = list( 141 | token = NULL, ### pre-0.9 compatibility functions (deprecated) 142 | 143 | master = NULL, 144 | addr = NULL, 145 | timer = NULL, 146 | reuse = NULL, 147 | 148 | finalize = function() { 149 | private$master$close(0L) 150 | } 151 | ), 152 | 153 | cloneable = FALSE 154 | ) 155 | -------------------------------------------------------------------------------- /R/qsys.r: -------------------------------------------------------------------------------- 1 | loadModule("cmq_master", TRUE) # CMQMaster C++ class 2 | 3 | #' Class for basic queuing system functions 4 | #' 5 | #' Provides the basic functions needed to communicate between machines 6 | #' This should abstract most functions of rZMQ so the scheduler 7 | #' implementations can rely on the higher level functionality 8 | #' 9 | #' @keywords internal 10 | QSys = R6::R6Class("QSys", 11 | public = list( 12 | # Create a class instance 13 | # 14 | # 
Initializes ZeroMQ and sets up our primary communication socket
{ 52 | values = utils::modifyList(private$defaults, list(...)) 53 | values$master = private$addr 54 | if (grepl("CMQ_AUTH", private$template)) { 55 | # note: auth will be obligatory in the future and this check will 56 | # be removed (i.e., filling will fail if no field in template) 57 | values$auth = paste(sample(letters, 5, TRUE), collapse="") 58 | } else { 59 | values$auth = NULL 60 | warning("Add 'CMQ_AUTH={{ auth }}' to template to enable socket authentication", 61 | immediate.=TRUE) 62 | } 63 | if (!"job_name" %in% names(values)) 64 | values$job_name = paste0("cmq", private$port) 65 | private$workers_total = values$n_jobs 66 | values 67 | }, 68 | 69 | template_error = function(scheduler, status, filled) { 70 | message("\nThe filled ", scheduler, " template ", sQuote(private$template_file), 71 | " was:\n", '"""', "\n", filled, '"""', "\n") 72 | message("see: https://mschubert.github.io/clustermq/articles/userguide.html#scheduler-setup\n") 73 | stop("Job submission failed with error code ", status, call.=FALSE) 74 | } 75 | ), 76 | 77 | cloneable = FALSE 78 | ) 79 | -------------------------------------------------------------------------------- /R/qsys_local.r: -------------------------------------------------------------------------------- 1 | #' Placeholder for local processing 2 | #' 3 | #' Mainly so tests pass without setting up a scheduler 4 | #' 5 | #' @keywords internal 6 | LOCAL = R6::R6Class("LOCAL", 7 | inherit = QSys, 8 | 9 | public = list( 10 | initialize = function(addr="unused", n_jobs=0, master=NULL, ..., 11 | log_worker=FALSE, log_file=NULL, verbose=TRUE) { 12 | super$initialize(addr=addr, master=master) 13 | if (verbose) 14 | message("Running sequentially ('LOCAL') ...") 15 | private$is_cleaned_up = TRUE 16 | } 17 | ), 18 | 19 | cloneable = FALSE 20 | ) 21 | -------------------------------------------------------------------------------- /R/qsys_lsf.r: -------------------------------------------------------------------------------- 1 | #' LSF 
scheduler functions 2 | #' 3 | #' Derives from QSys to provide LSF-specific functions 4 | #' 5 | #' @keywords internal 6 | LSF = R6::R6Class("LSF", 7 | inherit = QSys, 8 | 9 | public = list( 10 | initialize = function(addr, n_jobs, master, ..., template=getOption("clustermq.template", "LSF"), 11 | log_worker=FALSE, log_file=NULL, verbose=TRUE) { 12 | super$initialize(addr=addr, master=master, template=template) 13 | 14 | opts = private$fill_options(n_jobs=n_jobs, ...) 15 | private$job_id = opts$job_name 16 | if (!is.null(opts$log_file)) 17 | opts$log_file = normalizePath(opts$log_file, mustWork=FALSE) 18 | else if (log_worker) 19 | opts$log_file = paste0(private$job_id, "-%I.log") 20 | filled = fill_template(private$template, opts, 21 | required=c("master", "job_name", "n_jobs")) 22 | 23 | if (verbose) 24 | message("Submitting ", n_jobs, " worker jobs to ", class(self)[1], 25 | " as ", sQuote(private$job_id), " ...") 26 | 27 | status = system("bsub", input=filled, ignore.stdout=TRUE) 28 | if (status != 0) 29 | private$template_error("LSF", status, filled) 30 | private$master$add_pending_workers(n_jobs) 31 | private$is_cleaned_up = FALSE 32 | }, 33 | 34 | cleanup = function(success, timeout) { 35 | private$is_cleaned_up = success 36 | private$finalize() 37 | } 38 | ), 39 | 40 | private = list( 41 | job_id = NULL, 42 | 43 | finalize = function(quiet=self$workers_running == 0) { 44 | quiet = FALSE #TODO: 45 | if (!private$is_cleaned_up) { 46 | system(paste("bkill -J", private$job_id), 47 | ignore.stdout=quiet, ignore.stderr=quiet, wait=FALSE) 48 | } 49 | private$is_cleaned_up = TRUE 50 | } 51 | ), 52 | 53 | cloneable = FALSE 54 | ) 55 | -------------------------------------------------------------------------------- /R/qsys_multicore.r: -------------------------------------------------------------------------------- 1 | #' Process on multiple cores on one machine 2 | #' 3 | #' Derives from QSys to provide multicore-specific functions 4 | #' 5 | #' @keywords internal 6 
| MULTICORE = R6::R6Class("MULTICORE", 7 | inherit = QSys, 8 | 9 | public = list( 10 | initialize = function(addr, n_jobs, master, ..., log_worker=FALSE, log_file=NULL, verbose=TRUE) { 11 | addr = sub(Sys.info()["nodename"], "127.0.0.1", addr, fixed=TRUE) 12 | super$initialize(addr=addr, master=master) 13 | if (verbose) 14 | message("Starting ", n_jobs, " cores ...") 15 | if (log_worker && is.null(log_file)) 16 | log_file = sprintf("cmq%i-%%i.log", private$port) 17 | 18 | for (i in seq_len(n_jobs)) { 19 | if (is.character(log_file)) 20 | log_i = suppressWarnings(sprintf(log_file, i)) 21 | else 22 | log_i = nullfile() 23 | wrapper = function(m, logfile) { 24 | fout = file(logfile, open="wt") 25 | sink(file=fout, type="output") 26 | sink(file=fout, type="message") 27 | on.exit({ sink(type="message"); sink(type="output"); close(fout) }) 28 | clustermq:::worker(m) 29 | } 30 | p = parallel::mcparallel(quote(wrapper(private$addr, log_i))) 31 | private$children[[as.character(p$pid)]] = p 32 | } 33 | private$master$add_pending_workers(n_jobs) 34 | private$workers_total = n_jobs 35 | private$is_cleaned_up = FALSE 36 | }, 37 | 38 | cleanup = function(success, timeout=5L) { 39 | private$is_cleaned_up = success 40 | private$collect_children(wait=FALSE, timeout=timeout) 41 | private$finalize() 42 | } 43 | ), 44 | 45 | private = list( 46 | collect_children = function(...) 
{ 47 | pids = as.integer(names(private$children)) 48 | res = suppressWarnings(parallel::mccollect(pids, ...)) 49 | finished = intersect(names(private$children), names(res)) 50 | private$children[finished] = NULL 51 | }, 52 | 53 | children = list(), 54 | 55 | finalize = function(quiet=FALSE) { 56 | if (!private$is_cleaned_up) { 57 | private$collect_children(wait=FALSE, timeout=0) 58 | running = names(private$children) 59 | if (length(running) > 0) { 60 | if (!quiet) 61 | warning("Unclean shutdown for PIDs: ", 62 | paste(running, collapse=", "), 63 | immediate.=TRUE) 64 | tools::pskill(running, tools::SIGKILL) 65 | } 66 | private$children = list() 67 | } 68 | private$is_cleaned_up = TRUE 69 | } 70 | ), 71 | 72 | cloneable = FALSE 73 | ) 74 | -------------------------------------------------------------------------------- /R/qsys_multiprocess.r: -------------------------------------------------------------------------------- 1 | #' Process on multiple processes on one machine 2 | #' 3 | #' Derives from QSys to provide callr-specific functions 4 | #' 5 | #' @keywords internal 6 | MULTIPROCESS = R6::R6Class("MULTIPROCESS", 7 | inherit = QSys, 8 | 9 | public = list( 10 | initialize = function(addr, n_jobs, master, ..., log_worker=FALSE, log_file=NULL, verbose=TRUE) { 11 | if (! 
requireNamespace("callr", quietly=TRUE)) 12 | stop("The ", sQuote(callr), " package is required for ", sQuote("multiprocess")) 13 | addr = sub(Sys.info()["nodename"], "127.0.0.1", addr, fixed=TRUE) 14 | super$initialize(addr=addr, master=master) 15 | 16 | if (verbose) 17 | message("Starting ", n_jobs, " processes ...") 18 | 19 | if (log_worker && is.null(log_file)) 20 | log_file = sprintf("cmq%i-%%i.log", private$port) 21 | 22 | for (i in seq_len(n_jobs)) { 23 | if (is.character(log_file)) 24 | log_i = suppressWarnings(sprintf(log_file, i)) 25 | else 26 | log_i = nullfile() 27 | cr = callr::r_bg(function(m) clustermq:::worker(m), 28 | args=list(m=private$addr), 29 | stdout=log_i, stderr=log_i) 30 | private$callr[[as.character(cr$get_pid())]] = cr 31 | } 32 | private$master$add_pending_workers(n_jobs) 33 | private$workers_total = n_jobs 34 | private$is_cleaned_up = FALSE 35 | }, 36 | 37 | cleanup = function(success, timeout) { 38 | dead_workers = sapply(private$callr, function(x) ! x$is_alive()) 39 | if (length(dead_workers) > 0) 40 | private$callr[dead_workers] = NULL 41 | else 42 | private$is_cleaned_up = TRUE 43 | private$is_cleaned_up 44 | } 45 | ), 46 | 47 | private = list( 48 | callr = list(), 49 | 50 | finalize = function(quiet=FALSE) { 51 | if (!private$is_cleaned_up) { 52 | dead_workers = sapply(private$callr, function(x) ! 
#' SGE scheduler functions
#'
#' Derives from QSys to provide SGE-specific functions
#'
#' @keywords internal
SGE = R6::R6Class("SGE",
    inherit = QSys,

    public = list(
        # Fill the SGE template and submit the worker array via 'qsub'
        #
        # @param addr        Master address the workers connect back to
        # @param n_jobs      Number of array tasks to submit
        # @param master      The master (or proxy) object
        # @param ...         Additional values filled into the template
        # @param template    Submission template (option 'clustermq.template')
        # @param log_worker  Whether to write a log file per worker
        # @param log_file    Explicit log file path (takes precedence)
        # @param verbose     Whether to print a submission message
        initialize = function(addr, n_jobs, master, ..., template=getOption("clustermq.template", "SGE"),
                              log_worker=FALSE, log_file=NULL, verbose=TRUE) {
            super$initialize(addr=addr, master=master, template=template)

            opts = private$fill_options(n_jobs=n_jobs, ...)
            private$job_name = opts$job_name
            if (!is.null(opts$log_file))
                opts$log_file = normalizePath(opts$log_file, mustWork=FALSE)
            else if (log_worker)
                # array_idx is the scheduler's array-index shell variable
                opts$log_file = sprintf("%s-%s.log", private$job_name, private$array_idx)
            filled = fill_template(private$template, opts, required=c("master", "n_jobs"))

            if (verbose)
                # use job_name here: job_id is only assigned after submission
                # below, so quoting job_id at this point printed no name at all
                message("Submitting ", n_jobs, " worker jobs to ", class(self)[1],
                        " as ", sQuote(private$job_name), " ...")

            # keep qsub's stdout: the PBS subclass parses its job id from it
            private$qsub_stdout = system2("qsub", input=filled, stdout=TRUE)
            status = attr(private$qsub_stdout, "status")
            if (!is.null(status) && status != 0)
                private$template_error("SGE", status, filled)
            private$job_id = private$job_name
            private$master$add_pending_workers(n_jobs)
            private$is_cleaned_up = FALSE
        },

        # Record success and cancel any remaining jobs
        cleanup = function(success, timeout) {
            private$is_cleaned_up = success
            private$finalize()
        }
    ),

    private = list(
        qsub_stdout = NULL,     # raw qsub output (consumed by PBS subclass)
        job_name = NULL,        # name the array was submitted under
        job_id = NULL,          # id passed to 'qdel' (name for SGE; qsub output for PBS)
        array_idx = "$TASK_ID", # shell variable holding the array task index

        finalize = function(quiet = TRUE) { # self$workers_running == 0
            if (!private$is_cleaned_up) {
                # best-effort cancellation; do not block shutdown on qdel
                system(paste("qdel", private$job_id),
                       ignore.stdout=quiet, ignore.stderr=quiet, wait=FALSE)
            }
            private$is_cleaned_up = TRUE
        }
    ),

    cloneable = FALSE
)
#' SLURM scheduler functions
#'
#' Derives from QSys to provide SLURM-specific functions
#'
#' @keywords internal
SLURM = R6::R6Class("SLURM",
    inherit = QSys,

    public = list(
        # Fill the SLURM template and submit the worker array via 'sbatch'
        #
        # @param addr        Master address the workers connect back to
        # @param n_jobs      Number of array jobs to submit
        # @param master      The master (or proxy) object
        # @param ...         Additional values filled into the template
        # @param template    Submission template (option 'clustermq.template')
        # @param log_worker  Whether to write a log file per worker
        # @param verbose     Whether to print a submission message
        initialize = function(addr, n_jobs, master, ..., template=getOption("clustermq.template", "SLURM"),
                              log_worker=FALSE, verbose=TRUE) {
            super$initialize(addr=addr, master=master, template=template)

            opts = private$fill_options(n_jobs=n_jobs, ...)
            private$job_id = opts$job_name
            if (!is.null(opts$log_file))
                opts$log_file = normalizePath(opts$log_file, mustWork=FALSE)
            else if (log_worker)
                # "%a" is expanded by SLURM to the array task id per worker
                opts$log_file = paste0(private$job_id, "-%a.log")
            filled = fill_template(private$template, opts,
                                   required=c("master", "job_name", "n_jobs"))

            if (verbose)
                message("Submitting ", n_jobs, " worker jobs to ", class(self)[1],
                        " as ", sQuote(private$job_id), " ...")

            # the filled template is fed to sbatch via its standard input
            status = system("sbatch", input=filled, ignore.stdout=TRUE)
            if (status != 0)
                private$template_error("SLURM", status, filled)
            private$master$add_pending_workers(n_jobs)
            private$is_cleaned_up = FALSE
        },

        # Record success and cancel any remaining jobs
        cleanup = function(success, timeout) {
            private$is_cleaned_up = success
            private$finalize()
        }
    ),

    private = list(
        job_id = NULL,  # job name used for submission and for 'scancel --name'

        finalize = function(quiet = TRUE) { # self$workers_running == 0
            if (!private$is_cleaned_up) {
                # cancel all remaining jobs of this submission by name,
                # without blocking shutdown on scancel
                system(paste("scancel --name", private$job_id),
                       ignore.stdout=quiet, ignore.stderr=quiet, wait=FALSE)
            }
            private$is_cleaned_up = TRUE
        }
    ),

    cloneable = FALSE
)
        # Establish the SSH tunnel and start the remote proxy
        #
        # Builds an ssh command from the (whitespace-collapsed) template, runs
        # it to set up the reverse port forward and launch the remote R proxy,
        # then forwards the submission arguments to that proxy.
        #
        # @param addr      Master address; must use the tcp:// transport
        # @param n_jobs    Number of jobs the remote scheduler should submit
        # @param ...       Arguments forwarded to the remote qsys constructor
        # @param master    The master object (provides proxy_submit_cmd)
        # @param ssh_host  Remote host (option 'clustermq.ssh.host'), required
        # @param ssh_log   Remote log target (option 'clustermq.ssh.log')
        # @param template  SSH command template (option 'clustermq.template')
        # @param verbose   Not referenced in this method body
        initialize = function(addr, n_jobs, ..., master,
                              ssh_host = getOption("clustermq.ssh.host"),
                              ssh_log = getOption("clustermq.ssh.log"),
                              template = getOption("clustermq.template", "SSH"),
                              verbose = TRUE) {
            if (is.null(ssh_host))
                stop("Option 'clustermq.ssh.host' required for SSH but not set")
            if (!grepl("^tcp://", addr))
                stop("SSH QSys must connect via tcp:// not ", sQuote(addr))

            super$initialize(addr=addr, master=master, template=template)
            # flatten the multi-line template into a single shell command
            private$template = paste(trimws(readLines(textConnection(private$template))), collapse=" ")

            # set forward and run ssh.r (send port, master)
            opts = private$fill_options(ssh_log=ssh_log, ssh_host=ssh_host)
            ssh_cmd = fill_template(private$template, opts,
                                    required=c("local_port", "ssh.hpc_fwd_port", "ssh_host"))

            # wait for ssh to connect
            # NOTE(review): wait=TRUE assumes the templated ssh command
            # backgrounds itself once the tunnel is up -- TODO confirm template
            message(sprintf("Connecting to %s via SSH ...", sQuote(ssh_host)))
            system(ssh_cmd, wait=TRUE, ignore.stdout=TRUE, ignore.stderr=TRUE)

            master$add_pending_workers(n_jobs)
            args = c(list(...), list(n_jobs=n_jobs))
            init_timeout = getOption("clustermq.ssh.timeout", 10)
            # send submit args to the remote proxy; give a clearer error if
            # the remote R process never answers within the timeout
            tryCatch(private$master$proxy_submit_cmd(args, init_timeout*1000),
                error = function(e) {
                    if (grepl("timed out", conditionMessage(e))) {
                        stop("Remote R process did not respond after ",
                             init_timeout, " seconds. Check your SSH server log.")
                    } else stop(e)
                })

            private$workers_total = args$n_jobs
        },
        # Tear down proxy state
        #
        # The explicit PROXY_STOP shutdown message is currently disabled (kept
        # below as commented-out code); this method only flips the local flag,
        # making repeated finalization a no-op.
        finalize = function(quiet = self$workers_running == 0) {
            # if (private$ssh_proxy_running) {
            #     private$zmq$send(
            #         list(id="PROXY_STOP", finalize=!private$is_cleaned_up),
            #         "proxy"
            #     )
            # }
            private$ssh_proxy_running = FALSE
        }
#' Print a summary of errors and warnings that occurred during processing
#'
#' Messages are reported in call order (errors first, then warnings, at most
#' 50 in total). Raises an error if any call failed and `fail_on_error` is
#' set, a warning otherwise.
#'
#' @param result A list or vector of the processing result
#' @param n_errors How many errors occurred
#' @param n_warnings How many warnings occurred
#' @param cond_msgs List with `errors` and `warnings` messages per call id
#' @param at How many calls were processed up to this point
#' @param fail_on_error Stop if error(s) occurred
#' @return The processing result with names stripped
#' @keywords internal
summarize_result = function(result, n_errors, n_warnings,
        cond_msgs, at=length(result), fail_on_error=TRUE) {

    # order each message set by its integer call id, then cap at 50 shown
    by_call_id = function(msgs) msgs[order(as.integer(names(msgs)))]
    shown = utils::head(c(by_call_id(cond_msgs$errors),
                          by_call_id(cond_msgs$warnings)), 50)
    detail = paste(unlist(shown), collapse="\n")

    if (n_errors > 0) {
        msg = sprintf("%i/%i jobs failed (%i warnings)", n_errors, at, n_warnings)
        if (fail_on_error)
            stop(msg, ". Stopping.\n", detail, call.=FALSE)
        warning(msg, "\n", detail, immediate.=TRUE, call.=FALSE)
    } else if (n_warnings > 0) {
        warning(sprintf("%i warnings occurred in processing\n", n_warnings),
                detail, immediate.=TRUE, call.=FALSE)
    }
    unname(result)
}
#' Wraps an error in a condition object
#'
#' Builds a condition of class "worker_error" carrying the most recent error
#' message (from geterrmessage()) together with the supplied call.
#'
#' @param call The call to attach to the condition
#' @return A condition with classes "worker_error" and "condition"
#' @keywords internal
wrap_error = function(call) {
    payload = list(message = geterrmessage(), call = call)
    structure(payload, class = c("worker_error", "condition"))
}
#' Function to process a chunk of calls
#'
#' Each chunk comes encapsulated in a data.frame
#'
#' @param df A data.frame with call IDs as rownames and arguments as columns
#' @param fun The function to call
#' @param const Constant arguments passed to each call
#' @param rettype Return type of function
#' @param common_seed A seed offset common to all function calls
#' @param progress Logical indicating whether to display a progress bar
#' @return A list with `result` (per-call results, named by call id),
#'   `warnings` and `errors` (messages collected per call id)
#' @keywords internal
work_chunk = function(df, fun, const=list(), rettype="list",
                      common_seed=NULL, progress=FALSE) {
    # shared environment collecting warning/error messages keyed by call id
    context = new.env()
    context$warnings = list()
    context$errors = list()
    if (progress) {
        pb = progress::progress_bar$new(total = nrow(df),
                format = "[:bar] :percent eta: :eta")
        pb$tick(0)
    }

    # Wrapper around 'fun': seeds the RNG per call, records warnings/errors
    # under the call id, and turns errors into non-fatal results. The ' id '
    # and ' seed ' argument names contain spaces, presumably so they cannot
    # clash with user-supplied argument columns -- TODO confirm.
    fwrap = function(..., ` id `, ` seed `=NA) {
        chr_id = as.character(` id `)
        if (!is.na(` seed `))
            set.seed(` seed `)

        result = withCallingHandlers(
            withRestarts(
                do.call(fun, c(list(...), const)),
                # target of the error handler below: for list return types
                # the message is wrapped as an 'error' object, otherwise the
                # restart yields NULL (invisible in the unlisted result)
                muffleStop = function(e) if (rettype == "list")
                    structure(e, class="error")
            ),
            warning = function(w) {
                wmsg = paste0("(#", chr_id, ") ", conditionMessage(w))
                context$warnings[[chr_id]] = c(context$warnings[[chr_id]], wmsg)
                invokeRestart("muffleWarning")
            },
            error = function(e) {
                emsg = paste0("(Error #", chr_id, ") ", conditionMessage(e))
                context$errors[[chr_id]] = emsg
                # jump to the muffleStop restart instead of propagating
                invokeRestart("muffleStop", emsg)
            }
        )

        if (progress)
            pb$tick()
        result
    }

    # default call ids: positional index within this chunk
    if (is.null(df$` id `))
        df$` id ` = seq_along(df[[1]])

    # derive a deterministic per-call seed from the call id and common seed
    if (!is.null(common_seed))
        df$` seed ` = as.integer(df$` id ` %% .Machine$integer.max) - common_seed

    # evaluate fwrap row-wise over df; name results by call id
    re = stats::setNames(.mapply(fwrap, df, NULL), df$` id `)
    if (rettype != "list")
        re = unlist(re)
    list(result = re, warnings = context$warnings, errors = context$errors)
}
#' Creates a pool of workers
#'
#' @param n_jobs Number of jobs to submit (0 implies local processing)
#' @param data Set common data (function, constant args, seed)
#' @param reuse Whether workers are reusable or get shut down after call
#' @param template A named list of values to fill in template
#' @param log_worker Write a log file for each worker
#' @param qsys_id Character string of QSys class to use
#' @param verbose Print message about worker startup
#' @param ... Additional arguments passed to the qsys constructor
#' @return An instance of the QSys class
#' @export
workers = function(n_jobs, data=NULL, reuse=TRUE, template=list(), log_worker=FALSE,
                   qsys_id=getOption("clustermq.scheduler", qsys_default),
                   verbose=FALSE, ...) {
    # NOTE(review): 'data' is accepted but not referenced in this body

    # zero jobs means everything is processed in the local session
    if (n_jobs == 0)
        qsys_id = "LOCAL"

    # release stale zmq handles before opening new sockets
    # (see zeromq/libzmq issue #1108)
    gc()

    # look up the scheduler class (SGE, SLURM, ...) in the package namespace
    scheduler = get(toupper(qsys_id), envir=parent.env(environment()))

    pool = Pool$new(reuse=reuse)
    add_args = c(list(qsys=scheduler, n=n_jobs, log_worker=log_worker,
                      verbose=verbose), template, list(...))
    do.call(pool$add, add_args)
    pool
}
-------------------------------------------------------------------------------- 1 | ClusterMQ: send R function calls as cluster jobs 2 | ================================================ 3 | 4 | [![CRAN version](https://www.r-pkg.org/badges/version/clustermq)](https://cran.r-project.org/package=clustermq) 5 | [![Build Status](https://github.com/mschubert/clustermq/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/mschubert/clustermq/actions) 6 | [![CRAN downloads](https://cranlogs.r-pkg.org/badges/clustermq)](https://cran.r-project.org/package=clustermq) 7 | [![DOI](https://zenodo.org/badge/DOI/10.1093/bioinformatics/btz284.svg)](https://doi.org/10.1093/bioinformatics/btz284) 8 | 9 | This package will allow you to send function calls as jobs on a computing 10 | cluster with a minimal interface provided by the `Q` function: 11 | 12 | ```r 13 | # install the package if you haven't done so yet 14 | install.packages('clustermq') 15 | 16 | # load the library and create a simple function 17 | library(clustermq) 18 | fx = function(x) x * 2 19 | 20 | # queue the function call on your scheduler 21 | Q(fx, x=1:3, n_jobs=1) 22 | # list(2,4,6) 23 | ``` 24 | 25 | Computations are done [entirely on the network](https://zeromq.org/) 26 | and without any temporary files on network-mounted storage, so there is no 27 | strain on the file system apart from starting up R once per job. All 28 | calculations are load-balanced, i.e. workers that get their jobs done faster 29 | will also receive more function calls to work on. This is especially useful if 30 | not all calls return after the same time, or one worker has a high load. 
31 | 32 | Browse the vignettes here: 33 | 34 | * [User Guide](https://mschubert.github.io/clustermq/articles/userguide.html) 35 | * [Technical Documentation](https://mschubert.github.io/clustermq/articles/technicaldocs.html) 36 | * [FAQ](https://mschubert.github.io/clustermq/articles/faq.html) 37 | 38 | Schedulers 39 | ---------- 40 | 41 | An HPC cluster's scheduler ensures that computing jobs are distributed to 42 | available worker nodes. Hence, this is what clustermq interfaces with in order 43 | to do computations. 44 | 45 | We currently support the [following 46 | schedulers](https://mschubert.github.io/clustermq/articles/userguide.html#configuration) 47 | (either locally or via SSH): 48 | 49 | * [Multiprocess](https://mschubert.github.io/clustermq/articles/userguide.html#local-parallelization) - 50 | *test your calls and parallelize on cores using* `options(clustermq.scheduler="multiprocess")` 51 | * [SLURM](https://mschubert.github.io/clustermq/articles/userguide.html#slurm) - *should work without setup* 52 | * [LSF](https://mschubert.github.io/clustermq/articles/userguide.html#lsf) - *should work without setup* 53 | * [SGE](https://mschubert.github.io/clustermq/articles/userguide.html#sge) - *may require configuration* 54 | * [PBS](https://mschubert.github.io/clustermq/articles/userguide.html#pbs)/[Torque](https://mschubert.github.io/clustermq/articles/userguide.html#torque) - *needs* `options(clustermq.scheduler="PBS"/"Torque")` 55 | * via [SSH](https://mschubert.github.io/clustermq/articles/userguide.html#ssh-connector) - 56 | *needs* `options(clustermq.scheduler="ssh", clustermq.ssh.host=)` 57 | 58 | > [!TIP] 59 | > Follow the links above to configure your scheduler in case it is not working 60 | > out of the box and check the 61 | > [FAQ](https://mschubert.github.io/clustermq/articles/faq.html) if 62 | > your job submission errors or gets stuck 63 | 64 | Usage 65 | ----- 66 | 67 | The most common arguments for `Q` are: 68 | 69 | * `fun` - The function 
to call. This needs to be self-sufficient (because it 70 | will not have access to the `master` environment) 71 | * `...` - All iterated arguments passed to the function. If there is more than 72 | one, all of them need to be named 73 | * `const` - A named list of non-iterated arguments passed to `fun` 74 | * `export` - A named list of objects to export to the worker environment 75 | 76 | The documentation for other arguments can be accessed by typing `?Q`. Examples 77 | of using `const` and `export` would be: 78 | 79 | ```r 80 | # adding a constant argument 81 | fx = function(x, y) x * 2 + y 82 | Q(fx, x=1:3, const=list(y=10), n_jobs=1) 83 | 84 | # exporting an object to workers 85 | fx = function(x) x * 2 + y 86 | Q(fx, x=1:3, export=list(y=10), n_jobs=1) 87 | ``` 88 | 89 | We can also use `clustermq` as a parallel backend in 90 | [`foreach`](https://cran.r-project.org/package=foreach) or 91 | [`BiocParallel`](https://bioconductor.org/packages/release/bioc/html/BiocParallel.html): 92 | 93 | ```r 94 | # using foreach 95 | library(foreach) 96 | register_dopar_cmq(n_jobs=2, memory=1024) # see `?workers` for arguments 97 | foreach(i=1:3) %dopar% sqrt(i) # this will be executed as jobs 98 | 99 | # using BiocParallel 100 | library(BiocParallel) 101 | register(DoparParam()) # after register_dopar_cmq(...) 102 | bplapply(1:3, sqrt) 103 | ``` 104 | 105 | More examples are available in [the 106 | User Guide](https://mschubert.github.io/clustermq/articles/userguide.html). 107 | 108 | Comparison to other packages 109 | ---------------------------- 110 | 111 | There are some packages that provide high-level parallelization of R function calls 112 | on a computing cluster. We compared `clustermq` to `BatchJobs` and `batchtools` for 113 | processing many short-running jobs, and found it to have approximately 1000x less 114 | overhead cost. 
115 | 116 | ![Overhead comparison](http://image.ibb.co/cRgYNR/plot.png) 117 | 118 | In short, use `clustermq` if you want: 119 | 120 | * a one-line solution to run cluster jobs with minimal setup 121 | * access cluster functions from your local Rstudio via SSH 122 | * fast processing of many function calls without network storage I/O 123 | 124 | Use [`batchtools`](https://github.com/mllg/batchtools) if you: 125 | 126 | * want to use a mature and well-tested package 127 | * don't mind that arguments to every call are written to/read from disc 128 | * don't mind there's no load-balancing at run-time 129 | 130 | Use [Snakemake](https://snakemake.readthedocs.io/en/latest/) or 131 | [`targets`](https://github.com/ropensci/targets) if: 132 | 133 | * you want to design and run a workflow on HPC 134 | 135 | Don't use [`batch`](https://cran.r-project.org/package=batch) 136 | (last updated 2013) or [`BatchJobs`](https://github.com/tudo-r/BatchJobs) 137 | (issues with SQLite on network-mounted storage). 138 | 139 | Contributing 140 | ------------ 141 | 142 | Contributions are welcome and they come in many different forms, shapes, and 143 | sizes. These include, but are not limited to: 144 | 145 | * **Questions**: Ask on the [Github 146 | Discussions](https://github.com/mschubert/clustermq/discussions) board. If 147 | you are an advanced user, please also consider answering questions there. 148 | * **Bug reports**: [File an issue](https://github.com/mschubert/clustermq/issues) 149 | if something does not work as expected. Be sure to 150 | include a self-contained [Minimal Reproducible 151 | Example](https://stackoverflow.com/help/minimal-reproducible-example) and set 152 | `log_worker=TRUE`. 153 | * **Code contributions**: Have a look at the [`good first 154 | issue`](https://github.com/mschubert/clustermq/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) 155 | tag. 
Please discuss anything more complicated before putting a lot of work 156 | in, I'm happy to help you get started. 157 | 158 | > [!TIP] 159 | > Check the 160 | > [User Guide](https://mschubert.github.io/clustermq/articles/userguide.html) and the 161 | > [FAQ](https://mschubert.github.io/clustermq/articles/faq.html) first, maybe 162 | > your query is already answered there 163 | 164 | Citation 165 | -------- 166 | 167 | This project is part of my academic work, for which I will be evaluated on 168 | citations. If you like me to be able to continue working on research support 169 | tools like `clustermq`, please cite the article when using it for publications: 170 | 171 | > M Schubert. clustermq enables efficient parallelisation of genomic analyses. 172 | > *Bioinformatics* (2019). 173 | > [doi:10.1093/bioinformatics/btz284](https://doi.org/10.1093/bioinformatics/btz284) 174 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | bootstrap: 5 3 | light-switch: true 4 | 5 | toc: 6 | depth: 2 # level 3 currently not distinguishable from 2 7 | 8 | navbar: 9 | type: default 10 | left: 11 | - icon: fa-home 12 | href: index.html 13 | - text: User Guide 14 | href: articles/userguide.html 15 | - text: Technical Documentation 16 | href: articles/technicaldocs.html 17 | - text: FAQ 18 | href: articles/faq.html 19 | - text: Reference 20 | href: reference/index.html 21 | - text: Changelog 22 | href: news/index.html 23 | right: 24 | - icon: fa-github fa-lg 25 | href: https://github.com/mschubert/clustermq 26 | 27 | reference: 28 | - title: Overview 29 | contents: 30 | - clustermq 31 | - title: Run calls on HPC 32 | contents: 33 | - Q 34 | - Q_rows 35 | - title: Manage worker pools 36 | contents: 37 | - workers 38 | - title: "`foreach` support" 39 | contents: 40 | - register_dopar_cmq 41 | 
#!/bin/sh
# R package 'cleanup' hook: remove artifacts left behind by ./configure
# (generated Makevars, object files, shared libraries, logs)
rm -f src/Makevars src/*.o src/*.so* src/*.dylib configure.log autobrew
# reset the bundled libzmq build tree if it was ever configured
if [ -f src/libzmq/Makefile ]; then
    make -C src/libzmq distclean
fi
$($CXX -o test_cpp11 src/util/test_cpp11.cpp >/dev/null 2>&1); then 28 | echo "ERROR: compiler needs full c++11 support (gcc>=5, clang>=3.3) -> check 'cc --version'" 29 | exit 1 30 | fi 31 | rm -f test_cpp11 32 | 33 | if [ "$CLUSTERMQ_USE_SYSTEM_LIBZMQ" -eq "0" ]; then 34 | PKG_CFLAGS="-DZMQ_STATIC -DZMQ_BUILD_DRAFT_API=1 -fPIC -Ilibzmq/include -Icppzmq" 35 | PKG_LIBS="libzmq/src/.libs/libzmq.a" 36 | ./src/util/build_libzmq.sh 37 | else 38 | PKG_CFLAGS="$(pkg-config --cflags libzmq) -fPIC -Icppzmq" 39 | PKG_LIBS="$(pkg-config --libs libzmq)" 40 | fi 41 | 42 | sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars 43 | -------------------------------------------------------------------------------- /configure.win: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # remove code that causes R-check warnings 4 | ./src/util/patch_libzmq.sh 5 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry( 2 | bibtype = "Article", 3 | title = "clustermq enables efficient parallelisation of genomic analyses", 4 | author = "Michael Schubert", 5 | journal = "Bioinformatics", 6 | month = "May", 7 | year = "2019", 8 | language = "en", 9 | doi = "10.1093/bioinformatics/btz284", 10 | url = "https://github.com/mschubert/clustermq", 11 | textVersion = paste( 12 | "Schubert, M.", 13 | "clustermq enables efficient parallelisation of genomic analyses.", 14 | "Bioinformatics (2019).", 15 | "doi:10.1093/bioinformatics/btz284" 16 | ), 17 | header = "To cite clustermq in publications use:" 18 | ) 19 | -------------------------------------------------------------------------------- /inst/LSF.tmpl: -------------------------------------------------------------------------------- 1 | #BSUB-J {{ job_name }}[1-{{ n_jobs }}] 2 | #BSUB-n {{ cores | 1 }} 3 | #BSUB-o {{ log_file | 
/dev/null }} 4 | #BSUB-M {{ memory | 4096 }} 5 | #BSUB-R rusage[mem={{ memory | 4096 }}] 6 | #BSUB-R span[ptile=1] 7 | 8 | ulimit -v $(( 1024 * {{ memory | 4096 }} )) 9 | CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' 10 | -------------------------------------------------------------------------------- /inst/PBS.tmpl: -------------------------------------------------------------------------------- 1 | #PBS -N {{ job_name }} 2 | #PBS -J 1-{{ n_jobs }} 3 | #PBS -l nodes=1:ppn={{ cores | 1 }}:mem={{ memory | 4096 }}MB 4 | # ppn=P is equivalent to ncpus=P:mpiprocs=P 5 | # "New" syntax: #PBS -l select=1:ncpus={{ cores | 1 }}:mpiprocs={{ cores | 1 }}:mem={{ memory | 4096 }}MB 6 | 7 | #PBS -l walltime={{ walltime | 12:00:00 }} 8 | #PBS -o {{ log_file | /dev/null }} 9 | #PBS -j oe 10 | 11 | # Uncomment if R is an environment module 12 | # module load R 13 | 14 | # Uncomment to set the working directory 15 | # cd {{ workdir | "$PBS_O_WORKDIR" }} 16 | 17 | ulimit -v $(( 1024 * {{ memory | 4096 }} )) 18 | CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' 19 | -------------------------------------------------------------------------------- /inst/SGE.tmpl: -------------------------------------------------------------------------------- 1 | #$ -N {{ job_name }} 2 | #$ -j y 3 | #$ -o {{ log_file | /dev/null }} 4 | #$ -cwd 5 | #$ -V 6 | #$ -t 1-{{ n_jobs }} 7 | #$ -pe smp {{ cores | 1 }} 8 | #$ -l m_mem_free={{ memory | 1073741824 }} 9 | 10 | ulimit -v $(( 1024 * {{ memory | 4096 }} )) 11 | CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' 12 | -------------------------------------------------------------------------------- /inst/SLURM.tmpl: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #SBATCH --job-name={{ job_name }} 3 | #SBATCH --output={{ log_file | /dev/null }} 4 | #SBATCH --error={{ log_file | /dev/null }} 5 | 
#SBATCH --mem-per-cpu={{ memory | 4096 }} 6 | #SBATCH --array=1-{{ n_jobs }} 7 | #SBATCH --cpus-per-task={{ cores | 1 }} 8 | 9 | ulimit -v $(( 1024 * {{ memory | 4096 }} )) 10 | CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' 11 | -------------------------------------------------------------------------------- /inst/SSH.tmpl: -------------------------------------------------------------------------------- 1 | ssh -o "ExitOnForwardFailure yes" -f 2 | -R {{ ssh.hpc_fwd_port }}:127.0.0.1:{{ local_port }} 3 | {{ ssh_host }} 4 | "R --no-save --no-restore -e 5 | 'clustermq:::ssh_proxy({{ ssh.hpc_fwd_port }})' 6 | > {{ ssh_log | /dev/null }} 2>&1" 7 | -------------------------------------------------------------------------------- /inst/TORQUE.tmpl: -------------------------------------------------------------------------------- 1 | #PBS -N {{ job_name }} 2 | #PBS -l nodes={{ n_jobs }}:ppn={{ cores | 1 }},walltime={{ walltime | 12:00:00 }} 3 | #PBS -o {{ log_file | /dev/null }} 4 | #PBS -q default 5 | #PBS -j oe 6 | 7 | ulimit -v $(( 1024 * {{ memory | 4096 }} )) 8 | CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' 9 | -------------------------------------------------------------------------------- /man/LOCAL.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_local.r 3 | \name{LOCAL} 4 | \alias{LOCAL} 5 | \title{Placeholder for local processing} 6 | \description{ 7 | Mainly so tests pass without setting up a scheduler 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/LSF.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_lsf.r 3 | \name{LSF} 4 | \alias{LSF} 5 | \title{LSF scheduler 
functions} 6 | \description{ 7 | Derives from QSys to provide LSF-specific functions 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/MULTICORE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_multicore.r 3 | \name{MULTICORE} 4 | \alias{MULTICORE} 5 | \title{Process on multiple cores on one machine} 6 | \description{ 7 | Derives from QSys to provide multicore-specific functions 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/MULTIPROCESS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_multiprocess.r 3 | \name{MULTIPROCESS} 4 | \alias{MULTIPROCESS} 5 | \title{Process on multiple processes on one machine} 6 | \description{ 7 | Derives from QSys to provide callr-specific functions 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/Pool.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pool.r 3 | \name{Pool} 4 | \alias{Pool} 5 | \title{Class for basic queuing system functions} 6 | \description{ 7 | Provides the basic functions needed to communicate between machines 8 | This should abstract most functions of rZMQ so the scheduler 9 | implementations can rely on the higher level functionality 10 | } 11 | \keyword{internal} 12 | -------------------------------------------------------------------------------- /man/Q.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Q.r 3 | 
\name{Q} 4 | \alias{Q} 5 | \title{Queue function calls on the cluster} 6 | \usage{ 7 | Q( 8 | fun, 9 | ..., 10 | const = list(), 11 | export = list(), 12 | pkgs = c(), 13 | seed = 128965, 14 | memory = NULL, 15 | template = list(), 16 | n_jobs = NULL, 17 | job_size = NULL, 18 | rettype = "list", 19 | fail_on_error = TRUE, 20 | workers = NULL, 21 | log_worker = FALSE, 22 | chunk_size = NA, 23 | timeout = Inf, 24 | max_calls_worker = Inf, 25 | verbose = TRUE 26 | ) 27 | } 28 | \arguments{ 29 | \item{fun}{A function to call} 30 | 31 | \item{...}{Objects to be iterated in each function call} 32 | 33 | \item{const}{A list of constant arguments passed to each function call} 34 | 35 | \item{export}{List of objects to be exported to the worker} 36 | 37 | \item{pkgs}{Character vector of packages to load on the worker} 38 | 39 | \item{seed}{A seed to set for each function call} 40 | 41 | \item{memory}{Short for `template=list(memory=value)`} 42 | 43 | \item{template}{A named list of values to fill in the scheduler template} 44 | 45 | \item{n_jobs}{The number of jobs to submit; upper limit of jobs if job_size 46 | is given as well} 47 | 48 | \item{job_size}{The number of function calls per job} 49 | 50 | \item{rettype}{Return type of function call (vector type or 'list')} 51 | 52 | \item{fail_on_error}{If an error occurs on the workers, continue or fail?} 53 | 54 | \item{workers}{Optional instance of QSys representing a worker pool} 55 | 56 | \item{log_worker}{Write a log file for each worker} 57 | 58 | \item{chunk_size}{Number of function calls to chunk together 59 | defaults to 100 chunks per worker or max. 
10 kb per chunk} 60 | 61 | \item{timeout}{Maximum time in seconds to wait for worker (default: Inf)} 62 | 63 | \item{max_calls_worker}{Maximum number of chunks that will be sent to one worker} 64 | 65 | \item{verbose}{Print status messages and progress bar (default: TRUE)} 66 | } 67 | \value{ 68 | A list of whatever `fun` returned 69 | } 70 | \description{ 71 | Queue function calls on the cluster 72 | } 73 | \examples{ 74 | \dontrun{ 75 | # Run a simple multiplication for numbers 1 to 3 on a worker node 76 | fx = function(x) x * 2 77 | Q(fx, x=1:3, n_jobs=1) 78 | # list(2,4,6) 79 | 80 | # Run a mutate() call in dplyr on a worker node 81 | iris \%>\% 82 | mutate(area = Q(`*`, e1=Sepal.Length, e2=Sepal.Width, n_jobs=1)) 83 | # iris with an additional column 'area' 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /man/QSys.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys.r 3 | \name{QSys} 4 | \alias{QSys} 5 | \title{Class for basic queuing system functions} 6 | \description{ 7 | Provides the basic functions needed to communicate between machines 8 | This should abstract most functions of rZMQ so the scheduler 9 | implementations can rely on the higher level functionality 10 | } 11 | \keyword{internal} 12 | -------------------------------------------------------------------------------- /man/Q_rows.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Q_rows.r 3 | \name{Q_rows} 4 | \alias{Q_rows} 5 | \title{Queue function calls defined by rows in a data.frame} 6 | \usage{ 7 | Q_rows( 8 | df, 9 | fun, 10 | const = list(), 11 | export = list(), 12 | pkgs = c(), 13 | seed = 128965, 14 | memory = NULL, 15 | template = list(), 16 | n_jobs = NULL, 17 | job_size = NULL, 18 | rettype = 
"list", 19 | fail_on_error = TRUE, 20 | workers = NULL, 21 | log_worker = FALSE, 22 | chunk_size = NA, 23 | timeout = Inf, 24 | max_calls_worker = Inf, 25 | verbose = TRUE 26 | ) 27 | } 28 | \arguments{ 29 | \item{df}{data.frame with iterated arguments} 30 | 31 | \item{fun}{A function to call} 32 | 33 | \item{const}{A list of constant arguments passed to each function call} 34 | 35 | \item{export}{List of objects to be exported to the worker} 36 | 37 | \item{pkgs}{Character vector of packages to load on the worker} 38 | 39 | \item{seed}{A seed to set for each function call} 40 | 41 | \item{memory}{Short for `template=list(memory=value)`} 42 | 43 | \item{template}{A named list of values to fill in the scheduler template} 44 | 45 | \item{n_jobs}{The number of jobs to submit; upper limit of jobs if job_size 46 | is given as well} 47 | 48 | \item{job_size}{The number of function calls per job} 49 | 50 | \item{rettype}{Return type of function call (vector type or 'list')} 51 | 52 | \item{fail_on_error}{If an error occurs on the workers, continue or fail?} 53 | 54 | \item{workers}{Optional instance of QSys representing a worker pool} 55 | 56 | \item{log_worker}{Write a log file for each worker} 57 | 58 | \item{chunk_size}{Number of function calls to chunk together 59 | defaults to 100 chunks per worker or max. 
10 kb per chunk} 60 | 61 | \item{timeout}{Maximum time in seconds to wait for worker (default: Inf)} 62 | 63 | \item{max_calls_worker}{Maximum number of chunks that will be sent to one worker} 64 | 65 | \item{verbose}{Print status messages and progress bar (default: TRUE)} 66 | } 67 | \description{ 68 | Queue function calls defined by rows in a data.frame 69 | } 70 | \examples{ 71 | \dontrun{ 72 | # Run a simple multiplication for data frame columns x and y on a worker node 73 | fx = function (x, y) x * y 74 | df = data.frame(x = 5, y = 10) 75 | Q_rows(df, fx, job_size = 1) 76 | # [1] 50 77 | 78 | # Q_rows also matches the names of a data frame with the function arguments 79 | fx = function (x, y) x - y 80 | df = data.frame(y = 5, x = 10) 81 | Q_rows(df, fx, job_size = 1) 82 | # [1] 5 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /man/SGE.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_sge.r 3 | \name{SGE} 4 | \alias{SGE} 5 | \title{SGE scheduler functions} 6 | \description{ 7 | Derives from QSys to provide SGE-specific functions 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/SLURM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_slurm.r 3 | \name{SLURM} 4 | \alias{SLURM} 5 | \title{SLURM scheduler functions} 6 | \description{ 7 | Derives from QSys to provide SLURM-specific functions 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/SSH.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qsys_ssh.r 3 | 
\name{SSH} 4 | \alias{SSH} 5 | \title{SSH scheduler functions} 6 | \description{ 7 | Derives from QSys to provide SSH-specific functions 8 | } 9 | \keyword{internal} 10 | -------------------------------------------------------------------------------- /man/check_args.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check_args.r 3 | \name{check_args} 4 | \alias{check_args} 5 | \title{Function to check arguments with which Q() is called} 6 | \usage{ 7 | check_args(fun, iter, const = list()) 8 | } 9 | \arguments{ 10 | \item{fun}{A function to call} 11 | 12 | \item{iter}{Objects to be iterated in each function call} 13 | 14 | \item{const}{A list of constant arguments passed to each function call} 15 | } 16 | \value{ 17 | Processed iterated argument list if 'iter' is a list 18 | } 19 | \description{ 20 | Function to check arguments with which Q() is called 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/chunk.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chunk.r 3 | \name{chunk} 4 | \alias{chunk} 5 | \title{Subset index chunk for processing} 6 | \usage{ 7 | chunk(x, i) 8 | } 9 | \arguments{ 10 | \item{x}{Index data.frame} 11 | 12 | \item{i}{Rows to subset} 13 | } 14 | \value{ 15 | x[i,] 16 | } 17 | \description{ 18 | 'attr' in `[.data.frame` takes too much CPU time 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /man/clustermq-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clustermq-package.r 3 | \docType{package} 4 | \name{clustermq-package} 5 | 
\alias{clustermq} 6 | \alias{clustermq-package} 7 | \title{Evaluate Function Calls on HPC Schedulers (LSF, SGE, SLURM)} 8 | \description{ 9 | Provides the \code{Q} function to send arbitrary function calls to 10 | workers on HPC schedulers without relying on network-mounted storage. 11 | Allows using remote schedulers via SSH. 12 | } 13 | \details{ 14 | Under the hood, this will submit a cluster job that connects to the master 15 | via TCP; the master will then send the function and argument chunks to the 16 | worker, and the worker will return the results to the master until everything 17 | is done and you get back your result. 18 | 19 | Computations are done entirely on the network and without any temporary 20 | files on network-mounted storage, so there is no strain on the file system 21 | apart from starting up R once per job. This removes the biggest bottleneck 22 | in distributed computing. 23 | 24 | Using this approach, we can easily do load-balancing, i.e. workers that get 25 | their jobs done faster will also receive more function calls to work on. This 26 | is especially useful if not all calls return after the same time, or one 27 | worker has a high load. 28 | 29 | For more detailed usage instructions, see the documentation of the \code{Q} 30 | function. 
31 | } 32 | \seealso{ 33 | Useful links: 34 | \itemize{ 35 | \item \url{https://mschubert.github.io/clustermq/} 36 | \item Report bugs at \url{https://github.com/mschubert/clustermq/issues} 37 | } 38 | 39 | } 40 | \author{ 41 | \strong{Maintainer}: Michael Schubert \email{mschu.dev@gmail.com} (\href{https://orcid.org/0000-0002-6862-5221}{ORCID}) [copyright holder] 42 | 43 | Authors: 44 | \itemize{ 45 | \item ZeroMQ authors (source files in 'src/libzmq' and 'src/cppzmq') [copyright holder] 46 | } 47 | 48 | } 49 | \keyword{internal} 50 | -------------------------------------------------------------------------------- /man/cmq_foreach.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/foreach.r 3 | \name{cmq_foreach} 4 | \alias{cmq_foreach} 5 | \title{clustermq foreach handler} 6 | \usage{ 7 | cmq_foreach(obj, expr, envir, data) 8 | } 9 | \arguments{ 10 | \item{obj}{Returned from foreach::foreach, containing the following variables: 11 | args : Arguments passed, each as a call 12 | argnames: character vector of arguments passed 13 | evalenv : Environment where to evaluate the arguments 14 | export : character vector of variable names to export to nodes 15 | packages: character vector of required packages 16 | verbose : whether to print status messages [logical] 17 | errorHandling: string of function name to call error with, e.g. "stop"} 18 | 19 | \item{expr}{An R expression in curly braces} 20 | 21 | \item{envir}{Environment where to evaluate the arguments} 22 | 23 | \item{data}{Common arguments passed by register_dopar_cmq(), e.g. 
n_jobs} 24 | } 25 | \description{ 26 | clustermq foreach handler 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /man/dot-onAttach.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.r 3 | \name{.onAttach} 4 | \alias{.onAttach} 5 | \title{Report queueing system on package attach if not set} 6 | \usage{ 7 | .onAttach(libname, pkgname) 8 | } 9 | \arguments{ 10 | \item{libname}{default arg for compatibility} 11 | 12 | \item{pkgname}{default arg for compatibility} 13 | } 14 | \description{ 15 | Report queueing system on package attach if not set 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/dot-onLoad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zzz.r 3 | \name{.onLoad} 4 | \alias{.onLoad} 5 | \title{Select the queueing system on package loading} 6 | \usage{ 7 | .onLoad(libname, pkgname) 8 | } 9 | \arguments{ 10 | \item{libname}{default arg for compatibility} 11 | 12 | \item{pkgname}{default arg for compatibility} 13 | } 14 | \description{ 15 | This is done by setting the variable 'qsys' in the package environment 16 | to the object that contains the desired queueing system. 
17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/fill_template.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.r 3 | \name{fill_template} 4 | \alias{fill_template} 5 | \title{Fill a template string with supplied values} 6 | \usage{ 7 | fill_template(template, values, required = c()) 8 | } 9 | \arguments{ 10 | \item{template}{A character string of a submission template} 11 | 12 | \item{values}{A named list of key-value pairs} 13 | 14 | \item{required}{Keys that must be present in the template (default: none)} 15 | } 16 | \value{ 17 | A template where placeholder fields were replaced by values 18 | } 19 | \description{ 20 | Fill a template string with supplied values 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/host.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.r 3 | \name{host} 4 | \alias{host} 5 | \title{Construct the ZeroMQ host address} 6 | \usage{ 7 | host( 8 | node = getOption("clustermq.host", Sys.info()["nodename"]), 9 | ports = getOption("clustermq.ports", 6000:9999), 10 | n = 100 11 | ) 12 | } 13 | \arguments{ 14 | \item{node}{Node or device name} 15 | 16 | \item{ports}{Range of ports to consider} 17 | 18 | \item{n}{How many addresses to return} 19 | } 20 | \value{ 21 | The possible addresses as character vector 22 | } 23 | \description{ 24 | Construct the ZeroMQ host address 25 | } 26 | \keyword{internal} 27 | -------------------------------------------------------------------------------- /man/master.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit 
documentation in R/master.r 3 | \name{master} 4 | \alias{master} 5 | \title{Master controlling the workers} 6 | \usage{ 7 | master( 8 | pool, 9 | iter, 10 | rettype = "list", 11 | fail_on_error = TRUE, 12 | chunk_size = NA, 13 | timeout = Inf, 14 | max_calls_worker = Inf, 15 | verbose = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{pool}{Instance of Pool object} 20 | 21 | \item{iter}{Objects to be iterated in each function call} 22 | 23 | \item{rettype}{Return type of function} 24 | 25 | \item{fail_on_error}{If an error occurs on the workers, continue or fail?} 26 | 27 | \item{chunk_size}{Number of function calls to chunk together 28 | defaults to 100 chunks per worker or max. 500 kb per chunk} 29 | 30 | \item{timeout}{Maximum time in seconds to wait for worker (default: Inf)} 31 | 32 | \item{max_calls_worker}{Maximum number of function calls that will be sent to one worker} 33 | 34 | \item{verbose}{Print progress messages} 35 | } 36 | \value{ 37 | A list of whatever `fun` returned 38 | } 39 | \description{ 40 | exchanging messages between the master and workers works the following way: 41 | * we have submitted a job where we don't know when it will start up 42 | * it starts, sends us a message list(id=0) indicating it is ready 43 | * we send it the function definition and common data 44 | * we also send it the first data set to work on 45 | * when we get any id > 0, it is a result that we store 46 | * and send the next data set/index to work on 47 | * when computations are complete, we send id=0 to the worker 48 | * it responds with id=-1 (and usage stats) and shuts down 49 | } 50 | \keyword{internal} 51 | -------------------------------------------------------------------------------- /man/msg_fmt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.r 3 | \name{msg_fmt} 4 | \alias{msg_fmt} 5 | \title{Message format for logging} 6 | \usage{ 7 
| msg_fmt(verbose = TRUE) 8 | } 9 | \description{ 10 | Message format for logging 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/register_dopar_cmq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/foreach.r 3 | \name{register_dopar_cmq} 4 | \alias{register_dopar_cmq} 5 | \title{Register clustermq as `foreach` parallel handler} 6 | \usage{ 7 | register_dopar_cmq(...) 8 | } 9 | \arguments{ 10 | \item{...}{List of arguments passed to the `Q` function, e.g. n_jobs} 11 | } 12 | \description{ 13 | Register clustermq as `foreach` parallel handler 14 | } 15 | -------------------------------------------------------------------------------- /man/ssh_proxy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ssh_proxy.r 3 | \name{ssh_proxy} 4 | \alias{ssh_proxy} 5 | \title{SSH proxy for different schedulers} 6 | \usage{ 7 | ssh_proxy(fwd_port, qsys_id = qsys_default) 8 | } 9 | \arguments{ 10 | \item{fwd_port}{The port of the master address to connect to 11 | (remote end of reverse tunnel)} 12 | 13 | \item{qsys_id}{Character string of QSys class to use} 14 | } 15 | \description{ 16 | Do not call this manually, the SSH qsys will do that 17 | } 18 | \keyword{internal} 19 | -------------------------------------------------------------------------------- /man/summarize_result.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize_result.r 3 | \name{summarize_result} 4 | \alias{summarize_result} 5 | \title{Print a summary of errors and warnings that occurred during processing} 6 | \usage{ 7 | summarize_result( 8 | result, 9 | n_errors, 10 | 
n_warnings, 11 | cond_msgs, 12 | at = length(result), 13 | fail_on_error = TRUE 14 | ) 15 | } 16 | \arguments{ 17 | \item{result}{A list or vector of the processing result} 18 | 19 | \item{n_errors}{How many errors occurred} 20 | 21 | \item{n_warnings}{How many warnings occurred} 22 | 23 | \item{cond_msgs}{Error and warning messages, we display first 50} 24 | 25 | \item{at}{How many calls were processed up to this point} 26 | 27 | \item{fail_on_error}{Stop if error(s) occurred} 28 | } 29 | \description{ 30 | Print a summary of errors and warnings that occurred during processing 31 | } 32 | \keyword{internal} 33 | -------------------------------------------------------------------------------- /man/vec_lookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.r 3 | \docType{data} 4 | \name{vec_lookup} 5 | \alias{vec_lookup} 6 | \title{Lookup table for return types to vector NAs} 7 | \format{ 8 | An object of class \code{list} of length 9. 
9 | } 10 | \usage{ 11 | vec_lookup 12 | } 13 | \description{ 14 | Lookup table for return types to vector NAs 15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/work_chunk.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/work_chunk.r 3 | \name{work_chunk} 4 | \alias{work_chunk} 5 | \title{Function to process a chunk of calls} 6 | \usage{ 7 | work_chunk( 8 | df, 9 | fun, 10 | const = list(), 11 | rettype = "list", 12 | common_seed = NULL, 13 | progress = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{df}{A data.frame with call IDs as rownames and arguments as columns} 18 | 19 | \item{fun}{The function to call} 20 | 21 | \item{const}{Constant arguments passed to each call} 22 | 23 | \item{rettype}{Return type of function} 24 | 25 | \item{common_seed}{A seed offset common to all function calls} 26 | 27 | \item{progress}{Logical indicating whether to display a progress bar} 28 | } 29 | \value{ 30 | A list of call results (or try-error if they failed) 31 | } 32 | \description{ 33 | Each chunk comes encapsulated in a data.frame 34 | } 35 | \keyword{internal} 36 | -------------------------------------------------------------------------------- /man/worker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/worker.r 3 | \name{worker} 4 | \alias{worker} 5 | \title{R worker submitted as cluster job} 6 | \usage{ 7 | worker(master, ..., verbose = TRUE, context = NULL) 8 | } 9 | \arguments{ 10 | \item{master}{The master address (tcp://ip:port)} 11 | 12 | \item{...}{Catch-all to not break older template values (ignored)} 13 | 14 | \item{verbose}{Whether to print debug messages} 15 | 16 | \item{context}{ZeroMQ context (for internal testing)} 17 | } 18 | \description{ 19 
Do not call this manually, the master will do that 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/workers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/workers.r 3 | \name{workers} 4 | \alias{workers} 5 | \title{Creates a pool of workers} 6 | \usage{ 7 | workers( 8 | n_jobs, 9 | data = NULL, 10 | reuse = TRUE, 11 | template = list(), 12 | log_worker = FALSE, 13 | qsys_id = getOption("clustermq.scheduler", qsys_default), 14 | verbose = FALSE, 15 | ... 16 | ) 17 | } 18 | \arguments{ 19 | \item{n_jobs}{Number of jobs to submit (0 implies local processing)} 20 | 21 | \item{data}{Set common data (function, constant args, seed)} 22 | 23 | \item{reuse}{Whether workers are reusable or get shut down after call} 24 | 25 | \item{template}{A named list of values to fill in template} 26 | 27 | \item{log_worker}{Write a log file for each worker} 28 | 29 | \item{qsys_id}{Character string of QSys class to use} 30 | 31 | \item{verbose}{Print message about worker startup} 32 | 33 | \item{...}{Additional arguments passed to the qsys constructor} 34 | } 35 | \value{ 36 | An instance of the QSys class 37 | } 38 | \description{ 39 | Creates a pool of workers 40 | } 41 | -------------------------------------------------------------------------------- /man/wrap_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.r 3 | \name{wrap_error} 4 | \alias{wrap_error} 5 | \title{Wraps an error in a condition object} 6 | \usage{ 7 | wrap_error(call) 8 | } 9 | \description{ 10 | Wraps an error in a condition object 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /src/CMQMaster.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include "CMQMaster.h" 3 | 4 | RCPP_MODULE(cmq_master) { 5 | using namespace Rcpp; 6 | class_("CMQMaster") 7 | .constructor() 8 | .method("context", &CMQMaster::context) 9 | .method("listen", &CMQMaster::listen) 10 | .method("close", &CMQMaster::close) 11 | .method("recv", &CMQMaster::recv) 12 | .method("send", &CMQMaster::send) 13 | .method("send_shutdown", &CMQMaster::send_shutdown) 14 | .method("proxy_submit_cmd", &CMQMaster::proxy_submit_cmd) 15 | .method("add_env", &CMQMaster::add_env) 16 | .method("add_pkg", &CMQMaster::add_pkg) 17 | .method("list_env", &CMQMaster::list_env) 18 | .method("add_pending_workers", &CMQMaster::add_pending_workers) 19 | .method("list_workers", &CMQMaster::list_workers) 20 | .method("current", &CMQMaster::current) 21 | .method("workers_running", &CMQMaster::workers_running) 22 | .method("workers_total", &CMQMaster::workers_total) 23 | ; 24 | } 25 | -------------------------------------------------------------------------------- /src/CMQMaster.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "common.h" 3 | 4 | class CMQMaster { 5 | public: 6 | CMQMaster(): ctx(new zmq::context_t(3)) {} 7 | ~CMQMaster() { close(); } 8 | 9 | SEXP context() const { 10 | Rcpp::XPtr p(ctx, true); 11 | return p; 12 | } 13 | 14 | std::string listen(Rcpp::CharacterVector addrs) { 15 | sock = zmq::socket_t(*ctx, ZMQ_ROUTER); 16 | sock.set(zmq::sockopt::router_mandatory, 1); 17 | #ifdef ZMQ_BUILD_DRAFT_API 18 | sock.set(zmq::sockopt::router_notify, ZMQ_NOTIFY_DISCONNECT); 19 | #endif 20 | 21 | int i; 22 | for (i=0; i(addrs[i]); 24 | try { 25 | sock.bind(addr); 26 | return sock.get(zmq::sockopt::last_endpoint); 27 | } catch(zmq::error_t const &e) { 28 | if ((errno != EADDRINUSE && errno != EINTR) || pending_interrupt()) 29 | Rcpp::stop(std::string("Binding port failed (") + e.what() + ")"); 30 | } 31 | } 
32 | Rcpp::stop("Could not bind port to any address in provided pool"); 33 | } 34 | 35 | bool close(int timeout=1000) { 36 | if (ctx == nullptr) 37 | return is_cleaned_up; 38 | 39 | auto pitems = std::vector(1); 40 | pitems[0].socket = sock; 41 | pitems[0].events = ZMQ_POLLIN; 42 | 43 | auto time_ms = std::chrono::milliseconds(timeout); 44 | auto time_left = time_ms; 45 | auto start = Time::now(); 46 | while (time_left.count() > 0) { 47 | if (std::find_if(peers.begin(), peers.end(), [](const std::pair &w) { // 'const auto &w' is C++14 48 | return w.second.status == wlife_t::active; }) == peers.end()) { 49 | is_cleaned_up = true; 50 | break; 51 | } 52 | 53 | if (peers.find(cur) != peers.end()) { 54 | auto &w = peers[cur]; 55 | if (w.status == wlife_t::active && w.call == R_NilValue) 56 | try { 57 | send_shutdown(); 58 | } catch (...) {} 59 | } 60 | 61 | try { 62 | int rc = zmq::poll(pitems, time_left); 63 | if (pitems[0].revents) { 64 | std::vector msgs; 65 | auto n = recv_multipart(sock, std::back_inserter(msgs)); 66 | register_peer(msgs); 67 | } 68 | } catch (zmq::error_t const &e) { 69 | if (errno != EINTR || pending_interrupt()) 70 | throw; 71 | } catch (...) 
{ 72 | timeout = 0; 73 | break; 74 | } 75 | time_left = time_ms - std::chrono::duration_cast(Time::now() - start); 76 | }; 77 | 78 | env.clear(); 79 | pending_workers = 0; 80 | 81 | if (sock.handle() != nullptr) { 82 | sock.set(zmq::sockopt::linger, timeout); 83 | sock.close(); 84 | } 85 | if (ctx != nullptr) { 86 | ctx->close(); 87 | ctx = nullptr; 88 | } 89 | return is_cleaned_up; 90 | } 91 | 92 | SEXP recv(int timeout=-1) { 93 | int data_offset; 94 | std::vector msgs; 95 | 96 | do { 97 | int w_active = pending_workers; 98 | for (const auto &kv: peers) { 99 | if (kv.second.status == wlife_t::active || kv.second.status == wlife_t::proxy_cmd) 100 | w_active++; 101 | } 102 | if (w_active <= 0) 103 | Rcpp::stop("Trying to receive data without workers"); 104 | 105 | msgs.clear(); 106 | timeout = poll(timeout); 107 | auto n = recv_multipart(sock, std::back_inserter(msgs)); 108 | data_offset = register_peer(msgs); 109 | } while(data_offset >= msgs.size()); 110 | 111 | return msg2r(std::move(msgs[data_offset]), true); 112 | } 113 | 114 | int send(SEXP cmd) { 115 | auto &w = check_current_worker(wlife_t::active); 116 | auto add_to_worker = set_difference(env_names, w.env); 117 | auto mp = init_multipart(w, wlife_t::active); 118 | mp.push_back(r2msg(cmd)); 119 | 120 | if (w.via.empty()) { 121 | for (auto &str : add_to_worker) 122 | multipart_add_obj(mp, str, w.env); 123 | } else { 124 | std::vector proxy_add_env; 125 | auto &via_env = peers[w.via].env; 126 | for (auto &str : add_to_worker) { 127 | w.env.insert(str); 128 | if (via_env.find(str) == via_env.end()) 129 | multipart_add_obj(mp, str, via_env); 130 | else 131 | proxy_add_env.push_back(str); 132 | } 133 | mp.push_back(r2msg(Rcpp::wrap(proxy_add_env))); 134 | } 135 | 136 | w.call = cmd; 137 | w.call_ref = ++call_counter; 138 | mp.send(sock); 139 | return w.call_ref; 140 | } 141 | void send_shutdown() { 142 | auto &w = check_current_worker(wlife_t::active); 143 | auto mp = init_multipart(w, wlife_t::shutdown); 144 | 
w.call = R_NilValue; 145 | w.status = wlife_t::shutdown; 146 | mp.send(sock); 147 | } 148 | 149 | void proxy_submit_cmd(SEXP args, int timeout=10000) { 150 | poll(timeout); 151 | std::vector msgs; 152 | auto n = recv_multipart(sock, std::back_inserter(msgs)); 153 | register_peer(msgs); 154 | // msgs[0] == "proxy" routing id 155 | // msgs[1] == delimiter 156 | // msgs[2] == wlife_t::proxy_cmd 157 | 158 | auto &w = check_current_worker(wlife_t::proxy_cmd); 159 | auto mp = init_multipart(w, wlife_t::proxy_cmd); 160 | mp.push_back(r2msg(args)); 161 | mp.send(sock); 162 | } 163 | 164 | void add_env(std::string name, SEXP obj) { 165 | for (auto &w : peers) 166 | w.second.env.erase(name); 167 | env_names.insert(name); 168 | env[name] = r2msg(R_serialize(obj, R_NilValue)); 169 | } 170 | void add_pkg(Rcpp::CharacterVector pkg) { 171 | add_env("package:" + Rcpp::as(pkg), pkg); 172 | } 173 | Rcpp::DataFrame list_env() const { 174 | std::vector names; 175 | names.reserve(env.size()); 176 | std::vector sizes; 177 | sizes.reserve(env.size()); 178 | for (const auto &kv: env) { 179 | names.push_back(kv.first); 180 | sizes.push_back(kv.second.size()); 181 | } 182 | return Rcpp::DataFrame::create(Rcpp::_["object"] = Rcpp::wrap(names), 183 | Rcpp::_["size"] = Rcpp::wrap(sizes)); 184 | } 185 | 186 | void add_pending_workers(int n) { 187 | pending_workers += n; 188 | } 189 | 190 | Rcpp::List list_workers() const { 191 | std::vector names, status; 192 | std::vector calls; 193 | names.reserve(peers.size()); 194 | status.reserve(peers.size()); 195 | calls.reserve(peers.size()); 196 | Rcpp::List wtime, mem; 197 | std::string cur_z85; 198 | for (const auto &kv: peers) { 199 | if (kv.second.status == wlife_t::proxy_cmd || kv.second.status == wlife_t::error) 200 | continue; 201 | names.push_back(z85_encode_routing_id(kv.first)); 202 | if (kv.first == cur) 203 | cur_z85 = names.back(); 204 | status.push_back(std::string(wlife_t2str(kv.second.status))); 205 | calls.push_back(kv.second.n_calls); 
206 | wtime.push_back(kv.second.time); 207 | mem.push_back(kv.second.mem); 208 | } 209 | return Rcpp::List::create( 210 | Rcpp::_["worker"] = Rcpp::wrap(names), 211 | Rcpp::_["status"] = Rcpp::wrap(status), 212 | Rcpp::_["current"] = cur_z85, 213 | Rcpp::_["calls"] = calls, 214 | Rcpp::_["time"] = wtime, 215 | Rcpp::_["mem"] = mem, 216 | Rcpp::_["pending"] = pending_workers 217 | ); 218 | } 219 | Rcpp::List current() { 220 | if (peers.find(cur) == peers.end()) 221 | return Rcpp::List::create(); 222 | const auto &w = peers[cur]; 223 | return Rcpp::List::create( 224 | Rcpp::_["worker"] = z85_encode_routing_id(cur), 225 | Rcpp::_["status"] = Rcpp::wrap(wlife_t2str(w.status)), 226 | Rcpp::_["call_ref"] = w.call_ref, 227 | Rcpp::_["calls"] = w.n_calls, 228 | Rcpp::_["time"] = w.time, 229 | Rcpp::_["mem"] = w.mem 230 | ); 231 | } 232 | int workers_running() { 233 | return std::count_if(peers.begin(), peers.end(), [](const std::pair &w) { // 'const auto &w' is C++14 234 | return w.second.status == wlife_t::active; }); 235 | } 236 | int workers_total() { 237 | return workers_running() + pending_workers; 238 | } 239 | 240 | private: 241 | struct worker_t { 242 | std::set env; 243 | Rcpp::RObject call {R_NilValue}; 244 | Rcpp::RObject time {R_NilValue}; 245 | Rcpp::RObject mem {R_NilValue}; 246 | wlife_t status; 247 | std::string via; 248 | int n_calls {-1}; 249 | int call_ref {-1}; 250 | }; 251 | 252 | zmq::context_t *ctx {nullptr}; 253 | bool is_cleaned_up {false}; 254 | int pending_workers {0}; 255 | int call_counter {-1}; 256 | zmq::socket_t sock; 257 | std::string cur; 258 | std::unordered_map peers; 259 | std::unordered_map env; 260 | std::set env_names; 261 | 262 | worker_t &check_current_worker(const wlife_t status) { 263 | if (peers.find(cur) == peers.end()) 264 | Rcpp::stop("Trying to send to worker that does not exist"); 265 | auto &w = peers[cur]; 266 | if (w.status != status) 267 | Rcpp::stop("Trying to send to worker with invalid status"); 268 | return w; 269 | 
} 270 | zmq::multipart_t init_multipart(const worker_t &w, const wlife_t status) const { 271 | zmq::multipart_t mp; 272 | if (!w.via.empty()) 273 | mp.push_back(zmq::message_t(w.via)); 274 | mp.push_back(zmq::message_t(cur)); 275 | mp.push_back(zmq::message_t(0)); 276 | mp.push_back(int2msg(status)); 277 | return mp; 278 | } 279 | 280 | void multipart_add_obj(zmq::multipart_t &mp, std::string str, std::set &tracker) { 281 | auto &obj = env[str]; 282 | tracker.insert(str); 283 | mp.push_back(zmq::message_t(str)); 284 | mp.push_back(zmq::message_t(obj.data(), obj.size(), [](void*, void*){})); 285 | } 286 | 287 | int poll(int timeout=-1) { 288 | auto pitems = std::vector(1); 289 | pitems[0].socket = sock; 290 | pitems[0].events = ZMQ_POLLIN; 291 | 292 | auto time_ms = std::chrono::milliseconds(timeout); 293 | auto time_left = time_ms; 294 | auto start = Time::now(); 295 | 296 | int rc = 0; 297 | do { 298 | try { 299 | rc = zmq::poll(pitems, time_left); 300 | } catch (zmq::error_t const &e) { 301 | if (errno != EINTR || pending_interrupt()) 302 | Rcpp::stop(e.what()); 303 | } 304 | 305 | if (timeout != -1) { 306 | auto ms_diff = std::chrono::duration_cast(Time::now() - start); 307 | time_left = time_ms - ms_diff; 308 | timeout = time_left.count(); 309 | if (timeout < 0) { 310 | std::ostringstream err; 311 | err << "Socket timed out after " << ms_diff.count() << " ms\n"; 312 | throw Rcpp::exception(err.str().c_str()); 313 | } 314 | } 315 | } while (rc == 0); 316 | 317 | return timeout; 318 | } 319 | 320 | int register_peer(std::vector &msgs) { 321 | // std::cout << "Received message: "; 322 | // for (int i=0; i ++cur_i) { 342 | w.status = msg2wlife_t(msgs[cur_i]); 343 | w.n_calls++; 344 | } else { 345 | if (w.status == wlife_t::proxy_cmd) { 346 | for (const auto &w: peers) { 347 | if (w.second.via == cur && w.second.status == wlife_t::active) 348 | Rcpp::stop("Proxy disconnect with active worker(s)"); 349 | } 350 | } else if (w.status == wlife_t::shutdown) { 351 | 
w.status = wlife_t::finished; 352 | } else 353 | Rcpp::stop("Unexpected worker disconnect"); 354 | } 355 | 356 | if (peers.size() > prev_size && w.status == wlife_t::active) { 357 | if (--pending_workers < 0) 358 | Rcpp::stop("More workers registered than expected"); 359 | } 360 | 361 | if (msgs.size() > cur_i+2) { 362 | w.time = msg2r(std::move(msgs[++cur_i]), true); 363 | w.mem = msg2r(std::move(msgs[++cur_i]), true); 364 | } 365 | return ++cur_i; 366 | } 367 | }; 368 | -------------------------------------------------------------------------------- /src/CMQProxy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "CMQProxy.h" 3 | 4 | RCPP_MODULE(cmq_proxy) { 5 | using namespace Rcpp; 6 | class_("CMQProxy") 7 | .constructor() 8 | .constructor() 9 | .method("listen", &CMQProxy::listen) 10 | .method("connect", &CMQProxy::connect) 11 | .method("proxy_request_cmd", &CMQProxy::proxy_request_cmd) 12 | .method("proxy_receive_cmd", &CMQProxy::proxy_receive_cmd) 13 | .method("add_pending_workers", &CMQProxy::add_pending_workers) 14 | .method("close", &CMQProxy::close) 15 | .method("process_one", &CMQProxy::process_one) 16 | ; 17 | } 18 | -------------------------------------------------------------------------------- /src/CMQProxy.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "common.h" 3 | #include "CMQMaster.h" 4 | 5 | class CMQProxy { 6 | public: 7 | CMQProxy(): ctx(new zmq::context_t(1)) { 8 | external_context = false; 9 | } 10 | CMQProxy(SEXP ctx_): ctx(Rcpp::as>(ctx_)) {} 11 | ~CMQProxy() { close(); } 12 | 13 | void close(int timeout=1000L) { 14 | if (mon.handle() != nullptr) { 15 | mon.set(zmq::sockopt::linger, 0); 16 | mon.close(); 17 | } 18 | if (to_worker.handle() != nullptr) { 19 | to_worker.set(zmq::sockopt::linger, timeout); 20 | to_worker.close(); 21 | } 22 | if (to_master.handle() != nullptr) { 23 | to_master.set(zmq::sockopt::linger, 
timeout); 24 | to_master.close(); 25 | } 26 | if (!external_context && ctx != nullptr) { 27 | ctx->close(); 28 | delete ctx; 29 | ctx = nullptr; 30 | } 31 | } 32 | 33 | void connect(std::string addr, int timeout=-1) { 34 | to_master = zmq::socket_t(*ctx, ZMQ_DEALER); 35 | to_master.set(zmq::sockopt::connect_timeout, timeout); 36 | to_master.set(zmq::sockopt::routing_id, "proxy"); 37 | 38 | if (zmq_socket_monitor(to_master, "inproc://monitor", ZMQ_EVENT_DISCONNECTED) < 0) 39 | Rcpp::stop("failed to create socket monitor"); 40 | mon = zmq::socket_t(*ctx, ZMQ_PAIR); 41 | mon.connect("inproc://monitor"); 42 | 43 | to_master.connect(addr); 44 | } 45 | 46 | void proxy_request_cmd() { 47 | to_master.send(zmq::message_t(0), zmq::send_flags::sndmore); 48 | to_master.send(int2msg(wlife_t::proxy_cmd), zmq::send_flags::sndmore); 49 | to_master.send(r2msg(proc_time()), zmq::send_flags::sndmore); 50 | to_master.send(r2msg(gc()), zmq::send_flags::none); 51 | } 52 | SEXP proxy_receive_cmd() { 53 | std::vector msgs; 54 | auto n = recv_multipart(to_master, std::back_inserter(msgs)); 55 | auto status = msg2wlife_t(msgs[1]); 56 | return msg2r(std::move(msgs[2]), true); 57 | } 58 | 59 | void add_pending_workers(int n) { 60 | // proxy will always wait 61 | } 62 | 63 | std::string listen(Rcpp::CharacterVector addrs) { 64 | to_worker = zmq::socket_t(*ctx, ZMQ_ROUTER); 65 | to_worker.set(zmq::sockopt::router_mandatory, 1); 66 | #ifdef ZMQ_BUILD_DRAFT_API 67 | to_worker.set(zmq::sockopt::router_notify, ZMQ_NOTIFY_DISCONNECT); 68 | #endif 69 | 70 | int i; 71 | for (i=0; i(addrs[i]); 73 | try { 74 | to_worker.bind(addr); 75 | return to_worker.get(zmq::sockopt::last_endpoint); 76 | } catch(zmq::error_t const &e) { 77 | if (errno != EADDRINUSE) 78 | Rcpp::stop(e.what()); 79 | } 80 | } 81 | Rcpp::stop("Could not bind port to any address in provided pool"); 82 | } 83 | 84 | bool process_one() { 85 | auto pitems = std::vector(3); 86 | pitems[0].socket = to_master; 87 | pitems[0].events = 
ZMQ_POLLIN; 88 | pitems[1].socket = to_worker; 89 | pitems[1].events = ZMQ_POLLIN; 90 | pitems[2].socket = mon; 91 | pitems[2].events = ZMQ_POLLIN; 92 | 93 | auto time_left = std::chrono::milliseconds(-1); 94 | int rc = 0; 95 | do { 96 | try { 97 | rc = zmq::poll(pitems, time_left); 98 | } catch (zmq::error_t const &e) { 99 | if (errno != EINTR || pending_interrupt()) 100 | Rcpp::stop(e.what()); 101 | } 102 | } while (rc == 0); 103 | 104 | // master to worker communication -> add R env objects 105 | // frames: id, delim, status, call, [objs{1..n},] env_add 106 | if (pitems[0].revents > 0) { 107 | std::vector msgs; 108 | auto n = recv_multipart(to_master, std::back_inserter(msgs)); 109 | std::vector add_from_proxy; 110 | if (msgs.size() >= 5) { 111 | add_from_proxy = Rcpp::as>(msg2r(std::move(msgs.back()), true)); 112 | msgs.pop_back(); 113 | } 114 | 115 | zmq::multipart_t mp; 116 | for (int i=0; i= 4) { 119 | auto name = msgs[i++].to_string(); 120 | mp.push_back(zmq::message_t(msgs[i].data(), msgs[i].size())); 121 | env[name] = zmq::message_t(msgs[i].data(), msgs[i].size()); 122 | } 123 | } 124 | 125 | // std::cout << "adding from proxy env: (" << add_from_proxy.size() << ")"; 126 | for (auto &name : add_from_proxy) { 127 | mp.push_back(zmq::message_t(name)); 128 | mp.push_back(zmq::message_t(env[name].data(), env[name].size(), [](void*, void*){})); 129 | } 130 | // std::cout << "\nMESSAGE SIZE to worker: " << mp.size() << "\n\n"; 131 | mp.send(to_worker); 132 | } 133 | 134 | // worker to master communication -> simple forward 135 | if (pitems[1].revents > 0) { 136 | std::vector msgs; 137 | auto n = recv_multipart(to_worker, std::back_inserter(msgs)); 138 | zmq::multipart_t mp; 139 | for (int i=0; i 0) 145 | return false; 146 | 147 | return true; 148 | } 149 | 150 | private: 151 | Rcpp::Function proc_time {"proc.time"}; 152 | Rcpp::Function gc {"gc"}; 153 | bool external_context {true}; 154 | zmq::context_t *ctx {nullptr}; 155 | zmq::socket_t to_master; 156 | 
zmq::socket_t to_worker; 157 | zmq::socket_t mon; 158 | std::unordered_map env; 159 | }; 160 | -------------------------------------------------------------------------------- /src/CMQWorker.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "CMQWorker.h" 3 | 4 | RCPP_MODULE(cmq_worker) { 5 | using namespace Rcpp; 6 | class_("CMQWorker") 7 | .constructor() 8 | .constructor() 9 | .method("connect", &CMQWorker::connect) 10 | .method("close", &CMQWorker::close) 11 | .method("poll", &CMQWorker::poll) 12 | .method("process_one", &CMQWorker::process_one) 13 | ; 14 | } 15 | -------------------------------------------------------------------------------- /src/CMQWorker.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "common.h" 3 | 4 | class CMQWorker { 5 | public: 6 | CMQWorker(): ctx(new zmq::context_t(1)) { 7 | external_context = false; 8 | } 9 | CMQWorker(SEXP ctx_): ctx(Rcpp::as>(ctx_)) {} 10 | ~CMQWorker() { close(); } 11 | 12 | void connect(std::string addr, int timeout=5000) { 13 | sock = zmq::socket_t(*ctx, ZMQ_REQ); 14 | // timeout would need ZMQ_RECONNECT_STOP_CONN_REFUSED (draft, no C++ yet) to work 15 | sock.set(zmq::sockopt::connect_timeout, timeout); 16 | sock.set(zmq::sockopt::immediate, 1); 17 | 18 | if (mon.handle() == nullptr) { 19 | if (zmq_socket_monitor(sock, "inproc://monitor", ZMQ_EVENT_DISCONNECTED) < 0) 20 | Rcpp::stop("failed to create socket monitor"); 21 | mon = zmq::socket_t(*ctx, ZMQ_PAIR); 22 | mon.connect("inproc://monitor"); 23 | } 24 | 25 | try { 26 | sock.connect(addr); 27 | check_send_ready(timeout); 28 | sock.send(int2msg(wlife_t::active), zmq::send_flags::sndmore); 29 | sock.send(r2msg(proc_time()), zmq::send_flags::sndmore); 30 | sock.send(r2msg(gc()), zmq::send_flags::sndmore); 31 | sock.send(r2msg(R_NilValue), zmq::send_flags::none); 32 | } catch (zmq::error_t const &e) { 33 | Rcpp::stop(e.what()); 34 | } 35 | } 
36 | 37 | void close() { 38 | if (mon.handle() != nullptr) { 39 | mon.set(zmq::sockopt::linger, 0); 40 | mon.close(); 41 | } 42 | if (sock.handle() != nullptr) { 43 | sock.set(zmq::sockopt::linger, 10000); 44 | sock.close(); 45 | } 46 | if (!external_context && ctx != nullptr) { 47 | ctx->close(); 48 | delete ctx; 49 | ctx = nullptr; 50 | } 51 | } 52 | 53 | void poll() { 54 | auto pitems = std::vector(2); 55 | pitems[0].socket = sock; 56 | pitems[0].events = ZMQ_POLLIN; 57 | pitems[1].socket = mon; 58 | pitems[1].events = ZMQ_POLLIN; 59 | 60 | int total_sock_ev = 0; 61 | do { 62 | try { 63 | zmq::poll(pitems, std::chrono::milliseconds{-1}); 64 | } catch (zmq::error_t const &e) { 65 | if (errno != EINTR || pending_interrupt()) 66 | Rcpp::stop(e.what()); 67 | } 68 | if (pitems[1].revents > 0) 69 | Rcpp::stop("Unexpected peer disconnect"); 70 | total_sock_ev = pitems[0].revents; 71 | } while (total_sock_ev == 0); 72 | } 73 | 74 | bool process_one() { 75 | std::vector msgs; 76 | auto n = recv_multipart(sock, std::back_inserter(msgs)); 77 | 78 | // std::cout << "Received message: "; 79 | // for (int i=0; ito_string(); 91 | if (name.compare(0, 8, "package:") == 0) 92 | load_pkg(name.substr(8, std::string::npos)); 93 | else 94 | env.assign(name, msg2r(std::move(*it), true)); 95 | } 96 | 97 | SEXP cmd, eval, time, mem; 98 | PROTECT(cmd = msg2r(std::move(msgs[1]), true)); 99 | int err = 0; 100 | PROTECT(eval = R_tryEvalSilent(Rcpp::as(cmd)[0], env, &err)); 101 | if (err) { 102 | auto cmq = Rcpp::Environment::namespace_env("clustermq"); 103 | Rcpp::Function wrap_error = cmq["wrap_error"]; 104 | UNPROTECT(1); 105 | PROTECT(eval = wrap_error(cmd)); 106 | } 107 | PROTECT(time = proc_time()); 108 | PROTECT(mem = gc()); 109 | sock.send(int2msg(wlife_t::active), zmq::send_flags::sndmore); 110 | sock.send(r2msg(time), zmq::send_flags::sndmore); 111 | sock.send(r2msg(mem), zmq::send_flags::sndmore); 112 | sock.send(r2msg(eval), zmq::send_flags::none); 113 | UNPROTECT(4); 114 | 
return true; 115 | } 116 | 117 | private: 118 | bool external_context {true}; 119 | zmq::context_t *ctx {nullptr}; 120 | zmq::socket_t sock; 121 | zmq::socket_t mon; 122 | Rcpp::Environment env {1}; 123 | Rcpp::Function load_pkg {"library"}; 124 | Rcpp::Function proc_time {"proc.time"}; 125 | Rcpp::Function gc {"gc"}; 126 | 127 | void check_send_ready(int timeout=5000) { 128 | auto pitems = std::vector(1); 129 | pitems[0].socket = sock; 130 | pitems[0].events = ZMQ_POLLOUT; 131 | 132 | auto time_ms = std::chrono::milliseconds(timeout); 133 | auto time_left = time_ms; 134 | auto start = Time::now(); 135 | 136 | do { 137 | try { 138 | zmq::poll(pitems, time_left); 139 | } catch (zmq::error_t const &e) { 140 | if (errno != EINTR || pending_interrupt()) 141 | Rcpp::stop(e.what()); 142 | } 143 | 144 | auto ms_diff = std::chrono::duration_cast(Time::now() - start); 145 | time_left = time_ms - ms_diff; 146 | if (time_left.count() < 0) { 147 | std::ostringstream err; 148 | err << "Connection failed after " << ms_diff.count() << " ms\n"; 149 | throw Rcpp::exception(err.str().c_str()); 150 | } 151 | } while (pitems[0].revents == 0); 152 | } 153 | }; 154 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = @cflags@ 2 | PKG_CFLAGS = @cflags@ 3 | PKG_LIBS = @libs@ 4 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = -DZMQ_STATIC -Icppzmq -I"$(R_TOOLS_SOFT)/include" 2 | PKG_LIBS = -L"$(R_TOOLS_SOFT)/lib" -lzmq -lsodium -lpthread -liphlpapi -lws2_32 3 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() 
-> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // has_connectivity 14 | bool has_connectivity(std::string host); 15 | RcppExport SEXP _clustermq_has_connectivity(SEXP hostSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< std::string >::type host(hostSEXP); 20 | rcpp_result_gen = Rcpp::wrap(has_connectivity(host)); 21 | return rcpp_result_gen; 22 | END_RCPP 23 | } 24 | // libzmq_has_draft 25 | bool libzmq_has_draft(); 26 | RcppExport SEXP _clustermq_libzmq_has_draft() { 27 | BEGIN_RCPP 28 | Rcpp::RObject rcpp_result_gen; 29 | Rcpp::RNGScope rcpp_rngScope_gen; 30 | rcpp_result_gen = Rcpp::wrap(libzmq_has_draft()); 31 | return rcpp_result_gen; 32 | END_RCPP 33 | } 34 | 35 | RcppExport SEXP _rcpp_module_boot_cmq_master(); 36 | RcppExport SEXP _rcpp_module_boot_cmq_proxy(); 37 | RcppExport SEXP _rcpp_module_boot_cmq_worker(); 38 | 39 | static const R_CallMethodDef CallEntries[] = { 40 | {"_clustermq_has_connectivity", (DL_FUNC) &_clustermq_has_connectivity, 1}, 41 | {"_clustermq_libzmq_has_draft", (DL_FUNC) &_clustermq_libzmq_has_draft, 0}, 42 | {"_rcpp_module_boot_cmq_master", (DL_FUNC) &_rcpp_module_boot_cmq_master, 0}, 43 | {"_rcpp_module_boot_cmq_proxy", (DL_FUNC) &_rcpp_module_boot_cmq_proxy, 0}, 44 | {"_rcpp_module_boot_cmq_worker", (DL_FUNC) &_rcpp_module_boot_cmq_worker, 0}, 45 | {NULL, NULL, 0} 46 | }; 47 | 48 | RcppExport void R_init_clustermq(DllInfo *dll) { 49 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 50 | R_useDynamicSymbols(dll, FALSE); 51 | } 52 | -------------------------------------------------------------------------------- /src/common.cpp: 
-------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | Rcpp::Function R_serialize("serialize"); 4 | Rcpp::Function R_unserialize("unserialize"); 5 | 6 | const char* wlife_t2str(wlife_t status) { 7 | switch(status) { 8 | case wlife_t::active: return "active"; 9 | case wlife_t::shutdown: return "shutdown"; 10 | case wlife_t::finished: return "finished"; 11 | case wlife_t::error: return "error"; 12 | case wlife_t::proxy_cmd: return "proxy_cmd"; 13 | case wlife_t::proxy_error: return "proxy_error"; 14 | default: Rcpp::stop("Invalid worker status"); 15 | } 16 | } 17 | 18 | void check_interrupt_fn(void *dummy) { 19 | R_CheckUserInterrupt(); 20 | } 21 | 22 | int pending_interrupt() { 23 | return !(R_ToplevelExec(check_interrupt_fn, NULL)); 24 | } 25 | 26 | zmq::message_t int2msg(const int val) { 27 | zmq::message_t msg(sizeof(int)); 28 | memcpy(msg.data(), &val, sizeof(int)); 29 | return msg; 30 | } 31 | 32 | zmq::message_t r2msg(SEXP data) { 33 | if (TYPEOF(data) != RAWSXP) 34 | data = R_serialize(data, R_NilValue); 35 | zmq::message_t msg(Rf_xlength(data)); 36 | memcpy(msg.data(), RAW(data), Rf_xlength(data)); 37 | return msg; 38 | } 39 | 40 | SEXP msg2r(const zmq::message_t &&msg, const bool unserialize) { 41 | SEXP ans = Rf_allocVector(RAWSXP, msg.size()); 42 | memcpy(RAW(ans), msg.data(), msg.size()); 43 | if (unserialize) 44 | return R_unserialize(ans); 45 | else 46 | return ans; 47 | } 48 | 49 | wlife_t msg2wlife_t(const zmq::message_t &msg) { 50 | wlife_t res; 51 | memcpy(&res, msg.data(), msg.size()); 52 | return res; 53 | } 54 | 55 | std::string z85_encode_routing_id(const std::string rid) { 56 | std::string dest(5, 0); 57 | zmq_z85_encode(&dest[0], reinterpret_cast(&rid[1]), 4); 58 | return dest; 59 | } 60 | 61 | std::set set_difference(std::set &set1, std::set &set2) { 62 | std::set diff; 63 | std::set_difference(set1.begin(), set1.end(), set2.begin(), set2.end(), 64 | std::inserter(diff, diff.end())); 
65 | return diff; 66 | } 67 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef _COMMON_H_ 2 | #define _COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "zmq.hpp" 10 | #include "zmq_addon.hpp" 11 | 12 | #if ! ZMQ_VERSION >= ZMQ_MAKE_VERSION(4, 3, 0) || \ 13 | ! CPPZMQ_VERSION >= ZMQ_MAKE_VERSION(4, 10, 0) 14 | #define XSTR(x) STR(x) 15 | #define STR(x) #x 16 | #pragma message "libzmq version is: " XSTR(ZMQ_VERSION_MAJOR) "." \ 17 | XSTR(ZMQ_VERSION_MINOR) "." XSTR(ZMQ_VERSION_PATCH) 18 | #pragma message "cppzmq version is: " XSTR(CPPZMQ_VERSION_MAJOR) "." \ 19 | XSTR(CPPZMQ_VERSION_MINOR) "." XSTR(CPPZMQ_VERSION_PATCH) 20 | #error clustermq needs libzmq>=4.3.0 and cppzmq>=4.10.0 21 | #endif 22 | 23 | enum wlife_t { 24 | active, 25 | shutdown, 26 | finished, 27 | error, 28 | proxy_cmd, 29 | proxy_error 30 | }; 31 | const char* wlife_t2str(wlife_t status); 32 | typedef std::chrono::high_resolution_clock Time; 33 | typedef std::chrono::milliseconds ms; 34 | extern Rcpp::Function R_serialize; 35 | extern Rcpp::Function R_unserialize; 36 | 37 | void check_interrupt_fn(void *dummy); 38 | int pending_interrupt(); 39 | zmq::message_t int2msg(const int val); 40 | zmq::message_t r2msg(SEXP data); 41 | SEXP msg2r(const zmq::message_t &&msg, const bool unserialize); 42 | wlife_t msg2wlife_t(const zmq::message_t &msg); 43 | std::string z85_encode_routing_id(const std::string rid); 44 | std::set set_difference(std::set &set1, std::set &set2); 45 | 46 | #endif // _COMMON_H_ 47 | -------------------------------------------------------------------------------- /src/util.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "zmq.hpp" 4 | 5 | // [[Rcpp::export]] 6 | bool has_connectivity(std::string host) { 7 | bool success = false; 8 
| zmq::context_t ctx; 9 | zmq::socket_t server = zmq::socket_t(ctx, ZMQ_REP); 10 | zmq::socket_t client = zmq::socket_t(ctx, ZMQ_REQ); 11 | 12 | try { 13 | server.bind("tcp://*:*"); 14 | std::string addr = server.get(zmq::sockopt::last_endpoint); 15 | const std::string all_hosts = "0.0.0.0"; 16 | addr.replace(addr.find(all_hosts), all_hosts.size(), host); 17 | 18 | client.connect(addr); 19 | const std::string msg1 = "testing connection"; 20 | client.send(zmq::buffer(msg1), zmq::send_flags::none); 21 | 22 | zmq::message_t msg2; 23 | auto time_ms = std::chrono::milliseconds(200); 24 | auto pitems = std::vector(1); 25 | pitems[0].socket = server; 26 | pitems[0].events = ZMQ_POLLIN; 27 | zmq::poll(pitems, time_ms); 28 | auto n = server.recv(msg2, zmq::recv_flags::dontwait); 29 | auto msg2_s = std::string(reinterpret_cast(msg2.data()), msg2.size()); 30 | 31 | if (msg1 == msg2_s) 32 | success = true; 33 | } catch(zmq::error_t const &e) { 34 | // std::cerr << e.what() << "\n"; 35 | success = false; 36 | } 37 | 38 | client.set(zmq::sockopt::linger, 0); 39 | client.close(); 40 | server.set(zmq::sockopt::linger, 0); 41 | server.close(); 42 | ctx.close(); 43 | 44 | return success; 45 | } 46 | 47 | // [[Rcpp::export]] 48 | bool libzmq_has_draft() { 49 | #ifdef ZMQ_BUILD_DRAFT_API 50 | return true; 51 | #else 52 | return false; 53 | #endif 54 | } 55 | -------------------------------------------------------------------------------- /src/util/build_libzmq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd "$(dirname $0)"/../libzmq 4 | 5 | if [ ! -f Makefile.in ]; then 6 | ./autogen.sh || exit 1 7 | fi 8 | 9 | if [ ! 
#!/bin/sh
# Build a static libzmq (with the draft API enabled) from the bundled
# submodule. Presumably invoked by the package configure script when no
# usable system libzmq is found — TODO confirm against ../configure.
# Idempotent: each step is skipped if its output already exists.

cd "$(dirname $0)"/../libzmq

# a fresh git checkout has no ./configure yet; release tarballs ship Makefile.in
if [ ! -f Makefile.in ]; then
    ./autogen.sh || exit 1
fi

# configure and build only if the static archive is not already present;
# -fPIC is required because the archive gets linked into the R shared object
if [ ! -f src/.libs/libzmq.a ]; then
    CXX="$CXX" CXXFLAGS="$CXXFLAGS -fPIC" CPPFLAGS="$CPPFLAGS" ./configure \
        --enable-drafts \
        --enable-static \
        --disable-shared \
        --disable-maintainer-mode \
        --disable-Werror \
        --disable-libbsd \
        --disable-libunwind \
        --disable-perf \
        --disable-curve \
        --disable-curve-keygen \
        --disable-ws \
        --disable-radix-tree \
        --without-docs
    make || exit 1
fi
-f include/zmq_utils.h.orig ]; then 25 | mv include/zmq_utils.h include/zmq_utils.h.orig 26 | sed '/^#pragma/s|^|//|' include/zmq_utils.h.orig > include/zmq_utils.h 27 | fi 28 | -------------------------------------------------------------------------------- /src/util/test_cpp11.cpp: -------------------------------------------------------------------------------- 1 | #if (!defined(__llvm__) && !defined(__INTEL_COMPILER) && defined(__GNUC__) && __GNUC__ < 5) || \ 2 | (defined(__GLIBCXX__) && __GLIBCXX__ < 20160805) 3 | #error "gcc with no or only partial c++11 support" 4 | #endif 5 | 6 | int main() {} 7 | -------------------------------------------------------------------------------- /src/util/test_libzmq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #if ZMQ_VERSION < ZMQ_MAKE_VERSION(4, 3, 0) 3 | #error clustermq needs libzmq>=4.3.0 4 | #endif 5 | int main() { 6 | #ifndef ZMQ_BUILD_DRAFT_API 7 | return 1; 8 | #endif 9 | } 10 | -------------------------------------------------------------------------------- /tests/bin/bkill: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | -------------------------------------------------------------------------------- /tests/bin/bsub: -------------------------------------------------------------------------------- 1 | fake_scheduler.sh -------------------------------------------------------------------------------- /tests/bin/fake_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUT=/dev/stderr 3 | echo "starting PID $$" > $OUT 4 | timeout 30 sh < /dev/stdin >> $OUT 2>&1 & 5 | [[ $? 
== 0 ]] && echo "started PID $$" >> $OUT 6 | -------------------------------------------------------------------------------- /tests/bin/qdel: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | -------------------------------------------------------------------------------- /tests/bin/qsub: -------------------------------------------------------------------------------- 1 | fake_scheduler.sh -------------------------------------------------------------------------------- /tests/bin/sbatch: -------------------------------------------------------------------------------- 1 | fake_scheduler.sh -------------------------------------------------------------------------------- /tests/bin/scancel: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | test_check("clustermq") 3 | -------------------------------------------------------------------------------- /tests/testthat/helper-util.r: -------------------------------------------------------------------------------- 1 | ssh_opts = "-oPasswordAuthentication=no -oChallengeResponseAuthentication=no" 2 | 3 | has_ssh = function(host) { 4 | status = system(paste("ssh", ssh_opts, host, "'exit'"), wait=TRUE, 5 | ignore.stdout=TRUE, ignore.stderr=TRUE) 6 | status == 0 7 | } 8 | 9 | has_ssh_cmq = function(host) { 10 | status = suppressWarnings( 11 | system(paste("ssh", ssh_opts, host, "'R -e \"library(clustermq)\"'"), 12 | wait=TRUE, ignore.stdout=TRUE, ignore.stderr=TRUE)) 13 | status == 0 14 | } 15 | 16 | has_cmq = function(host) { 17 | status = system("R -e 'library(clustermq)'", wait=TRUE, 18 | ignore.stdout=TRUE, ignore.stderr=TRUE) 19 | status == 0 20 | } 21 | 
-------------------------------------------------------------------------------- /tests/testthat/test-0-util.r: -------------------------------------------------------------------------------- 1 | context("util") 2 | 3 | test_that("template filler", { 4 | tmpl = "this is my {{ template }}" 5 | values = list(template = "filled") 6 | 7 | filled = fill_template(tmpl, values) 8 | expect_equal(filled, "this is my filled") 9 | 10 | expect_error(fill_template(tmpl, list(key="unrelated"))) 11 | }) 12 | 13 | test_that("template default values", { 14 | tmpl = "this is my {{ template | default }}" 15 | values = list(template = "filled") 16 | 17 | filled1 = fill_template(tmpl, values) 18 | expect_equal(filled1, "this is my filled") 19 | 20 | filled2 = fill_template(tmpl, list()) 21 | expect_equal(filled2, "this is my default") 22 | }) 23 | 24 | test_that("template required key", { 25 | tmpl = "this is my {{ template }}" 26 | values = list(template = "filled") 27 | 28 | expect_error(fill_template(tmpl, values, required="missing")) 29 | }) 30 | 31 | test_that("template filling works with vectors", { 32 | tmpl = "{{ var1 }} and {{ var2 }}" 33 | values = c(var1=1, var2=2) 34 | 35 | expect_equal(fill_template(tmpl, values), "1 and 2") 36 | }) 37 | 38 | test_that("template numbers are not converted to sci format", { 39 | tmpl = "this is my {{ template }}" 40 | values = list(template = 100000) 41 | 42 | expect_equal(fill_template(tmpl, values), "this is my 100000") 43 | }) 44 | 45 | test_that("no sci format when passing vectors", { 46 | tmpl = "{{ var1 }} and {{ var2 }}" 47 | values = c(var1=1, var2=1e6) 48 | 49 | expect_equal(fill_template(tmpl, values), "1 and 1000000") 50 | }) 51 | 52 | test_that("BiocGenerics changes format dispatch (#337)", { 53 | # see: https://github.com/Bioconductor/BiocGenerics/blob/RELEASE_3_20/R/format.R 54 | setGeneric("format") 55 | format.list = base::format.AsIs 56 | 57 | tmpl = "{{ var1 }} and {{ var2 }}" 58 | values = c(var1=1, var2=100) 59 | 
expect_equal(fill_template(tmpl, values), "1 and 100") 60 | }) 61 | -------------------------------------------------------------------------------- /tests/testthat/test-1-check_args.r: -------------------------------------------------------------------------------- 1 | context("check_args") 2 | 3 | test_that("required args are provided", { 4 | f1 = function(x) x 5 | # x is provided 6 | expect_is(check_args(f1, iter=list(x=1)), "data.frame") 7 | expect_error(check_args(f1, iter=list(y=1))) 8 | 9 | # don't allow empty iter argument 10 | expect_error(check_args(f1, iter=list())) 11 | expect_error(check_args(f1, const=list(x=1))) 12 | }) 13 | 14 | test_that("no superfluous args unless function takes `...`", { 15 | f1 = function(x) x 16 | expect_error(check_args(f1, iter=list(x=1, y=1))) 17 | expect_error(check_args(f1, iter=list(x=1), const=list(y=1))) 18 | 19 | f2 = function(x, ...) x 20 | expect_is(check_args(f2, iter=list(x=1, y=1)), "data.frame") 21 | expect_is(check_args(f2, iter=list(x=1), const=list(y=1)), "data.frame") 22 | }) 23 | 24 | test_that("allow 1 non-optional unnamed arg", { 25 | f1 = function(x) x 26 | f2 = function(x, y=1) x+y 27 | f3 = function(x, y) x+y 28 | 29 | # allow 1 unnamed arg, but not wrong name 30 | expect_is(check_args(f1, iter=list(1)), "data.frame") 31 | expect_is(check_args(f2, iter=list(1)), "data.frame") 32 | expect_error(check_args(f3, iter=list(1))) 33 | }) 34 | -------------------------------------------------------------------------------- /tests/testthat/test-2-worker.r: -------------------------------------------------------------------------------- 1 | context("worker usage") 2 | 3 | test_that("connect to invalid endpoint errors", { 4 | w = methods::new(CMQWorker) 5 | expect_error(w$connect("tcp://localhost:12345", 0L)) 6 | w$close() 7 | }) 8 | 9 | test_that("recv without pending workers errors before timeout", { 10 | m = methods::new(CMQMaster) 11 | addr = m$listen("inproc://endpoint") 12 | expect_error(m$recv(-1L)) 13 | 
m$close(500L) 14 | }) 15 | 16 | test_that("recv timeout works", { 17 | m = methods::new(CMQMaster) 18 | addr = m$listen("inproc://endpoint") 19 | m$add_pending_workers(1L) 20 | expect_error(m$recv(0L)) 21 | m$close(500L) 22 | }) 23 | 24 | test_that("worker evaluation", { 25 | m = methods::new(CMQMaster) 26 | w = methods::new(CMQWorker, m$context()) 27 | addr = m$listen("inproc://endpoint") 28 | m$add_pending_workers(1L) 29 | w$connect(addr, 500L) 30 | 31 | m$recv(500L) 32 | m$send(expression(5 * 2)) 33 | status = w$process_one() 34 | result = m$recv(500L) 35 | 36 | expect_true(status) 37 | expect_equal(result, 10) 38 | 39 | w$close() 40 | m$close(500L) 41 | }) 42 | 43 | test_that("export variable to worker", { 44 | m = methods::new(CMQMaster) 45 | w = methods::new(CMQWorker, m$context()) 46 | addr = m$listen("inproc://endpoint") 47 | m$add_pending_workers(1L) 48 | w$connect(addr, 500L) 49 | 50 | m$add_env("x", 3) 51 | m$recv(500L) 52 | m$send(expression(5 + x)) 53 | status = w$process_one() 54 | result = m$recv(500L) 55 | expect_true(status) 56 | expect_equal(result, 8) 57 | 58 | m$add_env("x", 5) 59 | m$send(expression(5 + x)) 60 | status = w$process_one() 61 | result = m$recv(500L) 62 | expect_true(status) 63 | expect_equal(result, 10) 64 | 65 | w$close() 66 | m$close(500L) 67 | }) 68 | 69 | test_that("load package on worker", { 70 | m = methods::new(CMQMaster) 71 | w = methods::new(CMQWorker, m$context()) 72 | addr = m$listen("inproc://endpoint") 73 | m$add_pending_workers(1L) 74 | w$connect(addr, 500L) 75 | 76 | m$add_pkg("parallel") 77 | 78 | m$recv(500L) 79 | m$send(expression(splitIndices(1, 1)[[1]])) 80 | status = w$process_one() 81 | result = m$recv(500L) 82 | 83 | expect_true(status) 84 | expect_equal(result, 1) 85 | 86 | w$close() 87 | m$close(500L) 88 | }) 89 | 90 | test_that("errors are sent back to master", { 91 | skip("this works interactively but evaluates the error on testthat") 92 | 93 | m = methods::new(CMQMaster) 94 | w = methods::new(CMQWorker, 
m$context()) 95 | addr = m$listen("inproc://endpoint") 96 | m$add_pending_workers(1L) 97 | w$connect(addr, 500L) 98 | 99 | m$recv(500L) 100 | m$send(expression(stop("errmsg"))) 101 | status = w$process_one() 102 | result = m$recv(500L) 103 | 104 | expect_true(status) 105 | expect_true(inherits(result, c("condition", "worker_error"))) 106 | 107 | w$close() 108 | m$close(500L) 109 | }) 110 | 111 | test_that("worker R API", { 112 | skip_on_os("windows") 113 | skip_if_not(has_connectivity("127.0.0.1")) # -> this or inproc w/ passing context 114 | 115 | m = methods::new(CMQMaster) 116 | addr = m$listen("tcp://127.0.0.1:*") 117 | m$add_pending_workers(1L) 118 | # addr = m$listen("inproc://endpoint") # mailbox.cpp assertion error 119 | 120 | p = parallel::mcparallel(worker(addr)) 121 | expect_null(m$recv(5000L)) 122 | m$send(expression(5 + 1)) 123 | res = m$recv(500L) 124 | expect_equal(res[[1]], 6) 125 | 126 | m$send_shutdown() 127 | pc = parallel::mccollect(p, wait=TRUE, timeout=0.5) 128 | expect_equal(pc[[1]], NULL) 129 | m$close(500L) 130 | }) 131 | 132 | test_that("communication with two workers", { 133 | skip_on_os("windows") 134 | skip_if_not(has_connectivity("127.0.0.1")) 135 | 136 | m = methods::new(CMQMaster) 137 | addr = m$listen("tcp://127.0.0.1:*") 138 | m$add_pending_workers(2L) 139 | w1 = parallel::mcparallel(worker(addr)) 140 | w2 = parallel::mcparallel(worker(addr)) 141 | 142 | expect_null(m$recv(5000L)) # worker 1 up 143 | m$send(expression({ Sys.sleep(0.5); 5 + 2 })) 144 | expect_null(m$recv(500L)) # worker 2 up 145 | m$send(expression({ Sys.sleep(0.5); 3 + 1 })) 146 | r1 = m$recv(1000L) 147 | m$send_shutdown() 148 | r2 = m$recv(1000L) 149 | m$send_shutdown() 150 | expect_equal(sort(c(r1, r2)), c(4,7)) 151 | 152 | coll1 = parallel::mccollect(w1, wait=TRUE, timeout=0.5) 153 | expect_equal(names(coll1), as.character(w1$pid)) 154 | coll2 = parallel::mccollect(w2, wait=TRUE, timeout=0.5) 155 | expect_equal(names(coll2), as.character(w2$pid)) 156 | 157 | 
m$close(500L) 158 | }) 159 | -------------------------------------------------------------------------------- /tests/testthat/test-3-work_chunk.r: -------------------------------------------------------------------------------- 1 | context("work_chunk") 2 | 3 | df = structure(row.names=c(NA, -3), class="data.frame", .Data=list( 4 | a = 1:3, 5 | b = as.list(letters[1:3]), 6 | c = setNames(as.list(3:1), letters[1:3]) 7 | )) 8 | 9 | test_that("data types and arg names", { 10 | fx = function(c, a, b) a + c 11 | expect_equal(work_chunk(df, fx)$result, 12 | setNames(as.list(rep(4,3)), rownames(df))) 13 | 14 | expect_equal(work_chunk(df, fx, rettype="numeric")$result, 15 | setNames(rep(4,3), rownames(df))) 16 | }) 17 | 18 | test_that("check call classes", { 19 | df2 = df 20 | df2$a = list(matrix(1:4, nrow=2)) 21 | fx = function(...) sapply(list(...), class) 22 | 23 | re = sapply(colnames(df2), function(i) class(df2[[1,i]])) 24 | expect_equal(work_chunk(df2, fx)$result, setNames(rep(list(re), 3), c(1:3))) 25 | }) 26 | 27 | test_that("do not unlist matrix in data.frame", { 28 | elm = structure(1:4, .Dim = c(2,2), .Dimnames=list(c("r1","r2"), c("c1","c2"))) 29 | df2 = structure(list(expr = structure(list(expr = elm)))) 30 | 31 | fx = function(...) list(...) 32 | expect_equal(work_chunk(df2, fx)$result$'1', list(expr=elm)) 33 | }) 34 | 35 | test_that("warning and error handling", { 36 | fx = function(a, ...) 
{ 37 | if (a %% 3 == 0) 38 | warning("warning") 39 | if (a %% 2 == 0) 40 | stop("error") 41 | a 42 | } 43 | 44 | re = work_chunk(data.frame(a=1:6), fx) 45 | expect_equal(sapply(re$result, class) == "error", 46 | setNames(rep(c(FALSE,TRUE), 3), 1:6)) 47 | expect_equal(c(1,3,5), unname(unlist(re$result[c(1,3,5)]))) 48 | expect_equal(c(1,3,5), as.integer(names(re$result[c(1,3,5)]))) 49 | expect_equal(length(re$warnings), 2) 50 | expect_true(grepl("3", re$warnings[[1]])) 51 | expect_true(grepl("warning", re$warnings[[1]])) 52 | expect_true(grepl("6", re$warnings[[2]])) 53 | expect_true(grepl("warning", re$warnings[[2]])) 54 | }) 55 | 56 | test_that("call can have multiple warnings", { 57 | fx = function(a) { 58 | if (a == 1) { 59 | warning("warning 1") 60 | warning("warning 2") 61 | } 62 | } 63 | re = work_chunk(data.frame(a=1:2), fx) 64 | expect_equal(length(re$warnings[['1']]), 2) 65 | }) 66 | 67 | test_that("const args", { 68 | fx = function(a, ..., x=23) a + x 69 | 70 | re = work_chunk(df, fx, const=list(x=5))$result 71 | expect_equal(re, setNames(as.list(df$a + 5), 1:3)) 72 | }) 73 | 74 | test_that("seed reproducibility", { 75 | fx = function(a, ...) 
sample(1:1000, 1) 76 | 77 | # seed should be set by common + df row name 78 | expect_equal(work_chunk(df[1:2,], fx, common_seed=123)$result$'2', 79 | work_chunk(df[2:3,], fx, common_seed=123)$result$'2') 80 | }) 81 | 82 | test_that("env separation", { 83 | seed = 123 84 | fx = function(x, common_seed=seed) { 85 | fun = function(x) stop("overwrite function") 86 | df = data.frame() 87 | common_seed 88 | } 89 | df2 = data.frame(x=1:5) 90 | expect_equal(work_chunk(df2, fx)$result, setNames(rep(list(seed), 5), 1:5)) 91 | }) 92 | -------------------------------------------------------------------------------- /tests/testthat/test-4-pool.r: -------------------------------------------------------------------------------- 1 | context("pool") 2 | 3 | skip_if_not(has_connectivity("127.0.0.1")) 4 | 5 | test_that("starting and stopping multicore", { 6 | skip_on_os("windows") 7 | 8 | w = workers(1, qsys_id="multicore") 9 | expect_equal(w$workers_total, 1) 10 | expect_equal(w$workers_running, 0) 11 | expect_null(w$recv(5000L)) 12 | expect_equal(w$workers_running, 1) 13 | w$send(3 + 4) 14 | expect_equal(w$workers_running, 1) 15 | expect_equal(w$recv(1000L), 7) 16 | expect_equal(w$workers_running, 1) 17 | w$send_shutdown() 18 | expect_equal(w$workers_running, 0) 19 | expect_equal(w$workers_total, 0) 20 | expect_error(w$send(1)) 21 | expect_error(w$recv(1000L)) 22 | w$cleanup() 23 | expect_equal(w$workers_running, 0) 24 | expect_equal(w$workers_total, 0) 25 | expect_error(w$send(2)) 26 | expect_error(w$recv(1000L)) 27 | expect_equal(w$workers_running, 0) 28 | expect_equal(w$workers_total, 0) 29 | }) 30 | 31 | test_that("pending workers area cleaned up properly", { 32 | skip_on_os("windows") 33 | w = workers(1, qsys_id="multicore") 34 | w$cleanup() 35 | expect_equal(w$workers_running, 0) 36 | expect_equal(w$workers_total, 0) 37 | }) 38 | 39 | test_that("calculations are really done on the worker", { 40 | skip_on_os("windows") 41 | x = 1 42 | y = 2 43 | w = workers(1, 
qsys_id="multicore") 44 | expect_null(w$recv(5000L)) 45 | w$env(y = 3) 46 | w$send(x + y, x=4) 47 | expect_equal(w$recv(1000L), 7) 48 | w$send_shutdown() 49 | w$cleanup() 50 | }) 51 | 52 | test_that("call references are matched properly", { 53 | skip_on_os("windows") 54 | skip_on_cran() 55 | 56 | w = workers(2, qsys_id="multicore") 57 | expect_null(w$recv(5000L)) 58 | 59 | r1 = w$send({Sys.sleep(1); 1}) 60 | expect_null(w$recv(1000L)) 61 | r2 = w$send(2) 62 | expect_equal(w$recv(500L), 2) 63 | expect_equal(w$current()$call_ref, r2) 64 | w$send_shutdown() 65 | expect_equal(w$recv(2000L), 1) 66 | expect_equal(w$current()$call_ref, r1) 67 | w$cleanup() 68 | }) 69 | 70 | test_that("multiprocess", { 71 | skip("https://github.com/r-lib/processx/issues/236") 72 | 73 | w = workers(1, qsys_id="multiprocess") 74 | expect_null(w$recv()) 75 | w$send(3 + 5) 76 | expect_equal(w$recv(), 8) 77 | w$send_shutdown() 78 | w$cleanup() 79 | }) 80 | 81 | test_that("work_chunk on multiprocess", { 82 | skip("https://github.com/r-lib/processx/issues/236") 83 | 84 | w = workers(1, qsys_id="multiprocess") 85 | expect_null(w$recv()) 86 | w$send(clustermq:::work_chunk(chunk, `+`), chunk=list(a=1:3, b=4:6)) 87 | res = w$recv() 88 | expect_equal(res$result, list(`1`=5, `2`=7, `3`=9)) 89 | expect_equal(res$warnings, list()) 90 | expect_equal(res$errors, list()) 91 | w$send_shutdown() 92 | w$cleanup() 93 | }) 94 | 95 | test_that("worker creation passes template filling values", { 96 | TMPL_FILLER <<- R6::R6Class("TMPL_FILLER", 97 | inherit = QSys, 98 | public = list( 99 | initialize = function(addr, n_jobs, master, ...) { 100 | super$initialize(addr=addr, master=master, template="LSF") 101 | self$filled = private$fill_options(...) 
102 | }, 103 | filled = list() 104 | ) 105 | ) 106 | old_defaults = getOption("clustermq.defaults") 107 | on.exit(options(clustermq.defaults = old_defaults)) 108 | options(clustermq.defaults = list(cores="defaults_test", memory="invalid")) 109 | 110 | w = workers(1, qsys_id="tmpl_filler", template=list(memory="test")) 111 | rm(TMPL_FILLER, envir=.GlobalEnv) 112 | 113 | expect_equal(w$workers$filled$memory, "test") 114 | expect_equal(w$workers$filled$cores, "defaults_test") 115 | }) 116 | -------------------------------------------------------------------------------- /tests/testthat/test-5-queue.r: -------------------------------------------------------------------------------- 1 | context("queue") 2 | 3 | skip_if_not(has_connectivity("127.0.0.1")) 4 | 5 | test_that("control flow", { 6 | skip_on_os("windows") 7 | fx = function(x) x*2 8 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 9 | r = Q(fx, x=1:3, workers=w, timeout=10L) 10 | expect_equal(r, as.list(1:3*2)) 11 | }) 12 | 13 | test_that("control flow with automatic workers", { 14 | skip_on_os("windows") 15 | 16 | old_sched = getOption("clustermq.scheduler") 17 | on.exit(options(clustermq.scheduler = old_sched)) 18 | options(clustermq.scheduler = "multicore") 19 | 20 | fx = function(x) x*2 21 | r = Q(fx, x=1:3, n_jobs=1, timeout=10L) 22 | expect_equal(r, as.list(1:3*2)) 23 | }) 24 | 25 | test_that("common data", { 26 | skip_on_os("windows") 27 | fx = function(x, y) x*2 + y 28 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 29 | r = Q(fx, x=1:3, const=list(y=10), workers=w, timeout=10L) 30 | expect_equal(r, as.list(1:3*2+10)) 31 | }) 32 | 33 | test_that("export", { 34 | skip_on_os("windows") 35 | fx = function(x) x*2 + z 36 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 37 | r = Q(fx, x=1:3, export=list(z=20), workers=w, timeout=10L) 38 | expect_equal(r, as.list(1:3*2+20)) 39 | }) 40 | 41 | test_that("load package on worker", { 42 | skip_on_os("windows") 43 | fx = function(x) 
splitIndices(1,1) 44 | x = "a string" 45 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 46 | r = Q(fx, x=x, pkgs="parallel", workers=w, rettype="character", timeout=10L) 47 | expect_equal(r, "1") 48 | }) 49 | 50 | test_that("seed reproducibility", { 51 | skip_on_os("windows") 52 | fx = function(x) sample(1:100, 1) 53 | w1 = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 54 | w2 = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 55 | r1 = Q(fx, x=1:3, workers=w1, timeout=10L) 56 | r2 = Q(fx, x=1:3, workers=w2, timeout=10L) 57 | expect_equal(r1, r2) 58 | }) 59 | 60 | test_that("master does not exit loop prematurely", { 61 | skip_on_os("windows") 62 | fx = function(x) { 63 | Sys.sleep(0.5) 64 | x*2 65 | } 66 | w = workers(n_jobs=2, qsys_id="multicore", reuse=FALSE) 67 | r = Q(fx, x=1:3, workers=w, timeout=10L) 68 | expect_equal(r, as.list(1:3*2)) 69 | }) 70 | 71 | test_that("rettype is respected", { 72 | skip_on_os("windows") 73 | fx = function(x) x*2 74 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 75 | r = Q(fx, x=1:3, rettype="numeric", workers=w, timeout=10L) 76 | expect_equal(r, 1:3*2) 77 | }) 78 | 79 | test_that("worker timeout throws error", { 80 | skip_on_os("windows") 81 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 82 | expect_error(expect_warning( 83 | Q(Sys.sleep, 3, rettype="numeric", workers=w, timeout=1L))) 84 | }) 85 | 86 | test_that("Q with expired workers throws error quickly", { 87 | skip_on_cran() 88 | skip_on_os("windows") 89 | 90 | w = workers(n_jobs=1, qsys_id="multicore", reuse=FALSE) 91 | w$cleanup() 92 | 93 | times = system.time({ 94 | expect_error(Q(identity, x=1:3, rettype="numeric", workers=w, timeout=10L)) 95 | }) 96 | expect_true(times[["elapsed"]] < 5) 97 | }) 98 | 99 | test_that("shutdown monitor does not fire on clean disconnects", { 100 | skip_on_os("windows") 101 | skip_if_not(libzmq_has_draft()) 102 | 103 | w = workers(n_jobs=2, qsys_id="multicore", reuse=FALSE) 104 | res = Q(Sys.sleep, 
time=c(0,1), workers=w, timeout=10L) 105 | expect_equal(res, list(NULL, NULL)) 106 | }) 107 | 108 | test_that("max_calls_worker is respected", { 109 | skip_on_cran() 110 | skip_on_os("windows") 111 | 112 | fx = function(x) { Sys.sleep(x==1); Sys.getpid() } 113 | 114 | w = workers(n_jobs=2, qsys_id="multicore", reuse=FALSE) 115 | res = table(unlist(Q(fx, x=1:4, workers=w))) 116 | expect_true(setequal(res, c(1,3))) 117 | 118 | w = workers(n_jobs=2, qsys_id="multicore", reuse=FALSE) 119 | res = table(unlist(Q(fx, x=1:4, workers=w, max_calls_worker=2))) 120 | expect_true(setequal(res, 2)) 121 | }) 122 | -------------------------------------------------------------------------------- /tests/testthat/test-6-queue_impl.r: -------------------------------------------------------------------------------- 1 | context("qsys implementations") 2 | 3 | avail = Sys.which(c("bsub", "qsub", "sbatch", "fake_scheduler.sh")) 4 | avail = as.list(nchar(avail) != 0) 5 | fx = function(x) x*2 6 | 7 | test_that("local, explicit", { 8 | w = workers(n_jobs=4, qsys_id="local") 9 | r = Q(fx, x=1:3, workers=w, timeout=10L) 10 | success = w$cleanup() 11 | expect_equal(r, as.list(1:3*2)) 12 | expect_true(success) 13 | }) 14 | 15 | test_that("local, n_jobs=0", { 16 | fx = function(x) x*2 17 | r = Q(fx, x=1:3, n_jobs=0, timeout=10L) 18 | expect_equal(r, as.list(1:3*2)) 19 | }) 20 | 21 | test_that("qsys_multicore", { 22 | skip_on_os("windows") 23 | w = workers(n_jobs=4, qsys_id="multicore", reuse=FALSE) 24 | r = Q(fx, x=1:3, workers=w, timeout=10L) 25 | expect_equal(r, as.list(1:3*2)) 26 | }) 27 | 28 | test_that("qsys_multicore with reuse=TRUE", { 29 | skip_on_os("windows") 30 | w = workers(n_jobs=4, qsys_id="multicore", reuse=TRUE) 31 | r = Q(fx, x=1:3, workers=w, timeout=10L) 32 | success = w$cleanup() 33 | expect_equal(r, as.list(1:3*2)) 34 | expect_true(success) 35 | }) 36 | 37 | test_that("qsys_multiprocess (callr)", { 38 | skip("https://github.com/r-lib/processx/issues/236") 39 | 40 | w = 
workers(n_jobs=2, qsys_id="multiprocess", reuse=TRUE) 41 | r = Q(fx, x=1:3, workers=w, timeout=10L) 42 | success = w$cleanup() 43 | expect_equal(r, as.list(1:3*2)) 44 | expect_equal(success, TRUE) 45 | }) 46 | 47 | test_that("qsys_lsf", { 48 | skip_on_cran() 49 | skip_if_not(with(avail, bsub)) 50 | skip_if_not(has_cmq()) 51 | skip_if_not(has_connectivity(Sys.info()["nodename"])) 52 | skip_on_os("windows") 53 | w = workers(n_jobs=1, qsys_id="lsf", reuse=FALSE) 54 | r = Q(fx, x=1:3, workers=w, timeout=10L) 55 | expect_equal(r, as.list(1:3*2)) 56 | }) 57 | 58 | test_that("qsys_sge", { 59 | skip_on_cran() 60 | skip_if_not(with(avail, qsub)) 61 | skip_if_not(has_cmq()) 62 | skip_if_not(has_connectivity(Sys.info()["nodename"])) 63 | skip_on_os("windows") 64 | w = workers(n_jobs=1, qsys_id="sge", reuse=FALSE) 65 | r = Q(fx, x=1:3, workers=w, timeout=10L) 66 | expect_equal(r, as.list(1:3*2)) 67 | }) 68 | 69 | test_that("qsys_slurm", { 70 | skip_on_cran() 71 | skip_if_not(with(avail, sbatch)) 72 | skip_if_not(has_cmq()) 73 | skip_if_not(has_connectivity(Sys.info()["nodename"])) 74 | skip_on_os("windows") 75 | w = workers(n_jobs=1, qsys_id="slurm", reuse=FALSE) 76 | r = Q(fx, x=1:3, workers=w, timeout=10L) 77 | expect_equal(r, as.list(1:3*2)) 78 | }) 79 | -------------------------------------------------------------------------------- /tests/testthat/test-7-ssh_proxy.r: -------------------------------------------------------------------------------- 1 | context("ssh proxy") 2 | 3 | has_localhost = has_connectivity("127.0.0.1") 4 | 5 | # in the following 2 tests, passing the context is deactivated because running 6 | # the first test twice leads to a segfault; not sure why, fix this eventually 7 | test_that("simple forwarding works", { 8 | skip_if_not(has_localhost) 9 | 10 | m = methods::new(CMQMaster) 11 | p = methods::new(CMQProxy)#, m$context()) 12 | w = methods::new(CMQWorker)#, m$context()) 13 | addr1 = m$listen("tcp://127.0.0.1:*")#"inproc://master") 14 | addr2 = 
p$listen("tcp://127.0.0.1:*")#"inproc://proxy") 15 | m$add_pending_workers(1L) 16 | p$connect(addr1, 500L) 17 | w$connect(addr2, 500L) 18 | expect_true(p$process_one()) 19 | expect_null(m$recv(500L)) # worker up 20 | m$send(5 + 2) 21 | expect_true(p$process_one()) 22 | expect_true(w$process_one()) 23 | expect_true(p$process_one()) 24 | result = m$recv(500L) 25 | expect_equal(result, 7) 26 | 27 | w$close() 28 | p$close(0L) 29 | m$close(0L) 30 | }) 31 | 32 | test_that("proxy communication yields submit args", { 33 | skip_if_not(has_localhost) 34 | skip_on_cran() 35 | 36 | m = methods::new(CMQMaster) 37 | p = methods::new(CMQProxy)#, m$context()) 38 | addr1 = m$listen("tcp://127.0.0.1:*")#"inproc://master") 39 | addr2 = p$listen("tcp://127.0.0.1:*")#"inproc://proxy") 40 | 41 | # direct connection, no ssh forward here 42 | p$connect(addr1, 500L) 43 | p$proxy_request_cmd() 44 | m$proxy_submit_cmd(list(n_jobs=1), 500L) 45 | args = p$proxy_receive_cmd() 46 | 47 | expect_true(inherits(args, "list")) 48 | expect_true("n_jobs" %in% names(args)) 49 | 50 | p$close(0L) 51 | m$close(0L) 52 | }) 53 | 54 | test_that("using the proxy without pool and forward", { 55 | skip_on_cran() 56 | skip_on_os("windows") 57 | skip_if_not(has_localhost) 58 | skip_if(toupper(getOption("clustermq.scheduler", qsys_default)) != "MULTICORE", 59 | message="options(clustermq.scheduler') must be 'MULTICORE'") 60 | 61 | m = methods::new(CMQMaster) 62 | addr = m$listen("tcp://127.0.0.1:*") 63 | p = parallel::mcparallel(ssh_proxy(sub(".*:", "", addr))) 64 | 65 | m$proxy_submit_cmd(list(n_jobs=1), 10000L) 66 | m$add_pending_workers(1L) 67 | expect_null(m$recv(2000L)) # worker 1 up 68 | m$send(5 + 2) 69 | expect_equal(m$recv(500L), 7) # collect results 70 | 71 | m$send_shutdown() 72 | m$close(500L) 73 | 74 | pr = parallel::mccollect(p, wait=TRUE, timeout=0.5) 75 | expect_equal(names(pr), as.character(p$pid)) 76 | }) 77 | 78 | test_that("using the proxy without pool and forward, 2 workers", { 79 | 
skip_on_cran() 80 | skip_on_os("windows") 81 | skip_if_not(has_localhost) 82 | skip_if(toupper(getOption("clustermq.scheduler", qsys_default)) != "MULTICORE", 83 | message="options(clustermq.scheduler') must be 'MULTICORE'") 84 | 85 | m = methods::new(CMQMaster) 86 | addr = m$listen("tcp://127.0.0.1:*") 87 | p = parallel::mcparallel(ssh_proxy(sub(".*:", "", addr))) 88 | 89 | m$proxy_submit_cmd(list(n_jobs=2), 10000L) 90 | m$add_pending_workers(2L) 91 | expect_null(m$recv(2000L)) # worker 1 up 92 | m$send({ Sys.sleep(0.5); 5 + 2 }) 93 | expect_null(m$recv(500L)) # worker 2 up 94 | m$send({ Sys.sleep(0.5); 3 + 1 }) 95 | r1 = m$recv(1000L) 96 | m$send_shutdown() 97 | r2 = m$recv(500L) 98 | m$send_shutdown() 99 | expect_equal(sort(c(r1,r2)), c(4,7)) 100 | 101 | m$close(500L) 102 | pr = parallel::mccollect(p, wait=TRUE, timeout=0.5) 103 | expect_equal(names(pr), as.character(p$pid)) 104 | }) 105 | 106 | test_that("full SSH connection", { 107 | skip_on_cran() 108 | skip_on_os("windows") 109 | skip_if_not(has_localhost) 110 | skip_if_not(has_ssh_cmq("127.0.0.1")) 111 | 112 | # 'LOCAL' mode (default) will not set up required sockets 113 | # 'SSH' mode would lead to circular connections 114 | # schedulers may have long delay (they start in fresh session, so no path) 115 | sched = getOption("clustermq.scheduler", qsys_default) 116 | skip_if(is.null(sched) || toupper(sched) != "MULTICORE", 117 | message="options(clustermq.scheduler') must be 'MULTICORE'") 118 | options(clustermq.template = "SSH", clustermq.ssh.host="127.0.0.1") 119 | 120 | w = workers(n_jobs=1, qsys_id="ssh", reuse=FALSE) 121 | result = Q(identity, 42, n_jobs=1, timeout=10L, workers=w) 122 | expect_equal(result, list(42)) 123 | 124 | w = workers(n_jobs=2, qsys_id="ssh", reuse=FALSE) 125 | result = clustermq::Q(Sys.sleep, time=c(1,2), n_jobs=2) 126 | expect_equal(result, list(NULL, NULL)) 127 | }) 128 | -------------------------------------------------------------------------------- 
/tests/testthat/test-8-foreach.r: -------------------------------------------------------------------------------- 1 | context("foreach") 2 | 3 | skip_if_not_installed("foreach") 4 | 5 | foreach = foreach::foreach 6 | `%dopar%` = foreach::`%dopar%` 7 | `%do%` = foreach::`%do%` 8 | register_dopar_cmq(n_jobs=0) 9 | 10 | test_that("foreach::getDoParWorkers() returns n_jobs", { 11 | expect_equal(foreach::getDoParWorkers(), 0) 12 | }) 13 | 14 | test_that("simple foreach registration works", { 15 | res = foreach(i=1:3) %dopar% sqrt(i) 16 | cmp = foreach(i=1:3) %do% sqrt(i) 17 | 18 | expect_equal(res, cmp) 19 | }) 20 | 21 | test_that(".export objects are exported", { 22 | y = 5 23 | res = foreach(x=1:3, .export="y") %dopar% { x + y } 24 | cmp = foreach(x=1:3, .export="y") %do% { x + y } 25 | 26 | expect_equal(res, cmp) 27 | # expect_error(foreach(x=1:3) %dopar% { x + y }) 28 | }) 29 | 30 | test_that(".packages are loaded", { 31 | expect_error(foreach(i="a string") %dopar% { md5sum(i) }) 32 | res = foreach(i="a string", .packages="tools") %dopar% { md5sum(i) } 33 | cmp = foreach(i="a string") %do% { md5sum(i) } 34 | expect_equal(res, cmp) 35 | }) 36 | 37 | test_that(".combine is respected", { 38 | res = foreach(i=1:3, .combine=c) %dopar% sqrt(i) 39 | cmp = foreach(i=1:3, .combine=c) %do% sqrt(i) 40 | expect_equal(res, cmp) 41 | 42 | res = foreach(i=1:3, .combine=append) %dopar% list(a=1, b=2) 43 | cmp = foreach(i=1:3, .combine=append) %do% list(a=1, b=2) 44 | expect_equal(res, cmp) 45 | 46 | res = foreach(i=1:3, .combine=cbind) %dopar% sqrt(i) 47 | cmp = foreach(i=1:3, .combine=cbind) %do% sqrt(i) 48 | expect_equal(res, cmp) 49 | 50 | res = foreach(i=1:3, .combine=rbind) %dopar% sqrt(i) 51 | cmp = foreach(i=1:3, .combine=rbind) %do% sqrt(i) 52 | expect_equal(res, cmp) 53 | }) 54 | 55 | test_that("no matrix unlisting (#143)", { 56 | fx = function(x) matrix(c(1,2)+x, ncol=1) 57 | res = foreach(i=1:3) %dopar% fx(i) 58 | cmp = foreach(i=1:3) %do% fx(i) 59 | expect_equal(res, 
cmp) 60 | }) 61 | 62 | test_that("automatic export in foreach", { 63 | fx = function(x) x + y 64 | y = 5 65 | res = foreach(x=1:3) %dopar% { x + y } 66 | cmp = foreach(x=1:3) %do% { x + y } 67 | expect_equal(res, cmp) 68 | }) 69 | 70 | test_that("NULL objects are exported", { 71 | fx = function(x) is.null(x) 72 | y = NULL 73 | res = foreach(i=1) %dopar% fx(y) 74 | cmp = foreach(i=1) %do% fx(y) 75 | expect_equal(res, cmp) 76 | }) 77 | 78 | test_that("external worker", { 79 | skip_on_os("windows") 80 | 81 | old_sched = getOption("clustermq.scheduler") 82 | on.exit(options(clustermq.scheduler = old_sched)) 83 | options(clustermq.scheduler = "multicore") 84 | 85 | register_dopar_cmq(n_jobs=1) 86 | res = foreach(i=1:3) %dopar% sqrt(i) 87 | cmp = foreach(i=1:3) %do% sqrt(i) 88 | expect_equal(res, cmp) 89 | }) 90 | 91 | test_that("foreach works via BiocParallel", { 92 | skip_on_os("windows") 93 | skip_if_not_installed("BiocParallel") 94 | 95 | old_sched = getOption("clustermq.scheduler") 96 | on.exit(options(clustermq.scheduler = old_sched)) 97 | options(clustermq.scheduler = "multicore") 98 | 99 | register_dopar_cmq(n_jobs=1) 100 | BiocParallel::register(BiocParallel::DoparParam()) 101 | res = BiocParallel::bplapply(1:3, sqrt) 102 | cmp = foreach(i=1:3) %do% sqrt(i) 103 | 104 | expect_equal(res, cmp) 105 | }) 106 | -------------------------------------------------------------------------------- /tools/winlibs.R: -------------------------------------------------------------------------------- 1 | if(!file.exists("../windows/zeromq/include")){ 2 | unlink("../windows", recursive = TRUE) 3 | url <- if(grepl("aarch", R.version$platform)){ 4 | "https://github.com/r-windows/bundles/releases/download/zeromq-4.3.4/zeromq-4.3.4-clang-aarch64.tar.xz" 5 | } else if(grepl("clang", Sys.getenv('R_COMPILED_BY'))){ 6 | "https://github.com/r-windows/bundles/releases/download/zeromq-4.3.4/zeromq-4.3.4-clang-x86_64.tar.xz" 7 | } else if(getRversion() >= "4.3") { 8 | 
"https://github.com/r-windows/bundles/releases/download/zeromq-4.3.4/zeromq-4.3.4-ucrt-x86_64.tar.xz" 9 | } else { 10 | "https://github.com/rwinlib/zeromq/archive/4.3.4.tar.gz" 11 | } 12 | download.file(url, basename(url), quiet = TRUE) 13 | dir.create("../windows", showWarnings = FALSE) 14 | untar(basename(url), exdir = "../windows", tar = 'internal') 15 | unlink(basename(url)) 16 | setwd("../windows") 17 | file.rename(list.files(), 'zeromq') 18 | } 19 | -------------------------------------------------------------------------------- /vignettes/faq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Frequently asked questions" 3 | output: 4 | rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{FAQ} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteEncoding{UTF-8} 9 | --- 10 | 11 | ```{css echo=FALSE} 12 | img { 13 | border: 0px !important; 14 | margin: 2em 2em 2em 2em !important; 15 | } 16 | code { 17 | border: 0px !important; 18 | } 19 | ``` 20 | 21 | ```{r echo=FALSE, results="hide"} 22 | knitr::opts_chunk$set( 23 | cache = FALSE, 24 | echo = TRUE, 25 | collapse = TRUE, 26 | comment = "#>" 27 | ) 28 | options(clustermq.scheduler = "local", rmarkdown.html_vignette.check_title = FALSE) 29 | suppressPackageStartupMessages(library(clustermq)) 30 | ``` 31 | 32 | ## Installation errors {#install} 33 | 34 | To compile this package a fully C++11 compliant compiler is required. This is 35 | [implicit for CRAN packages](https://www.tidyverse.org/blog/2023/03/cran-checks-compiled-code/) 36 | since `R=3.6.2` and is hence not listed in _SystemRequirements_. 37 | 38 | If you encounter an error saying that no matching function for call to 39 | `zmq::message_t::message_t(std::string&)` exists, your compiler does not 40 | (fully) support this and the automated check failed for some reason.
41 | 42 | ```{sh eval=FALSE} 43 | In file included from CMQMaster.cpp:2:0: 44 | CMQMaster.h: In member function ‘void CMQMaster::proxy_submit_cmd(SEXP, int)’: 45 | CMQMaster.h:146:40: error: no matching function for call to ‘zmq::message_t::message_t(std::string&)’ 46 | mp.push_back(zmq::message_t(cur)); 47 | ``` 48 | 49 | This happens for instance for old versions of the `gcc` compiler (default on 50 | most Linux distributions). You can check your version in the terminal using: 51 | 52 | ```{sh eval=FALSE} 53 | # the minimum required gcc version is 5.5 for full C++11 support (3.3 for clang) 54 | cc --version 55 | ``` 56 | 57 | In this case, it is _very_ likely that your HPC system already has a newer 58 | compiler installed that you need to add to your `$PATH` or load as a module. 59 | Once this is set, you can install the package from R *that was started in a 60 | terminal that has this module/path active*. 61 | 62 | ## Session gets stuck at "Running calculations" {#stuck} 63 | 64 | Your R session may be stuck at something like the following: 65 | 66 | ```{r eval=FALSE} 67 | > clustermq::Q(identity, x=42, n_jobs=1) 68 | Submitting 1 worker jobs (ID: cmq8480) ... 69 | Running 1 calculations (5 objs/19.4 Kb common; 1 calls/chunk) ... 70 | ``` 71 | 72 | You will see this every time your jobs are queued but not yet started. 73 | Depending on how busy your HPC is, this may take a long time. You can check the 74 | queueing status of your jobs in the terminal with _e.g._ `qstat` (SGE), `bjobs` 75 | (LSF), or `sinfo` (SLURM). 76 | 77 | If your jobs are already finished, this likely means that the `clustermq` 78 | workers can not connect to the main session. You can confirm this by passing 79 | [`log_worker=TRUE`](https://mschubert.github.io/clustermq/articles/userguide.html#debugging-workers) 80 | to `Q` and inspect the logs created in your current working directory. 
If they 81 | state something like: 82 | 83 | ```{sh eval=FALSE} 84 | > clustermq:::worker("tcp://my.headnode:9091") 85 | 2023-12-11 10:22:58.485529 | Master: tcp://my.headnode:9091 86 | 2023-12-11 10:22:58.488892 | connecting to: tcp://my.headnode:9091: 87 | Error: Connection failed after 10016 ms 88 | Execution halted 89 | ``` 90 | 91 | the submitted job is indeed unable to establish a network connection with the 92 | head node. This can happen if your HPC does not allow incoming connections at 93 | all, but more likely happens because (1) only certain ports are allowed, or (2) 94 | there are multiple network interfaces, only some of which have access to the 95 | head node. 96 | 97 | 1. If the head node only allows incoming connections on certain ports, set the 98 | [R 99 | option](https://mschubert.github.io/clustermq/articles/userguide.html#options) 100 | `clustermq.ports=`. 101 | 2. You can list the available network interfaces using the `ifconfig` command 102 | in the terminal. Find the interface that shares a subnetwork with the head 103 | node and add the [R 104 | option](https://mschubert.github.io/clustermq/articles/userguide.html#options) 105 | `clustermq.host=`. If this is unclear, contact your system 106 | administrators to see which interface to use. 107 | 108 | ## SSH not working {#ssh} 109 | 110 | Before trying remote schedulers via SSH, make sure that the scheduler works 111 | when you first connect to the cluster and run a job from there. 112 | 113 | If the terminal is stuck at 114 | 115 | ``` 116 | Connecting via SSH ... 
117 | ``` 118 | 119 | make sure that each step of your SSH connection works by typing the following 120 | commands in your **local** terminal and make sure that you don't get errors or 121 | warnings in each step: 122 | 123 | ```{sh eval=FALSE} 124 | # test your ssh login that you set up in ~/.ssh/config 125 | # if this fails you have not set up SSH correctly 126 | ssh  127 | 128 | # test port forwarding from 54709 remote to 6687 local (ports are random) 129 | # if this fails you will not be able to use clustermq via SSH 130 | ssh -R 54709:localhost:6687 R --vanilla 131 | ``` 132 | 133 | If you get a `Command not found: R` error, make sure your `$PATH` is set up 134 | correctly in your `~/.bash_profile` and/or your `~/.bashrc` (depending on your 135 | cluster config you might need either). You may also need to modify your [SSH 136 | template](https://mschubert.github.io/clustermq/articles/userguide.html#ssh-template) 137 | to load R as a module or conda environment. 138 | 139 | If you get an SSH warning or error try again with `ssh -v` to enable verbose 140 | output. If the forward itself works, run the following in your local R session 141 | (ideally also in command-line R, [not only in 142 | RStudio](https://github.com/mschubert/clustermq/issues/206)): 143 | 144 | ```{r eval=FALSE} 145 | options(clustermq.scheduler = "ssh", 146 | clustermq.ssh.log = "~/ssh_proxy.log") 147 | Q(identity, x=1, n_jobs=1) 148 | ``` 149 | 150 | This will create a log file *on the remote server* that will contain any errors 151 | that might have occurred during `ssh_proxy` startup. 152 | 153 | If the `ssh_proxy` startup fails on your local machine with the error 154 | 155 | ``` 156 | Remote R process did not respond after 5 seconds. Check your SSH server log.
157 | ``` 158 | 159 | but the server log does not show any errors, then you can try increasing the 160 | timeout: 161 | 162 | ```{r eval=FALSE} 163 | options(clustermq.ssh.timeout = 30) # in seconds 164 | ``` 165 | 166 | This can happen when your SSH startup template includes additional steps before 167 | starting R, such as activating a module or conda environment, or having to 168 | confirm the connection via two-factor authentication. 169 | 170 | ## Running the master inside containers {#master-in-container} 171 | 172 | If your master process is inside a container, accessing the HPC scheduler is 173 | more difficult. Containers, including singularity and docker, isolate the 174 | processes inside the container from the host. The *R* process will not be able 175 | to submit a job because the scheduler cannot be found. 176 | 177 | Note that the HPC node running the master process must be allowed to submit 178 | jobs. Not all HPC systems allow compute nodes to submit jobs. If that is the 179 | case, you may need to run the master process on the login node, and discuss the 180 | issue with your system administrator. 181 | 182 | If your container is binary compatible with the host, you may be able to bind 183 | in the scheduler executable to the container. 184 | 185 | For example, PBS might look something like: 186 | 187 | ```{sh eval=FALSE} 188 | #PBS directives ... 189 | 190 | module load singularity 191 | 192 | SINGULARITYENV_APPEND_PATH=/opt/pbs/bin 193 | singularity exec --bind /opt/pbs/bin r_image.sif Rscript master_script.R 194 | ``` 195 | 196 | A working example of binding SLURM into a CentOS 7 container image from a 197 | CentOS 7 host is available at 198 | https://groups.google.com/a/lbl.gov/d/msg/singularity/syLcsIWWzdo/NZvF2Ud2AAAJ 199 | 200 | Alternatively, you can create a script that uses SSH to execute the scheduler 201 | on the login node. 
For this, you will need an SSH client in the container, 202 | [keys set up for password-less login](https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server), 203 | and create a script to call the scheduler on the login node via ssh (e.g. 204 | `~/bin/qsub` for SGE/PBS/Torque, `bsub` for LSF and `sbatch` for Slurm): 205 | 206 | ```{sh eval=FALSE} 207 | #!/bin/bash 208 | ssh -i ~/.ssh/ ${PBS_O_HOST:-"no_host_not_in_a_pbs_job"} qsub "$@" 209 | ``` 210 | 211 | Make sure the script is executable, and bind/copy it into the container 212 | somewhere on `$PATH`. Home directories are bound in by default in singularity. 213 | 214 | ```{sh eval=FALSE} 215 | chmod u+x ~/bin/qsub 216 | SINGULARITYENV_APPEND_PATH=~/bin 217 | ``` 218 | -------------------------------------------------------------------------------- /vignettes/technicaldocs.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Technical Documentation" 3 | output: 4 | rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{Technical Documentation} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\VignetteEncoding{UTF-8} 9 | --- 10 | 11 | ```{css echo=FALSE} 12 | img { 13 | border: 0px !important; 14 | margin: 2em 2em 2em 2em !important; 15 | } 16 | code { 17 | border: 0px !important; 18 | } 19 | ``` 20 | 21 | ```{r echo=FALSE, results="hide"} 22 | knitr::opts_chunk$set( 23 | cache = FALSE, 24 | echo = TRUE, 25 | collapse = TRUE, 26 | comment = "#>" 27 | ) 28 | options(clustermq.scheduler = "local") 29 | suppressPackageStartupMessages(library(clustermq)) 30 | ``` 31 | 32 | ## Worker API 33 | 34 | ### Base API and schedulers 35 | 36 | The main worker functions are wrapped in an _R6_ class with the name of `QSys`. 
37 | This provides a standardized API to the [lower-level 38 | messages](https://mschubert.github.io/clustermq/articles/technicaldocs.html#zeromq-message-specification) 39 | that are sent via [_ZeroMQ_](https://zeromq.org/). 40 | 41 | The base class itself is derived in scheduler classes that add the required 42 | functions for submitting and cleaning up jobs: 43 | 44 | ``` 45 | + QSys 46 | |- Multicore 47 | |- LSF 48 | + SGE 49 | |- PBS 50 | |- Torque 51 | |- etc. 52 | ``` 53 | 54 | The user-visible object is a worker `Pool` that wraps this, and will eventually 55 | allow to manage different workers. 56 | 57 | ### Workers 58 | 59 | #### Creating a worker pool 60 | 61 | A pool of workers can be created using the `workers()` function, which 62 | instantiates a `Pool` object of the corresponding `QSys`-derived scheduler 63 | class. See `?workers` for details. 64 | 65 | ```{r eval=FALSE} 66 | # start up a pool of three workers using the default scheduler 67 | w = workers(n_jobs=3) 68 | 69 | # if we make an unclean exit for whatever reason, clean up the jobs 70 | on.exit(w$cleanup()) 71 | ``` 72 | 73 | #### Worker startup 74 | 75 | For workers that are started up via a scheduler, we do not know which machine 76 | they will run on. This is why we start up every worker with a TCP/IP address of 77 | the master socket that will distribute work. 78 | 79 | This is achieved by the call to R common to all schedulers: 80 | 81 | ```{sh eval=FALSE} 82 | R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' 83 | ``` 84 | 85 | #### Worker communication 86 | 87 | On the master's side, we wait until a worker connects: 88 | 89 | ```{r eval=FALSE} 90 | msg = w$recv() # this will block until a worker is ready 91 | ``` 92 | 93 | We can then send any expression to be evaluated on the worker using the `send` 94 | method: 95 | 96 | ```{r eval=FALSE} 97 | w$send(expression, ...) 
98 | ``` 99 | 100 | After the expression (in `...`), any variables that should be passed along with 101 | the call can be added. For batch processing that `clustermq` usually does, this 102 | command is `work_chunk`, where the `chunk` data is added: 103 | 104 | ```{r eval=FALSE} 105 | w$send(clustermq:::work_chunk(chunk, fun, const, rettype, common_seed), 106 | chunk = chunk(iter, submit_index)) 107 | ``` 108 | 109 | #### Worker environment 110 | 111 | We can add any number of objects to a worker environment using the `env` 112 | method: 113 | 114 | ```{r eval=FALSE} 115 | w$env(object=value, ...) 116 | ``` 117 | 118 | This will also invisibly return a `data.frame` with all objects currently in 119 | the environment. If a user wants to inspect the environment without changing it 120 | they can call `w$env()` without arguments. The environment will be propagated 121 | to all workers automatically in a greedy fashion. 122 | 123 | ### Main event loop 124 | 125 | Putting the above together in an event loop, we get what is essentially 126 | implemented in `master`. `w$send` invisibly returns an identifier to track 127 | which call was submitted, and `w$current()` matches the same to `w$recv()`. 128 | 129 | ```{r eval=FALSE} 130 | w = workers(3) 131 | on.exit(w$cleanup()) 132 | w$env(...) 133 | 134 | while (we have new work to send || jobs pending) { 135 | res = w$recv() # the result of the call, or NULL for a new worker 136 | w$current()$call_ref # matches answer to request, -1 otherwise 137 | # handle result 138 | 139 | if (more work) 140 | call_ref = w$send(expression, ...) # call_ref tracks request identity 141 | else 142 | w$send_shutdown() 143 | } 144 | ``` 145 | 146 | A loop of a similar structure can be used to extend `clustermq`. As an example, 147 | [this was done by the _targets_ 148 | package](https://github.com/ropensci/targets/blob/1.2.2/R/class_clustermq.R). 
149 | 150 | ## ZeroMQ message specification 151 | 152 | Communication between the `master` (main event loop) and workers (`QSys` base 153 | class) is organised in _messages_. These are chunks of serialized data sent via 154 | _ZeroMQ_'s protocol (_ZMTP_). The parts of each message are called *frames*. 155 | 156 | ### Master - Worker communication 157 | 158 | The master requests an evaluation in a message with X frames (direct) or Y if 159 | proxied. This is all handled by _clustermq_ internally. 160 | 161 | * The worker identity frame or routing identifier 162 | * A delimiter frame 163 | * Worker status (`wlife_t`) 164 | * The call to be evaluated 165 | * _N_ repetitions of: 166 | * The variable name of an environment object that is not yet present on the 167 | worker 168 | * The variable value 169 | 170 | If using a proxy, this will be followed by a `SEXP` that contains variable 171 | names the proxy should add before forwarding to the worker. 172 | 173 | ### Worker evaluation 174 | 175 | A worker evaluates the call using the R C API: 176 | 177 | ```{r eval=FALSE} 178 | R_tryEvalSilent(cmd, env, &err); 179 | ``` 180 | 181 | If an error occurs in this evaluation, it will be returned as a structure with 182 | class `worker_error`. If a developer wants to catch errors and warnings in a 183 | more fine-grained manner, it is recommended to add their own `callingHandlers` 184 | to `cmd` (as _clustermq_ does in its `work_chunk`).
185 | 186 | ### Worker - Master communication 187 | 188 | The result of this evaluation is then returned in a message with four (direct) 189 | or five (proxied) frames: 190 | 191 | * Worker identity frame (handled internally by _ZeroMQ_'s `ZMQ_REQ` socket) 192 | * Empty frame (handled internally by _ZeroMQ_'s `ZMQ_REQ` socket) 193 | * Worker status (`wlife_t`) that is handled internally by _clustermq_ 194 | * The result of the call (`SEXP`), visible to the user 195 | 196 | If using a worker via SSH, these frames will be preceded by a routing identity 197 | frame that is handled internally by _ZeroMQ_ and added or peeled off by the 198 | proxy. 199 | --------------------------------------------------------------------------------