├── .github └── workflows │ ├── spellcheck.yml │ └── sphinx.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── Makefile ├── README.md ├── content ├── conf.py ├── dependencies.md ├── environments.md ├── exercises.md ├── guide.md ├── img │ ├── dependency.png │ ├── docker_architecture.svg │ ├── docker_meme.jpg │ ├── kitchen │ │ ├── busy.png │ │ ├── libraries.png │ │ ├── linux.png │ │ ├── macos.png │ │ ├── recipe.png │ │ └── windows.png │ ├── python_environment.png │ ├── repro-pyramid.png │ ├── reproducibility_levels.png │ ├── reproducibility_nature.jpg │ ├── reproducible-research.jpg │ ├── reproducible_research_plus_lessons.png │ ├── reproducible_research_plus_lessons.svg │ ├── research_comic_phd.gif │ ├── snakemake.png │ ├── snakemake_dag.png │ ├── turing-way │ │ ├── 8-fair-principles.jpg │ │ └── reproducibility.jpg │ └── word-count │ │ ├── arrows.png │ │ ├── gutenberg.png │ │ ├── plot.png │ │ └── statistics.png ├── index.rst ├── intro.md ├── motivation.md ├── organizing-projects.md ├── where-to-go.md └── workflow-management.md └── requirements.txt /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | name: Spelling Errors Check 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | strategy: 8 | max-parallel: 2 9 | matrix: 10 | os: [ubuntu-latest] 11 | 12 | runs-on: ${{ matrix.os }} 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Install dependencies 17 | run: curl -L https://git.io/misspell | bash 18 | 19 | # This will return an exit code of 2, thus triggering a failed build 20 | - name: Test spelling errors 21 | shell: bash 22 | run: | 23 | bin/misspell -error * 24 | -------------------------------------------------------------------------------- /.github/workflows/sphinx.yml: -------------------------------------------------------------------------------- 1 | # Deploy Sphinx. This could be shorter, but we also do some extra 2 | # stuff. 3 | # 4 | # License: CC-0.
This is the canonical location of this file, which 5 | # you may want to link to anyway: 6 | # https://github.com/coderefinery/sphinx-lesson-template/blob/main/.github/workflows/sphinx.yml 7 | # https://raw.githubusercontent.com/coderefinery/sphinx-lesson-template/main/.github/workflows/sphinx.yml 8 | 9 | 10 | name: sphinx 11 | on: [push, pull_request] 12 | 13 | env: 14 | DEFAULT_BRANCH: "main" 15 | # If these SPHINXOPTS are enabled, then be strict about the 16 | # builds and fail on any warnings. 17 | #SPHINXOPTS: "-W --keep-going -T" 18 | GENERATE_PDF: true # to enable, must be 'true' lowercase 19 | GENERATE_SINGLEHTML: true # to enable, must be 'true' lowercase 20 | PDF_FILENAME: lesson.pdf 21 | MULTIBRANCH: true # to enable, must be 'true' lowercase 22 | 23 | 24 | jobs: 25 | build: 26 | name: Build 27 | runs-on: ubuntu-latest 28 | permissions: 29 | contents: read 30 | 31 | steps: 32 | # https://github.com/marketplace/actions/checkout 33 | - uses: actions/checkout@v4 34 | with: 35 | fetch-depth: 0 36 | lfs: true 37 | 38 | # https://github.com/marketplace/actions/setup-python 39 | # ^-- This gives info on matrix testing. 
40 | - name: Install Python 41 | uses: actions/setup-python@v4 42 | with: 43 | python-version: '3.11' 44 | cache: 'pip' 45 | 46 | # https://docs.github.com/en/actions/guides/building-and-testing-python#installing-dependencies 47 | # ^-- This gives info on installing dependencies with pip 48 | - name: Install dependencies 49 | run: | 50 | python -m pip install --upgrade pip 51 | pip install -r requirements.txt 52 | 53 | # Debug 54 | - name: Debugging information 55 | env: 56 | ref: ${{github.ref}} 57 | event_name: ${{github.event_name}} 58 | head_ref: ${{github.head_ref}} 59 | base_ref: ${{github.base_ref}} 60 | run: | 61 | echo "github.ref: ${ref}" 62 | echo "github.event_name: ${event_name}" 63 | echo "github.head_ref: ${head_ref}" 64 | echo "github.base_ref: ${base_ref}" 65 | echo "GENERATE_PDF: ${GENERATE_PDF}" 66 | echo "GENERATE_SINGLEHTML: ${GENERATE_SINGLEHTML}" 67 | set -x 68 | git rev-parse --abbrev-ref HEAD 69 | git branch 70 | git branch -a 71 | git remote -v 72 | python -V 73 | pip list --not-required 74 | pip list 75 | 76 | 77 | # Build 78 | - uses: ammaraskar/sphinx-problem-matcher@master 79 | - name: Build Sphinx docs (dirhtml) 80 | # SPHINXOPTS used via environment variables 81 | run: | 82 | make dirhtml 83 | # This fixes broken copy button icons, as explained in 84 | # https://github.com/coderefinery/sphinx-lesson/issues/50 85 | # https://github.com/executablebooks/sphinx-copybutton/issues/110 86 | # This can be removed once these PRs are accepted (but the 87 | # fixes also need to propagate to other themes): 88 | # https://github.com/sphinx-doc/sphinx/pull/8524 89 | # https://github.com/readthedocs/sphinx_rtd_theme/pull/1025 90 | sed -i 's/url_root="#"/url_root=""/' _build/dirhtml/index.html || true 91 | 92 | # singlehtml 93 | - name: Generate singlehtml 94 | if: ${{ env.GENERATE_SINGLEHTML == 'true' }} 95 | run: | 96 | make singlehtml 97 | mv _build/singlehtml/ _build/dirhtml/singlehtml/ 98 | 99 | # PDF if requested 100 | - name: Generate PDF 101 
| if: ${{ env.GENERATE_PDF == 'true' }} 102 | run: | 103 | pip install https://github.com/rkdarst/sphinx_pyppeteer_builder/archive/refs/heads/main.zip 104 | make pyppeteer 105 | mv _build/pyppeteer/*.pdf _build/dirhtml/${PDF_FILENAME} 106 | 107 | # Stage all deployed assets in _gh-pages/ for simplicity, and to 108 | # prepare to do a multi-branch deployment. 109 | - name: Copy deployment data to _gh-pages/ 110 | if: ${{ github.event_name == 'push' }} 111 | run: 112 | rsync -a _build/dirhtml/ _gh-pages/ 113 | 114 | # Use gh-pages-multibranch to multiplex different branches into 115 | # one deployment. See 116 | # https://github.com/coderefinery/gh-pages-multibranch 117 | - name: gh-pages multibranch 118 | uses: coderefinery/gh-pages-multibranch@main 119 | if: ${{ github.event_name == 'push' && env.MULTIBRANCH == 'true' }} 120 | with: 121 | directory: _gh-pages/ 122 | default_branch: ${{ env.DEFAULT_BRANCH }} 123 | publish_branch: gh-pages 124 | 125 | # Add the .nojekyll file 126 | - name: nojekyll 127 | if: ${{ github.event_name == 'push' }} 128 | run: | 129 | touch _gh-pages/.nojekyll 130 | 131 | # Save artifact for the next step. 132 | - uses: actions/upload-artifact@v4 133 | if: ${{ github.event_name == 'push' }} 134 | with: 135 | name: gh-pages-build 136 | path: _gh-pages/ 137 | 138 | # Deploy in a separate job so that write permissions are restricted 139 | # to the minimum steps. 140 | deploy: 141 | name: Deploy 142 | runs-on: ubuntu-latest 143 | needs: build 144 | # This if can't use the env context - find better way later. 145 | if: ${{ github.event_name == 'push' }} 146 | permissions: 147 | contents: write 148 | 149 | steps: 150 | - uses: actions/download-artifact@v4 151 | if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }} 152 | with: 153 | name: gh-pages-build 154 | path: _gh-pages/ 155 | 156 | # As of 2023, we could publish to pages via a Deployment. 
This 157 | # isn't done yet to give it time to stabilize (out of beta), and 158 | # also having a gh-pages branch to check out is rather 159 | # convenient. 160 | 161 | # Deploy 162 | # https://github.com/peaceiris/actions-gh-pages 163 | - name: Deploy 164 | uses: peaceiris/actions-gh-pages@v3 165 | if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }} 166 | with: 167 | publish_branch: gh-pages 168 | github_token: ${{ secrets.GITHUB_TOKEN }} 169 | publish_dir: _gh-pages/ 170 | force_orphan: true 171 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *~ 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this lesson material, please cite it using these metadata." 3 | authors: 4 | - name: "CodeRefinery" 5 | - family-names: "Wikfeldt" 6 | given-names: "Kjartan Thor" 7 | - family-names: "Bast" 8 | given-names: "Radovan" 9 | - family-names: "Darst" 10 | given-names: "Richard" 11 | - family-names: "Hellsvik" 12 | given-names: "Johann" 13 | - family-names: "Wittke" 14 | given-names: "Samantha" 15 | - family-names: "Jääskeläinen" 16 | given-names: "Matias" 17 | - family-names: "Glerean" 18 | given-names: "Enrico" 19 | - family-names: "Vathsavayi" 20 | given-names: "Harsha" 21 | - family-names: "Wang" 22 | given-names: "Yonglei" 23 | title: "Reproducible research - Preparing code to be usable by you in the future and others in general" 24 | type: "dataset" 25 | abstract: "We focus here on 3 aspects of reproducible programs and computations: documenting dependencies, environments, and computational steps in a reproducible way. We touch on containers."
26 | version: 2025-03-19 27 | date-released: 2025-03-19 28 | url: "https://coderefinery.github.io/reproducible-research/" 29 | license: CC-BY-4.0 30 | repository-code: "https://github.com/coderefinery/reproducible-research" 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 
30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). 
To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. 
Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. 
Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. 
You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. 
identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. 
if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. 
WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 
353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. 
Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | 397 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = content 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Reproducible research - Preparing code to be usable by you and others in the future](https://coderefinery.github.io/reproducible-research/) 2 | 3 | - [Credit and license](https://coderefinery.github.io/reproducible-research/license/) 4 | -------------------------------------------------------------------------------- /content/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "Reproducible research" 21 | copyright = "CodeRefinery contributors" 22 | author = "CodeRefinery contributors" 23 | github_user = "coderefinery" 24 | github_repo_name = "reproducible-research" # auto-detected from dirname if blank 25 | github_version = "main" 26 | conf_py_path = "/content/" # with leading and trailing slash 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. 
They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | # githubpages just adds a .nojekyll file 35 | "sphinx.ext.githubpages", 36 | "sphinx_lesson", 37 | # remove once sphinx_rtd_theme updated for contrast and accessibility: 38 | "sphinx_rtd_theme_ext_color_contrast", 39 | "sphinx_coderefinery_branding", 40 | ] 41 | 42 | nb_execution_mode = "cache" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | # templates_path = ['_templates'] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [ 51 | "examples", 52 | "README*", 53 | "_build", 54 | "Thumbs.db", 55 | ".DS_Store", 56 | "jupyter_execute", 57 | "*venv*", 58 | "img/README.md", 59 | ] 60 | 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 66 | # 67 | html_theme = "sphinx_rtd_theme" 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | #html_static_path = ['css'] 73 | 74 | 75 | # HTML context: 76 | from os.path import basename, dirname, realpath 77 | 78 | html_context = { 79 | "display_github": True, 80 | "github_user": github_user, 81 | # Auto-detect directory name. This can break, but 82 | # useful as a default. 
83 | "github_repo": github_repo_name or basename(dirname(realpath(__file__))), 84 | "github_version": github_version, 85 | "conf_py_path": conf_py_path, 86 | } 87 | 88 | import os 89 | if os.environ.get('GITHUB_REF', '') == 'refs/heads/'+github_version: 90 | html_js_files = [ 91 | ('https://plausible.cs.aalto.fi/js/script.js', {"data-domain": "coderefinery.github.io", "defer": "defer"}), 92 | ] 93 | -------------------------------------------------------------------------------- /content/dependencies.md: -------------------------------------------------------------------------------- 1 | # Recording dependencies 2 | 3 | ```{objectives} 4 | - Understand what dependency management tools can be useful for 5 | - Discuss environment/requirements files in the context of reusability and 6 | reproducibility 7 | ``` 8 | 9 | ```{questions} 10 | - How can we communicate different versions of software dependencies? 11 | ``` 12 | 13 | ```{instructor-note} 14 | - 10 min teaching 15 | - 10 min demo 16 | ``` 17 | 18 | Our codes often depend on other codes that in turn depend on other codes ... 19 | 20 | - **Reproducibility**: We can version-control our code with Git but how should we version-control dependencies? 21 | How can we capture and communicate dependencies? 22 | - **Dependency hell**: Different codes on the same environment can have conflicting dependencies. 23 | 24 | ```{figure} img/dependency.png 25 | :alt: An image showing blocks (=codes) depending on each other for stability 26 | :width: 60% 27 | 28 | From [xkcd - dependency](https://xkcd.com/2347/). Another image that might be familiar to some of you working with Python can be found on [xkcd - superfund](https://xkcd.com/1987/). 
29 | ``` 30 | 31 | ````{discussion} Kitchen analogy 32 | - Software <-> recipe 33 | - Data <-> ingredients 34 | - Libraries <-> pots/tools 35 | 36 | ```{figure} img/kitchen/recipe.png 37 | :alt: Cooking recipe in an unfamiliar language 38 | :width: 50% 39 | 40 | Cooking recipe in an unfamiliar language [Midjourney, CC-BY-NC 4.0] 41 | ``` 42 | 43 | ```{figure} img/kitchen/libraries.png 44 | :alt: Kitchen with few open cooking books 45 | :width: 50% 46 | 47 | When we create recipes, we often use tools created by others (libraries) [Midjourney, CC-BY-NC 4.0] 48 | ``` 49 | ```` 50 | 51 | --- 52 | 53 | ## Dependency and environment management 54 | 55 | **Conda, Anaconda, pip, virtualenv, Pipenv, pyenv, Poetry, requirements.txt, 56 | environment.yml, renv**, ..., these tools try to solve the following problems: 57 | 58 | - **Defining a specific set of dependencies** 59 | - **Installing those dependencies** mostly automatically 60 | - **Recording the versions** for all dependencies 61 | - **Isolate environments** 62 | - On your computer for projects so they can use different software 63 | - Isolate environments on computers with many users (and allow self-installations) 64 | - Using **different package versions** per project (also e.g. Python/R versions) 65 | - Provide tools and services to **share packages** 66 | 67 | Isolated environments are also useful because they help you make sure 68 | that you know your dependencies! 69 | 70 | **If things go wrong, you can delete and re-create** - much better 71 | than debugging. The more often you re-create your environment, the 72 | more reproducible it is. 73 | 74 | --- 75 | 76 | ## Demo 77 | 78 | ``````{challenge} Dependencies-1: Time-capsule of dependencies 79 | Situation: 5 students (A, B, C, D, E) wrote a code that depends on a couple of libraries. 80 | They uploaded their projects to GitHub. We now travel 3 years into the future 81 | and find their GitHub repositories and try to re-run their code before adapting 82 | it. 
83 | 84 | Answer in the collaborative document: 85 | 86 | - Which version do you expect to be easiest to re-run? Why? 87 | - What problems do you anticipate in each solution? 88 | 89 | `````{tabs} 90 | ````{group-tab} Conda 91 | **A**: 92 | You find a couple of library imports across the code but that's it. 93 | 94 | **B**: 95 | The README file lists which libraries were used but does not mention 96 | any versions. 97 | 98 | **C**: 99 | You find a `environment.yml` file with: 100 | ``` 101 | name: student-project 102 | channels: 103 | - conda-forge 104 | dependencies: 105 | - scipy 106 | - numpy 107 | - sympy 108 | - click 109 | - python 110 | - pip 111 | - pip: 112 | - git+https://github.com/someuser/someproject.git@master 113 | - git+https://github.com/anotheruser/anotherproject.git@master 114 | ``` 115 | 116 | **D**: 117 | You find a `environment.yml` file with: 118 | ``` 119 | name: student-project 120 | channels: 121 | - conda-forge 122 | dependencies: 123 | - scipy=1.3.1 124 | - numpy=1.16.4 125 | - sympy=1.4 126 | - click=7.0 127 | - python=3.8 128 | - pip 129 | - pip: 130 | - git+https://github.com/someuser/someproject.git@d7b2c7e 131 | - git+https://github.com/anotheruser/anotherproject.git@sometag 132 | ``` 133 | 134 | **E**: 135 | You find a `environment.yml` file with: 136 | ``` 137 | name: student-project 138 | channels: 139 | - conda-forge 140 | dependencies: 141 | - scipy=1.3.1 142 | - numpy=1.16.4 143 | - sympy=1.4 144 | - click=7.0 145 | - python=3.8 146 | - someproject=1.2.3 147 | - anotherproject=2.3.4 148 | ``` 149 | ```` 150 | 151 | ````{group-tab} Python virtualenv 152 | **A**: 153 | You find a couple of library imports across the code but that's it. 154 | 155 | **B**: 156 | The README file lists which libraries were used but does not mention 157 | any versions. 
158 | 159 | **C**: 160 | You find a `requirements.txt` file with: 161 | ``` 162 | scipy 163 | numpy 164 | sympy 165 | click 166 | python 167 | git+https://github.com/someuser/someproject.git@master 168 | git+https://github.com/anotheruser/anotherproject.git@master 169 | ``` 170 | 171 | **D**: 172 | You find a `requirements.txt` file with: 173 | ``` 174 | scipy==1.3.1 175 | numpy==1.16.4 176 | sympy==1.4 177 | click==7.0 178 | python==3.8 179 | git+https://github.com/someuser/someproject.git@d7b2c7e 180 | git+https://github.com/anotheruser/anotherproject.git@sometag 181 | ``` 182 | 183 | **E**: 184 | You find a `requirements.txt` file with: 185 | ``` 186 | scipy==1.3.1 187 | numpy==1.16.4 188 | sympy==1.4 189 | click==7.0 190 | python==3.8 191 | someproject==1.2.3 192 | anotherproject==2.3.4 193 | ``` 194 | ```` 195 | 196 | ````{group-tab} R 197 | **A**: 198 | You find a couple of `library()` or `require()` calls across the code but that's it. 199 | 200 | **B**: 201 | The README file lists which libraries were used but does not mention 202 | any versions. 
203 | 204 | **C**: 205 | You find a [DESCRIPTION file](https://r-pkgs.org/description.html) which contains: 206 | ``` 207 | Imports: 208 | dplyr, 209 | tidyr 210 | ``` 211 | In addition you find these: 212 | ```r 213 | remotes::install_github("someuser/someproject@master") 214 | remotes::install_github("anotheruser/anotherproject@master") 215 | ``` 216 | 217 | **D**: 218 | You find a [DESCRIPTION file](https://r-pkgs.org/description.html) which contains: 219 | ``` 220 | Imports: 221 | dplyr (== 1.0.0), 222 | tidyr (== 1.1.0) 223 | ``` 224 | In addition you find these: 225 | ```r 226 | remotes::install_github("someuser/someproject@d7b2c7e") 227 | remotes::install_github("anotheruser/anotherproject@sometag") 228 | ``` 229 | 230 | **E**: 231 | You find a [DESCRIPTION file](https://r-pkgs.org/description.html) which contains: 232 | ``` 233 | Imports: 234 | dplyr (== 1.0.0), 235 | tidyr (== 1.1.0), 236 | someproject (== 1.2.3), 237 | anotherproject (== 2.3.4) 238 | ``` 239 | ```` 240 | 241 | ````{group-tab} Matlab 242 | Can you please contribute an example? 243 | ```` 244 | ````` 245 | 246 | `````{solution} 247 | **A**: It will be tedious to collect the dependencies one by one. And after 248 | the tedious process you will still not know which versions they have used. 249 | 250 | **B**: There is no standard file to look for and look at, and it might 251 | become very difficult to create the software environment required to 252 | run the software. At least we know the list of libraries, but we don't 253 | know the versions. 254 | 255 | **C**: Having a standard file listing dependencies is definitely better 256 | than nothing. However, if the versions are not specified, you or someone 257 | else might run into problems with dependencies, deprecated features, 258 | changes in package APIs, etc.
259 | 260 | **D** and **E**: In both these cases exact versions of all dependencies are 261 | specified and one can recreate the software environment required for the 262 | project. One problem with the dependencies that come from GitHub is that 263 | they might have disappeared (what if their authors deleted these 264 | repositories?). 265 | 266 | **E** is slightly preferable because version numbers are easier to understand than Git 267 | commit hashes or Git tags. 268 | ````` 269 | `````` 270 | 271 | ``````{challenge} Dependencies-2: Create a time-capsule for the future 272 | Now we will demo creating our own time-capsule and share it with the future 273 | world. If we asked you now which dependencies your project is using, what would 274 | you answer? How would you find out? And how would you communicate this 275 | information? 276 | 277 | `````{tabs} 278 | ````{group-tab} Conda 279 | We start from an existing conda environment. Try this either with your own project or inside the "coderefinery" conda 280 | environment. For demonstration purposes, you can also create an environment with: 281 | 282 | ```console 283 | $ conda env create -f myenv.yml 284 | ``` 285 | Where the file `myenv.yml` could have some Python libraries with unspecified versions: 286 | 287 | ``` 288 | name: myenv 289 | channels: 290 | - conda-forge 291 | - defaults 292 | dependencies: 293 | - python=3.10 294 | - numpy 295 | - pandas 296 | - seaborn 297 | ``` 298 | 299 | After creating the environment we can activate it with 300 | 301 | ``` 302 | conda activate myenv 303 | ``` 304 | 305 | Now we can freeze the environment into a new YAML file with: 306 | 307 | ```console 308 | $ conda env export > environment.yml 309 | ``` 310 | 311 | Have a look at the generated file and discuss what you see. 312 | 313 | ```{solution} Some things to note 314 | - Can you find all packages you installed directly? Which versions were installed? 315 | - What other packages were installed?
-> Dependencies of dependencies 316 | - Besides the version you can also see the build channel 317 | - Sometimes the build includes an operating system or an architecture 318 | - Using this environment file might therefore not work/ not result in an identical setup on other computers 319 | ``` 320 | 321 | In the future — or on a different computer — we can re-create this environment with: 322 | 323 | ```console 324 | $ conda env create -f environment.yml 325 | ``` 326 | You may use `conda` or `mamba` interchangeably for this step; mamba may solve the dependencies a bit faster. 327 | 328 | What happens instead when you run the following command? 329 | 330 | ```console 331 | $ conda env export --from-history > environment_fromhistory.yml 332 | ``` 333 | 334 | ```{solution} Some things to note 335 | - Everything is listed as you installed it; with or without specified versions 336 | - Using this environment file a few days/weeks later will likely not result in the same environment 337 | - This can be a good starting point for a reproducible environment as you may add your current version numbers to it (check for example with `conda list | grep "packagename"`) 338 | ``` 339 | 340 | In daily use you may not always use an environment.yml file to create the full environment, but create a base environment and then add new packages with `conda install packagename` as you go. Also those packages will be listed in the environment files created with either of the approaches above. 341 | 342 | More information: and 343 | ```` 344 | 345 | ````{group-tab} Python virtualenv 346 | Try this in your own project: 347 | ```console 348 | $ pip freeze > requirements.txt 349 | ``` 350 | 351 | Have a look at the generated file and discuss what you see. 352 | 353 | In future you can re-create this environment with: 354 | ```console 355 | $ pip install -r requirements.txt 356 | ``` 357 | 358 | More information: 359 | ```` 360 | 361 | ````{group-tab} R 362 | This example uses renv. 
363 | 364 | Try to "save" and "load" the state of your project library using 365 | `renv::snapshot()` and `renv::restore()`. 366 | See also: 367 | 368 | More information: 369 | ```` 370 | 371 | ````{group-tab} Matlab 372 | Can you please contribute an example? 373 | ```` 374 | ````` 375 | `````` 376 | 377 | ```{keypoints} 378 | - Recording dependencies with versions can make it easier for the next person to execute your code 379 | - There are many tools to record dependencies and separate environments 380 | ``` 381 | -------------------------------------------------------------------------------- /content/environments.md: -------------------------------------------------------------------------------- 1 | # Recording environments 2 | 3 | ```{objectives} 4 | - Understand what containers are and what they are useful for 5 | - Discuss container definitions files in the context of reusability and 6 | reproducibility 7 | ``` 8 | 9 | ```{instructor-note} 10 | - 10 min teaching/discussion 11 | - 10 min demo 12 | ``` 13 | 14 | ## What is a container? 15 | 16 | Imagine if you didn't have to install things yourself, but instead you could 17 | get a computer with the exact software for a task pre-installed? Containers 18 | effectively do that, with various advantages and disadvantages. They are 19 | **like an entire operating system with software installed, all in one file**. 20 | 21 | ```{figure} img/docker_meme.jpg 22 | :alt: He said, then we will ship your machine. And that's how Docker was born. 23 | :width: 60% 24 | 25 | From [reddit](https://www.reddit.com/r/ProgrammerHumor/comments/cw58z7/it_works_on_my_machine/). 26 | ``` 27 | 28 | ``````{discussion} Kitchen analogy 29 | - Our codes/scripts <-> cooking recipes 30 | - Container definition files <-> like a blueprint to build a kitchen with all 31 | utensils in which the recipe can be prepared. 
32 | - Container images <-> showroom kitchens 33 | - Containers <-> A real connected kitchen 34 | 35 | Just for fun: which operating systems do the following example kitchens represent? 36 | `````{tabs} 37 | ````{tab} 1 38 | ```{figure} img/kitchen/macos.png 39 | :alt: Generated image of a kitchen 40 | :width: 50% 41 | 42 | [Midjourney, CC-BY-NC 4.0] 43 | ``` 44 | ```` 45 | 46 | ````{tab} 2 47 | ```{figure} img/kitchen/windows.png 48 | :alt: Generated image of a kitchen 49 | :width: 50% 50 | 51 | [Midjourney, CC-BY-NC 4.0] 52 | ``` 53 | ```` 54 | 55 | ````{tab} 3 56 | ```{figure} img/kitchen/linux.png 57 | :alt: Generated image of a kitchen 58 | :width: 50% 59 | 60 | [Midjourney, CC-BY-NC 4.0] 61 | ``` 62 | ```` 63 | ````` 64 | `````` 65 | 66 | ## From definition files to container images to containers 67 | 68 | - Containers can be built to bundle _all the necessary ingredients_ (data, code, environment, operating system). 69 | - A container image is like a piece of paper with all the operating system on it. When you run it, 70 | a transparent sheet is placed on top to form a container. The container runs and writes only on 71 | that transparent sheet (and what other mounts have been layered on top). When you are done, 72 | transparency is thrown away. It can be repeated as often as you want, and base is always the same. 73 | - Definition files (e.g. Dockerfile or Singularity definition file) are text 74 | files that contain a series of instructions to build container images. 75 | 76 | ## You may have use for containers in different ways 77 | 78 | - **Installing a certain software is tricky**, or not supported for your operating system? - Check if an image is available and run the software from a container instead! 79 | - You want to make sure your colleagues are using the **same environment** for running your code? - Provide them an image of your container! 80 | - If this does not work, because they are using a different architecture than you do? 
- Provide a definition file for them to **build the image suitable to their computers**. This does not create the exact environment as you have, but in most cases similar enough. 81 | 82 | ## The container recipe 83 | 84 | Here is an example of a Singularity definition file ([reference](https://apptainer.org/docs/user/main/build_a_container.html#building-containers-from-apptainer-definition-files)): 85 | 86 | ``` 87 | Bootstrap: docker 88 | From: ubuntu:24.04 89 | 90 | %post 91 | apt-get -y update 92 | apt-get -y install fortune cowsay lolcat 93 | 94 | %environment 95 | export LC_ALL=C 96 | export PATH=/usr/games:$PATH 97 | 98 | %runscript 99 | fortune | cowsay | lolcat 100 | ``` 101 | 102 | Popular container implementations: 103 | 104 | - [Docker](https://www.docker.com/) 105 | - [Singularity](https://sylabs.io/docs/) (popular on high-performance computing systems) 106 | - [Apptainer](https://apptainer.org) (popular on high-performance computing systems, fork of Singularity) 107 | - [podman](https://podman.io/) 108 | 109 | They are to some extent interoperable: 110 | 111 | - podman is very close to Docker 112 | - Docker images can be converted to Singularity/Apptainer images 113 | - [Singularity Python](https://singularityhub.github.io/singularity-cli/) can convert Dockerfiles to Singularity definition files 114 | 115 | --- 116 | 117 | ## Pros and cons of containers 118 | 119 | Containers are popular for a reason - they solve a number of 120 | important problems: 121 | 122 | - Allow for seamlessly **moving workflows across different platforms**. 123 | - Can solve the **"works on my machine"** situation. 124 | - For software with many dependencies, in turn with its own dependencies, 125 | containers offer possibly the only way to preserve the 126 | computational experiment for **future reproducibility**. 127 | - A mechanism to "send the computer to the data" when the **dataset is too large** to transfer. 
128 | - **Installing software into a file** instead of into your computer (removing 129 | a file is often easier than uninstalling software if you suddenly regret an 130 | installation) 131 | 132 | However, containers may also have some drawbacks: 133 | 134 | - Can be used to hide away software installation problems and thereby 135 | **discourage good software development practices**. 136 | - Instead of "works on my machine" problem: **"works only in this container"** problem? 137 | - They can be **difficult to modify** 138 | - Container **images can become large** 139 | 140 | ```{danger} 141 | Use only **official and trusted images**! Not all images can be trusted! There 142 | have been examples of contaminated images so investigate before using images 143 | blindly. Apply same caution as installing software packages from untrusted 144 | package repositories. 145 | ``` 146 | 147 | --- 148 | 149 | ## Where can one share or find images? 150 | 151 | - [Docker Hub](https://hub.docker.com/) 152 | - [Quay](https://quay.io/) 153 | - [GitHub Container Registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) 154 | - [GitLab Container Registry](https://docs.gitlab.com/ee/user/packages/container_registry/) 155 | - GitHub/GitLab release artifacts 156 | - [Zenodo](https://zenodo.org/) 157 | 158 | --- 159 | 160 | ## Exercises 161 | 162 | ``````{exercise} Containers-1: Time travel 163 | Scenario: A researcher has written and published their research code which 164 | requires a number of libraries and system dependencies. They ran their code 165 | on a Linux computer (Ubuntu). One very nice thing they did was to publish 166 | also a container image with all dependencies included, as well as the 167 | definition file (below) to create the container image. 168 | 169 | Now we travel 3 years into the future and want to reuse their work and adapt 170 | it for our data. 
The container registry where they uploaded the container 171 | image however no longer exists. But luckily we still have the definition file 172 | (below)! From this we should be able to create a new container image. 173 | 174 | - Can you anticipate problems using the definitions file 3 years after its creation? 175 | Which possible problems can you point out? 176 | - Discuss possible take-aways for creating more reusable containers. 177 | 178 | `````{tabs} 179 | ````{tab} Python project using virtual environment 180 | ```{code-block} 181 | :linenos: 182 | Bootstrap: docker 183 | From: ubuntu:latest 184 | 185 | %post 186 | # Set environment variables 187 | export VIRTUAL_ENV=/app/venv 188 | 189 | # Install system dependencies and Python 3 190 | apt-get update && \ 191 | apt-get install -y --no-install-recommends \ 192 | gcc \ 193 | libgomp1 \ 194 | python3 \ 195 | python3-venv \ 196 | python3-distutils \ 197 | python3-pip && \ 198 | apt-get clean && \ 199 | rm -rf /var/lib/apt/lists/* 200 | 201 | # Set up the virtual environment 202 | python3 -m venv $VIRTUAL_ENV 203 | . $VIRTUAL_ENV/bin/activate 204 | 205 | # Install Python libraries 206 | pip install --no-cache-dir --upgrade pip && \ 207 | pip install --no-cache-dir -r /app/requirements.txt 208 | 209 | %files 210 | # Copy project files 211 | ./requirements.txt /app/requirements.txt 212 | ./app.py /app/app.py 213 | # Copy data 214 | /home/myself/data /app/data 215 | # Workaround to fix dependency on fancylib 216 | /home/myself/fancylib /usr/lib/fancylib 217 | 218 | %environment 219 | # Set the environment variables 220 | export LANG=C.UTF-8 LC_ALL=C.UTF-8 221 | export VIRTUAL_ENV=/app/venv 222 | 223 | %runscript 224 | # Activate the virtual environment 225 | . $VIRTUAL_ENV/bin/activate 226 | # Run the application 227 | python /app/app.py 228 | ``` 229 | 230 | ```{solution} 231 | - Line 2: "ubuntu:latest" will mean something different 3 years in future. 
232 | - Lines 11-12: The compiler gcc and the library libgomp1 will have evolved. 233 | - Line 30: The container uses requirements.txt to build the virtual environment but we don't see 234 | here what libraries the code depends on. 235 | - Line 33: Data is copied in from the hard disk of the person who created it. Hopefully we can find the data somewhere. 236 | - Line 35: The library fancylib has been built outside the container and copied in but we don't see here how it was done. 237 | - Python version will be different then and hopefully the code still runs then. 238 | - Singularity/Apptainer will have also evolved by then. Hopefully this definition file then still works. 239 | - No contact address to ask more questions about this file. 240 | - (Can you find more? Please contribute more points.) 241 | ``` 242 | ```` 243 | 244 | ````{tab} R project using renv 245 | Work in progress: Please contribute a corresponding example which 246 | demonstrates this in the context of R and renv. 247 | ```` 248 | ````` 249 | `````` 250 | 251 | ````{exercise} (optional) Containers-2: Installing the impossible. 252 | 253 | When you are missing privileges for installing certain software tools, containers can come in handy. 254 | Here we build a Singularity/Apptainer container for installing `cowsay` and `lolcat` Linux programs. 255 | 256 | 1. Make sure you have apptainer installed: 257 | ```console 258 | $ apptainer --version 259 | ``` 260 | 261 | 2. Make sure you set the apptainer cache and temporary folders. 262 | ```console 263 | $ mkdir ./cache/ 264 | $ mkdir ./temp/ 265 | $ export APPTAINER_CACHEDIR="./cache/" 266 | $ export APPTAINER_TMPDIR="./temp/" 267 | ``` 268 | 269 | 3. Build the container from the definition file shown above (saved as `cowsay.def`). 270 | ```console 271 | apptainer build cowsay.sif cowsay.def 272 | ``` 273 | 274 | 4. Let's test the container by entering into it with a shell terminal 275 | ```console 276 | $ apptainer shell cowsay.sif 277 | ``` 278 | 279 | 5.
We can verify the installation. 280 | ```console 281 | $ cowsay "Hello world!"|lolcat 282 | ``` 283 | 284 | ```` 285 | 286 | ````{exercise} (optional) Containers-3: Explore two really useful Docker images 287 | You can try the below if you have Docker installed. If you have 288 | Singularity/Apptainer and not Docker, the goal of the exercise can be to run 289 | the Docker containers through Singularity/Apptainer. 290 | 291 | 1. Run a specific version of *Rstudio*: 292 | ```console 293 | $ docker run --rm -p 8787:8787 -e PASSWORD=yourpasswordhere rocker/rstudio 294 | ``` 295 | 296 | Then open your browser to [http://localhost:8787](http://localhost:8787) 297 | with login rstudio and password "yourpasswordhere" used in the previous 298 | command. 299 | 300 | If you want to try an older version you can check the tags at 301 | [https://hub.docker.com/r/rocker/rstudio/tags](https://hub.docker.com/r/rocker/rstudio/tags) 302 | and run for example: 303 | ```console 304 | $ docker run --rm -p 8787:8787 -e PASSWORD=yourpasswordhere rocker/rstudio:3.3 305 | ``` 306 | 307 | 2. 
Run a specific version of *Anaconda3* from 308 | [https://hub.docker.com/r/continuumio/anaconda3](https://hub.docker.com/r/continuumio/anaconda3): 309 | ```console 310 | $ docker run -i -t continuumio/anaconda3 /bin/bash 311 | ``` 312 | ```` 313 | 314 | ## Resources for further learning 315 | 316 | - [Carpentries incubator lesson on Docker](https://carpentries-incubator.github.io/docker-introduction/) 317 | - [Carpentries incubator lesson on Singularity/Apptainer](https://carpentries-incubator.github.io/singularity-introduction/) 318 | 319 | ```{keypoints} 320 | - Containers can be helpful if complex setups are needed to run a specific piece of software 321 | - They can also be helpful for prototyping without "messing up" your own computing environment, or for running software that requires a different operating system than your own 322 | ``` 323 | -------------------------------------------------------------------------------- /content/exercises.md: -------------------------------------------------------------------------------- 1 | # List of exercises 2 | 3 | ## Full list 4 | 5 | This is a list of all exercises and solutions in this lesson, mainly 6 | as a reference for helpers and instructors. This list is 7 | automatically generated from all of the other pages in the lesson. 8 | Any single teaching event will probably cover only a subset of these, 9 | depending on their interests.
10 | 11 | ```{exerciselist} 12 | ``` 13 | -------------------------------------------------------------------------------- /content/guide.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | permalink: /guide/ 4 | --- 5 | 6 | # Instructor guide 7 | 8 | 9 | ## Detailed day schedule 10 | 11 | Some example schedules for this lesson: 12 | 13 | 2024 edition plan (times in EET, Helsinki time), **no exercises**, just demos: 14 | 15 | - 09:50 - 10:00 Soft start and icebreaker question 16 | - Page: collaborative notes document 17 | - Give more space to the icebreaker and see what people are writing and talk about our own experiences 18 | - 10:00 - 10:03 Collab document intro 19 | - 10:03 - 10:05 Learning outcomes: https://coderefinery.github.io/reproducible-research/ 20 | - 10:05 - 10:10 Overview of CR and how it all fits together 21 | - Page: https://coderefinery.github.io/reproducible-research/intro 22 | - Learning outcomes from index 23 | - 10:10 - 10:20 Reproducible research, Motivation 24 | - Exercise in notes doc with the discussions in bottom of motivation page 25 | - Page: https://coderefinery.github.io/reproducible-research/motivation/ 26 | - 10:20 - 10:30 Organizing your projects 27 | - Copy the discussion on the notes and if we have time we can highlight some answers 28 | - Page: https://coderefinery.github.io/reproducible-research/organizing-projects/ 29 | - 10:30 - 10:35 ask in collab document and discuss 30 | - https://coderefinery.github.io/reproducible-research/organizing-projects/#discussion-on-reproducibility 31 | - Are you using version control for academic papers? 32 | - ... 33 | - ... 34 | - How do you handle collaborative issues e.g. conflicting changes? 35 | - ... 36 | - ... 
37 | - 10:35 - 10:55 Recording computational steps 38 | - Page: https://coderefinery.github.io/reproducible-research/workflow-management/ 39 | - 10:55 - 11:05 Real break 40 | - 11:05 - 11:25 Recording dependencies 41 | - https://coderefinery.github.io/reproducible-research/dependencies/#exercises 42 | - ask first one in collab doc and discuss on stream 43 | - show difference between created env from env file vs exported env file on stream 44 | - 11:25 - 11:30 ask in collaborative document 45 | - Are you using any dependency and/or environment management tool in your work? 46 | - No: o 47 | - why not? 48 | - .. 49 | - .. 50 | - Yes: o 51 | - which? 52 | - .. 53 | - .. 54 | - Have you heard about or been in contact with containers (docker, singularity, podman) in your work? How did you come across them? 55 | - No: o 56 | - Yes: 57 | - .. 58 | - .. 59 | - .. 60 | - 11:30 - 11:50 Recording environments 61 | - The first contact with containers is often: Take this and run this command and then when you need to share/build. 62 | - Discuss setup issues, permissions if docker wants root, bandwidth, etc 63 | - Pros and cons of containers 64 | - Demo of two pre-made containers e.g. expand the R studio optional exercise? 65 | - 11:50 - 12.00 Wrapup 66 | - where to go from here: idea would be to give it more practical focus: what to do with these tools? Project level reproducibility. Time-scales of what changes (short time changes of code, long time years changes of OS-s, libraries). 
67 | - Bring your code session advertisement 68 | - Material + recording available 69 | - 12:00 - long break starts 70 | 71 | This is the planned schedule for the workshop in September 2023 (2 hours and 5 minutes including 10 min break) ; note that for this workshop, sharing code and data was moved to social coding lesson: 72 | 73 | - 08:50 - 09:00 Soft start and icebreaker question 74 | - 09:00 - 09:10 Overview of CR and how it all fits together 75 | - 09:10 - 09:20 Reproducible research, [Motivation](https://coderefinery.github.io/reproducible-research/motivation/) 76 | - 09:20 - 09:27 [Organizing your projects](https://coderefinery.github.io/reproducible-research/organizing-projects/) 77 | - 09:27 - 09:35 [Recording computational steps](https://coderefinery.github.io/reproducible-research/workflow-management/) - discussion 78 | - 09:35 - 10:00 Snakemake exercise (25 min) 79 | - 10:00 - 10:10 Break 80 | - 10:10 - 10:15 Summary of workflows and the exercise 81 | - 10:15 - 10:30 [Recording dependencies](https://coderefinery.github.io/reproducible-research/dependencies/) 82 | - 10:30 - 10:40 [Recording environments](https://coderefinery.github.io/reproducible-research/environments/) 83 | - 10:40 - 11:00 Container-1 exercise (20 min) 84 | - 11:00 - 11.05 Wrapup 85 | 86 | This was the schedule at workshop in March 2023 (2 hours and 15 minutes including 2x 10 min break): 87 | 88 | - 08:50 - 09:00 Soft start and icebreaker question 89 | - 09:00 - 09:10 Interview with an invited guest 90 | - 09:10 - 09:20 [Motivation](https://coderefinery.github.io/reproducible-research/motivation/) 91 | - 09:20 - 09:30 [Organizing your projects](https://coderefinery.github.io/reproducible-research/organizing-projects/) 92 | - 09:30 - 10:00 [Recording dependencies](https://coderefinery.github.io/reproducible-research/dependencies/) 93 | - discussion (5 min) 94 | - exercise (20 min) 95 | - discussion (5 min) 96 | - 10:00 - 10:10 Break 97 | - 10:10 - 10:40 [Recording computational 
steps](https://coderefinery.github.io/reproducible-research/workflow-management/) 98 | - discussion (5 min) 99 | - exercise (20 min) 100 | - discussion (5 min) 101 | - 10:40 - 10:50 [Recording environments](https://coderefinery.github.io/reproducible-research/environments/) 102 | - an exercise exists but is typically not done as part of a standard workshop 103 | - 10:50 - 11:05 [Sharing code and data](https://coderefinery.github.io/reproducible-research/sharing/) 104 | - [demo (15 min)](https://coderefinery.github.io/reproducible-research/sharing/#connecting-repositories-to-zenodo) 105 | - 11:05 - 11:15 Break 106 | 107 | 108 | ## Why we teach this lesson 109 | 110 | Reproducibility in research is something that publishers, funding agencies, universities, 111 | research leaders and the general public worry about, and much is being written about it. 112 | It is also something that researchers care deeply about - this lesson is typically one of the 113 | most popular lessons in the pre-workshop survey. 114 | 115 | Even though most PhD students, postdocs and researchers (i.e. typical workshop participants) 116 | know about the importance of reproducibility in research, they often lack both a general 117 | overview of what different aspects there are to reproducibility, and the knowledge of 118 | specific tools that can be used for improving reproducibility. 119 | 120 | Many participants may not adhere to good practices when organizing their projects, 121 | and the "Organizing your projects" episode is meant to encourage participants to 122 | structure their projects better. This may be obvious to some participants but it 123 | doesn't hurt to preach to the choir. 124 | 125 | Even though many participants know that code can have many dependencies (e.g. they 126 | may have experienced difficulties in getting other people's code to run), they 127 | often don't know or use good practices when it comes to recording dependencies.
128 | Most participants also don't use isolated environments for different projects and 129 | don't know why that can be important. 130 | The episode "Recording dependencies" tries to convey the importance of recording 131 | dependencies accurately for your projects, and shows how tools like conda can be 132 | used both as a package and software environment manager. 133 | 134 | Many participants have heard about containers and find them interesting, but 135 | lack an understanding of how they work or how they can be used. The episode 136 | "Recording environments" introduces the concept of containers, and the optional 137 | episode "Creating and sharing a container image" goes into details. 138 | 139 | Many participants use complicated series of computational steps in their research 140 | without realizing that this work falls into the category of "scientific workflows", 141 | and that there actually exist tools that help make such workflows reproducible. 142 | The episode "Recording computational steps" introduces the concept of scientific 143 | workflows, discusses various ways of managing workflows with varying degrees of 144 | reproducibility, and shows how tools like Snakemake can be used to 145 | both simplify workflows and make them more reproducible. 146 | 147 | 148 | ## How to teach this lesson 149 | 150 | ### How to start 151 | 152 | Everyone knows that scientific results need to be reproducible, but not everyone is using 153 | appropriate tools to ensure this. Here we're going to get to know tools which help with 154 | preserving the provenance of data and reproducibility on different levels, ranging from 155 | workflow automation to software environment (containers). 156 | 157 | 158 | ### Focus on concepts, and when to use which tool 159 | 160 | Try to explain better what the different tools are useful for, but don't go 161 | into details. 
In this lesson we are not trying to gain expertise in the 162 | various tools and master the details but rather we want to give an overview and 163 | show that many tools exist and try to give participants the right feel for which 164 | set of tools to approach for which type of problem. 165 | 166 | 167 | ## Typical pitfalls 168 | 169 | ### Indentation in Snakefiles 170 | 171 | - the body of a rule and the body of an input keyword need to be indented, but the number of spaces doesn't matter 172 | This works: 173 | ```python 174 | rule all: 175 | input: 176 | expand('statistics/{book}.data', book=DATA), 177 | expand('plot/{book}.png', book=DATA) 178 | ``` 179 | but this doesn't work: 180 | ```python 181 | rule all: 182 | input: 183 | expand('statistics/{book}.data', book=DATA), 184 | expand('plot/{book}.png', book=DATA) 185 | ``` 186 | nor this: 187 | ```python 188 | rule all: 189 | input: 190 | expand('statistics/{book}.data', book=DATA), 191 | expand('plot/{book}.png', book=DATA) 192 | ``` 193 | 194 | 195 | ## Field reports 196 | 197 | ### 2022 September 198 | 199 | We used the strategy "absolutely minimal introductions, most time 200 | for exercise". Overall, it was probably the right thing to do since 201 | there is so little time and so much to cover. 202 | 203 | There wasn't enough time for the conda exercise (we could give only 7 204 | minutes), but also I wonder how engaging it is. We should look at how 205 | to optimize the start of that episode. 206 | 207 | The Snakemake episode went reasonably well. Our goal was 5 minutes 208 | intro, long exercise, 5 minutes outro. The intro was actually a bit 209 | longer, and there was the comment that we didn't really explain what 210 | Snakemake was before it started (though we tried). The start of this 211 | episode should get particular focus in the future, since this is the 212 | main exercise of the day. 
213 | 214 | -------------------------------------------------------------------------------- /content/img/dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/dependency.png -------------------------------------------------------------------------------- /content/img/docker_meme.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/docker_meme.jpg -------------------------------------------------------------------------------- /content/img/kitchen/busy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/busy.png -------------------------------------------------------------------------------- /content/img/kitchen/libraries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/libraries.png -------------------------------------------------------------------------------- /content/img/kitchen/linux.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/linux.png -------------------------------------------------------------------------------- /content/img/kitchen/macos.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/macos.png -------------------------------------------------------------------------------- /content/img/kitchen/recipe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/recipe.png -------------------------------------------------------------------------------- /content/img/kitchen/windows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/windows.png -------------------------------------------------------------------------------- /content/img/python_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/python_environment.png -------------------------------------------------------------------------------- /content/img/repro-pyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/repro-pyramid.png -------------------------------------------------------------------------------- /content/img/reproducibility_levels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducibility_levels.png -------------------------------------------------------------------------------- /content/img/reproducibility_nature.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducibility_nature.jpg -------------------------------------------------------------------------------- /content/img/reproducible-research.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducible-research.jpg -------------------------------------------------------------------------------- /content/img/reproducible_research_plus_lessons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducible_research_plus_lessons.png -------------------------------------------------------------------------------- /content/img/research_comic_phd.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/research_comic_phd.gif -------------------------------------------------------------------------------- /content/img/snakemake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/snakemake.png -------------------------------------------------------------------------------- /content/img/snakemake_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/snakemake_dag.png 
-------------------------------------------------------------------------------- /content/img/turing-way/8-fair-principles.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/turing-way/8-fair-principles.jpg -------------------------------------------------------------------------------- /content/img/turing-way/reproducibility.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/turing-way/reproducibility.jpg -------------------------------------------------------------------------------- /content/img/word-count/arrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/arrows.png -------------------------------------------------------------------------------- /content/img/word-count/gutenberg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/gutenberg.png -------------------------------------------------------------------------------- /content/img/word-count/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/plot.png -------------------------------------------------------------------------------- /content/img/word-count/statistics.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/statistics.png -------------------------------------------------------------------------------- /content/index.rst: -------------------------------------------------------------------------------- 1 | .. _index: 2 | 3 | Reproducible research - Preparing code to be usable by you and others in the future 4 | =================================================================================== 5 | 6 | Have you ever spent days **trying to repeat the results from a few weeks or months 7 | ago**? Or you have to do paper revisions, but you just can't get the results to 8 | match up? It's unpleasant for both you and science. 9 | 10 | In this lesson we will explore different methods and tools for better 11 | reproducibility in research software and data. We will demonstrate how version 12 | control, workflows, containers, and package managers can be used to **record 13 | reproducible environments and computational steps** for our future selves and others. 14 | 15 | 16 | .. admonition:: Learning outcomes 17 | 18 | By the end of this lesson, learners should: 19 | - Be able to apply a well-organized directory structure for their project 20 | - Understand that code can have dependencies, and know how to document them 21 | - Be able to document computational steps, and have an idea when it can be useful 22 | - Know about use cases for containers 23 | 24 | .. prereq:: 25 | 26 | You need to install 27 | `Git, Python, and Snakemake `__. 28 | 29 | If you wish to follow in the terminal and are new to the command line, we 30 | recorded a `short shell crash course `__. 31 | 32 | 33 | .. toctree:: 34 | :maxdepth: 1 35 | :caption: Core episodes 36 | 37 | intro.md 38 | motivation.md 39 | organizing-projects.md 40 | workflow-management.md 41 | dependencies.md 42 | environments.md 43 | where-to-go.md 44 | 45 | 46 | .. 
toctree:: 47 | :maxdepth: 1 48 | :caption: Reference 49 | 50 | Shell crash course 51 | exercises 52 | guide 53 | 54 | 55 | .. toctree:: 56 | :maxdepth: 1 57 | :caption: About 58 | 59 | All lessons 60 | CodeRefinery 61 | Reusing 62 | -------------------------------------------------------------------------------- /content/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction - How it all connects 2 | 3 | ```{instructor-note} 4 | - 10 min teaching/discussion 5 | - 0 min exercises 6 | ``` 7 | 8 | --- 9 | 10 | ```{figure} /img/turing-way/reproducibility.jpg 11 | :alt: "A person showing another person what steps to take to make their data research reproducible. There is a path with several steps- Here is my data - Here are my tools - Here is my code - Here are my results" 12 | :width: 100% 13 | ``` 14 | 15 | [The Turing Way project illustration by Scriberia. Used under a CC-BY 4.0 licence. DOI: ] 16 | 17 | ## This workshop is all about reproducibility - from a computational perspective 18 | 19 | This section connects the steps above to the CodeRefinery workshop lessons. 
20 | 21 | **"Here is my code"** 22 | 23 | - **Version control with git** with focus on collaboration 24 | - **Social coding**: What can you do to get credit for your code and to allow reuse 25 | - **Documentation**: How to let others or future you know about your thoughts and how to use your code 26 | - **Jupyter Notebooks**: A tool to write and share executable notebooks and data visualization 27 | - **Automated testing**: Preventing yourself and others from breaking your functioning code 28 | - **Modular code development**: Making reusing parts of your code easier 29 | 30 | **"Here are my tools"** 31 | 32 | This lesson on general **Reproducibility**: Preparing code to be usable by you and others in the future 33 | 34 | This includes organizing your projects on your own computer and recording your computational steps, dependencies and computing environment. 35 | 36 | We will also mention a few tools and platforms for sharing data (**"Here is my data"**) and research outputs (**"Here are my results"**) in the **social coding** lesson, but they are not the focus of this workshop. 37 | 38 | ## Small steps towards reproducible research 39 | 40 | If this is all new to you, it may feel quite overwhelming. 41 | 42 | **Our recommendation:** Don't worry! Focus on "good enough" instead of perfect. 43 | 44 | To start, pick one topic that seems reasonable to implement for your current project. Something that helps YOU right now. This may be something you may have to implement due to requirements from your funders or the journal where you want to publish your research. Use their requirements as a checklist and find tools that feel comfortable for you. 45 | 46 | A great way to see what are the really important things to implement is to meet with a colleague, exchange code and try to run each other's code. Every question your colleague has to ask you about your code gives a hint on where you may need to improve. 
47 | 48 | Keeping a "log book" while working on your own code also serves as a great basis for making your code more reproducible. Can you use any of the tools and techniques learned in this workshop to share parts of your log book with others to help them run your code? 49 | 50 | -------------------------------------------------------------------------------- /content/motivation.md: -------------------------------------------------------------------------------- 1 | # Motivation 2 | 3 | ```{objectives} 4 | - Understand why we are talking about reproducibility in this workshop 5 | ``` 6 | 7 | ```{instructor-note} 8 | - 10 min teaching/discussion 9 | ``` 10 | 11 | ```{figure} img/research_comic_phd.gif 12 | :alt: Research comic 13 | :width: 100% 14 | ``` 15 | 16 | ```{admonition} A scary anecdote 17 | - A group of researchers obtain great results and submit their work to a high-profile journal. 18 | - Reviewers ask for new figures and additional analysis. 19 | - The researchers start working on revisions and generate modified figures, but find inconsistencies with old figures. 20 | - The researchers can't find some of the data they used to generate the original results, and 21 | can't figure out which parameters they used when running their analyses. 22 | - The manuscript is still languishing in the drawer ... 23 | ``` 24 | 25 | --- 26 | 27 | ## Why talking about reproducible research? 28 | 29 | A 2016 30 | [survey](http://www.nature.com/news/1-500-scientists-lift-the-lid-on-reproducibility-1.19970) 31 | in Nature revealed that irreproducible experiments are a problem across all 32 | domains of science: 33 | 34 | ```{figure} img/reproducibility_nature.jpg 35 | :alt: reproducibility Nature 36 | :width: 100% 37 | ``` 38 | 39 | This study is now a few years old, but the highlighted problem has not become 40 | smaller. 41 | 42 | --- 43 | 44 | ## Levels of reproducibility 45 | 46 | A published article is like the top of a pyramid. 
It rests on multiple 47 | levels that each contribute to its reproducibility. 48 | 49 | ```{figure} img/repro-pyramid.png 50 | :alt: Reproducibility pyramid 51 | :width: 100% 52 | ``` 53 | 54 | [Steeves, Vicky (2017) in "Reproducibility Librarianship," Collaborative Librarianship: Vol. 9: Iss. 2, Article 4. 55 | Available at: https://digitalcommons.du.edu/collaborativelibrarianship/vol9/iss2/4] 56 | 57 | This also means that you can think about it from the beginning of your research life cycle! 58 | 59 | --- 60 | 61 | ````{discussion} Discuss in collaborative document or with your team members 62 | ```markdown 63 | - What are your experiences re-running or adjusting a script or a figure you 64 | created a few months ago? 65 | - ... 66 | - ... 67 | - (share your experience) 68 | 69 | - Have you continued working from a previous student's 70 | script/code/plot/notebook? What were the biggest challenges? 71 | - ... 72 | - ... 73 | - (share your experience, but constructively) 74 | ```` 75 | 76 | ```{keypoints} 77 | - Without reproducibility in scientific computing, everyone would have to start a new project / code from scratch 78 | ``` -------------------------------------------------------------------------------- /content/organizing-projects.md: -------------------------------------------------------------------------------- 1 | # Organizing your projects 2 | 3 | ```{objectives} 4 | - Understand how to organize research projects 5 | - Get an overview of tools for collaborative and version controlled manuscripts 6 | ``` 7 | 8 | ```{instructor-note} 9 | - 10 min teaching incl. discussions 10 | ``` 11 | 12 | One of the first steps to make your work reproducible is to organize your projects well. 13 | Let's go over some of the basic things which people have found to work (and not to work). 
14 | 15 | 16 | ## Directory structure for projects 17 | 18 | - Project files in a **single directory** 19 | - **Different projects** should have **separate directories** 20 | - Use **consistent and informative directory structure** 21 | - Avoid spaces in directory and file names – use `-`, `_` or CamelCase instead (nicer for computers to handle). 22 | - If you need to separate public/private directories, 23 | - put them separately in public and private Git repositories, or 24 | - use `.gitignore` to exclude the private information from being tracked 25 | - Add a **README file** to describe the project and instructions on reproducing the results 26 | - If you want to use the **same code in multiple projects**, host it on GitHub (or similar) and clone it into each of your project directories. 27 | 28 | A project directory can look something like this: 29 | 30 | ```shell 31 | project_name/ 32 | ├── README.md # overview of the project 33 | ├── data/ # data files used in the project 34 | │ ├── README.md # describes where data came from 35 | │ └── sub-directory/ # may contain subdirectories 36 | ├── processed_data/ # intermediate files from the analysis 37 | ├── manuscript/ # manuscript describing the results 38 | ├── results/ # results of the analysis (data, tables, figures) 39 | ├── src/ # contains all code in the project 40 | │ ├── LICENSE # license for your code 41 | │ ├── requirements.txt # software requirements and dependencies 42 | │ └── ... 43 | └── doc/ # documentation for your project 44 | ├── index.rst 45 | └── ... 
46 | ``` 47 | 48 | --- 49 | 50 | ## Tracking source code, data, and results 51 | 52 | - All code is version controlled and goes in the `src/` or `source/` directory 53 | - Include appropriate LICENSE file and information on software requirements 54 | - You can also version control data files or input files under `data/` 55 | - If data files are too large (or sensitive) to track, untrack them using `.gitignore` 56 | - Intermediate files from the analysis are kept in `processed_data/` 57 | - Consider using Git tags to mark specific versions of results (version 58 | submitted to a journal, dissertation version, poster version, etc.): 59 | ```console 60 | $ git tag -a thesis-submitted -m "this is the submitted version of my thesis" 61 | ``` 62 | 63 | Check the [Git-intro lesson](https://coderefinery.github.io/git-intro/) for a reminder. 64 | 65 | 66 | ## Some tools and templates 67 | 68 | - [R devtools](https://devtools.r-lib.org/) 69 | - [Python cookiecutter template](https://github.com/Materials-Data-Science-and-Informatics/fair-python-cookiecutter) 70 | - [Reproducible research template](https://github.com/the-turing-way/reproducible-project-template) by the Turing Way 71 | 72 | More tools and templates in [Heidi Seibold's blog](https://heidiseibold.ck.page/posts/setting-up-a-fair-and-reproducible-project). 73 | 74 | 75 | --- 76 | 77 | ## Excursion: Reproducible publications 78 | 79 | ### Discussion on collaborative writing of academic papers 80 | 81 | ````{discussion} Discuss in the collaborative document: 82 | 83 | ``` 84 | - How do you collaborate on writing academic papers? 85 | - ... 86 | - ... 87 | - (share your experience) 88 | 89 | - How do you handle collaborative issues e.g. conflicting changes? 90 | - ... 91 | - ... 92 | - (share your experience) 93 | ``` 94 | 95 | ```` 96 | 97 | -> Consider using **version control for manuscripts** as well. 
It may help you when keeping track of edits + if you sync it online then you don't have to worry about losing your work. 98 | 99 | Version control does not have to mean git, but could also mean using "tracking changes" in tools like Word, Google Docs, or Overleaf (find links below). 100 | 101 | ### Tools for collaborative writing and version control of manuscripts 102 | 103 | Git **can** be used to collaborate on manuscripts written in, e.g., LaTeX and other text-based formats. However it might not always be the most convenient. Other tools exist to make the process more enjoyable: 104 | 105 | You can **collaboratively gather notes** using self-hosted or public instances of tools like [HedgeDoc](https://hedgedoc.org/) and [Etherpad](https://etherpad.org) or use online options like [HackMD](https://hackmd.io/), [Google Docs](https://docs.google.com) or the Microsoft online tools for easy and efficient collaboration. 106 | 107 | To format your notes into a manuscript, you can use Word-like online editors or tools like [Overleaf](https://www.overleaf.com) (LaTeX) or [Typst](https://typst.app/) (markdown). Most of the tools in this section even provide a git integration. 108 | 109 | [Manubot](https://github.com/manubot/rootstock) offers another way to turn your written word into a fully rendered manuscript using GitHub. 110 | 111 | ### Executable manuscripts 112 | 113 | You may also want to consider writing an executable manuscript using tools like [Jupyter Notebooks](https://jupyter.org) hosted on [Binder](https://mybinder.org), [Quarto](https://quarto.org/), [Authorea](https://www.authorea.com) or [Observable](https://observablehq.com/), to name a few. 
114 | 115 | ### Resources on research compendia 116 | 117 | - [About research compendia at the Turing Way](https://book.the-turing-way.org/reproducible-research/compendia) 118 | - ["Research compendia"](http://inundata.org/talks/rstd19/#/): a set of good practices for reproducible data analysis in R, but much is transferable to other languages. 119 | - [rrtools](https://github.com/benmarwick/rrtools): instructions, templates, and functions for writing a reproducible article or report with R. 120 | - ... 121 | 122 | ```{keypoints} 123 | - An organized project directory structure helps with reproducibility. 124 | - Also think about version control for writing your academic manuscripts. 125 | ``` 126 | -------------------------------------------------------------------------------- /content/where-to-go.md: -------------------------------------------------------------------------------- 1 | # Where to go from here 2 | 3 | ```{objectives} 4 | - Understand when tools discussed in this episode can be useful 5 | ``` 6 | 7 | ```{instructor-note} 8 | - 10 min teaching/discussion 9 | ``` 10 | 11 | This episode presents a lot of different tools and opportunities for your research software project. 12 | However, you will not always need all of them. As with so many things, it again depends on your project. 13 | 14 | ## Workflow tools will maybe make sense in the future 15 | 16 | - In many cases, it is probably not needed 17 | - You will want to consider workflow tools: 18 | - When processing many files with many steps 19 | - Steps or files may change 20 | - Your main script, connecting your steps gets very long 21 | - You are still collecting your input data 22 | - ... 23 | 24 | ## Containers seem amazing, but do I have use for them? 25 | 26 | - Maybe not yet, but knowing that you can ... 
27 | - Run Linux tools on your Windows computer 28 | - Run different versions of the same software on your computer 29 | - Follow the "easy installation instructions" for an operating system that is not your own 30 | - Get a fully configured environment instead of only installing a tool 31 | - Share your setup and configurations with others 32 | 33 | ... can be very beneficial :) 34 | 35 | ## Important for every project 36 | 37 | - Clear file structure for your project 38 | - Record your workflow and write it down in a script file. 39 | - Create a dependency list and keep it updated, optimally in an environment file 40 | - At least consider the possibility that someone, maybe you, may want to reproduce your work 41 | - Can you do something (small) to make it easier? 42 | - If you have ideas, but no time: add an issue to your repository; maybe someone else wants to help. 43 | 44 | ## Further reading 45 | 46 | - [The Turing Way handbook to reproducible, ethical and collaborative data science](https://doi.org/10.5281/zenodo.3233853) 47 | - [Reproducible research policies and software/data management in scientific computing journals: a survey, discussion, and perspectives](https://doi.org/10.3389/fcomp.2024.1491823) 48 | - ... 49 | 50 | ```{seealso} 51 | Do you want to practice your reproducibility skills and get inspired by working with other people's code/data? Join a [ReproHack event](https://www.reprohack.org/event/)! 52 | ``` 53 | 54 | ```{keypoints} 55 | - Not everything in this lesson might be useful right now, but it is good to know that these things exist if you ever get in a situation that would require such a solution. 56 | - Caring about reproducibility makes work easier for the next person working on the project - and that might be you in a few years! 
57 | ``` 58 | -------------------------------------------------------------------------------- /content/workflow-management.md: -------------------------------------------------------------------------------- 1 | # Recording computational steps 2 | 3 | ```{objectives} 4 | - Understand why and when a workflow management tool can be useful 5 | ``` 6 | 7 | ```{questions} 8 | - You have some steps that need to be run to do your work. How do you 9 | actually run them? Does it rely on your own memory and work, or is it 10 | reproducible? **How do you communicate the steps** for future you and others? 11 | - How can we create a reproducible workflow? 12 | ``` 13 | 14 | ```{instructor-note} 15 | - 5 min teaching 16 | - 15 min demo 17 | ``` 18 | 19 | 20 | ## Several steps from input data to result 21 | 22 | *The following material is partly derived from a [HPC Carpentry lesson](https://hpc-carpentry.github.io/hpc-python/).* 23 | 24 | In this episode, we will use an [example 25 | project](https://github.com/coderefinery/word-count) which finds the most frequent 26 | words in books and plots the result from those statistics. In this example we 27 | wish to: 28 | 29 | 1. Analyze word frequencies using [code/count.py](https://github.com/coderefinery/word-count/blob/main/code/count.py) 30 | for 4 books 31 | (they are all in the [data](https://github.com/coderefinery/word-count/tree/main/data) directory). 32 | 2. Plot a histogram using [code/plot.py](https://github.com/coderefinery/word-count/blob/main/code/plot.py). 
33 | 34 | ```{figure} img/word-count/arrows.png 35 | :alt: From book to word counts to plot 36 | :width: 100% 37 | ``` 38 | 39 | Example (for one book only): 40 | 41 | ```console 42 | $ python code/count.py data/isles.txt > statistics/isles.data 43 | $ python code/plot.py --data-file statistics/isles.data --plot-file plot/isles.png 44 | ``` 45 | 46 | Another way to analyze the data would be via a graphical user interface (GUI), where you can for example drag and drop files and click buttons to do the different processing steps. 47 | 48 | Both of the above (single line commands and simple graphical interfaces) are tricky in terms of reproducibility. We currently have two steps and 4 books. But **imagine having 4 steps and 500 books**. 49 | How could we deal with this? 50 | 51 | As a first idea we could express the workflow with a script. The repository includes such a script called `run_all.sh`. 52 | 53 | We can run it with: 54 | 55 | ```console 56 | $ bash run_all.sh 57 | ``` 58 | 59 | This is **imperative style**: we tell the script to run these 60 | steps in precisely this order, as we would run them manually, one after another. 61 | 62 | ````{discussion} 63 | - What are the advantages of this solution compared to processing all one by one? 64 | - Is the scripted solution reproducible? 65 | - Imagine adding more steps to the analysis and imagine the steps being time consuming. What problems do you anticipate 66 | with a scripted solution? 67 | 68 | ```{solution} 69 | The advantage of this solution compared to processing one by one is more automation: We can generate all. 70 | This is not only easier, it is also less error-prone. 71 | 72 | Yes, the scripted solution can be reproducible. But could you easily run it e.g. on a Windows computer? 73 | 74 | If we had more steps and once steps start to be time-consuming, a limitation of 75 | a scripted solution is that it tries to run all steps always. 
Rerunning only 76 | part of the steps or only part of the input data requires us to comment out or change lines in our script in between runs, which can again become tedious and error-prone. 77 | ``` 78 | ```` 79 | 80 | --- 81 | 82 | ## Workflow tools 83 | 84 | Sometimes it may be helpful to go from imperative to declarative style. Rather than saying "do this and then that" we describe dependencies between steps, but we let the tool figure out the order of steps to produce results. 85 | 86 | ### Example workflow tool: [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html) 87 | 88 | Snakemake (inspired by [GNU Make](https://www.gnu.org/software/make/)) is one of many tools to create reproducible and scalable data analysis workflows. Workflows are described via a human-readable, Python-based language. 89 | Snakemake workflows scale seamlessly from laptop to cluster or cloud, without the need to modify the workflow definition. 90 | 91 | --- 92 | 93 | ## A demo 94 | 95 | ````{prereq} Preparation 96 | The exercise (below) and pre-exercise discussion use the 97 | word-count repository 98 | (<https://github.com/coderefinery/word-count>) which we need to clone to work on it. 99 | 100 | If you want to do this exercise on your own, you can do so either on your own computer (follow the instructions in the bottom right panel on the [CodeRefinery installation instruction page](https://coderefinery.github.io/installation/)), or on the [Binder](https://mybinder.org/) 101 | cloud service: 102 | 103 | **On your own computer**: 104 | - Install the necessary tools 105 | - Activate the [coderefinery conda environment](https://coderefinery.github.io/installation/conda-environment/) with `conda activate coderefinery`. 106 | - Clone the word-count repository: 107 | ```console 108 | $ git clone https://github.com/coderefinery/word-count.git 109 | ``` 110 | 111 | **On Binder**: 112 | We can also use the cloud service [Binder](https://mybinder.org/) to make sure 113 | we all have the same computing environment.
This is interesting from a 114 | reproducible research point of view and it's explained further in the [Jupyter 115 | lesson](https://coderefinery.github.io/jupyter/sharing/) how this is even 116 | possible. 117 | - Go to <https://github.com/coderefinery/word-count> and click on the "launch binder" badge in the README. 118 | - Once it has started, you can open a new Terminal from the **new** menu (top right) and select **Terminal**. 119 | ```` 120 | 121 | ````{exercise} Workflow-1: Workflow solution using Snakemake 122 | 123 | ```{figure} img/snakemake.png 124 | :alt: How Snakemake works 125 | :width: 100% 126 | ``` 127 | 128 | Somebody wrote a [Snakemake](https://snakemake.readthedocs.io) solution in the [Snakefile](https://github.com/coderefinery/word-count/blob/main/Snakefile): 129 | ``` 130 | # a list of all the books we are analyzing 131 | DATA = glob_wildcards('data/{book}.txt').book 132 | 133 | rule all: 134 | input: 135 | expand('statistics/{book}.data', book=DATA), 136 | expand('plot/{book}.png', book=DATA) 137 | 138 | # count words in one of our books 139 | rule count_words: 140 | input: 141 | script='code/count.py', 142 | book='data/{file}.txt' 143 | output: 'statistics/{file}.data' 144 | shell: 'python {input.script} {input.book} > {output}' 145 | 146 | # create a plot for each book 147 | rule make_plot: 148 | input: 149 | script='code/plot.py', 150 | book='statistics/{file}.data' 151 | output: 'plot/{file}.png' 152 | shell: 'python {input.script} --data-file {input.book} --plot-file {output}' 153 | 154 | ``` 155 | 156 | We can see that Snakemake uses **declarative style**: 157 | Snakefiles contain rules that relate targets (`output`) to dependencies 158 | (`input`) and commands (`shell`). 159 | 160 | Steps: 161 | 1. Clone the example to your computer: `$ git clone https://github.com/coderefinery/word-count.git` 162 | 2. Study the Snakefile. How does it know what to do first and what to do next? 163 | 3. Try to run it.
Since version 5.11 one needs to specify the number of cores (or 164 | jobs) using `-j`, `--jobs` or `--cores`: 165 | ```console 166 | $ snakemake --delete-all-output -j 1 167 | $ snakemake -j 1 168 | ``` 169 | The `--delete-all-output` part makes sure that we remove all generated files before we start. 170 | 4. Try running `snakemake` again, then observe and discuss why it refused to rerun all steps: 171 | ```console 172 | $ snakemake -j 1 173 | 174 | Building DAG of jobs... 175 | Nothing to be done (all requested files are present and up to date). 176 | ``` 177 | 5. Make a tiny modification to the plot.py script and run `$ snakemake -j 1` again and observe how it will only re-run the plot steps. 178 | 6. Make a tiny modification to one of the books and run `$ snakemake -j 1` again and observe how it only regenerates files for this book. 179 | 7. Discuss possible advantages compared to a scripted solution. 180 | 8. **Question for R developers**: Imagine you want to rewrite the two Python scripts and use R instead. Which lines in 181 | the [Snakefile](https://github.com/coderefinery/word-count/blob/main/Snakefile) would you have to modify so that it uses your R code? 182 | 9. If you make changes to the Snakefile, validate it using `$ snakemake --lint`. 183 | 184 | ```{solution} 185 | - 2: Start with "all" and look at what it depends on. Now search for rules that 186 | have these as output. Look for their inputs and search where they 187 | are produced. In other words, search backwards and build a graph of 188 | dependencies. This is what Snakemake does. 189 | - 4: It can see that outputs are newer than inputs. It will only regenerate 190 | outputs if they are not there or if the inputs or scripts have changed. 191 | - 7: It only generates steps and outputs that are missing or outdated. The workflow 192 | does not run everything every time.
In other words, if you notice a problem or update information 193 | "half way" in the analysis, it will only re-run what needs to be re-run. Nothing more, nothing less. 194 | Another advantage is that it can distribute tasks to multiple cores, off-load work to supercomputers, 195 | offer more fine-grained control over environments, and more. 196 | - 8: Probably only the two lines containing "shell". 197 | ``` 198 | ```` 199 | 200 | ## Visualizing the workflow 201 | 202 | We can visualize the directed acyclic graph (DAG) of our current Snakefile 203 | using the `--dag` option, which will output the DAG in `dot` language. 204 | 205 | **Note**: This requires the [Graphviz software](https://www.graphviz.org/), 206 | which can be installed by `conda install graphviz`. 207 | 208 | ```console 209 | $ snakemake -j 1 --dag | dot -Tpng > dag.png 210 | ``` 211 | 212 | Rules that have yet to be completed are indicated with solid outlines, while already completed rules are indicated with dashed outlines. 213 | 214 | ```{figure} img/snakemake_dag.png 215 | :alt: Snakemake DAG 216 | :width: 100% 217 | ``` 218 | 219 | ## Why [Snakemake](https://snakemake.readthedocs.io/)? 220 | 221 | - Gentle **learning curve**. 222 | - Free, open-source, and **installs easily** via conda or pip. 223 | - **Cross-platform** (Windows, macOS, Linux) and compatible with all High Performance Computing (HPC) schedulers: 224 | the same workflow works without modification and scales appropriately whether on a laptop or cluster. 225 | - If several workflow steps are independent of each other, and you have multiple cores available, Snakemake can run them **in parallel**. 226 | - It is possible to define **isolated software environments** per rule, e.g. by adding `conda: 'environment.yml'` to a rule. 227 | - Also possible to run workflows in Docker or Apptainer **containers**, e.g. by adding `container: 'docker://some-org/some-tool:2.3.1'` to a rule.
228 | - [Heavily used in bioinformatics](https://twitter.com/carl_witt/status/1103951128046301185), but is **completely general**. 229 | - Nice functionality for archiving the workflow, see: [the official documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#sustainable-and-reproducible-archiving) 230 | 231 | Tools like Snakemake help us with **reproducibility** by supporting us with **automation**, **scalability** and **portability** of our workflows. 232 | 233 | ## Similar tools 234 | 235 | - [Make](https://www.gnu.org/software/make/) 236 | - [Nextflow](https://www.nextflow.io/) 237 | - [Task](https://taskfile.dev/) 238 | - [Common Workflow Language](https://www.commonwl.org/) 239 | - Many [specialized frameworks](https://github.com/common-workflow-language/common-workflow-language/wiki/Existing-Workflow-systems) exist. 240 | - [Book on building reproducible analytical pipelines with R](https://raps-with-r.dev/) 241 | - [{targets} R package - make-like pipeline tool for R](https://books.ropensci.org/targets/) 242 | 243 | ```{keypoints} 244 | - Computational steps can be recorded in many ways 245 | - Workflow tools can help, if there are many steps to be executed and/or many datasets to be processed 246 | ``` 247 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | sphinx_rtd_theme 3 | sphinx_rtd_theme_ext_color_contrast 4 | myst_nb 5 | git+https://github.com/rkdarst/sphinx-copybutton.git@exclude-unselectable-3 6 | sphinx-lesson 7 | https://github.com/coderefinery/sphinx-coderefinery-branding/archive/master.zip 8 | --------------------------------------------------------------------------------