├── .github
│   └── workflows
│       ├── spellcheck.yml
│       └── sphinx.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── Makefile
├── README.md
├── content
│   ├── conf.py
│   ├── dependencies.md
│   ├── environments.md
│   ├── exercises.md
│   ├── guide.md
│   ├── img
│   │   ├── dependency.png
│   │   ├── docker_architecture.svg
│   │   ├── docker_meme.jpg
│   │   ├── kitchen
│   │   │   ├── busy.png
│   │   │   ├── libraries.png
│   │   │   ├── linux.png
│   │   │   ├── macos.png
│   │   │   ├── recipe.png
│   │   │   └── windows.png
│   │   ├── python_environment.png
│   │   ├── repro-pyramid.png
│   │   ├── reproducibility_levels.png
│   │   ├── reproducibility_nature.jpg
│   │   ├── reproducible-research.jpg
│   │   ├── reproducible_research_plus_lessons.png
│   │   ├── reproducible_research_plus_lessons.svg
│   │   ├── research_comic_phd.gif
│   │   ├── snakemake.png
│   │   ├── snakemake_dag.png
│   │   ├── turing-way
│   │   │   ├── 8-fair-principles.jpg
│   │   │   └── reproducibility.jpg
│   │   └── word-count
│   │       ├── arrows.png
│   │       ├── gutenberg.png
│   │       ├── plot.png
│   │       └── statistics.png
│   ├── index.rst
│   ├── intro.md
│   ├── motivation.md
│   ├── organizing-projects.md
│   ├── where-to-go.md
│   └── workflow-management.md
└── requirements.txt
/.github/workflows/spellcheck.yml:
--------------------------------------------------------------------------------
1 | name: Spelling Errors Check
2 |
3 | on: [push]
4 |
5 | jobs:
6 |   build:
7 |     strategy:
8 |       max-parallel: 2
9 |       matrix:
10 |         os: [ubuntu-latest]
11 |
12 |     runs-on: ${{ matrix.os }}
13 |
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - name: Install dependencies
17 |         run: curl -L https://git.io/misspell | bash
18 |
19 |       # This will return an exit code of 2, thus triggering a failed build
20 |       - name: Test spelling errors
21 |         shell: bash
22 |         run: |
23 |           bin/misspell -error *
24 |
--------------------------------------------------------------------------------
/.github/workflows/sphinx.yml:
--------------------------------------------------------------------------------
1 | # Deploy Sphinx. This could be shorter, but we also do some extra
2 | # stuff.
3 | #
4 | # License: CC-0. This is the canonical location of this file, which
5 | # you may want to link to anyway:
6 | # https://github.com/coderefinery/sphinx-lesson-template/blob/main/.github/workflows/sphinx.yml
7 | # https://raw.githubusercontent.com/coderefinery/sphinx-lesson-template/main/.github/workflows/sphinx.yml
8 |
9 |
10 | name: sphinx
11 | on: [push, pull_request]
12 |
13 | env:
14 |   DEFAULT_BRANCH: "main"
15 |   # Uncomment SPHINXOPTS below to make builds strict, so that any
16 |   # Sphinx warning fails the build.
17 |   #SPHINXOPTS: "-W --keep-going -T"
18 |   GENERATE_PDF: "true"  # enabled only when exactly 'true' (lowercase)
19 |   GENERATE_SINGLEHTML: "true"  # enabled only when exactly 'true' (lowercase)
20 |   PDF_FILENAME: "lesson.pdf"
21 |   MULTIBRANCH: "true"  # enabled only when exactly 'true' (lowercase)
22 |
23 |
24 | jobs:
25 |   build:
26 |     name: Build
27 |     runs-on: ubuntu-latest
28 |     permissions:
29 |       contents: read
30 |
31 |     steps:
32 |       # https://github.com/marketplace/actions/checkout
33 |       - uses: actions/checkout@v4
34 |         with:
35 |           fetch-depth: 0
36 |           lfs: true
37 |
38 |       # https://github.com/marketplace/actions/setup-python
39 |       # ^-- This gives info on matrix testing.
40 |       - name: Install Python
41 |         uses: actions/setup-python@v5
42 |         with:
43 |           python-version: '3.11'
44 |           cache: 'pip'
45 |
46 |       # https://docs.github.com/en/actions/guides/building-and-testing-python#installing-dependencies
47 |       # ^-- This gives info on installing dependencies with pip
48 |       - name: Install dependencies
49 |         run: |
50 |           python -m pip install --upgrade pip
51 |           pip install -r requirements.txt
52 |
53 |       # Debug
54 |       - name: Debugging information
55 |         env:
56 |           ref: ${{github.ref}}
57 |           event_name: ${{github.event_name}}
58 |           head_ref: ${{github.head_ref}}
59 |           base_ref: ${{github.base_ref}}
60 |         run: |
61 |           echo "github.ref: ${ref}"
62 |           echo "github.event_name: ${event_name}"
63 |           echo "github.head_ref: ${head_ref}"
64 |           echo "github.base_ref: ${base_ref}"
65 |           echo "GENERATE_PDF: ${GENERATE_PDF}"
66 |           echo "GENERATE_SINGLEHTML: ${GENERATE_SINGLEHTML}"
67 |           set -x
68 |           git rev-parse --abbrev-ref HEAD
69 |           git branch
70 |           git branch -a
71 |           git remote -v
72 |           python -V
73 |           pip list --not-required
74 |           pip list
75 |
76 |
77 |       # Build
78 |       - uses: ammaraskar/sphinx-problem-matcher@master
79 |       - name: Build Sphinx docs (dirhtml)
80 |         # SPHINXOPTS used via environment variables
81 |         run: |
82 |           make dirhtml
83 |           # This fixes broken copy button icons, as explained in
84 |           # https://github.com/coderefinery/sphinx-lesson/issues/50
85 |           # https://github.com/executablebooks/sphinx-copybutton/issues/110
86 |           # This can be removed once these PRs are accepted (but the
87 |           # fixes also need to propagate to other themes):
88 |           # https://github.com/sphinx-doc/sphinx/pull/8524
89 |           # https://github.com/readthedocs/sphinx_rtd_theme/pull/1025
90 |           sed -i 's/url_root="#"/url_root=""/' _build/dirhtml/index.html || true
91 |
92 |       # singlehtml
93 |       - name: Generate singlehtml
94 |         if: ${{ env.GENERATE_SINGLEHTML == 'true' }}
95 |         run: |
96 |           make singlehtml
97 |           mv _build/singlehtml/ _build/dirhtml/singlehtml/
98 |
99 |       # PDF if requested
100 |       - name: Generate PDF
101 |         if: ${{ env.GENERATE_PDF == 'true' }}
102 |         run: |
103 |           pip install https://github.com/rkdarst/sphinx_pyppeteer_builder/archive/refs/heads/main.zip
104 |           make pyppeteer
105 |           mv _build/pyppeteer/*.pdf _build/dirhtml/${PDF_FILENAME}
106 |
107 |       # Stage all deployed assets in _gh-pages/ for simplicity, and to
108 |       # prepare to do a multi-branch deployment.
109 |       - name: Copy deployment data to _gh-pages/
110 |         if: ${{ github.event_name == 'push' }}
111 |         run: |
112 |           rsync -a _build/dirhtml/ _gh-pages/
113 |
114 |       # Use gh-pages-multibranch to multiplex different branches into
115 |       # one deployment. See
116 |       # https://github.com/coderefinery/gh-pages-multibranch
117 |       - name: gh-pages multibranch
118 |         uses: coderefinery/gh-pages-multibranch@main
119 |         if: ${{ github.event_name == 'push' && env.MULTIBRANCH == 'true' }}
120 |         with:
121 |           directory: _gh-pages/
122 |           default_branch: ${{ env.DEFAULT_BRANCH }}
123 |           publish_branch: gh-pages
124 |
125 |       # Add the .nojekyll file
126 |       - name: nojekyll
127 |         if: ${{ github.event_name == 'push' }}
128 |         run: touch _gh-pages/.nojekyll
129 |
130 |       # Save artifact for the next step; keep dotfiles such as .nojekyll.
131 |       - uses: actions/upload-artifact@v4
132 |         if: ${{ github.event_name == 'push' }}
133 |         with:
134 |           name: gh-pages-build
135 |           path: _gh-pages/
136 |           include-hidden-files: true
137 |
138 |   # Deploy in a separate job so that write permissions are restricted
139 |   # to the minimum steps.
140 |   deploy:
141 |     name: Deploy
142 |     runs-on: ubuntu-latest
143 |     needs: build
144 |     # This if can't use the env context - find better way later.
145 |     if: ${{ github.event_name == 'push' }}
146 |     permissions:
147 |       contents: write
148 |
149 |     steps:
150 |       - uses: actions/download-artifact@v4
151 |         if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }}
152 |         with:
153 |           name: gh-pages-build
154 |           path: _gh-pages/
155 |
156 |       # As of 2023, we could publish to pages via a Deployment. This
157 |       # isn't done yet to give it time to stabilize (out of beta), and
158 |       # also having a gh-pages branch to check out is rather
159 |       # convenient.
160 |
161 |       # Deploy
162 |       # https://github.com/peaceiris/actions-gh-pages
163 |       - name: Deploy
164 |         uses: peaceiris/actions-gh-pages@v4
165 |         if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }}
166 |         with:
167 |           publish_branch: gh-pages
168 |           github_token: ${{ secrets.GITHUB_TOKEN }}
169 |           publish_dir: _gh-pages/
170 |           force_orphan: true
171 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | *~
3 | .DS_Store
4 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this lesson material, please cite it using these metadata."
3 | authors:
4 |   - name: "CodeRefinery"
5 |   - family-names: "Wikfeldt"
6 |     given-names: "Kjartan Thor"
7 |   - family-names: "Bast"
8 |     given-names: "Radovan"
9 |   - family-names: "Darst"
10 |     given-names: "Richard"
11 |   - family-names: "Hellsvik"
12 |     given-names: "Johann"
13 |   - family-names: "Wittke"
14 |     given-names: "Samantha"
15 |   - family-names: "Jääskeläinen"
16 |     given-names: "Matias"
17 |   - family-names: "Glerean"
18 |     given-names: "Enrico"
19 |   - family-names: "Vathsavayi"
20 |     given-names: "Harsha"
21 |   - family-names: "Wang"
22 |     given-names: "Yonglei"
23 | title: "Reproducible research - Preparing code to be usable by you in the future and others in general"
24 | type: dataset
25 | abstract: "We focus here on 3 aspects of reproducible programs and computations: documenting dependencies, environments, and computational steps in a reproducible way. We touch on containers."
26 | version: "2025-03-19"
27 | date-released: 2025-03-19
28 | url: "https://coderefinery.github.io/reproducible-research/"
29 | license: CC-BY-4.0
30 | repository-code: "https://github.com/coderefinery/reproducible-research"
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution 4.0 International Public License
58 |
59 | By exercising the Licensed Rights (defined below), You accept and agree
60 | to be bound by the terms and conditions of this Creative Commons
61 | Attribution 4.0 International Public License ("Public License"). To the
62 | extent this Public License may be interpreted as a contract, You are
63 | granted the Licensed Rights in consideration of Your acceptance of
64 | these terms and conditions, and the Licensor grants You such rights in
65 | consideration of benefits the Licensor receives from making the
66 | Licensed Material available under these terms and conditions.
67 |
68 |
69 | Section 1 -- Definitions.
70 |
71 | a. Adapted Material means material subject to Copyright and Similar
72 | Rights that is derived from or based upon the Licensed Material
73 | and in which the Licensed Material is translated, altered,
74 | arranged, transformed, or otherwise modified in a manner requiring
75 | permission under the Copyright and Similar Rights held by the
76 | Licensor. For purposes of this Public License, where the Licensed
77 | Material is a musical work, performance, or sound recording,
78 | Adapted Material is always produced where the Licensed Material is
79 | synched in timed relation with a moving image.
80 |
81 | b. Adapter's License means the license You apply to Your Copyright
82 | and Similar Rights in Your contributions to Adapted Material in
83 | accordance with the terms and conditions of this Public License.
84 |
85 | c. Copyright and Similar Rights means copyright and/or similar rights
86 | closely related to copyright including, without limitation,
87 | performance, broadcast, sound recording, and Sui Generis Database
88 | Rights, without regard to how the rights are labeled or
89 | categorized. For purposes of this Public License, the rights
90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
91 | Rights.
92 |
93 | d. Effective Technological Measures means those measures that, in the
94 | absence of proper authority, may not be circumvented under laws
95 | fulfilling obligations under Article 11 of the WIPO Copyright
96 | Treaty adopted on December 20, 1996, and/or similar international
97 | agreements.
98 |
99 | e. Exceptions and Limitations means fair use, fair dealing, and/or
100 | any other exception or limitation to Copyright and Similar Rights
101 | that applies to Your use of the Licensed Material.
102 |
103 | f. Licensed Material means the artistic or literary work, database,
104 | or other material to which the Licensor applied this Public
105 | License.
106 |
107 | g. Licensed Rights means the rights granted to You subject to the
108 | terms and conditions of this Public License, which are limited to
109 | all Copyright and Similar Rights that apply to Your use of the
110 | Licensed Material and that the Licensor has authority to license.
111 |
112 | h. Licensor means the individual(s) or entity(ies) granting rights
113 | under this Public License.
114 |
115 | i. Share means to provide material to the public by any means or
116 | process that requires permission under the Licensed Rights, such
117 | as reproduction, public display, public performance, distribution,
118 | dissemination, communication, or importation, and to make material
119 | available to the public including in ways that members of the
120 | public may access the material from a place and at a time
121 | individually chosen by them.
122 |
123 | j. Sui Generis Database Rights means rights other than copyright
124 | resulting from Directive 96/9/EC of the European Parliament and of
125 | the Council of 11 March 1996 on the legal protection of databases,
126 | as amended and/or succeeded, as well as other essentially
127 | equivalent rights anywhere in the world.
128 |
129 | k. You means the individual or entity exercising the Licensed Rights
130 | under this Public License. Your has a corresponding meaning.
131 |
132 |
133 | Section 2 -- Scope.
134 |
135 | a. License grant.
136 |
137 | 1. Subject to the terms and conditions of this Public License,
138 | the Licensor hereby grants You a worldwide, royalty-free,
139 | non-sublicensable, non-exclusive, irrevocable license to
140 | exercise the Licensed Rights in the Licensed Material to:
141 |
142 | a. reproduce and Share the Licensed Material, in whole or
143 | in part; and
144 |
145 | b. produce, reproduce, and Share Adapted Material.
146 |
147 | 2. Exceptions and Limitations. For the avoidance of doubt, where
148 | Exceptions and Limitations apply to Your use, this Public
149 | License does not apply, and You do not need to comply with
150 | its terms and conditions.
151 |
152 | 3. Term. The term of this Public License is specified in Section
153 | 6(a).
154 |
155 | 4. Media and formats; technical modifications allowed. The
156 | Licensor authorizes You to exercise the Licensed Rights in
157 | all media and formats whether now known or hereafter created,
158 | and to make technical modifications necessary to do so. The
159 | Licensor waives and/or agrees not to assert any right or
160 | authority to forbid You from making technical modifications
161 | necessary to exercise the Licensed Rights, including
162 | technical modifications necessary to circumvent Effective
163 | Technological Measures. For purposes of this Public License,
164 | simply making modifications authorized by this Section 2(a)
165 | (4) never produces Adapted Material.
166 |
167 | 5. Downstream recipients.
168 |
169 | a. Offer from the Licensor -- Licensed Material. Every
170 | recipient of the Licensed Material automatically
171 | receives an offer from the Licensor to exercise the
172 | Licensed Rights under the terms and conditions of this
173 | Public License.
174 |
175 | b. No downstream restrictions. You may not offer or impose
176 | any additional or different terms or conditions on, or
177 | apply any Effective Technological Measures to, the
178 | Licensed Material if doing so restricts exercise of the
179 | Licensed Rights by any recipient of the Licensed
180 | Material.
181 |
182 | 6. No endorsement. Nothing in this Public License constitutes or
183 | may be construed as permission to assert or imply that You
184 | are, or that Your use of the Licensed Material is, connected
185 | with, or sponsored, endorsed, or granted official status by,
186 | the Licensor or others designated to receive attribution as
187 | provided in Section 3(a)(1)(A)(i).
188 |
189 | b. Other rights.
190 |
191 | 1. Moral rights, such as the right of integrity, are not
192 | licensed under this Public License, nor are publicity,
193 | privacy, and/or other similar personality rights; however, to
194 | the extent possible, the Licensor waives and/or agrees not to
195 | assert any such rights held by the Licensor to the limited
196 | extent necessary to allow You to exercise the Licensed
197 | Rights, but not otherwise.
198 |
199 | 2. Patent and trademark rights are not licensed under this
200 | Public License.
201 |
202 | 3. To the extent possible, the Licensor waives any right to
203 | collect royalties from You for the exercise of the Licensed
204 | Rights, whether directly or through a collecting society
205 | under any voluntary or waivable statutory or compulsory
206 | licensing scheme. In all other cases the Licensor expressly
207 | reserves any right to collect such royalties.
208 |
209 |
210 | Section 3 -- License Conditions.
211 |
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 |
215 | a. Attribution.
216 |
217 | 1. If You Share the Licensed Material (including in modified
218 | form), You must:
219 |
220 | a. retain the following if it is supplied by the Licensor
221 | with the Licensed Material:
222 |
223 | i. identification of the creator(s) of the Licensed
224 | Material and any others designated to receive
225 | attribution, in any reasonable manner requested by
226 | the Licensor (including by pseudonym if
227 | designated);
228 |
229 | ii. a copyright notice;
230 |
231 | iii. a notice that refers to this Public License;
232 |
233 | iv. a notice that refers to the disclaimer of
234 | warranties;
235 |
236 | v. a URI or hyperlink to the Licensed Material to the
237 | extent reasonably practicable;
238 |
239 | b. indicate if You modified the Licensed Material and
240 | retain an indication of any previous modifications; and
241 |
242 | c. indicate the Licensed Material is licensed under this
243 | Public License, and include the text of, or the URI or
244 | hyperlink to, this Public License.
245 |
246 | 2. You may satisfy the conditions in Section 3(a)(1) in any
247 | reasonable manner based on the medium, means, and context in
248 | which You Share the Licensed Material. For example, it may be
249 | reasonable to satisfy the conditions by providing a URI or
250 | hyperlink to a resource that includes the required
251 | information.
252 |
253 | 3. If requested by the Licensor, You must remove any of the
254 | information required by Section 3(a)(1)(A) to the extent
255 | reasonably practicable.
256 |
257 | 4. If You Share Adapted Material You produce, the Adapter's
258 | License You apply must not prevent recipients of the Adapted
259 | Material from complying with this Public License.
260 |
261 |
262 | Section 4 -- Sui Generis Database Rights.
263 |
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 |
267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 | to extract, reuse, reproduce, and Share all or a substantial
269 | portion of the contents of the database;
270 |
271 | b. if You include all or a substantial portion of the database
272 | contents in a database in which You have Sui Generis Database
273 | Rights, then the database in which You have Sui Generis Database
274 | Rights (but not its individual contents) is Adapted Material; and
275 |
276 | c. You must comply with the conditions in Section 3(a) if You Share
277 | all or a substantial portion of the contents of the database.
278 |
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 |
283 |
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 |
286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 |
297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 |
307 | c. The disclaimer of warranties and limitation of liability provided
308 | above shall be interpreted in a manner that, to the extent
309 | possible, most closely approximates an absolute disclaimer and
310 | waiver of all liability.
311 |
312 |
313 | Section 6 -- Term and Termination.
314 |
315 | a. This Public License applies for the term of the Copyright and
316 | Similar Rights licensed here. However, if You fail to comply with
317 | this Public License, then Your rights under this Public License
318 | terminate automatically.
319 |
320 | b. Where Your right to use the Licensed Material has terminated under
321 | Section 6(a), it reinstates:
322 |
323 | 1. automatically as of the date the violation is cured, provided
324 | it is cured within 30 days of Your discovery of the
325 | violation; or
326 |
327 | 2. upon express reinstatement by the Licensor.
328 |
329 | For the avoidance of doubt, this Section 6(b) does not affect any
330 | right the Licensor may have to seek remedies for Your violations
331 | of this Public License.
332 |
333 | c. For the avoidance of doubt, the Licensor may also offer the
334 | Licensed Material under separate terms or conditions or stop
335 | distributing the Licensed Material at any time; however, doing so
336 | will not terminate this Public License.
337 |
338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 | License.
340 |
341 |
342 | Section 7 -- Other Terms and Conditions.
343 |
344 | a. The Licensor shall not be bound by any additional or different
345 | terms or conditions communicated by You unless expressly agreed.
346 |
347 | b. Any arrangements, understandings, or agreements regarding the
348 | Licensed Material not stated herein are separate from and
349 | independent of the terms and conditions of this Public License.
350 |
351 |
352 | Section 8 -- Interpretation.
353 |
354 | a. For the avoidance of doubt, this Public License does not, and
355 | shall not be interpreted to, reduce, limit, restrict, or impose
356 | conditions on any use of the Licensed Material that could lawfully
357 | be made without permission under this Public License.
358 |
359 | b. To the extent possible, if any provision of this Public License is
360 | deemed unenforceable, it shall be automatically reformed to the
361 | minimum extent necessary to make it enforceable. If the provision
362 | cannot be reformed, it shall be severed from this Public License
363 | without affecting the enforceability of the remaining terms and
364 | conditions.
365 |
366 | c. No term or condition of this Public License will be waived and no
367 | failure to comply consented to unless expressly agreed to by the
368 | Licensor.
369 |
370 | d. Nothing in this Public License constitutes or may be interpreted
371 | as a limitation upon, or waiver of, any privileges and immunities
372 | that apply to the Licensor or You, including from the legal
373 | processes of any jurisdiction or authority.
374 |
375 |
376 | =======================================================================
377 |
378 | Creative Commons is not a party to its public
379 | licenses. Notwithstanding, Creative Commons may elect to apply one of
380 | its public licenses to material it publishes and in those instances
381 | will be considered the “Licensor.” The text of the Creative Commons
382 | public licenses is dedicated to the public domain under the CC0 Public
383 | Domain Dedication. Except for the limited purpose of indicating that
384 | material is shared under a Creative Commons public license or as
385 | otherwise permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the
393 | public licenses.
394 |
395 | Creative Commons may be contacted at creativecommons.org.
396 |
397 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation: every target is forwarded
2 | # to sphinx-build's "make mode" (-M), building SOURCEDIR into BUILDDIR.
3 |
4 | # You can set these variables from the command line; SPHINXOPTS and
5 | # SPHINXBUILD (assigned with ?=) can also come from the environment.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = content
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [Reproducible research - Preparing code to be usable by you and others in the future](https://coderefinery.github.io/reproducible-research/)
2 |
3 | - [Credit and license](https://coderefinery.github.io/reproducible-research/license/)
4 |
--------------------------------------------------------------------------------
/content/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = "Reproducible research"
21 | copyright = "CodeRefinery contributors"
22 | author = "CodeRefinery contributors"
23 | github_user = "coderefinery"
24 | github_repo_name = "reproducible-research" # auto-detected from dirname if blank
25 | github_version = "main" # branch name; compared with GITHUB_REF below to decide whether the Plausible script is loaded
26 | conf_py_path = "/content/" # with leading and trailing slash
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 | # githubpages just adds a .nojekyll file
35 | "sphinx.ext.githubpages",
36 | "sphinx_lesson",
37 | # remove once sphinx_rtd_theme is updated for contrast and accessibility:
38 | "sphinx_rtd_theme_ext_color_contrast",
39 | "sphinx_coderefinery_branding",
40 | ]
41 |
42 | nb_execution_mode = "cache" # NOTE(review): presumably a MyST-NB option that caches notebook execution - confirm
43 |
44 | # Add any paths that contain templates here, relative to this directory.
45 | # templates_path = ['_templates']
46 |
47 | # List of patterns, relative to source directory, that match files and
48 | # directories to ignore when looking for source files.
49 | # This pattern also affects html_static_path and html_extra_path.
50 | exclude_patterns = [
51 | "examples",
52 | "README*",
53 | "_build",
54 | "Thumbs.db",
55 | ".DS_Store",
56 | "jupyter_execute",
57 | "*venv*",
58 | "img/README.md",
59 | ]
60 |
61 |
62 | # -- Options for HTML output -------------------------------------------------
63 |
64 | # The theme to use for HTML and HTML Help pages. See the documentation for
65 | # a list of builtin themes.
66 | #
67 | html_theme = "sphinx_rtd_theme"
68 |
69 | # Add any paths that contain custom static files (such as style sheets) here,
70 | # relative to this directory. They are copied after the builtin static files,
71 | # so a file named "default.css" will overwrite the builtin "default.css".
72 | #html_static_path = ['css']
73 |
74 |
75 | # HTML context:
76 | from os.path import basename, dirname, realpath
77 |
78 | html_context = {  # GitHub repository details assembled from the project-information variables above
79 | "display_github": True,
80 | "github_user": github_user,
81 | # Auto-detect the directory name when github_repo_name is blank. This can
82 | # break, but is useful as a default.
83 | "github_repo": github_repo_name or basename(dirname(realpath(__file__))),
84 | "github_version": github_version,
85 | "conf_py_path": conf_py_path,
86 | }
87 |
88 | import os  # NOTE(review): os is already imported near the top of this file; this re-import is redundant
89 | if os.environ.get('GITHUB_REF', '') == 'refs/heads/'+github_version:  # true only when GITHUB_REF names the github_version branch
90 | html_js_files = [  # presumably the Plausible analytics script; only set when the condition above holds
91 | ('https://plausible.cs.aalto.fi/js/script.js', {"data-domain": "coderefinery.github.io", "defer": "defer"}),
92 | ]
93 |
--------------------------------------------------------------------------------
/content/dependencies.md:
--------------------------------------------------------------------------------
1 | # Recording dependencies
2 |
3 | ```{objectives}
4 | - Understand what dependency management tools can be useful for
5 | - Discuss environment/requirements files in the context of reusability and
6 | reproducibility
7 | ```
8 |
9 | ```{questions}
10 | - How can we communicate different versions of software dependencies?
11 | ```
12 |
13 | ```{instructor-note}
14 | - 10 min teaching
15 | - 10 min demo
16 | ```
17 |
18 | Our codes often depend on other codes that in turn depend on other codes ...
19 |
20 | - **Reproducibility**: We can version-control our code with Git but how should we version-control dependencies?
21 | How can we capture and communicate dependencies?
22 | - **Dependency hell**: Different codes on the same environment can have conflicting dependencies.
23 |
24 | ```{figure} img/dependency.png
25 | :alt: An image showing blocks (=codes) depending on each other for stability
26 | :width: 60%
27 |
28 | From [xkcd - dependency](https://xkcd.com/2347/). Another image that might be familiar to some of you working with Python can be found on [xkcd - superfund](https://xkcd.com/1987/).
29 | ```
30 |
31 | ````{discussion} Kitchen analogy
32 | - Software <-> recipe
33 | - Data <-> ingredients
34 | - Libraries <-> pots/tools
35 |
36 | ```{figure} img/kitchen/recipe.png
37 | :alt: Cooking recipe in an unfamiliar language
38 | :width: 50%
39 |
40 | Cooking recipe in an unfamiliar language [Midjourney, CC-BY-NC 4.0]
41 | ```
42 |
43 | ```{figure} img/kitchen/libraries.png
44 | :alt: Kitchen with few open cooking books
45 | :width: 50%
46 |
47 | When we create recipes, we often use tools created by others (libraries) [Midjourney, CC-BY-NC 4.0]
48 | ```
49 | ````
50 |
51 | ---
52 |
53 | ## Dependency and environment management
54 |
55 | **Conda, Anaconda, pip, virtualenv, Pipenv, pyenv, Poetry, requirements.txt,
56 | environment.yml, renv**, ..., these tools try to solve the following problems:
57 |
58 | - **Defining a specific set of dependencies**
59 | - **Installing those dependencies** mostly automatically
60 | - **Recording the versions** for all dependencies
61 | - **Isolate environments**
62 | - On your computer for projects so they can use different software
63 | - Isolate environments on computers with many users (and allow self-installations)
64 | - Using **different package versions** per project (also e.g. Python/R versions)
65 | - Provide tools and services to **share packages**
66 |
67 | Isolated environments are also useful because they help you make sure
68 | that you know your dependencies!
69 |
70 | **If things go wrong, you can delete and re-create** - much better
71 | than debugging. The more often you re-create your environment, the
72 | more reproducible it is.
73 |
74 | ---
75 |
76 | ## Demo
77 |
78 | ``````{challenge} Dependencies-1: Time-capsule of dependencies
79 | Situation: 5 students (A, B, C, D, E) wrote a code that depends on a couple of libraries.
80 | They uploaded their projects to GitHub. We now travel 3 years into the future
81 | and find their GitHub repositories and try to re-run their code before adapting
82 | it.
83 |
84 | Answer in the collaborative document:
85 |
86 | - Which version do you expect to be easiest to re-run? Why?
87 | - What problems do you anticipate in each solution?
88 |
89 | `````{tabs}
90 | ````{group-tab} Conda
91 | **A**:
92 | You find a couple of library imports across the code but that's it.
93 |
94 | **B**:
95 | The README file lists which libraries were used but does not mention
96 | any versions.
97 |
98 | **C**:
99 | You find a `environment.yml` file with:
100 | ```
101 | name: student-project
102 | channels:
103 | - conda-forge
104 | dependencies:
105 | - scipy
106 | - numpy
107 | - sympy
108 | - click
109 | - python
110 | - pip
111 | - pip:
112 | - git+https://github.com/someuser/someproject.git@master
113 | - git+https://github.com/anotheruser/anotherproject.git@master
114 | ```
115 |
116 | **D**:
117 | You find a `environment.yml` file with:
118 | ```
119 | name: student-project
120 | channels:
121 | - conda-forge
122 | dependencies:
123 | - scipy=1.3.1
124 | - numpy=1.16.4
125 | - sympy=1.4
126 | - click=7.0
127 | - python=3.8
128 | - pip
129 | - pip:
130 | - git+https://github.com/someuser/someproject.git@d7b2c7e
131 | - git+https://github.com/anotheruser/anotherproject.git@sometag
132 | ```
133 |
134 | **E**:
135 | You find a `environment.yml` file with:
136 | ```
137 | name: student-project
138 | channels:
139 | - conda-forge
140 | dependencies:
141 | - scipy=1.3.1
142 | - numpy=1.16.4
143 | - sympy=1.4
144 | - click=7.0
145 | - python=3.8
146 | - someproject=1.2.3
147 | - anotherproject=2.3.4
148 | ```
149 | ````
150 |
151 | ````{group-tab} Python virtualenv
152 | **A**:
153 | You find a couple of library imports across the code but that's it.
154 |
155 | **B**:
156 | The README file lists which libraries were used but does not mention
157 | any versions.
158 |
159 | **C**:
160 | You find a `requirements.txt` file with:
161 | ```
162 | scipy
163 | numpy
164 | sympy
165 | click
166 | python
167 | git+https://github.com/someuser/someproject.git@master
168 | git+https://github.com/anotheruser/anotherproject.git@master
169 | ```
170 |
171 | **D**:
172 | You find a `requirements.txt` file with:
173 | ```
174 | scipy==1.3.1
175 | numpy==1.16.4
176 | sympy==1.4
177 | click==7.0
178 | python==3.8
179 | git+https://github.com/someuser/someproject.git@d7b2c7e
180 | git+https://github.com/anotheruser/anotherproject.git@sometag
181 | ```
182 |
183 | **E**:
184 | You find a `requirements.txt` file with:
185 | ```
186 | scipy==1.3.1
187 | numpy==1.16.4
188 | sympy==1.4
189 | click==7.0
190 | python==3.8
191 | someproject==1.2.3
192 | anotherproject==2.3.4
193 | ```
194 | ````
195 |
196 | ````{group-tab} R
197 | **A**:
198 | You find a couple of `library()` or `require()` calls across the code but that's it.
199 |
200 | **B**:
201 | The README file lists which libraries were used but does not mention
202 | any versions.
203 |
204 | **C**:
205 | You find a [DESCRIPTION file](https://r-pkgs.org/description.html) which contains:
206 | ```
207 | Imports:
208 | dplyr,
209 | tidyr
210 | ```
211 | In addition you find these:
212 | ```r
213 | remotes::install_github("someuser/someproject@master")
214 | remotes::install_github("anotheruser/anotherproject@master")
215 | ```
216 |
217 | **D**:
218 | You find a [DESCRIPTION file](https://r-pkgs.org/description.html) which contains:
219 | ```
220 | Imports:
221 | dplyr (== 1.0.0),
222 | tidyr (== 1.1.0)
223 | ```
224 | In addition you find these:
225 | ```r
226 | remotes::install_github("someuser/someproject@d7b2c7e")
227 | remotes::install_github("anotheruser/anotherproject@sometag")
228 | ```
229 |
230 | **E**:
231 | You find a [DESCRIPTION file](https://r-pkgs.org/description.html) which contains:
232 | ```
233 | Imports:
234 | dplyr (== 1.0.0),
235 | tidyr (== 1.1.0),
236 | someproject (== 1.2.3),
237 | anotherproject (== 2.3.4)
238 | ```
239 | ````
240 |
241 | ````{group-tab} Matlab
242 | Can you please contribute an example?
243 | ````
244 | `````
245 |
246 | `````{solution}
247 | **A**: It will be tedious to collect the dependencies one by one. And after
248 | the tedious process you will still not know which versions they have used.
249 |
250 | **B**: There is no standard file to look for and look at, so it might
251 | become very difficult to create the software environment required to
252 | run the software. At least we know the list of libraries, but we don't
253 | know the versions.
254 |
255 | **C**: Having a standard file listing dependencies is definitely better
256 | than nothing. However, if the versions are not specified, you or someone
257 | else might run into problems with dependencies, deprecated features,
258 | changes in package APIs, etc.
259 |
260 | **D** and **E**: In both these cases exact versions of all dependencies are
261 | specified and one can recreate the software environment required for the
262 | project. One problem with the dependencies that come from GitHub is that
263 | they might have disappeared (what if their authors deleted these
264 | repositories?).
265 |
266 | **E** is slightly preferable because version numbers are easier to understand than Git
267 | commit hashes or Git tags.
268 | `````
269 | ``````
270 |
271 | ``````{challenge} Dependencies-2: Create a time-capsule for the future
272 | Now we will demo creating our own time-capsule and share it with the future
273 | world. If we asked you now which dependencies your project is using, what would
274 | you answer? How would you find out? And how would you communicate this
275 | information?
276 |
277 | `````{tabs}
278 | ````{group-tab} Conda
279 | We start from an existing conda environment. Try this either with your own project or inside the "coderefinery" conda
280 | environment. For demonstration purposes, you can also create an environment with:
281 |
282 | ```console
283 | $ conda env create -f myenv.yml
284 | ```
285 | Where the file `myenv.yml` could have some python libraries with unspecified versions:
286 |
287 | ```
288 | name: myenv
289 | channels:
290 | - conda-forge
291 | - defaults
292 | dependencies:
293 | - python=3.10
294 | - numpy
295 | - pandas
296 | - seaborn
297 | ```
298 |
299 | After creating the environment we can activate it with
300 |
301 | ```
302 | conda activate myenv
303 | ```
304 |
305 | Now we can freeze the environment into a new YAML file with:
306 |
307 | ```console
308 | $ conda env export > environment.yml
309 | ```
310 |
311 | Have a look at the generated file and discuss what you see.
312 |
313 | ```{solution} Some things to note
314 | - Can you find all packages you installed directly? Which versions were installed?
315 | - What other packages were installed? -> Dependencies of dependencies
316 | - Besides the version you can also see the build channel
317 | - Sometimes the build includes an operating system or an architecture
318 | - Using this environment file might therefore not work/ not result in an identical setup on other computers
319 | ```
320 |
321 | In the future — or on a different computer — we can re-create this environment with:
322 |
323 | ```console
324 | $ conda env create -f environment.yml
325 | ```
326 | You may use `conda` or `mamba` interchangeably for this step; mamba may solve the dependencies a bit faster.
327 |
328 | What happens instead when you run the following command?
329 |
330 | ```console
331 | $ conda env export --from-history > environment_fromhistory.yml
332 | ```
333 |
334 | ```{solution} Some things to note
335 | - Everything is listed as you installed it; with or without specified versions
336 | - Using this environment file a few days/weeks later will likely not result in the same environment
337 | - This can be a good starting point for a reproducible environment as you may add your current version numbers to it (check for example with `conda list | grep "packagename"`)
338 | ```
339 |
340 | In daily use you may not always use an environment.yml file to create the full environment, but create a base environment and then add new packages with `conda install packagename` as you go. Also those packages will be listed in the environment files created with either of the approaches above.
341 |
342 | More information: <https://docs.conda.io/en/latest/> and <https://github.com/mamba-org/mamba>
343 | ````
344 |
345 | ````{group-tab} Python virtualenv
346 | Try this in your own project:
347 | ```console
348 | $ pip freeze > requirements.txt
349 | ```
350 |
351 | Have a look at the generated file and discuss what you see.
352 |
353 | In future you can re-create this environment with:
354 | ```console
355 | $ pip install -r requirements.txt
356 | ```
357 |
358 | More information: <https://docs.python.org/3/tutorial/venv.html>
359 | ````
360 |
361 | ````{group-tab} R
362 | This example uses renv.
363 |
364 | Try to "save" and "load" the state of your project library using
365 | `renv::snapshot()` and `renv::restore()`.
366 | See also: <https://rstudio.github.io/renv/articles/renv.html>
367 |
368 | More information: <https://rstudio.github.io/renv/>
369 | ````
370 |
371 | ````{group-tab} Matlab
372 | Can you please contribute an example?
373 | ````
374 | `````
375 | ``````
376 |
377 | ```{keypoints}
378 | - Recording dependencies with versions can make it easier for the next person to execute your code
379 | - There are many tools to record dependencies and separate environments
380 | ```
381 |
--------------------------------------------------------------------------------
/content/environments.md:
--------------------------------------------------------------------------------
1 | # Recording environments
2 |
3 | ```{objectives}
4 | - Understand what containers are and what they are useful for
5 | - Discuss container definition files in the context of reusability and
6 | reproducibility
7 | ```
8 |
9 | ```{instructor-note}
10 | - 10 min teaching/discussion
11 | - 10 min demo
12 | ```
13 |
14 | ## What is a container?
15 |
16 | Imagine if you didn't have to install things yourself, but instead you could
17 | get a computer with the exact software for a task pre-installed? Containers
18 | effectively do that, with various advantages and disadvantages. They are
19 | **like an entire operating system with software installed, all in one file**.
20 |
21 | ```{figure} img/docker_meme.jpg
22 | :alt: He said, then we will ship your machine. And that's how Docker was born.
23 | :width: 60%
24 |
25 | From [reddit](https://www.reddit.com/r/ProgrammerHumor/comments/cw58z7/it_works_on_my_machine/).
26 | ```
27 |
28 | ``````{discussion} Kitchen analogy
29 | - Our codes/scripts <-> cooking recipes
30 | - Container definition files <-> like a blueprint to build a kitchen with all
31 | utensils in which the recipe can be prepared.
32 | - Container images <-> showroom kitchens
33 | - Containers <-> A real connected kitchen
34 |
35 | Just for fun: which operating systems do the following example kitchens represent?
36 | `````{tabs}
37 | ````{tab} 1
38 | ```{figure} img/kitchen/macos.png
39 | :alt: Generated image of a kitchen
40 | :width: 50%
41 |
42 | [Midjourney, CC-BY-NC 4.0]
43 | ```
44 | ````
45 |
46 | ````{tab} 2
47 | ```{figure} img/kitchen/windows.png
48 | :alt: Generated image of a kitchen
49 | :width: 50%
50 |
51 | [Midjourney, CC-BY-NC 4.0]
52 | ```
53 | ````
54 |
55 | ````{tab} 3
56 | ```{figure} img/kitchen/linux.png
57 | :alt: Generated image of a kitchen
58 | :width: 50%
59 |
60 | [Midjourney, CC-BY-NC 4.0]
61 | ```
62 | ````
63 | `````
64 | ``````
65 |
66 | ## From definition files to container images to containers
67 |
68 | - Containers can be built to bundle _all the necessary ingredients_ (data, code, environment, operating system).
69 | - A container image is like a piece of paper with the whole operating system on it. When you run it,
70 | a transparent sheet is placed on top to form a container. The container runs and writes only on
71 | that transparent sheet (and on any other mounts that have been layered on top). When you are done,
72 | the transparency is thrown away. This can be repeated as often as you want, and the base is always the same.
73 | - Definition files (e.g. Dockerfile or Singularity definition file) are text
74 | files that contain a series of instructions to build container images.
75 |
76 | ## You may have use for containers in different ways
77 |
78 | - **Installing a certain software is tricky**, or not supported for your operating system? - Check if an image is available and run the software from a container instead!
79 | - You want to make sure your colleagues are using the **same environment** for running your code? - Provide them an image of your container!
80 | - This does not work because they are using a different architecture than you do? - Provide a definition file for them to **build an image suitable for their computers**. This does not create exactly the same environment as yours, but in most cases it is similar enough.
81 |
82 | ## The container recipe
83 |
84 | Here is an example of a Singularity definition file ([reference](https://apptainer.org/docs/user/main/build_a_container.html#building-containers-from-apptainer-definition-files)):
85 |
86 | ```
87 | Bootstrap: docker
88 | From: ubuntu:24.04
89 |
90 | %post
91 | apt-get -y update
92 | apt-get -y install fortune cowsay lolcat
93 |
94 | %environment
95 | export LC_ALL=C
96 | export PATH=/usr/games:$PATH
97 |
98 | %runscript
99 | fortune | cowsay | lolcat
100 | ```
101 |
102 | Popular container implementations:
103 |
104 | - [Docker](https://www.docker.com/)
105 | - [Singularity](https://sylabs.io/docs/) (popular on high-performance computing systems)
106 | - [Apptainer](https://apptainer.org) (popular on high-performance computing systems, fork of Singularity)
107 | - [podman](https://podman.io/)
108 |
109 | They are to some extent interoperable:
110 |
111 | - podman is very close to Docker
112 | - Docker images can be converted to Singularity/Apptainer images
113 | - [Singularity Python](https://singularityhub.github.io/singularity-cli/) can convert Dockerfiles to Singularity definition files
114 |
115 | ---
116 |
117 | ## Pros and cons of containers
118 |
119 | Containers are popular for a reason - they solve a number of
120 | important problems:
121 |
122 | - Allow for seamlessly **moving workflows across different platforms**.
123 | - Can solve the **"works on my machine"** situation.
124 | - For software with many dependencies, in turn with its own dependencies,
125 | containers offer possibly the only way to preserve the
126 | computational experiment for **future reproducibility**.
127 | - A mechanism to "send the computer to the data" when the **dataset is too large** to transfer.
128 | - **Installing software into a file** instead of into your computer (removing
129 | a file is often easier than uninstalling software if you suddenly regret an
130 | installation)
131 |
132 | However, containers may also have some drawbacks:
133 |
134 | - Can be used to hide away software installation problems and thereby
135 | **discourage good software development practices**.
136 | - Instead of "works on my machine" problem: **"works only in this container"** problem?
137 | - They can be **difficult to modify**
138 | - Container **images can become large**
139 |
140 | ```{danger}
141 | Use only **official and trusted images**! Not all images can be trusted! There
142 | have been examples of contaminated images so investigate before using images
143 | blindly. Apply the same caution as when installing software packages from untrusted
144 | package repositories.
145 | ```
146 |
147 | ---
148 |
149 | ## Where can one share or find images?
150 |
151 | - [Docker Hub](https://hub.docker.com/)
152 | - [Quay](https://quay.io/)
153 | - [GitHub Container Registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry)
154 | - [GitLab Container Registry](https://docs.gitlab.com/ee/user/packages/container_registry/)
155 | - GitHub/GitLab release artifacts
156 | - [Zenodo](https://zenodo.org/)
157 |
158 | ---
159 |
160 | ## Exercises
161 |
162 | ``````{exercise} Containers-1: Time travel
163 | Scenario: A researcher has written and published their research code which
164 | requires a number of libraries and system dependencies. They ran their code
165 | on a Linux computer (Ubuntu). One very nice thing they did was to publish
166 | also a container image with all dependencies included, as well as the
167 | definition file (below) to create the container image.
168 |
169 | Now we travel 3 years into the future and want to reuse their work and adapt
170 | it for our data. The container registry where they uploaded the container
171 | image however no longer exists. But luckily we still have the definition file
172 | (below)! From this we should be able to create a new container image.
173 |
174 | - Can you anticipate problems using the definition file 3 years after its creation?
175 | Which possible problems can you point out?
176 | - Discuss possible take-aways for creating more reusable containers.
177 |
178 | `````{tabs}
179 | ````{tab} Python project using virtual environment
180 | ```{code-block}
181 | :linenos:
182 | Bootstrap: docker
183 | From: ubuntu:latest
184 |
185 | %post
186 | # Set environment variables
187 | export VIRTUAL_ENV=/app/venv
188 |
189 | # Install system dependencies and Python 3
190 | apt-get update && \
191 | apt-get install -y --no-install-recommends \
192 | gcc \
193 | libgomp1 \
194 | python3 \
195 | python3-venv \
196 | python3-distutils \
197 | python3-pip && \
198 | apt-get clean && \
199 | rm -rf /var/lib/apt/lists/*
200 |
201 | # Set up the virtual environment
202 | python3 -m venv $VIRTUAL_ENV
203 | . $VIRTUAL_ENV/bin/activate
204 |
205 | # Install Python libraries
206 | pip install --no-cache-dir --upgrade pip && \
207 | pip install --no-cache-dir -r /app/requirements.txt
208 |
209 | %files
210 | # Copy project files
211 | ./requirements.txt /app/requirements.txt
212 | ./app.py /app/app.py
213 | # Copy data
214 | /home/myself/data /app/data
215 | # Workaround to fix dependency on fancylib
216 | /home/myself/fancylib /usr/lib/fancylib
217 |
218 | %environment
219 | # Set the environment variables
220 | export LANG=C.UTF-8 LC_ALL=C.UTF-8
221 | export VIRTUAL_ENV=/app/venv
222 |
223 | %runscript
224 | # Activate the virtual environment
225 | . $VIRTUAL_ENV/bin/activate
226 | # Run the application
227 | python /app/app.py
228 | ```
229 |
230 | ```{solution}
231 | - Line 2: "ubuntu:latest" will mean something different 3 years in future.
232 | - Lines 11-12: The compiler gcc and the library libgomp1 will have evolved.
233 | - Line 30: The container uses requirements.txt to build the virtual environment but we don't see
234 | here what libraries the code depends on.
235 | - Line 33: Data is copied in from the hard disk of the person who created it. Hopefully we can find the data somewhere.
236 | - Line 35: The library fancylib has been built outside the container and copied in but we don't see here how it was done.
237 | - Python version will be different then and hopefully the code still runs then.
238 | - Singularity/Apptainer will have also evolved by then. Hopefully this definition file then still works.
239 | - No contact address to ask more questions about this file.
240 | - (Can you find more? Please contribute more points.)
241 | ```
242 | ````
243 |
244 | ````{tab} R project using renv
245 | Work in progress: Please contribute a corresponding example which
246 | demonstrates this in the context of R and renv.
247 | ````
248 | `````
249 | ``````
250 |
251 | ````{exercise} (optional) Containers-2: Installing the impossible.
252 |
253 | When you are missing privileges for installing certain software tools, containers can come in handy.
254 | Here we build a Singularity/Apptainer container for installing `cowsay` and `lolcat` Linux programs.
255 |
256 | 1. Make sure you have apptainer installed:
257 | ```console
258 | $ apptainer --version
259 | ```
260 |
261 | 2. Make sure you set the apptainer cache and temporary folders.
262 | ```console
263 | $ mkdir ./cache/
264 | $ mkdir ./temp/
265 | $ export APPTAINER_CACHEDIR="./cache/"
266 | $ export APPTAINER_TMPDIR="./temp/"
267 | ```
268 |
269 | 3. Save the definition file from "The container recipe" section above as `cowsay.def` and build the container from it.
270 | ```console
271 | $ apptainer build cowsay.sif cowsay.def
272 | ```
273 |
274 | 4. Let's test the container by entering into it with a shell terminal
275 | ```console
276 | $ apptainer shell cowsay.sif
277 | ```
278 |
279 | 5. We can verify the installation.
280 | ```console
281 | $ cowsay "Hello world!"|lolcat
282 | ```
283 |
284 | ````
285 |
286 | ````{exercise} (optional) Containers-3: Explore two really useful Docker images
287 | You can try the below if you have Docker installed. If you have
288 | Singularity/Apptainer and not Docker, the goal of the exercise can be to run
289 | the Docker containers through Singularity/Apptainer.
290 |
291 | 1. Run a specific version of *Rstudio*:
292 | ```console
293 | $ docker run --rm -p 8787:8787 -e PASSWORD=yourpasswordhere rocker/rstudio
294 | ```
295 |
296 | Then open your browser to [http://localhost:8787](http://localhost:8787)
297 | with login rstudio and password "yourpasswordhere" used in the previous
298 | command.
299 |
300 | If you want to try an older version you can check the tags at
301 | [https://hub.docker.com/r/rocker/rstudio/tags](https://hub.docker.com/r/rocker/rstudio/tags)
302 | and run for example:
303 | ```console
304 | $ docker run --rm -p 8787:8787 -e PASSWORD=yourpasswordhere rocker/rstudio:3.3
305 | ```
306 |
307 | 2. Run a specific version of *Anaconda3* from
308 | [https://hub.docker.com/r/continuumio/anaconda3](https://hub.docker.com/r/continuumio/anaconda3):
309 | ```console
310 | $ docker run -i -t continuumio/anaconda3 /bin/bash
311 | ```
312 | ````
313 |
314 | ## Resources for further learning
315 |
316 | - [Carpentries incubator lesson on Docker](https://carpentries-incubator.github.io/docker-introduction/)
317 | - [Carpentries incubator lesson on Singularity/Apptainer](https://carpentries-incubator.github.io/singularity-introduction/)
318 |
319 | ```{keypoints}
320 | - Containers can be helpful if complex setups are needed to run specific software
321 | - They can also be helpful for prototyping without "messing up" your own computing environment, or for running software that requires a different operating system than your own
322 | ```
323 |
--------------------------------------------------------------------------------
/content/exercises.md:
--------------------------------------------------------------------------------
1 | # List of exercises
2 |
3 | ## Full list
4 |
5 | This is a list of all exercises and solutions in this lesson, mainly
6 | as a reference for helpers and instructors. This list is
7 | automatically generated from all of the other pages in the lesson.
8 | Any single teaching event will probably cover only a subset of these,
9 | depending on their interests.
10 |
11 | ```{exerciselist}
12 | ```
13 |
--------------------------------------------------------------------------------
/content/guide.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: /guide/
4 | ---
5 |
6 | # Instructor guide
7 |
8 |
9 | ## Detailed day schedule
10 |
11 | Some example schedules for this lesson:
12 |
13 | 2024 edition plan (times in EET, Helsinki time), **no exercises**, just demos:
14 |
15 | - 09:50 - 10:00 Soft start and icebreaker question
16 | - Page: collaborative notes document
17 | - Give more space to the icebreaker and see what people are writing and talk about our own experiences
18 | - 10:00 - 10:03 Collab document intro
19 | - 10:03 - 10:05 Learning outcomes: https://coderefinery.github.io/reproducible-research/
20 | - 10:05 - 10:10 Overview of CR and how it all fits together
21 | - Page: https://coderefinery.github.io/reproducible-research/intro
22 | - Learning outcomes from index
23 | - 10:10 - 10:20 Reproducible research, Motivation
24 | - Exercise in notes doc with the discussions in bottom of motivation page
25 | - Page: https://coderefinery.github.io/reproducible-research/motivation/
26 | - 10:20 - 10:30 Organizing your projects
27 | - Copy the discussion on the notes and if we have time we can highlight some answers
28 | - Page: https://coderefinery.github.io/reproducible-research/organizing-projects/
29 | - 10:30 - 10:35 ask in collab document and discuss
30 | - https://coderefinery.github.io/reproducible-research/organizing-projects/#discussion-on-reproducibility
31 | - Are you using version control for academic papers?
32 | - ...
33 | - ...
34 | - How do you handle collaborative issues e.g. conflicting changes?
35 | - ...
36 | - ...
37 | - 10:35 - 10:55 Recording computational steps
38 | - Page: https://coderefinery.github.io/reproducible-research/workflow-management/
39 | - 10:55 - 11:05 Real break
40 | - 11:05 - 11:25 Recording dependencies
41 | - https://coderefinery.github.io/reproducible-research/dependencies/#exercises
42 | - ask first one in collab doc and discuss on stream
43 | - show difference between created env from env file vs exported env file on stream
44 | - 11:25 - 11:30 ask in collaborative document
45 | - Are you using any dependency and/or environment management tool in your work?
46 | - No: o
47 | - why not?
48 | - ..
49 | - ..
50 | - Yes: o
51 | - which?
52 | - ..
53 | - ..
54 | - Have you heard about or been in contact with containers (docker, singularity, podman) in your work? How did you come across them?
55 | - No: o
56 | - Yes:
57 | - ..
58 | - ..
59 | - ..
60 | - 11:30 - 11:50 Recording environments
61 | - The first contact with containers is often: Take this and run this command and then when you need to share/build.
62 | - Discuss setup issues, permissions if docker wants root, bandwidth, etc
63 | - Pros and cons of containers
64 | - Demo of two pre-made containers e.g. expand the R studio optional exercise?
65 | - 11:50 - 12:00 Wrapup
66 | - where to go from here: idea would be to give it more practical focus: what to do with these tools? Project level reproducibility. Time-scales of what changes (short time changes of code, long time years changes of OS-s, libraries).
67 | - Bring your code session advertisement
68 | - Material + recording available
69 | - 12:00 - long break starts
70 |
71 | This is the planned schedule for the workshop in September 2023 (2 hours and 5 minutes including 10 min break) ; note that for this workshop, sharing code and data was moved to social coding lesson:
72 |
73 | - 08:50 - 09:00 Soft start and icebreaker question
74 | - 09:00 - 09:10 Overview of CR and how it all fits together
75 | - 09:10 - 09:20 Reproducible research, [Motivation](https://coderefinery.github.io/reproducible-research/motivation/)
76 | - 09:20 - 09:27 [Organizing your projects](https://coderefinery.github.io/reproducible-research/organizing-projects/)
77 | - 09:27 - 09:35 [Recording computational steps](https://coderefinery.github.io/reproducible-research/workflow-management/) - discussion
78 | - 09:35 - 10:00 Snakemake exercise (25 min)
79 | - 10:00 - 10:10 Break
80 | - 10:10 - 10:15 Summary of workflows and the exercise
81 | - 10:15 - 10:30 [Recording dependencies](https://coderefinery.github.io/reproducible-research/dependencies/)
82 | - 10:30 - 10:40 [Recording environments](https://coderefinery.github.io/reproducible-research/environments/)
83 | - 10:40 - 11:00 Container-1 exercise (20 min)
84 | - 11:00 - 11:05 Wrap-up
85 |
86 | This was the schedule at the workshop in March 2023 (2 hours and 15 minutes including 2x 10 min break):
87 |
88 | - 08:50 - 09:00 Soft start and icebreaker question
89 | - 09:00 - 09:10 Interview with an invited guest
90 | - 09:10 - 09:20 [Motivation](https://coderefinery.github.io/reproducible-research/motivation/)
91 | - 09:20 - 09:30 [Organizing your projects](https://coderefinery.github.io/reproducible-research/organizing-projects/)
92 | - 09:30 - 10:00 [Recording dependencies](https://coderefinery.github.io/reproducible-research/dependencies/)
93 | - discussion (5 min)
94 | - exercise (20 min)
95 | - discussion (5 min)
96 | - 10:00 - 10:10 Break
97 | - 10:10 - 10:40 [Recording computational steps](https://coderefinery.github.io/reproducible-research/workflow-management/)
98 | - discussion (5 min)
99 | - exercise (20 min)
100 | - discussion (5 min)
101 | - 10:40 - 10:50 [Recording environments](https://coderefinery.github.io/reproducible-research/environments/)
102 | - an exercise exists but is typically not done as part of a standard workshop
103 | - 10:50 - 11:05 [Sharing code and data](https://coderefinery.github.io/reproducible-research/sharing/)
104 | - [demo (15 min)](https://coderefinery.github.io/reproducible-research/sharing/#connecting-repositories-to-zenodo)
105 | - 11:05 - 11:15 Break
106 |
107 |
108 | ## Why we teach this lesson
109 |
110 | Reproducibility in research is something that publishers, funding agencies, universities,
111 | research leaders and the general public worry about, and much is being written about it.
112 | It is also something that researchers care deeply about - this lesson is typically one of the
113 | most popular lessons in the pre-workshop survey.
114 |
115 | Even though most PhD students, postdocs and researchers (i.e. typical workshop participants)
116 | know about the importance of reproducibility in research, they often lack both a general
117 | overview of what different aspects there are to reproducibility, and the knowledge of
118 | specific tools that can be used for improving reproducibility.
119 |
120 | Many participants may not adhere to good practices when organizing their projects,
121 | and the "Organizing your projects" episode is meant to encourage participants to
122 | structure their projects better. This may be obvious to some participants but it
123 | doesn't hurt to preach to the choir.
124 |
125 | Even though many participants know that code can have many dependencies (e.g. they
126 | may have experienced difficulties in getting other people's code to run), they
127 | often don't know or use good practices when it comes to recording dependencies.
128 | Most participants also don't use isolated environments for different projects and
129 | don't know why that can be important.
130 | The episode "Recording dependencies" tries to convey the importance of recording
131 | dependencies accurately for your projects, and shows how tools like conda can be
132 | used both as a package and software environment manager.
133 |
134 | Many participants have heard about containers and find them interesting, but
135 | lack an understanding of how they work or how they can be used. The episode
136 | "Recording environments" introduces the concept of containers, and the optional
137 | episode "Creating and sharing a container image" goes into details.
138 |
139 | Many participants use complicated series of computational steps in their research
140 | without realizing that this work falls into the category of "scientific workflows",
141 | and that there actually exist tools that help make such workflows reproducible.
142 | The episode "Recording computational steps" introduces the concept of scientific
143 | workflows, discusses various ways of managing workflows with varying degrees of
144 | reproducibility, and shows how tools like Snakemake can be used to
145 | both simplify workflows and make them more reproducible.
146 |
147 |
148 | ## How to teach this lesson
149 |
150 | ### How to start
151 |
152 | Everyone knows that scientific results need to be reproducible, but not everyone is using
153 | appropriate tools to ensure this. Here we're going to get to know tools which help with
154 | preserving the provenance of data and reproducibility on different levels, ranging from
155 | workflow automation to software environment (containers).
156 |
157 |
158 | ### Focus on concepts, and when to use which tool
159 |
160 | Try to explain better what the different tools are useful for, but don't go
161 | into details. In this lesson we are not trying to gain expertise in the
162 | various tools and master the details but rather we want to give an overview and
163 | show that many tools exist and try to give participants the right feel for which
164 | set of tools to approach for which type of problem.
165 |
166 |
167 | ## Typical pitfalls
168 |
169 | ### Indentation in Snakefiles
170 |
171 | - the body of a rule and the body of an input keyword need to be indented, but the number of spaces doesn't matter
172 | This works:
173 | ```python
174 | rule all:
175 | input:
176 | expand('statistics/{book}.data', book=DATA),
177 | expand('plot/{book}.png', book=DATA)
178 | ```
179 | but this doesn't work:
180 | ```python
181 | rule all:
182 | input:
183 | expand('statistics/{book}.data', book=DATA),
184 | expand('plot/{book}.png', book=DATA)
185 | ```
186 | nor this:
187 | ```python
188 | rule all:
189 | input:
190 | expand('statistics/{book}.data', book=DATA),
191 | expand('plot/{book}.png', book=DATA)
192 | ```
193 |
194 |
195 | ## Field reports
196 |
197 | ### 2022 September
198 |
199 | We used the strategy "absolutely minimal introductions, most time
200 | for exercise". Overall, it was probably the right thing to do since
201 | there is so little time and so much to cover.
202 |
203 | There wasn't enough time for the conda exercise (we could give only 7
204 | minutes), but also I wonder how engaging it is. We should look at how
205 | to optimize the start of that episode.
206 |
207 | The Snakemake episode went reasonably well. Our goal was 5 minutes
208 | intro, long exercise, 5 minutes outro. The intro was actually a bit
209 | longer, and there was the comment that we didn't really explain what
210 | Snakemake was before it started (though we tried). The start of this
211 | episode should get particular focus in the future, since this is the
212 | main exercise of the day.
213 |
214 |
--------------------------------------------------------------------------------
/content/img/dependency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/dependency.png
--------------------------------------------------------------------------------
/content/img/docker_meme.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/docker_meme.jpg
--------------------------------------------------------------------------------
/content/img/kitchen/busy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/busy.png
--------------------------------------------------------------------------------
/content/img/kitchen/libraries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/libraries.png
--------------------------------------------------------------------------------
/content/img/kitchen/linux.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/linux.png
--------------------------------------------------------------------------------
/content/img/kitchen/macos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/macos.png
--------------------------------------------------------------------------------
/content/img/kitchen/recipe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/recipe.png
--------------------------------------------------------------------------------
/content/img/kitchen/windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/kitchen/windows.png
--------------------------------------------------------------------------------
/content/img/python_environment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/python_environment.png
--------------------------------------------------------------------------------
/content/img/repro-pyramid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/repro-pyramid.png
--------------------------------------------------------------------------------
/content/img/reproducibility_levels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducibility_levels.png
--------------------------------------------------------------------------------
/content/img/reproducibility_nature.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducibility_nature.jpg
--------------------------------------------------------------------------------
/content/img/reproducible-research.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducible-research.jpg
--------------------------------------------------------------------------------
/content/img/reproducible_research_plus_lessons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/reproducible_research_plus_lessons.png
--------------------------------------------------------------------------------
/content/img/research_comic_phd.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/research_comic_phd.gif
--------------------------------------------------------------------------------
/content/img/snakemake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/snakemake.png
--------------------------------------------------------------------------------
/content/img/snakemake_dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/snakemake_dag.png
--------------------------------------------------------------------------------
/content/img/turing-way/8-fair-principles.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/turing-way/8-fair-principles.jpg
--------------------------------------------------------------------------------
/content/img/turing-way/reproducibility.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/turing-way/reproducibility.jpg
--------------------------------------------------------------------------------
/content/img/word-count/arrows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/arrows.png
--------------------------------------------------------------------------------
/content/img/word-count/gutenberg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/gutenberg.png
--------------------------------------------------------------------------------
/content/img/word-count/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/plot.png
--------------------------------------------------------------------------------
/content/img/word-count/statistics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coderefinery/reproducible-research/a765a14f07a4d713adb3d5b4f68093f336b2e703/content/img/word-count/statistics.png
--------------------------------------------------------------------------------
/content/index.rst:
--------------------------------------------------------------------------------
1 | .. _index:
2 |
3 | Reproducible research - Preparing code to be usable by you and others in the future
4 | ===================================================================================
5 |
6 | Have you ever spent days **trying to repeat the results from a few weeks or months
7 | ago**? Or you have to do paper revisions, but you just can't get the results to
8 | match up? It's unpleasant for both you and science.
9 |
10 | In this lesson we will explore different methods and tools for better
11 | reproducibility in research software and data. We will demonstrate how version
12 | control, workflows, containers, and package managers can be used to **record
13 | reproducible environments and computational steps** for our future selves and others.
14 |
15 |
16 | .. admonition:: Learning outcomes
17 |
18 | By the end of this lesson, learners should:
19 | - Be able to apply a well-organized directory structure for their project
20 | - Understand that code can have dependencies, and know how to document them
21 | - Be able to document computational steps, and have an idea when it can be useful
22 | - Know about use cases for containers
23 |
24 | .. prereq::
25 |
26 | You need to install
27 | `Git, Python, and Snakemake `__.
28 |
29 | If you wish to follow in the terminal and are new to the command line, we
30 | recorded a `short shell crash course `__.
31 |
32 |
33 | .. toctree::
34 | :maxdepth: 1
35 | :caption: Core episodes
36 |
37 | intro.md
38 | motivation.md
39 | organizing-projects.md
40 | workflow-management.md
41 | dependencies.md
42 | environments.md
43 | where-to-go.md
44 |
45 |
46 | .. toctree::
47 | :maxdepth: 1
48 | :caption: Reference
49 |
50 | Shell crash course
51 | exercises
52 | guide
53 |
54 |
55 | .. toctree::
56 | :maxdepth: 1
57 | :caption: About
58 |
59 | All lessons
60 | CodeRefinery
61 | Reusing
62 |
--------------------------------------------------------------------------------
/content/intro.md:
--------------------------------------------------------------------------------
1 | # Introduction - How it all connects
2 |
3 | ```{instructor-note}
4 | - 10 min teaching/discussion
5 | - 0 min exercises
6 | ```
7 |
8 | ---
9 |
10 | ```{figure} /img/turing-way/reproducibility.jpg
11 | :alt: "A person showing another person what steps to take to make their data research reproducible. There is a path with several steps- Here is my data - Here are my tools - Here is my code - Here are my results"
12 | :width: 100%
13 | ```
14 |
15 | [The Turing Way project illustration by Scriberia. Used under a CC-BY 4.0 licence. DOI: <https://doi.org/10.5281/zenodo.3332807>]
16 |
17 | ## This workshop is all about reproducibility - from a computational perspective
18 |
19 | This section connects the steps above to the CodeRefinery workshop lessons.
20 |
21 | **"Here is my code"**
22 |
23 | - **Version control with git** with focus on collaboration
24 | - **Social coding**: What can you do to get credit for your code and to allow reuse
25 | - **Documentation**: How to let others or future you know about your thoughts and how to use your code
26 | - **Jupyter Notebooks**: A tool to write and share executable notebooks and data visualization
27 | - **Automated testing**: Preventing yourself and others from breaking your functioning code
28 | - **Modular code development**: Making reusing parts of your code easier
29 |
30 | **"Here are my tools"**
31 |
32 | This lesson on general **Reproducibility**: Preparing code to be usable by you and others in the future
33 |
34 | This includes organizing your projects on your own computer and recording your computational steps, dependencies and computing environment.
35 |
36 | We will also mention a few tools and platforms for sharing data (**"Here is my data"**) and research outputs (**"Here are my results"**) in the **social coding** lesson, but they are not the focus of this workshop.
37 |
38 | ## Small steps towards reproducible research
39 |
40 | If this is all new to you, it may feel quite overwhelming.
41 |
42 | **Our recommendation:** Don't worry! Focus on "good enough" instead of perfect.
43 |
44 | To start, pick one topic that seems reasonable to implement for your current project. Something that helps YOU right now. This may be something you may have to implement due to requirements from your funders or the journal where you want to publish your research. Use their requirements as a checklist and find tools that feel comfortable for you.
45 |
46 | A great way to see what are the really important things to implement is to meet with a colleague, exchange codes and try to run each other's code. Every question your colleague has to ask you about your code gives a hint on where you may need to improve.
47 |
48 | Keeping a "log book" while working on your own code also serves as a great basis for making your code more reproducible. Can you use any of the tools and techniques learned in this workshop to share parts of your log book with others to help them run your code?
49 |
50 |
--------------------------------------------------------------------------------
/content/motivation.md:
--------------------------------------------------------------------------------
1 | # Motivation
2 |
3 | ```{objectives}
4 | - Understand why we are talking about reproducibility in this workshop
5 | ```
6 |
7 | ```{instructor-note}
8 | - 10 min teaching/discussion
9 | ```
10 |
11 | ```{figure} img/research_comic_phd.gif
12 | :alt: Research comic
13 | :width: 100%
14 | ```
15 |
16 | ```{admonition} A scary anecdote
17 | - A group of researchers obtain great results and submit their work to a high-profile journal.
18 | - Reviewers ask for new figures and additional analysis.
19 | - The researchers start working on revisions and generate modified figures, but find inconsistencies with old figures.
20 | - The researchers can't find some of the data they used to generate the original results, and
21 | can't figure out which parameters they used when running their analyses.
22 | - The manuscript is still languishing in the drawer ...
23 | ```
24 |
25 | ---
26 |
27 | ## Why talking about reproducible research?
28 |
29 | A 2016
30 | [survey](http://www.nature.com/news/1-500-scientists-lift-the-lid-on-reproducibility-1.19970)
31 | in Nature revealed that irreproducible experiments are a problem across all
32 | domains of science:
33 |
34 | ```{figure} img/reproducibility_nature.jpg
35 | :alt: reproducibility Nature
36 | :width: 100%
37 | ```
38 |
39 | This study is now a few years old but the highlighted problem did not get
40 | smaller.
41 |
42 | ---
43 |
44 | ## Levels of reproducibility
45 |
46 | A published article is like the top of a pyramid. It rests on multiple
47 | levels that each contribute to its reproducibility.
48 |
49 | ```{figure} img/repro-pyramid.png
50 | :alt: Reproducibility pyramid
51 | :width: 100%
52 | ```
53 |
54 | [Steeves, Vicky (2017) in "Reproducibility Librarianship," Collaborative Librarianship: Vol. 9: Iss. 2, Article 4.
55 | Available at: https://digitalcommons.du.edu/collaborativelibrarianship/vol9/iss2/4]
56 |
57 | This also means that you can think about it from the beginning of your research life cycle!
58 |
59 | ---
60 |
61 | ````{discussion} Discuss in collaborative document or with your team members
62 | ```markdown
63 | - What are your experiences re-running or adjusting a script or a figure you
64 | created a few months ago?
65 | - ...
66 | - ...
67 | - (share your experience)
68 |
69 | - Have you continued working from a previous student's
70 | script/code/plot/notebook? What were the biggest challenges?
71 | - ...
72 | - ...
73 | - (share your experience, but constructively)
74 | ````
75 |
76 | ```{keypoints}
77 | - Without reproducibility in scientific computing, everyone would have to start a new project / code from scratch
78 | ```
--------------------------------------------------------------------------------
/content/organizing-projects.md:
--------------------------------------------------------------------------------
1 | # Organizing your projects
2 |
3 | ```{objectives}
4 | - Understand how to organize research projects
5 | - Get an overview of tools for collaborative and version controlled manuscripts
6 | ```
7 |
8 | ```{instructor-note}
9 | - 10 min teaching incl. discussions
10 | ```
11 |
12 | One of the first steps to make your work reproducible is to organize your projects well.
13 | Let's go over some of the basic things which people have found to work (and not to work).
14 |
15 |
16 | ## Directory structure for projects
17 |
18 | - Project files in a **single directory**
19 | - **Different projects** should have **separate directories**
20 | - Use **consistent and informative directory structure**
21 | - Avoid spaces in directory and file names – use `-`, `_` or CamelCase instead (nicer for computers to handle).
22 | - If you need to separate public/private directories,
23 | - put them separately in public and private Git repositories, or
24 | - use `.gitignore` to exclude the private information from being tracked
25 | - Add a **README file** to describe the project and instructions on reproducing the results
26 | - If you want to use the **same code in multiple projects**, host it on GitHub (or similar) and clone it into each of your project directories.
27 |
28 | A project directory can look something like this:
29 |
30 | ```shell
31 | project_name/
32 | ├── README.md # overview of the project
33 | ├── data/ # data files used in the project
34 | │ ├── README.md # describes where data came from
35 | │ └── sub-directory/ # may contain subdirectories
36 | ├── processed_data/ # intermediate files from the analysis
37 | ├── manuscript/ # manuscript describing the results
38 | ├── results/ # results of the analysis (data, tables, figures)
39 | ├── src/ # contains all code in the project
40 | │ ├── LICENSE # license for your code
41 | │ ├── requirements.txt # software requirements and dependencies
42 | │ └── ...
43 | └── doc/ # documentation for your project
44 | ├── index.rst
45 | └── ...
46 | ```
47 |
48 | ---
49 |
50 | ## Tracking source code, data, and results
51 |
52 | - All code is version controlled and goes in the `src/` or `source/` directory
53 | - Include appropriate LICENSE file and information on software requirements
54 | - You can also version control data files or input files under `data/`
55 | - If data files are too large (or sensitive) to track, untrack them using `.gitignore`
56 | - Intermediate files from the analysis are kept in `processed_data/`
57 | - Consider using Git tags to mark specific versions of results (version
58 | submitted to a journal, dissertation version, poster version, etc.):
59 | ```console
60 | $ git tag -a thesis-submitted -m "this is the submitted version of my thesis"
61 | ```
62 |
63 | Check the [Git-intro lesson](https://coderefinery.github.io/git-intro/) for a reminder.
64 |
65 |
66 | ## Some tools and templates
67 |
68 | - [R devtools](https://devtools.r-lib.org/)
69 | - [Python cookiecutter template](https://github.com/Materials-Data-Science-and-Informatics/fair-python-cookiecutter)
70 | - [Reproducible research template](https://github.com/the-turing-way/reproducible-project-template) by the Turing Way
71 |
72 | More tools and templates in [Heidi Seibold's blog](https://heidiseibold.ck.page/posts/setting-up-a-fair-and-reproducible-project).
73 |
74 |
75 | ---
76 |
77 | ## Excursion: Reproducible publications
78 |
79 | ### Discussion on collaborative writing of academic papers
80 |
81 | ````{discussion} Discuss in the collaborative document:
82 |
83 | ```
84 | - How do you collaborate on writing academic papers?
85 | - ...
86 | - ...
87 | - (share your experience)
88 |
89 | - How do you handle collaborative issues e.g. conflicting changes?
90 | - ...
91 | - ...
92 | - (share your experience)
93 | ```
94 |
95 | ````
96 |
97 | -> Consider using **version control for manuscripts** as well. It may help you when keeping track of edits + if you sync it online then you don't have to worry about losing your work.
98 |
99 | Version control does not have to mean git, but could also mean using "tracking changes" in tools like Word, Google Docs, or Overleaf (find links below).
100 |
101 | ### Tools for collaborative writing and version control of manuscripts
102 |
103 | Git **can** be used to collaborate on manuscripts written in, e.g., LaTeX and other text-based formats. However, it might not always be the most convenient. Other tools exist to make the process more enjoyable:
104 |
105 | You can **collaboratively gather notes** using self-hosted or public instances of tools like [HedgeDoc](https://hedgedoc.org/) and [Etherpad](https://etherpad.org) or use online options like [HackMD](https://hackmd.io/), [Google Docs](https://docs.google.com) or the Microsoft online tools for easy and efficient collaboration.
106 |
107 | To format your notes into a manuscript, you can use Word-like online editors or tools like [Overleaf](https://www.overleaf.com) (LaTeX) or [Typst](https://typst.app/) (markdown). Most of the tools in this section even provide a git integration.
108 |
109 | [Manubot](https://github.com/manubot/rootstock) offers another way to turn your written word into a fully rendered manuscript using GitHub.
110 |
111 | ### Executable manuscripts
112 |
113 | You may also want to consider writing an executable manuscript using tools like [Jupyter Notebooks](https://jupyter.org) hosted on [Binder](https://mybinder.org), [Quarto](https://quarto.org/), [Authorea](https://www.authorea.com) or [Observable](https://observablehq.com/), to name a few.
114 |
115 | ### Resources on research compendia
116 |
117 | - [About research compendia at the Turing Way](https://book.the-turing-way.org/reproducible-research/compendia)
118 | - ["Research compendia"](http://inundata.org/talks/rstd19/#/): a set of good practices for reproducible data analysis in R, but much is transferable to other languages.
119 | - [rrtools](https://github.com/benmarwick/rrtools): instructions, templates, and functions for writing a reproducible article or report with R.
120 | - ...
121 |
122 | ```{keypoints}
123 | - An organized project directory structure helps with reproducibility.
124 | - Also think about version control for writing your academic manuscripts.
125 | ```
126 |
--------------------------------------------------------------------------------
/content/where-to-go.md:
--------------------------------------------------------------------------------
1 | # Where to go from here
2 |
3 | ```{objectives}
4 | - Understand when tools discussed in this episode can be useful
5 | ```
6 |
7 | ```{instructor-note}
8 | - 10 min teaching/discussion
9 | ```
10 |
11 | This episode presents a lot of different tools and opportunities for your research software project.
12 | However, you will not always need all of them. As with so many things, it again depends on your project.
13 |
14 | ## Workflow tools will maybe make sense in the future
15 |
16 | - In many cases, it is probably not needed
17 | - You will want to consider workflow tools:
18 | - When processing many files with many steps
19 | - Steps or files may change
20 | - Your main script, connecting your steps, gets very long
21 | - You are still collecting your input data
22 | - ...
23 |
24 | ## Containers seem amazing, but do I have use for them?
25 |
26 | - Maybe not yet, but knowing that you can ...
27 | - Run Linux tools on your Windows computer
28 | - Run different versions of the same software on your computer
29 | - Follow the "easy installation instructions" for an operating system that is not your own
30 | - Get a fully configured environment instead of only installing a tool
31 | - Share your setup and configurations with others
32 |
33 | ... can be very beneficial :)
34 |
35 | ## Important for every project
36 |
37 | - Clear file structure for your project
38 | - Record your workflow and write it down in a script file.
39 | - Create a dependency list and keep it updated, optimally in an environment file
40 | - At least consider the possibility that someone, maybe you, may want to reproduce your work
41 | - Can you do something (small) to make it easier?
42 | - If you have ideas, but no time: add an issue to your repository; maybe someone else wants to help.
43 |
44 | ## Further reading
45 |
46 | - [The Turing Way handbook to reproducible, ethical and collaborative data science](https://doi.org/10.5281/zenodo.3233853)
47 | - [Reproducible research policies and software/data management in scientific computing journals: a survey, discussion, and perspectives](https://doi.org/10.3389/fcomp.2024.1491823)
48 | - ...
49 |
50 | ```{seealso}
51 | Do you want to practice your reproducibility skills and get inspired by working with other people's code/data? Join a [ReproHack event](https://www.reprohack.org/event/)!
52 | ```
53 |
54 | ```{keypoints}
55 | - Not everything in this lesson might be useful right now, but it is good to know that these things exist if you ever get in a situation that would require such a solution.
56 | - Caring about reproducibility makes work easier for the next person working on the project - and that might be you in a few years!
57 | ```
58 |
--------------------------------------------------------------------------------
/content/workflow-management.md:
--------------------------------------------------------------------------------
1 | # Recording computational steps
2 |
3 | ```{objectives}
4 | - Understand why and when a workflow management tool can be useful
5 | ```
6 |
7 | ```{questions}
8 | - You have some steps that need to be run to do your work. How do you
9 | actually run them? Does it rely on your own memory and work, or is it
10 | reproducible? **How do you communicate the steps** for future you and others?
11 | - How can we create a reproducible workflow?
12 | ```
13 |
14 | ```{instructor-note}
15 | - 5 min teaching
16 | - 15 min demo
17 | ```
18 |
19 |
20 | ## Several steps from input data to result
21 |
22 | *The following material is partly derived from a [HPC Carpentry lesson](https://hpc-carpentry.github.io/hpc-python/).*
23 |
24 | In this episode, we will use an [example
25 | project](https://github.com/coderefinery/word-count) which finds most frequent
26 | words in books and plots the result from those statistics. In this example we
27 | wish to:
28 |
29 | 1. Analyze word frequencies using [code/count.py](https://github.com/coderefinery/word-count/blob/main/code/count.py)
30 | for 4 books
31 | (they are all in the [data](https://github.com/coderefinery/word-count/tree/main/data) directory).
32 | 2. Plot a histogram using [code/plot.py](https://github.com/coderefinery/word-count/blob/main/code/plot.py).
33 |
34 | ```{figure} img/word-count/arrows.png
35 | :alt: From book to word counts to plot
36 | :width: 100%
37 | ```
38 |
39 | Example (for one book only):
40 |
41 | ```console
42 | $ python code/count.py data/isles.txt > statistics/isles.data
43 | $ python code/plot.py --data-file statistics/isles.data --plot-file plot/isles.png
44 | ```
45 |
46 | Another way to analyze the data would be via a graphical user interface (GUI), where you can for example drag and drop files and click buttons to do the different processing steps.
47 |
48 | Both of the above (single line commands and simple graphical interfaces) are tricky in terms of reproducibility. We currently have two steps and 4 books. But **imagine having 4 steps and 500 books**.
49 | How could we deal with this?
50 |
51 | As a first idea we could express the workflow with a script. The repository includes such a script, called `run_all.sh`.
52 |
53 | We can run it with:
54 |
55 | ```console
56 | $ bash run_all.sh
57 | ```
58 |
59 | This is **imperative style**: we tell the script to run these
60 | steps in precisely this order, as we would run them manually, one after another.
61 |
62 | ````{discussion}
63 | - What are the advantages of this solution compared to processing all books one by one?
64 | - Is the scripted solution reproducible?
65 | - Imagine adding more steps to the analysis and imagine the steps being time consuming. What problems do you anticipate
66 | with a scripted solution?
67 |
68 | ```{solution}
69 | The advantage of this solution compared to processing one by one is more automation: We can generate all results with a single command.
70 | This is not only easier, it is also less error-prone.
71 |
72 | Yes, the scripted solution can be reproducible. But could you easily run it e.g. on a Windows computer?
73 |
74 | If we had more steps and once steps start to be time-consuming, a limitation of
75 | a scripted solution is that it tries to run all steps always. Rerunning only
76 | part of the steps or only part of the input data requires us to comment out or change lines in our script in between runs, which can again become tedious and error-prone.
77 | ```
78 | ````
79 |
80 | ---
81 |
82 | ## Workflow tools
83 |
84 | Sometimes it may be helpful to go from imperative to declarative style. Rather than saying "do this and then that" we describe dependencies between steps, but we let the tool figure out the order of steps to produce results.
85 |
86 | ### Example workflow tool: [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html)
87 |
88 | Snakemake (inspired by [GNU Make](https://www.gnu.org/software/make/)) is one of many tools to create reproducible and scalable data analysis workflows. Workflows are described via a human-readable, Python-based language.
89 | Snakemake workflows scale seamlessly from laptop to cluster or cloud, without the need to modify the workflow definition.
90 |
91 | ---
92 |
93 | ## A demo
94 |
95 | ````{prereq} Preparation
96 | The exercise (below) and pre-exercise discussion use the
97 | word-count repository
98 | (<https://github.com/coderefinery/word-count>) which we need to clone to work on it.
99 |
100 | If you want to do this exercise on your own, you can do so either on your own computer (follow the instructions in the bottom right panel on the [CodeRefinery installation instruction page](https://coderefinery.github.io/installation/)), or on the [Binder](https://mybinder.org/)
101 | cloud service:
102 |
103 | **On your own computer**:
104 | - Install the necessary tools
105 | - Activate the [coderefinery conda environment](https://coderefinery.github.io/installation/conda-environment/) with `conda activate coderefinery`.
106 | - Clone the word-count repository:
107 | ```console
108 | $ git clone https://github.com/coderefinery/word-count.git
109 | ```
110 |
111 | **On Binder**:
112 | We can also use the cloud service [Binder](https://mybinder.org/) to make sure
113 | we all have the same computing environment. This is interesting from a
114 | reproducible research point of view and it's explained further in the [Jupyter
115 | lesson](https://coderefinery.github.io/jupyter/sharing/) how this is even
116 | possible.
117 | - Go to <https://github.com/coderefinery/word-count> and click on the "launch binder" badge in the README.
118 | - Once it has started, you can open a new terminal by opening the **new** menu (top right) and selecting **Terminal**.
119 | ````
120 |
121 | ````{exercise} Workflow-1: Workflow solution using Snakemake
122 |
123 | ```{figure} img/snakemake.png
124 | :alt: How Snakemake works
125 | :width: 100%
126 | ```
127 |
128 | Somebody wrote a [Snakemake](https://snakemake.readthedocs.io) solution in the [Snakefile](https://github.com/coderefinery/word-count/blob/main/Snakefile):
129 | ```
130 | # a list of all the books we are analyzing
131 | DATA = glob_wildcards('data/{book}.txt').book
132 |
133 | rule all:
134 | input:
135 | expand('statistics/{book}.data', book=DATA),
136 | expand('plot/{book}.png', book=DATA)
137 |
138 | # count words in one of our books
139 | rule count_words:
140 | input:
141 | script='code/count.py',
142 | book='data/{file}.txt'
143 | output: 'statistics/{file}.data'
144 | shell: 'python {input.script} {input.book} > {output}'
145 |
146 | # create a plot for each book
147 | rule make_plot:
148 | input:
149 | script='code/plot.py',
150 | book='statistics/{file}.data'
151 | output: 'plot/{file}.png'
152 | shell: 'python {input.script} --data-file {input.book} --plot-file {output}'
153 |
154 | ```
155 |
156 | We can see that Snakemake uses **declarative style**:
157 | Snakefiles contain rules that relate targets (`output`) to dependencies
158 | (`input`) and commands (`shell`).
159 |
160 | Steps:
161 | 1. Clone the example to your computer: `$ git clone https://github.com/coderefinery/word-count.git`
162 | 2. Study the Snakefile. How does it know what to do first and what to do then?
163 | 3. Try to run it. Since version 5.11 one needs to specify the number of cores (or
164 | jobs) using `-j`, `--jobs` or `--cores`:
165 | ```console
166 | $ snakemake --delete-all-output -j 1
167 | $ snakemake -j 1
168 | ```
169 | The `--delete-all-output` part makes sure that we remove all generated files before we start.
170 | 4. Try running `snakemake` again, observe that it refuses to rerun the steps, and discuss why:
171 | ```console
172 | $ snakemake -j 1
173 |
174 | Building DAG of jobs...
175 | Nothing to be done (all requested files are present and up to date).
176 | ```
177 | 5. Make a tiny modification to the plot.py script and run `$ snakemake -j 1` again and observe how it will only re-run the plot steps.
178 | 6. Make a tiny modification to one of the books and run `$ snakemake -j 1` again and observe how it only regenerates files for this book.
179 | 7. Discuss possible advantages compared to a scripted solution.
180 | 8. **Question for R developers**: Imagine you want to rewrite the two Python scripts and use R instead. Which lines in
181 | the [Snakefile](https://github.com/coderefinery/word-count/blob/main/Snakefile) would you have to modify so that it uses your R code?
182 | 9. If you make changes to the Snakefile, validate it using `$ snakemake --lint`.
183 |
184 | ```{solution}
185 | - 2: Start with "all" and look what it depends on. Now search for rules that
186 | have these as output. Look for their inputs and search where they
187 | are produced. In other words, search backwards and build a graph of
188 | dependencies. This is what Snakemake does.
189 | - 4: It can see that outputs are newer than inputs. It will only regenerate
190 | outputs if they are not there or if the inputs or scripts have changed.
191 | - 7: It only generates steps and outputs that are missing or outdated. The workflow
192 | does not run everything every time. In other words if you notice a problem or update information
193 | "half way" in the analysis, it will only re-run what needs to be re-run. Nothing more, nothing less.
194 | Another advantage is that it can distribute tasks to multiple cores, off-load work to supercomputers,
195 | offers more fine-grained control over environments, and more.
196 | - 8: Probably only the two lines containing "shell".
197 | ```
198 | ````
199 |
200 | ## Visualizing the workflow
201 |
202 | We can visualize the directed acyclic graph (DAG) of our current Snakefile
203 | using the `--dag` option, which will output the DAG in `dot` language.
204 |
205 | **Note**: This requires the [Graphviz software](https://www.graphviz.org/),
206 | which can be installed by `conda install graphviz`.
207 |
208 | ```console
209 | $ snakemake -j 1 --dag | dot -Tpng > dag.png
210 | ```
211 |
212 | Rules that have yet to be completed are indicated with solid outlines, while already completed rules are indicated with dashed outlines.
213 |
214 | ```{figure} img/snakemake_dag.png
215 | :alt: Snakemake DAG
216 | :width: 100%
217 | ```
218 |
219 | ## Why [Snakemake](https://snakemake.readthedocs.io/)?
220 |
221 | - Gentle **learning curve**.
222 | - Free, open-source, and **installs easily** via conda or pip.
223 | - **Cross-platform** (Windows, macOS, Linux) and compatible with all High Performance Computing (HPC) schedulers:
224 | same workflow works without modification and scales appropriately whether on a laptop or cluster.
225 | - If several workflow steps are independent of each other, and you have multiple cores available, Snakemake can run them **in parallel**.
226 | - It is possible to define **isolated software environments** per rule, e.g. by adding `conda: 'environment.yml'` to a rule.
227 | - It is also possible to run workflows in Docker or Apptainer **containers**, e.g. by adding `container: 'docker://some-org/some-tool:2.3.1'` to a rule.
228 | - [Heavily used in bioinformatics](https://twitter.com/carl_witt/status/1103951128046301185), but is **completely general**.
229 | - Nice functionality for archiving the workflow, see: [the official documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#sustainable-and-reproducible-archiving)
230 |
231 | Tools like Snakemake help us with **reproducibility** by supporting us with **automation**, **scalability** and **portability** of our workflows.
232 |
233 | ## Similar tools
234 |
235 | - [Make](https://www.gnu.org/software/make/)
236 | - [Nextflow](https://www.nextflow.io/)
237 | - [Task](https://taskfile.dev/)
238 | - [Common Workflow Language](https://www.commonwl.org/)
239 | - Many [specialized frameworks](https://github.com/common-workflow-language/common-workflow-language/wiki/Existing-Workflow-systems) exist.
240 | - [Book on building reproducible analytical pipelines with R](https://raps-with-r.dev/)
241 | - [{targets} R package - make-like pipeline tool for R](https://books.ropensci.org/targets/)
242 |
243 | ```{keypoints}
244 | - Computational steps can be recorded in many ways
245 | - Workflow tools can help, if there are many steps to be executed and/or many datasets to be processed
246 | ```
247 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Sphinx
2 | sphinx_rtd_theme
3 | sphinx_rtd_theme_ext_color_contrast
4 | myst_nb
5 | git+https://github.com/rkdarst/sphinx-copybutton.git@exclude-unselectable-3
6 | sphinx-lesson
7 | https://github.com/coderefinery/sphinx-coderefinery-branding/archive/master.zip
8 |
--------------------------------------------------------------------------------