├── .github ├── CODEOWNERS └── workflows │ └── test_and_build.yml ├── .gitignore ├── AUTHORS ├── CITATION ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Gemfile ├── LICENSE.md ├── Makefile ├── README.md ├── _config.yml ├── _episodes ├── .gitkeep ├── 01-basics.md ├── 02-scripts.md ├── 03-lists.md ├── 04-dicts.md ├── 05-functions.md ├── 06-parallel.md ├── 07-snakemake-intro.md ├── 08-snakefiles.md ├── 09-wildcards.md ├── 10-patterns.md ├── 11-snakemake-python.md ├── 12-resources.md ├── 13-cluster.md └── 14-final-notes.md ├── _extras ├── .gitkeep ├── about.md ├── discuss.md ├── figures.md └── guide.md ├── _includes ├── all_figures.html ├── figure.html └── links.md ├── aio.md ├── bin ├── chunk-options.R ├── extract_figures.py ├── generate_md_episodes.R ├── knit_lessons.sh ├── lesson_check.py ├── lesson_initialize.py ├── markdown_ast.rb ├── repo_check.py ├── run-make-docker-serve.sh ├── test_lesson_check.py ├── util.py └── workshop_check.py ├── code └── .gitkeep ├── commands.mk ├── data └── .gitkeep ├── fig ├── .gitkeep ├── 02-challenge-dag.svg ├── 02-dats-dag.svg └── 05-final-dag.svg ├── files ├── .gitkeep ├── snakemake-lesson.tar.gz ├── snakemake-lesson.zip └── snakemake-lesson │ ├── .Snakefile │ ├── books │ ├── LICENSE_TEXTS.md │ ├── abyss.txt │ ├── isles.txt │ ├── last.txt │ └── sierra.txt │ ├── cluster.json │ ├── matplotlibrc │ ├── plotcount.py │ ├── wordcount.py │ └── zipf_test.py ├── index.md ├── lesson-outline.md ├── reference.md └── setup.md /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file lists the contributors responsible for the 2 | # repository content. They will also be automatically 3 | # asked to review any pull request made in this repository. 4 | 5 | # Each line is a file pattern followed by one or more owners. 6 | # The sequence matters: later patterns take precedence. 
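# As a hypothetical illustration of that precedence rule (these patterns
# and teams are examples, not entries in this file): given
#
#     *       @default-reviewers
#     *.md    @docs-reviewers
#
# a change to any Markdown file would request review from
# @docs-reviewers only, because the later, more specific pattern wins.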
7 | 8 | # FILES OWNERS 9 | * @hpc-carpentry/hpc-python-maintainers 10 | -------------------------------------------------------------------------------- /.github/workflows/test_and_build.yml: -------------------------------------------------------------------------------- 1 | name: Check lesson and build for all configs 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - gh-pages 8 | 9 | jobs: 10 | spellcheck: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: 3.7 18 | - name: Install codespell 19 | run: | 20 | pip3 install codespell 21 | - name: Check spelling 22 | run: | 23 | make spellcheck 24 | 25 | check_lesson_and_build_default: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | - uses: actions/setup-ruby@v1 30 | with: 31 | ruby-version: '2.7' 32 | - name: Install basic requirements 33 | run: | 34 | # Need this library for nokogiri 35 | sudo apt-get install libxslt1-dev 36 | gem install bundler json kramdown kramdown-parser-gfm 37 | bundle config set path '.vendor/bundle' 38 | bundle config build.nokogiri --use-system-libraries 39 | bundle install 40 | - name: "Lint episode markdown" 41 | run: | 42 | find _episodes -name \*.md -exec bundle exec mdl -r MD001,MD003,MD005,MD006,MD007,MD008,MD009,MD010,MD011,MD012,MD015,MD016,MD017,MD018,MD019,MD020,MD021,MD022,MD023,MD025,MD035,MD036,MD037,MD038,MD039,MD046 {} \; 43 | - name: "Check lesson for warnings" 44 | run: | 45 | make lesson-check-all 46 | - name: "Check lesson for errors" 47 | run: | 48 | make lesson-check 49 | - name: "Check build" 50 | run: | 51 | make --always-make site 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | .DS_Store 4 | .ipynb_checkpoints 5 | .jekyll-cache 6 | .sass-cache 7 | __pycache__ 8 | _site 9 | files/*.gtf 10 | files/*.fb 11 | files/*.fastq 12 | files/*.tsv 13 | Gemfile.lock 14 | scratch/ 15 | .bundle 16 | .vendor 17 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | HPC Python is written and maintained by the members of 2 | [HPC Carpentry](https://github.com/hpc-carpentry). 3 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | FIXME: describe how to cite this lesson. 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: "Contributor Code of Conduct" 4 | --- 5 | As contributors and maintainers of this project, 6 | we pledge to follow the [Carpentry Code of Conduct][coc]. 7 | 8 | Instances of abusive, harassing, or otherwise unacceptable behavior 9 | may be reported by following our [reporting guidelines][coc-reporting]. 
10 | 11 | {% include links.md %} 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | [Software Carpentry][swc-site] and [Data Carpentry][dc-site] are open source 4 | projects, and we welcome contributions of all kinds: new lessons, fixes to 5 | existing material, bug reports, and reviews of proposed changes. 6 | 7 | 8 | ## Contributor Agreement 9 | 10 | By contributing, you agree that we may redistribute your work under [our 11 | license](LICENSE.md). In exchange, we will address your issues and/or assess 12 | your change proposal as promptly as we can, and help you become a member of our 13 | community. Everyone involved in [Software Carpentry][swc-site] and [Data 14 | Carpentry][dc-site] agrees to abide by our [code of 15 | conduct](CODE_OF_CONDUCT.md). 16 | 17 | ## How to Contribute 18 | 19 | The easiest way to get started is to file an issue to tell us about a spelling 20 | mistake, some awkward wording, or a factual error. This is a good way to 21 | introduce yourself and to meet some of our community members. 22 | 23 | 1. If you do not have a [GitHub][github] account, you can [send us comments by 24 | email][email]. However, we will be able to respond more quickly if you use 25 | one of the other methods described below. 26 | 27 | 1. If you have a [GitHub][github] account, or are willing to [create 28 | one][github-join], but do not know how to use Git, you can report problems 29 | or suggest improvements by [creating an issue][issues]. This allows us to 30 | assign the item to someone and to respond to it in a threaded discussion. 31 | 32 | 1. If you are comfortable with Git, and would like to add or change material, 33 | you can submit a pull request (PR). Instructions for doing this are 34 | [included below](#using-github). 35 | 36 | ## Where to Contribute 37 | 38 | 1. If you wish to change this lesson, please work in 39 | <https://github.com/hpc-carpentry/hpc-python>, which can be viewed at 40 | <https://hpc-carpentry.github.io/hpc-python>. 41 | 42 | 1. If you wish to change the example lesson, please work in 43 | <https://github.com/carpentries/lesson-example>, which documents the format 44 | of our lessons and can be viewed at 45 | <https://carpentries.github.io/lesson-example>. 46 | 47 | 1. If you wish to change the template used for workshop websites, please work 48 | in <https://github.com/carpentries/workshop-template>. The home page of that 49 | repository explains how to set up workshop websites, while the extra pages 50 | in <https://carpentries.github.io/workshop-template> provide more background 51 | on our design choices. 52 | 53 | 1. If you wish to change CSS style files, tools, or HTML boilerplate for 54 | lessons or workshops stored in `_includes` or `_layouts`, please work in 55 | <https://github.com/carpentries/styles>. 56 | 57 | ## What to Contribute 58 | 59 | There are many ways to contribute, from writing new exercises and improving 60 | existing ones to updating or filling in the documentation and submitting [bug 61 | reports][issues] about things that don't work, aren't clear, or are missing. If 62 | you are looking for ideas, please see the 'Issues' tab for a list of issues 63 | associated with this repository, or you may look at the issues for the [Data 64 | Carpentry][dc-issues] and [Software Carpentry][swc-issues] projects. 65 | 66 | Comments on issues and reviews of pull requests are just as welcome: we are 67 | smarter together than we are on our own. Reviews from novices and newcomers are 68 | particularly valuable: it's easy for people who have been using these lessons 69 | for a while to forget how impenetrable some of this material can be, so fresh 70 | eyes are always welcome.
71 | 72 | ## What *Not* to Contribute 73 | 74 | Our lessons already contain more material than we can cover in a typical 75 | workshop, so we are usually *not* looking for more concepts or tools to add to 76 | them. As a rule, if you want to introduce a new idea, you must (a) estimate how 77 | long it will take to teach and (b) explain what you would take out to make room 78 | for it. The first encourages contributors to be honest about requirements; the 79 | second, to think hard about priorities. 80 | 81 | We are also not looking for exercises or other material that only run on one 82 | platform. Our workshops typically contain a mixture of Windows, macOS, and 83 | Linux users; in order to be usable, our lessons must run equally well on all 84 | three. 85 | 86 | ## Using GitHub 87 | 88 | If you choose to contribute via GitHub, you may want to look at [How to 89 | Contribute to an Open Source Project on GitHub][how-contribute]. To manage 90 | changes, we follow [GitHub flow][github-flow]. Each lesson has two maintainers 91 | who review issues and pull requests or encourage others to do so. The 92 | maintainers are community volunteers and have final say over what gets merged 93 | into the lesson. To use the web interface for contributing to a lesson: 94 | 95 | 1. Fork the originating repository to your GitHub profile. 96 | 1. Within your version of the forked repository, move to the `gh-pages` branch 97 | and create a new branch for each significant change being made. 98 | 1. Navigate to the file(s) you wish to change within the new branches and make 99 | revisions as required. 100 | 1. Commit all changed files within the appropriate branches. 101 | 1. Create individual pull requests from each of your changed branches to the 102 | `gh-pages` branch within the originating repository. 103 | 1. If you receive feedback, make changes using your issue-specific branches of 104 | the forked repository and the pull requests will update automatically. 105 | 1. Repeat as needed until all feedback has been addressed. 106 | 107 | When starting work, please make sure your clone of the originating `gh-pages` 108 | branch is up-to-date before creating your own revision-specific branch(es) from 109 | there. Additionally, please only work from your newly-created branch(es) and 110 | *not* your clone of the originating `gh-pages` branch. Lastly, published copies 111 | of all the lessons are available in the `gh-pages` branch of the originating 112 | repository for reference while revising. 113 | 114 | ## Other Resources 115 | 116 | General discussion of [Software Carpentry][swc-site] and [Data 117 | Carpentry][dc-site] happens on the [discussion mailing list][discuss-list], 118 | which everyone is welcome to join. You can also [reach us by email][email]. 
119 | 120 | [email]: mailto:admin@software-carpentry.org 121 | [dc-issues]: https://github.com/issues?q=user%3Adatacarpentry 122 | [dc-lessons]: http://datacarpentry.org/lessons/ 123 | [dc-site]: http://datacarpentry.org/ 124 | [discuss-list]: http://lists.software-carpentry.org/listinfo/discuss 125 | [github]: https://github.com 126 | [github-flow]: https://guides.github.com/introduction/flow/ 127 | [github-join]: https://github.com/join 128 | [how-contribute]: https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github 129 | [issues]: https://guides.github.com/features/issues/ 130 | [swc-issues]: https://github.com/issues?q=user%3Aswcarpentry 131 | [swc-lessons]: https://software-carpentry.org/lessons/ 132 | [swc-site]: https://software-carpentry.org/ 133 | 134 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org' 4 | 5 | git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } 6 | 7 | # Synchronize with https://pages.github.com/versions 8 | ruby '>=2.7.1' 9 | 10 | gem 'github-pages', group: :jekyll_plugins 11 | gem 'kramdown-parser-gfm' 12 | gem 'mdl' 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Licenses" 3 | --- 4 | 5 | # CC BY 4.0 6 | 7 | ## Instructional Material 8 | 9 | All High Performance Computing Carpentry instructional material is 10 | made available under the [Creative Commons Attribution 11 | license][cc-by-human]. The following is a human-readable summary of 12 | (and not a substitute for) the [full legal text of the CC BY 4.0 13 | license][cc-by-legal]. 14 | 15 | You are free: 16 | 17 | * to **Share**---copy and redistribute the material in any medium or format 18 | * to **Adapt**---remix, transform, and build upon the material 19 | 20 | for any purpose, even commercially. 21 | 22 | The licensor cannot revoke these freedoms as long as you follow the 23 | license terms. 24 | 25 | Under the following terms: 26 | 27 | * **Attribution**---You must give appropriate credit (mentioning that 28 | your work is derived from work that is Copyright © Software 29 | Carpentry and, where practical, linking to 30 | ), provide a [link to the 31 | license][cc-by-human], and indicate if changes were made. You may do 32 | so in any reasonable manner, but not in any way that suggests the 33 | licensor endorses you or your use. 34 | 35 | **No additional restrictions**---You may not apply legal terms or 36 | technological measures that legally restrict others from doing 37 | anything the license permits. With the understanding that: 38 | 39 | Notices: 40 | 41 | * You do not have to comply with the license for elements of the 42 | material in the public domain or where your use is permitted by an 43 | applicable exception or limitation. 44 | * No warranties are given. The license may not give you all of the 45 | permissions necessary for your intended use. For example, other 46 | rights such as publicity, privacy, or moral rights may limit how you 47 | use the material. 48 | 49 | ## Software 50 | 51 | Except where otherwise noted, the example programs and other software 52 | provided by Software Carpentry and Data Carpentry are made available under the 53 | [OSI][osi]-approved 54 | [MIT license][mit-license]. 
55 | 56 | Permission is hereby granted, free of charge, to any person obtaining 57 | a copy of this software and associated documentation files (the 58 | "Software"), to deal in the Software without restriction, including 59 | without limitation the rights to use, copy, modify, merge, publish, 60 | distribute, sublicense, and/or sell copies of the Software, and to 61 | permit persons to whom the Software is furnished to do so, subject to 62 | the following conditions: 63 | 64 | The above copyright notice and this permission notice shall be 65 | included in all copies or substantial portions of the Software. 66 | 67 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 68 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 69 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 70 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 71 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 72 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 73 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 74 | 75 | ## Trademark 76 | 77 | "Software Carpentry" and "Data Carpentry" and their respective logos 78 | are registered trademarks of [Community Initiatives][CI]. 79 | 80 | [cc-by-human]: https://creativecommons.org/licenses/by/4.0/ 81 | [cc-by-legal]: https://creativecommons.org/licenses/by/4.0/legalcode 82 | [mit-license]: https://opensource.org/licenses/mit-license.html 83 | [ci]: http://communityin.org/ 84 | [osi]: https://opensource.org 85 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Use /bin/bash instead of /bin/sh 2 | export SHELL = /bin/bash 3 | 4 | ## ======================================== 5 | ## Commands for both workshop and lesson websites. 6 | 7 | # Settings 8 | MAKEFILES=Makefile $(wildcard *.mk) 9 | JEKYLL=bundle config --local set path .vendor/bundle && bundle install && bundle update && bundle exec jekyll 10 | HPC_JEKYLL_CONFIG?= 11 | PARSER=bin/markdown_ast.rb 12 | DST=_site 13 | 14 | # Check Python 3 is installed and determine if it's called via python3 or python 15 | # (https://stackoverflow.com/a/4933395) 16 | PYTHON3_EXE := $(shell which python3 2>/dev/null) 17 | ifneq (, $(PYTHON3_EXE)) 18 | ifeq (,$(findstring Microsoft/WindowsApps/python3,$(subst \,/,$(PYTHON3_EXE)))) 19 | PYTHON := python3 20 | endif 21 | endif 22 | 23 | ifeq (,$(PYTHON)) 24 | PYTHON_EXE := $(shell which python 2>/dev/null) 25 | ifneq (, $(PYTHON_EXE)) 26 | PYTHON_VERSION_FULL := $(wordlist 2,4,$(subst ., ,$(shell python --version 2>&1))) 27 | PYTHON_VERSION_MAJOR := $(word 1,${PYTHON_VERSION_FULL}) 28 | ifneq (3, ${PYTHON_VERSION_MAJOR}) 29 | $(error "Your system does not appear to have Python 3 installed.") 30 | endif 31 | PYTHON := python 32 | else 33 | $(error "Your system does not appear to have any Python installed.") 34 | endif 35 | endif 36 | 37 | 38 | # Controls 39 | .PHONY : commands clean files 40 | 41 | # Default target 42 | .DEFAULT_GOAL := commands 43 | 44 | ## I. 
Commands for both workshop and lesson websites 45 | ## ================================================= 46 | 47 | ## * serve : render website and run a local server 48 | serve : lesson-md 49 | ${JEKYLL} serve --config _config.yml,${HPC_JEKYLL_CONFIG} 50 | 51 | ## * site : build website but do not run a server 52 | site : lesson-md 53 | ${JEKYLL} build --config _config.yml,${HPC_JEKYLL_CONFIG} 54 | 55 | ## * docker-serve : use Docker to serve the site 56 | docker-serve : 57 | docker pull carpentries/lesson-docker:latest 58 | docker run --rm -it \ 59 | -v $${PWD}:/home/rstudio \ 60 | -p 4000:4000 \ 61 | -p 8787:8787 \ 62 | -e USERID=$$(id -u) \ 63 | -e GROUPID=$$(id -g) \ 64 | carpentries/lesson-docker:latest 65 | 66 | ## * repo-check : check repository settings 67 | repo-check : 68 | @${PYTHON} bin/repo_check.py -s . 69 | 70 | ## * clean : clean up junk files 71 | clean : 72 | @rm -rf ${DST} 73 | @rm -rf .sass-cache 74 | @rm -rf bin/__pycache__ 75 | @find . -name .DS_Store -exec rm {} \; 76 | @find . -name '*~' -exec rm {} \; 77 | @find . -name '*.pyc' -exec rm {} \; 78 | 79 | ## * clean-rmd : clean intermediate R files (that need to be committed to the repo) 80 | clean-rmd : 81 | @rm -rf ${RMD_DST} 82 | @rm -rf fig/rmd-* 83 | 84 | 85 | ## 86 | ## II. Commands specific to workshop websites 87 | ## ================================================= 88 | 89 | .PHONY : workshop-check 90 | 91 | ## * workshop-check : check workshop homepage 92 | workshop-check : 93 | @${PYTHON} bin/workshop_check.py . 94 | 95 | 96 | ## 97 | ## III. Commands specific to lesson websites 98 | ## ================================================= 99 | 100 | .PHONY : lesson-check lesson-md lesson-files lesson-fixme spellcheck 101 | 102 | # RMarkdown files 103 | RMD_SRC = $(wildcard _episodes_rmd/??-*.Rmd) 104 | RMD_DST = $(patsubst _episodes_rmd/%.Rmd,_episodes/%.md,$(RMD_SRC)) 105 | 106 | # Lesson source files in the order they appear in the navigation menu. 107 | MARKDOWN_SRC = \ 108 | index.md \ 109 | CODE_OF_CONDUCT.md \ 110 | setup.md \ 111 | $(sort $(wildcard _episodes/*.md)) \ 112 | reference.md \ 113 | $(sort $(wildcard _extras/*.md)) \ 114 | LICENSE.md 115 | 116 | # Generated lesson files in the order they appear in the navigation menu. 117 | HTML_DST = \ 118 | ${DST}/index.html \ 119 | ${DST}/conduct/index.html \ 120 | ${DST}/setup/index.html \ 121 | $(patsubst _episodes/%.md,${DST}/%/index.html,$(sort $(wildcard _episodes/*.md))) \ 122 | ${DST}/reference/index.html \ 123 | $(patsubst _extras/%.md,${DST}/%/index.html,$(sort $(wildcard _extras/*.md))) \ 124 | ${DST}/license/index.html 125 | 126 | ## * lesson-md : convert Rmarkdown files to markdown 127 | lesson-md : ${RMD_DST} 128 | 129 | _episodes/%.md: _episodes_rmd/%.Rmd 130 | @bin/knit_lessons.sh $< $@ 131 | 132 | # * lesson-check : validate lesson Markdown 133 | lesson-check : lesson-fixme 134 | @${PYTHON} bin/lesson_check.py -s . -p ${PARSER} -r _includes/links.md 135 | 136 | ## * lesson-check-all : validate lesson Markdown, checking line lengths and trailing whitespace 137 | lesson-check-all : 138 | @${PYTHON} bin/lesson_check.py -s . 
-p ${PARSER} -r _includes/links.md -l -w --permissive 139 | 140 | spellcheck: 141 | codespell --skip="assets,.bundle,_site,*.svg,*.txt,.vendor" --quiet-level=2 -L "dropse,hart,hist,namd,rouge" 142 | 143 | ## * unittest : run unit tests on checking tools 144 | unittest : 145 | @${PYTHON} bin/test_lesson_check.py 146 | 147 | ## * lesson-files : show expected names of generated files for debugging 148 | lesson-files : 149 | @echo 'RMD_SRC:' ${RMD_SRC} 150 | @echo 'RMD_DST:' ${RMD_DST} 151 | @echo 'MARKDOWN_SRC:' ${MARKDOWN_SRC} 152 | @echo 'HTML_DST:' ${HTML_DST} 153 | 154 | ## * lesson-fixme : show FIXME markers embedded in source files 155 | lesson-fixme : 156 | @fgrep -i -n FIXME ${MARKDOWN_SRC} || true 157 | 158 | ## 159 | ## IV. Auxiliary (plumbing) commands 160 | ## ================================================= 161 | 162 | ## * commands : show all commands. 163 | commands : 164 | @sed -n -e '/^##/s|^##[[:space:]]*||p' $(MAKEFILE_LIST) 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPC Python 2 | 3 | This is the HPC Carpentry Python lesson. 4 | 5 | This lesson is focused on teaching the basics of high-performance computing 6 | (HPC). There are 4 primary components to this lesson. Each component is 7 | budgeted half a day's worth of teaching-time, resulting in a two-day workshop. 8 | 9 | 1. UNIX fundamentals 10 | 2. Working on a cluster 11 | 3. Programming language introduction/review 12 | 4. Introduction to parallel programming 13 | 14 | Sections 3 and 4 (programming) will feature two programming languages: 15 | [Python](https://www.python.org/) and [Chapel](http://chapel.cray.com/). 16 | There are strong arguments for both languages, 17 | and instructors will be able to choose which language they wish to teach in. 18 | 19 | ## Topic breakdown and todo list 20 | 21 | The lesson outline and rough breakdown of topics by lesson writer is in 22 | [lesson-outline.md](lesson-outline.md). 23 | The topics there will be initially generated by the lesson writer, 24 | and then reviewed by the rest of the group once complete. 25 | 26 | ## Lesson writing instructions 27 | 28 | This is a fast overview of the Software Carpentry lesson template. 29 | This won't cover lesson style or formatting (address that during review?). 30 | 31 | For a full guide to the lesson template, see the [Software Carpentry example 32 | lesson](http://swcarpentry.github.io/lesson-example/). 33 | 34 | ### Lesson structure 35 | 36 | Software Carpentry lessons are generally episodic, with one clear concept for 37 | each episode ([example](http://swcarpentry.github.io/r-novice-gapminder/)). 38 | We've got 4 major sections; each section should be broken up into several 39 | episodes (perhaps the higher-level bullet points from the lesson outline?). 40 | 41 | An episode is just a markdown file that lives under the `_episodes` folder. 42 | Here is a link to a [markdown cheatsheet]( 43 | https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) with most 44 | markdown syntax. 45 | Additionally, the Software Carpentry lesson template uses several extra bits of 46 | formatting - see here for a [full guide]( 47 | http://swcarpentry.github.io/lesson-example/04-formatting/). 48 | The most significant change is the addition of a YAML header that adds metadata 49 | (key questions, lesson teaching times, etc.) and special syntax for code 50 | blocks, exercises, and the like.
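For instance, the YAML header of this lesson's own `_episodes/01-basics.md` (reproduced in full later in this repository) looks like this:

```
---
title: "Basic syntax"
teaching: 15
exercises: 15
questions:
- "Where do I start?"
objectives:
- "Understand basic Python syntax and data types."
keypoints:
- "Errors are there to help us."
---
```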
51 | 52 | Episode names should be prefixed with the number of their section plus the number 53 | of their episode within that section. 54 | This is important because the Software Carpentry lesson template will auto-post 55 | our lessons in the order that they would sort in. 56 | As long as your lesson sorts into the correct order, it will appear in the 57 | correct order on the website. 58 | 59 | ### Publishing changes to GitHub + the GitHub pages website 60 | 61 | The lesson website is viewable at [hpc-carpentry.github.io/hpc-python]( 62 | https://hpc-carpentry.github.io/hpc-python). 63 | 64 | The lesson website itself is auto-generated from the `gh-pages` branch of this 65 | repository. GitHub Pages will rebuild the website as soon as you push to the 66 | GitHub `gh-pages` branch. Because of this, `gh-pages` is considered the "master" 67 | branch. 68 | 69 | ### Previewing changes locally 70 | 71 | Obviously having to push to GitHub every time you want to view your changes to 72 | the website isn't very convenient. 73 | To preview the lesson locally, run `make serve`. 74 | You can then view the website at `localhost:4000` in your browser. 75 | Pages will be automatically regenerated every time you write to them. 76 | 77 | Note that the autogenerated website lives under the `_site` directory 78 | (and doesn't get pushed to GitHub). 79 | 80 | This process requires Ruby, Make, and Jekyll. You can find setup instructions 81 | [here](http://swcarpentry.github.io/lesson-example/setup/). 82 | 83 | ## Example lessons 84 | 85 | A couple of links to example SWC workshop lessons for reference: 86 | 87 | * [Example Bash lesson](https://github.com/swcarpentry/shell-novice) 88 | * [Example Python lesson]( 89 | https://github.com/swcarpentry/python-novice-inflammation) 90 | * [Example R lesson](https://github.com/swcarpentry/r-novice-gapminder) 91 | (uses R Markdown files instead of Markdown) 92 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------ 2 | # Values for this lesson 3 | #------------------------------------------------------------ 4 | 5 | # Which carpentry is this? 6 | # "swc": Software Carpentry 7 | # "dc": Data Carpentry 8 | # "lc": Library Carpentry 9 | # "cp": Carpentries (e.g., instructor training) 10 | carpentry: "incubator" 11 | 12 | # Overall title for pages. 13 | title: "Introduction to High-Performance Computing in Python" 14 | 15 | # Email address, no mailto: 16 | email: "team@carpentries.org" 17 | 18 | # Life cycle stage of the lesson ("pre-alpha", "alpha", "beta", "stable")? 19 | life_cycle: "alpha" 20 | 21 | episode_order: 22 | - 01-basics 23 | - 02-scripts 24 | - 03-lists 25 | - 04-dicts 26 | - 05-functions 27 | - 06-parallel 28 | - 07-snakemake-intro 29 | - 08-snakefiles 30 | - 09-wildcards 31 | - 10-patterns 32 | - 11-snakemake-python 33 | - 12-resources 34 | - 13-cluster 35 | - 14-final-notes 36 | 37 | #------------------------------------------------------------ 38 | # Generic settings (should not need to change). 39 | #------------------------------------------------------------ 40 | 41 | # What kind of thing is this ("workshop" or "lesson")? 42 | kind: "lesson" 43 | 44 | # Start time in minutes (0 to be clock-independent, 540 to show a start at 09:00 am). 45 | start_time: 0 46 | 47 | # Magic to make URLs resolve both locally and on GitHub. Please don't change.
48 | repository: / 49 | 50 | # Invoke the Carpentries theme rather than copying into our repository. 51 | remote_theme: carpentries/carpentries-theme 52 | 53 | # Sites. 54 | amy_site: "https://amy.carpentries.org/" 55 | carpentries_github: "https://github.com/carpentries" 56 | carpentries_pages: "https://carpentries.github.io" 57 | carpentries_site: "https://carpentries.org/" 58 | dc_site: "https://datacarpentry.org" 59 | example_repo: "https://github.com/carpentries/lesson-example" 60 | example_site: "https://carpentries.github.io/lesson-example" 61 | lc_site: "https://librarycarpentry.org/" 62 | swc_github: "https://github.com/swcarpentry" 63 | swc_pages: "https://swcarpentry.github.io" 64 | swc_site: "https://software-carpentry.org" 65 | template_repo: "https://github.com/carpentries/styles" 66 | training_site: "https://carpentries.github.io/instructor-training" 67 | workshop_repo: "https://github.com/carpentries/workshop-template" 68 | workshop_site: "https://carpentries.github.io/workshop-template" 69 | cc_by_human: "https://creativecommons.org/licenses/by/4.0/" 70 | 71 | # Specify that things in the Episodes and Extras collections should be output. 72 | collections: 73 | episodes: 74 | output: true 75 | permalink: /:path/index.html 76 | extras: 77 | output: true 78 | permalink: /:path/index.html 79 | 80 | # Set the default layout for things in the episodes collection. 81 | defaults: 82 | - values: 83 | root: . 84 | layout: page 85 | - scope: 86 | path: "" 87 | type: episodes 88 | values: 89 | root: .. 90 | layout: episode 91 | - scope: 92 | path: "" 93 | type: extras 94 | values: 95 | root: .. 96 | layout: page 97 | 98 | # Files and directories that are not to be copied. 99 | exclude: 100 | - Makefile 101 | - bin/ 102 | - .Rproj.user/ 103 | - .vendor/ 104 | - .docker-vendor/ 105 | 106 | # Turn on built-in syntax highlighting. 107 | highlighter: rouge 108 | 109 | plugins: 110 | - jekyll-redirect-from 111 | - jekyll-paginate 112 | -------------------------------------------------------------------------------- /_episodes/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/_episodes/.gitkeep -------------------------------------------------------------------------------- /_episodes/01-basics.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Basic syntax" 3 | teaching: 15 4 | exercises: 15 5 | questions: 6 | - "Where do I start?" 7 | objectives: 8 | - "Understand basic Python syntax and data types." 9 | keypoints: 10 | - "Errors are there to help us." 11 | --- 12 | 13 | The most basic use of Python is to use it as a fancy calculator. 14 | It is very easy to do basic maths in Python. 15 | 16 | ``` 17 | print(5 + 1) 18 | ``` 19 | {: .language-python} 20 | ``` 21 | 6 22 | ``` 23 | {: .output} 24 | 25 | Note that we don't always have to use the `print()` statement. 26 | Notice how leaving out `print()` gives us the same result as above. 27 | 28 | ``` 29 | 5 + 1 30 | ``` 31 | {: .language-python} 32 | ``` 33 | 6 34 | ``` 35 | {: .output} 36 | 37 | Python can do all of the normal basic maths operations you'd expect. 
38 | 39 | ``` 40 | 5 + 3 41 | 2 - 9 42 | 4 * 6 43 | 14 / 3 44 | ``` 45 | {: .language-python} 46 | ``` 47 | 8 48 | -7 49 | 24 50 | 4.666666666666667 51 | ``` 52 | {: .output} 53 | 54 | You can also use it for more complicated operations, like exponentiation (`**`): 55 | 56 | ``` 57 | 5 ** 2 58 | ``` 59 | {: .language-python} 60 | ``` 61 | 25 62 | ``` 63 | {: .output} 64 | 65 | Along with floor and remainder division. 66 | Floor division (`//`) gives the results of division, rounded down. 67 | Remainder division (`%`) gives the remainder after division. 68 | 69 | ``` 70 | 5 // 2 # floor division 71 | 5 % 2 # remainder division 72 | ``` 73 | {: .language-python} 74 | ``` 75 | 2 76 | 1 77 | ``` 78 | {: .output} 79 | 80 | Python follows the normal order of operations for maths. 81 | 82 | ``` 83 | 4 + 1 * 6 84 | ``` 85 | {: .language-python} 86 | ``` 87 | 10 88 | ``` 89 | {: .output} 90 | 91 | However, if you want Python to change the order it does things in, 92 | you can use parentheses to specify what to do first. 93 | Note that there is no limit to the number of parentheses you can use. 94 | 95 | ``` 96 | (4 + 1) * 6 97 | ``` 98 | {: .language-python} 99 | ``` 100 | 30 101 | ``` 102 | {: .output} 103 | 104 | ## Variables 105 | 106 | Of course, we will probably want to save our answers at some point. 107 | We can do this by *assigning a variable*. 108 | In Python, a variable is a name for a saved result. 109 | We can set them with the `=` sign. 110 | 111 | ``` 112 | weight_kg = 55 113 | ``` 114 | {: .language-python} 115 | 116 | If we want to retrieve the information we've stored, 117 | we can do it by simply typing the name of the variable again. 118 | 119 | ``` 120 | weight_kg 121 | ``` 122 | {: .language-python} 123 | ``` 124 | 55 125 | ``` 126 | {: .output} 127 | 128 | We can perform maths on variables the same way we would normally. 129 | 130 | ``` 131 | print('weight in pounds:', 2.2 * weight_kg) 132 | ``` 133 | {: .language-python} 134 | ``` 135 | weight in pounds: 121.00000000000001 136 | ``` 137 | {: .output} 138 | 139 | As the example above shows, we can print several things at once by separating 140 | them with commas. 141 | Note that in this case, the number might appear as 121.00000000000001 due to 142 | the way numbers are internally represented in Python. 143 | 144 | We can also change a variable’s value by assigning it a new one: 145 | 146 | ``` 147 | weight_lb = 2.2 * weight_kg 148 | print(weight_lb) 149 | ``` 150 | {: .language-python} 151 | ``` 152 | 121.00000000000001 153 | ``` 154 | {: .output} 155 | 156 | What happens when we change a variable? 157 | Let's update `weight_kg` and see what happens to `weight_lb`. 158 | 159 | ``` 160 | print('weight_kg starting value is', weight_kg) 161 | weight_kg = 10000 162 | print('after updating, weight_kg ending value is', weight_kg) 163 | print('weight in lb ending value is', weight_lb) 164 | ``` 165 | {: .language-python} 166 | ``` 167 | weight_kg starting value is 55 168 | after updating, weight_kg ending value is 10000 169 | weight in lb ending value is 121.00000000000001 170 | ``` 171 | {: .output} 172 | 173 | 174 | Notice how even though we changed the value of `weight_kg`, `weight_lb` did not 175 | update. 176 | This demonstrates a very important property of programming languages: 177 | a computer will not do anything unless you specifically tell it to — 178 | **nothing ever happens automatically**.
179 | This is different from the behaviour of a spreadsheet, 180 | where a cell will automatically update when the cells it refers to are updated. 181 | 182 | If we want to tell Python to update `weight_lb` to reflect the new value of 183 | `weight_kg`, we will need to perform this operation explicitly. 184 | 185 | ``` 186 | weight_lb = weight_kg * 2.2 187 | print('new value for weight_lb is', weight_lb) 188 | ``` 189 | {: .language-python} 190 | ``` 191 | new value for weight_lb is 22000.0 192 | ``` 193 | {: .output} 194 | 195 | One more thing to note: 196 | what we just did is the best way to learn Python. 197 | Don't know how something works? 198 | Try it and find out! 199 | 200 | > ## Where are variables stored? 201 | > 202 | > Your computer has two places where it stores information: 203 | > hard disk and memory. 204 | > What are they and what are they used for? 205 | > Where do variables get stored? 206 | > 207 | > Memory is where temporary information on your computer gets placed. 208 | > It is very fast and easy to access, but has one important drawback: 209 | > data here is erased when your program quits or your computer shuts down. 210 | > All information you save as variables in Python will be stored in memory! 211 | > When programming, we always need to save our data as a file (on our hard 212 | > disk) if we want to keep it! 213 | > 214 | > Your computer's hard disk is used to store information long-term. 215 | > This is where files get stored, and the information on your hard drive is 216 | > more or less permanent. 217 | > Hard drives can also store lots of data very cheaply — a terabyte of 218 | > hard drive space is very cheap, whereas the same amount of memory costs a lot 219 | > more. 220 | > So if hard drive space is permanent and super-cheap, why don't we use it to 221 | > store all of our data? 222 | > The biggest reason is speed — memory is typically hundreds, if not 223 | > thousands of times faster to access. 224 | > If we stored our variables to our hard disk, our programs would be incredibly 225 | > slow! 226 | {: .callout} 227 | 228 | ## Errors 229 | 230 | Of course, not everything will always work perfectly. 231 | We are going to run into errors. 232 | For instance, what happens if we accidentally don't finish a command? 233 | 234 | ``` 235 | 1 + 236 | ``` 237 | {: .language-python} 238 | ``` 239 | SyntaxError: invalid syntax 456 | > ## Converting between data types 457 | > Data often isn't the format you want it to be. 458 | > For instance, we got an error earlier while attempting to 459 | > perform addition between a string and a number (`'5' + 5`). 460 | > What if we really needed to do that? 461 | > Fortunately, Python makes it rather easy to convert between data types. 462 | > Each data type has a function used to convert other data to that type. 463 | > 464 | > To convert a string to an integer, for instance, we can use the `int()` 465 | > command: 466 | > 467 | > ``` 468 | > print(int('5') + 5) 469 | > ``` 470 | > {: .language-python} 471 | > ``` 472 | > 10 473 | > ``` 474 | > {: .output} 475 | > 476 | > Likewise, we can use the following commands to convert data to other types: 477 | > 478 | > * `str()` - creates a string 479 | > * `int()` - creates an integer 480 | > * `float()` - creates a float 481 | > * `bool()` - creates a Boolean 482 | > 483 | > Using this information, 484 | > see if you can fix the left side of these statements to equal the right side 485 | > of each statement. 486 | > Use only the commands shown above.
487 | > 488 | > ``` 489 | > 1 + '1' == '11' 490 | > '6' - 7 == -1 491 | > 7.23 == 7 492 | > '5' == True 493 | > 4 / 1.3 == 4 494 | > ``` 495 | > {: .language-python} 496 | {: .challenge} 497 | 498 | > ## Data type conversion pitfalls 499 | > 500 | > You may have noticed something weird when converting a float to an int in the 501 | > last example. 502 | > Is Python simply rounding floats to the nearest integer, or is it doing 503 | > something else? 504 | {: .challenge} 505 | -------------------------------------------------------------------------------- /_episodes/02-scripts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Scripts and imports" 3 | teaching: 15 4 | exercises: 15 5 | questions: 6 | - "What is a Python program?" 7 | objectives: 8 | - "Explain what constitutes a Python program." 9 | - "Import Python modules." 10 | keypoints: 11 | - "To run a Python program, use `python3 program_name.py`." 12 | --- 13 | 14 | Everything we've learned so far is pretty cool. 15 | But what if we want to run a set of commands more than once? 16 | How do we write a program in Python? 17 | 18 | Python programs are simply a set of Python commands saved in a file. 19 | No compiling required! 20 | To demonstrate this, let's write our first program! 21 | Enter the following text in a text editor and save it under any name you like 22 | (Python files are typically given the extension `.py`). 23 | 24 | ``` 25 | print('it works!!!') 26 | ``` 27 | {: .language-python} 28 | 29 | We can now run this program in several ways. 30 | If we were to open up a terminal in the folder where we had saved our program, 31 | we could run it with the command `python3 our-script-name.py`. 32 | 33 | ``` 34 | it works!!! 35 | ``` 36 | {: .output} 37 | 38 | > ## What's the point of print()? 39 | > 40 | > We saw earlier that there was no difference between printing something with 41 | > `print()` and just entering a command on the command line. 42 | > But is this really the case? 43 | > Is there a difference after all? 44 | > 45 | > Try executing the following code: 46 | > 47 | > ``` 48 | > print('this involves print') 49 | > 'this does not' 50 | > ``` 51 | > {: .language-python} 52 | > What gets printed if you execute this as a script? 53 | > What gets printed if you execute things line by line? 54 | > Using this information, what's the point of `print()`? 55 | {: .challenge} 56 | 57 | ## `import`-ing things 58 | 59 | IPython has a neat trick to run command line commands without exiting IPython. 60 | Any command that begins with `!` gets run on your computer's command line, and 61 | not the IPython terminal. 62 | 63 | We can use this fact to run the command `python3 our-script-name.py`. 64 | I've called my script `test.py` as an example. 65 | 66 | ``` 67 | !python3 test.py 68 | ``` 69 | {: .language-python} 70 | ``` 71 | it works!!! 72 | ``` 73 | {: .output} 74 | 75 | What if we wanted to pass additional information to Python? 76 | For example, what if we want Python to print whatever we type back at us? 77 | To do this, we'll need to use a bit of extra functionality: 78 | the `sys` package. 79 | 80 | Python includes a lot of extra features in the form of packages, 81 | but not all of them get loaded by default. 82 | To access a package, we need to `import` it. 83 | 84 | ``` 85 | import sys 86 | ``` 87 | {: .language-python} 88 | 89 | You'll notice that there's no output. 90 | Only one thing has changed: 91 | we can now use the extra features provided by the `sys` package.
92 | For now, all we will use is `sys.argv`. 93 | `sys.argv` is a special variable 94 | that stores any additional arguments we provide on the command line 95 | after `python3 our-script-name.py`. 96 | Let's make a new script called `command-args.py` to try this out. 97 | 98 | ``` 99 | import sys 100 | print('we typed: ', sys.argv) 101 | ``` 102 | {: .language-python} 103 | 104 | We can then execute this program with: 105 | ``` 106 | !python3 command-args.py word1 word2 3 107 | ``` 108 | {: .language-python} 109 | ``` 110 | we typed: ['command-args.py', 'word1', 'word2', '3'] 111 | ``` 112 | {: .output} 113 | 114 | You'll notice that `sys.argv` looks different from other data types we've seen 115 | so far. `sys.argv` is a list (more about this in the next session). 116 | -------------------------------------------------------------------------------- /_episodes/03-lists.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Numpy arrays and lists" 3 | teaching: 15 4 | exercises: 15 5 | questions: 6 | - "How do we store large amounts of data?" 7 | objectives: 8 | - "Learn to use lists and Numpy arrays, 9 | and explain the difference between each." 10 | keypoints: 11 | - "Lists store a sequence of elements." 12 | - "Numpy allows vector maths in Python." 13 | --- 14 | 15 | At the end of the last lesson, we noticed that `sys.argv` gave us a new data 16 | structure: a list. 17 | A list is a collection of objects enclosed by square brackets (`[]`). 18 | 19 | ``` 20 | example = [1, 2, 4, 5] 21 | example 22 | ``` 23 | {: .language-python} 24 | ``` 25 | [1, 2, 4, 5] 26 | ``` 27 | {: .output} 28 | 29 | Note that a list can hold any type of item, even other lists! 30 | 31 | ``` 32 | example = [1, True, None, ["word", 123], "test"] 33 | example 34 | ``` 35 | {: .language-python} 36 | ``` 37 | [1, True, None, ['word', 123], 'test'] 38 | ``` 39 | {: .output} 40 | 41 | We can get different pieces of a list via indexing. 42 | We add a set of square brackets after the list in question along with the index 43 | of the values we want. 44 | Note that in Python, all indices start from 0 — the first element is 45 | actually the 0th element (this is different from languages like R or MATLAB). 46 | The best way to think about array indices is that they are the number of 47 | offsets from the first position — the first element does not require an 48 | offset to get to. 49 | 50 | {% include figure.html url="" max-width="50%" 51 | file="https://imgs.xkcd.com/comics/donald_knuth.png" 52 | alt="Arrays start at 0" 53 | caption="Source: xkcd #163" %} 54 | 55 | A few examples of this in action: 56 | 57 | ``` 58 | # first element 59 | example[0] 60 | # second element 61 | example[1] 62 | # fetch the list inside the list 63 | example[3] 64 | ``` 65 | {: .language-python} 66 | ``` 67 | 1 68 | True 69 | ['word', 123] 70 | ``` 71 | {: .output} 72 | 73 | Note that we can index a range using the colon (`:`) operator. 74 | A colon by itself means fetch everything. 75 | 76 | ``` 77 | example[:] 78 | ``` 79 | {: .language-python} 80 | ``` 81 | [1, True, None, ['word', 123], 'test'] 82 | ``` 83 | {: .output} 84 | 85 | A colon on the right side of an index means everything after the specified 86 | index. 87 | 88 | ``` 89 | example[2:] 90 | ``` 91 | {: .language-python} 92 | ``` 93 | [None, ['word', 123], 'test'] 94 | ``` 95 | {: .output} 96 | 97 | A colon on the left side of an index means everything before, but not 98 | including, the index.
99 | 100 | ``` 101 | example[:2] 102 | ``` 103 | {: .language-python} 104 | ``` 105 | [1, True] 106 | ``` 107 | {: .output} 108 | 109 | And if we use a negative index, it means get elements from the end, 110 | going backwards. 111 | 112 | ``` 113 | # last element 114 | example[-1] 115 | # everything except the last two elements 116 | example[:-2] 117 | ``` 118 | {: .language-python} 119 | ``` 120 | 'test' 121 | [1, True, None] 122 | ``` 123 | {: .output} 124 | 125 | Note that we can use the index multiple times to retrieve information from 126 | nested objects. 127 | 128 | ``` 129 | example[3][0] 130 | ``` 131 | {: .language-python} 132 | ``` 133 | 'word' 134 | ``` 135 | {: .output} 136 | 137 | If we index out of range, it is an error: 138 | 139 | ``` 140 | example[5] 141 | ``` 142 | {: .language-python} 143 | ``` 144 | --------------------------------------------------------------------------- 145 | IndexError Traceback (most recent call last) 146 | in () 147 | ----> 1 example[5] 148 | 149 | IndexError: list index out of range 150 | ``` 151 | {: .error} 152 | 153 | We can also add two lists together to create a larger list. 154 | 155 | ``` 156 | [45, 2] + [3] 157 | ``` 158 | {: .language-python} 159 | ``` 160 | [45, 2, 3] 161 | ``` 162 | {: .output} 163 | 164 | ## Lists as objects 165 | 166 | Like other objects in Python, lists have a unique behaviour that can catch a 167 | lot of people off guard. What happens when we run the following code? 168 | 169 | ``` 170 | list1 = [1, 2, 3, 4] 171 | list2 = list1 172 | list2 += [5, 6, 7] 173 | print('List 2 is: ', list2) 174 | print('List 1 is: ', list1) 175 | ``` 176 | {: .language-python} 177 | ``` 178 | List 2 is: [1, 2, 3, 4, 5, 6, 7] 179 | List 1 is: [1, 2, 3, 4, 5, 6, 7] 180 | ``` 181 | {: .output} 182 | 183 | Modifying `list2` actually modified `list1` as well. 184 | In Python, lists are objects. 185 | Objects are not copied when we assign them to a new name (unlike in R, where assignment behaves like making a copy). 186 | This is an important optimisation, as we won't accidentally fill up all of our 187 | computer's memory by renaming a variable a couple of times. 188 | When we ran `list2 = list1`, it just created a new name for the same list. 189 | Both names point at the same underlying object. 190 | 191 | We can verify this with the `id()` function. 192 | `id()` prints an object's unique identifier. 193 | Two objects will not have the same ID unless they are the same object. 194 | 195 | ``` 196 | id(list1) 197 | id(list2) 198 | ``` 199 | {: .language-python} 200 | ``` 201 | 140319556870408 202 | 140319556870408 203 | ``` 204 | {: .output} 205 | 206 | In order to create `list2` as a unique copy of `list1`, 207 | we have to use the `.copy()` method. 208 | 209 | ``` 210 | list1 = [1, 2, 3, 4] 211 | list2 = list1.copy() 212 | list2 += [5, 6, 7] 213 | print('List 2 is: ', list2) 214 | print('List 1 is: ', list1) 215 | id(list2) 216 | id(list1) 217 | ``` 218 | {: .language-python} 219 | ``` 220 | List 2 is: [1, 2, 3, 4, 5, 6, 7] 221 | List 1 is: [1, 2, 3, 4] 222 | 140319554648072 223 | 140319554461896 224 | ``` 225 | {: .output} 226 | 227 | `.copy()` is a method. 228 | Methods are special functions associated with an object and define what it can 229 | do. 230 | They always follow the syntax `object.method(arg1, arg2)` and have a predefined 231 | number of arguments, mostly with default values. We may also specify a subset of 232 | arguments, e.g. `object.method(arg1, arg4=some_value)`.
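As a quick sketch of that calling pattern (the numbers here are arbitrary), the built-in `sort()` method of lists takes an optional keyword argument `reverse`, which defaults to `False`:

```
numbers = [3, 1, 2]
# 'reverse' is an optional keyword argument; we override its default here
numbers.sort(reverse=True)
print(numbers)
```
{: .language-python}
```
[3, 2, 1]
```
{: .output}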
233 | 234 | Other frequently used methods of lists include `.append()`: 235 | 236 | ``` 237 | list1.append(77) 238 | ``` 239 | {: .language-python} 240 | ``` 241 | [1, 2, 3, 4, 77] 242 | ``` 243 | {: .output} 244 | 245 | ``` 246 | # this adds a one-element list 247 | list1.append([88]) 248 | ``` 249 | {: .language-python} 250 | ``` 251 | [1, 2, 3, 4, 77, [88]] 252 | ``` 253 | {: .output} 254 | 255 | And `.extend()` (combines two lists, instead of adding the second list as an 256 | element): 257 | 258 | ``` 259 | list1.extend([99, 88, 101]) 260 | ``` 261 | {: .language-python} 262 | ``` 263 | [1, 2, 3, 4, 77, [88], 99, 88, 101] 264 | ``` 265 | {: .output} 266 | 267 | And of course, `.remove()` and `.clear()` (both do exactly what you think they 268 | should do): 269 | 270 | ``` 271 | list1.remove([88]) 272 | print(list1) 273 | list1.clear() 274 | print(list1) 275 | ``` 276 | {: .language-python} 277 | ``` 278 | [1, 2, 3, 4, 77, 99, 88, 101] 279 | [] 280 | ``` 281 | {: .output} 282 | 283 | > ## Dynamic resizing of lists 284 | > 285 | > Python's lists are an extremely optimised data structure. 286 | > Unlike R's vectors, there is no time penalty to continuously adding elements 287 | > to a list. 288 | > You never need to pre-allocate a list at a certain size for performance 289 | > reasons. 290 | {: .callout} 291 | 292 | ## Iterating through lists 293 | 294 | We'll very frequently want to iterate over lists and perform an operation with 295 | every element. 296 | We do this using a for loop. 297 | 298 | A for loop generally looks like the following: 299 | 300 | ``` 301 | for variable in things_to_iterate_over: 302 | do_stuff_with(variable) 303 | ``` 304 | {: .language-python} 305 | 306 | An example of an actually functioning for loop is shown below: 307 | 308 | ``` 309 | for i in range(10): 310 | print(i) 311 | ``` 312 | {: .language-python} 313 | ``` 314 | 0 315 | 1 316 | 2 317 | 3 318 | 4 319 | 5 320 | 6 321 | 7 322 | 8 323 | 9 324 | ``` 325 | {: .output} 326 | 327 | In this case we are iterating over the values provided by `range()`. 328 | `range()` is a special function we can use to produce 329 | a sequence of numbers. 330 | 331 | We can also iterate over a list, or any collection of elements: 332 | 333 | ``` 334 | for element in ['a', True, None]: 335 | print(type(element)) 336 | ``` 337 | {: .language-python} 338 | ``` 339 | <class 'str'> 340 | <class 'bool'> 341 | <class 'NoneType'> 342 | ``` 343 | {: .output} 344 | 345 | ## Vectorised operations with Numpy 346 | 347 | Numpy is a numerical library designed to make working with numbers 348 | easier than it would otherwise be. 349 | 350 | For example, say we had a list of a thousand numbers. 351 | There's no way to do vector maths without iterating through all the 352 | elements! 353 | 354 | ``` 355 | vals = list(range(1000)) 356 | 357 | new_vals = vals.copy() 358 | print(new_vals[:5]) 359 | for idx in range(1000): 360 | new_vals[idx] += 10 361 | 362 | print(new_vals[:5]) 363 | ``` 364 | {: .language-python} 365 | ``` 366 | [0, 1, 2, 3, 4] 367 | [10, 11, 12, 13, 14] 368 | ``` 369 | {: .output} 370 | 371 | That was a lot of work. 372 | Numpy lets us do vector maths like in R, saving us a lot of effort. 373 | The most basic function is `np.array()`, which creates a numerical 374 | array from a list. 375 | A numpy array is a collection of numbers that can have any number of 376 | dimensions. 377 | In this case, there is only one dimension, since we created the array from a 378 | list.
379 | 380 | ``` 381 | import numpy as np 382 | 383 | new_vals = np.array(vals) 384 | new_vals += 10 385 | new_vals[:5] 386 | ``` 387 | {: .language-python} 388 | ``` 389 | array([10, 11, 12, 13, 14]) 390 | ``` 391 | {: .output} 392 | 393 | One very nice thing about Numpy is that it's much more performant than ordinary 394 | Python lists. 395 | A nice trick we can use with IPython to measure execution times is the 396 | `%timeit` magic function. 397 | Anything following the `%timeit` gets measured for speed. 398 | Adding `%%` to the `timeit` command instead of `%` means that `timeit` is run 399 | on the entire cell, not just a single line. Note that `%%timeit` must be on the 400 | first line of an IPython/Jupyter cell for it to work, whereas the `%timeit` 401 | command can be used anywhere. 402 | 403 | Using Python's lists: 404 | 405 | ``` 406 | %%timeit 407 | for idx in range(1000): 408 | vals[idx] + 10 409 | ``` 410 | {: .language-python} 411 | ``` 412 | 10000 loops, best of 3: 165 µs per loop 413 | ``` 414 | {: .output} 415 | 416 | Using numpy: 417 | 418 | ``` 419 | %timeit new_vals + 10 420 | ``` 421 | {: .language-python} 422 | ``` 423 | The slowest run took 22.13 times longer than the fastest. 424 | This could mean that an intermediate result is being cached. 425 | 1000000 loops, best of 3: 1.63 µs per loop 426 | ``` 427 | {: .output} 428 | 429 | Numpy was about 100x faster, though `%timeit` did mention that Numpy could be 430 | cheating a bit. 431 | Even in Numpy's worst-case scenario, however, it still ran 5x faster than using 432 | Python's basic lists. 433 | 434 | ## Working with multiple dimensions 435 | 436 | Sometimes, you'll encounter a dataset with multiple dimensions and will need to 437 | be able to retrieve elements from it as such. 438 | 439 | ``` 440 | arr2d = np.arange(0, 40) # sequence of numbers from 0 to 39 441 | arr2d = arr2d.reshape([5, 8]) # reshape so it has 5 rows and 8 columns 442 | arr2d 443 | ``` 444 | {: .language-python} 445 | ``` 446 | array([[ 0, 1, 2, 3, 4, 5, 6, 7], 447 | [ 8, 9, 10, 11, 12, 13, 14, 15], 448 | [16, 17, 18, 19, 20, 21, 22, 23], 449 | [24, 25, 26, 27, 28, 29, 30, 31], 450 | [32, 33, 34, 35, 36, 37, 38, 39]]) 451 | ``` 452 | {: .output} 453 | 454 | In this case, we must index using multiple indices, separated by a comma. 455 | 456 | To grab the first element, we would use `[0, 0]`: 457 | 458 | ``` 459 | arr2d[0, 0] 460 | ``` 461 | {: .language-python} 462 | ``` 463 | 0 464 | ``` 465 | {: .output} 466 | 467 | The first index corresponds to rows, the second to columns, 468 | and the third to the next dimension... 469 | 470 | ``` 471 | arr2d[0, :] 472 | arr2d[:, 0] 473 | ``` 474 | {: .language-python} 475 | ``` 476 | array([0, 1, 2, 3, 4, 5, 6, 7]) 477 | array([ 0, 8, 16, 24, 32]) 478 | ``` 479 | {: .output} 480 | 481 | > ## Practising indexing 482 | > 483 | > Retrieve everything defined in the range of 484 | > rows 4-5 and columns 1-4. 485 | {: .challenge} 486 | -------------------------------------------------------------------------------- /_episodes/04-dicts.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Storing data with dicts" 3 | teaching: 15 4 | exercises: 15 5 | questions: 6 | - "How do I store structured data?" 7 | objectives: 8 | - "Be able to store data using Python's dict objects." 9 | keypoints: 10 | - "Dicts provide key-value storage of information."
11 | --- 12 | 13 | Dictionaries (also called dicts) are another key data structure we'll need to 14 | use to write a pipeline. In particular, dicts allow efficient key-value storage 15 | of any type of data. 16 | 17 | To create a dict, we use syntax like the following. 18 | 19 | ``` 20 | example = {} 21 | type(example) 22 | ``` 23 | {: .language-python} 24 | ``` 25 | dict 26 | ``` 27 | {: .output} 28 | 29 | We can then store values in our dict using indexing. 30 | The index is referred to as the "key", 31 | and the stored data is referred to as the "value". 32 | 33 | ``` 34 | example['key'] = 'value' 35 | example['key'] 36 | ``` 37 | {: .language-python} 38 | ``` 39 | 'value' 40 | ``` 41 | {: .output} 42 | 43 | In addition, keys can be almost any type of value (anything hashable, such as numbers and strings). 44 | Let's add several more values to demonstrate this. 45 | 46 | ``` 47 | example[1] = 2 48 | example[4] = False 49 | example['test'] = 5 50 | example[7] = 'myvalue' 51 | ``` 52 | {: .language-python} 53 | 54 | To retrieve all keys in the dictionary, we can use the `.keys()` method. 55 | Note how we used the `list()` function to turn our resulting output into a 56 | list. 57 | 58 | ``` 59 | list(example.keys()) 60 | ``` 61 | {: .language-python} 62 | ``` 63 | ['key', 1, 4, 'test', 7] 64 | ``` 65 | {: .output} 66 | 67 | Likewise, we can retrieve all the values at once using `.values()`. 68 | 69 | ``` 70 | list(example.values()) 71 | ``` 72 | {: .language-python} 73 | ``` 74 | ['value', 2, False, 5, 'myvalue'] 75 | ``` 76 | {: .output} 77 | 78 | > ## Dictionary order 79 | > 80 | > Note that the order of keys and values in a dictionary should not be relied 81 | > upon. We'll create a dictionary another way to demonstrate this: 82 | > 83 | > ``` 84 | > unordered = {'a': 1, 85 | > 'b': 2, 86 | > 'c': 3, 87 | > 'd': 4} 88 | > ``` 89 | > {: .language-python} 90 | > ``` 91 | > {'a': 1, 'b': 2, 'c': 3, 'd': 4} 92 | > ``` 93 | > {: .output} 94 | > 95 | > Depending on your version of Python, the dictionary will either be in order 96 | > or out of order. If you are on Python 3.6+, dictionaries are ordered. 97 | > 98 | > Iterate through and print the dictionary's keys in both forward and reverse 99 | > order. 100 | > 101 | > (To iterate through the dict in a specific order, you will need to sort the 102 | > keys using the `sorted()` function.) 103 | {: .callout} 104 | -------------------------------------------------------------------------------- /_episodes/05-functions.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Functions and Conditions" 3 | teaching: 15 4 | exercises: 15 5 | questions: 6 | - "How do I write functions?" 7 | objectives: 8 | - "Be able to write our own functions and use basic functional programming 9 | constructs like `map()` and `filter()`." 10 | keypoints: 11 | - "`map()` applies a function to every object in a data structure." 12 | - "`filter()` returns only the data objects for which some condition is true." 13 | --- 14 | 15 | Of course, at some point, we are going to want to define our own functions 16 | rather than just use the ones provided by Python and its various modules.
17 | 
18 | The general syntax for defining a function is as follows:
19 | 
20 | ```
21 | def function(arg1):
22 |     # do stuff with arg1
23 |     return answer
24 | ```
25 | {: .language-python}
26 | 
27 | So, an example function that adds two numbers together might look a little like
28 | this:
29 | 
30 | ```
31 | def adder(x, y):
32 |     return x + y
33 | 
34 | adder(1, 2)
35 | ```
36 | {: .language-python}
37 | ```
38 | 3
39 | ```
40 | {: .output}
41 | 
42 | We can also add a default argument
43 | (say, if we wanted `y` to be equal to 10 unless we otherwise specified),
44 | by using an equals sign and a default value in our function definition:
45 | 
46 | ```
47 | def adder(x, y=10):
48 |     return x + y
49 | 
50 | adder(5)
51 | ```
52 | {: .language-python}
53 | ```
54 | 15
55 | ```
56 | {: .output}
57 | 
58 | > ## Practice defining functions
59 | > 
60 | > Define a function that converts from temperatures in Fahrenheit
61 | > to temperatures in Kelvin, and another function that converts
62 | > back again.
63 | > 
64 | > The general formula for the conversion from Fahrenheit to Kelvin is:
65 | > 
66 | > `kelvin = (fahr - 32) * 5 / 9 + 273.15`
67 | {: .challenge}
68 | 
69 | ## Conditional statements
70 | 
71 | We may also need to have our functions do specific things in some conditions,
72 | but not in others.
73 | This relies upon comparisons between items.
74 | 
75 | In Python, equality comparison is done using the `==` operator:
76 | 
77 | ```
78 | True == True
79 | True == False
80 | 'words' == 'words'
81 | ```
82 | {: .language-python}
83 | ```
84 | True
85 | False
86 | True
87 | ```
88 | {: .output}
89 | 
90 | `not` negates a boolean value, and `!=` means "not equal to".
91 | 
92 | ```
93 | not True == False
94 | True != False
95 | ```
96 | {: .language-python}
97 | ```
98 | True
99 | True
100 | ```
101 | {: .output}
102 | 
103 | As with other programming languages, we can make the usual comparisons with the
104 | `>` and `<` operators.
105 | Adding an equals sign (`>=`, `<=`) indicates greater than or equal to, and
106 | less than or equal to, respectively.
107 | 
108 | ```
109 | 5 < 10
110 | 5 > 10
111 | -4 >= -4
112 | 1 <= 2
113 | ```
114 | {: .language-python}
115 | ```
116 | True
117 | False
118 | True
119 | True
120 | ```
121 | {: .output}
122 | 
123 | These comparisons can be combined with the `if` statement to produce code that
124 | only executes when a condition is true.
125 | 
126 | ```
127 | number = 5
128 | if number <= 10:
129 |     print('number was less than 10')
130 | ```
131 | {: .language-python}
132 | ```
133 | number was less than 10
134 | ```
135 | {: .output}
136 | 
137 | If the `if` condition does not evaluate to `True`,
138 | the body does not execute:
139 | 
140 | ```
141 | number = 11
142 | if number <= 10:
143 |     print('number was less than 10')
144 | ```
145 | {: .language-python}
146 | 
147 | However, we can add code to execute when the `if` condition is not met by
148 | adding an `else` statement.
149 | 
150 | ```
151 | number = 11
152 | if number <= 10:
153 |     print('number was less than 10')
154 | else:
155 |     print('number was greater than 10')
156 | ```
157 | {: .language-python}
158 | ```
159 | number was greater than 10
160 | ```
161 | {: .output}
162 | 
163 | And if we want to check an additional statement,
164 | we can use the `elif` keyword (else-if):
165 | 
166 | ```
167 | number = 10
168 | if number < 10:
169 |     print('number was less than 10')
170 | elif number == 10:
171 |     print('number was equal to 10')
172 | else:
173 |     print('number was greater than 10')
174 | ```
175 | {: .language-python}
176 | 
177 | One final note: to check if a value is equal to `None` in Python,
178 | we should use `is None` and `is not None`.
179 | The normal `==` operator is not a reliable way to do this.
180 | 
181 | ```
182 | None is None
183 | 5 is not None
184 | ```
185 | {: .language-python}
186 | ```
187 | True
188 | True
189 | ```
190 | {: .output}
191 | 
192 | Additionally, we can check if one value is in another set of values with the
193 | `in` operator:
194 | 
195 | ```
196 | 5 in [4, 5, 6]
197 | 43 in [4, 5, 6]
198 | ```
199 | {: .language-python}
200 | ```
201 | True
202 | False
203 | ```
204 | {: .output}
205 | 
206 | ## map(), filter(), and anonymous (lambda) functions
207 | 
208 | Python has good support for functional programming,
209 | and has its own equivalents for map/reduce-style functionality.
210 | To "map" a function means to apply it to a set of elements.
211 | To "reduce" means to collapse a set of values to a single value.
212 | Finally, "filtering" means returning only the elements for which a certain
213 | condition is true.
214 | 
215 | Let's explore what that means with our own functions.
216 | The syntax of map/reduce/filter is essentially identical:
217 | 
218 | ```
219 | map(function, thing_to_iterate_over, next_thing_to_iterate_over)
220 | ```
221 | {: .language-python}
222 | 
223 | Let's apply this to a few test cases using map.
224 | Note that when selecting which function we are going to "map" with, we pass the function itself without parentheses: we aren't calling it yet, just naming it.
225 | 
226 | ```
227 | import math
228 | values = [0, 1, 2, 3, 4, 5, 6]
229 | map(math.sin, values)
230 | ```
231 | {: .language-python}
232 | ```
233 | <map object at 0x7f3c58155be0>
234 | ```
235 | {: .output}
236 | 
237 | In Python 3, `map()` returns an iterator, so to retrieve the actual values
238 | we typically need to convert the resulting output to a list.
239 | 
240 | ```
241 | list(map(math.sin, values))
242 | ```
243 | {: .language-python}
244 | ```
245 | [0.0,
246 |  0.8414709848078965,
247 |  0.9092974268256817,
248 |  0.1411200080598672,
249 |  -0.7568024953079282,
250 |  -0.9589242746631385,
251 |  -0.27941549819892586]
252 | ```
253 | {: .output}
254 | 
255 | `filter()` applies a similar operation,
256 | but instead of applying a function to every piece,
257 | it only returns points where a function returns true.
258 | 
259 | ```
260 | def less_than_3(val):
261 |     return val < 3
262 | 
263 | list(filter(less_than_3, values))
264 | ```
265 | {: .language-python}
266 | ```
267 | [0, 1, 2]
268 | ```
269 | {: .output}
270 | 
271 | That was very inconvenient.
272 | We had to define an entire function just to use it once.
273 | The solution for this is to write a one-time use function that has no name.
274 | Such functions are called either anonymous functions or lambda functions
275 | (both mean the same thing).
276 | 
277 | To define a lambda function in Python, the general syntax is as follows:
278 | 
279 | ```
280 | lambda x: x + 54
281 | ```
282 | {: .language-python}
283 | 
284 | In this case, `lambda x:` indicates we are defining a lambda function with a
285 | single argument, `x`.
286 | Everything following the `:` is the body of our function.
287 | Whatever value this evaluates to is automatically returned.
288 | So `lambda x: x + 54` equates to:
289 | 
290 | ```
291 | def some_func(x):
292 |     return x + 54
293 | ```
294 | {: .language-python}
295 | 
296 | Rewriting our filter statement to use a lambda function:
297 | 
298 | ```
299 | list(filter(lambda x: x < 3, values))
300 | ```
301 | {: .language-python}
302 | ```
303 | [0, 1, 2]
304 | ```
305 | {: .output}
306 | 
307 | Here is a side-by-side example that demonstrates the difference between
308 | `map()` and `filter()`:
309 | 
310 | ```
311 | list(map(lambda x: x + 100, [1, 2, 3, 4, 5]))
312 | list(filter(lambda x: x < 3, [1, 2, 3, 4, 5]))
313 | ```
314 | {: .language-python}
315 | ```
316 | [101, 102, 103, 104, 105]   # map()
317 | [1, 2]   # filter()
318 | ```
319 | {: .output}
320 | 
321 | > ## Using lambdas in practice
322 | > 
323 | > Add `'-cheesecake'` to every word in the following list using `map()`.
324 | > 
325 | > `['new york', 'chocolate', 'new york', 'ketchup', 'mayo']`
326 | > 
327 | > Using `filter()`, remove the items which would be absolutely terrible to eat.
328 | {: .challenge}
329 | 
330 | ## map/filter style functionality with Numpy arrays
331 | 
332 | Although you *could* use a for-loop to apply a custom function to a numpy array
333 | element by element, there is a handy `np.vectorize()` function you can use to
334 | convert your functions to a vectorised numpy equivalent.
335 | Note that this is purely for convenience, not speed — it uses a for-loop
336 | internally.
337 | 
338 | ```
339 | import numpy as np
340 | # create a function that cubes a number
341 | vector_cube = np.vectorize(lambda x: x ** 3)
342 | 
343 | vector_cube(np.array([1, 2, 3, 4, 5]))
344 | ```
345 | {: .language-python}
346 | ```
347 | array([  1,   8,  27,  64, 125])
348 | ```
349 | {: .output}
350 | 
351 | To perform an operation similar to `filter()`,
352 | you can specify a boolean condition inside the `[]`
353 | when indexing a Numpy array (this is known as logical indexing).
354 | 
355 | ```
356 | arr = np.array([1, 2, 3, 4, 5])
357 | arr[arr >= 3]
358 | ```
359 | {: .language-python}
360 | 
361 | > ## Removing np.nan values
362 | > 
363 | > Remove all of the `np.nan` values from the following sequence
364 | > using logical indexing.
365 | > 
366 | > `np.array([np.nan, np.nan, 2, 3, 4, np.nan])`
367 | {: .challenge}
368 | -------------------------------------------------------------------------------- /_episodes/06-parallel.md: --------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to parallel computing"
3 | teaching: 15
4 | exercises: 15
5 | questions:
6 | - "How do I run code in parallel?"
7 | objectives:
8 | - "Understand how to run parallel code with `multiprocessing`."
9 | keypoints:
10 | - "`Pool.map()` will perform an operation in parallel."
11 | ---
12 | 
13 | The primary goal of these lesson materials is to accelerate your workflows
14 | by executing them in a massively parallel (and reproducible!) manner.
15 | But what does this actually mean?
16 | 
17 | The basic concept of parallel computing is simple to understand:
18 | we divide our job into tasks that can be executed at the same time,
19 | so that we finish the job in a fraction of the time
20 | that it would have taken had the tasks been executed one by one.
21 | There are many different ways of parallelizing things, however,
22 | so we need to cover these concepts before running our workflows in parallel.
23 | 
24 | Let's start with an analogy:
25 | suppose that we want to paint the four walls in a room. This is our problem.
26 | We can divide our problem into 4 different tasks: painting each of the walls.
27 | In principle, our 4 tasks are independent of each other
28 | in the sense that we don’t need to finish one to start another.
29 | However, this does not mean that the tasks can be executed simultaneously or in
30 | parallel.
31 | It all depends on the amount of resources that we have for the tasks.
32 | 
33 | ### Concurrent vs. parallel execution
34 | 
35 | If there is only one painter, they could work for a while on one wall,
36 | then start painting another one, then work for a little bit on the third one,
37 | and so on.
38 | **The tasks are being executed concurrently but not in parallel.**
39 | Only one task is being performed at a time.
40 | If we have 2 or more painters for the job,
41 | then the tasks can be performed in parallel.
42 | 
43 | In our analogy, the painters represent CPU cores in your computer.
44 | The number of CPU cores available determines
45 | the maximum number of tasks that can be performed in parallel.
46 | The number of concurrent tasks that can be started at the same time,
47 | however, is unlimited.
48 | 
49 | ### Synchronous vs. asynchronous execution
50 | 
51 | Now imagine that all workers have to obtain their paint from a central
52 | dispenser located in the middle of the room.
53 | If each worker is using a different colour, then they can work asynchronously.
54 | However, if they use the same colour,
55 | and two of them run out of paint at the same time,
56 | then they have to synchronise to use the dispenser —
57 | one should wait while the other is being serviced.
58 | 
59 | In our analogy, the paint dispenser represents access to the memory in your
60 | computer.
61 | Depending on how a program is written, access to data in memory can be
62 | synchronous or asynchronous.
63 | 
64 | ### Distributed vs. shared memory
65 | 
66 | Finally, imagine that we have 4 paint dispensers, one for each worker.
67 | In this scenario, each worker can complete their task entirely on their own.
68 | They don’t even have to be in the same room;
69 | they could be painting walls of different rooms in the house,
70 | of different houses in the city, or even in different cities in the country.
71 | In many cases, however, we need a communication system in place.
72 | Suppose that worker A needs a colour that is only available in the dispenser
73 | of worker B —
74 | worker A should request the paint from worker B,
75 | and worker B should respond by sending the required colour.
76 | 
77 | Think of the memory distributed on each node/computer of a cluster as the
78 | different dispensers for your workers.
79 | A *fine-grained* parallel program needs lots of communication/synchronisation
80 | between tasks,
81 | in contrast with a *coarse-grained* one that barely communicates at all.
82 | An embarrassingly/massively parallel problem is one where all tasks can be
83 | executed completely independently of each other (no communication required).
84 | 
85 | ### Processes vs. threads
86 | 
87 | Our example painters have two arms, and could potentially paint with both arms
88 | at the same time.
89 | Technically, the work being done by each arm is the work of a single painter.
90 | 
91 | In this example, each painter would be a process (an individual instance of a
92 | program).
93 | The painters' arms represent the "threads" of a program.
94 | Threads are separate points of execution within a single program,
95 | and can be executed either synchronously or asynchronously.
96 | 
97 | ---
98 | 
99 | ## How does parallelization work in practice?
100 | 
101 | These concepts translate into several different types of parallel computing,
102 | each good at certain types of tasks:
103 | 
104 | ### Asynchronous programming
105 | 
106 | Oftentimes, certain computations involve a lot of waiting.
107 | Perhaps you sent some information to a webserver on the internet and are
108 | waiting for a response.
109 | In this case, if you needed to make lots of requests over the internet,
110 | your program would spend ages just waiting to hear back.
111 | In this scenario, it would be very advantageous to fire off a bunch of requests
112 | to the internet and then, instead of waiting on each one,
113 | check back periodically to see whether each request has completed before
114 | processing it.
115 | 
116 | This is an example of asynchronous programming.
117 | A single thread manages many tasks at the same time,
118 | periodically checking on each one,
119 | and only taking an action once some external task has completed.
120 | Asynchronous programming is very important when programming for the web,
121 | where lots of waiting around happens.
122 | To do this in Python, you'd typically want to use something like the
123 | [asyncio](https://docs.python.org/3/library/asyncio.html) module.
124 | It's not very useful for scientific programming, because only one core/thread
125 | is typically doing any work —
126 | a normal program that doesn't run in parallel at all would be just as fast!
127 | 
128 | ### Shared memory programming
129 | 
130 | Shared memory programming means using the resources on a single computer,
131 | and having multiple threads or processes work together on a single copy of a
132 | dataset in memory.
133 | This is the most common form of parallel programming and is relatively easy to
134 | do.
135 | We will cover basic shared-memory programming in Python using the
136 | `multiprocess` / `multiprocessing` packages in this lesson.
137 | 
138 | ### Distributed memory programming
139 | 
140 | Shared memory programming, although very useful, has one major limitation:
141 | we can only use the number of CPU cores present on a single computer.
142 | If we want to increase speed any further, we need a bigger computer.
143 | Big computers cost lots and lots of money.
144 | Wouldn't it be more efficient to just use a lot of smaller,
145 | cheap computers instead?
146 | 
147 | This is the rationale behind distributed memory programming —
148 | a task is farmed out to a large number of computers,
149 | each of which tackles an individual portion of the problem.
150 | Results are communicated back and forth between compute nodes.
151 | 
152 | This is most advantageous when a dataset is too large to fit into a computer's
153 | memory (depending on the hardware you have access to, this can be anything from
154 | several dozen gigabytes to several terabytes).
155 | Frameworks like [MPI](https://www.open-mpi.org/),
156 | [Hadoop](http://hadoop.apache.org/), and [Spark](https://spark.apache.org/)
157 | see widespread use for these types of problems
158 | (and are not covered in this lesson).
159 | 
160 | ### Serial farming
161 | 
162 | In many cases, we'll need to repeat the same computation multiple times.
163 | Maybe we need to run the same set of steps on 10 different samples.
164 | There doesn't need to be any communication at all,
165 | and each task is completely independent of the others.
166 | 
167 | In this scenario, why bother with all of these fancy parallel programming
168 | techniques? Let's just start the same program 10 times, on 10 different
169 | datasets, on 10 different computers.
170 | The work is still happening in parallel, and we didn't need to change anything
171 | about our program to achieve this.
172 | As an extra benefit, this works the same for every program, regardless of what
173 | it does or what language it was written in.
174 | 
175 | This technique is known as serial farming, and is the primary focus of this
176 | lesson.
177 | We will learn to use [Snakemake](http://snakemake.readthedocs.io/en/stable/) to
178 | coordinate the parallel launch of dozens, if not hundreds or thousands, of
179 | independent tasks.
180 | 
181 | -------------------------------------------------
182 | 
183 | ## Parallelization in Python
184 | 
185 | Python does not thread very well.
186 | Specifically, Python has a very nasty drawback known as a Global Interpreter
187 | Lock (GIL).
188 | The GIL ensures that only one compute thread can run at a time.
189 | This makes multithreaded processing very difficult.
190 | Instead, the best way to go about doing things is to use multiple independent
191 | processes to perform the computations.
192 | This method sidesteps the GIL,
193 | as each individual process has its own GIL that does not block the others.
194 | This is typically done using the `multiprocessing` module.
195 | 
196 | Before we start, we will need to know the number of CPU cores in our computer.
197 | To get the number of cores, we can use the `psutil` module.
198 | We are using `psutil` instead of `multiprocessing` because `psutil` counts
199 | cores instead of threads.
200 | Long story short, cores are the actual computation units;
201 | threads allow additional multitasking using the cores you have.
202 | For heavy compute jobs, you are generally interested in cores.
203 | 
204 | ```
205 | import psutil
206 | # logical=True counts threads, but we are interested in cores
207 | psutil.cpu_count(logical=False)
208 | ```
209 | {: .language-python}
210 | ```
211 | 8
212 | ```
213 | {: .output}
214 | 
215 | Using this number, we can create a pool of worker processes with which to
216 | parallelize our jobs:
217 | 
218 | ```
219 | from multiprocessing import Pool
220 | pool = Pool(psutil.cpu_count(logical=False))
221 | ```
222 | {: .language-python}
223 | 
224 | The `pool` object gives us a set of parallel workers we can
225 | use to parallelize our calculations.
226 | In particular, there is a map function
227 | (with identical syntax to the `map()` function used earlier)
228 | that runs a workflow in parallel.
229 | 
230 | Let's try `map()` out with a test function that just calls `time.sleep()`.
231 | 
232 | ```
233 | import time
234 | 
235 | def sleeping(arg):
236 |     time.sleep(0.1)
237 | 
238 | %timeit list(map(sleeping, range(24)))
239 | ```
240 | {: .language-python}
241 | ```
242 | 1 loop, best of 3: 2.4 s per loop
243 | ```
244 | {: .output}
245 | 
246 | Now let's try it in parallel:
247 | 
248 | ```
249 | %timeit pool.map(sleeping, range(24))
250 | ```
251 | {: .language-python}
252 | 
253 | If you are using a Jupyter notebook, this will fail:
254 | 
255 | ```
256 | # more errors omitted
257 | AttributeError: Can't get attribute 'sleeping' on <module '__main__'>
258 | AttributeError: Can't get attribute 'sleeping' on <module '__main__'>
259 | ```
260 | {: .error}
261 | 
262 | > ## Differences between Jupyter notebooks and the Python interpreter
263 | > 
264 | > The last command may have succeeded if you are running in a Python or IPython
265 | > shell. This is due to a difference in the way Jupyter executes user-defined
266 | > functions:
267 | > 
268 | > ```
269 | > 1 loop, best of 3: 302 ms per loop
270 | > ```
271 | > {: .output}
272 | > 
273 | > Jupyter notebooks define user functions under a special Python module called
274 | > `__main__`.
275 | > This does not work with `multiprocessing`.
276 | > However, these issues are not limited to Jupyter notebooks —
277 | > a similar error will occur if you use a lambda function instead:
278 | > 
279 | > ```
280 | > pool.map(lambda x: time.sleep(0.1), range(24))
281 | > ```
282 | > {: .language-python}
283 | > ```
284 | > ---------------------------------------------------------------------------
285 | > PicklingError                             Traceback (most recent call last)
286 | > in ()
287 | > ----> 1 pool.map(lambda x: time.sleep(0.1), range(24))
288 | > 
289 | > # more errors omitted
290 | > ```
291 | > {: .error}
292 | {: .callout}
293 | 
294 | The `multiprocessing` module has a major limitation:
295 | it only accepts certain functions, and only in certain situations.
296 | For instance, class methods, lambdas, and functions defined in `__main__`
297 | won't work.
298 | This is due to the way Python "pickles" (read: serialises) data
299 | and sends it to the worker processes.
300 | "Pickling" simply can't handle a lot of different types of Python objects.
301 | 
302 | Fortunately, there is a fork of the `multiprocessing` module called
303 | `multiprocess` that works just fine (`pip install --user multiprocess`).
304 | `multiprocess` uses `dill` instead of `pickle` to serialise Python objects
305 | (read: send your data and functions to the Python workers),
306 | and does not suffer the same issues.
307 | Usage is identical:
308 | 
309 | ```
310 | # shut down the old workers
311 | pool.close()
312 | 
313 | from multiprocess import Pool
314 | pool = Pool(8)
315 | %timeit pool.map(lambda x: time.sleep(0.1), range(24))
316 | pool.close()
317 | ```
318 | {: .language-python}
319 | ```
320 | 1 loop, best of 3: 309 ms per loop
321 | ```
322 | {: .output}
323 | 
324 | This is a general purpose parallelization recipe that you can use for your
325 | Python projects.
326 | 
327 | ```
328 | # make sure to always use multiprocess
329 | from multiprocess import Pool
330 | # start your parallel workers at the beginning of your script
331 | pool = Pool(number_of_cores)
332 | 
333 | # execute one or more computations in parallel
334 | result = pool.map(your_function, something_to_iterate_over)
335 | result2 = pool.map(another_function, more_stuff_to_iterate_over)
336 | 
337 | # turn off your parallel workers at the end of your script
338 | pool.close()
339 | ```
340 | {: .language-python}
341 | 
342 | Parallel workers (with their own copy of everything) are created, data are sent
343 | to these workers, and then results are combined back together again.
344 | There is also an optional `chunksize` argument (for `pool.map()`) that lets you
345 | control how big each chunk of data is before it's sent off to each worker.
346 | A larger chunk size means that less time is spent shuttling data to and from
347 | workers, and will be more useful if you have a large number of very fast
348 | computations to perform.
349 | When each iteration takes a very long time to run, you will want to use a
350 | smaller chunk size.
351 | -------------------------------------------------------------------------------- /_episodes/07-snakemake-intro.md: --------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Snakemake"
3 | teaching: 15
4 | exercises: 15
5 | questions:
6 | - "How can I make my results easier to reproduce?"
7 | objectives:
8 | - "Understand our example problem."
9 | keypoints:
10 | - "Bash scripts are not an efficient way of storing a workflow."
11 | - "Snakemake is one method of managing a complex computational workflow."
12 | ---
13 | 
14 | Let's imagine that we're interested in
15 | seeing the frequency of various words in various books.
16 | 
17 | We've compiled our raw data (i.e. the books we want to analyse)
18 | and have prepared several Python scripts that together make up our
19 | analysis pipeline.
20 | 
21 | Let's take a quick look at one of the books using the command:
22 | 
23 | ```
24 | $ head books/isles.txt
25 | ```
26 | {: .language-bash}
27 | 
28 | By default, `head` displays the first 10 lines of the specified file.
29 | 
30 | ```
31 | A JOURNEY TO THE WESTERN ISLANDS OF SCOTLAND
32 | 
33 | 
34 | INCH KEITH
35 | 
36 | 
37 | I had desired to visit the Hebrides, or Western Islands of Scotland, so
38 | long, that I scarcely remember how the wish was originally excited; and
39 | was in the Autumn of the year 1773 induced to undertake the journey, by
40 | finding in Mr. Boswell a companion, whose acuteness would help my
41 | ```
42 | {: .output}
43 | 
44 | Our directory has the Python scripts and data files we
45 | will be working with:
46 | 
47 | ```
48 | |- books
49 | |  |- abyss.txt
50 | |  |- isles.txt
51 | |  |- last.txt
52 | |  |- LICENSE_TEXTS.md
53 | |  |- sierra.txt
54 | |- plotcount.py
55 | |- wordcount.py
56 | |- zipf_test.py
57 | ```
58 | {: .output}
59 | 
60 | The first step is to count the frequency of each word in a book.
61 | The first argument (`books/isles.txt`) to `wordcount.py` is the file to analyse,
62 | and the last argument (`isles.dat`) specifies the output file to write.
63 | 
64 | ```
65 | $ python wordcount.py books/isles.txt isles.dat
66 | ```
67 | {: .language-bash}
68 | 
69 | Let's take a quick peek at the result.
70 | 71 | ``` 72 | $ head -5 isles.dat 73 | ``` 74 | {: .language-bash} 75 | 76 | This shows us the top 5 lines in the output file: 77 | 78 | ``` 79 | the 3822 6.7371760973 80 | of 2460 4.33632998414 81 | and 1723 3.03719372466 82 | to 1479 2.60708619778 83 | a 1308 2.30565838181 84 | ``` 85 | {: .output} 86 | 87 | We can see that the file consists of one row per word. 88 | Each row shows the word itself, the number of occurrences of that 89 | word, and the number of occurrences as a percentage of the total 90 | number of words in the text file. 91 | 92 | We can do the same thing for a different book: 93 | 94 | ``` 95 | $ python wordcount.py books/abyss.txt abyss.dat 96 | $ head -5 abyss.dat 97 | ``` 98 | {: .language-bash} 99 | ``` 100 | the 4044 6.35449402891 101 | and 2807 4.41074795726 102 | of 1907 2.99654305468 103 | a 1594 2.50471401634 104 | to 1515 2.38057825267 105 | ``` 106 | {: .output} 107 | 108 | Let's visualise the results. 109 | The script `plotcount.py` reads in a data file and plots the 10 most 110 | frequently occurring words as a text-based bar plot: 111 | 112 | ``` 113 | $ python plotcount.py isles.dat ascii 114 | ``` 115 | {: .language-bash} 116 | ``` 117 | the ######################################################################## 118 | of ############################################## 119 | and ################################ 120 | to ############################ 121 | a ######################### 122 | in ################### 123 | is ################# 124 | that ############ 125 | by ########### 126 | it ########### 127 | ``` 128 | {: .output} 129 | 130 | `plotcount.py` can also show the plot graphically: 131 | 132 | ``` 133 | $ python plotcount.py isles.dat show 134 | ``` 135 | {: .language-bash} 136 | 137 | Close the window to exit the plot. 138 | 139 | `plotcount.py` can also create the plot as an image file (e.g. a PNG file): 140 | 141 | ``` 142 | $ python plotcount.py isles.dat isles.png 143 | ``` 144 | {: .language-bash} 145 | 146 | Finally, let's test Zipf's law for these books: 147 | 148 | ``` 149 | $ python zipf_test.py abyss.dat isles.dat 150 | ``` 151 | {: .language-bash} 152 | ``` 153 | Book First Second Ratio 154 | abyss 4044 2807 1.44 155 | isles 3822 2460 1.55 156 | ``` 157 | {: .output} 158 | 159 | > ## Zipf's Law 160 | > 161 | > [Zipf's Law](https://en.wikipedia.org/wiki/Zipf%27s_law) is an [empirical 162 | > law](https://en.wikipedia.org/wiki/Empirical_law) formulated using 163 | > [mathematical statistics]( 164 | > https://en.wikipedia.org/wiki/Mathematical_statistics) that refers to the 165 | > fact that many types of data studied in the physical and social sciences can 166 | > be approximated with a Zipfian distribution, one of a family of related 167 | > discrete [power law](https://en.wikipedia.org/wiki/Power_law) 168 | > [probability distributions]( 169 | > https://en.wikipedia.org/wiki/Probability_distribution). 170 | > 171 | > Zipf's law was originally formulated in terms of [quantitative 172 | > linguistics](https://en.wikipedia.org/wiki/Quantitative_linguistics), stating 173 | > that given some [corpus](https://en.wikipedia.org/wiki/Text_corpus) of 174 | > [natural language](https://en.wikipedia.org/wiki/Natural_language) 175 | > utterances, the frequency of any word is [inversely proportional]( 176 | > https://en.wikipedia.org/wiki/Inversely_proportional) to its 177 | > rank in the [frequency table](https://en.wikipedia.org/wiki/Frequency_table). 
178 | > For example, in the [Brown Corpus](
179 | > https://en.wikipedia.org/wiki/Brown_Corpus) of American English text,
180 | > the word *the* is the most frequently occurring word, and by itself accounts
181 | > for nearly 7% of all word occurrences (69,971 out of slightly over 1
182 | > million). True to Zipf's Law, the second-place word *of* accounts for slightly
183 | > over 3.5% of words (36,411 occurrences), followed by *and* (28,852). Only 135
184 | > vocabulary items are needed to account for half the Corpus.
185 | > 
186 | > *Source:* [Wikipedia](https://en.wikipedia.org/wiki/Zipf%27s_law)
187 | {: .callout}
188 | 
189 | Together these scripts implement a common workflow:
190 | 
191 | 1. Read a data file.
192 | 2. Perform an analysis on this data file.
193 | 3. Write the analysis results to a new file.
194 | 4. Plot a graph of the analysis results.
195 | 5. Save the graph as an image, so we can put it in a paper.
196 | 6. Make a summary table of the analyses.
197 | 
198 | Running `wordcount.py` and `plotcount.py` at the shell prompt, as we
199 | have been doing, is fine for one or two files. If, however, we had 5
200 | or 10 or 20 text files,
201 | or if the number of steps in the pipeline were to expand, this could turn into
202 | a lot of work.
203 | Plus, no one wants to sit and wait for a command to finish, even just for 30
204 | seconds.
205 | 
206 | The most common solution to the tedium of data processing is to write
207 | a shell script that runs the whole pipeline from start to finish.
208 | 
209 | Using your text editor of choice (e.g. nano), add the following to a new file
210 | named `run_pipeline.sh`.
211 | 
212 | ```
213 | # USAGE: bash run_pipeline.sh
214 | # to produce plots for isles and abyss
215 | # and the summary table for the Zipf's law tests
216 | 
217 | python wordcount.py books/isles.txt isles.dat
218 | python wordcount.py books/abyss.txt abyss.dat
219 | 
220 | python plotcount.py isles.dat isles.png
221 | python plotcount.py abyss.dat abyss.png
222 | 
223 | # Generate summary table
224 | python zipf_test.py abyss.dat isles.dat > results.txt
225 | ```
226 | {: .language-bash}
227 | 
228 | Run the script and check that the output is the same as before:
229 | 
230 | ```
231 | $ bash run_pipeline.sh
232 | $ cat results.txt
233 | ```
234 | {: .language-bash}
235 | 
236 | This shell script solves several problems in computational reproducibility:
237 | 
238 | 1. It explicitly documents our pipeline, making communication with colleagues
239 |    (and our future selves) more efficient.
240 | 2. It allows us to type a single command, `bash run_pipeline.sh`, to
241 |    reproduce the full analysis.
242 | 3. It prevents us from _repeating_ typos or mistakes.
243 |    You might not get it right the first time, but once you fix something
244 |    it'll stay fixed.
245 | 
246 | Despite these benefits, it has a few shortcomings.
247 | 
248 | Let's adjust the width of the bars in our plot produced by `plotcount.py`.
249 | 
250 | Edit `plotcount.py` so that the bars are 0.8 units wide instead of 1 unit.
251 | (Hint: replace `width = 1.0` with `width = 0.8` in the definition of
252 | `plot_word_counts`.)
253 | 
254 | Now we want to recreate our figures.
255 | We _could_ just `bash run_pipeline.sh` again.
256 | That would work, but it could also be a big pain if counting words takes
257 | more than a few seconds.
258 | The word counting routine hasn't changed; we shouldn't need to recreate
259 | those files.
260 | 
261 | Alternatively, we could manually rerun the plotting for each word-count file.
262 | (Experienced shell scripters can make this easier on themselves using a
263 | for-loop.)
264 | 
265 | ```
266 | $ for book in abyss isles; do python plotcount.py $book.dat $book.png; done
267 | ```
268 | {: .language-bash}
269 | 
270 | With this approach, however,
271 | we don't get many of the benefits of having a shell script in the first place.
272 | 
273 | Another popular option is to comment out a subset of the lines in
274 | `run_pipeline.sh`:
275 | 
276 | ```
277 | # USAGE: bash run_pipeline.sh
278 | # to produce plots for isles and abyss
279 | # and the summary table
280 | 
281 | # These lines are commented out because they don't need to be rerun.
282 | #python wordcount.py books/isles.txt isles.dat
283 | #python wordcount.py books/abyss.txt abyss.dat
284 | 
285 | python plotcount.py isles.dat isles.png
286 | python plotcount.py abyss.dat abyss.png
287 | 
288 | # This line is also commented out because it doesn't need to be rerun.
289 | # python zipf_test.py abyss.dat isles.dat > results.txt
290 | ```
291 | {: .language-bash}
292 | 
293 | Then, we would run our modified shell script using `bash run_pipeline.sh`.
294 | 
295 | But commenting out these lines, and subsequently un-commenting them,
296 | can be a hassle and a source of errors in complicated pipelines.
297 | What happens if we have hundreds of input files?
298 | No one wants to enter the same command a hundred times,
299 | and then edit the result.
300 | 
301 | What we really want is an executable _description_ of our pipeline that
302 | allows software to do the tricky part for us:
303 | figuring out what tasks need to be run where and when,
304 | then performing those tasks for us.
305 | 
306 | ## What is Snakemake and why are we using it?
307 | 
308 | There are many different tools that researchers use to automate this type of
309 | work.
310 | Snakemake is a very popular tool, and the one we have selected for this
311 | tutorial.
312 | There are several reasons this tool was chosen:
313 | 
314 | * It’s free, open-source, and installs in about 5 seconds flat via `pip`.
315 | 
316 | * Snakemake works cross-platform (Windows, MacOS, Linux) and is compatible with
317 |   all HPC schedulers. More importantly, the same workflow will work and scale
318 |   appropriately whether it’s run on a laptop or a cluster, without
319 |   modification.
320 | 
321 | * Snakemake uses pure Python syntax. There is no tool-specific language to
322 |   learn, as there is in GNU Make, Nextflow, WDL, etc. Even if students end up
323 |   not liking Snakemake, you’ve still taught them how to program in Python at
324 |   the end of the day.
325 | 
326 | * Anything that you can do in Python, you can do with Snakemake (since you can
327 |   pretty much execute arbitrary Python code anywhere).
328 | 
329 | * Snakemake was written to be as similar to GNU Make as possible. Users already
330 |   familiar with Make will find Snakemake quite easy to use.
331 | 
332 | * It’s easy. You can (hopefully!) learn Snakemake in an afternoon!
333 | 
334 | The rest of these lessons aim to teach you how to use Snakemake by example.
335 | Our goal is to automate our example workflow, and have it do everything for us
336 | in parallel regardless of where and how it is run (and have it be
337 | reproducible!).
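To give you a taste of what such an executable description looks like, here is
a sketch of a single Snakemake rule for the word-counting step of our pipeline.
Don't worry about the syntax yet; we will build up rules like this one, step by
step, in the following episodes.

```
# a rule names its output, the input it depends on, and the action linking them
rule count_words:
    input:  'books/isles.txt'
    output: 'isles.dat'
    shell:  'python wordcount.py books/isles.txt isles.dat'
```
{: .language-make}

Given a description like this, Snakemake can work out on its own whether
`isles.dat` is missing or out of date, and run the action only when needed.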
338 | -------------------------------------------------------------------------------- /_episodes/09-wildcards.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Wildcards" 3 | teaching: 30 4 | exercises: 15 5 | questions: 6 | - "How can I abbreviate the rules in my pipeline?" 7 | objectives: 8 | - "Use snakemake wildcards to simplify our rules." 9 | - "Output files are a product not only of input files but of the scripts or 10 | code that created the output files." 11 | keypoints: 12 | - "Use `{output}` to refer to the output of the current rule." 13 | - "Use `{input}` to refer to the dependencies of the current rule." 14 | - "You can use Python indexing to retrieve individual outputs and inputs 15 | (example: `{input[0]}`)" 16 | - "Wildcards can be named (example: `{input.file1}`)." 17 | --- 18 | 19 | After the exercise at the end of the previous episode, 20 | our Snakefile looked like this: 21 | 22 | ``` 23 | # generate summary table 24 | rule zipf_test: 25 | input: 'abyss.dat', 'last.dat', 'isles.dat' 26 | output: 'results.txt' 27 | shell: 'python zipf_test.py abyss.dat isles.dat last.dat > results.txt' 28 | 29 | rule dats: 30 | input: 'isles.dat', 'abyss.dat', 'last.dat' 31 | 32 | # delete everything so we can re-run things 33 | rule clean: 34 | shell: 'rm -f *.dat results.txt' 35 | 36 | # count words in one of our "books" 37 | rule count_words: 38 | input: 'books/isles.txt' 39 | output: 'isles.dat' 40 | shell: 'python wordcount.py books/isles.txt isles.dat' 41 | 42 | rule count_words_abyss: 43 | input: 'books/abyss.txt' 44 | output: 'abyss.dat' 45 | shell: 'python wordcount.py books/abyss.txt abyss.dat' 46 | 47 | rule count_words_last: 48 | input: 'books/last.txt' 49 | output: 'last.dat' 50 | shell: 'python wordcount.py books/last.txt last.dat' 51 | ``` 52 | {: .language-make} 53 | 54 | Our Snakefile has a lot of duplication. For example, the names of text 55 | files and data files are repeated in many places throughout the 56 | Snakefile. Snakefiles are a form of code and, in any code, repeated code 57 | can lead to problems (e.g. we rename a data file in one part of the 58 | Snakefile but forget to rename it elsewhere). 59 | 60 | > ## D.R.Y. (Don't Repeat Yourself) 61 | > 62 | > In many programming languages, the bulk of the language features are 63 | > there to allow the programmer to describe long-winded computational 64 | > routines as short, expressive, beautiful code. Features in Python 65 | > or R or Java, such as user-defined variables and functions are useful in 66 | > part because they mean we don't have to write out (or think about) 67 | > all of the details over and over again. This good habit of writing 68 | > things out only once is known as the "Don't Repeat Yourself" 69 | > principle or D.R.Y. 70 | {: .callout} 71 | 72 | Let us set about removing some of the repetition from our Snakefile. 
73 | In our `zipf_test` rule, we duplicate the data file names and the
74 | name of the results file:
75 | 
76 | ```
77 | rule zipf_test:
78 |     input:
79 |         'abyss.dat',
80 |         'last.dat',
81 |         'isles.dat'
82 |     output: 'results.txt'
83 |     shell: 'python zipf_test.py abyss.dat isles.dat last.dat > results.txt'
84 | ```
85 | {: .language-make}
86 | 
87 | Looking at the results file name first, we can replace it in the action
88 | with `{output}`:
89 | 
90 | ```
91 | rule zipf_test:
92 |     input: 'abyss.dat', 'last.dat', 'isles.dat'
93 |     output: 'results.txt'
94 |     shell: 'python zipf_test.py abyss.dat isles.dat last.dat > {output}'
95 | ```
96 | {: .language-make}
97 | 
98 | `{output}` is a Snakemake [wildcard](
99 | {{ page.root }}/reference/#automatic-variable) which is equivalent to the value
100 | we specified for the `output` section of the rule.
101 | 
102 | We can replace the dependencies in the action with `{input}`:
103 | 
104 | ```
105 | rule zipf_test:
106 |     input: 'abyss.dat', 'last.dat', 'isles.dat'
107 |     output: 'results.txt'
108 |     shell: 'python zipf_test.py {input} > {output}'
109 | ```
110 | {: .language-make}
111 | 
112 | `{input}` is another wildcard which means 'all the dependencies of the current
113 | rule'. Again, when Snakemake is run, it will replace this variable with the
114 | dependencies.
115 | 
116 | Let's update our text files and re-run our rule:
117 | 
118 | ```
119 | $ touch books/*.txt
120 | $ snakemake results.txt
121 | ```
122 | {: .language-bash}
123 | 
124 | We get:
125 | 
126 | ```
127 | Provided cores: 1
128 | Rules claiming more threads will be scaled down.
129 | Job counts:
130 |         count   jobs
131 |         1       count_words
132 |         1       count_words_abyss
133 |         1       count_words_last
134 |         1       zipf_test
135 |         4
136 | 
137 | rule count_words_last:
138 |     input: books/last.txt
139 |     output: last.dat
140 |     jobid: 1
141 | 
142 | Finished job 1.
143 | 1 of 4 steps (25%) done
144 | 
145 | rule count_words_abyss:
146 |     input: books/abyss.txt
147 |     output: abyss.dat
148 |     jobid: 2
149 | 
150 | Finished job 2.
151 | 2 of 4 steps (50%) done
152 | 
153 | rule count_words:
154 |     input: books/isles.txt
155 |     output: isles.dat
156 |     jobid: 3
157 | 
158 | Finished job 3.
159 | 3 of 4 steps (75%) done
160 | 
161 | rule zipf_test:
162 |     input: abyss.dat, last.dat, isles.dat
163 |     output: results.txt
164 |     jobid: 0
165 | 
166 | Finished job 0.
167 | 4 of 4 steps (100%) done
168 | ```
169 | {: .output}
170 | 
171 | 
172 | > ## Update Dependencies
173 | > 
174 | > What will happen if you now execute:
175 | > 
176 | > ```
177 | > $ touch *.dat
178 | > $ snakemake results.txt
179 | > ```
180 | > {: .language-bash}
181 | > 
182 | > 1. nothing
183 | > 2. all files recreated
184 | > 3. only `.dat` files recreated
185 | > 4. only `results.txt` recreated
186 | > 
187 | > > ## Solution
188 | > > `4.` Only `results.txt` recreated.
189 | > > 
190 | > > The rules for `*.dat` are not executed because their corresponding
191 | > > `.txt` files haven't been modified.
192 | > > 
193 | > > If you run:
194 | > > 
195 | > > ```
196 | > > $ touch books/*.txt
197 | > > $ snakemake results.txt
198 | > > ```
199 | > > {: .language-bash}
200 | > > 
201 | > > you will find that the `.dat` files as well as `results.txt` are recreated.
202 | > {: .solution}
203 | {: .challenge}
204 | 
205 | As we saw, `{input}` means 'all the dependencies of the current rule'.
206 | This works well for `results.txt` as its action treats all the dependencies the
207 | same — as the input for the `zipf_test.py` script.
208 | 
209 | > ## Rewrite `.dat` rules to use wildcards
210 | > 
211 | > Rewrite each `.dat` rule to use the `{input}` and `{output}` wildcards.
212 | {: .challenge}
213 | 
214 | ## Handling dependencies differently
215 | 
216 | For many rules, we may want to treat some dependencies
217 | differently. For example, our rules for `.dat` use their first (and
218 | only) dependency specifically as the input file to `wordcount.py`. If
219 | we add additional dependencies (as we will soon do), then we don't want
220 | these to be passed as input files to `wordcount.py`, as it expects only
221 | one input file to be named when it is invoked.
222 | 
223 | Snakemake provides several solutions to this.
224 | Depending on what we want to do,
225 | it's possible to both index and name our wildcards.
226 | 
227 | Suppose we want to add `wordcount.py` as a dependency of each data file.
228 | In this case, we can use `{input[0]}` to refer to the first dependency,
229 | and `{input[1]}` to refer to the second.
230 | 
231 | ```
232 | rule count_words:
233 |     input: 'wordcount.py', 'books/isles.txt'
234 |     output: 'isles.dat'
235 |     shell: 'python {input[0]} {input[1]} {output}'
236 | ```
237 | {: .language-make}
238 | 
239 | Alternatively, we can name our dependencies.
240 | 
241 | ```
242 | rule count_words_abyss:
243 |     input:
244 |         wc='wordcount.py',
245 |         book='books/abyss.txt'
246 |     output: 'abyss.dat'
247 |     shell: 'python {input.wc} {input.book} {output}'
248 | ```
249 | {: .language-make}
250 | 
251 | Let's mark `wordcount.py` as updated, and re-run the pipeline.
252 | 
253 | ```
254 | $ touch wordcount.py
255 | $ snakemake
256 | ```
257 | {: .language-bash}
258 | 
259 | ```
260 | Provided cores: 1
261 | Rules claiming more threads will be scaled down.
262 | Job counts:
263 |         count   jobs
264 |         1       count_words
265 |         1       count_words_abyss
266 |         1       zipf_test
267 |         3
268 | 
269 | rule count_words_abyss:
270 |     input: wordcount.py, books/abyss.txt
271 |     output: abyss.dat
272 |     jobid: 2
273 | 
274 | Finished job 2.
275 | 1 of 3 steps (33%) done
276 | 
277 | rule count_words:
278 |     input: wordcount.py, books/isles.txt
279 |     output: isles.dat
280 |     jobid: 1
281 | 
282 | Finished job 1.
283 | 2 of 3 steps (67%) done
284 | 
285 | rule zipf_test:
286 |     input: abyss.dat, last.dat, isles.dat
287 |     output: results.txt
288 |     jobid: 0
289 | 
290 | Finished job 0.
291 | 3 of 3 steps (100%) done
292 | ```
293 | {: .output}
294 | 
295 | Notice how `last.dat` (which does not depend on `wordcount.py`) is not rebuilt.
296 | Intuitively, we should also add `wordcount.py` as a dependency of
297 | `results.txt`, as the final table should be rebuilt when we remake the
298 | `.dat` files. However, it turns out we don't have to! Let's see what
299 | happens to `results.txt` when we update `wordcount.py`:
300 | 
301 | ```
302 | $ touch wordcount.py
303 | $ snakemake results.txt
304 | ```
305 | {: .language-bash}
306 | 
307 | then we get:
308 | 
309 | ```
310 | Provided cores: 1
311 | Rules claiming more threads will be scaled down.
312 | Job counts:
313 |         count   jobs
314 |         1       count_words
315 |         1       count_words_abyss
316 |         1       zipf_test
317 |         3
318 | 
319 | rule count_words_abyss:
320 |     input: wordcount.py, books/abyss.txt
321 |     output: abyss.dat
322 |     jobid: 2
323 | 
324 | Finished job 2.
325 | 1 of 3 steps (33%) done
326 | 
327 | rule count_words:
328 |     input: wordcount.py, books/isles.txt
329 |     output: isles.dat
330 |     jobid: 1
331 | 
332 | Finished job 1.
333 | 2 of 3 steps (67%) done
334 | 
335 | rule zipf_test:
336 |     input: abyss.dat, last.dat, isles.dat
337 |     output: results.txt
338 |     jobid: 0
339 | 
340 | Finished job 0.
341 | 3 of 3 steps (100%) done
342 | ```
343 | {: .output}
344 | 
345 | The whole pipeline is triggered, even the creation of the
346 | `results.txt` file! To understand this, note that according to the
347 | dependency figure, `results.txt` depends on the `.dat` files. The
348 | update of `wordcount.py` triggers an update of the `*.dat`
349 | files. Thus, `snakemake` sees that the dependencies (the `.dat` files) are
350 | newer than the target file (`results.txt`), and so it recreates
351 | `results.txt`. This is an example of the power of `snakemake`: updating a
352 | subset of the files in the pipeline triggers rerunning the appropriate
353 | downstream steps.
354 | 
355 | > ## Updating One Input File
356 | > 
357 | > What will happen if you now execute:
358 | > 
359 | > ```
360 | > touch books/last.txt
361 | > snakemake results.txt
362 | > ```
363 | > {: .language-bash}
364 | > 
365 | > 1. only `last.dat` is recreated
366 | > 2. all `.dat` files are recreated
367 | > 3. only `last.dat` and `results.txt` are recreated
368 | > 4. all `.dat` and `results.txt` are recreated
369 | {: .challenge}
370 | 
371 | > ## More dependencies...
372 | > 
373 | > Add `zipf_test.py` as a dependency of `results.txt`.
374 | > Which method do you prefer here, indexing or named input files?
375 | > Yes, this will be clunky, but we'll fix that part later!
376 | > Remember that you can do a dry run with `snakemake -n -p`!
377 | {: .challenge}
378 | -------------------------------------------------------------------------------- /_episodes/10-patterns.md: --------------------------------------------------------------------------------
1 | ---
2 | title: "Pattern Rules"
3 | teaching: 15
4 | exercises: 0
5 | questions:
6 | - "How can I define rules to operate on similar files?"
7 | objectives:
8 | - "Write Snakemake pattern rules."
9 | keypoints:
10 | - "Use any named wildcard (`{some_name}`) as a placeholder in targets and
11 |   dependencies."
12 | ---
13 | 
14 | Our Snakefile still has a ton of repeated content.
15 | The rules for each `.dat` file all do the same thing, for the most part.
16 | We can replace these rules with a single [pattern rule](
17 | {{ page.root }}/reference/#pattern-rule) which can be used to build any
18 | `.dat` file from a `.txt` file in `books/`:
19 | 
20 | ```
21 | rule count_words:
22 |     input:
23 |         wc='wordcount.py',
24 |         book='books/{file}.txt'
25 |     output: '{file}.dat'
26 |     shell: 'python {input.wc} {input.book} {output}'
27 | ```
28 | {: .language-make}
29 | 
30 | `{file}` is another arbitrary [wildcard]({{ page.root }}/reference/#wildcard)
31 | that we can use as a placeholder for any generic book to analyse.
32 | Note that we don't have to use `{file}` as the name of our wildcard —
33 | it can be anything we want!
34 | 
35 | This rule can be interpreted as:
36 | "In order to build a file named `something.dat` (the output),
37 | find a file named `books/something.txt` (the input),
38 | and run `wordcount.py input output`."
39 | 
40 | ```
41 | $ snakemake clean
42 | # use the -p option to show that it is running things correctly!
43 | $ snakemake -p dats
44 | ```
45 | {: .language-bash}
46 | 
47 | We should see the same output as before.
48 | Note that we can still use snakemake to build individual `.dat` targets as
49 | before, and that our new rule will work no matter what stem is being matched.
50 | 
51 | ```
52 | $ snakemake -p sierra.dat
53 | ```
54 | {: .language-bash}
55 | 
56 | which gives the output below:
57 | 
58 | ```
59 | Provided cores: 1
60 | Rules claiming more threads will be scaled down.
61 | Job counts:
62 |         count   jobs
63 |         1       count_words
64 |         1
65 | 
66 | rule count_words:
67 |     input: wordcount.py, books/sierra.txt
68 |     output: sierra.dat
69 |     jobid: 0
70 |     wildcards: file=sierra
71 | 
72 | python wordcount.py books/sierra.txt sierra.dat
73 | Finished job 0.
74 | 1 of 1 steps (100%) done
75 | ```
76 | {: .output}
77 | 
78 | > ## Using wildcards
79 | > 
80 | > Our arbitrary wildcards like `{file}` can only be used in
81 | > `input:` and `output:` fields. They cannot be used in actions.
82 | {: .callout}
83 | 
84 | Our Snakefile is now much shorter and cleaner:
85 | 
86 | ```
87 | # generate summary table
88 | rule zipf_test:
89 |     input: 'zipf_test.py', 'abyss.dat', 'last.dat', 'isles.dat'
90 |     output: 'results.txt'
91 |     shell: 'python {input[0]} {input[1]} {input[2]} {input[3]} > {output}'
92 | 
93 | rule dats:
94 |     input:
95 |         'isles.dat', 'abyss.dat', 'last.dat'
96 | 
97 | # delete everything so we can re-run things
98 | rule clean:
99 |     shell: 'rm -f *.dat results.txt'
100 | 
101 | # count words in one of our "books"
102 | rule count_words:
103 |     input:
104 |         wc='wordcount.py',
105 |         book='books/{file}.txt'
106 |     output: '{file}.dat'
107 |     shell: 'python {input.wc} {input.book} {output}'
108 | ```
109 | {: .language-make}
110 | -------------------------------------------------------------------------------- /_episodes/11-snakemake-python.md: --------------------------------------------------------------------------------
1 | ---
2 | title: "Snakefiles are Python code"
3 | teaching: 30
4 | exercises: 15
5 | questions:
6 | - "How can I automatically manage dependencies and outputs?"
7 | - "How can I use Python code to add features to my pipeline?"
8 | objectives:
9 | - "Use variables, functions, and imports in a Snakefile."
10 | - "Learn to use the `run` action to execute Python code as an action."
11 | keypoints:
12 | - "Snakefiles are Python code."
13 | - "The entire Snakefile is executed whenever you run `snakemake`."
14 | - "All actual work should be done by rules."
15 | ---
16 | 
17 | Despite our efforts, our pipeline still has repeated content,
18 | such as the names of output files/dependencies.
19 | Our `zipf_test` rule, for instance, is extremely clunky.
20 | What happens if we want to analyse `books/sierra.txt` as well?
21 | We'd have to update everything!
22 | 
23 | ```
24 | rule zipf_test:
25 |     input: 'zipf_test.py', 'abyss.dat', 'last.dat', 'isles.dat'
26 |     output: 'results.txt'
27 |     shell: 'python {input[0]} {input[1]} {input[2]} {input[3]} > {output}'
28 | ```
29 | {: .language-make}
30 | 
31 | First, let's cut down on a little bit of the clunkiness of the `shell`
32 | directive.
33 | One thing you've probably noticed is that all of our rules are using Python
34 | strings.
35 | Other data structures work too — let's try a list:
36 | 
37 | ```
38 | rule zipf_test:
39 |     input:
40 |         zipf='zipf_test.py',
41 |         books=['abyss.dat', 'last.dat', 'isles.dat']
42 |     output: 'results.txt'
43 |     shell: 'python {input.zipf} {input.books} > {output}'
44 | ```
45 | {: .language-make}
46 | 
47 | (`snakemake clean` and `snakemake -p` should show that the pipeline still
48 | works!)
49 | 
50 | This illustrates a key feature of Snakemake:
51 | Snakefiles are just Python code.
52 | We can make our list into a variable to demonstrate this.
53 | Let's create the variable `DATS` and use it in our `zipf_test` and `dats`
54 | rules.
55 | 
56 | ```
57 | DATS=['abyss.dat', 'last.dat', 'isles.dat']
58 | 
59 | # generate summary table
60 | rule zipf_test:
61 |     input:
62 |         zipf='zipf_test.py',
63 |         books=DATS
64 |     output: 'results.txt'
65 |     shell: 'python {input.zipf} {input.books} > {output}'
66 | 
67 | rule dats:
68 |     input: DATS
69 | ```
70 | {: .language-make}
71 | 
72 | Try re-creating both the `dats` and `results.txt` targets
73 | (run `snakemake clean` in between).
74 | 
75 | ## When are Snakefiles executed?
76 | 
77 | The last example illustrated that we can use arbitrary Python code in our
78 | Snakefile.
79 | It's important to understand when this code gets executed.
80 | Let's add a `print` statement to the top of our Snakefile.
81 | 
82 | ```
83 | print('Snakefile is being executed!')
84 | 
85 | DATS=['abyss.dat', 'last.dat', 'isles.dat']
86 | 
87 | # generate summary table
88 | rule zipf_test:
89 |     input:
90 |     # more output below
91 | ```
92 | {: .language-make}
93 | 
94 | Now let's clean up our workspace with `snakemake clean`:
95 | 
96 | ```
97 | snakemake clean
98 | ```
99 | {: .language-bash}
100 | ```
101 | Snakefile is being executed!
102 | Provided cores: 1
103 | Rules claiming more threads will be scaled down.
104 | Job counts:
105 |         count   jobs
106 |         1       clean
107 |         1
108 | 
109 | rule clean:
110 |     jobid: 0
111 | 
112 | Finished job 0.
113 | 1 of 1 steps (100%) done
114 | ```
115 | {: .output}
116 | 
117 | Now let's re-run the pipeline...
118 | 
119 | ```
120 | $ snakemake
121 | ```
122 | {: .language-bash}
123 | ```
124 | Snakefile is being executed!
125 | Provided cores: 1
126 | Rules claiming more threads will be scaled down.
127 | Job counts:
128 |         count   jobs
129 |         3       count_words
130 |         1       zipf_test
131 |         4
132 | 
133 | rule count_words:
134 |     input: wordcount.py, books/last.txt
135 |     output: last.dat
136 |     jobid: 3
137 |     wildcards: file=last
138 | 
139 | Finished job 3.
140 | 1 of 4 steps (25%) done
141 | 
142 | rule count_words:
143 |     input: wordcount.py, books/abyss.txt
144 |     output: abyss.dat
145 |     jobid: 1
146 |     wildcards: file=abyss
147 | 
148 | Finished job 1.
149 | 2 of 4 steps (50%) done
150 | 
151 | rule count_words:
152 |     input: wordcount.py, books/isles.txt
153 |     output: isles.dat
154 |     jobid: 2
155 |     wildcards: file=isles
156 | 
157 | Finished job 2.
158 | 3 of 4 steps (75%) done
159 | 
160 | rule zipf_test:
161 |     input: zipf_test.py, abyss.dat, last.dat, isles.dat
162 |     output: results.txt
163 |     jobid: 0
164 | 
165 | Finished job 0.
166 | 4 of 4 steps (100%) done
167 | ```
168 | {: .output}
169 | 
170 | Let's do a dry-run:
171 | 
172 | ```
173 | $ snakemake -n
174 | ```
175 | {: .language-bash}
176 | ```
177 | Snakefile is being executed!
178 | Nothing to be done.
179 | ```
180 | {: .output}
181 | 
182 | In every case, the `print()` statement ran before any of the actual
183 | pipeline code was run.
184 | What we can take away from this is that Snakemake executes the entire Snakefile
185 | every time we run `snakemake` (regardless of whether it's a dry run!).
186 | Because of this, we need to be careful,
187 | and only put tasks that do "real work" (changing files on disk) inside rules.
188 | 
189 | ## Using functions in Snakefiles
190 | 
191 | In our example here, we only have 4 books.
192 | But what if we had 700 books to be processed?
193 | It would be a massive effort to update our `DATS` variable to
194 | add the name of every single book's corresponding `.dat` filename.
195 | 
196 | Fortunately, Snakemake ships with several functions that make working with
197 | large numbers of files much easier.
198 | The two most helpful ones are `glob_wildcards()` and `expand()`.
199 | Let's start an interactive Python session to see how they work:
200 | 
201 | ```
202 | $ python3
203 | ```
204 | {: .language-bash}
205 | ```
206 | Python 3.6.1 (default, Jun 27 2017, 14:35:15)
207 | Type "copyright", "credits" or "license" for more information.
208 | ```
209 | {: .output}
210 | 
211 | In this example, we will import these Snakemake functions directly in our
212 | interactive Python session.
213 | It is not necessary to import these functions within your
214 | Snakefile — these functions are always imported for you.
215 | 
216 | ```
217 | from snakemake.io import expand, glob_wildcards
218 | ```
219 | {: .language-python}
220 | 
221 | ### Generating file names with expand()
222 | 
223 | The first function we'll use is `expand()`.
224 | `expand()` is used, quite literally,
225 | to expand one or more Snakemake wildcards into a set of filenames.
226 | 
227 | ```
228 | >>> expand('folder/{wildcard1}_{wildcard2}.txt',
229 | ...        wildcard1=['a', 'b', 'c'],
230 | ...        wildcard2=[1, 2, 3])
231 | ```
232 | {: .language-python}
233 | ```
234 | ['folder/a_1.txt',
235 |  'folder/a_2.txt',
236 |  'folder/a_3.txt',
237 |  'folder/b_1.txt',
238 |  'folder/b_2.txt',
239 |  'folder/b_3.txt',
240 |  'folder/c_1.txt',
241 |  'folder/c_2.txt',
242 |  'folder/c_3.txt']
243 | ```
244 | {: .output}
245 | 
246 | In this case, `expand()` created every possible combination of filenames from
247 | the two wildcards. Useful!
248 | Of course, this still leaves us needing to somehow get the values for
249 | `wildcard1` and `wildcard2` in the first place.
250 | 
251 | ### Get wildcard values with glob_wildcards()
252 | 
253 | To get a set of wildcards from a list of files, we can use the
254 | `glob_wildcards()` function.
255 | Let's try grabbing all of the book titles in our `books` folder.
256 | 
257 | ```
258 | >>> glob_wildcards('books/{example}.txt')
259 | ```
260 | {: .language-python}
261 | ```
262 | Wildcards(example=['isles', 'last', 'abyss', 'sierra'])
263 | ```
264 | {: .output}
265 | 
266 | `glob_wildcards()` returns a `Wildcards` object as output.
267 | `Wildcards` is a special object defined by Snakemake that
268 | provides named lists.
269 | 
270 | In this case, there is only one wildcard, `{example}`.
271 | We can extract the values for the file names by getting the `example`
272 | property from the output of `glob_wildcards()`:
273 | 
274 | ```
275 | >>> glob_wildcards('books/{example}.txt').example
276 | ```
277 | {: .language-python}
278 | ```
279 | ['isles', 'last', 'abyss', 'sierra']
280 | ```
281 | {: .output}
282 | 
283 | > ## Putting it all together
284 | > 
285 | > Using the `expand()` and `glob_wildcards()` functions,
286 | > modify the pipeline so that it automatically detects and analyses
287 | > all the files in the `books/` folder.
288 | {: .challenge}
289 | 
290 | ## Using Python code as actions
291 | 
292 | One very useful feature of Snakemake is the ability to execute Python code
293 | instead of just shell commands.
294 | Instead of `shell:`, we can use `run:` as the action.
295 | 
296 | Add the following to our Snakefile:
297 | 
298 | ```
299 | # at the top of the file
300 | import os
301 | import glob
302 | 
303 | # add this rule anywhere in the Snakefile
304 | rule print_book_names:
305 |     run:
306 |         print('These are all the book names:')
307 |         for book in glob.glob('books/*.txt'):
308 |             print(book)
309 | ```
310 | {: .language-python}
311 | 
312 | Upon execution of the corresponding rule, Snakemake dutifully runs our Python
313 | code in the `run:` block:
314 | 
315 | ```
316 | $ snakemake print_book_names
317 | ```
318 | {: .language-bash}
319 | ```
320 | Provided cores: 1
321 | Rules claiming more threads will be scaled down.
322 | Job counts:
323 |     count   jobs
324 |     1   print_book_names
325 |     1
326 | 
327 | rule print_book_names:
328 |     jobid: 0
329 | 
330 | These are all the book names:
331 | books/isles.txt
332 | books/last.txt
333 | books/abyss.txt
334 | books/sierra.txt
335 | Finished job 0.
336 | 1 of 1 steps (100%) done
337 | ```
338 | {: .output}
339 | 
340 | > ## Moving output locations
341 | >
342 | > Alter the rules in your Snakefile so that the `.dat` files are created in
343 | > their own `dats/` folder.
344 | > Note that creating this folder beforehand is unnecessary.
345 | > Snakemake automatically creates any folders for you, as needed.
346 | {: .challenge}
347 | 
348 | > ## Creating PNGs
349 | >
350 | > Add new rules and update existing rules to:
351 | >
352 | > * Create `.png` files from `.dat` files using `plotcount.py`.
353 | > * Remove all auto-generated files (`.dat`, `.png`,
354 | >   `results.txt`).
355 | >
356 | > Finally, many Snakefiles define a default target called `all` as the first
357 | > target, which will build what the Snakefile has been written to build (e.g. in
358 | > our case, the `.png` files and the `results.txt` file).
359 | > Add an `all` target to your Snakefile (Hint: this rule has the `results.txt`
360 | > file and the `.png` files as dependencies, but no actions).
361 | > With that in place, instead of running `snakemake results.txt`,
362 | > you should now run `snakemake all`, or simply `snakemake`.
363 | {: .challenge}
364 | 
365 | > ## Creating an Archive
366 | >
367 | > Update your pipeline to:
368 | >
369 | > * Create an archive, `zipf_analysis.tar.gz`, to hold all our
370 | >   `.dat` files, plots, and the Zipf summary table.
371 | > * Update `all` to expect `zipf_analysis.tar.gz` as input.
372 | > * Remove `zipf_analysis.tar.gz` when `snakemake clean` is called.
373 | >
374 | > The syntax to create an archive is shown below:
375 | > ```
376 | > tar -czvf zipf_analysis.tar.gz file1 directory2 file3 etc
377 | > ```
378 | > {: .language-bash}
379 | {: .challenge}
380 | 
381 | After these exercises, our final workflow should look something like the
382 | following:
383 | 
384 | ![Final directed acyclic graph](../fig/05-final-dag.svg)
385 | 
386 | > ## Adding more books
387 | >
388 | > We can now do a better job of testing Zipf's rule by adding more books.
389 | > The books we have used come from the [Project Gutenberg](
390 | > http://www.gutenberg.org/) website.
391 | > Project Gutenberg offers thousands of free e-books to download.
392 | >
393 | > ### Exercise instructions
394 | >
395 | > * Go to [Project Gutenberg](http://www.gutenberg.org/) and use the search box
396 | >   to find another book, for example ['The Picture of Dorian Gray'](
397 | >   https://www.gutenberg.org/ebooks/174) by Oscar Wilde.
398 | > * Download the 'Plain Text UTF-8' version and save it to the `books` folder;
399 | >   choose a short name for the file.
400 | > * Optionally, open the file in a text editor and remove extraneous text at
401 | >   the beginning and end (look for the phrase `End of Project Gutenberg's
402 | >   [title], by [author]`).
403 | > * Run `snakemake` and check that the correct commands are run.
404 | > * Check the `results.txt` file to see how this book compares to the others.
405 | {: .challenge}
406 | 
--------------------------------------------------------------------------------
/_episodes/12-resources.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Resources and parallelism"
3 | teaching: 30
4 | exercises: 15
5 | questions:
6 | - "How do I scale a pipeline across multiple cores?"
7 | - "How do I manage access to resources while working in parallel?"
8 | objectives:
9 | - "Modify your pipeline to run in parallel."
10 | keypoints:
11 | - "Use `threads` to indicate the number of cores used by a rule."
12 | - "Resources are arbitrary and can be used for anything."
13 | - "The `&&` operator is a useful tool when chaining bash commands."
14 | ---
15 | 
16 | After the exercises at the end of our last lesson,
17 | our Snakefile looks something like this:
18 | 
19 | ```
20 | # our zipf analysis pipeline
21 | DATS = glob_wildcards('books/{book}.txt').book
22 | 
23 | rule all:
24 |     input:
25 |         'zipf_analysis.tar.gz'
26 | 
27 | # delete everything so we can re-run things
28 | rule clean:
29 |     shell:
30 |         '''
31 |         rm -rf results dats plots
32 |         rm -f results.txt zipf_analysis.tar.gz
33 |         '''
34 | 
35 | # count words in one of our "books"
36 | rule count_words:
37 |     input:
38 |         wc='wordcount.py',
39 |         book='books/{file}.txt'
40 |     output: 'dats/{file}.dat'
41 |     shell: 'python {input.wc} {input.book} {output}'
42 | 
43 | # create a plot for each book
44 | rule make_plot:
45 |     input:
46 |         plotcount='plotcount.py',
47 |         book='dats/{file}.dat'
48 |     output: 'plots/{file}.png'
49 |     shell: 'python {input.plotcount} {input.book} {output}'
50 | 
51 | # generate summary table
52 | rule zipf_test:
53 |     input:
54 |         zipf='zipf_test.py',
55 |         books=expand('dats/{book}.dat', book=DATS)
56 |     output: 'results.txt'
57 |     shell: 'python {input.zipf} {input.books} > {output}'
58 | 
59 | # create an archive with all of our results
60 | rule make_archive:
61 |     input:
62 |         expand('plots/{book}.png', book=DATS),
63 |         expand('dats/{book}.dat', book=DATS),
64 |         'results.txt'
65 |     output: 'zipf_analysis.tar.gz'
66 |     shell: 'tar -czvf {output} {input}'
67 | ```
68 | {: .language-make}
69 | 
70 | At this point, we have a complete data analysis pipeline.
71 | Very cool.
72 | But how do we make it run as efficiently as possible?
73 | 
74 | ## Running in parallel
75 | 
76 | Up to this point, Snakemake has printed out an interesting message
77 | whenever we run our pipeline.
78 | 
79 | ```
80 | Provided cores: 1
81 | Rules claiming more threads will be scaled down.
82 | ```
83 | {: .output}
84 | 
85 | So far, Snakemake has been run with one core.
86 | Let's scale up our pipeline to run in parallel.
87 | The only change we need to make is to run Snakemake with the `-j` argument.
88 | `-j` is used to indicate the number of CPU cores available,
89 | and, on a cluster, the maximum number of jobs (we'll get to that part later).
90 | Note that 4 cores is usually a safe assumption when working on a laptop.
91 | 
92 | ```
93 | $ snakemake clean
94 | $ snakemake -j 4
95 | ```
96 | {: .language-bash}
97 | ```
98 | Provided cores: 4
99 | Rules claiming more threads will be scaled down.
100 | # more output follows
101 | ```
102 | {: .output}
103 | 
104 | Our pipeline ran in parallel and finished roughly 4 times as quickly!
105 | The takeaway here is that all we need to do to scale up from a
106 | serial pipeline is to run `snakemake` with the `-j` option.
107 | 
108 | > ## How many CPUs does your computer have?
109 | >
110 | > Now that we can have our pipeline use multiple CPUs,
111 | > how do we know how many CPUs to provide to the `-j` option?
112 | > Note that for all of these options, it's best to use CPU cores,
113 | > and not CPU threads.
114 | >
115 | > **Linux**: You can use the `lscpu` command.
116 | >
117 | > **All platforms**: Python's `psutil` module can be used to fetch the number
118 | > of cores in your computer. Using `logical=False` returns the number of true
119 | > CPU cores.
120 | > `logical=True` gives the number of CPU threads on your system.
121 | >
122 | > ```
123 | > import psutil
124 | > psutil.cpu_count(logical=False)
125 | > ```
126 | > {: .language-python}
127 | {: .callout}
128 | 
129 | ## Managing CPUs
130 | 
131 | Each rule has a number of optional keywords aside from the usual
132 | `input`, `output`, and `shell`/`run`.
133 | The `threads` keyword is used to specify how many CPU cores a rule
134 | needs while executing.
135 | Though in reality CPU threads are not quite the same as CPU cores,
136 | the two terms are interchangeable when working with Snakemake.
137 | 
138 | Let's pretend that our `count_words` rule is actually very CPU-intensive.
139 | We'll say that it needs a whopping 4 CPUs per run.
140 | We can specify this with the `threads` keyword in our rule.
141 | We will also modify the rule to print out the number of threads it thinks it is
142 | using.
143 | Please note that just giving a Snakemake rule 4 threads does not automatically
144 | make its action run in parallel! The action also needs to be thread-capable
145 | and to explicitly use the `{threads}` information for this to happen.
146 | In this case, `wordcount.py` is actually still running with 1 core;
147 | we are simply using it as a demonstration of how to go about
148 | running something with multiple cores.
149 | 
150 | ```
151 | rule count_words:
152 |     input:
153 |         wc='wordcount.py',
154 |         book='books/{file}.txt'
155 |     output: 'dats/{file}.dat'
156 |     threads: 4
157 |     shell:
158 |         '''
159 |         echo "Running {input.wc} with {threads} cores."
160 |         python {input.wc} {input.book} {output}
161 |         '''
162 | ```
163 | {: .language-make}
164 | 
165 | Now when we run `snakemake -j 4`, the jobs from `count_words` are run one at a
166 | time so as to give each job the resources it needs.
167 | Since each job of the `count_words` rule requires 4 threads (as per the newly
168 | added `threads` directive), and because all jobs have a maximum of 4 cores
169 | available to them as per the `-j 4` option, the `count_words` jobs are run one
170 | at a time.
171 | All of our other rules will still run in parallel,
172 | since rules default to requesting a single thread
173 | unless otherwise specified with the `threads` keyword.
174 | 
175 | ```
176 | Provided cores: 4
177 | Rules claiming more threads will be scaled down.
178 | Job counts:
179 |     count   jobs
180 |     1   all
181 |     4   count_words
182 |     1   make_archive
183 |     4   make_plot
184 |     1   zipf_test
185 |     11
186 | 
187 | rule count_words:
188 |     input: wordcount.py, books/last.txt
189 |     output: dats/last.dat
190 |     jobid: 3
191 |     wildcards: file=last
192 |     threads: 4
193 | 
194 | Running wordcount.py with 4 cores.
195 | Finished job 3.
196 | 1 of 11 steps (9%) done
197 | 
198 | # other output follows
199 | ```
200 | {: .output}
201 | 
202 | What happens when we don't have 4 cores available?
203 | What if we tell Snakemake to run with 2 cores instead?
204 | 
205 | ```
206 | $ snakemake -j 2
207 | ```
208 | {: .language-bash}
209 | ```
210 | Provided cores: 2
211 | Rules claiming more threads will be scaled down.
212 | Job counts:
213 |     count   jobs
214 |     1   all
215 |     4   count_words
216 |     1   make_archive
217 |     4   make_plot
218 |     1   zipf_test
219 |     11
220 | 
221 | rule count_words:
222 |     input: wordcount.py, books/last.txt
223 |     output: dats/last.dat
224 |     jobid: 6
225 |     wildcards: file=last
226 |     threads: 2
227 | 
228 | Running wordcount.py with 2 cores.
229 | Finished job 6.
230 | 1 of 11 steps (9%) done
231 | 
232 | # more output below
233 | ```
234 | {: .output}
235 | 
236 | The key bit of output is `Rules claiming more threads will be scaled down.`.
237 | When Snakemake doesn't have enough cores to run a rule (as defined by
238 | `threads`),
239 | Snakemake will run that rule with the maximum available number of cores
240 | instead.
241 | After all, Snakemake's job is to get our workflow done.
242 | It automatically scales our workload to match the maximum number of cores
243 | available, without us editing the Snakefile.
244 | 
245 | ## Chaining multiple commands
246 | 
247 | Up until now, all of our commands have fit on one line.
248 | To execute multiple bash commands, the only modification we need to make
249 | is to use a Python multiline string (beginning and ending with `'''`).
250 | 
251 | One important addition we should be aware of is the `&&` operator.
252 | `&&` is a bash operator that runs commands as part of a chain.
253 | If the first command fails, the remaining steps are not run.
254 | This is much safer than bash's default "hit an error and keep going"
255 | behavior.
256 | After all, if the first command failed,
257 | it's unlikely the other steps will work.
258 | 
259 | ```
260 | # count words in one of our "books"
261 | rule count_words:
262 |     input:
263 |         wc='wordcount.py',
264 |         book='books/{file}.txt'
265 |     output: 'dats/{file}.dat'
266 |     threads: 4
267 |     shell:
268 |         '''
269 |         echo "Running {input.wc} with {threads} cores on {input.book}." &&
270 |         python {input.wc} {input.book} {output}
271 |         '''
272 | ```
273 | {: .language-make}
274 | 
275 | ## Managing other types of resources
276 | 
277 | Not all compute resources are CPUs.
278 | Examples might include limited amounts of RAM, number of GPUs, database locks,
279 | or perhaps we simply don't want multiple processes writing to the same file at
280 | once.
281 | All non-CPU resources are handled using the `resources` keyword.
282 | 
283 | For our example, let's pretend that creating a plot with `plotcount.py`
284 | requires dedicated access to a GPU (it doesn't),
285 | and only one GPU is available.
286 | How do we indicate this to Snakemake so that it knows to give dedicated access
287 | to a GPU for rules that need it?
288 | Let's modify the `make_plot` rule as an example:
289 | 
290 | ```
291 | # create a plot for each book
292 | rule make_plot:
293 |     input:
294 |         plotcount='plotcount.py',
295 |         book='dats/{file}.dat'
296 |     output: 'plots/{file}.png'
297 |     resources: gpu=1
298 |     shell: 'python {input.plotcount} {input.book} {output}'
299 | ```
300 | {: .language-make}
301 | 
302 | We can execute our pipeline using the following (using 8 cores and 1 GPU):
303 | 
304 | ```
305 | $ snakemake clean
306 | $ snakemake -j 8 --resources gpu=1
307 | ```
308 | {: .language-bash}
309 | ```
310 | Provided cores: 8
311 | Rules claiming more threads will be scaled down.
312 | Provided resources: gpu=1
313 | # other output removed for brevity
314 | ```
315 | {: .output}
316 | 
317 | Resources are entirely arbitrary — like wildcards,
318 | they can be named anything.
319 | Snakemake knows nothing about them aside from the fact that they have a name
320 | and a value.
321 | In this case, `gpu` simply indicates that there is a resource called `gpu` used
322 | by `make_plot`.
323 | We provided 1 `gpu` to the workflow,
324 | and the `gpu` is considered in use as long as the rule is running.
325 | Once the `make_plot` rule completes,
326 | the `gpu` it consumed is added back to the pool of available `gpu`s.
327 | To be extra clear: `gpu` in this case does not actually represent a GPU;
328 | it is an arbitrary limit used to prevent multiple tasks that use a `gpu` from
329 | executing at the same time.
330 | 
331 | But what happens if we run our pipeline without specifying the number of GPUs?
332 | 
333 | ```
334 | $ snakemake clean
335 | $ snakemake -j 8
336 | ```
337 | {: .language-bash}
338 | ```
339 | Provided cores: 8
340 | Rules claiming more threads will be scaled down.
341 | Unlimited resources: gpu
342 | ```
343 | {: .output}
344 | 
345 | If you have specified that a rule needs a certain resource,
346 | but do not specify how many you have,
347 | Snakemake will assume that the resources in question are unlimited.
348 | 
349 | > ## Other uses for `resources`
350 | >
351 | > Resources do not have to correspond to actual compute resources.
352 | > Perhaps one rule is particularly I/O heavy,
353 | > and it's best if only a limited number of these jobs run at a time.
354 | > Or maybe a type of rule uses a lot of network bandwidth as it downloads data.
355 | > In all of these cases, `resources` can be used to constrain access to
356 | > arbitrary compute resources so that each rule can run at its most efficient.
357 | > Snakemake will run your rules in such a way as to maximise throughput given
358 | > your resource constraints.
359 | {: .callout}
--------------------------------------------------------------------------------
/_episodes/13-cluster.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Scaling a pipeline across a cluster"
3 | teaching: 30
4 | exercises: 15
5 | questions:
6 | - "How do I run my workflow on an HPC system?"
7 | objectives:
8 | - "Understand the Snakemake cluster job submission workflow."
9 | keypoints:
10 | - "Snakemake generates and submits its own batch scripts for your scheduler."
11 | - "`localrules` defines rules that are executed on the Snakemake head node."
12 | - "`$PATH` must be passed to Snakemake rules."
13 | - "`nohup <command> &` prevents `<command>` from exiting when you log off."
14 | ---
15 | 
16 | Right now we have a reasonably effective pipeline that scales nicely on our
17 | local computer.
18 | However, for the sake of this course, we'll pretend that our workflow actually
19 | takes significant computational resources and needs to be run on a cluster.
20 | 
21 | > ## HPC cluster architecture
22 | >
23 | > Most HPC clusters are run using a scheduler.
24 | > The scheduler is a piece of software that decides which compute jobs are run
25 | > on which compute nodes, and when.
26 | > It allows a group of users to share a computing system as efficiently as
27 | > possible.
28 | > In order to use it, users typically must write their commands to be run into
29 | > a shell script and then "submit" it to the scheduler.
30 | >
31 | > A good analogy would be a university's room booking system.
32 | > No one gets to use a room without going through the booking system.
33 | > The booking system decides which rooms people get based on their requirements
34 | > (# of students, time allotted, etc.).
35 | {: .callout}
36 | 
37 | Normally, moving a workflow to be run by a cluster scheduler requires a lot of
38 | work.
39 | Batch scripts need to be written, and you'll need to monitor and babysit the
40 | status of each of your jobs.
41 | This is especially difficult if one batch job depends on the output from
42 | another.
43 | Even moving from one cluster to another (especially ones using a different
44 | scheduler) requires a large investment of time and effort — all the batch
45 | scripts from before need to be rewritten.
46 | 
47 | Snakemake does all of this for you.
48 | All details of running the pipeline through the cluster scheduler are handled
49 | by Snakemake — this includes writing batch scripts, submitting, and
50 | monitoring jobs.
51 | In this scenario, the role of the scheduler is limited to ensuring each
52 | Snakemake rule is executed with the resources it needs.
53 | 
54 | We'll explore how to port our example Snakemake pipeline.
55 | Our current Snakefile is shown below:
56 | 
57 | ```
58 | # our zipf analysis pipeline
59 | DATS = glob_wildcards('books/{book}.txt').book
60 | 
61 | rule all:
62 |     input:
63 |         'zipf_analysis.tar.gz'
64 | 
65 | # delete everything so we can re-run things
66 | rule clean:
67 |     shell:
68 |         '''
69 |         rm -rf results dats plots
70 |         rm -f results.txt zipf_analysis.tar.gz
71 |         '''
72 | 
73 | # count words in one of our "books"
74 | rule count_words:
75 |     input:
76 |         wc='wordcount.py',
77 |         book='books/{file}.txt'
78 |     output: 'dats/{file}.dat'
79 |     threads: 4
80 |     shell:
81 |         '''
82 |         python {input.wc} {input.book} {output}
83 |         '''
84 | 
85 | # create a plot for each book
86 | rule make_plot:
87 |     input:
88 |         plotcount='plotcount.py',
89 |         book='dats/{file}.dat'
90 |     output: 'plots/{file}.png'
91 |     resources: gpu=1
92 |     shell: 'python {input.plotcount} {input.book} {output}'
93 | 
94 | # generate summary table
95 | rule zipf_test:
96 |     input:
97 |         zipf='zipf_test.py',
98 |         books=expand('dats/{book}.dat', book=DATS)
99 |     output: 'results.txt'
100 |     shell: 'python {input.zipf} {input.books} > {output}'
101 | 
102 | # create an archive with all of our results
103 | rule make_archive:
104 |     input:
105 |         expand('plots/{book}.png', book=DATS),
106 |         expand('dats/{book}.dat', book=DATS),
107 |         'results.txt'
108 |     output: 'zipf_analysis.tar.gz'
109 |     shell: 'tar -czvf {output} {input}'
110 | ```
111 | {: .language-make}
112 | 
113 | To run Snakemake on a cluster, we need to create a profile to tell Snakemake
114 | how to submit jobs to our cluster.
115 | We can then submit jobs with this profile using the `--profile` argument
116 | followed by the name of our profile.
117 | In this configuration,
118 | Snakemake runs on the cluster head node and submits jobs.
119 | Each cluster job executes a single rule and then exits.
120 | Snakemake detects the creation of output files,
121 | and submits new jobs (rules) once their dependencies are created.
122 | 
123 | ## Transferring our workflow
124 | 
125 | Let's port our workflow to Compute Canada's Graham cluster as an example (you
126 | will probably be using a different cluster; adapt these instructions to your
127 | own cluster).
128 | The first step will be to transfer our files to the cluster and log on via SSH.
129 | Snakemake has a powerful built-in archiving utility (see the callout below),
130 | but here we will simply bundle up our workflow with `tar` and transfer it
131 | with `scp`.
132 | 
133 | ```
134 | $ snakemake clean
135 | $ tar -czvf pipeline.tar.gz .
136 | # transfer the pipeline via scp
137 | $ scp pipeline.tar.gz yourUsername@graham.computecanada.ca:
138 | # log on to the cluster
139 | $ ssh -X yourUsername@graham.computecanada.ca
140 | ```
141 | {: .language-bash}
142 | 
143 | > ## `snakemake --archive` and Conda deployment
144 | >
145 | > Snakemake has a built-in method to archive all input files
146 | > and scripts under version control: `snakemake --archive`.
147 | > What's more, it also installs any required dependencies if they can be
148 | > installed using Anaconda's `conda` package manager.
149 | > You can use this feature for this tutorial
150 | > (I've already added all of the files to version control for you),
151 | > but if you want to use this feature in your own work,
152 | > you should familiarise yourself with a version control tool like Git.
153 | >
154 | > For more information on how to use this feature, see
155 | > [http://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html](
156 | > http://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html)
157 | {: .callout}
158 | 
159 | At this point we've archived our entire pipeline, sent it to the cluster, and
160 | logged on. Let's create a folder for our pipeline and unpack it there.
161 | 
162 | ```
163 | $ mkdir pipeline
164 | $ mv pipeline.tar.gz pipeline
165 | $ cd pipeline
166 | $ tar -xvzf pipeline.tar.gz
167 | ```
168 | {: .language-bash}
169 | 
170 | If Snakemake and Python are not already installed on your cluster, you can
171 | install them in an Anaconda Python environment using the following commands:
172 | 
173 | ```
174 | $ wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
175 | $ bash Miniconda3-latest-Linux-x86_64.sh
176 | ```
177 | {: .language-bash}
178 | 
179 | This is an interactive installation through the command line. Review and accept
180 | the license agreement, then work through the prompts. The defaults are probably
181 | fine. Accept its offer to initialize your environment (`conda init`), then run
182 | the suggested command to load the `conda` base environment so you can use it
183 | straight away. Finally, install Snakemake from the [bioconda channel](
184 | https://anaconda.org/bioconda):
185 | 
186 | ```
187 | $ conda install -y -c bioconda graphviz matplotlib numpy snakemake
188 | ```
189 | {: .language-bash}
190 | 
191 | Assuming you've transferred your files and everything is set to go,
192 | the command `snakemake -n` should work without errors.
193 | 
194 | ## Creating a cluster profile
195 | 
196 | Snakemake uses a YAML-formatted configuration file to retrieve cluster
197 | submission parameters; we will use the SLURM scheduler as an example.
198 | When we use the `--profile slurm` argument, Snakemake looks for a directory
199 | with the name of our profile (`slurm`) containing a `config.yaml` file such as
200 | the one below.
201 | 
202 | ```
203 | cluster: "sbatch --time={resources.time_min} --mem={resources.mem_mb}
204 |     -c {resources.cpus} -o slurm/logs/{rule}_{wildcards}
205 |     -e slurm/logs/{rule}_{wildcards}"
206 | jobs: 25
207 | default-resources: [cpus=1, mem_mb=1000, time_min=5]
208 | resources: [cpus=100, mem_mb=100000]
209 | ```
210 | {: .source}
211 | 
212 | This file has several components.
213 | `cluster` and the arguments that follow tell Snakemake how to submit jobs to
214 | the cluster.
215 | Here we've used SLURM's `sbatch` command and arguments for setting time limits
216 | and resources, with Snakemake wildcards defining the requested values.
217 | 
218 | We've also specified where to save SLURM logs and what to call them. **Note**
219 | that the `slurm/logs` folder must already exist; if it doesn't, Snakemake will hang.
220 | 
221 | Values for any command line argument to Snakemake can be defined in our
222 | profile; note that even flag-style arguments require a value (e.g. the
223 | `--use-conda` argument could be included in our profile with `use-conda: true`).
224 | 
225 | `jobs` specifies the maximum number of jobs that will be submitted at one time.
226 | We also specified the `default-resources` that will be requested for each job,
227 | while `resources` defines the resource limits.
228 | 
229 | With these parameters, Snakemake will use no more than 100 CPUs and 100000 MB
230 | (100 GB) at a time between all currently submitted jobs.
231 | While it does not come into play here, a generally sensible default for `jobs`
232 | is slightly above the maximum number of jobs you are allowed to have submitted at a time.
233 | 
234 | The defaults won't always be perfect, however — chances are some rules
235 | may need to run with non-default amounts of memory or time limits.
236 | We are using the `count_words` rule as an example of this.
237 | To request non-default resources for a job, we can modify the rule in our
238 | Snakefile to include a `resources` section like this:
239 | 
240 | ```
241 | # count words in one of our "books"
242 | rule count_words:
243 |     input:
244 |         wc='wordcount.py',
245 |         book='books/{file}.txt'
246 |     output: 'dats/{file}.dat'
247 |     threads: 4
248 |     resources: cpus=4, mem_mb=8000, time_min=20
249 |     shell:
250 |         '''
251 |         python {input.wc} {input.book} {output}
252 |         '''
253 | ```
254 | {: .language-make}
255 | 
256 | ## Local rule execution
257 | 
258 | Some Snakemake rules perform trivial tasks where job submission might be
259 | overkill (i.e. less than 1 minute worth of compute time).
260 | It would be a better idea to have these rules execute locally
261 | (i.e. where the `snakemake` command is run)
262 | instead of as a job.
263 | Let's define `all`, `clean`, and `make_archive` as localrules near the top of
264 | our `Snakefile`.
265 | 
266 | ```
267 | localrules: all, clean, make_archive
268 | ```
269 | {: .language-make}
270 | 
271 | ## Running our workflow on the cluster
272 | 
273 | OK, time for the moment we've all been waiting for — let's run our
274 | workflow on the cluster with the profile we've created.
Use this command:
275 | 
276 | ```
277 | $ snakemake --profile slurm
278 | ```
279 | {: .language-bash}
280 | 
281 | While things execute, you may wish to SSH to the cluster in another window so
282 | you can watch the pipeline's progress with `watch squeue -u $(whoami)`.
283 | 
284 | 
285 | > ## Notes on `$PATH`
286 | >
287 | > As with any cluster job, jobs started by Snakemake need to have the commands
288 | > they are running on `$PATH`.
289 | > For some schedulers (SLURM), no modifications are necessary — variables
290 | > are passed to the jobs by default.
291 | > Other schedulers (SGE) need to have this enabled through a command line flag
292 | > when submitting jobs (`-V` for SGE).
293 | > If this is possible, just run the `module load` commands you need ahead of
294 | > the job and run Snakemake as normal.
295 | >
296 | > If this is not possible, you have several options:
297 | >
298 | > * You can edit your `.bashrc` file to modify `$PATH` for all jobs and
299 | >   sessions you start on a cluster.
300 | > * Inserting `shell.prefix('some command')` in a Snakefile means that all
301 | >   rules run will be prefixed by `some command`. You can use this to modify
302 | >   `$PATH`, e.g., `shell.prefix('PATH=/extra/directory:$PATH ')`.
303 | > * You can modify rules directly to run the appropriate `module load` commands
304 | >   beforehand. This is not recommended, if only because it is more work than
305 | >   the other options available.
306 | {: .callout}
307 | 
308 | > ## Submitting a workflow with nohup
309 | >
310 | > `nohup some_command &` runs a command in the background and lets it keep
311 | > running if you log off.
312 | > Try running the pipeline in cluster mode using `nohup` (run `snakemake clean`
313 | > beforehand).
314 | > Where does the Snakemake log go?
315 | > Why might this technique be useful?
316 | > Can we also submit the `snakemake --profile slurm` pipeline as a job?
317 | > Where does the Snakemake command run in each scenario?
318 | >
319 | > You can kill the running Snakemake process with `killall snakemake`.
320 | > Notice that if you try to run Snakemake again, it says the directory is
321 | > locked.
322 | > You can unlock the directory with `snakemake --unlock`.
323 | {: .challenge}
--------------------------------------------------------------------------------
/_episodes/14-final-notes.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Final notes"
3 | teaching: 15
4 | exercises: 15
5 | questions:
6 | - "What are some tips and tricks I can use to make this easier?"
7 | objectives:
8 | - "Understand how to make a DAG graph."
9 | keypoints:
10 | - "Token files can be used to take the place of output files if
11 |   none are created."
12 | - "`snakemake --dag | dot -Tsvg > dag.svg` creates a graphic of
13 |   your workflow."
14 | - "`snakemake --gui` opens a browser window with your workflow."
15 | ---
16 | 
17 | Now that we know how to write and scale a pipeline,
18 | here are some tips and tricks for making the process go more smoothly.
19 | 
20 | ## `snakemake -n` is your friend
21 | 
22 | Whenever you edit your Snakefile, run `snakemake -n` immediately afterwards.
23 | This will check for errors and make sure that the pipeline is able to run.
24 | 
25 | The most common source of errors is a mismatch in filenames
26 | (Snakemake doesn't know how to produce a particular output file) —
27 | `snakemake -n` will catch this as long as the troublesome output files haven't
28 | already been made.
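As a hypothetical example, suppose we accidentally typed `dat/` instead of
`dats/` in the output of `count_words`:

```
# count words in one of our "books"
rule count_words:
    input:
        wc='wordcount.py',
        book='books/{file}.txt'
    output: 'dat/{file}.dat'   # typo: the rest of the pipeline expects 'dats/'
    shell: 'python {input.wc} {input.book} {output}'
```
{: .language-make}

No rule can now produce the `dats/{file}.dat` files that `zipf_test` asks for,
so `snakemake -n` should fail straight away (with something like a
`MissingInputException`), rather than partway through a long run.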
29 | 
30 | ## Configuring logging
31 | 
32 | By default, Snakemake prints all stderr and stdout output from rules.
33 | This is useful, but if a failure occurs (or we otherwise need to inspect the
34 | logs) it can be extremely difficult to determine what happened
35 | or which rule had an issue, especially when running in parallel.
36 | 
37 | The solution to this issue is to redirect the output from each rule (or each
38 | set of inputs) to a dedicated log file.
39 | We can do this using the `log` keyword.
40 | Let's modify our `count_words` rule to be slightly more verbose and redirect
41 | this output to a dedicated log file.
42 | 
43 | Two things before we start:
44 | 
45 | * `&>` is a handy operator in bash that redirects both `stdout` and `stderr` to
46 |   a file.
47 | * `&>>` does the same thing as `&>`, but appends to a file instead of
48 |   overwriting it.
49 | 
50 | ```
51 | # count words in one of our "books"
52 | rule count_words:
53 |     input:
54 |         wc='wordcount.py',
55 |         book='books/{file}.txt'
56 |     output: 'dats/{file}.dat'
57 |     threads: 4
58 |     log: 'dats/{file}.log'
59 |     shell:
60 |         '''
61 |         echo "Running {input.wc} with {threads} cores on {input.book}." &> {log}
62 |         python {input.wc} {input.book} {output} &>> {log}
63 |         '''
64 | ```
65 | {: .language-make}
66 | 
67 | ```
68 | $ snakemake clean
69 | $ snakemake -j 8
70 | $ cat dats/abyss.log
71 | ```
72 | {: .language-bash}
73 | ```
74 | # snakemake output omitted
75 | Running wordcount.py with 4 cores on books/abyss.txt.
76 | ```
77 | {: .output}
78 | 
79 | Notice how the rule's output no longer appears in Snakemake's own output,
80 | and is instead redirected to a dedicated log file.
81 | 
82 | > ## Choosing a good log file location
83 | >
84 | > Though you can put a log anywhere (and name it anything),
85 | > it is often a good practice to put the log in the same directory
86 | > where the rule's output will be created.
87 | > If you need to investigate the output for a rule and associated log files,
88 | > this means that you only have to check one location!
89 | {: .callout}
90 | 
91 | ## Token files
92 | 
93 | Often, a rule does not generate a unique output, and merely modifies a file.
94 | In these cases it is often worthwhile to create a placeholder, or "token
95 | file", as output.
96 | A token file is simply an empty file that you can create with the `touch`
97 | command (`touch some_file.txt` creates an empty file called `some_file.txt`).
98 | An example rule using this technique is shown below:
99 | 
100 | ```
101 | rule token_example:
102 |     input: 'some_file.txt'
103 |     output: 'some_file.tkn'   # marks some_file.txt as modified
104 |     shell:
105 |         '''
106 |         some_command --do-things {input} &&
107 |         touch {output}
108 |         '''
109 | ```
110 | {: .language-make}
111 | 
112 | ## Directory locks
113 | 
114 | Only one instance of Snakemake can run in a directory at a time.
115 | If a Snakemake run fails without unlocking the directory
116 | (if you killed the process, for instance), you can run
117 | `snakemake --unlock` to unlock it.
118 | 
119 | ## Python as a fallback
120 | 
121 | Remember, you can use Python imports and functions anywhere in a Snakefile.
122 | If something seems a little tricky to implement, Python can probably do it.
123 | The `os`, `shutil`, and `subprocess` packages are useful tools for using Python
124 | to execute command line actions.
125 | In particular, `os.system('some command')` will run a command on the
126 | command-line and block until execution is complete.
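For instance, here is a minimal sketch of the `subprocess` equivalent of
`os.system()` (the archive and folder names here are purely illustrative):

```
import subprocess

# Run a command and wait for it to finish.
# check=True makes Python raise an error if the command fails,
# which is safer than inspecting os.system()'s return code by hand.
subprocess.run(['tar', '-czf', 'backup.tar.gz', 'dats'], check=True)
```
{: .language-python}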
127 | 
128 | ## Creating a workflow diagram
129 | 
130 | Assuming graphviz is installed (`conda install graphviz`),
131 | you can create a diagram of your workflow with the command:
132 | `snakemake --dag | dot -Tsvg > dag.svg`.
133 | This creates a plot of your "directed acyclic graph"
134 | (a plot of all of the rules Snakemake thinks it needs to complete),
135 | which you can view using any picture viewing program.
136 | In fact, this was the tool used to create all of the diagrams in this lesson:
137 | 
138 | ```
139 | snakemake --dag | dot -Tsvg > dag.svg
140 | eog dag.svg   # eog is an image viewer installed on many Linux systems
141 | ```
142 | {: .language-bash}
143 | 
144 | ![Example DAG plot](../fig/05-final-dag.svg)
145 | 
146 | Rules that have yet to be completed are indicated with solid outlines.
147 | Already completed tasks will be indicated with dashed outlines.
148 | In this case, I ran `snakemake clean`
149 | just before creating the diagram — no rules have been run yet.
150 | 
151 | ## Viewing the GUI
152 | 
153 | Snakemake has an experimental web browser GUI.
154 | I personally haven't used it for anything, but it's cool to know it's there and
155 | can be used to view your workflow on the fly.
156 | 
157 | `snakemake --gui`
158 | 
159 | ## Where to go for documentation / help
160 | 
161 | The Snakemake documentation is located at
162 | [snakemake.readthedocs.io](http://snakemake.readthedocs.io)
163 | 
--------------------------------------------------------------------------------
/_extras/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/_extras/.gitkeep
--------------------------------------------------------------------------------
/_extras/about.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: About
4 | permalink: /about/
5 | ---
6 | {% include carpentries.html %}
7 | 
--------------------------------------------------------------------------------
/_extras/discuss.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Discussion
4 | permalink: /discuss/
5 | ---
6 | 
7 | This lesson is heavily skewed towards teaching basic Python syntax and analysis
8 | pipelines using [Snakemake](http://snakemake.readthedocs.io/en/stable/). Of
9 | course, this raises a couple of questions: given the limited teaching time for
10 | these courses, why teach Snakemake over other Python concepts or tools? Why
11 | teach Python at all for high-performance computing tasks?
12 | 
13 | ## Why not other Python topics?
14 | 
15 | For a workshop on general data analysis or basic coding in Python, we recommend
16 | checking out one of Software Carpentry's other workshops that focus on
17 | [Numpy](http://swcarpentry.github.io/python-novice-inflammation/) or
18 | [Pandas](http://swcarpentry.github.io/python-novice-gapminder/) instead.
19 | 
20 | The goal of this workshop is to teach Python in the context of high-performance
21 | computing. Of course, Python is not a fast language. Any code written in an
22 | interpreted language like Python will generally run [hundreds of times slower](
23 | http://benchmarksgame.alioth.debian.org/u64q/compare.php?lang=python3&lang2=gpp)
24 | than a compiled language like C++, Fortran, or even Java. Though it's possible
25 | to improve Python's performance with tools like PyPy, Cython, etc.,
the level of
26 | knowledge required to use these tools effectively is far beyond what can be
27 | taught in a one-day workshop. Python isn't the right tool for the job if
28 | fast/parallel computing is required. Instructors looking to teach heavy-duty
29 | performance and/or parallelization-related topics should check out our [Chapel
30 | lesson](https://hpc-carpentry.github.io/hpc-chapel/) instead.
31 | 
32 | So why teach Python at all?
33 | 
34 | In most scientific fields, there is a major need for automation. Workflows
35 | where the same computation needs to be repeated for thousands of input files
36 | are commonplace. This is especially true for fields like bioinformatics, where
37 | researchers need to run dozens of pre-existing programs to process a piece of
38 | data, and then repeat this process for dozens, if not hundreds (or thousands)
39 | of input files. Running these types of high-throughput workflows is a
40 | significant amount of work, made even more complex by the scripting required to
41 | use an HPC cluster's scheduler effectively.
42 | 
43 | Python is a great scripting language, and used in combination with a workflow
44 | management tool like Snakemake, it is very simple to script the execution of
45 | these types of high-throughput/complex workflows. The goal of this workshop is
46 | to teach students how to automate their work with Python, and make their
47 | workflows reproducible. Importantly, this also covers how to use Snakemake to
48 | automate submission of jobs to an HPC scheduler in a reasonable manner (no
49 | runaway submission of tens of thousands of jobs, encountering an error safely
50 | stops the workflow without losing work, logfiles and output are handled
51 | appropriately, etc.).
52 | 
53 | ## Why not other workflow/pipeline tools?
54 | 
55 | There are lots of other pipeline/workflow management tools out there (in fact,
56 | this lesson was adapted from Software Carpentry's [GNU Make lesson](
57 | http://swcarpentry.github.io/make-novice/)). Why teach Snakemake instead of
58 | these other tools?
59 | 
60 | * It's free, open-source, and installs in about 5 seconds flat via `pip`.
61 | 
62 | * Snakemake works cross-platform (Windows, MacOS, Linux) and is compatible with
63 |   all HPC schedulers. More importantly, the same workflow will work and scale
64 |   appropriately regardless of whether it's on a laptop or cluster *without
65 |   modification*.
66 | 
67 | * Snakemake uses pure Python syntax. There is no tool-specific language to
68 |   learn like in GNU Make, Nextflow, WDL, etc. Even if students end up not
69 |   liking Snakemake, you've still taught them how to program in Python at the
70 |   end of the day.
71 | 
72 | * Anything that you can do in Python, you can do with Snakemake (since you can
73 |   pretty much execute arbitrary Python code anywhere).
74 | 
75 | * Snakemake was written to be as similar to GNU Make as possible. Users already
76 |   familiar with Make will find Snakemake quite easy to use.
77 | 
78 | * It's easy. You can teach Snakemake in an afternoon.
79 | 
--------------------------------------------------------------------------------
/_extras/figures.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Figures
4 | permalink: /figures/
5 | ---
6 | {% include all_figures.html %}
7 | 
--------------------------------------------------------------------------------
/_extras/guide.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: "Instructor Notes"
4 | permalink: /guide/
5 | ---
6 | 
7 | This lesson does not cover the specifics of using a Python IDE. As the
8 | instructor, you should be familiar with the editor you intend to teach, as well
9 | as how to install/set it up across all three platforms (Windows, macOS, Linux).
10 | If you don't have a preference for one editor over another, we recommend using
11 | either Jupyter Notebooks, Spyder, or a text editor/IPython console (because
12 | these come preinstalled with Anaconda).
13 | 
14 | You have the option of running the Snakemake portion of the workshop either on
15 | student laptops or an HPC cluster. If you end up going the laptop route, be
16 | aware of how to run things via the Windows command line (since Snakemake
17 | natively works on Windows). The only significant change is the `snakemake
18 | clean` rule: `rm -f *.dat` should be changed to `del *.dat`.
19 | 
20 | If students get lost, there is a hidden `.Snakefile` file in the lesson
21 | materials ([`snakemake-lesson.tar.gz`][snakemake-lesson]) that students can use
22 | as a reference or use for the final "cluster submission" section.
23 | 
24 | [snakemake-lesson]: {{ page.root }}/files/snakemake-lesson.tar.gz
25 | 
--------------------------------------------------------------------------------
/_includes/all_figures.html:
--------------------------------------------------------------------------------
1 |

2 |
3 |

4 |
5 |

6 |
7 | -------------------------------------------------------------------------------- /_includes/figure.html: -------------------------------------------------------------------------------- 1 |
2 | {% if include.url != "" %} 3 | 4 | {% endif %} 5 | {{ include.alt | relative_url}} 7 | {% if include.url != "" %} 8 | 9 | {% endif %} 10 |
11 | 12 | {{ include.caption }} 13 | 14 |
15 |
16 | -------------------------------------------------------------------------------- /_includes/links.md: -------------------------------------------------------------------------------- 1 | [cc-by-human]: https://creativecommons.org/licenses/by/4.0/ 2 | [cc-by-legal]: https://creativecommons.org/licenses/by/4.0/legalcode 3 | [ci]: http://communityin.org/ 4 | [coc-reporting]: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html#reporting-guidelines 5 | [coc]: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html 6 | [concept-maps]: https://carpentries.github.io/instructor-training/05-memory/ 7 | [contrib-covenant]: https://contributor-covenant.org/ 8 | [contributing]: {{ site.github.repository_url }}/blob/gh-pages/CONTRIBUTING.md 9 | [cran-checkpoint]: https://cran.r-project.org/package=checkpoint 10 | [cran-knitr]: https://cran.r-project.org/package=knitr 11 | [cran-stringr]: https://cran.r-project.org/package=stringr 12 | [dc-lessons]: http://www.datacarpentry.org/lessons/ 13 | [email]: mailto:team@carpentries.org 14 | [github-importer]: https://import.github.com/ 15 | [importer]: https://github.com/new/import 16 | [jekyll-collection]: https://jekyllrb.com/docs/collections/ 17 | [jekyll-install]: https://jekyllrb.com/docs/installation/ 18 | [jekyll-windows]: http://jekyll-windows.juthilo.com/ 19 | [jekyll]: https://jekyllrb.com/ 20 | [jupyter]: https://jupyter.org/ 21 | [lc-lessons]: https://librarycarpentry.org/#portfolio 22 | [lesson-example]: https://carpentries.github.io/lesson-example/ 23 | [mit-license]: https://opensource.org/licenses/mit-license.html 24 | [morea]: https://morea-framework.github.io/ 25 | [numfocus]: https://numfocus.org/ 26 | [osi]: https://opensource.org 27 | [pandoc]: https://pandoc.org/ 28 | [paper-now]: https://github.com/PeerJ/paper-now 29 | [python-gapminder]: https://swcarpentry.github.io/python-novice-gapminder/ 30 | [pyyaml]: https://pypi.python.org/pypi/PyYAML 31 | [r-markdown]: https://rmarkdown.rstudio.com/ 32 | [rstudio]: https://www.rstudio.com/ 33 | [ruby-install-guide]: https://www.ruby-lang.org/en/downloads/ 34 | [ruby-installer]: https://rubyinstaller.org/ 35 | [rubygems]: https://rubygems.org/pages/download/ 36 | [styles]: https://github.com/carpentries/styles/ 37 | [swc-lessons]: https://software-carpentry.org/lessons/ 38 | [swc-releases]: https://github.com/swcarpentry/swc-releases 39 | [workshop-repo]: {{ site.workshop_repo }} 40 | [yaml]: http://yaml.org/ 41 | -------------------------------------------------------------------------------- /aio.md: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /aio/index.html 3 | --- 4 | 5 | {% comment %} 6 | As a maintainer, you don't need to edit this file. 7 | If you notice that something doesn't work, please 8 | open an issue: 9 | {% endcomment %} 10 | 11 | {% include base_path.html %} 12 | 13 | {% include aio-script.md %} 14 | -------------------------------------------------------------------------------- /bin/chunk-options.R: -------------------------------------------------------------------------------- 1 | # These settings control the behavior of all chunks in the novice R materials. 2 | # For example, to generate the lessons with all the output hidden, simply change 3 | # `results` from "markup" to "hide". 
4 | # For more information on available chunk options, see 5 | # http://yihui.name/knitr/options#chunk_options 6 | 7 | library("knitr") 8 | 9 | fix_fig_path <- function(pth) file.path("..", pth) 10 | 11 | 12 | ## We set the path for the figures globally below, so if we want to 13 | ## customize it for individual episodes, we can append a prefix to the 14 | ## global path. For instance, if we call knitr_fig_path("01-") in the 15 | ## first episode of the lesson, it will generate the figures in 16 | ## `fig/rmd-01-` 17 | knitr_fig_path <- function(prefix) { 18 | new_path <- paste0(opts_chunk$get("fig.path"), 19 | prefix) 20 | opts_chunk$set(fig.path = new_path) 21 | } 22 | 23 | ## We use the rmd- prefix for the figures generated by the lessons so 24 | ## they can be easily identified and deleted by `make clean-rmd`. The 25 | ## working directory when the lessons are generated is the root so the 26 | ## figures need to be saved in fig/, but when the site is generated, 27 | ## the episodes will be one level down. We fix the path using the 28 | ## `fig.process` option. 29 | 30 | opts_chunk$set(tidy = FALSE, results = "markup", comment = NA, 31 | fig.align = "center", fig.path = "fig/rmd-", 32 | fig.process = fix_fig_path, 33 | fig.width = 8.5, fig.height = 8.5, 34 | fig.retina = 2) 35 | 36 | # The hooks below add html tags to the code chunks and their output so that they 37 | # are properly formatted when the site is built. 38 | 39 | hook_in <- function(x, options) { 40 | lg <- tolower(options$engine) 41 | style <- paste0(".language-", lg) 42 | 43 | stringr::str_c("\n\n~~~\n", 44 | paste0(x, collapse="\n"), 45 | "\n~~~\n{: ", style, "}\n\n") 46 | } 47 | 48 | hook_out <- function(x, options) { 49 | x <- gsub("\n$", "", x) 50 | stringr::str_c("\n\n~~~\n", 51 | paste0(x, collapse="\n"), 52 | "\n~~~\n{: .output}\n\n") 53 | } 54 | 55 | hook_error <- function(x, options) { 56 | x <- gsub("\n$", "", x) 57 | stringr::str_c("\n\n~~~\n", 58 | paste0(x, collapse="\n"), 59 | "\n~~~\n{: .error}\n\n") 60 | } 61 | 62 | hook_warning <- function(x, options) { 63 | x <- gsub("\n$", "", x) 64 | stringr::str_c("\n\n~~~\n", 65 | paste0(x, collapse = "\n"), 66 | "\n~~~\n{: .warning}\n\n") 67 | } 68 | 69 | knit_hooks$set(source = hook_in, output = hook_out, warning = hook_warning, 70 | error = hook_error, message = hook_out) 71 | -------------------------------------------------------------------------------- /bin/extract_figures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | import sys 5 | import os 6 | import glob 7 | from optparse import OptionParser 8 | 9 | from util import Reporter, read_markdown, IMAGE_FILE_SUFFIX 10 | 11 | def main(): 12 | """Main driver.""" 13 | 14 | args = parse_args() 15 | images = [] 16 | for filename in args.filenames: 17 | images += get_images(args.parser, filename) 18 | save(sys.stdout, images) 19 | 20 | 21 | def parse_args(): 22 | """Parse command-line arguments.""" 23 | 24 | parser = OptionParser() 25 | parser.add_option('-p', '--parser', 26 | default=None, 27 | dest='parser', 28 | help='path to Markdown parser') 29 | 30 | args, extras = parser.parse_args() 31 | require(args.parser is not None, 32 | 'Path to Markdown parser not provided') 33 | require(extras, 34 | 'No filenames specified') 35 | 36 | args.filenames = extras 37 | return args 38 | 39 | 40 | def get_filenames(source_dir): 41 | """Get all filenames to be searched for images.""" 42 | 43 | return 
glob.glob(os.path.join(source_dir, '*.md')) 44 | 45 | 46 | def get_images(parser, filename): 47 | """Extract all images from file.""" 48 | 49 | content = read_markdown(parser, filename) 50 | result = [] 51 | find_image_nodes(content['doc'], result) 52 | find_image_links(content['doc'], result) 53 | return result 54 | 55 | 56 | def find_image_nodes(doc, result): 57 | """Find all nested nodes representing images.""" 58 | 59 | if (doc['type'] == 'img') or \ 60 | ((doc['type'] == 'html_element') and (doc['value'] == 'img')): 61 | alt = doc['attr'].get('alt', '') 62 | result.append({'alt': alt, 'src': doc['attr']['src']}) 63 | else: 64 | for child in doc.get('children', []): 65 | find_image_nodes(child, result) 66 | 67 | 68 | def find_image_links(doc, result): 69 | """Find all links to files in the 'fig' directory.""" 70 | 71 | if ((doc['type'] == 'a') and ('attr' in doc) and ('href' in doc['attr'])) \ 72 | or \ 73 | ((doc['type'] == 'html_element') and (doc['value'] == 'a') and ('href' in doc['attr'])): 74 | path = doc['attr']['href'] 75 | if os.path.splitext(path)[1].lower() in IMAGE_FILE_SUFFIX: 76 | result.append({'alt':'', 'src': doc['attr']['href']}) 77 | else: 78 | for child in doc.get('children', []): 79 | find_image_links(child, result) 80 | 81 | 82 | def save(stream, images): 83 | """Save results as Markdown.""" 84 | 85 | text = '\n
\n'.join(['

{0}

'.format(img['alt'], img['src']) for img in images])
86 |     print(text, file=stream)
87 | 
88 | 
89 | def require(condition, message):
90 |     """Fail if condition not met."""
91 | 
92 |     if not condition:
93 |         print(message, file=sys.stderr)
94 |         sys.exit(1)
95 | 
96 | 
97 | if __name__ == '__main__':
98 |     main()
99 | 
--------------------------------------------------------------------------------
/bin/generate_md_episodes.R:
--------------------------------------------------------------------------------
1 | generate_md_episodes <- function() {
2 | 
3 |   library("methods")
4 | 
5 |   if (!require("remotes", quietly = TRUE)) {
6 |     install.packages("remotes", repos = c(CRAN = "https://cloud.r-project.org/"))
7 |   }
8 | 
9 |   if (!require("requirements", quietly = TRUE)) {
10 |     remotes::install_github("hadley/requirements")
11 |   }
12 | 
13 |   required_pkgs <- unique(c(
14 |     ## Packages for episodes
15 |     requirements:::req_dir("_episodes_rmd"),
16 |     ## Packages for tools
17 |     requirements:::req_dir("bin")
18 |   ))
19 | 
20 |   missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
21 | 
22 |   if (length(missing_pkgs)) {
23 |     message("Installing missing required packages: ",
24 |             paste(missing_pkgs, collapse=", "))
25 |     install.packages(missing_pkgs)
26 |   }
27 | 
28 |   if (require("knitr") && packageVersion("knitr") < '1.9.19')
29 |     stop("knitr must be version 1.9.20 or higher")
30 | 
31 |   ## get the Rmd file to process from the command line, and generate the path for their respective outputs
32 |   args <- commandArgs(trailingOnly = TRUE)
33 |   if (!identical(length(args), 2L)) {
34 |     stop("input and output file must be passed to the script")
35 |   }
36 | 
37 |   src_rmd <- args[1]
38 |   dest_md <- args[2]
39 | 
40 |   ## knit the Rmd into markdown
41 |   knitr::knit(src_rmd, output = dest_md)
42 | 
43 |   # Read the generated md files and add comments advising not to edit them
44 |   vapply(dest_md, function(y) {
45 |     con <- file(y)
46 |     mdfile <- readLines(con)
47 |     if (mdfile[1] != "---")
48 |       stop("Input file does not have a valid header")
49 |     mdfile <- append(mdfile, "# Please do not edit this file directly; it is auto generated.", after = 1)
50 |     mdfile <- append(mdfile, paste("# Instead, please edit",
51 |                                    basename(y), "in _episodes_rmd/"), after = 2)
52 |     writeLines(mdfile, con)
53 |     close(con)
54 |     return(paste("Warning added to YAML header of", y))
55 |   },
56 |   character(1))
57 | }
58 | 
59 | generate_md_episodes()
60 | 
--------------------------------------------------------------------------------
/bin/knit_lessons.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # Only try running R to translate files if there are some files present.
4 | # The Makefile passes in the names of files.
5 | 6 | if [ $# -ne 0 ] ; then 7 | Rscript -e "source('bin/generate_md_episodes.R')" "$@" 8 | fi 9 | -------------------------------------------------------------------------------- /bin/lesson_initialize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """Initialize a newly-created repository.""" 4 | 5 | 6 | import sys 7 | import os 8 | import shutil 9 | 10 | BOILERPLATE = ( 11 | '.travis.yml', 12 | 'AUTHORS', 13 | 'CITATION', 14 | 'CONTRIBUTING.md', 15 | 'MAINTENANCE.md', 16 | 'README.md', 17 | '_config.yml', 18 | '_episodes/01-introduction.md', 19 | '_extras/about.md', 20 | '_extras/discuss.md', 21 | '_extras/figures.md', 22 | '_extras/guide.md', 23 | 'aio.md', 24 | 'index.md', 25 | 'reference.md', 26 | 'setup.md', 27 | ) 28 | 29 | 30 | def main(): 31 | """Check for collisions, then create.""" 32 | 33 | # Check. 34 | errors = False 35 | for path in BOILERPLATE: 36 | if os.path.exists(path): 37 | print('Warning: {0} already exists.'.format(path), file=sys.stderr) 38 | errors = True 39 | if errors: 40 | print('**Exiting without creating files.**', file=sys.stderr) 41 | sys.exit(1) 42 | 43 | # Create. 44 | for path in BOILERPLATE: 45 | shutil.copyfile( 46 | "bin/boilerplate/{}".format(path), 47 | path 48 | ) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /bin/markdown_ast.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | # Use Kramdown parser to produce AST for Markdown document. 5 | 6 | require 'kramdown' 7 | require 'kramdown-parser-gfm' 8 | require 'json' 9 | 10 | markdown = $stdin.read 11 | doc = Kramdown::Document.new(markdown, input: 'GFM', hard_wrap: false) 12 | tree = doc.to_hash_a_s_t 13 | puts JSON.pretty_generate(tree) 14 | -------------------------------------------------------------------------------- /bin/repo_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | Check repository settings. 3 | """ 4 | 5 | 6 | import sys 7 | import os 8 | from subprocess import Popen, PIPE 9 | import re 10 | from argparse import ArgumentParser 11 | 12 | from util import Reporter, require 13 | 14 | # Import this way to produce a more useful error message. 15 | try: 16 | import requests 17 | except ImportError: 18 | print('Unable to import requests module: please install requests', file=sys.stderr) 19 | sys.exit(1) 20 | 21 | 22 | # Pattern to match Git command-line output for remotes => (user name, project name). 23 | P_GIT_REMOTE = re.compile(r'upstream\s+(?:https://|git@)github.com[:/]([^/]+)/([^.]+)(\.git)?\s+\(fetch\)') 24 | 25 | # Repository URL format string. 26 | F_REPO_URL = 'https://github.com/{0}/{1}/' 27 | 28 | # Pattern to match repository URLs => (user name, project name) 29 | P_REPO_URL = re.compile(r'https?://github\.com/([^.]+)/([^/]+)/?') 30 | 31 | # API URL format string. 32 | F_API_URL = 'https://api.github.com/repos/{0}/{1}/labels' 33 | 34 | # Expected labels and colors. 
35 | EXPECTED = { 36 | 'help wanted': 'dcecc7', 37 | 'status:in progress': '9bcc65', 38 | 'status:changes requested': '679f38', 39 | 'status:wait': 'fff2df', 40 | 'status:refer to cac': 'ffdfb2', 41 | 'status:need more info': 'ee6c00', 42 | 'status:blocked': 'e55100', 43 | 'status:out of scope': 'eeeeee', 44 | 'status:duplicate': 'bdbdbd', 45 | 'type:typo text': 'f8bad0', 46 | 'type:bug': 'eb3f79', 47 | 'type:formatting': 'ac1357', 48 | 'type:template and tools': '7985cb', 49 | 'type:instructor guide': '00887a', 50 | 'type:discussion': 'b2e5fc', 51 | 'type:enhancement': '7fdeea', 52 | 'type:clarification': '00acc0', 53 | 'type:teaching example': 'ced8dc', 54 | 'good first issue': 'ffeb3a', 55 | 'high priority': 'd22e2e' 56 | } 57 | 58 | 59 | def main(): 60 | """ 61 | Main driver. 62 | """ 63 | 64 | args = parse_args() 65 | reporter = Reporter() 66 | repo_url = get_repo_url(args.repo_url) 67 | check_labels(reporter, repo_url) 68 | reporter.report() 69 | 70 | 71 | def parse_args(): 72 | """ 73 | Parse command-line arguments. 74 | """ 75 | 76 | parser = ArgumentParser(description="""Check repository settings.""") 77 | parser.add_argument('-r', '--repo', 78 | default=None, 79 | dest='repo_url', 80 | help='repository URL') 81 | parser.add_argument('-s', '--source', 82 | default=os.curdir, 83 | dest='source_dir', 84 | help='source directory') 85 | 86 | args, extras = parser.parse_known_args() 87 | require(not extras, 88 | 'Unexpected trailing command-line arguments "{0}"'.format(extras)) 89 | 90 | return args 91 | 92 | 93 | def get_repo_url(repo_url): 94 | """ 95 | Figure out which repository to query. 96 | """ 97 | 98 | # Explicitly specified. 99 | if repo_url is not None: 100 | return repo_url 101 | 102 | # Guess. 103 | cmd = 'git remote -v' 104 | p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, 105 | close_fds=True, universal_newlines=True, encoding='utf-8') 106 | stdout_data, stderr_data = p.communicate() 107 | stdout_data = stdout_data.split('\n') 108 | matches = [P_GIT_REMOTE.match(line) for line in stdout_data] 109 | matches = [m for m in matches if m is not None] 110 | require(len(matches) == 1, 111 | 'Unexpected output from git remote command: "{0}"'.format(matches)) 112 | 113 | username = matches[0].group(1) 114 | require( 115 | username, 'empty username in git remote output {0}'.format(matches[0])) 116 | 117 | project_name = matches[0].group(2) 118 | require( 119 | project_name, 'empty project name in git remote output {0}'.format(matches[0])) 120 | 121 | url = F_REPO_URL.format(username, project_name) 122 | return url 123 | 124 | 125 | def check_labels(reporter, repo_url): 126 | """ 127 | Check labels in repository. 128 | """ 129 | 130 | actual = get_labels(repo_url) 131 | extra = set(actual.keys()) - set(EXPECTED.keys()) 132 | 133 | reporter.check(not extra, 134 | None, 135 | 'Extra label(s) in repository {0}: {1}', 136 | repo_url, ', '.join(sorted(extra))) 137 | 138 | missing = set(EXPECTED.keys()) - set(actual.keys()) 139 | reporter.check(not missing, 140 | None, 141 | 'Missing label(s) in repository {0}: {1}', 142 | repo_url, ', '.join(sorted(missing))) 143 | 144 | overlap = set(EXPECTED.keys()).intersection(set(actual.keys())) 145 | for name in sorted(overlap): 146 | reporter.check(EXPECTED[name].lower() == actual[name].lower(), 147 | None, 148 | 'Color mismatch for label {0} in {1}: expected {2}, found {3}', 149 | name, repo_url, EXPECTED[name], actual[name]) 150 | 151 | 152 | def get_labels(repo_url): 153 | """ 154 | Get actual labels from repository.
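    Returns a dict mapping each label name to its color, a hex string
    without the leading '#'.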
155 | """ 156 | 157 | m = P_REPO_URL.match(repo_url) 158 | require( 159 | m, 'repository URL {0} does not match expected pattern'.format(repo_url)) 160 | 161 | username = m.group(1) 162 | require(username, 'empty username in repository URL {0}'.format(repo_url)) 163 | 164 | project_name = m.group(2) 165 | require( 166 | username, 'empty project name in repository URL {0}'.format(repo_url)) 167 | 168 | url = F_API_URL.format(username, project_name) 169 | r = requests.get(url) 170 | require(r.status_code == 200, 171 | 'Request for {0} failed with {1}'.format(url, r.status_code)) 172 | 173 | result = {} 174 | for entry in r.json(): 175 | result[entry['name']] = entry['color'] 176 | return result 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /bin/run-make-docker-serve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o pipefail 5 | set -o nounset 6 | 7 | 8 | bundle install 9 | bundle update 10 | exec bundle exec jekyll serve --host 0.0.0.0 11 | -------------------------------------------------------------------------------- /bin/test_lesson_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import unittest 4 | 5 | import lesson_check 6 | import util 7 | 8 | 9 | class TestFileList(unittest.TestCase): 10 | def setUp(self): 11 | self.reporter = util.Reporter() # TODO: refactor reporter class. 12 | 13 | def test_file_list_has_expected_entries(self): 14 | # For first pass, simply assume that all required files are present 15 | all_filenames = [filename.replace('%', '') 16 | for filename in lesson_check.REQUIRED_FILES] 17 | 18 | lesson_check.check_fileset('', self.reporter, all_filenames) 19 | self.assertEqual(len(self.reporter.messages), 0) 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /bin/util.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | from subprocess import Popen, PIPE 5 | 6 | # Import this way to produce a more useful error message. 7 | try: 8 | import yaml 9 | except ImportError: 10 | print('Unable to import YAML module: please install PyYAML', file=sys.stderr) 11 | sys.exit(1) 12 | 13 | 14 | # Things an image file's name can end with. 15 | IMAGE_FILE_SUFFIX = { 16 | '.gif', 17 | '.jpg', 18 | '.png', 19 | '.svg' 20 | } 21 | 22 | # Files that shouldn't be present. 23 | UNWANTED_FILES = [ 24 | '.nojekyll' 25 | ] 26 | 27 | # Marker to show that an expected value hasn't been provided. 28 | # (Can't use 'None' because that might be a legitimate value.) 
29 | REPORTER_NOT_SET = [] 30 | 31 | 32 | class Reporter: 33 | """Collect and report errors.""" 34 | 35 | def __init__(self): 36 | """Constructor.""" 37 | self.messages = [] 38 | 39 | def check_field(self, filename, name, values, key, expected=REPORTER_NOT_SET): 40 | """Check that a dictionary has an expected value.""" 41 | 42 | if key not in values: 43 | self.add(filename, '{0} does not contain {1}', name, key) 44 | elif expected is REPORTER_NOT_SET: 45 | pass 46 | elif type(expected) in (tuple, set, list): 47 | if values[key] not in expected: 48 | self.add( 49 | filename, '{0} {1} value {2} is not in {3}', name, key, values[key], expected) 50 | elif values[key] != expected: 51 | self.add(filename, '{0} {1} is {2} not {3}', 52 | name, key, values[key], expected) 53 | 54 | def check(self, condition, location, fmt, *args): 55 | """Append error if condition not met.""" 56 | 57 | if not condition: 58 | self.add(location, fmt, *args) 59 | 60 | def add(self, location, fmt, *args): 61 | """Append error unilaterally.""" 62 | 63 | self.messages.append((location, fmt.format(*args))) 64 | 65 | @staticmethod 66 | def pretty(item): 67 | location, message = item 68 | if isinstance(location, type(None)): 69 | return message 70 | elif isinstance(location, str): 71 | return location + ': ' + message 72 | elif isinstance(location, tuple): 73 | return '{0}:{1}: '.format(*location) + message 74 | 75 | print('Unknown item "{0}"'.format(item), file=sys.stderr) 76 | return NotImplemented 77 | 78 | @staticmethod 79 | def key(item): 80 | location, message = item 81 | if isinstance(location, type(None)): 82 | return ('', -1, message) 83 | elif isinstance(location, str): 84 | return (location, -1, message) 85 | elif isinstance(location, tuple): 86 | return (location[0], location[1], message) 87 | 88 | print('Unknown item "{0}"'.format(item), file=sys.stderr) 89 | return NotImplemented 90 | 91 | def report(self, stream=sys.stdout): 92 | """Report all messages in order.""" 93 | 94 | if not self.messages: 95 | return 96 | 97 | for m in sorted(self.messages, key=self.key): 98 | print(self.pretty(m), file=stream) 99 | 100 | 101 | def read_markdown(parser, path): 102 | """ 103 | Get YAML and AST for Markdown file, returning 104 | {'metadata':yaml, 'metadata_len':N, 'text':text, 'lines':[(i, line, len)], 'doc':doc}. 105 | """ 106 | 107 | # Split and extract YAML (if present). 108 | with open(path, 'r') as reader: 109 | body = reader.read() 110 | metadata_raw, metadata_yaml, body = split_metadata(path, body) 111 | 112 | # Split into lines. 113 | metadata_len = 0 if metadata_raw is None else metadata_raw.count('\n') 114 | lines = [(metadata_len+i+1, line, len(line)) 115 | for (i, line) in enumerate(body.split('\n'))] 116 | 117 | # Parse Markdown. 118 | cmd = 'bundle exec ruby {0}'.format(parser) 119 | p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, 120 | close_fds=True, universal_newlines=True) 121 | stdout_data, stderr_data = p.communicate(body) 122 | doc = json.loads(stdout_data) 123 | 124 | return { 125 | 'metadata': metadata_yaml, 126 | 'metadata_len': metadata_len, 127 | 'text': body, 128 | 'lines': lines, 129 | 'doc': doc 130 | } 131 | 132 | 133 | def split_metadata(path, text): 134 | """ 135 | Get raw (text) metadata, metadata as YAML, and rest of body. 136 | If no metadata, return (None, None, body). 
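    For example (illustrative), text of "---\ntitle: X\n---\nBody" yields
    ('\ntitle: X\n', {'title': 'X'}, '\nBody').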
137 | """ 138 | 139 | metadata_raw = None 140 | metadata_yaml = None 141 | 142 | pieces = text.split('---', 2) 143 | if len(pieces) == 3: 144 | metadata_raw = pieces[1] 145 | text = pieces[2] 146 | try: 147 | metadata_yaml = yaml.load(metadata_raw, Loader=yaml.SafeLoader) 148 | except yaml.YAMLError as e: 149 | print('Unable to parse YAML header in {0}:\n{1}'.format( 150 | path, e), file=sys.stderr) 151 | sys.exit(1) 152 | 153 | return metadata_raw, metadata_yaml, text 154 | 155 | 156 | def load_yaml(filename): 157 | """ 158 | Wrapper around YAML loading so that 'import yaml' is only needed 159 | in one file. 160 | """ 161 | 162 | try: 163 | with open(filename, 'r') as reader: 164 | return yaml.load(reader, Loader=yaml.SafeLoader) 165 | except (yaml.YAMLError, IOError) as e: 166 | print('Unable to load YAML file {0}:\n{1}'.format( 167 | filename, e), file=sys.stderr) 168 | sys.exit(1) 169 | 170 | 171 | def check_unwanted_files(dir_path, reporter): 172 | """ 173 | Check that unwanted files are not present. 174 | """ 175 | 176 | for filename in UNWANTED_FILES: 177 | path = os.path.join(dir_path, filename) 178 | reporter.check(not os.path.exists(path), 179 | path, 180 | "Unwanted file found") 181 | 182 | 183 | def require(condition, message): 184 | """Fail if condition not met.""" 185 | 186 | if not condition: 187 | print(message, file=sys.stderr) 188 | sys.exit(1) 189 | -------------------------------------------------------------------------------- /code/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/code/.gitkeep -------------------------------------------------------------------------------- /commands.mk: -------------------------------------------------------------------------------- 1 | files/snakemake-lesson.tar.gz: files/snakemake-lesson/*py 2 | @rm -vf $@ && cd files && tar vczf ../$@ snakemake-lesson/*py snakemake-lesson/books/* snakemake-lesson/matplotlibrc snakemake-lesson/cluster.json snakemake-lesson/.Snakefile 3 | 4 | files/snakemake-lesson.zip: files/snakemake-lesson/*py 5 | @rm -vf $@ && cd files && zip ../$@ snakemake-lesson/*py snakemake-lesson/books/* snakemake-lesson/matplotlibrc snakemake-lesson/cluster.json snakemake-lesson/.Snakefile 6 | 7 | ## prep-release : compress contents of snakemake-lesson for release 8 | prep-release: files/snakemake-lesson.zip files/snakemake-lesson.tar.gz 9 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/data/.gitkeep -------------------------------------------------------------------------------- /fig/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/fig/.gitkeep -------------------------------------------------------------------------------- /fig/02-challenge-dag.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | snakemake_dag 11 | 12 | 13 | 14 | 0 15 | 16 | zipf_test 17 | 18 | 19 | 20 | 1 21 | 22 | count_words 23 | 24 | 25 | 26 | 1->0 27 | 28 | 29 | 30 | 31 | 32 | 2 33 | 34 | count_words_abyss 35 | 36 | 37 | 38 | 2->0 39 | 40 | 41 | 42 | 43 | 44 | 3 45 | 46 | 
9 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/data/.gitkeep -------------------------------------------------------------------------------- /fig/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/fig/.gitkeep -------------------------------------------------------------------------------- /fig/02-challenge-dag.svg: -------------------------------------------------------------------------------- [SVG figure; markup omitted. Snakemake DAG in which the rules count_words, count_words_abyss, and count_words_last each feed into zipf_test.] -------------------------------------------------------------------------------- /fig/02-dats-dag.svg: -------------------------------------------------------------------------------- [SVG figure; markup omitted. Snakemake DAG in which the rules count_words_abyss and count_words feed into the dats target.] -------------------------------------------------------------------------------- /files/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/files/.gitkeep -------------------------------------------------------------------------------- /files/snakemake-lesson.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/files/snakemake-lesson.tar.gz -------------------------------------------------------------------------------- /files/snakemake-lesson.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hpc-carpentry/hpc-python/4af94a57eccb8a2b400db514d682bcfc6cdc9a7d/files/snakemake-lesson.zip -------------------------------------------------------------------------------- /files/snakemake-lesson/.Snakefile: -------------------------------------------------------------------------------- 1 | # This is a "hidden" version of the final Snakefile if students want/need 2 | # to run the instructor's copy. 3 | 4 | # our zipf analysis pipeline 5 | DATS = glob_wildcards('books/{book}.txt').book 6 | 7 | localrules: all, clean, make_archive 8 | 9 | rule all: 10 | input: 11 | 'zipf_analysis.tar.gz' 12 | 13 | # delete everything so we can re-run things 14 | # deletes a little extra for purposes of lesson prep 15 | rule clean: 16 | shell: 17 | ''' 18 | rm -rf results dats plots __pycache__ 19 | rm -f results.txt zipf_analysis.tar.gz *.out *.log *.pyc 20 | ''' 21 | 22 | # count words in one of our "books" 23 | rule count_words: 24 | input: 25 | wc='wordcount.py', 26 | book='books/{file}.txt' 27 | output: 'dats/{file}.dat' 28 | threads: 4 29 | log: 'dats/{file}.log' 30 | shell: 31 | ''' 32 | echo "Running {input.wc} with {threads} cores on {input.book}."
&> {log} && 33 | python {input.wc} {input.book} {output} &>> {log} 34 | ''' 35 | 36 | # create a plot for each book 37 | rule make_plot: 38 | input: 39 | plotcount='plotcount.py', 40 | book='dats/{file}.dat' 41 | output: 'plots/{file}.png' 42 | resources: gpu=1 43 | shell: 'python {input.plotcount} {input.book} {output}' 44 | 45 | # generate summary table 46 | rule zipf_test: 47 | input: 48 | zipf='zipf_test.py', 49 | books=expand('dats/{book}.dat', book=DATS) 50 | output: 'results.txt' 51 | shell: 'python {input.zipf} {input.books} > {output}' 52 | 53 | # create an archive with all of our results 54 | rule make_archive: 55 | input: 56 | expand('plots/{book}.png', book=DATS), 57 | expand('dats/{book}.dat', book=DATS), 58 | 'results.txt' 59 | output: 'zipf_analysis.tar.gz' 60 | shell: 'tar -czvf {output} {input}' 61 | 62 | -------------------------------------------------------------------------------- /files/snakemake-lesson/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": 3 | { 4 | "account": "aSLURMSubmissionAccount", 5 | "mem": "1G", 6 | "time": "0:5:0" 7 | }, 8 | "count_words": 9 | { 10 | "time": "0:10:0", 11 | "mem": "2G" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /files/snakemake-lesson/matplotlibrc: -------------------------------------------------------------------------------- 1 | # This is a fix so that Matplotlib can create plots 2 | # non-interactively on a cluster. 3 | 4 | backend: Agg 5 | 6 | -------------------------------------------------------------------------------- /files/snakemake-lesson/plotcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ###################################################################### 4 | # Never do the following in the real world, warnings are important! 5 | # This silences warnings only to not confuse learners. 6 | import warnings 7 | warnings.filterwarnings("ignore") 8 | ###################################################################### 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import sys 13 | 14 | try: 15 | from collections.abc import Sequence 16 | except ImportError: 17 | from collections import Sequence 18 | 19 | from wordcount import load_word_counts 20 | 21 | 22 | def plot_word_counts(counts, limit=10): 23 | """ 24 | Given a list of (word, count, percentage) tuples, plot the counts as a 25 | histogram. Only the first limit tuples are plotted. 26 | """ 27 | plt.title("Word Counts") 28 | limited_counts = counts[0:limit] 29 | word_data = [word for (word, _, _) in limited_counts] 30 | count_data = [count for (_, count, _) in limited_counts] 31 | position = np.arange(len(word_data)) 32 | width = 1.0 33 | ax = plt.axes() 34 | ax.set_xticks(position + (width / 2)) 35 | ax.set_xticklabels(word_data) 36 | plt.bar(position, count_data, width, color='b') 37 | 38 | 39 | def typeset_labels(labels=None, gap=5): 40 | """ 41 | Given a list of labels, create a new list of labels such that each label 42 | is right-padded by spaces so that every label has the same width, then 43 | is further right padded by ' ' * gap. 
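    For example (illustrative): typeset_labels(['a', 'bb'], gap=2)
    returns ['a   ', 'bb  '].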
44 | """ 45 | if not isinstance(labels, Sequence): 46 | labels = list(range(labels)) 47 | labels = [str(i) for i in labels] 48 | label_lens = [len(s) for s in labels] 49 | label_width = max(label_lens) 50 | output = [] 51 | for label in labels: 52 | label_string = label + ' ' * (label_width - len(label)) + (' ' * gap) 53 | output.append(label_string) 54 | assert len(set(len(s) for s in output)) == 1 # Check all have same length. 55 | return output 56 | 57 | 58 | def get_ascii_bars(values, truncate=True, maxlen=10, symbol='#'): 59 | """ 60 | Given a list of values, create a list of strings of symbols, where each 61 | strings contains N symbols where N = ()(value / minimum) / 62 | (maximum - minimum)) * (maxlen / len(symbol)). 63 | """ 64 | maximum = max(values) 65 | if truncate: 66 | minimum = min(values) - 1 67 | else: 68 | minimum = 0 69 | 70 | # Type conversion to floats is required for compatibility with python 2, 71 | # because it doesn't do integer division correctly (it does floor division 72 | # for integers). 73 | value_range=float(maximum - minimum) 74 | prop_values = [(float(value - minimum) / value_range) for value in values] 75 | 76 | # Type conversion to int required for compatibility with python 2 77 | biggest_bar = symbol * int(round(maxlen / len(symbol))) 78 | bars = [biggest_bar[:int(round(prop * len(biggest_bar)))] 79 | for prop in prop_values] 80 | 81 | return bars 82 | 83 | 84 | def plot_ascii_bars(values, labels=None, screenwidth=80, gap=2, truncate=True): 85 | """ 86 | Given a list of values and labels, create right-padded labels for each 87 | label and strings of symbols representing the associated values. 88 | """ 89 | if not labels: 90 | try: 91 | values, labels = list(zip(*values)) 92 | except TypeError: 93 | labels = len(values) 94 | labels = typeset_labels(labels=labels, gap=gap) 95 | bars = get_ascii_bars(values, maxlen=screenwidth - gap - len(labels[0]), 96 | truncate=truncate) 97 | return [s + b for s, b in zip(labels, bars)] 98 | 99 | 100 | if __name__ == '__main__': 101 | input_file = sys.argv[1] 102 | output_file = sys.argv[2] 103 | limit = 10 104 | if len(sys.argv) > 3: 105 | limit = int(sys.argv[3]) 106 | counts = load_word_counts(input_file) 107 | plot_word_counts(counts, limit) 108 | if output_file == "show": 109 | plt.show() 110 | elif output_file == 'ascii': 111 | words, counts, _ = list(zip(*counts)) 112 | for line in plot_ascii_bars(counts[:limit], words[:limit], 113 | truncate=False): 114 | print(line) 115 | else: 116 | plt.savefig(output_file) 117 | -------------------------------------------------------------------------------- /files/snakemake-lesson/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | DELIMITERS = ". , ; : ? $ @ ^ < > # % ` ! * - = ( ) [ ] { } / \" '".split() 6 | 7 | 8 | def load_text(filename): 9 | """ 10 | Load lines from a plain-text file and return these as a list, with 11 | trailing newlines stripped. 12 | """ 13 | with open(filename) as input_fd: 14 | lines = input_fd.read().splitlines() 15 | return lines 16 | 17 | 18 | def save_word_counts(filename, counts): 19 | """ 20 | Save a list of [word, count, percentage] lists to a file, in the form 21 | "word count percentage", one tuple per line. 
22 | """ 23 | with open(filename, 'w') as output: 24 | for count in counts: 25 | output.write("%s\n" % " ".join(str(c) for c in count)) 26 | 27 | 28 | def load_word_counts(filename): 29 | """ 30 | Load a list of (word, count, percentage) tuples from a file where each 31 | line is of the form "word count percentage". Lines starting with # are 32 | ignored. 33 | """ 34 | counts = [] 35 | with open(filename, "r") as input_fd: 36 | for line in input_fd: 37 | if not line.startswith("#"): 38 | fields = line.split() 39 | counts.append((fields[0], int(fields[1]), float(fields[2]))) 40 | return counts 41 | 42 | 43 | def update_word_counts(line, counts): 44 | """ 45 | Given a string, parse the string and update a dictionary of word 46 | counts (mapping words to counts of their frequencies). DELIMITERS are 47 | removed before the string is parsed. The function is case-insensitive 48 | and words in the dictionary are in lower-case. 49 | """ 50 | for purge in DELIMITERS: 51 | line = line.replace(purge, " ") 52 | words = line.split() 53 | for word in words: 54 | word = word.lower().strip() 55 | if word in counts: 56 | counts[word] += 1 57 | else: 58 | counts[word] = 1 59 | 60 | 61 | def calculate_word_counts(lines): 62 | """ 63 | Given a list of strings, parse each string and create a dictionary of 64 | word counts (mapping words to counts of their frequencies). DELIMITERS 65 | are removed before the string is parsed. The function is 66 | case-insensitive and words in the dictionary are in lower-case. 67 | """ 68 | counts = {} 69 | for line in lines: 70 | update_word_counts(line, counts) 71 | return counts 72 | 73 | 74 | def word_count_dict_to_tuples(counts, decrease=True): 75 | """ 76 | Given a dictionary of word counts (mapping words to counts of their 77 | frequencies), convert this into an ordered list of tuples (word, 78 | count). The list is ordered by decreasing count, unless increase is 79 | True. 80 | """ 81 | return sorted(list(counts.items()), key=lambda key_value: key_value[1], 82 | reverse=decrease) 83 | 84 | 85 | def filter_word_counts(counts, min_length=1): 86 | """ 87 | Given a list of (word, count) tuples, create a new list with only 88 | those tuples whose word is >= min_length. 89 | """ 90 | stripped = [] 91 | for (word, count) in counts: 92 | if len(word) >= min_length: 93 | stripped.append((word, count)) 94 | return stripped 95 | 96 | 97 | def calculate_percentages(counts): 98 | """ 99 | Given a list of (word, count) tuples, create a new list (word, count, 100 | percentage) where percentage is the percentage number of occurrences 101 | of this word compared to the total number of words. 102 | """ 103 | total = 0 104 | for count in counts: 105 | total += count[1] 106 | tuples = [(word, count, (float(count) / total) * 100.0) 107 | for (word, count) in counts] 108 | return tuples 109 | 110 | 111 | def word_count(input_file, output_file, min_length=1): 112 | """ 113 | Load a file, calculate the frequencies of each word in the file and 114 | save in a new file the words, counts and percentages of the total in 115 | descending order. Only words whose length is >= min_length are 116 | included. 
117 | """ 118 | lines = load_text(input_file) 119 | counts = calculate_word_counts(lines) 120 | sorted_counts = word_count_dict_to_tuples(counts) 121 | sorted_counts = filter_word_counts(sorted_counts, min_length) 122 | percentage_counts = calculate_percentages(sorted_counts) 123 | save_word_counts(output_file, percentage_counts) 124 | 125 | if __name__ == '__main__': 126 | input_file = sys.argv[1] 127 | output_file = sys.argv[2] 128 | min_length = 1 129 | if len(sys.argv) > 3: 130 | min_length = int(sys.argv[3]) 131 | word_count(input_file, output_file, min_length) 132 | -------------------------------------------------------------------------------- /files/snakemake-lesson/zipf_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from wordcount import load_word_counts 3 | import sys 4 | 5 | def top_two_word(counts): 6 | """ 7 | Given a list of (word, count, percentage) tuples, 8 | return the top two word counts. 9 | """ 10 | limited_counts = counts[0:2] 11 | count_data = [count for (_, count, _) in limited_counts] 12 | return count_data 13 | 14 | 15 | if __name__ == '__main__': 16 | input_files = sys.argv[1:] 17 | print("Book\tFirst\tSecond\tRatio") 18 | for input_file in input_files: 19 | counts = load_word_counts(input_file) 20 | [first, second] = top_two_word(counts) 21 | bookname = input_file[:-4] 22 | print("%s\t%i\t%i\t%.2f" %(bookname, first, second, float(first)/second)) 23 | 24 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: lesson 3 | root: . 4 | --- 5 | 6 | Python is probably the most versatile language in existence. However one of its 7 | most useful features is its ability to tie things together and automate the 8 | execution of other programs. 9 | 10 | This tutorial focuses on using Python in high-performance computing 11 | environments to automate data analysis pipelines with 12 | [Snakemake](http://snakemake.readthedocs.io/en/stable/) (for a detailed 13 | discussion for why we are teaching Snakemake, see this lesson's discussion page). We’ll start with the basics and 15 | cover everything you need to get started. Some elements of writing 16 | performance-oriented code will be covered, but it is not the main focus. There 17 | is no prerequisite knowledge for this tutorial, although having some prior 18 | experience with the command-line or a compute cluster will be very helpful. 19 | 20 | At the end of this lesson, you will know how to: 21 | 22 | * Write and run basic Python programs. 23 | 24 | * Create a reproducible analysis pipeline in Python. 25 | 26 | * Run your pipeline on your computer or on a high-performance computing cluster 27 | and have it scale appropriately. 28 | 29 | 30 | > ## Setup 31 | > 32 | > You will want to have Python 3 and your favorite Python editor preinstalled 33 | > and ready to go. If you don’t know where to get things or what to install, 34 | > just install Miniconda (the Python 3 version) from 35 | > . 36 | > 37 | > To install snakemake, please run the following in a command-line terminal: 38 | > `conda install -c bioconda snakemake` 39 | > 40 | > The files used in this lesson can be downloaded 41 | > [here](files/snakemake-lesson.zip). 
42 | {: .prereq} 43 | -------------------------------------------------------------------------------- /lesson-outline.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Chapel for HPC Lesson Outline 4 | --- 5 | 6 | # Lesson outline and todo list 7 | 8 | This is the tentative list of tasks and topics for each lesson. 9 | Lesson writers are indicated with first/last initials (e.g. AR). 10 | Feel free to edit the topics for your section. 11 | 12 | ## 1. UNIX fundamentals - AR 13 | 14 | * SSH to a cluster 15 | * Bash fundamentals (`cd`, `ls`, ..., aliases, functions, ~/.bashrc) 16 | * Transferring files (`scp`? `sftp`? Maybe only one?) 17 | * Overview of HPC resources 18 | 19 | * What is a cloud? 20 | * What is a cluster? Different cluster types 21 | * Overview of services available (Compute Canada, Amazon EC2, etc.) 22 | 23 | ## 2. Submitting / running jobs - JS 24 | 25 | * Scheduler - lesson will cover SLURM (which can also run PBS scripts/commands 26 | natively) 27 | 28 | * Submitting jobs 29 | * Checking status of jobs 30 | * Deleting jobs 31 | * Job size consequences 32 | * GUI vs. batch programs (X-forwarding, SSH tunnels?) 33 | 34 | * Using software and environment modules 35 | * Playing friendly in the cluster 36 | 37 | * Understanding resource utilization 38 | * Profiling code - time, size, etc. 39 | * Getting system stats 40 | * Consequences of going over 41 | 42 | ## 3. Language refresher / introduction (Python - JB, Chapel - JZ+AR) 43 | 44 | * Programming language concepts 45 | 46 | * Compiled vs. interpreted languages 47 | * How does a program work? 48 | * Quick intro of programming language of choice 49 | 50 | * Major features + pros/cons 51 | * What is it good at? 52 | 53 | * Actual language overview 54 | 55 | * Basic syntax (arithmetic, variables, etc.) 56 | * Basic data structures (lists, arrays, etc.) 57 | * Defining functions 58 | * Conditional expressions 59 | * For-loops 60 | * Reading/writing data 61 | 62 | Some side notes: perhaps a quick refresh of key concepts right before use in 63 | parallel section, certain concepts could get mixed in right before they're 64 | needed by the parallel lesson. 65 | 66 | ## 4. Intro to parallel programming (Python - JB, Chapel - JZ+AR) 67 | 68 | * Pipelining / automatic job submission / serial farming 69 | * Shared memory programming 70 | * Distributed memory programming 71 | * Overview of good parallel design 72 | 73 | * Dependencies within own code 74 | * Race conditions 75 | 76 | * Typical problems and bottlenecks 77 | 78 | * running in parallel (parallel scaling) 79 | * parallel I/O (don't write a 1GB file from one processor if data is 80 | already distributed, etc.) 81 | * Storage limitations (millions of files, compression, text vs. binary 82 | storage) 83 | * Filesystem choice (home, scratch, tmp, etc.) 84 | 85 | 86 | Good luck! 87 | -------------------------------------------------------------------------------- /reference.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: reference 3 | permalink: /reference/ 4 | --- 5 | 6 | ## Glossary 7 | -------------------------------------------------------------------------------- /setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: page 3 | title: Setup 4 | root: . 5 | --- 6 | 7 | There are several pieces of software you will wish to install before the 8 | workshop. 
Though installation help will be provided at the workshop, we 9 | recommend installing (or at least downloading) these tools beforehand. 10 | 11 | ## Python 3 12 | 13 | Please install [Miniconda](https://docs.conda.io/en/latest/miniconda.html). 14 | While any installation of Python 3 will work, Miniconda is a minimal installer 15 | for Anaconda Python, and comes bundled with conda, its package manager. Its packages 16 | can also offer performance improvements over "vanilla" Python, as libraries such as NumPy are built against optimized numerical libraries. 17 | 18 | ## Snakemake 19 | 20 | Once you have a Miniconda environment configured, please install Snakemake from 21 | the [bioconda channel](https://anaconda.org/bioconda) with 22 | 23 | ``` 24 | $ conda install -c bioconda snakemake 25 | ``` 26 | {: .language-bash} 27 | 28 | ## Shell and SSH 29 | 30 | This lesson requires a terminal application (`bash`, `zsh`, or others) with 31 | the ability to securely connect to a remote machine (`ssh`). 32 | 33 | ### Where to type commands: How to open a new shell 34 | 35 | The shell is a program that enables us to send commands to the computer and 36 | receive output. It is also referred to as the terminal or command line. 37 | 38 | Some computers include a default Unix Shell program. The steps below describe 39 | some methods for identifying and opening a Unix Shell program if you already 40 | have one installed. There are also options for identifying and downloading a 41 | Unix Shell program, a Linux/UNIX emulator, or a program to access a Unix Shell 42 | on a server. 43 | 44 | #### Shell for Windows 45 | 46 | Computers with Windows operating systems do not automatically have a Unix Shell 47 | program installed. In this lesson, we encourage you to use an emulator included 48 | in Git for Windows, which gives you access to both Bash shell commands and Git. 49 | If you have attended a Software Carpentry workshop session, it is likely you 50 | have already received instructions on how to install Git for Windows. 51 | 52 | Once installed, you can open a terminal by running the program Git Bash from 53 | the Windows start menu. 54 | 55 | ##### Shell Programs for Windows 56 | 57 | * [Git for Windows](https://gitforwindows.org/) — *Recommended* 58 | * [Windows Subsystem for Linux]( 59 | https://docs.microsoft.com/en-us/windows/wsl/install-win10) 60 | — advanced option for Windows 10 61 | 62 | > ## Alternatives to Git for Windows 63 | > 64 | > Other solutions are available for running Bash commands on Windows. There is 65 | > now a Bash shell command-line tool available for Windows 10. Additionally, 66 | > you can run Bash commands on a remote computer or server that already has a 67 | > Unix Shell, from your Windows machine. This can usually be done through a 68 | > Secure Shell (SSH) client. One such client available for free for Windows 69 | > computers is PuTTY. See the reference below for information on installing and 70 | > using PuTTY, using the Windows 10 command-line tool, or installing and using 71 | > a Unix/Linux emulator.
72 | > 73 | > For advanced users, you may choose one of the following alternatives: 74 | > 75 | > * Install the [Windows Subsystem for 76 | > Linux][microsoft-wsl] 77 | > * Use the Windows [PowerShell][microsoft-powershell] 78 | > * Read up on [Using a Unix/Linux emulator][unix-emulator] (Cygwin) or Secure 79 | > Shell (SSH) client (PuTTY) 80 | > 81 | > > ## Warning 82 | > > 83 | > > Commands in the Windows Subsystem for Linux (WSL), PowerShell, or Cygwin 84 | > > may differ slightly from those shown in the lesson or presented in the 85 | > > workshop. Please ask if you encounter such a mismatch — you're 86 | > > probably not alone. 87 | > {: .challenge} 88 | {: .discussion} 89 | 90 | #### Shell for macOS 91 | 92 | On macOS, the default Unix Shell is accessible by running the Terminal program 93 | from the `/Applications/Utilities` folder in Finder. 94 | 95 | To open Terminal, try one or both of the following: 96 | 97 | * In Finder, select the Go menu, then select Utilities. Locate Terminal in the 98 | Utilities folder and open it. 99 | * Use the Mac ‘Spotlight’ computer search function. Search for: `Terminal` and 100 | press Return. 101 | 102 | For an introduction, see [How to Use Terminal on a Mac][mac-terminal]. 103 | 104 | #### Shell for Linux 105 | 106 | On most versions of Linux, the default Unix Shell is accessible by running the 107 | [(Gnome) Terminal](https://help.gnome.org/users/gnome-terminal/stable/) or 108 | [(KDE) Konsole](https://konsole.kde.org/) or 109 | [xterm](https://en.wikipedia.org/wiki/Xterm), which can be found via the 110 | applications menu or the search bar. 111 | 112 | #### Special cases 113 | 114 | If none of the options above address your circumstances, try an online search 115 | for: `Unix shell [your operating system]`. 116 | 117 | ### SSH for Secure Connections 118 | 119 | All students should have an SSH client installed. SSH is a tool that allows us 120 | to connect to and use a remote computer as our own. 121 | 122 | #### SSH for Windows 123 | 124 | Git for Windows comes with SSH preinstalled: you do not have to do anything. 125 | 126 | > ## GUI Support for Windows 127 | > 128 | > If you know that the software you will be running on the cluster requires a 129 | > graphical user interface (a GUI window needs to open for the application to 130 | > run properly), please install [MobaXterm](http://mobaxterm.mobatek.net) Home 131 | > Edition. 132 | {: .discussion} 133 | 134 | #### SSH for macOS 135 | 136 | macOS comes with SSH preinstalled: you do not have to do anything. 137 | 138 | > ## GUI Support for macOS 139 | > 140 | > If you know that the software you will be running requires a graphical user 141 | > interface, please install [XQuartz](https://www.xquartz.org). 142 | {: .discussion} 143 | 144 | #### SSH for Linux 145 | 146 | Linux comes with SSH and X window support preinstalled: you do not have to do 147 | anything. 148 | 149 | 150 | [mac-terminal]: http://www.macworld.co.uk/feature/mac-software/how-use-terminal-on-mac-3608274/ 151 | [microsoft-wsl]: https://docs.microsoft.com/en-us/windows/wsl/install-win10 152 | [microsoft-powershell]: https://docs.microsoft.com/en-us/powershell/scripting/learn/remoting/ssh-remoting-in-powershell-core?view=powershell-7 153 | [unix-emulator]: http://faculty.smu.edu/reynolds/unixtut/windows.html 154 | --------------------------------------------------------------------------------