├── .github
└── workflows
│ └── sphinx.yml
├── .gitignore
├── LICENSE
├── Makefile
├── content
├── a_list.dot
├── a_list.svg
├── binder.rst
├── conf.py
├── data-formats.rst
├── dependencies.rst
├── exercises.md
├── format_comparison_array.csv
├── format_comparison_tidy.csv
├── guide.rst
├── img
│ ├── binder
│ │ ├── binder.jpg
│ │ └── python_unmasked.jpg
│ ├── installation
│ │ ├── anaconda-navigator-jupyterlab.png
│ │ ├── anaconda-prompt.png
│ │ ├── jupyterlab-notebook.png
│ │ └── jupyterlab-terminal.png
│ ├── jupyter
│ │ ├── main-ui.png
│ │ └── notebook-ui.png
│ ├── numpy-advanced
│ │ ├── 01_memory_layout.svg
│ │ └── 02_views.svg
│ ├── pandas
│ │ ├── 01_table_dataframe.svg
│ │ └── tidy_data.png
│ └── xarray
│ │ ├── xarray_1d_plot.png
│ │ ├── xarray_2d_plot.png
│ │ ├── xarray_dataset_image.png
│ │ └── xarray_hist.png
├── index.rst
├── installation.rst
├── jupyter.ipynb
├── libraries.rst
├── ndarray.dot
├── ndarray.svg
├── numpy-advanced.rst
├── numpy.rst
├── packaging-example-project
│ ├── calculator
│ │ ├── __init__.py
│ │ ├── adding.py
│ │ ├── integrating.py
│ │ └── subtracting.py
│ ├── pyproject.toml
│ ├── test.py
│ └── test_editable.py
├── packaging.rst
├── pandas.rst
├── parallel-pi-multiprocessing.ipynb
├── parallel.rst
├── plotting-matplotlib.md
├── plotting-matplotlib
│ ├── customizing
│ │ ├── gapminder-larger-font.png
│ │ ├── gapminder-linear.png
│ │ └── gapminder-log.png
│ └── first-plot
│ │ ├── exercise.png
│ │ └── getting-started.png
├── plotting-vega-altair.md
├── plotting-vega-altair
│ ├── precipitation-on-top-yearmonth.svg
│ ├── precipitation-on-top.svg
│ ├── precipitation-side.svg
│ ├── precipitation-stacked-x.svg
│ ├── precipitation-stacked-y.svg
│ ├── snow-depth-circles.svg
│ ├── snow-depth-color.svg
│ ├── snow-depth-plasma.svg
│ ├── snow-depth.svg
│ ├── temperature-ranges-combined.svg
│ └── temperature-ranges-side.svg
├── productivity.md
├── productivity
│ ├── chatgpt.png
│ └── code-completion.gif
├── profiling.md
├── profiling
│ ├── exercise.png
│ └── exercise.py
├── python.rst
├── quick-reference.rst
├── scipy.rst
├── scripts.rst
├── web-apis.ipynb
├── work-with-data.rst
└── xarray.rst
├── extras
├── data-formats-comparison-array.ipynb
└── data-formats-comparison-tidy.ipynb
├── make.bat
├── requirements.txt
├── resources
├── code
│ └── scripts
│ │ ├── __pycache__
│ │ ├── optionsparser.cpython-38.pyc
│ │ ├── weather_functions.cpython-38.pyc
│ │ └── weather_functions_config.cpython-38.pyc
│ │ ├── optionsparser.py
│ │ ├── out.png
│ │ ├── rain_in_cairo.png
│ │ ├── weather.png
│ │ ├── weather_functions.py
│ │ ├── weather_functions_config.py
│ │ ├── weather_observations.ipynb
│ │ ├── weather_observations.py
│ │ ├── weather_observations_argparse.py
│ │ ├── weather_observations_config.py
│ │ └── weather_options.yml
├── data
│ ├── laureate.csv
│ ├── plotting
│ │ ├── README.md
│ │ ├── exercise-2.csv
│ │ ├── oslo-daily.csv
│ │ ├── oslo-monthly.csv
│ │ ├── tromso-daily.csv
│ │ └── tromso-monthly.csv
│ └── scripts
│ │ ├── weather_cairo.csv
│ │ └── weather_tapiola.csv
└── notebooks
│ ├── plotting-exercise-2.ipynb
│ └── plotting.ipynb
└── software
└── environment.yml
/.github/workflows/sphinx.yml:
--------------------------------------------------------------------------------
1 | # Deploy Sphinx. This could be shorter, but we also do some extra
2 | # stuff.
3 | #
4 | # License: CC-0. This is the canonical location of this file, which
5 | # you may want to link to anyway:
6 | # https://github.com/coderefinery/sphinx-lesson-template/blob/main/.github/workflows/sphinx.yml
7 | # https://raw.githubusercontent.com/coderefinery/sphinx-lesson-template/main/.github/workflows/sphinx.yml
8 |
9 |
10 | name: sphinx
11 | on: [push, pull_request]
12 |
13 | env:
14 | DEFAULT_BRANCH: "master"
15 | # If these SPHINXOPTS are enabled, then be strict about the
16 | # builds and fail on any warnings.
17 | #SPHINXOPTS: "-W --keep-going -T"
18 | GENERATE_PDF: true # to enable, must be 'true' lowercase
19 | GENERATE_SINGLEHTML: true # to enable, must be 'true' lowercase
20 | PDF_FILENAME: lesson.pdf
21 | MULTIBRANCH: true # to enable, must be 'true' lowercase
22 |
23 |
24 | jobs:
25 | build:
26 | name: Build
27 | runs-on: ubuntu-latest
28 | permissions:
29 | contents: read
30 |
31 | steps:
32 | # https://github.com/marketplace/actions/checkout
33 | - uses: actions/checkout@v4
34 | with:
35 | fetch-depth: 0
36 | lfs: true
37 |
38 | # https://github.com/marketplace/actions/setup-python
39 | # ^-- This gives info on matrix testing.
40 | - name: Install Python
41 | uses: actions/setup-python@v4
42 | with:
43 | python-version: '3.11'
44 | cache: 'pip'
45 |
46 | # https://docs.github.com/en/actions/guides/building-and-testing-python#installing-dependencies
47 | # ^-- This gives info on installing dependencies with pip
48 | - name: Install dependencies
49 | run: |
50 | python -m pip install --upgrade pip
51 | pip install -r requirements.txt
52 |
53 | # Debug
54 | - name: Debugging information
55 | env:
56 | ref: ${{github.ref}}
57 | event_name: ${{github.event_name}}
58 | head_ref: ${{github.head_ref}}
59 | base_ref: ${{github.base_ref}}
60 | run: |
61 | echo "github.ref: ${ref}"
62 | echo "github.event_name: ${event_name}"
63 | echo "github.head_ref: ${head_ref}"
64 | echo "github.base_ref: ${base_ref}"
65 | echo "GENERATE_PDF: ${GENERATE_PDF}"
66 | echo "GENERATE_SINGLEHTML: ${GENERATE_SINGLEHTML}"
67 | set -x
68 | git rev-parse --abbrev-ref HEAD
69 | git branch
70 | git branch -a
71 | git remote -v
72 | python -V
73 | pip list --not-required
74 | pip list
75 |
76 |
77 | # Build
78 | - uses: ammaraskar/sphinx-problem-matcher@master
79 | - name: Build Sphinx docs (dirhtml)
80 | # SPHINXOPTS used via environment variables
81 | run: |
82 | make dirhtml
83 | # This fixes broken copy button icons, as explained in
84 | # https://github.com/coderefinery/sphinx-lesson/issues/50
85 | # https://github.com/executablebooks/sphinx-copybutton/issues/110
86 | # This can be removed once these PRs are accepted (but the
87 | # fixes also need to propagate to other themes):
88 | # https://github.com/sphinx-doc/sphinx/pull/8524
89 | # https://github.com/readthedocs/sphinx_rtd_theme/pull/1025
90 | sed -i 's/url_root="#"/url_root=""/' _build/dirhtml/index.html || true
91 |
92 | # singlehtml
93 | - name: Generate singlehtml
94 | if: ${{ env.GENERATE_SINGLEHTML == 'true' }}
95 | run: |
96 | make singlehtml
97 | mv _build/singlehtml/ _build/dirhtml/singlehtml/
98 |
99 | # PDF if requested
100 | - name: Generate PDF
101 | if: ${{ env.GENERATE_PDF == 'true' }}
102 | run: |
103 | pip install https://github.com/rkdarst/sphinx_pyppeteer_builder/archive/refs/heads/main.zip
104 | make pyppeteer
105 | mv _build/pyppeteer/*.pdf _build/dirhtml/${PDF_FILENAME}
106 |
107 | # Stage all deployed assets in _gh-pages/ for simplicity, and to
108 | # prepare to do a multi-branch deployment.
109 | - name: Copy deployment data to _gh-pages/
110 | if: ${{ github.event_name == 'push' }}
111 | run:
112 | rsync -a _build/dirhtml/ _gh-pages/
113 |
114 | # Use gh-pages-multibranch to multiplex different branches into
115 | # one deployment. See
116 | # https://github.com/coderefinery/gh-pages-multibranch
117 | - name: gh-pages multibranch
118 | uses: coderefinery/gh-pages-multibranch@main
119 | if: ${{ github.event_name == 'push' && env.MULTIBRANCH == 'true' }}
120 | with:
121 | directory: _gh-pages/
122 | default_branch: ${{ env.DEFAULT_BRANCH }}
123 | publish_branch: gh-pages
124 |
125 | # Add the .nojekyll file
126 | - name: nojekyll
127 | if: ${{ github.event_name == 'push' }}
128 | run: |
129 | touch _gh-pages/.nojekyll
130 |
131 | # Save artifact for the next step.
132 | - uses: actions/upload-artifact@v4
133 | if: ${{ github.event_name == 'push' }}
134 | with:
135 | name: gh-pages-build
136 | path: _gh-pages/
137 |
138 | # Deploy in a separate job so that write permissions are restricted
139 | # to the minimum steps.
140 | deploy:
141 | name: Deploy
142 | runs-on: ubuntu-latest
143 | needs: build
144 | # This if can't use the env context - find better way later.
145 | if: ${{ github.event_name == 'push' }}
146 | permissions:
147 | contents: write
148 |
149 | steps:
150 | - uses: actions/download-artifact@v4
151 | if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }}
152 | with:
153 | name: gh-pages-build
154 | path: _gh-pages/
155 |
156 | # As of 2023, we could publish to pages via a Deployment. This
157 | # isn't done yet to give it time to stabilize (out of beta), and
158 | # also having a gh-pages branch to check out is rather
159 | # convenient.
160 |
161 | # Deploy
162 | # https://github.com/peaceiris/actions-gh-pages
163 | - name: Deploy
164 | uses: peaceiris/actions-gh-pages@v3
165 | if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }}
166 | with:
167 | publish_branch: gh-pages
168 | github_token: ${{ secrets.GITHUB_TOKEN }}
169 | publish_dir: _gh-pages/
170 | force_orphan: true
171 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /_build
2 | /venv
3 | .ipynb_checkpoints/
4 | .vscode
5 | catfacts.jsonl
6 | jupyter_execute/
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Attribution 4.0
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = content
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
22 | # Live reload site documents for local development
23 | livehtml:
24 | sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
25 |
--------------------------------------------------------------------------------
/content/a_list.dot:
--------------------------------------------------------------------------------
1 | strict digraph a_list {
2 | graph [compound=true];
3 |
4 | node [style = filled, color=cyan];
5 |
6 | a_list [label="Variable a_list (lvalue)", color=gold];
7 | aobj [label="PyObject a_list"];
8 | one [label="PyObject 1"];
9 | hello [label="PyObject hello"];
10 | oneptwo [label="PyObject 1.2"];
11 |
12 | a_list -> aobj;
13 |
14 |
15 |
16 | subgraph cluster_adata {
17 | label = "Data array for a_list PyObject";
18 | color = aquamarine;
19 | style = filled;
20 |
21 | adata_0 [label="element [0]"];
22 | adata_1 [label="element [1]"];
23 | adata_2 [label="element [2]"];
24 | }
25 |
26 | adata_0 -> one;
27 | adata_1 -> hello;
28 | adata_2 -> oneptwo;
29 |
30 |
31 | aobj -> adata_1 [lhead=cluster_adata];
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/content/a_list.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
83 |
--------------------------------------------------------------------------------
/content/binder.rst:
--------------------------------------------------------------------------------
1 | Binder
2 | ======
3 |
4 | .. questions::
5 |
6 |    - Why is sharing code alone sometimes not sufficient?
7 | - How to share a computational environment?
8 | - What is Binder?
9 | - How to binderize my Python repository?
10 | - How to publish my Python repository?
11 |
12 | .. objectives::
13 |
14 | - Learn about reproducible computational environments.
15 | - Learn to create and share custom computing environments with Binder.
16 | - Learn to get a DOI from Zenodo for a repository.
17 |
18 |
19 | Why is it sometimes not enough to share your code?
20 | --------------------------------------------------
21 |
22 | .. image:: img/binder/python_unmasked.jpg
23 |
24 |
25 | Exercise 1
26 | ~~~~~~~~~~
27 |
28 | .. challenge:: Binder-1: Discuss better strategies than only code sharing (10 min)
29 |
30 | Lea is a PhD student in computational biology and after 2 years of intensive
31 | work, she is finally ready to publish her first paper. The code she has used
32 | for analyzing her data is available on GitHub but her supervisor who is an
33 | advocate of open science told her that sharing code is not sufficient.
34 |
35 | **Why is it possibly not enough to share "just" your code?
36 | What problems can you anticipate 2-5 years from now?**
37 |
38 | We form small groups (4-5 persons) and discuss in groups. If the workshop is
39 | online, each group will join a breakout room.
40 | If joining a group is not possible or practical, we use the shared document
41 | to discuss this collaboratively.
42 |
43 |    Each group writes a summary (bullet points) of the discussion in the workshop
44 | shared document (the link will be provided by your instructors).
45 |
46 |
47 | Sharing a computing environment with Binder
48 | -------------------------------------------
49 |
50 | `Binder <https://mybinder.org>`__ allows you to create
51 | custom computing environments that can be shared and used by many remote users.
52 | It uses `repo2docker `__ to
53 | create a container image (`docker `__ image) of a
54 | project using information contained in included configuration files.
55 |
56 | Repo2docker is a standalone package that you can install locally on your laptop
57 | but an `online Binder `__ service is freely available.
58 | This is what we will be using in the tutorial.
59 |
60 | The main objective of this exercise is to learn to fork a repository and add a
61 | requirement file to share the computational environment with Binder.
62 |
63 | .. image:: https://opendreamkit.org/public/images/use-cases/reproducible_logbook.png
64 |
65 | Credit: `Juliette Taka, Logilab and the OpenDreamKit project (2017) `_
66 |
67 |
68 | Binder exercise/demo
69 | ~~~~~~~~~~~~~~~~~~~~
70 |
71 | In an earlier episode (Data visualization with Matplotlib) we have created this notebook:
72 |
73 | .. code-block:: python
74 |
75 | import pandas as pd
76 | import matplotlib.pyplot as plt
77 |
78 | url = "https://raw.githubusercontent.com/plotly/datasets/master/gapminder_with_codes.csv"
79 | data = pd.read_csv(url)
80 | data_2007 = data[data["year"] == 2007]
81 |
82 | fig, ax = plt.subplots()
83 |
84 | ax.scatter(x=data_2007["gdpPercap"], y=data_2007["lifeExp"], alpha=0.5)
85 |
86 | ax.set_xscale("log")
87 |
88 | ax.set_xlabel("GDP (USD) per capita")
89 | ax.set_ylabel("life expectancy (years)")
90 |
91 | We will now first share it via `GitHub `__ "statically",
92 | then using `Binder `__.
93 |
94 | .. challenge:: Binder-2: Exercise/demo: Make your notebooks reproducible by anyone (15 min)
95 |
96 | Instructor demonstrates this. **This exercise (and all following)
97 | requires git/GitHub knowledge and accounts, which wasn't a
98 | prerequisite of this course. Thus, this is a demo (and might even
99 | be too fast for you to type-along). Watch the video if you
100 | are reading this later on**:
101 |
102 | - Creates a GitHub repository
103 | - Uploads the notebook file
104 | - Then we look at the statically rendered version of the notebook on GitHub
105 | - Create a ``requirements.txt`` file which contains:
106 |
107 | .. code-block:: none
108 |
109 | pandas==1.2.3
110 | matplotlib==3.4.2
111 |
112 | - Commit and push also this file to your notebook repository.
113 | - Visit https://mybinder.org and copy paste the code under "Copy the text below ..." into your `README.md`:
114 |
115 | .. image:: img/binder/binder.jpg
116 |
117 | - Check that your notebook repository now has a "launch binder"
118 | badge in your `README.md` file on GitHub.
119 | - Try clicking the button and see how your repository is launched
120 | on Binder (can take a minute or two). Your notebooks can now be explored and executed in the cloud.
121 | - Enjoy being fully reproducible!
122 |
123 |
124 | How can I get a DOI from Zenodo?
125 | ---------------------------------
126 |
127 | `Zenodo <https://zenodo.org>`__ is a general purpose open-access
128 | repository built and operated by `CERN `__ and `OpenAIRE
129 | `__ that allows researchers to archive and get a
130 | `Digital Object Identifier (DOI) `__ to data that they
131 | share.
132 |
133 | .. challenge:: Binder-3: Link a Github repository with Zenodo (optional)
134 |
135 | **Everything you deposit on Zenodo is meant to be kept (long-term archive).
136 | Therefore we recommend to practice with the Zenodo "sandbox" (practice/test area)
137 | instead:** https://sandbox.zenodo.org
138 |
139 | 1. **Link GitHub with Zenodo**:
140 |
141 | - Go to https://sandbox.zenodo.org (or to https://zenodo.org for the real upload later, after practicing).
142 | - Log in to Zenodo with your GitHub account. Be aware that you may need to
143 | authorize Zenodo application (Zenodo will redirect you back to GitHub for
144 | Authorization).
145 | - Choose the repository webhooks options.
146 | - From the drop-down menu next to your email address at the top of the page, select GitHub.
147 | - You will be presented with a list of all your Github repositories.
148 |
149 | 2. **Archiving a repo**:
150 |
151 | - Select a repository you want to archive on Zenodo.
152 |    - Toggle the "on" button next to the repository you need to archive.
153 | - Click on the Repo that you want to reserve.
154 | - Click on Create release button at the top of the page. Zenodo will redirect you back to GitHub’s repo page to generate a release.
155 |
156 | 3. **Trigger Zenodo to Archive your repository**
157 |
158 | - Go to GitHub and create a release. Zenodo will automatically download a .zip-ball of each new release and register a DOI.
159 | - If this is the first release of your code then you should give it a
160 | version number of v1.0.0. Add description for your release then click the
161 | Publish release button.
162 | - Zenodo takes an archive of your GitHub repository each time you create a new Release.
163 |
164 | 4. **To ensure that everything is working**:
165 |
166 | - Go to https://zenodo.org/account/settings/github/ (or the corresponding
167 | sandbox at https://sandbox.zenodo.org/account/settings/github/), or the
168 | Upload page (https://zenodo.org/deposit), you will find your repo is
169 | listed.
170 |    - Click on the repo, Zenodo will redirect you to a page that contains a DOI for your repo with the information that you added to the repo.
171 | - You can edit the archive on Zenodo and/or publish a new version of your software.
172 | - It is recommended that you add a description for your repo and fill in other metadata in the edit page. Instead of editing metadata
173 | manually, you can also add a ``.zenodo.json`` or a ``CITATION.cff`` file to your repo and Zenodo will infer the metadata from this file.
174 | - Your code is now published on a Github public repository and archived on Zenodo.
175 | - Update the README file in your repository with the newly created zenodo badge.
176 |
177 |
178 | Create a Binder link for your Zenodo DOI
179 | ----------------------------------------
180 |
181 | Rather than specifying a GitHub repository when launching binder, you can instead use a Zenodo DOI.
182 |
183 | .. challenge:: Binder-4: Link Binder with Zenodo (10 min)
184 |
185 | We will be using an existing Zenodo DOI `10.5281/zenodo.3886864 `_ to start Binder:
186 |
187 |    - Go to `https://mybinder.org <https://mybinder.org>`__ and fill information using Zenodo DOI (as shown on the animation below):
188 |
189 | .. image:: https://miro.medium.com/max/1050/1*xOABVY2hNtVmjV5-LXreFw.gif
190 |
191 | - You can also get a Binder badge and update the README file in the
192 | repository. It is good practice to add both the Zenodo badge and the
193 | corresponding Binder badge.
194 |
195 | .. keypoints::
196 |
197 |    - It is easy to share reproducible computational environments
198 | - Binder provides a way for anyone to test and run code - without
199 | you needing to set up a dedicated server for it.
200 | - Zenodo provides permanent archives and a DOI.
201 |
--------------------------------------------------------------------------------
/content/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'Python for Scientific Computing'
21 | copyright = '2020-2024, The contributors'
22 | author = 'The contributors'
23 | github_user = 'AaltoSciComp'
24 | github_repo_name = 'python-for-scicomp' # auto-detected from dirname if blank
25 | github_version = 'master/content/' # with trailing slash
26 |
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | highlight_language = 'python3'
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = [
36 | 'sphinx_lesson',
37 | 'sphinx_rtd_theme_ext_color_contrast',
38 | 'sphinx.ext.todo',
39 | 'sphinx.ext.intersphinx',
40 | 'sphinx.ext.mathjax',
41 | 'sphinx_aaltoscicomp_branding',
42 | 'sphinxext.opengraph',
43 | 'sphinx_thebe',
44 | ]
45 | myst_enable_extensions = ['colon_fence']
46 |
47 | thebe_config = {
48 | "selector": "div.highlight"
49 | }
50 |
51 | nb_execution_mode = "off"
52 |
53 | ogp_site_name = "Python for Scientific Computing"
54 | ogp_site_url = 'https://aaltoscicomp.github.io/python-for-scicomp/'
55 | import datetime
56 | if datetime.date.today() < datetime.date(2022,12,15):
57 | ogp_image = 'https://www.aalto.fi/sites/g/files/flghsv161/files/styles/o_914w_ah_n/public/2022-11/PFSC22_v2.png'
58 | ogp_image_alt = 'Python for Scientific Computing course logo with date of 22-25/11/2022, twitch.tv/coderefinery, and partner logos'
59 |
60 | copybutton_exclude = '.linenos, .gp'
61 |
62 | import os
63 | if (
64 | 'GITHUB_ACTION' in os.environ
65 | and os.environ.get('GITHUB_REPOSITORY', '').lower() == 'aaltoscicomp/python-for-scicomp'
66 | and os.environ.get('GITHUB_REF') == 'refs/heads/master'
67 | ):
68 | html_js_files = [
69 | ('https://plausible.cs.aalto.fi/js/script.js', {"data-domain": "aaltoscicomp.github.io/python-for-scicomp", "defer": "defer"}),
70 | ]
71 |
72 |
73 | # Add any paths that contain templates here, relative to this directory.
74 | #templates_path = ['_templates']
75 |
76 | # List of patterns, relative to source directory, that match files and
77 | # directories to ignore when looking for source files.
78 | # This pattern also affects html_static_path and html_extra_path.
79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'venv', 'jupyter_execute']
80 |
81 |
82 | # -- Options for HTML output -------------------------------------------------
83 |
84 | # The theme to use for HTML and HTML Help pages. See the documentation for
85 | # a list of builtin themes.
86 | #
87 | html_theme = 'sphinx_rtd_theme'
88 |
89 | # Add any paths that contain custom static files (such as style sheets) here,
90 | # relative to this directory. They are copied after the builtin static files,
91 | # so a file named "default.css" will overwrite the builtin "default.css".
92 | #html_static_path = ['_static']
93 |
94 |
95 | # HTML context:
96 | from os.path import dirname, realpath, basename
97 | html_context = {'display_github': True,
98 | 'github_user': github_user,
99 | # Auto-detect directory name. This can break, but
100 | # useful as a default.
101 | 'github_repo': github_repo_name or basename(dirname(realpath(__file__))),
102 | 'github_version': github_version,
103 | }
104 |
105 |
106 | intersphinx_mapping = {
107 | 'python': ('https://docs.python.org/3', None),
108 | 'numpy': ('https://numpy.org/doc/stable', None),
109 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None),
110 | 'matplotlib': ('https://matplotlib.org/stable', None),
111 | 'requests': ('https://requests.readthedocs.io/en/latest/', None),
112 | }
113 |
--------------------------------------------------------------------------------
/content/exercises.md:
--------------------------------------------------------------------------------
1 | # List of exercises
2 |
3 | ## Full list
4 |
5 | This is a list of all exercises and solutions in this lesson, mainly
6 | as a reference for helpers and instructors. This list is
7 | automatically generated from all of the other pages in the lesson.
8 | Any single teaching event will probably cover only a subset of these,
9 | depending on their interests.
10 |
11 | ```{exerciselist}
12 | ```
13 |
--------------------------------------------------------------------------------
/content/format_comparison_array.csv:
--------------------------------------------------------------------------------
1 | File format,File size [MB],Write time [ms],Read time [ms],Data matches exactly
2 | CSV,23.8,690,294,True
3 | npy,7.63,13.8,2.72,True
4 | HDF5,7.63,27,3.97,True
5 | NetCDF4,7.64,28.8,12.2,True
6 |
--------------------------------------------------------------------------------
/content/format_comparison_tidy.csv:
--------------------------------------------------------------------------------
1 | File format,File size [MB],Write time [ms],Read time [ms],Data matches exactly
2 | CSV,4.57,360,81.2,False
3 | Feather,2.2,12.9,6.67,True
4 | Parquet,1.82,35.1,8.96,True
5 | HDF5,4.89,41.7,29.6,True
6 | NetCDF4,6.9,92.9,74.2,True
7 |
--------------------------------------------------------------------------------
/content/guide.rst:
--------------------------------------------------------------------------------
1 | Instructor's guide
2 | ==================
3 |
4 | Learner personas
5 | ----------------
6 |
7 | A is an early career PhD researcher who has been using Python a bit,
8 | but is not sure what they know or don't know. They want to be able to
9 | do their research more efficiently and make sure that they are using
10 | the right tools. A may know that numpy exists, etc. and could
11 | theoretically read some about it themselves, but aren't sure if they
12 | are going in the right direction.
13 |
14 | A2 can use numpy and pandas, but has learned little bits here and
15 | there and hasn't had a comprehensive introduction. They want to
16 | ensure they are using best practices. (Baseline of high-level
17 | packages)
18 |
19 | B is a mid-to-late undergraduate student who has used Python in some
20 | classes. They have possibly learned the syntax and enough to use it
21 | in courses, but in a course-like manner where they are expected to
22 | create everything themselves.
23 |
24 |
25 | Prerequisites:
26 | - Knowing basic Python syntax
27 | - Watch the command line crash course, if you aren't familiar.
28 |
29 | Not prerequisites:
30 | - Any external libraries, e.g. numpy
31 | - Knowing how to make scripts or use Jupyter
32 |
33 |
34 |
35 | About each section
36 | ------------------
37 |
38 | In general, "Python for Scientific Computing" could be a multi-year
39 | course. We can't even pretend to really teach even a small fraction
40 | of it. We can, however, introduce people to things that can very
41 | easily be missed in the typical academic career path.
42 |
43 | * **Python intro:** We can't really replace a Python tutorial, but
44 | here we try to outline some of the main points. We don't go over
45 | this in the course.
46 |
47 | * **Jupyter:** Jupyter is somewhat useful, but the main reason we go
48 | over it is that it provides a convenient user interface for the
49 | other programming lessons (it's easier to spend a bit of time with
50 | Jupyter than expect people to be able to use some
51 | editor/IDE/shell/etc). So, we do start from the beginning, so that
52 | people can do the other lessons, but also try to teach some advanced
53 | tips and tricks.
54 |
55 | * **Numpy:** The basic of much of the rest of scipy, so we need to
56 | cover it. We try to get the main principles out, but if someone
57 | already knows it this can be a bit boring. We try to make sure
58 | everyone comes out with an appreciation for vectorization and
59 | broadcasting.
60 |
61 | * **Pandas:** A lot of similar goals to the Numpy section, especially
62 | the concepts behind Dataframes that one needs to know in order to
63 | read other documentation.
64 |
65 | * **Visualization:** Matplotlib is getting a bit old, but is still the
66 |   backbone of other plotting packages.  We try to get across the ideas
67 | of the matplotlib API that can be seen in other packages and the
68 | importance of scripted plots.
69 |
70 | * **Data formats:** Input/output/storage is a common task, and can
71 |   easily either be a bottleneck or a huge mess.  This lesson tries to
72 | show some best practices with data formats and, as usual, get the
73 | idea to not "do it yourself". Pandas is used as a common framework,
74 | but we should point out there are plenty of other options.
75 |
76 | * **Scripts:** The most important lesson here is to break out of
77 | Jupyter/run buttons of editors. If you can't make actual programs
78 | with an actual interface, you can't scale up.
79 |
80 | * This is the first lesson to introduce the command line. We
81 | recommend being as simple as possible: at least demonstrate the
82 | JupyterLab terminal and discuss the bigger picture behind what it
83 | means and why.
84 |
85 |   * This is also the first lesson to use a non-Jupyter code editor.  We
86 | recommend again being simple: use the JupyterLab code editor to
87 | start off, and carefully explain what is going on.
88 |
89 | * **Scipy:** We don't cover much here (this is super short), but the
90 | point is scipy exists and the concept of wrapping existing C/fortran
91 | libraries and so on.
92 |
93 | * **Library ecosystem:** This was an overview of the types of packages
94 | available in the "scipy ecosystem", which is a large and ill-defined
95 | thing. But there is another point: choosing what to use. Do you
96 | trust a half-done thing published on someone's personal webpage? If
97 | it's on Github? How do you make your code more reusable? When
98 | coming from academic courses, you get a "build it yourself" idea,
99 | which isn't sustainable in research.
100 |
101 | * **Parallel programming:**
102 |
103 | * **Dependencies:** The main point here is environments, another thing
104 | you often don't learn in courses.
105 |
106 | * There is a lot of material here. Consider what you will demo,
107 | what will be done as exercises, and what is advanced/optional.
108 | However, it is the fourth-day lesson that is most interactive, so
109 |     it is OK if it takes a while to go through everything.
110 |
111 | * If someone else installs Anaconda for a user (e.g. admin-managed
112 | laptop), the conda environment creations (with ``--name``,
113 | possibly with ``--prefix`` too?) may not work. Be prepared for
114 | this and mention it. You don't need to solve the problem but
115 | acknowledge that the lesson becomes a demo. The virtualenv part
116 | should hopefully work for them.
117 |
118 | * **Binder:** Binder exists and can help make code
119 | reproducible/reusable by others.
120 |
121 | * **Packaging:** How to make your code reusable by others. By the
122 | time we get here, people are tired and the topics get involved. We
123 | more explicitly say "you might want to watch and take this as a
124 | demo".
125 |
126 |
--------------------------------------------------------------------------------
/content/img/binder/binder.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/binder/binder.jpg
--------------------------------------------------------------------------------
/content/img/binder/python_unmasked.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/binder/python_unmasked.jpg
--------------------------------------------------------------------------------
/content/img/installation/anaconda-navigator-jupyterlab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/anaconda-navigator-jupyterlab.png
--------------------------------------------------------------------------------
/content/img/installation/anaconda-prompt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/anaconda-prompt.png
--------------------------------------------------------------------------------
/content/img/installation/jupyterlab-notebook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/jupyterlab-notebook.png
--------------------------------------------------------------------------------
/content/img/installation/jupyterlab-terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/jupyterlab-terminal.png
--------------------------------------------------------------------------------
/content/img/jupyter/main-ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/jupyter/main-ui.png
--------------------------------------------------------------------------------
/content/img/jupyter/notebook-ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/jupyter/notebook-ui.png
--------------------------------------------------------------------------------
/content/img/numpy-advanced/02_views.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
337 |
--------------------------------------------------------------------------------
/content/img/pandas/01_table_dataframe.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
263 |
--------------------------------------------------------------------------------
/content/img/pandas/tidy_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/pandas/tidy_data.png
--------------------------------------------------------------------------------
/content/img/xarray/xarray_1d_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_1d_plot.png
--------------------------------------------------------------------------------
/content/img/xarray/xarray_2d_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_2d_plot.png
--------------------------------------------------------------------------------
/content/img/xarray/xarray_dataset_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_dataset_image.png
--------------------------------------------------------------------------------
/content/img/xarray/xarray_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_hist.png
--------------------------------------------------------------------------------
/content/index.rst:
--------------------------------------------------------------------------------
1 | ===============================
2 | Python for Scientific Computing
3 | ===============================
4 |
5 | .. admonition:: Attending the course 5-7 November, 2024?
6 |
7 | `See the course page here
8 | `__
9 | and watch at https://twitch.tv/coderefinery.
10 | Whether you are or aren't, the course material is below. Videos
11 | will appear in `this playlist `__ (Last year's videos: `playlist `__).
12 |
13 |
14 | Python is a modern, object-oriented programming language, which has
15 | become popular in several areas of software development. This course
16 | discusses how Python can be utilized in scientific computing. The
17 | course starts by introducing some of the main Python tools for
18 | computing: Jupyter for interactive analysis, NumPy and SciPy for
19 | numerical analysis, Matplotlib for visualization, and so on. In
20 | addition, it talks about *how* python is used:
21 | related scientific libraries, reproducibility, and the broader
22 | ecosystem of science in Python, because your work is more than the raw
23 | code you write.
24 |
25 | This course (like any course) can't teach you Python... it can show
26 | you some examples, let you see how experts do things, and prepare you
27 | to learn yourself as you need to.
28 |
29 | .. _prerequisites:
30 |
31 | .. prereq::
32 |
33 | - Knowing basic Python syntax. We assume that you can do some
34 | Python programming, but not much more than that. We don't cover
35 | standard Python programming. `Here is a short course on basic Python
36 | syntax, with further references `__.
37 | - Watch or read the `command line crash course
38 | `__, if you aren't
39 | familiar.
40 | - You should be able to use a text editor to edit files.
41 | - The :doc:`software installation ` described below
42 | (basically, anaconda).
43 |
44 | These are not prerequisites:
45 |
46 | - Any external libraries, e.g. numpy
47 | - Knowing how to make scripts or use Jupyter
48 |
49 |
50 | .. admonition:: Videos and archived Q&A
51 |
52 | Videos and material from past instances:
53 |
54 | * 2021: `this YouTube playlist
55 | `__.
56 | * 2022: `here
57 | `__,
58 | Q&A: `days 1-2
59 | `__, `days 3-4
60 | `__
61 |
62 | * 2023: `Videos
63 | `__
64 |
65 | * 2024 (Please contact us if you would like to help to process the videos): `Videos `__
66 |
67 |
68 | .. csv-table::
69 | :widths: auto
70 | :delim: ;
71 |
72 | (prereq) ; :doc:`python`
73 | 30 min ; :doc:`jupyter`
74 | 60 min ; :doc:`numpy` or :doc:`numpy-advanced`
75 | 60 min ; :doc:`pandas`
76 | 30 min ; :doc:`xarray`
77 | 60 min ; :doc:`plotting-matplotlib`
78 | 60 min ; :doc:`plotting-vega-altair`
79 | 30 min ; :doc:`work-with-data`
80 | 60 min ; :doc:`scripts`
81 | 40 min ; :doc:`profiling`
82 | 20 min ; :doc:`productivity`
83 | 30 min ; :doc:`web-apis`
84 | 15 min ; :doc:`scipy`
85 | 30 min ; :doc:`libraries`
86 | 45 min ; :doc:`parallel`
87 | 45 min ; :doc:`dependencies`
88 | 30 min ; :doc:`binder`
89 | 60 min ; :doc:`packaging`
90 |
91 |
92 | .. toctree::
93 | :maxdepth: 1
94 | :caption: The lesson
95 | :hidden:
96 |
97 | python
98 | jupyter
99 | numpy
100 | numpy-advanced
101 | pandas
102 | xarray
103 | plotting-matplotlib
104 | plotting-vega-altair
105 | work-with-data
106 | scripts
107 | profiling
108 | productivity
109 | scipy
110 | libraries
111 | dependencies
112 | binder
113 | parallel
114 | packaging
115 | web-apis
116 |
117 | .. toctree::
118 | :maxdepth: 1
119 | :caption: Reference
120 |
121 | installation
122 | quick-reference
123 | exercises
124 | guide
125 | data-formats
126 |
127 |
128 | .. _learner-personas:
129 |
130 | Who is the course for?
131 | ======================
132 |
133 | The course is targeted towards these learner personas:
134 |
135 | * A is an early-career PhD researcher who has been using Python a bit,
136 | but is not sure what they know or don't know. They want to be able
137 | to do their research more efficiently and make sure that they are
138 | using the right tools. A may know that numpy exists, etc. and could
139 | theoretically read some about it themselves, but aren't sure if they
140 | are going in the right direction.
141 |
142 | * A2 can use numpy and pandas, but has learned little bits here and
143 | there and hasn't had a comprehensive introduction. They want to
144 | ensure they are using best practices. (Baseline of high-level
145 | packages)
146 |
147 | * B is a mid-to-late undergraduate student who has used Python in some
148 | classes. They have possibly learned the syntax and enough to use it
149 | in courses, but in a course-like manner where they are expected to
150 | create everything themselves: they want to know how to reuse tools
151 | that already exist.
152 |
153 |
154 | Motivation
155 | ==========
156 |
157 | Why Python
158 | ----------
159 |
160 | Python has become popular, largely due to good reasons. It's very easy
161 | to get started, there's lots of educational material, a huge amount of
162 | libraries for doing everything imaginable. Particularly in the
163 | scientific computing space, there is the Numpy, Scipy, and matplotlib
164 | libraries which form the basis of almost everything. Numpy and Scipy
165 | are excellent examples of using Python as a glue language, meaning to
166 | glue together battle-tested and well performing code and present them
167 | with an easy to use interface. Also machine learning and deep
168 | learning frameworks have embraced python as the glue language of
169 | choice. And finally, Python is open source, meaning that anybody can
170 | download and install it on their computer, without having to bother
171 | with acquiring a license or such. This makes it easier to distribute
172 | your code e.g. to collaborators in different universities.
173 |
174 |
175 | Why not Python for Scientific Computing
176 | ---------------------------------------
177 |
178 | While Python is extremely popular in scientific computing today, there
179 | are certainly things better left to other tools.
180 |
181 | - Implementing performance-critical kernels. Python is a **very**
182 | slow language, which often doesn't matter if you can offload the
183 | heavy lifting to fast compiled code, e.g. by using Numpy array
184 | operations. But if what you're trying to do isn't *vectorizable*
185 | then you're out of luck. An alternative to Python, albeit much less
186 | mature and with a smaller ecosystem, but which provides very fast
187 | generated code, is *Julia*.
188 |
189 | - Creating libraries that can be called from other languages. In this
190 | case you'll often want to create a library with a C interface, which
191 | can then be called from most languages. Suitable languages for this
192 | sort of task, depending on what you are doing, could be Rust, C,
193 | C++, or Fortran.
194 |
195 | - You really like static typing, or functional programming
196 | approaches. *Haskell* might be what you're looking for.
197 |
198 |
199 | Python 2 vs Python 3
200 | --------------------
201 |
202 | Python 3.0 came out in September 2008 and was just slightly different
203 | enough that most code had to be changed, which meant that many
204 | projects ignored it for many years. It was about 3-5 years until the
205 | differences were reduced enough (and better transition plans came out,
206 | so that it was reasonable to use a single code for both versions) that
207 | it became more and more adopted in the scientific community. Python 2
208 | finally became unsupported in 2020, and by now Python 3 is the de facto
209 | standard.
210 |
211 | At this point, all new projects should use Python 3, and existing
212 | actively developed projects should be upgraded to use it. Still, you
213 | might find some old unmaintained tools that are only compatible with
214 | Python 2.
215 |
216 |
217 |
218 | Credits
219 | =======
220 |
221 | This course was originally designed by Janne Blomqvist.
222 |
223 | In 2020 it was completely redesigned by a team of the following:
224 |
225 | * Authors: Radovan Bast, Richard Darst, Anne Fouilloux, Thor Wikfeldt, ...
226 | * Editor:
227 | * Testers and advisors: Enrico Glerean
228 |
229 | We follow The Carpentries Code of Conduct: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html
230 |
231 |
232 | See also
233 | ========
234 |
235 | * `High Performance Data Analytics in Python
236 | `__ is a logical follow-up to
237 | this lesson that goes more in-depth to tools of high-performance
238 | and large-scale Python.
239 |
--------------------------------------------------------------------------------
/content/installation.rst:
--------------------------------------------------------------------------------
1 | Software installation
2 | =====================
3 |
4 | This course is interactive and demonstrates many different tools.
5 | Thus, even beyond Python, extra software (Python libraries) needs to
6 | be installed. This page contains the instructions.
7 |
8 | **Once the course starts, we don't have time to stop for installing
9 | software.**
10 |
11 | Please make sure before the course that you have all the required
12 | software installed or some other way access to it. For example, the
13 | workshop could be done with a remote Jupyter server, as long as you
14 | can use the terminal from the Jupyter (you need to be able to access
15 | the command line for some lessons).
16 |
17 | .. admonition:: Do you need help?
18 | :class: important
19 |
20 | Participants from a partner institution are invited to installation help
21 | sessions. (Hint: ask your institution to become a partner if it
22 | isn't already!)
23 |
24 | Otherwise, if you need installation help, show this page to someone
25 | around you and they can probably help. These are relatively
26 | standard tools.
27 |
28 | Don't be afraid to ask for help. Installing scientific software is
29 | *harder than it should be* and it helps to have someone guide you
30 | through it.
31 |
32 | .. highlight:: console
33 |
34 |
35 |
36 | Python
37 | ------
38 |
39 | We expect you to have a working Python installation with some common
40 | libraries. **We currently recommend Miniforge, which includes the base
41 | installation and provides packages through a different, freely usable channel.** You can
42 | explore the options in the tabs below.
43 |
44 | .. admonition:: Python, conda, anaconda, miniforge, etc?
45 | :class: dropdown
46 |
47 | Unfortunately there's a lot of jargon. We'll go over this in the
48 | course but here is a crash course:
49 |
50 | * **Python** is a programming language very commonly used in
51 | science, it's the topic of this course.
52 | * **Conda** is a package manager: it allows distributing and
53 | installing packages, and is designed for complex scientific
54 | code.
55 | * **Mamba** is a re-implementation of Conda to be much faster with
56 | resolving dependencies and installing things.
57 | * An **Environment** is a self-contained collection of packages
58 | which can be installed separately from others. They are used so
59 | each project can install what it needs without affecting others.
60 | * **Anaconda** is a commercial distribution of Python+Conda+many
61 | packages that all work together. It used to be freely usable for
62 | research, but since ~2023-2024 it's more limited. Thus, we don't
63 | recommend it (even though it has a nice graphical user interface).
64 | * **conda-forge** is another channel of distributing packages that
65 | is maintained by the community, and thus can be used by anyone.
66 | (Anaconda's parent company also hosts conda-forge packages)
67 | * **miniforge** is a distribution of conda pre-configured for
68 | conda-forge. It operates via the command line.
69 | * **miniconda** is a distribution of conda pre-configured to use
70 | the Anaconda channels.
71 |
72 | .. tabs::
73 |
74 | .. group-tab:: Miniforge
75 |
76 | This is our recommended method - it can be used for any purpose
77 | and makes a strong base for the future.
78 |
79 | Follow the `instructions on the miniforge web page
80 | `__. This installs
81 | the base, and from here other packages can be installed.
82 |
83 | ..
84 | You can read how to install miniconda from the `CodeRefinery
85 | installation instructions
86 | `__.
87 |
88 | Miniforge uses the command line - this gives you the most power
89 | but can feel unfamiliar. See the `command line crash course
90 | `__ for an intro.
91 |
92 | .. group-tab:: Anaconda
93 |
94 | Anaconda is easier to get started with, but may be more limiting
95 | in the future. The Anaconda Navigator provides a graphical
96 | interface to most of what you would need.
97 |
98 | The `Anaconda Python distribution
99 | `__ conveniently packages
100 | everything, but its license does not allow large organizations to
101 | use it for free (and has actually been enforced against
102 | universities).
103 |
104 | Note the license of Anaconda - there were recently issues with
105 | it being used by large universities for free, and this is not
106 | yet fully resolved.
107 |
108 | .. group-tab:: Other options
109 |
110 | There are many ways to install Python. Other methods can work,
111 | as long as you can install the libraries from the
112 | ``environment.yml`` file mentioned in the Miniforge
113 | instructions.
114 |
115 | We don't currently provide a ``requirements.txt`` for installing
116 | the required packages without Conda/Mamba, though.
117 |
118 |
119 |
120 | Starting Python
121 | ---------------
122 |
123 | You need to start Python in a way that activates conda/mamba.
124 |
125 | .. tabs::
126 |
127 | .. group-tab:: Miniforge
128 |
129 | .. tabs::
130 |
131 | .. group-tab:: Linux / MacOS
132 |
133 | Linux/MacOS: Each time you start a new command line terminal,
134 | you can activate Miniforge by running the command below. This is needed so that
135 | Miniforge is usable wherever you need, but doesn't affect any
136 | other software on your computer (this is not needed if you
137 | choose "Do you wish to update your shell profile to
138 | automatically initialize conda?", but then it will always be
139 | active)::
140 |
141 | $ source ~/miniforge3/bin/activate
142 |
143 | .. group-tab:: Windows
144 |
145 | Windows: Use the "Miniforge Prompt" to start Miniforge. This
146 | will set up everything so that ``conda`` and ``mamba`` are
147 | available.
148 |
149 | .. group-tab:: Anaconda
150 |
151 | The `Anaconda Navigator
152 | `__ provides a convenient
153 | way to access the software. It can be installed from that page.
154 |
155 |
156 | .. group-tab:: Other options
157 |
158 | You are on your own here.
159 |
160 |
161 | Python for SciComp software environment
162 | ---------------------------------------
163 |
164 | Once Python and conda/mamba are installed, you can use it to install
165 | an environment. An **environment** is a self-contained set of extra
166 | libraries - different projects can use different environments to not
167 | interfere with each other. This environment will have all of the
168 | software needed for this particular course.
169 |
170 | .. tabs::
171 |
172 | .. group-tab:: Miniforge
173 |
174 | This `environment file
175 | `__
176 | contains all packages needed for the course, and can be
177 | installed as shown below. The following command will install an
178 | environment named ``python-for-scicomp`` (there may be lots of
179 | warning messages: this is OK if it still goes through):
180 |
181 | .. tabs::
182 |
183 | .. group-tab:: Linux / MacOS
184 |
185 | ::
186 |
187 | $ mamba env create -n python-for-scicomp -f https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/master/software/environment.yml
188 |
189 | .. group-tab:: Windows
190 |
191 | ::
192 |
193 | $ mamba env create -n python-for-scicomp -f https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/master/software/environment.yml
194 |
195 | Each time you start a new command line, you need to activate
196 | miniforge and this environment:
197 |
198 | .. tabs::
199 |
200 | .. group-tab:: Linux / MacOS
201 |
202 | ::
203 |
204 | $ source ~/miniforge3/bin/activate
205 | $ conda activate python-for-scicomp
206 |
207 | .. group-tab:: Windows
208 |
209 | ::
210 |
211 | $ # Start the Miniforge Prompt.
212 | $ conda activate python-for-scicomp
213 |
214 | .. group-tab:: Anaconda
215 |
216 | Anaconda includes most of the things needed for the course
217 | automatically, but as of 2024 not everything. You can use the
218 | navigator to create new environments from this `environment
219 | file
220 | `__.
221 | You'll have to download it and then `import it
222 | `__.
223 |
224 | When running this course's exercises, make sure the
225 | ``python-for-scicomp`` environment is activated before starting
226 | JupyterLab or any code. You need to start terminals or
227 | JupyterLab from the Anaconda Navigator for the
228 | ``python-for-scicomp`` environment to be used.
229 |
230 | .. group-tab:: Other options
231 |
232 | **Miniconda, Anaconda command line, other conda/mamba command
233 | line tools**: see "Miniforge" instructions.
234 |
235 | Virtual environments: we don't currently provide a
236 | ``requirements.txt`` but many package names can probably be
237 | copied from the ``environment.yml`` file. We really recommend
238 | conda/mamba based systems: it's designed for complex scientific
239 | software.
240 |
241 | Any other Python distribution which you can install libraries into
242 | would work, but because there are so many different ways to do this,
243 | we don't support them. You would need the extra libraries mentioned
244 | in the Miniforge instructions.
245 |
246 | Remember you need to activate the environment each time you use it.
247 |
248 |
249 |
250 | JupyterLab
251 | ----------
252 |
253 | We do most of the lessons from JupyterLab (and JupyterLab provides
254 | most of the other tools we need).
255 |
256 | .. tabs::
257 |
258 | .. group-tab:: Miniforge
259 |
260 | JupyterLab was installed in the previous step. To run it, first,
261 | start the Miniforge command line interface. Remember, you may
262 | need to activate Miniforge and the environment first.
263 |
264 | .. tabs::
265 |
266 | .. group-tab:: Linux / MacOS
267 |
268 | ::
269 |
270 | $ source ~/miniforge3/bin/activate
271 | $ conda activate python-for-scicomp
272 | $ jupyter-lab
273 |
274 | .. group-tab:: Windows
275 |
276 | ::
277 |
278 | $ # Start the Miniforge Prompt.
279 | $ conda activate python-for-scicomp
280 | $ jupyter-lab
281 |
282 | .. group-tab:: Anaconda
283 |
284 | If you install the full Anaconda distribution, this will be
285 | available and can be started either through Anaconda Navigator
286 | or command line.
287 |
288 | Make sure the ``python-for-scicomp`` environment is selected and
289 | you can start JupyterLab.
290 |
291 |
292 |
293 | Verification of Python and JupyterLab
294 | -------------------------------------
295 |
296 | .. admonition:: Watch the video
297 |
298 | See this `verification in video form
299 | `__ - if you can do this, you are
300 | ready to go for day one. Your exact steps may be a bit different.
301 |
302 | Remember that you need to activate the environment first - see the
303 | step above.
304 |
305 | .. tabs::
306 |
307 | .. group-tab:: Miniforge
308 |
309 | You can start JupyterLab from the command line::
310 |
311 | $ jupyter-lab
312 | (... Jupyter starts in a web browser)
313 |
314 |
315 | .. group-tab:: Anaconda
316 |
317 | **You should be able to start JupyterLab.** You can do this from the
318 | `Anaconda Navigator `__ (recommended if you have it):
319 |
320 | .. figure:: img/installation/anaconda-navigator-jupyterlab.png
321 | :class: with-border
322 |
323 | Starting JupyterLab from the Anaconda Navigator.
324 |
325 | ... or you can start JupyterLab from the command line::
326 |
327 | $ jupyter-lab
328 | (... Jupyter starts in a web browser)
329 |
330 |
331 |
332 | **Verify that you can start a Jupyter notebook.** We will learn how to
333 | do this in day 1, but you can try running ``print("Hello, world!")``
334 | if you want.
335 |
336 | .. figure:: img/installation/jupyterlab-notebook.png
337 | :class: with-border
338 |
339 | Starting a Jupyter Notebook from JupyterLab.
340 |
341 |
342 |
343 | Text editor
344 | -----------
345 |
346 | For one portion of the course, you will need a text editor. **If you
347 | don't know what to use, you can use the text editor that comes from
348 | JupyterLab and it will do everything you need - no extra installation
349 | needed.**
350 |
351 | .. admonition:: Other editors
352 | :class: toggle
353 |
354 | Because we need to be simple in our teaching, we only teach the
355 | most basic editors. We encourage you to try out more advanced ones
356 | yourself.
357 |
358 | For other editors, see the `CodeRefinery instructions
359 | `__. You don't
360 | exactly need a terminal editor - the graphical ones, such as VSCode or
361 | whatever you use now, will work as well.
362 |
363 |
364 |
365 | Command line
366 | ------------
367 |
368 | **You need access to the command line for some lessons. JupyterLab
369 | includes it, so no extra installation is needed.** If you want to
370 | test in advance:
371 |
372 | * You can start it from JupyterLab (recommended):
373 |
374 | .. figure:: img/installation/jupyterlab-terminal.png
375 | :class: with-border
376 | :scale: 75%
377 |
378 | From the JupyterLab launcher, select "Terminal".
379 |
380 | .. admonition:: Other ways to access the command line
381 | :class: toggle
382 |
383 | * From the Anaconda Navigator:
384 |
385 | .. figure:: img/installation/anaconda-prompt.png
386 | :class: with-border
387 |
388 | From the Anaconda Navigator, you can select "environments" on the
389 | left, then click on one, then the arrow, then "Open terminal".
390 |
391 | * From your operating system's terminal applications, if you activate
392 | Anaconda.
393 |
394 |
395 |
396 | Verification of the command line
397 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
398 |
399 | To verify command line usage, type the following commands (without the
400 | ``$``), and you should see the corresponding output that lists the
401 | Python version:
402 |
403 | .. code-block:: console
404 |
405 | $ python3 -V
406 | Python 3.8.3
407 |
408 | ## Or python... if it's installed as that
409 | $ python -V
410 | Python 3.8.3
411 |
412 | Any recent version of Python 3 should work for the course (for example
413 | 3.8 or higher).
414 |
415 |
416 |
417 | Zoom
418 | ----
419 |
420 | If this is an online workshop, it might use Zoom. You can see
421 | `CodeRefinery instructions for it
422 | `__.
423 |
424 |
425 |
426 | Need help?
427 | ----------
428 |
429 | If you have access, come to one of the installation help sessions.
430 | Or, ask your colleagues: these are standard tools and you can
431 | definitely find someone who can help you get set up!
432 |
433 |
434 |
435 | See also
436 | --------
437 |
438 | * `Research Software Hour on conda
439 | `__
440 | * `Conda manual `__ (technical)
441 | * `Anaconda individual edition home
442 | `__
443 | * `Anaconda getting started
444 | `__
445 |
--------------------------------------------------------------------------------
/content/libraries.rst:
--------------------------------------------------------------------------------
1 | Library ecosystem
2 | =================
3 |
4 | .. questions::
5 |
6 | - What happens when you need some method beyond what we discuss in this course, what is available?
7 | - How do you decide what to build on for your work?
8 |
9 | .. objectives::
10 |
11 | - Know of some other available packages, but don't necessarily know
12 | how to use them.
13 | - Be able to evaluate what you should reuse and what you should
14 | develop yourself.
15 |
16 | You can't do everything yourself. In fact, once we heard a quote such
17 | as this:
18 |
19 | When you are a student, you are expected to do everything
20 | yourself, and that is how you are evaluated. When you become a
21 | researcher, you *have* to be able to reuse what others have done.
22 | We don't have much practice in doing this.
23 | -- A student
24 |
25 | In this lesson, we'll talk about the broader ecosystem in Python: all
26 | the resources you have available to you. Perhaps we can even classify
27 | this into two types:
28 |
29 | - Well-maintained libraries that are used by many others.
30 | - A wide variety of public code that might work but isn't necessarily
31 | well-maintained (for example, code from articles).
32 |
33 | We'll start with the first then go to the second.
34 |
35 |
36 |
37 | Glossary
38 | --------
39 |
40 | Library
41 | A collection of code used by a program.
42 |
43 | Package
44 | A library that has been made easily installable and reusable.
45 | Often published on public repositories such as the `Python Package
46 | Index `__
47 |
48 | Dependency
49 | A requirement of another program, not included in that program.
50 |
51 |
52 |
53 | The Python/SciPy ecosystem
54 | --------------------------
55 |
56 | This section is nothing more than a tour of what exists in Python.
57 | You aren't expected to particularly remember any of these right now,
58 | but searching for these repositories is a starting point of a lot of
59 | future work.
60 |
61 | The "core" packages `could be considered
62 | `__. Many other packages build on
63 | these, and others that try to do similar things often try to conform
64 | to their interfaces (especially numpy):
65 |
66 | * Python
67 | * Numpy - arrays, everything builds on this
68 | * Scipy - scientific functions (not necessarily a lot builds on this)
69 | * matplotlib - plotting, many other plotting tools build on this
70 | * pandas - data structures
71 | * IPython / Jupyter: interactive work
72 |
73 |
74 | Core numerics libraries
75 | ~~~~~~~~~~~~~~~~~~~~~~~
76 |
77 | * `numpy `__ - Arrays and array math.
78 | * `scipy `__ - Software
79 | for math, science, and engineering.
80 |
81 |
82 | Plotting
83 | ~~~~~~~~
84 |
85 | * `matplotlib `__ - Base plotting package,
86 | somewhat low level but almost everything builds on it.
87 | * `seaborn `__ - Higher level plotting
88 | interface; statistical graphics.
89 | * `Vega-Altair `__ - Declarative Python
90 | plotting.
91 | * `mayavi `__ - 3D plotting
92 | * `Plotly `__ - Big graphing library.
93 |
94 |
95 | Data analysis and other important core packages
96 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
97 |
98 | * `pandas `__ - Columnar
99 |   data analysis.
100 | * `polars `__ - Alternative to pandas that uses similar
101 | API, but is re-imagined for more speed.
102 | * `Vaex `__ - Alternative for pandas
103 | that uses similar API for lazy-loading and processing huge DataFrames.
104 | * `Dask `__ - Alternative to Pandas that uses
105 | similar API and can do analysis in parallel.
106 | * `xarray `__ - Framework for
107 |   working with multi-dimensional arrays.
108 | * `statsmodels `__ - Statistical
109 | models and tests.
110 | * `SymPy `__ - Symbolic math.
111 | * `networkx `__ - Graph and network analysis.
112 | * `graph-tool `__ - Graph and network analysis
113 | toolkit implemented in C++.
114 |
115 |
116 | Interactive computing and human interface
117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
118 | * Interactive computing
119 |
120 | * `IPython `__ - Nicer interactive interpreter
121 | * `Jupyter `__ - Web-based interface to IPython
122 | and other languages (includes projects such as jupyter notebook,
123 | lab, hub, ...)
124 |
125 | * Testing
126 |
127 | * `pytest `__ - Automated testing interface
128 |
129 | * Documentation
130 |
131 | * `Sphinx `__ - Documentation generator
132 | (also used for this lesson...)
133 |
134 | * Development environments
135 |
136 | * `Spyder `__ - Interactive Python
137 | development environment.
138 | * `Visual Studio Code `__ - Microsoft's
139 | flagship code editor.
140 | * `PyCharm `__ - JetBrains's
141 | Python IDE.
142 |
143 | * `Binder `__ - load any git repository in
144 | Jupyter automatically, good for reproducible research
145 |
146 |
147 | Data format support and data ingestion
148 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
149 |
150 | * `pillow `__ - Image manipulation. The
151 | original PIL is no longer maintained, the new "Pillow" is a drop-in
152 | replacement.
153 | * `h5py `__ and `PyTables `__ -
154 | Interfaces to the `HDF5 `__
155 | file format.
156 |
157 |
158 | Speeding up code and parallelism
159 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
160 |
161 | * `MPI for Python (mpi4py) `__ - Message
162 | Passing Interface (MPI) in Python for parallelizing jobs.
163 | * `cython `__ - easily make C extensions for
164 | Python, also interface to C libraries
165 | * `numba `__ - just in time compiling of
166 | functions for speed-up
167 | * `PyPy `__ - Python written in Python so that
168 | it can internally optimize more.
169 | * `Dask `__ - Distributed array data structure for
170 | distributed computation
171 | * `Joblib `__ - Easy embarrassingly
172 | parallel computing
173 | * `IPyParallel `__ - Easy
174 | parallel task engine.
175 | * `numexpr `__ - Fast evaluation of
176 | array expressions by automatically compiling the arithmetic.
177 |
178 |
179 | Machine learning
180 | ~~~~~~~~~~~~~~~~
181 |
182 | * `nltk `__ - Natural language processing
183 | toolkit.
184 | * `scikit-learn `__ - Traditional
185 | machine learning toolkit.
186 | * `xgboost `__ - Toolkit for
187 | gradient boosting algorithms.
188 |
189 |
190 | Deep learning
191 | ~~~~~~~~~~~~~
192 |
193 | * `tensorflow `__ - Deep learning
194 | library by Google.
195 | * `pytorch `__ - Currently the most popular
196 | deep learning library.
197 | * `keras `__ - Simple library for doing deep learning.
198 | * `huggingface `__ - Ecosystem for sharing
199 |   and running deep learning models and datasets. Includes packages
200 | like ``transformers``, ``datasets``, ``accelerate``, etc.
201 | * `jax `__ - Google's
202 | Python library for running NumPy and automatic differentiation
203 | on GPUs.
204 | * `flax `__ - Neural network
205 | framework built on Jax.
206 | * `equinox `__ - Another neural
207 | network framework built on Jax.
208 | * `DeepSpeed `__ - Algorithms for running
209 | massive scale trainings. Included in many of the frameworks.
210 | * `PyTorch Lightning `__ -
211 | Framework for creating and training PyTorch models.
212 | * `Tensorboard `__ - Tool
213 | for visualizing model training on a web page.
214 |
215 |
216 | Other packages for special cases
217 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
218 |
219 | * `dateutil `__ and `pytz
220 | `__ - Date arithmetic and handling,
221 | timezone database and conversion.
222 |
223 |
224 |
225 |
226 | Connecting Python to other languages
227 | ------------------------------------
228 |
229 | As we discussed with Scipy, very many of the above packages aren't
230 | written in Python: they are written in some other language and have a
231 | Python interface. Python is written in C, and thus has great C
232 | interfaces. This contributes to two things:
233 |
234 | * **Extending Python** by writing your own modules in C.
235 |
236 | * It's actually common to first have (or write) an analysis package
237 | in C or C++, then make the Python interface. Then it can be
238 | supported by other languages, too.
239 |
240 | * Or one starts an analysis package in Python, and slowly moves bits
241 | of it to C over time as there is need.
242 |
243 | * **Embedding Python**, where you have another primary application
244 | that uses Python under the hood as an internal scripting language.
245 |
246 | These features aren't exactly unique to Python, but Python does
247 | support them very well. Read more: `Extending and embedding Python
248 | `__.
249 |
250 |
251 | Tools for interfacing with other languages
252 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
253 |
254 | These days, one rarely directly extends the Python interpreter, but uses
255 |
256 | * `cffi `__ and `ctypes
257 | `__ - interface to C
258 | and compatible libraries
259 | * `cython `__ - easily make C extensions for
260 | Python, also interface to C libraries
261 | * `f2py `__ - interface to Fortran
262 | code
263 | * `swig `__ - connect to a variety of programming languages.
264 | * ``Boost.python`` - Another Python/C++ interface
265 | * TODO: Julia modules for Python?
266 |
267 |
268 |
269 | Evaluating Python packages for reuse
270 | ------------------------------------
271 |
272 | Above, we talked about well-maintained mainstream packages. **Do you
273 | trust random code you find online (for example included in a paper)?**
274 |
275 | Especially consider scientific results, which *have* to be correct.
276 | Still, you also *can't* build everything yourself, so you have to
277 | carefully evaluate the situation.
278 |
279 | Below are some things to consider:
280 |
281 | * Are there releases? Have they been going on for a while?
282 |
283 | * Are releases installable without copy-paste?
284 |
285 | * Are dependencies handled well?
286 |
287 | * Does the code randomly change, so that it no longer works with your
288 |   code? Is this relevant?
289 |
290 | * Is there good documentation that not only tells how to use it, but
291 |   also explains how it works?
292 |
293 | * Is there automated testing? What's your evaluation of the risk of
294 | undetectable scientific errors?
295 |
296 | * Is there a community, or is it one person? Is it backed by some
297 | organization? Does it have a permanent home?
298 |
299 | * Is it on a public hosting site (GitLab, GitHub, Bitbucket, etc)
300 | where a community *could* form?
301 |
302 | * Do others post issues and make contributions? Are these issues
303 | dealt with in a timely manner? Can you search past bug reports?
304 |
305 | * Is the software citeable?
306 |
307 |
308 |
309 | Is your work reuseable?
310 | -----------------------
311 |
312 | Every small project you do contributes a little bit to the Python and
313 | SciPy ecosystem. This course has sort of started you on that path,
314 | and a `CodeRefinery workshop `__ will make
315 | sure you have the tools to produce high-quality, reusable code.
316 |
317 |
318 |
319 | What's next?
320 | ------------
321 |
322 | * The `CodeRefinery workshop `__ mentioned
323 | above will prepare you for others to reuse your code and for you to
324 | contribute to other code.
325 | * The upcoming :doc:`dependencies` lesson will teach you how to
326 | record and manage dependencies so that anyone can seamlessly reuse
327 | your code.
328 |
329 |
330 |
331 | Exercises
332 | ---------
333 |
334 | .. exercise:: Libraries 1.1: Libraries in your work
335 |
336 | What libraries do you use in your work? What have you made, which
337 | you could have reused from some other source. What have you used
338 | from some other source that you wished you had re-created?
339 |
340 | Discuss in your groups or HackMD.
341 |
342 | .. solution:: Libraries 1.1
343 |
344 | ... is there anything to say here?
345 |
346 |
347 | .. exercise:: Libraries 1.2: Evaluating packages
348 |
349 | Below are some links to some packages, both public and made by the
350 | authors of this lesson. Evaluate them, considering "would I use
351 | this in my project?"
352 |
353 | a) https://github.com/networkx/networkx/
354 | b) some code on webpage in a paper's footnote
355 | c) https://github.com/rkdarst/pcd
356 | d) https://github.com/dftlibs/numgrid
357 | e) https://github.com/rkdarst/dynbench
358 | f) https://vpython.org/
359 |
360 | .. solution:: Libraries 1.2
361 |
362 | a) networkx: This seems to be a relatively large, active project
363 | using best practices. Probably usable.
364 | b) I would probably use it if I had to, but would prefer not to.
365 | c) This (written by one of the authors of this lesson) has no
366 |      documentation, no community, no best practices, and is very old.
367 |      Probably not a good idea to try to use it.
368 | d) This project uses best practices, but doesn't seem to have a big
369 | community. It's probably fine to use, but who knows if it will
370 | be maintained 10 years from now. It does have automated tests
371 | via Github Actions (``.github/workflows`` and the green checks),
372 | so the authors have put some work into making it correct.
373 | e) This (also written by one of the authors) looks like it was made
374 | for a paper of some sort. It has some minimal documentation,
375 | but still is missing many best practices and is clearly not
376 | maintained anymore (look at the ancient pull request). Probably
377 | not a good idea to use unless you have to.
378 | f) This project has a pretty website, and some information. But
379 | seems to not be using best practices of an open repository, and
380 | custom locations which could disappear at any time.
381 |
382 | You notice that several of the older projects here were written by
383 | one of the authors of this lesson. It goes to show that everyone
384 | starts somewhere and improves over time - don't feel bad if your
385 | work isn't perfect, as long as you keep trying to get better!
386 |
387 |
388 |
389 | See also
390 | --------
391 |
392 | * `Topical Software in the SciPy ecosystem
393 | `__ - relatively
394 | detailed (but not comprehensive) list of projects
395 |
396 |
397 | .. keypoints::
398 |
399 | - Almost everything you need can already be found, except your
400 | incremental work.
401 | - When do you build on that other work, and when do you create
402 | things yourself?
403 |
--------------------------------------------------------------------------------
/content/ndarray.dot:
--------------------------------------------------------------------------------
1 | strict digraph ndarray {  // how a Python variable name leads to NumPy array data
2 | graph [compound=true];  // compound=true lets an edge target a cluster (see lhead below)
3 | 
4 | node [style = filled, color=cyan];  // default appearance for all nodes
5 | 
6 | n [label="Variable n (lvalue)", color=gold];  // the Python-level name
7 | nobj [label="PyObject n"];  // the Python object the name refers to
8 | ndesc [label="ndarray metadata"];  // the ndarray's metadata record
9 | 
10 | n -> nobj;  // name -> object
11 | nobj -> ndesc;  // object -> metadata
12 | 
13 | subgraph cluster_n {  // grouped box representing the underlying data buffer
14 | label = "Data array for n";
15 | color = aquamarine;
16 | style = filled;
17 | node [shape=box];  // individual data elements drawn as boxes
18 | 
19 | ndata_0 [label="3"];
20 | ndata_1 [label="2"];
21 | ndata_2 [label="1"];
22 | }
23 | 
24 | 
25 | ndesc -> ndata_1 [lhead=cluster_n];  // lhead makes the edge end at the whole cluster, not one element
26 | 
27 | }
28 |
--------------------------------------------------------------------------------
/content/ndarray.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
63 |
--------------------------------------------------------------------------------
/content/packaging-example-project/calculator/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Example calculator package.
3 | """
4 |
5 | from .adding import add
6 | from .subtracting import subtract
7 | from .integrating import integral
8 |
9 | __version__ = "0.1.0"
10 |
--------------------------------------------------------------------------------
/content/packaging-example-project/calculator/adding.py:
--------------------------------------------------------------------------------
def add(x, y):
    """Return the sum of the two operands."""
    total = x + y
    return total
3 |
--------------------------------------------------------------------------------
/content/packaging-example-project/calculator/integrating.py:
--------------------------------------------------------------------------------
1 | from scipy import integrate
2 |
3 |
def integral(function, lower_limit, upper_limit):
    """Numerically integrate ``function`` over [lower_limit, upper_limit].

    Returns the ``(value, estimated_error)`` pair produced by
    ``scipy.integrate.quad``.
    """
    value_and_error = integrate.quad(function, lower_limit, upper_limit)
    return value_and_error
6 |
--------------------------------------------------------------------------------
/content/packaging-example-project/calculator/subtracting.py:
--------------------------------------------------------------------------------
def subtract(x, y):
    """Return the difference ``x - y``."""
    difference = x - y
    return difference
3 |
--------------------------------------------------------------------------------
/content/packaging-example-project/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "calculator-myname"
7 | description = "A small example package"
8 | version = "0.1.0"
9 | readme = "README.md"
10 | authors = [
11 | { name = "Firstname Lastname", email = "firstname.lastname@example.org" }
12 | ]
13 | dependencies = [
14 | "scipy"
15 | ]
16 |
--------------------------------------------------------------------------------
/content/packaging-example-project/test.py:
--------------------------------------------------------------------------------
1 | from calculator import add, subtract, integral
2 | 
3 | print("2 + 3 =", add(2, 3))  # add() from calculator/adding.py
4 | print("2 - 3 =", subtract(2, 3))  # subtract() from calculator/subtracting.py
5 | integral_x_squared, error = integral(lambda x: x * x, 0.0, 1.0)  # (value, error) pair from scipy.integrate.quad
6 | print(f"{integral_x_squared = }")
7 |
--------------------------------------------------------------------------------
/content/packaging-example-project/test_editable.py:
--------------------------------------------------------------------------------
1 | from calculator import subtract
2 | 
3 | print("2 - 3 =", subtract(2, 3))  # re-run after editing subtract() to see an editable install pick up the change
4 |
--------------------------------------------------------------------------------
/content/packaging.rst:
--------------------------------------------------------------------------------
1 | Packaging
2 | =========
3 |
4 | .. questions::
5 |
6 | - How to organize Python projects larger than one script?
7 | - What is a good file and folder structure for Python projects?
8 | - How can you make your Python functions most usable by your collaborators?
9 | - How to prepare your code to make a Python package?
10 | - How to publish your Python package?
11 |
12 | .. objectives::
13 |
14 | - Learn to identify the components of a Python package
15 | - Learn to create a Python package
16 | - Learn to publish a Python package
17 |
18 |
19 | Organizing Python projects
20 | --------------------------
21 |
22 | Python projects often start as a single script or Jupyter notebook but
23 | they can grow out of a single file.
24 |
25 | In the :ref:`scripts` episode we have also learned how to import functions
26 | and objects from other Python files (modules). Now we will take it a step further.
27 |
28 | **Recommendations**:
29 |
30 | - Collect related functions into modules (files).
31 | - Collect related modules into packages (we will show how).
32 | - Add a ``LICENSE`` file to your code from `choosealicense.com `__
33 | (see `Software Licensing and Open source explained with cakes `__).
34 | - Write a ``README.md`` file describing what the code does and how to use it.
35 | - It is also recommended to `document your package `__.
36 | - When the project grows, you might need `automated testing `__.
37 |
38 | To have a concrete but still simple example, we will create a project
39 | consisting of 3 functions, each in its own file. We can then imagine that each
40 | file would contain many more functions. To make it more interesting,
41 | one of these functions will depend on an external library: ``scipy``.
42 |
43 | These are the 3 files:
44 |
45 | .. literalinclude:: packaging-example-project/calculator/adding.py
46 | :caption: adding.py
47 |
48 | .. literalinclude:: packaging-example-project/calculator/subtracting.py
49 | :caption: subtracting.py
50 |
51 | .. literalinclude:: packaging-example-project/calculator/integrating.py
52 | :caption: integrating.py
53 |
54 | We will add a fourth file:
55 |
56 | .. literalinclude:: packaging-example-project/calculator/__init__.py
57 | :caption: __init__.py
58 |
59 | This ``__init__.py`` file will be the interface of our package/library.
60 | It also holds the package docstring and the version string.
61 | Note how it imports functions from the various modules using *relative imports*
62 | (with the dot).
63 |
64 | This is how we will arrange the files in the project folder/repository:
65 |
66 | .. code-block:: none
67 | :emphasize-lines: 3-6
68 |
69 | project-folder
70 | ├── calculator
71 | │ ├── adding.py
72 | │ ├── __init__.py
73 | │ ├── integrating.py
74 | │ └── subtracting.py
75 | ├── LICENSE
76 | └── README.md
77 |
78 | Now we are ready to test the package. For this we need to be in the "root"
79 | folder, which we have called the *project-folder*. We also need to have
80 | ``scipy`` available in our environment:
81 |
82 | .. literalinclude:: packaging-example-project/test.py
83 |
84 | The package is not yet pip-installable, though. We will make this possible in
85 | the next section.
86 |
87 |
88 | Testing a local pip install
89 | ---------------------------
90 |
91 | To make our example package pip-installable we need to add one more file:
92 |
93 | .. code-block:: none
94 | :emphasize-lines: 9
95 |
96 | project-folder
97 | ├── calculator
98 | │ ├── adding.py
99 | │ ├── __init__.py
100 | │ ├── integrating.py
101 | │ └── subtracting.py
102 | ├── LICENSE
103 | ├── README.md
104 | └── pyproject.toml
105 |
106 | This is how ``pyproject.toml`` looks:
107 |
108 | .. literalinclude:: packaging-example-project/pyproject.toml
109 | :caption: pyproject.toml
110 | :emphasize-lines: 13-15
111 |
112 | Note how our package requires ``scipy`` and we decided to not pin the version
113 | here (see :ref:`version_pinning`).
114 |
115 | Now we have all the building blocks to test a local pip install. This is a good
116 | test before trying to upload a package to PyPI or test-PyPI
117 | (see :ref:`pypi`)
118 |
119 | .. note::
120 |
121 |    Sometimes you need to rely on unreleased, development versions as
122 | dependencies and this is also possible. For example, to use the
123 | latest ``xarray`` you could add::
124 |
125 | dependencies = [
126 | "scipy",
127 | "xarray @ https://github.com/pydata/xarray/archive/main.zip"
128 | ]
129 |
130 | .. seealso::
131 | - `pip requirement specifiers `__
132 | - pyOpenSci tutorial on
133 | `pyproject.toml metadata `__
134 |
135 |
136 |
137 | Exercise 1
138 | ----------
139 |
140 | .. challenge:: Packaging-1
141 |
142 | To test a local pip install:
143 |
144 | - Create a new folder outside of our example project
145 | - Create a new virtual environment (:ref:`dependency_management`)
146 | - Install the example package from the project folder
147 | into the new environment::
148 |
149 | pip install --editable /path/to/project-folder/
150 |
151 | - Test the local installation:
152 |
153 | .. literalinclude:: packaging-example-project/test.py
154 |
155 | - Make a change in the ``subtract`` function above such that it always
156 | returns a float ``return float(x - y)``.
157 |
158 | - Open a new Python console and test the following lines. Compare it with
159 | the previous output.
160 |
161 | .. literalinclude:: packaging-example-project/test_editable.py
162 |
163 | Sharing packages via PyPI
164 | -------------------------
165 |
166 | .. demo::
167 |
168 | Most people will watch and observe this, due to the speed with which we will
169 | move.
170 |
171 | Once we are able to pip-install the example package locally, we are ready for
172 | upload.
173 |
174 | We exercise by uploading to test-PyPI_, not the
175 | real `PyPI `__, so that if we mess things up, nothing bad
176 | happens.
177 |
178 | We need two more things:
179 |
180 | - We will do this using `Twine `__ so you need
181 | to pip install that, too.
182 | - You need an account on test-PyPI_
183 |
184 | .. _test-PyPI: https://test.pypi.org/
185 |
186 | .. highlight:: console
187 |
188 | Let's try it out. First we create the distribution package::
189 |
190 | $ python3 -m build
191 |
192 | We need twine::
193 |
194 | $ pip install twine
195 |
196 | And use twine to upload the distribution files to test-PyPI::
197 |
198 | $ twine upload -r testpypi dist/*
199 |
200 | Uploading distributions to https://test.pypi.org/legacy/
201 | Enter your API token:
202 |
203 |
204 | .. _Create API token: https://test.pypi.org/manage/account/token/
205 |
206 | .. note::
207 |
208 | To generate an API token, proceed to the `Create API token`_ page in test-PyPI.
209 | You will be prompted for your password.
210 |
211 | .. solution:: The long-version for finding the *Create API token* page
212 |
213 | 1. Log on to test-PyPI_ at https://test.pypi.org
214 | 2. In the top-right corner, click on the drop-down menu and click **Account settings** or
215 | follow this `link `__.
216 | 3. Scroll down to the section **API tokens** and click the button **Add API token**,
217 | which opens up the
218 | `Create API token`_ page.
219 |
220 |
221 | #. Under **Token name** write something memorable.
222 | It should remind you the *purpose*
223 | or the *name of the computer*, such that when you are done
224 | using it, you can safely delete it.
225 | #. Under **Scope** select ``Entire account (all projects)``.
226 | #. Click on **Create token**.
227 | #. Click on **Copy token** once a long string which starts
228 | with ``pypi-`` is generated.
229 |
230 | Paste that token back into the terminal where ``twine upload ...`` is running and press ENTER.
231 |
232 | Once this is done, create yet another virtual environment and try to install from test-PyPI (adapt ``myname``).
233 |
234 | .. tabs::
235 |
236 | .. tab:: Linux / macOS
237 |
238 | .. code-block:: console
239 | :emphasize-lines: 4-7
240 |
241 | $ python3 -m venv venv-calculator
242 | $ source venv-calculator/bin/activate
243 | $ which python
244 | $ python3 -m pip install \
245 | -i https://test.pypi.org/simple/ \
246 | --extra-index-url https://pypi.org/simple/ \
247 | calculator-myname
248 | $ deactivate
249 |
250 | .. tab:: Windows
251 |
252 | .. code-block:: console
253 | :emphasize-lines: 4
254 |
255 | $ python3 -m venv venv-calculator
256 | $ venv-calculator\Scripts\activate
257 | $ where python
258 | $ python3 -m pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ calculator-myname
259 | $ deactivate
260 |
261 | Tools that simplify sharing via PyPI
262 | ------------------------------------
263 |
264 | The solution that we have used to create the example package (using
265 | ``setuptools`` and ``twine``) is not the only approach. There are many ways to
266 | achieve this and we avoided going into too many details and comparisons to not
267 | confuse too much. If you web-search this, you will also see that recently the
268 | trend goes towards using ``pyproject.toml`` as more general
269 | alternative to the previous ``setup.py``.
270 |
271 | There are at least two tools which try to make the packaging and PyPI interaction easier:
272 |
273 | - `Poetry `__
274 | - `Flit `__
275 |
276 | If you upload packages to PyPI or test PyPI often you can create an API token and
277 | `save it in the .pypirc file `__.
278 |
279 | Building a conda package and share it
280 | -------------------------------------
281 |
282 |
283 | .. callout:: Prerequisites
284 |
285 | To generate a conda build recipe, the package ``grayskull`` and
286 | to build it, the package ``conda-build`` are required.
287 | You may install these with **Anaconda Navigator** or from the command line::
288 |
289 | $ conda install -n base grayskull conda-build
290 |
291 |
292 | The simplest way for creating a conda package for your python script is to
293 | first publish it in `PyPI `__ following the steps explained
294 | above.
295 |
296 |
297 | Building a python package with grayskull and conda-build
298 | ********************************************************
299 |
300 | Once built, the conda package can be installed locally. For this example, we
301 | will use `runtest `__. `runtest
302 | `__ is a numerically tolerant end-to-end test
303 | library for research software.
304 |
305 | 1. Generate the *recipe* by executing (``grayskull`` or ``conda grayskull``)::
306 |
307 | $ conda grayskull pypi runtest
308 |
309 | The command above will create a new folder called `runtest` containing a file `meta.yaml`,
310 | the conda recipe for building the `runtest` package.
311 |
312 | 2. View the contents of `meta.yaml` and ensure the requirements:
313 |
314 | .. code-block:: yaml
315 |
316 | requirements:
317 | host:
318 | - python
319 | - flit-core >=2,<4
320 | - pip
321 | run:
322 | - python
323 |
324 | In the requirements above, we specified what is required for the `host `__ and for `running `__ the package.
325 |
326 | .. callout:: Remark
327 |
328 | For pure python recipes, this is all you need for building a python package with conda.
329 | If your package needs to be built (for instance compilation), you would need additional files e.g. `build.sh` (to build on Linux/Mac-OSX) and `bld.bat` (to build on Windows systems). You can also add test scripts for testing your package. See `documentation `__
330 |
331 |
332 | 3. Build your package with conda
333 |
334 | Your package is now ready to be build with conda::
335 |
336 | $ conda build runtest
337 |
338 |
339 | .. callout:: Conda package location
340 |
341 | Look at the messages produced while building. The location of the local conda package is given (search for `anaconda upload`):
342 |
343 | .. code-block:: none
344 |
345 | /home/username/miniforge3/conda-bld/noarch/runtest-2.3.4-py_0.tar.bz2
346 |
347 |    The prefix ``/home/username/miniforge3/`` may be different on your machine,
348 |    depending on your operating system (Linux, Mac-OSX or Windows). The sub-folder is named ``noarch`` since
349 | it is a pure-python package and the recipe indicates the same.
350 |
351 |    If the package contained compiled code then the sub-folder would have been named ``win-64`` or ``linux-64``.
352 | It could then be converted to other platforms using
353 | `conda convert `__.
354 |
355 | 4. Check within new environment
356 |
357 | It is not necessary to create a new conda environment to install it but as explained in previous episode, it is good practice to have isolated environments.
358 |
359 | ::
360 |
361 | $ conda create -n local-runtest --use-local runtest
362 |
363 | We can then check `runtest` has been successfully installed in the `local-runtest` conda environment. Open a new Terminal with the `local-runtest` environment, either from the command line::
364 |
365 | $ conda activate local-runtest
366 |
367 | or via **Anaconda Navigator** (Open Terminal), import runtest and
368 | check its version:
369 |
370 | .. code-block:: python
371 |
372 | import runtest
373 | print(runtest.__version__)
374 |
375 |
376 | .. callout:: Building a conda package from scratch
377 |
378 | It is possible to build a conda package from scratch without using conda grayskull.
379 | We recommend you to check the
380 | `conda-build documentation `__
381 | for more information.
382 |
383 | To be able to share and install your local conda package anywhere (on other platforms), you would need to upload it to a `conda channel `__ (see below).
384 |
385 |
386 |
387 | Publishing a python package
388 | ***************************
389 |
390 | - Upload your package to `conda-forge `__:
391 | conda-forge is a conda channel: it contains community-led collection of
392 | recipes, build infrastructure and distributions for the conda package
393 | manager. Anyone can
394 | `publish conda packages to conda-forge `__
395 | if certain
396 | `guidelines `__ are respected.
397 |
398 | - Upload your package to `bioconda `_: bioconda is
399 | a very popular channel for the conda package manager specializing in
400 | bioinformatics software. As for conda-forge, you need to follow their
401 | `guidelines `__ when
402 | building conda recipes.
403 |
404 | You can also `create your own conda channel
405 | `__
406 | for publishing your packages.
407 |
408 |
409 | .. keypoints::
410 |
411 | - It is worth it to organize your code for publishing, even if only
412 | you are using it.
413 | - PyPI is a place for Python packages
414 | - conda is similar but is not limited to Python
415 |
--------------------------------------------------------------------------------
/content/parallel-pi-multiprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python multithreading solution\n",
8 | "Here, we will create a simple stochastic calculation of pi, and then parallelize it using multiprocessing (and multithreading to compare)."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import random"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "def sample(n):\n",
27 | " \"\"\"Make n trials of points in the square. Return (n, number_in_circle)\n",
28 | " \n",
29 | "    This is our basic function. By design, it returns everything it\n",
30 | " needs to compute the final answer: both n (even though it is an input\n",
31 | " argument) and n_inside_circle. To compute our final answer, all we\n",
32 | " have to do is sum up the n:s and the n_inside_circle:s and do our\n",
33 | " computation\"\"\"\n",
34 | " n_inside_circle = 0\n",
35 | " for i in range(n):\n",
36 | " x = random.random()\n",
37 | " y = random.random()\n",
38 | " if x**2 + y**2 < 1.0:\n",
39 | " n_inside_circle += 1\n",
40 | " return n, n_inside_circle"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "598 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "%%timeit\n",
58 | "# Do it just for timing\n",
59 | "n, n_inside_circle = sample(10**6)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# Do the actual calculation (the previous result doesn't get saved)\n",
69 | "n, n_inside_circle = sample(10**6)"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "This is the \"calculate answer\" phase."
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "data": {
86 | "text/plain": [
87 | "3.144548"
88 | ]
89 | },
90 | "execution_count": 5,
91 | "metadata": {},
92 | "output_type": "execute_result"
93 | }
94 | ],
95 | "source": [
96 | "pi = 4.0 * (n_inside_circle / n)\n",
97 | "pi"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "## Do it in parallel with multiprocessing\n",
105 | "This divides the calculation into 10 tasks and runs `sample` on each of them. Then it re-combines the results."
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "import multiprocessing.pool\n",
115 | "pool = multiprocessing.pool.Pool()\n",
116 | "# The default pool makes one process per CPU"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 7,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "320 ms ± 38.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "%%timeit\n",
134 | "# Do it once to time it\n",
135 | "results = pool.map(sample, [10**5] * 10)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 8,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# Do it again to get the results, since the results of the above\n",
145 | "# cell aren't accessible because of the %%timeit magic.\n",
146 | "results = pool.map(sample, [10**5] * 10)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 9,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "pool.close()"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 10,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "3.140768"
167 | ]
168 | },
169 | "execution_count": 10,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "n_sum = sum(x[0] for x in results)\n",
176 | "n_inside_circle_sum = sum(x[1] for x in results)\n",
177 | "pi = 4.0 * (n_inside_circle_sum / n_sum)\n",
178 | "pi"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "## Do it in \"parallel\" with threads\n",
186 | "To compare. This should not be any faster, because multiple Python functions cannot run at the same time in the same process."
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 11,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "threadpool = multiprocessing.pool.ThreadPool()"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 12,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "name": "stdout",
205 | "output_type": "stream",
206 | "text": [
207 | "635 ms ± 28.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
208 | ]
209 | },
210 | {
211 | "data": {
212 | "text/plain": [
213 | ""
214 | ]
215 | },
216 | "execution_count": 12,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "%%timeit -o\n",
223 | "# Do it once to time it\n",
224 | "threadpool.map(sample, [10**5] * 10)"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 13,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "# Do it again to get the results, since the results of the above\n",
234 | "# cell aren't accessible because of the %%timeit magic.\n",
235 | "results = threadpool.map(sample, [10**5] * 10)"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 14,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "threadpool.close()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 15,
250 | "metadata": {},
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/plain": [
255 | "3.142388"
256 | ]
257 | },
258 | "execution_count": 15,
259 | "metadata": {},
260 | "output_type": "execute_result"
261 | }
262 | ],
263 | "source": [
264 | "n_sum = sum(x[0] for x in results)\n",
265 | "n_inside_circle_sum = sum(x[1] for x in results)\n",
266 | "pi = 4.0 * (n_inside_circle_sum / n_sum)\n",
267 | "pi"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "## Future ideas\n",
275 | "\n",
276 | "You could make a separate `calculate` function that take a list of results and returns pi. This can be used regardless of if it is done with multiprocessing or without.\n",
277 | "\n",
278 | "Notice the similarity to [split-apply-combine](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html) or [map-reduce](https://en.wikipedia.org/wiki/MapReduce) which is a specialization of split-apply-combine."
279 | ]
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.8.5"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 4
303 | }
304 |
--------------------------------------------------------------------------------
/content/plotting-matplotlib/customizing/gapminder-larger-font.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/customizing/gapminder-larger-font.png
--------------------------------------------------------------------------------
/content/plotting-matplotlib/customizing/gapminder-linear.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/customizing/gapminder-linear.png
--------------------------------------------------------------------------------
/content/plotting-matplotlib/customizing/gapminder-log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/customizing/gapminder-log.png
--------------------------------------------------------------------------------
/content/plotting-matplotlib/first-plot/exercise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/first-plot/exercise.png
--------------------------------------------------------------------------------
/content/plotting-matplotlib/first-plot/getting-started.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/first-plot/getting-started.png
--------------------------------------------------------------------------------
/content/plotting-vega-altair/temperature-ranges-combined.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/content/productivity.md:
--------------------------------------------------------------------------------
1 | # Productivity tools
2 |
3 | :::{objectives}
4 | - Know about tools that can help you **spot code problems** and help you follow
5 | a **consistent code style** without you having to do it manually.
6 | - Get an overview of **AI-based tools** and how they can help you
7 |   write code.
8 | :::
9 |
10 | :::{instructor-note}
11 | - Demo/discussion: 20 min
12 | :::
13 |
14 |
15 | ## Linters and formatters
16 |
17 | **Linter**: Tool that analyzes source code to detect potential errors, unused
18 | imports, unused variables, code style violations, and to improve readability.
19 | - Popular linters:
20 | - [Autoflake](https://pypi.org/project/autoflake/)
21 | - [Flake8](https://flake8.pycqa.org/)
22 | - [Pyflakes](https://pypi.org/project/pyflakes/)
23 | - [Pycodestyle](https://pycodestyle.pycqa.org/)
24 | - [Pylint](https://pylint.readthedocs.io/)
25 | - [Ruff](https://docs.astral.sh/ruff/)
26 |
27 | **Formatter**: Tool that automatically formats your code to a consistent style,
28 | for instance following [PEP 8](https://peps.python.org/pep-0008/).
29 |
30 | - Popular formatters:
31 | - [Black](https://black.readthedocs.io/)
32 | - [YAPF](https://github.com/google/yapf)
33 | - [Ruff](https://docs.astral.sh/ruff/)
34 |
35 | In this course we will focus on [Ruff](https://docs.astral.sh/ruff/) since it
36 | can do **both checking and formatting** and you don't have to switch between
37 | multiple tools.
38 |
39 | :::{discussion} Linters and formatters can be configured to your liking
40 | These tools typically have good defaults. But if you don't like the defaults,
41 | you can configure what they should ignore or how they should format or not format.
42 | :::
43 |
44 |
45 | ## Examples
46 |
47 | This code example (which we possibly recognize from the previous section about
48 | {ref}`profiling`)
49 | has a few problems (highlighted):
50 | ```{code-block} python
51 | ---
52 | emphasize-lines: 2, 7, 10
53 | ---
54 | import re
55 | import requests
56 |
57 |
58 | def count_unique_words(file_path: str) -> int:
59 | unique_words = set()
60 | forgotten_variable = 13
61 | with open(file_path, "r", encoding="utf-8") as file:
62 | for line in file:
63 | words = re.findall(r"\b\w+\b", line.lower()))
64 | for word in words:
65 | unique_words.add(word)
66 | return len(unique_words)
67 | ```
68 |
69 | Please try whether you can locate these problems using Ruff:
70 | ```console
71 | $ ruff check
72 | ```
73 |
74 | Next, let us try to auto-format a code example which is badly formatted and also difficult
75 | to read:
76 | :::::{tabs}
77 | ::::{tab} Badly formatted
78 | ```python
79 | import re
80 | def count_unique_words (file_path : str)->int:
81 | unique_words=set()
82 | with open(file_path,"r",encoding="utf-8") as file:
83 | for line in file:
84 | words=re.findall(r"\b\w+\b",line.lower())
85 | for word in words:
86 | unique_words.add(word)
87 | return len( unique_words )
88 | ```
89 | ::::
90 |
91 | ::::{tab} Auto-formatted
92 | ```python
93 | import re
94 |
95 |
96 | def count_unique_words(file_path: str) -> int:
97 | unique_words = set()
98 | with open(file_path, "r", encoding="utf-8") as file:
99 | for line in file:
100 | words = re.findall(r"\b\w+\b", line.lower())
101 | for word in words:
102 | unique_words.add(word)
103 | return len(unique_words)
104 | ```
105 |
106 | This was done using:
107 | ```console
108 | $ ruff format
109 | ```
110 | ::::
111 | :::::
112 |
113 |
114 | ## Type checking
115 |
116 | A (static) type checker is a tool that checks whether the types of variables in your
117 | code match the types that you have specified.
118 | - Tools:
119 | - [Mypy](https://mypy.readthedocs.io/)
120 | - [Pyright](https://github.com/microsoft/pyright) (Microsoft)
121 | - [Pyre](https://pyre-check.org/) (Meta)
122 |
123 |
124 | ## Integration with editors
125 |
126 | Many/most of the above tools can be integrated with your editor. For instance,
127 | you can configure your editor to automatically format your code when you save
128 | the file. However, this only makes sense when all team members agree to follow
129 | the same style, otherwise saving and possibly committing changes to version
130 | control will show up changes to code written by others which you possibly
131 | didn't intend to make.
132 |
133 |
134 | ## Integration with Jupyter notebooks
135 |
136 | It is possible to automatically format your code in Jupyter notebooks!
137 | For this to work you need
138 | the following three dependencies installed:
139 | - `jupyterlab-code-formatter`
140 | - `black`
141 | - `isort`
142 |
143 | More information and a screen-cast of how this works can be found at
144 | <https://jupyterlab-code-formatter.readthedocs.io/>.
145 |
146 |
147 | ## Integration with version control
148 |
149 | If you use version control and like to have your code checked or formatted
150 | **before you commit the change**, you can use tools like [pre-commit](https://pre-commit.com/).
151 |
152 |
153 | ## AI-assisted coding
154 |
155 | We can use AI as an assistant/apprentice:
156 | - Code completion
157 | - Write a test based on an implementation
158 | - Write an implementation based on a test
159 |
160 | Or we can use AI as a mentor:
161 | - Explain a concept
162 | - Improve code
163 | - Show a different (possibly better) way of implementing the same thing
164 |
165 |
166 | :::{figure} productivity/chatgpt.png
167 | :alt: Screenshot of ChatGPT
168 | :width: 100%
169 |
170 | Example for using a chat-based AI tool.
171 | :::
172 |
173 | :::{figure} productivity/code-completion.gif
174 | :alt: Screen-cast of working with GitHub Copilot
175 | :width: 100%
176 |
177 | Example for using AI to complete code in an editor.
178 | :::
179 |
180 | :::{admonition} AI tools open up a box of questions
181 | - Legal
182 | - Ethical
183 | - Privacy
184 | - Lock-in/ monopolies
185 | - Lack of diversity
186 | - Will we still need to learn programming?
187 | - How will it affect learning and teaching programming?
188 | :::
189 |
--------------------------------------------------------------------------------
/content/productivity/chatgpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/productivity/chatgpt.png
--------------------------------------------------------------------------------
/content/productivity/code-completion.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/productivity/code-completion.gif
--------------------------------------------------------------------------------
/content/profiling.md:
--------------------------------------------------------------------------------
1 | # Profiling
2 |
3 | :::{objectives}
4 | - Understand when improving code performance is worth the time and effort.
5 | - Knowing how to find performance bottlenecks in Python code.
6 | - Try `scalene` as one of many tools to profile Python code.
7 | :::
8 |
9 | :::{instructor-note}
10 | - Discussion: 20 min
11 | - Exercise: 20 min
12 | :::
13 |
14 |
15 | ## Should we even optimize the code?
16 |
17 | Classic quote to keep in mind: "Premature optimization is the root of all evil." [Donald Knuth]
18 |
19 | :::{discussion}
20 | It is important to ask ourselves whether it is worth it.
21 | - Is it worth spending e.g. 2 days to make a program run 20% faster?
22 | - Is it worth optimizing the code so that it spends 90% less memory?
23 |
24 | Depends. What does it depend on?
25 | :::
26 |
27 |
28 | ## Measure instead of guessing
29 |
30 | Before doing code surgery to optimize the run time or lower the memory usage,
31 | we should **measure** where the bottlenecks are. This is called **profiling**.
32 |
33 | Analogy: Medical doctors don't start surgery based on guessing. They first measure
34 | (X-ray, MRI, ...) to know precisely where the problem is.
35 |
36 | Not only can programming beginners otherwise guess wrong; experienced
37 | programmers can also be surprised by the results of profiling.
38 |
39 |
40 | ## One of the simplest tools is to insert timers
41 |
42 | Below we will list some tools that can be used to profile Python code.
43 | But even without these tools you can find **time-consuming parts** of your code
44 | by inserting timers:
45 |
46 |
47 |
48 | ```{code-block} python
49 | ---
50 | emphasize-lines: 1,8,10
51 | ---
52 | import time
53 |
54 |
55 | # ...
56 | # code before the function
57 |
58 |
59 | start = time.time()
60 | result = some_function()
61 | print(f"some_function took {time.time() - start} seconds")
62 |
63 |
64 | # code after the function
65 | # ...
66 | ```
67 |
68 |
69 | ## Many tools exist
70 |
71 | The list below is probably not complete, but it gives an overview of the
72 | different tools available for profiling Python code.
73 |
74 | CPU profilers:
75 | - [cProfile and profile](https://docs.python.org/3/library/profile.html)
76 | - [line_profiler](https://kernprof.readthedocs.io/)
77 | - [py-spy](https://github.com/benfred/py-spy)
78 | - [Yappi](https://github.com/sumerc/yappi)
79 | - [pyinstrument](https://pyinstrument.readthedocs.io/)
80 | - [Perfetto](https://perfetto.dev/docs/analysis/trace-processor-python)
81 |
82 | Memory profilers:
83 | - [memory_profiler](https://pypi.org/project/memory-profiler/) (not actively maintained)
84 | - [Pympler](https://pympler.readthedocs.io/)
85 | - [tracemalloc](https://docs.python.org/3/library/tracemalloc.html)
86 | - [guppy/heapy](https://github.com/zhuyifei1999/guppy3/)
87 |
88 | Both CPU and memory:
89 | - [Scalene](https://github.com/plasma-umass/scalene)
90 |
91 | In the exercise below, we will use Scalene to profile a Python program. Scalene
92 | is a sampling profiler that can profile CPU, memory, and GPU usage of Python.
93 |
94 |
95 | ## Tracing profilers vs. sampling profilers
96 |
97 | **Tracing profilers** record every function call and event in the program,
98 | logging the exact sequence and duration of events.
99 | - **Pros:**
100 | - Provides detailed information on the program's execution.
101 | - Deterministic: Captures exact call sequences and timings.
102 | - **Cons:**
103 | - Higher overhead, slowing down the program.
104 |   - Can generate large amounts of data.
105 |
106 | **Sampling profilers** periodically sample the program's state (where it is
107 | and how much memory is used), providing a statistical view of where time is
108 | spent.
109 | - **Pros:**
110 | - Lower overhead, as it doesn't track every event.
111 | - Scales better with larger programs.
112 | - **Cons:**
113 | - Less precise, potentially missing infrequent or short calls.
114 | - Provides an approximation rather than exact timing.
115 |
116 | :::{discussion} Analogy: Imagine we want to optimize the London Underground (subway) system
117 | We wish to detect bottlenecks in the system to improve the service and for this we have
118 | asked a few passengers to help us by tracking their journey.
119 | - **Tracing**: We follow every train and passenger, recording every stop
120 | and delay. When passengers enter and exit the train, we record the exact time
121 | and location.
122 | - **Sampling**: Every 5 minutes the phone notifies the passenger to note
123 | down their current location. We then use this information to estimate
124 | the most crowded stations and trains.
125 | :::
126 |
127 |
128 | ## Choosing the right system size
129 |
130 | Sometimes we can configure the system size (for instance the time step in a simulation
131 | or the number of time steps or the matrix dimensions) to make the program finish sooner.
132 |
133 | For profiling, we should choose a system size that is **representative of the real-world**
134 | use case. If we profile a program with a small input size, we might not see the same
135 | bottlenecks as when running the program with a larger input size.
136 |
137 | Often, when we scale up the system size, or scale the number of processors, new bottlenecks
138 | might appear which we didn't see before. This brings us back to: "measure instead of guessing".
139 |
140 |
141 | ## Exercises
142 |
143 | ::::{exercise} Exercise: Practicing profiling
144 | In this exercise we will use the Scalene profiler to find out where most of the time is spent
145 | and most of the memory is used in a given code example.
146 |
147 | Please try to go through the exercise in the following steps:
148 | 1. Make sure `scalene` is installed in your environment (if you have followed
149 | this course from the start and installed the recommended software
150 | environment, then it is).
151 | 1. Download Leo Tolstoy's "War and Peace" from the following link (the text is
152 | provided by [Project Gutenberg](https://www.gutenberg.org/)):
153 | <https://www.gutenberg.org/cache/epub/2600/pg2600.txt>
154 | (right-click and "save as" to download the file and **save it as "book.txt"**).
155 | 1. **Before** you run the profiler, try to predict in which function the code
156 | (the example code is below)
157 | will spend most of the time and in which function it will use most of the
158 | memory.
159 | 1. Save the example code as `example.py` and
160 | run the `scalene` profiler on the following code example and browse the
161 | generated HTML report to find out where most of the time is spent and where
162 | most of the memory is used:
163 | ```console
164 | $ scalene example.py
165 | ```
166 | Alternatively you can do this (and then open the generated file in a browser):
167 | ```console
168 | $ scalene example.py --html > profile.html
169 | ```
170 | You can find an example of the generated HTML report in the solution below.
171 | 1. Does the result match your prediction? Can you explain the results?
172 |
173 | Example code (`example.py`):
174 | :::{literalinclude} profiling/exercise.py
175 | :::
176 |
177 | :::{solution}
178 | ```{figure} profiling/exercise.png
179 | :alt: Result of the profiling run for the above code example.
180 | :width: 100%
181 |
182 | Result of the profiling run for the above code example. You can click on the image to make it larger.
183 | ```
184 |
185 | Results:
186 | - Most time is spent in the `count_unique_words2` function.
187 | - Most memory is used in the `count_unique_words1` function.
188 |
189 | Explanation:
190 | - The `count_unique_words2` function is the slowest because it **uses a list**
191 | to store unique words and checks if a word is already in the list before
192 | adding it.
193 | Checking whether a list contains an element might require traversing the
194 | whole list, which is an O(n) operation. As the list grows in size,
195 | the lookup time increases with the size of the list.
196 | - The `count_unique_words1` and `count_unique_words3` functions are faster
197 | because they **use a set** to store unique words.
198 | Checking whether a set contains an element is an O(1) operation.
199 | - The `count_unique_words1` function uses the most memory because it **creates
200 | a list of all words** in the text file and then **creates a set** from that
201 | list.
202 | - The `count_unique_words3` function uses less memory because it traverses
203 | the text file line by line instead of reading the whole file into memory.
204 |
205 | What we can learn from this exercise:
206 | - When processing large files, it can be good to read them line by line
207 | or in batches
208 | instead of reading the whole file into memory.
209 | - It is good to get an overview over standard data structures and their
210 | advantages and disadvantages (e.g. adding an element to a list is fast but checking whether
211 | it already contains the element can be slow).
212 | :::
213 | ::::
214 |
215 |
216 | ## Additional resources
217 |
218 | - [Python performance workshop (by ENCCS)](https://enccs.github.io/python-perf/profile/)
219 |
--------------------------------------------------------------------------------
/content/profiling/exercise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/profiling/exercise.png
--------------------------------------------------------------------------------
/content/profiling/exercise.py:
--------------------------------------------------------------------------------
1 | """
2 | The code below reads a text file and counts the number of unique words in it
3 | (case-insensitive).
4 | """
5 | import re
6 |
7 |
8 | def count_unique_words1(file_path: str) -> int:
9 | with open(file_path, "r", encoding="utf-8") as file:
10 | text = file.read()
11 | words = re.findall(r"\b\w+\b", text.lower())
12 | return len(set(words))
13 |
14 |
15 | def count_unique_words2(file_path: str) -> int:
16 | unique_words = []
17 | with open(file_path, "r", encoding="utf-8") as file:
18 | for line in file:
19 | words = re.findall(r"\b\w+\b", line.lower())
20 | for word in words:
21 | if word not in unique_words:
22 | unique_words.append(word)
23 | return len(unique_words)
24 |
25 |
26 | def count_unique_words3(file_path: str) -> int:
27 | unique_words = set()
28 | with open(file_path, "r", encoding="utf-8") as file:
29 | for line in file:
30 | words = re.findall(r"\b\w+\b", line.lower())
31 | for word in words:
32 | unique_words.add(word)
33 | return len(unique_words)
34 |
35 |
36 | def main():
37 | # book.txt is downloaded from https://www.gutenberg.org/cache/epub/2600/pg2600.txt
38 | _result = count_unique_words1("book.txt")
39 | _result = count_unique_words2("book.txt")
40 | _result = count_unique_words3("book.txt")
41 |
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------
/content/python.rst:
--------------------------------------------------------------------------------
1 | Introduction to Python
2 | ======================
3 |
4 | .. questions::
5 |
6 | - What are the basic blocks of Python language?
7 | - How are functions and classes defined in Python?
8 |
9 | .. objectives::
10 |
11 | - Get a *very* short introduction to Python types and syntax
12 | - Be able to follow the rest of the examples in the course, even if you don't understand everything perfectly.
13 |
14 | We expect everyone to be able to know the following basic material
15 | to follow the course (though it is not *everything* you need to
16 | know about Python).
17 |
18 | If you are not familiar with Python, here is a *very* short
19 | introduction. It will not be enough to do everything in this course,
20 | but you will be able to follow along a bit more than you would otherwise.
21 |
22 | .. seealso::
23 |
24 | This page contains an overview of the basics of Python. You can
25 | also refer to `This Python overview from a different lesson
26 | `__
27 | which is slightly more engaging.
28 |
29 |
30 |
31 | Scalars
32 | -------
33 |
34 | Scalar types, that is, single elements of various types:
35 |
36 | ::
37 |
38 | i = 42 # integer
39 | i = 2**77 # Integers have arbitrary precision
40 | g = 3.14 # floating point number
41 | c = 2 - 3j # Complex number
42 | b = True # boolean
43 | s = "Hello!" # String (Unicode)
44 | q = b'Hello' # bytes (8-bit values)
45 |
46 | Read more: :class:`int`, :class:`float`, :class:`complex`,
47 | :class:`bool`, :class:`str`, :class:`bytes`.
48 |
49 |
50 | Collections
51 | -----------
52 |
53 | Collections are data structures capable of storing multiple values.
54 |
55 | ::
56 |
57 | l = [1, 2, 3] # list
58 | l[1] # lists are indexed by int
59 | l[1] = True # list elements can be any type
60 | d = {"Janne": 123, "Richard": 456} # dictionary
61 | d["Janne"]
62 | s = set(("apple", "cherry", "banana", "apple")) # Set of unique values
63 | s
64 |
65 | Read more: :class:`list`, :class:`tuple`, :class:`dict`, :class:`set`.
66 |
67 |
68 | Control structures
69 | ------------------
70 |
71 | Python has the usual control structures, that is conditional
72 | statements and loops. For example, the :ref:`if` statement:
73 |
74 | ::
75 |
76 | x = 2
77 | if x == 3:
78 | print('x is 3')
79 | elif x == 2:
80 | print('x is 2')
81 | else:
82 | print('x is something else')
83 |
84 | :ref:`While ` loops loop until some condition is met:
85 |
86 | ::
87 |
88 | x = 0
89 | while x < 42:
90 | print('x is ', x)
91 | x += 0.2
92 |
93 | :ref:`For ` loops loop over some collection of values:
94 |
95 | ::
96 |
97 | xs = [1, 2, 3, 4]
98 | for x in xs:
99 | print(x)
100 |
101 |
102 | Often you want to loop over a sequence of integers, in that case the
103 | :class:`range` function is useful:
104 |
105 | ::
106 |
107 | for x in range(9):
108 | print(x)
109 |
110 | Another common need is to iterate over a collection, but at the same
111 | time also have an index number. For this there is the :func:`enumerate`
112 | function:
113 |
114 | ::
115 |
116 | xs = [1, 'hello', 'world']
117 | for ii, x in enumerate(xs):
118 | print(ii, x)
119 |
120 |
121 | Functions and classes
122 | ---------------------
123 |
124 | Python functions are defined by the :ref:`def` keyword. They take a
125 | number of arguments, and return a number of return values.
126 |
127 | ::
128 |
129 | def hello(name):
130 | """Say hello to the person given by the argument"""
131 | print('Hello', name)
132 | return 'Hello ' + name
133 |
134 | hello("Anne")
135 |
136 | Classes are defined by the :ref:`class` keyword:
137 |
138 | ::
139 |
140 | class Hello:
141 | def __init__(self, name):
142 | self._name = name
143 | def say(self):
144 | print('Hello', self._name)
145 |
146 | h = Hello("Richard")
147 | h.say()
148 |
149 |
150 | Python type system
151 | ------------------
152 |
153 | Python is strongly and dynamically typed.
154 |
155 | Strong here means, roughly, that it's not possible to circumvent the
156 | type system (at least, not easily, and not without invoking undefined
157 | behavior).
158 |
159 | ::
160 |
161 | x = 42
162 | type(x)
163 | x + "hello"
164 |
165 | Dynamic typing means that types are determined at runtime, and a
166 | variable can be redefined to refer to an instance of another type:
167 |
168 | ::
169 |
170 | x = 42
171 | x = "hello"
172 |
173 |
174 | *Jargon*: Types are associated with rvalues, not lvalues. In
175 | statically typed language, types are associated with lvalues, and are
176 | (typically) reified during compilation.
177 |
178 |
179 | ??? (lesson here)
180 |
181 |
182 |
183 | .. keypoints::
184 |
185 | - Python offers a nice set of basic types as many other programming languages
186 | - Python is strongly typed and dynamically typed
187 |
--------------------------------------------------------------------------------
/content/quick-reference.rst:
--------------------------------------------------------------------------------
1 | Quick reference
2 | ===============
3 |
4 | * `Pandas cheatsheet
5 | `__ (pandas.pydata.org)
6 |
7 | * `Pandas cheatsheet
8 | `__
9 | (via `Datacamp
10 | `__)
11 |
12 | * `Numpy cheatsheet
13 | `__
14 | (via `Datacamp
15 | `__)
16 |
17 | * `JupyterLab cheatsheet
18 | `__
19 |
20 | * `Matplotlib cheatsheet
21 | `__
22 | (via `Datacamp
23 | `__)
24 |
25 | * `Numpy, Pandas, Matplotlib, Scikit-learn all together
26 | `__
27 |
--------------------------------------------------------------------------------
/content/scipy.rst:
--------------------------------------------------------------------------------
1 | SciPy
2 | =====
3 |
4 | .. questions::
5 |
6 | - When you need more advanced mathematical functions, where do you
7 | look?
8 |
9 | .. objectives::
10 |
11 | - Understand that SciPy exists and what kinds of things it has.
12 | - Understand the importance of using external libraries and how to
13 | use them.
14 | - Understand the purpose of wrapping existing C/Fortran code.
15 | - Non-objective: know details of everything (or anything) in SciPy.
16 |
17 | .. seealso::
18 |
19 | * Main article: `SciPy documentation `__
20 |
21 |
22 |
23 | SciPy is a library that builds on top of NumPy. It contains a lot of
24 | interfaces to battle-tested numerical routines written in Fortran or
25 | C, as well as python implementations of many common algorithms.
26 |
27 |
28 |
29 | What's in SciPy?
30 | ----------------
31 |
32 | Briefly, it contains functionality for
33 |
34 | - Special functions (Bessel, Gamma, etc.)
35 | - Numerical integration
36 | - Optimization
37 | - Interpolation
38 | - Fast Fourier Transform (FFT)
39 | - Signal processing
40 | - Linear algebra (more complete than in NumPy)
41 | - Sparse matrices
42 | - Statistics
43 | - More I/O routine, e.g. Matrix Market format for sparse matrices,
44 | MATLAB files (.mat), etc.
45 |
46 | Many (most?) of these are not written specifically for SciPy, but use
47 | the best available open source C or Fortran libraries. Thus, you get
48 | the best of Python and the best of compiled languages.
49 |
50 | Most functions are documented ridiculously well from a scientific
51 | standpoint: you aren't just using some unknown function, but have a
52 | full scientific description and citation to the method and
53 | implementation.
54 |
55 |
56 |
57 | Exercises: use SciPy
58 | --------------------
59 |
60 | These exercises do not exist because *you* might need *these*
61 | functions someday. They are because *you* will need to *read
62 | documentation and understand documentation of an external library*
63 | eventually.
64 |
65 | 1: Numerical integration
66 | ~~~~~~~~~~~~~~~~~~~~~~~~
67 |
68 | .. challenge::
69 |
70 | Do the following exercise **or** read the documentation and
71 | understand the relevant functions of SciPy:
72 |
73 | Define a function of one variable and using
74 | `scipy.integrate.quad `__
75 | calculate the integral of your function in the
76 | interval ``[0.0, 4.0]``. Then vary the interval and also modify the function and check
77 | whether scipy can integrate it.
78 |
79 |
80 | .. solution::
81 |
82 | .. code-block:: python
83 |
84 | from scipy import integrate
85 |
86 | def myfunction(x):
87 | # you need to define result
88 | return result
89 |
90 | integral = integrate.quad(myfunction, 0.0, 4.0)
91 | print(integral)
92 |
93 | `quad
94 | `__
95 | uses the Fortran library QUADPACK, which one can assume is pretty
96 | good. You can also see a whole lot of scientific information about
97 | the function on the docs page - including the scientific names of
98 | the methods used.
99 |
100 |
101 |
102 | 2: Sparse matrices
103 | ~~~~~~~~~~~~~~~~~~
104 |
105 | .. challenge::
106 |
107 | Do the following exercise **or** read the documentation and
108 | understand the relevant functions of SciPy:
109 |
110 | Use the SciPy sparse matrix functionality to create a random sparse
111 | matrix with a probability of non-zero elements of 0.05 and size 10000
112 | x 10000. Then use the SciPy sparse linear algebra support to calculate
113 | the matrix-vector product of the sparse matrix you just created and a
114 | random vector. Use the %timeit macro to measure how long it
115 | takes. Does the optional ``format`` argument when you create the
116 | sparse matrix make a difference?
117 |
118 | Then, compare to how long it takes if you'd instead first convert the
119 | sparse matrix to a normal NumPy dense array, and use the NumPy ``dot``
120 | method to calculate the matrix-vector product.
121 |
122 | Can you figure out a quick rule of thumb when it's worth using a
123 | sparse matrix representation vs. a dense representation?
124 |
125 | .. solution::
126 |
127 | The basic code to do the test is:
128 |
129 | .. code-block::
130 |
131 | import numpy
132 | import scipy.sparse
133 |
134 | vector = numpy.random.random(10000)
135 | matrix = scipy.sparse.rand(10000, 10000, density=.05, format='csc')
136 |
137 | # We time this line
138 | matrix.dot(vector)
139 |
140 | From the top of the `sparse matrix module documentation
141 | `__, we can
142 | see there are a variety of different available sparse matrix types:
143 | ``bsr``, ``coo``, ``csr``, ``csc``, etc. These each represent a
144 | different way of storing the matrices.
145 |
146 | It seems that ``csr`` and ``csc`` are fairly fast. ``lil`` and
147 | ``dok`` are slow but it says that these are good for creating
148 | matrices with random insertions.
149 |
150 | For example, ``csr`` takes 7ms, ``lil`` 42ms, ``dok`` 1600ms, and
151 | converting to a non-sparse array ``matrix.toarray()`` and
152 | multiplying takes 64ms on one particular computer.
153 |
154 | This code allows us to time the performance at different
155 | densities. It seems that with the ``csr`` format, sparse is better
156 | below densities of around .4 to .5:
157 |
158 | .. code-block::
159 |
160 | for density in [.01, .05, .1, .2, .3, .4, .5]:
161 | matrix = scipy.sparse.rand(10000, 10000, density=density, format='csr')
162 | time_sparse = timeit.timeit('matrix.dot(vector)', number=10, globals=globals())
163 | matrix2 = matrix.toarray()
164 | time_full = timeit.timeit('matrix2.dot(vector)', number=10, globals=globals())
165 | print(f"{density} {time_sparse:.3f} {time_full:.3f}")
166 |
167 |
168 |
169 | See also
170 | --------
171 |
172 | * `SciPy general introduction `__
173 | * `SciPy documentation
174 | `__
175 |
176 |
177 |
178 | .. keypoints::
179 |
180 | - When you need advanced math or scientific functions, let's just
181 | admit it: you do a web search first.
182 | - But when you see something in SciPy come up, you know your
183 | solutions are in good hands.
184 |
--------------------------------------------------------------------------------
/content/work-with-data.rst:
--------------------------------------------------------------------------------
1 | Working with Data
2 | =================
3 |
4 | .. questions::
5 |
6 | - How do you store your data right now?
7 | - Are you doing data cleaning / preprocessing every time you load the data?
8 |
9 | .. objectives::
10 |
11 | - Learn benefits/drawbacks of common data formats.
12 | - Learn how you can read and write data in a variety of formats.
13 |
14 |
15 | .. figure:: https://imgs.xkcd.com/comics/norm_normal_file_format.png
16 |
17 | Source: `xkcd #2116 `__
18 |
19 |
20 | What is a data format?
21 | ----------------------
22 |
23 | Data format can mean two different things
24 |
25 | 1. `data structure `__ or how
26 | you're storing the data in memory while you're working on it;
27 | 2. `file format `__ or the way you're
28 | storing the data in the disk.
29 |
30 | Let's consider this randomly generated DataFrame with various columns::
31 |
32 | import pandas as pd
33 | import numpy as np
34 |
35 | n_rows = 100000
36 |
37 | dataset = pd.DataFrame(
38 | data={
39 | 'string': np.random.choice(('apple', 'banana', 'carrot'), size=n_rows),
40 | 'timestamp': pd.date_range("20130101", periods=n_rows, freq="s"),
41 | 'integer': np.random.choice(range(0,10), size=n_rows),
42 | 'float': np.random.uniform(size=n_rows),
43 | },
44 | )
45 |
46 | dataset.info()
47 |
48 | This DataFrame is structured in the *tidy data* format.
49 | In tidy data we have multiple columns of data that are collected in a Pandas
50 | DataFrame, where each column represents a value of a specific type.
51 |
52 | .. image:: img/pandas/tidy_data.png
53 |
54 | Let's consider another example::
55 |
56 | n = 1000
57 |
58 | data_array = np.random.uniform(size=(n,n))
59 | np.info(data_array)
60 |
61 |
62 | Here we have a different data structure: we have a two-dimensional array of numbers.
63 | This is different to a Pandas DataFrame as data is stored as one contiguous block
64 | instead of individual columns. This also means that the whole array must have one
65 | data type.
66 |
67 |
68 | .. figure:: https://github.com/elegant-scipy/elegant-scipy/raw/master/figures/NumPy_ndarrays_v2.png
69 |
70 | Source: `Elegant Scipy `__
71 |
72 | Now the question is: **Can the data be saved to the disk without changing the
73 | data format?**
74 |
75 | For this we need a **file format** that can easily store our **data structure**.
76 |
77 | .. admonition:: Data type vs. data structure vs. file format
78 | :class: dropdown
79 |
80 | - **Data type:** Type of a single piece of data (integer, string,
81 | float, ...).
82 | - **Data structure:** How the data is organized in memory (individual
83 | columns, 2D-array, nested dictionaries, ...).
84 | - **File format:** How the data is organized when it is saved to the disk
85 | (columns of strings, block of binary data, ...).
86 |
87 | For example, a black and white image stored as a .png-file (**file format**)
88 | might be stored in memory as an NxM array (**data structure**) of integers
89 | (**data type**) with each entry representing the color value of the pixel.
90 |
91 | What to look for in a file format?
92 | ----------------------------------
93 |
94 | When deciding which file format you should use for your program, you should
95 | remember the following:
96 |
97 | **There is no file format that is good for every use case.**
98 |
99 | and
100 |
101 | **It is very likely, that a good format already exists for your use case.**
102 |
103 | There are, indeed, various standard file formats for various use cases:
104 |
105 | .. figure:: https://imgs.xkcd.com/comics/standards.png
106 |
107 | Source: `xkcd #927 `__.
108 |
109 | Usually, you'll want to consider the following things when choosing a file
110 | format:
111 |
112 | 1. Is the file format good for my data structure (is it fast/space
113 | efficient/easy to use)?
114 | 2. Is everybody else / leading authorities in my field recommending a certain
115 | format?
116 | 3. Do I need a human-readable format or is it enough to work on it using code?
117 | 4. Do I want to archive / share the data or do I just want to store it while
118 | I'm working?
119 |
120 | Pandas supports
121 | `many file formats `__
122 | for tidy data and Numpy supports
123 | `some file formats `__
124 | for array data. However, there are many other file formats that can be used
125 | through other libraries.
126 |
127 | Table below describes some data formats:
128 |
129 | .. list-table::
130 | :header-rows: 1
131 |
132 | * - | Name:
133 | - | Human
134 | | readable:
135 | - | Space
136 | | efficiency:
137 | - | Arbitrary
138 | | data:
139 | - | Tidy
140 | | data:
141 | - | Array
142 | | data:
143 | - | Long term
144 | | storage/sharing:
145 |
146 | * - :ref:`Pickle `
147 | - ❌
148 | - 🟨
149 | - ✅
150 | - 🟨
151 | - 🟨
152 | - ❌
153 |
154 | * - :ref:`CSV `
155 | - ✅
156 | - ❌
157 | - ❌
158 | - ✅
159 | - 🟨
160 | - ✅
161 |
162 | * - :ref:`Feather `
163 | - ❌
164 | - ✅
165 | - ❌
166 | - ✅
167 | - ❌
168 | - ❌
169 |
170 | * - :ref:`Parquet `
171 | - ❌
172 | - ✅
173 | - 🟨
174 | - ✅
175 | - 🟨
176 | - ✅
177 |
178 | * - :ref:`npy `
179 | - ❌
180 | - 🟨
181 | - ❌
182 | - ❌
183 | - ✅
184 | - ❌
185 |
186 | * - :ref:`HDF5 `
187 | - ❌
188 | - ✅
189 | - ❌
190 | - ❌
191 | - ✅
192 | - ✅
193 |
194 | * - :ref:`NetCDF4 `
195 | - ❌
196 | - ✅
197 | - ❌
198 | - ❌
199 | - ✅
200 | - ✅
201 |
202 | * - :ref:`JSON `
203 | - ✅
204 | - ❌
205 | - 🟨
206 | - ❌
207 | - ❌
208 | - ✅
209 |
210 | * - :ref:`Excel `
211 | - ❌
212 | - ❌
213 | - ❌
214 | - 🟨
215 | - ❌
216 | - 🟨
217 |
218 | * - :ref:`Graph formats `
219 | - 🟨
220 | - 🟨
221 | - ❌
222 | - ❌
223 | - ❌
224 | - ✅
225 |
226 | .. important::
227 |
228 | - ✅ : Good
229 | - 🟨 : Ok / depends on a case
230 | - ❌ : Bad
231 |
232 |
233 | A more in-depth analysis of the file formats mentioned above, can be found
234 | :doc:`here `.
235 |
236 | Pros and cons
237 | -------------
238 |
239 | Let's have a general look at pros and cons of some types of file formats
240 |
241 | Binary File formats
242 | ~~~~~~~~~~~~~~~~~~~
243 |
244 | Good things
245 | +++++++++++
246 |
247 | - Can represent floating point numbers with full precision.
248 | - Can potentially save lots of space, especially, when storing numbers.
249 | - Data reading and writing is usually much faster than loading from text files,
250 | since the format contains information about the data structure, and thus
251 | memory allocation can be done more efficiently.
252 | - More explicit specification for storing multiple data sets and metadata in
253 | the same file.
254 | - Many binary formats allow for partial loading of the data.
255 | This makes it possible to work with datasets that are larger than your
256 | computer's memory.
257 |
258 | Bad things
259 | ++++++++++
260 |
261 | - Commonly requires the use of a specific library to read and write the data.
262 | - Library specific formats can be version dependent.
263 | - Not human readable.
264 | - Sharing can be more difficult (requires some expertise to be able to
265 | read the data).
266 | - Might require more documentation efforts.
267 |
268 | Textual formats
269 | ~~~~~~~~~~~~~~~
270 |
271 | Good things
272 | +++++++++++
273 |
274 | - Human readable.
275 | - Easy to check for (structural) errors.
276 | - Supported by many tools out of the box.
277 | - Easily shared.
278 |
279 | Bad things
280 | ++++++++++
281 |
282 | - Can be slow to read and write.
283 | - High potential to increase required disk space substantially (e.g. when
284 | storing floating point numbers as text).
285 | - Prone to losing precision when storing floating point numbers.
286 | - Multi-dimensional data can be hard to represent.
287 | - While the data format might be specified, the data structure might not be
288 | clear when starting to read the data.
289 |
290 | Further considerations
291 | ~~~~~~~~~~~~~~~~~~~~~~
292 |
293 | - The closer your stored data is to the code, the more likely it depends on the
294 | environment you are working in. If you ``pickle``, e.g. a generated model,
295 | you can only be sure that the model will work as intended if you load it in
296 | an environment that has the same versions of all libraries the model depends
297 | on.
298 |
299 |
300 | Exercise
301 | --------
302 |
303 | .. challenge::
304 |
305 | You have a model that you have been training for a while.
306 | Let's assume it's a relatively simple neural network (consisting of a
307 | network structure and its associated weights).
308 |
309 | Let's consider 2 scenarios
310 |
311 | A: You have a different project, that is supposed to take this model, and
312 | do some processing with it to determine its efficiency after different
313 | times of training.
314 |
315 | B: You want to publish the model and make it available to others.
316 |
317 | What are good options to store the model in each of these scenarios?
318 |
319 | .. solution::
320 |
321 | A:
322 |
323 | Some export into a binary format that can be easily read. E.g. pickle
324 | or a specific export function from the library you use.
325 |
326 | It also depends on whether you intend to make the intermediary steps
327 | available to others. If you do, you might also want to consider storing
328 | structure and weights separately or use a format specific for the
329 | type of model you are training to keep the data independent of the
330 | library.
331 |
332 | B:
333 |
334 | You might want to consider a more general format that is supported by
335 | many libraries, e.g. ONNX, or a format that is specifically designed
336 | for the type of model you are training.
337 |
338 | You might also want to consider additionally storing the model in a way
339 | that is easily readable by humans, to make it easier for others to
340 | understand the model.
341 |
342 |
343 | Case study: Converting untidy data to tidy data
344 | -----------------------------------------------
345 |
346 | Many data analysis tools (like Pandas) are designed to work with tidy data,
347 | but some data is not in a suitable format. What we have seen often in the
348 | past is people then not using the powerful tools, but write complicated
349 | scripts that extract individual pieces from the data each time they need
350 | to do a calculation.
351 |
352 | As an example, let's see how we can use country data from an example REST API
353 | endpoint (for more information on how to work with web APIs, see
354 | :doc:`this page `). Let's get the data with the following piece
355 | of code:
356 |
357 | .. code-block:: python
358 |
359 | import json
360 | import requests
361 |
362 | url = 'https://api.sampleapis.com/countries/countries'
363 |
364 | response = requests.get(url)
365 |
366 | countries_json = json.loads(response.content)
367 |
368 | Let's try to find the country with the largest population.
369 |
370 | An example of a "questionable" way of solving this problem would be something
371 | like the following piece of code that is written in pure Python:
372 |
373 | .. code-block:: python
374 |
375 | max_population = 0
376 | top_population_country = ''
377 |
378 | for country in countries_json:
379 | if country.get('population', 0) > max_population:
380 | top_population_country = country['name']
381 | max_population = country.get('population', 0)
382 |
383 | print(top_population_country)
384 |
385 | This is a very natural way of writing a solution for the problem, but it has
386 | major caveats:
387 |
388 | 1. We throw all of the other data out so we cannot answer any
389 | follow up questions.
390 | 2. For bigger data, this would be very slow and ineffective.
391 | 3. We have to write lots of code to do a simple thing.
392 |
393 | Another typical solution would be something like the following code,
394 | which picks some of the data and creates a Pandas dataframe out of it:
395 |
396 | .. code-block:: python
397 |
398 | import pandas as pd
399 |
400 | countries_list = []
401 |
402 | for country in countries_json:
403 | countries_list.append([country['name'], country.get('population',0)])
404 |
405 | countries_df = pd.DataFrame(countries_list, columns=['name', 'population'])
406 |
407 | print(countries_df.nlargest(1, 'population')['name'].values[0])
408 |
409 | This solution has many of the same problems as the previous one, but now we can
410 | use Pandas to do follow up analysis.
411 |
412 | A better solution would be to use Pandas'
413 | `pandas.DataFrame.from_dict `__
414 | or `pandas.json_normalize `__
415 | to read the full data in:
416 |
417 | .. code-block:: python
418 |
419 | countries_df = pd.DataFrame.from_dict(countries_json)
420 | print(countries_df.nlargest(1, 'population')['name'].values[0])
421 |
422 | countries_df = pd.json_normalize(countries_json)
423 | print(countries_df.nlargest(1, 'population')['name'].values[0])
424 |
425 | .. admonition:: Key points
426 |
427 | - Convert your data to a format where it is easy to do analysis on it.
428 | - Check the tools you're using if they have an existing feature that can help
429 | you read the data in.
430 |
431 |
432 | Things to remember
433 | ------------------
434 |
435 | 1. **There is no file format that is good for every use case.**
436 | 2. Usually, your research question determines which libraries you want to use
437 | to solve it. Similarly, the data format you have determines file format you
438 | want to use.
439 | 3. However, if you're using a previously existing framework or tools or you
440 | work in a specific field, you should prioritize using the formats that are
441 | used in said framework/tools/field.
442 | 4. When you're starting your project, it's a good idea to take your initial
443 | data, clean it, and store the results in a good binary format that works as
444 | a starting point for your future analysis. If you've written the cleaning
445 | procedure as a script, you can always reproduce it.
446 | 5. Throughout your work, you should use code to turn important data to
447 | a human-readable format (e.g. plots, averages,
448 | :meth:`pandas.DataFrame.head`), not to keep your full data in a
449 | human-readable format.
450 | 6. Once you've finished, you should store the data in a format that can be
451 | easily shared to other people.
452 |
453 |
454 | See also
455 | --------
456 |
457 | - `Pandas' IO tools `__
458 | - `Tidy data comparison notebook `__
459 | - `Array data comparison notebook `__
460 |
461 |
462 | .. keypoints::
463 |
464 | - Pandas can read and write a variety of data formats.
465 | - There are many good, standard formats, and you don't need to create your own.
466 | - There are plenty of other libraries dedicated to various formats.
467 |
--------------------------------------------------------------------------------
/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=content
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Sphinx
2 | sphinx_rtd_theme
3 | sphinx_rtd_theme_ext_color_contrast
4 | myst_nb
5 | sphinx-lesson
6 | https://github.com/aaltoscicomp/sphinx-aaltoscicomp-branding/archive/master.zip
7 | sphinxext-opengraph
8 | sphinx-thebe
9 |
10 | # for web-apis execution
11 | jsonlines
12 | bs4
13 |
--------------------------------------------------------------------------------
/resources/code/scripts/__pycache__/optionsparser.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/__pycache__/optionsparser.cpython-38.pyc
--------------------------------------------------------------------------------
/resources/code/scripts/__pycache__/weather_functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/__pycache__/weather_functions.cpython-38.pyc
--------------------------------------------------------------------------------
/resources/code/scripts/__pycache__/weather_functions_config.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/__pycache__/weather_functions_config.cpython-38.pyc
--------------------------------------------------------------------------------
/resources/code/scripts/optionsparser.py:
--------------------------------------------------------------------------------
1 | import yaml
2 |
def get_parameters(config_file, required, defaults):
    '''
    Read options from a YAML config file and return them as an object.

    Parameters:
        config_file: file name of the YAML file containing the options
        required: dict mapping required argument names to their expected types
        defaults: dict mapping optional argument names to their default values

    Returns: An object with fields named according to required and optional values.

    Raises:
        Exception: if a required argument is missing from the file, or if any
            argument present in the file has a different type than expected.
    '''
    # Context manager guarantees the file handle is closed (the previous
    # version left it open); safe_load avoids executing arbitrary YAML tags.
    with open(config_file) as f:
        options = yaml.safe_load(f)
    # Create an empty object that allows setting arbitrary attributes.
    parameters = type('Options', (), {})()
    # Required arguments: must be present and match the declared type.
    for arg in required:
        if arg not in options:
            raise Exception("Could not find required Argument " + arg + " aborting...")
        if not isinstance(options[arg], required[arg]):
            raise Exception("Expected input of type " + str(required[arg]) + " but got " + str(type(options[arg])))
        print("Setting " + arg + " to " + str(options[arg]))
        setattr(parameters, arg, options[arg])
    # Optional arguments: use the file's value when present (type-checked
    # against the type of the default), otherwise fall back to the default.
    for arg in defaults:
        if arg in options:
            if not isinstance(options[arg], type(defaults[arg])):
                # Wrong type for the parameter
                raise Exception("Expected input of type " + str(type(defaults[arg])) + " but got " + str(type(options[arg])))
            print("Setting " + arg + " to " + str(options[arg]))
            setattr(parameters, arg, options[arg])
        else:
            print(arg + " not found in option file. Using default: " + str(defaults[arg]))
            setattr(parameters, arg, defaults[arg])
    return parameters
37 |
38 |
39 |
--------------------------------------------------------------------------------
/resources/code/scripts/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/out.png
--------------------------------------------------------------------------------
/resources/code/scripts/rain_in_cairo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/rain_in_cairo.png
--------------------------------------------------------------------------------
/resources/code/scripts/weather.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/weather.png
--------------------------------------------------------------------------------
/resources/code/scripts/weather_functions.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 |
def preprocessing(dataset, start_date, end_date):
    """Parse the 'Local time' column as datetimes and keep only rows
    inside the [start_date, end_date] window (inclusive).

    Note: the 'Local time' column of the passed-in DataFrame is converted
    in place; the filtered DataFrame is returned.
    """
    # The dates in the file are day-first, which matplotlib does not
    # understand, so convert them to proper datetimes first.
    dataset['Local time'] = pd.to_datetime(dataset['Local time'], dayfirst=True)
    # Boolean mask selecting the requested date window.
    in_window = dataset['Local time'].between(start_date, end_date)
    return dataset[in_window]
10 |
11 |
def plot_data(dates, values):
    """Draw a line plot of values against dates and return (axes, figure)."""
    fig, ax = plt.subplots()
    ax.plot(dates, values)
    # Axis labels and title in a single call instead of three setters.
    ax.set(xlabel="Date of observation",
           ylabel="Temperature in Celsius",
           title="Temperature Observations")
    # Rotate/align the date tick labels so they do not overlap.
    fig.autofmt_xdate()
    return ax, fig
22 |
23 |
--------------------------------------------------------------------------------
/resources/code/scripts/weather_functions_config.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 |
def preprocessing(dataset, start_date, end_date):
    """Convert 'Local time' to datetime (day-first format, which
    matplotlib does not understand as strings) and return only the rows
    between start_date and end_date, inclusive.

    The conversion of the 'Local time' column mutates the caller's frame.
    """
    dataset['Local time'] = pd.to_datetime(dataset['Local time'], dayfirst=True)
    selected = dataset['Local time'].between(start_date, end_date)
    return dataset[selected]
10 |
11 |
def plot_data(dates, values, labels):
    """Line-plot values over dates, labelling the axes and title from the
    labels object (attributes xlabel, ylabel, title); return (axes, figure)."""
    fig, ax = plt.subplots()
    ax.plot(dates, values)
    # Apply the configured labels in one call.
    ax.set(xlabel=labels.xlabel, ylabel=labels.ylabel, title=labels.title)
    # Rotate/align the date tick labels so they read nicely.
    fig.autofmt_xdate()
    return ax, fig
22 |
23 |
--------------------------------------------------------------------------------
/resources/code/scripts/weather_observations.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# Download a weather dataset, select a date window, plot the temperature
# column and save the plot to 'weather.png'.

import pandas as pd
import weather_functions
# NOTE(review): weather_functions is imported but never used here — the
# preprocessing below duplicates weather_functions.preprocessing; confirm
# whether this script is meant to stay as the "before refactoring" version.

url = "https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/master/resources/data/scripts/weather_tapiola.csv"
weather = pd.read_csv(url,comment='#')

# define the start and end time for the plot (day-first DD/MM/YYYY)
start_date=pd.to_datetime('01/06/2021', dayfirst=True)
end_date=pd.to_datetime('01/10/2021', dayfirst=True)
# Preprocess the data: the dates in the file are day-first strings
weather['Local time'] = pd.to_datetime(weather['Local time'], dayfirst=True)
# select the data between start and end date (inclusive)
weather = weather[weather['Local time'].between(start_date,end_date)]

# Now, we have the data loaded, and adapted to our needs. So lets get plotting
import matplotlib.pyplot as plt
# start the figure.
fig, ax = plt.subplots()
ax.plot(weather['Local time'], weather['T'])
# label the axes
ax.set_xlabel("Date of observation")
ax.set_ylabel("Temperature in Celsius")
ax.set_title("Temperature Observations")
# adjust the date labels, so that they look nicer
fig.autofmt_xdate()

# save the figure
fig.savefig('weather.png')
--------------------------------------------------------------------------------
/resources/code/scripts/weather_observations_argparse.py:
--------------------------------------------------------------------------------
# Command-line version of the weather plotting script: reads a weather CSV,
# selects a date window and saves a temperature plot to the given output file.

import pandas as pd
import argparse

# Input/output files and the date window come from the command line;
# dates are given day-first (DD/MM/YYYY).
parser = argparse.ArgumentParser()
parser.add_argument("input", type=str, help="Input data file")
parser.add_argument("output", type=str, help="Output plot file")
parser.add_argument("-s", "--start", default="01/01/2019", type=str, help="Start date in DD/MM/YYYY format")
parser.add_argument("-e", "--end", default="16/10/2021", type=str, help="End date in DD/MM/YYYY format")

args = parser.parse_args()

# load the data
weather = pd.read_csv(args.input,comment='#')

# define the start and end time for the plot
start_date=pd.to_datetime(args.start, dayfirst=True)
end_date=pd.to_datetime(args.end, dayfirst=True)

# preprocess the data: the dates in the file are day-first strings
weather['Local time'] = pd.to_datetime(weather['Local time'], dayfirst=True)
# select the data between start and end date (inclusive)
weather = weather[weather['Local time'].between(start_date,end_date)]

# plot the data
import matplotlib.pyplot as plt
# start the figure.
fig, ax = plt.subplots()
ax.plot(weather['Local time'], weather['T'])
# label the axes
ax.set_xlabel("Date of observation")
ax.set_ylabel("Temperature in Celsius")
ax.set_title("Temperature Observations")
# adjust the date labels, so that they look nicer
fig.autofmt_xdate()

# save the figure
fig.savefig(args.output)
--------------------------------------------------------------------------------
/resources/code/scripts/weather_observations_config.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# Plot a column of a weather dataset based on options given in a YAML
# configuration file (parsed by optionsparser.get_parameters).

import pandas as pd
from optionsparser import get_parameters
import argparse

# Lets start reading our config file. We'll use argparse to get the config file.
parser = argparse.ArgumentParser()
parser.add_argument('input', type=str,
                    help="Config File name ")
args = parser.parse_args()

# Set optional parameters with default values and required parameter values with their type
defaults = {
    "xlabel" : "Date of observation",
    "title" : "Weather Observations",
    "start" : "01/06/2021",
    "end" : "01/10/2021",
    "output" : "weather.png",
    "ylabel" : "Temperature in Celsius",
    "data_column" : "T",
}

required = {
    "input" : str
}

# now, parse the config file
parameters = get_parameters(args.input, required, defaults)

# load the data
weather = pd.read_csv(parameters.input, comment='#')

# obtain start and end date (day-first DD/MM/YYYY strings)
start_date = pd.to_datetime(parameters.start, dayfirst=True)
end_date = pd.to_datetime(parameters.end, dayfirst=True)

# Data preprocessing: the dates in the file are day-first strings
weather['Local time'] = pd.to_datetime(weather['Local time'], dayfirst=True)
# select the data inside the requested window (inclusive)
weather = weather[weather['Local time'].between(start_date, end_date)]

# Data plotting
import matplotlib.pyplot as plt
# start the figure.
fig, ax = plt.subplots()
# BUGFIX: plot the configured data column and apply the configured labels.
# Previously the column 'T' and the temperature labels were hard-coded, so
# config options like data_column/xlabel/ylabel/title (e.g. data_column: RRR
# in weather_options.yml) were silently ignored.
ax.plot(weather['Local time'], weather[parameters.data_column])
ax.set_xlabel(parameters.xlabel)
ax.set_ylabel(parameters.ylabel)
ax.set_title(parameters.title)
# adjust the date labels, so that they look nicer
fig.autofmt_xdate()

# save the figure
fig.savefig(parameters.output)
59 |
--------------------------------------------------------------------------------
/resources/code/scripts/weather_options.yml:
--------------------------------------------------------------------------------
1 | input: https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/master/resources/data/scripts/weather_cairo.csv  # CSV data source (URL or local path)
2 | output: rain_in_cairo.png  # file name the figure is saved to
3 | xlabel: Days in June
4 | ylabel: Rainfall in mm
5 | title: Rainfall in Cairo
6 | data_column: RRR  # CSV column to plot (rainfall here, per the labels — confirm against the CSV header)
7 | start: 01/06/2021  # first date to include; parsed day-first (DD/MM/YYYY)
8 | end: 30/06/2021  # last date to include; parsed day-first (DD/MM/YYYY)
9 |
--------------------------------------------------------------------------------
/resources/data/plotting/README.md:
--------------------------------------------------------------------------------
1 | Data obtained from [Norsk
2 | Klimaservicesenter](https://seklima.met.no/observations/), Meteorologisk
3 | institutt (MET) (CC BY 4.0).
4 |
5 | The following changes were applied to the data to make it easier to work with:
6 | - The decimal separator was changed from a comma to a period.
7 | - The column separator was changed from a semicolon to a comma.
8 | - Missing values were replaced with zeros instead of a dash.
9 |
--------------------------------------------------------------------------------
/resources/data/plotting/exercise-2.csv:
--------------------------------------------------------------------------------
1 | xval,yval
2 | 01,7.7
3 | 02,6.6
4 | 03,4.5
5 | 04,9.8
6 | 05,17.7
7 | 06,25.4
8 | 07,26.7
9 | 08,25.1
10 | 09,19.3
11 | 10,9.8
12 |
--------------------------------------------------------------------------------
/resources/data/plotting/oslo-monthly.csv:
--------------------------------------------------------------------------------
1 | name,station,date,max temperature,precipitation,min temperature
2 | Oslo - Blindern,SN18700,10.2022,17.1,82.9,-0.4
3 | Oslo - Blindern,SN18700,11.2022,15.1,83.4,-2.1
4 | Oslo - Blindern,SN18700,12.2022,6.5,85.5,-14.6
5 | Oslo - Blindern,SN18700,01.2023,7.2,100.5,-13.4
6 | Oslo - Blindern,SN18700,02.2023,10.2,46,-9.4
7 | Oslo - Blindern,SN18700,03.2023,9.8,72.6,-12.6
8 | Oslo - Blindern,SN18700,04.2023,19.8,99.7,-4.7
9 | Oslo - Blindern,SN18700,05.2023,24.2,17,-0.8
10 | Oslo - Blindern,SN18700,06.2023,31.8,39.9,4.6
11 | Oslo - Blindern,SN18700,07.2023,28.4,146.9,8.6
12 | Oslo - Blindern,SN18700,08.2023,24.5,259.8,9.8
13 | Oslo - Blindern,SN18700,09.2023,25.1,105.8,5.3
14 | Oslo - Blindern,SN18700,10.2023,17.1,7.3,-0.7
15 |
--------------------------------------------------------------------------------
/resources/data/plotting/tromso-monthly.csv:
--------------------------------------------------------------------------------
1 | name,station,date,max temperature,precipitation,min temperature
2 | Tromso - Langnes,SN90490,10.2022,10.7,187,-4.2
3 | Tromso - Langnes,SN90490,11.2022,8.5,41.5,-7
4 | Tromso - Langnes,SN90490,12.2022,5.6,88.8,-11.7
5 | Tromso - Langnes,SN90490,01.2023,7.7,111.4,-13.9
6 | Tromso - Langnes,SN90490,02.2023,6.6,171.3,-10.7
7 | Tromso - Langnes,SN90490,03.2023,4.5,157,-15.1
8 | Tromso - Langnes,SN90490,04.2023,9.8,85,-7.1
9 | Tromso - Langnes,SN90490,05.2023,17.7,101.2,-4.6
10 | Tromso - Langnes,SN90490,06.2023,25.4,43.4,-0.4
11 | Tromso - Langnes,SN90490,07.2023,26.7,14,6
12 | Tromso - Langnes,SN90490,08.2023,25.1,43.4,5.4
13 | Tromso - Langnes,SN90490,09.2023,19.3,163.7,0.3
14 | Tromso - Langnes,SN90490,10.2023,9.8,64.8,-0.6
15 |
--------------------------------------------------------------------------------
/software/environment.yml:
--------------------------------------------------------------------------------
 1 | name: python-for-scicomp
 2 | channels:
 3 | - conda-forge
 4 | dependencies:
 5 | - jsonlines
 6 | - jupyterlab  # Jupyter: lab interface
 7 | - notebook  # Jupyter: classic notebook interface
 8 | - ipywidgets  # Jupyter: interactive widgets
 9 | - requests
10 | - numpy
11 | - scipy
12 | - matplotlib
13 | - seaborn
14 | - mpi4py  # parallel episode (MPI)
15 | - dask
16 | - setuptools  # packaging episode
17 | - twine  # packaging: uploading distributions
18 | - poetry
19 | - flit
20 | - scikit-learn
21 | - scalene  # profiling episode
22 | - ruff
23 | - altair-all  # plotting: Vega-Altair episode
24 | - vega_datasets
25 | - xarray
26 | - netcdf4
27 | - yfinance
28 | - pip
29 | - pip:
30 |   - pythia_datasets  # installed via pip (presumably not on conda-forge — confirm)
31 |
--------------------------------------------------------------------------------