├── .github └── workflows │ └── sphinx.yml ├── .gitignore ├── LICENSE ├── Makefile ├── content ├── a_list.dot ├── a_list.svg ├── binder.rst ├── conf.py ├── data-formats.rst ├── dependencies.rst ├── exercises.md ├── format_comparison_array.csv ├── format_comparison_tidy.csv ├── guide.rst ├── img │ ├── binder │ │ ├── binder.jpg │ │ └── python_unmasked.jpg │ ├── installation │ │ ├── anaconda-navigator-jupyterlab.png │ │ ├── anaconda-prompt.png │ │ ├── jupyterlab-notebook.png │ │ └── jupyterlab-terminal.png │ ├── jupyter │ │ ├── main-ui.png │ │ └── notebook-ui.png │ ├── numpy-advanced │ │ ├── 01_memory_layout.svg │ │ └── 02_views.svg │ ├── pandas │ │ ├── 01_table_dataframe.svg │ │ └── tidy_data.png │ └── xarray │ │ ├── xarray_1d_plot.png │ │ ├── xarray_2d_plot.png │ │ ├── xarray_dataset_image.png │ │ └── xarray_hist.png ├── index.rst ├── installation.rst ├── jupyter.ipynb ├── libraries.rst ├── ndarray.dot ├── ndarray.svg ├── numpy-advanced.rst ├── numpy.rst ├── packaging-example-project │ ├── calculator │ │ ├── __init__.py │ │ ├── adding.py │ │ ├── integrating.py │ │ └── subtracting.py │ ├── pyproject.toml │ ├── test.py │ └── test_editable.py ├── packaging.rst ├── pandas.rst ├── parallel-pi-multiprocessing.ipynb ├── parallel.rst ├── plotting-matplotlib.md ├── plotting-matplotlib │ ├── customizing │ │ ├── gapminder-larger-font.png │ │ ├── gapminder-linear.png │ │ └── gapminder-log.png │ └── first-plot │ │ ├── exercise.png │ │ └── getting-started.png ├── plotting-vega-altair.md ├── plotting-vega-altair │ ├── precipitation-on-top-yearmonth.svg │ ├── precipitation-on-top.svg │ ├── precipitation-side.svg │ ├── precipitation-stacked-x.svg │ ├── precipitation-stacked-y.svg │ ├── snow-depth-circles.svg │ ├── snow-depth-color.svg │ ├── snow-depth-plasma.svg │ ├── snow-depth.svg │ ├── temperature-ranges-combined.svg │ └── temperature-ranges-side.svg ├── productivity.md ├── productivity │ ├── chatgpt.png │ └── code-completion.gif ├── profiling.md ├── profiling │ ├── exercise.png 
│ └── exercise.py ├── python.rst ├── quick-reference.rst ├── scipy.rst ├── scripts.rst ├── web-apis.ipynb ├── work-with-data.rst └── xarray.rst ├── extras ├── data-formats-comparison-array.ipynb └── data-formats-comparison-tidy.ipynb ├── make.bat ├── requirements.txt ├── resources ├── code │ └── scripts │ │ ├── __pycache__ │ │ ├── optionsparser.cpython-38.pyc │ │ ├── weather_functions.cpython-38.pyc │ │ └── weather_functions_config.cpython-38.pyc │ │ ├── optionsparser.py │ │ ├── out.png │ │ ├── rain_in_cairo.png │ │ ├── weather.png │ │ ├── weather_functions.py │ │ ├── weather_functions_config.py │ │ ├── weather_observations.ipynb │ │ ├── weather_observations.py │ │ ├── weather_observations_argparse.py │ │ ├── weather_observations_config.py │ │ └── weather_options.yml ├── data │ ├── laureate.csv │ ├── plotting │ │ ├── README.md │ │ ├── exercise-2.csv │ │ ├── oslo-daily.csv │ │ ├── oslo-monthly.csv │ │ ├── tromso-daily.csv │ │ └── tromso-monthly.csv │ └── scripts │ │ ├── weather_cairo.csv │ │ └── weather_tapiola.csv └── notebooks │ ├── plotting-exercise-2.ipynb │ └── plotting.ipynb └── software └── environment.yml /.github/workflows/sphinx.yml: -------------------------------------------------------------------------------- 1 | # Deploy Sphinx. This could be shorter, but we also do some extra 2 | # stuff. 3 | # 4 | # License: CC-0. This is the canonical location of this file, which 5 | # you may want to link to anyway: 6 | # https://github.com/coderefinery/sphinx-lesson-template/blob/main/.github/workflows/sphinx.yml 7 | # https://raw.githubusercontent.com/coderefinery/sphinx-lesson-template/main/.github/workflows/sphinx.yml 8 | 9 | 10 | name: sphinx 11 | on: [push, pull_request] 12 | 13 | env: 14 | DEFAULT_BRANCH: "master" 15 | # If these SPHINXOPTS are enabled, then be strict about the 16 | # builds and fail on any warnings. 
17 | #SPHINXOPTS: "-W --keep-going -T" 18 | GENERATE_PDF: true # to enable, must be 'true' lowercase 19 | GENERATE_SINGLEHTML: true # to enable, must be 'true' lowercase 20 | PDF_FILENAME: lesson.pdf 21 | MULTIBRANCH: true # to enable, must be 'true' lowercase 22 | 23 | 24 | jobs: 25 | build: 26 | name: Build 27 | runs-on: ubuntu-latest 28 | permissions: 29 | contents: read 30 | 31 | steps: 32 | # https://github.com/marketplace/actions/checkout 33 | - uses: actions/checkout@v4 34 | with: 35 | fetch-depth: 0 36 | lfs: true 37 | 38 | # https://github.com/marketplace/actions/setup-python 39 | # ^-- This gives info on matrix testing. 40 | - name: Install Python 41 | uses: actions/setup-python@v4 42 | with: 43 | python-version: '3.11' 44 | cache: 'pip' 45 | 46 | # https://docs.github.com/en/actions/guides/building-and-testing-python#installing-dependencies 47 | # ^-- This gives info on installing dependencies with pip 48 | - name: Install dependencies 49 | run: | 50 | python -m pip install --upgrade pip 51 | pip install -r requirements.txt 52 | 53 | # Debug 54 | - name: Debugging information 55 | env: 56 | ref: ${{github.ref}} 57 | event_name: ${{github.event_name}} 58 | head_ref: ${{github.head_ref}} 59 | base_ref: ${{github.base_ref}} 60 | run: | 61 | echo "github.ref: ${ref}" 62 | echo "github.event_name: ${event_name}" 63 | echo "github.head_ref: ${head_ref}" 64 | echo "github.base_ref: ${base_ref}" 65 | echo "GENERATE_PDF: ${GENERATE_PDF}" 66 | echo "GENERATE_SINGLEHTML: ${GENERATE_SINGLEHTML}" 67 | set -x 68 | git rev-parse --abbrev-ref HEAD 69 | git branch 70 | git branch -a 71 | git remote -v 72 | python -V 73 | pip list --not-required 74 | pip list 75 | 76 | 77 | # Build 78 | - uses: ammaraskar/sphinx-problem-matcher@master 79 | - name: Build Sphinx docs (dirhtml) 80 | # SPHINXOPTS used via environment variables 81 | run: | 82 | make dirhtml 83 | # This fixes broken copy button icons, as explained in 84 | # 
https://github.com/coderefinery/sphinx-lesson/issues/50 85 | # https://github.com/executablebooks/sphinx-copybutton/issues/110 86 | # This can be removed once these PRs are accepted (but the 87 | # fixes also need to propagate to other themes): 88 | # https://github.com/sphinx-doc/sphinx/pull/8524 89 | # https://github.com/readthedocs/sphinx_rtd_theme/pull/1025 90 | sed -i 's/url_root="#"/url_root=""/' _build/dirhtml/index.html || true 91 | 92 | # singlehtml 93 | - name: Generate singlehtml 94 | if: ${{ env.GENERATE_SINGLEHTML == 'true' }} 95 | run: | 96 | make singlehtml 97 | mv _build/singlehtml/ _build/dirhtml/singlehtml/ 98 | 99 | # PDF if requested 100 | - name: Generate PDF 101 | if: ${{ env.GENERATE_PDF == 'true' }} 102 | run: | 103 | pip install https://github.com/rkdarst/sphinx_pyppeteer_builder/archive/refs/heads/main.zip 104 | make pyppeteer 105 | mv _build/pyppeteer/*.pdf _build/dirhtml/${PDF_FILENAME} 106 | 107 | # Stage all deployed assets in _gh-pages/ for simplicity, and to 108 | # prepare to do a multi-branch deployment. 109 | - name: Copy deployment data to _gh-pages/ 110 | if: ${{ github.event_name == 'push' }} 111 | run: 112 | rsync -a _build/dirhtml/ _gh-pages/ 113 | 114 | # Use gh-pages-multibranch to multiplex different branches into 115 | # one deployment. See 116 | # https://github.com/coderefinery/gh-pages-multibranch 117 | - name: gh-pages multibranch 118 | uses: coderefinery/gh-pages-multibranch@main 119 | if: ${{ github.event_name == 'push' && env.MULTIBRANCH == 'true' }} 120 | with: 121 | directory: _gh-pages/ 122 | default_branch: ${{ env.DEFAULT_BRANCH }} 123 | publish_branch: gh-pages 124 | 125 | # Add the .nojekyll file 126 | - name: nojekyll 127 | if: ${{ github.event_name == 'push' }} 128 | run: | 129 | touch _gh-pages/.nojekyll 130 | 131 | # Save artifact for the next step. 
132 | - uses: actions/upload-artifact@v4 133 | if: ${{ github.event_name == 'push' }} 134 | with: 135 | name: gh-pages-build 136 | path: _gh-pages/ 137 | 138 | # Deploy in a separate job so that write permissions are restricted 139 | # to the minimum steps. 140 | deploy: 141 | name: Deploy 142 | runs-on: ubuntu-latest 143 | needs: build 144 | # This if can't use the env context - find better way later. 145 | if: ${{ github.event_name == 'push' }} 146 | permissions: 147 | contents: write 148 | 149 | steps: 150 | - uses: actions/download-artifact@v4 151 | if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }} 152 | with: 153 | name: gh-pages-build 154 | path: _gh-pages/ 155 | 156 | # As of 2023, we could publish to pages via a Deployment. This 157 | # isn't done yet to give it time to stabilize (out of beta), and 158 | # also having a gh-pages branch to check out is rather 159 | # convenient. 160 | 161 | # Deploy 162 | # https://github.com/peaceiris/actions-gh-pages 163 | - name: Deploy 164 | uses: peaceiris/actions-gh-pages@v3 165 | if: ${{ github.event_name == 'push' && ( env.MULTIBRANCH == 'true' || github.ref == format('refs/heads/{0}', env.DEFAULT_BRANCH )) }} 166 | with: 167 | publish_branch: gh-pages 168 | github_token: ${{ secrets.GITHUB_TOKEN }} 169 | publish_dir: _gh-pages/ 170 | force_orphan: true 171 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /venv 3 | .ipynb_checkpoints/ 4 | .vscode 5 | catfacts.jsonl 6 | jupyter_execute/ 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution 4.0 -------------------------------------------------------------------------------- /Makefile: 
-------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = content 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | # Live reload site documents for local development 23 | livehtml: 24 | sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /content/a_list.dot: -------------------------------------------------------------------------------- 1 | strict digraph a_list { 2 | graph [compound=true]; 3 | 4 | node [style = filled, color=cyan]; 5 | 6 | a_list [label="Variable a_list (lvalue)", color=gold]; 7 | aobj [label="PyObject a_list"]; 8 | one [label="PyObject 1"]; 9 | hello [label="PyObject hello"]; 10 | oneptwo [label="PyObject 1.2"]; 11 | 12 | a_list -> aobj; 13 | 14 | 15 | 16 | subgraph cluster_adata { 17 | label = "Data array for a_list PyObject"; 18 | color = aquamarine; 19 | style = filled; 20 | 21 | adata_0 [label="element [0]"]; 22 | adata_1 [label="element [1]"]; 23 | adata_2 [label="element [2]"]; 24 | } 25 | 26 | adata_0 -> one; 27 | adata_1 -> hello; 28 | adata_2 -> oneptwo; 29 | 30 | 31 | aobj -> adata_1 [lhead=cluster_adata]; 32 | 33 | } 34 | -------------------------------------------------------------------------------- /content/a_list.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | a_list 11 | 12 | cluster_adata 13 | 14 | Data array for a_list PyObject 15 | 16 | 17 | a_list 18 | 19 | Variable a_list (lvalue) 20 | 21 | 22 | aobj 23 | 24 | PyObject a_list 25 | 26 | 27 | a_list->aobj 28 | 29 | 30 | 31 | 32 | adata_1 33 | 34 | element [1] 35 | 36 | 37 | aobj->adata_1 38 | 39 | 40 | 41 | 42 | one 43 | 44 | PyObject 1 45 | 46 | 47 | hello 48 | 49 | PyObject hello 50 | 51 | 52 | oneptwo 53 | 54 | PyObject 1.2 55 | 56 | 57 | adata_0 58 | 59 | element [0] 60 | 61 | 62 | adata_0->one 63 | 64 | 65 | 66 | 67 | adata_1->hello 68 | 69 | 70 | 71 | 72 | adata_2 73 | 74 | element [2] 75 | 76 | 77 | adata_2->oneptwo 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /content/binder.rst: -------------------------------------------------------------------------------- 1 | Binder 2 | ====== 3 | 4 | .. questions:: 5 | 6 | - Why sharing code alone may not be sufficient. 7 | - How to share a computational environment? 8 | - What is Binder? 9 | - How to binderize my Python repository? 10 | - How to publish my Python repository? 11 | 12 | .. objectives:: 13 | 14 | - Learn about reproducible computational environments. 15 | - Learn to create and share custom computing environments with Binder. 16 | - Learn to get a DOI from Zenodo for a repository. 17 | 18 | 19 | Why is it sometimes not enough to share your code? 20 | -------------------------------------------------- 21 | 22 | .. image:: img/binder/python_unmasked.jpg 23 | 24 | 25 | Exercise 1 26 | ~~~~~~~~~~ 27 | 28 | .. challenge:: Binder-1: Discuss better strategies than only code sharing (10 min) 29 | 30 | Lea is a PhD student in computational biology and after 2 years of intensive 31 | work, she is finally ready to publish her first paper. 
The code she has used 32 | for analyzing her data is available on GitHub but her supervisor who is an 33 | advocate of open science told her that sharing code is not sufficient. 34 | 35 | **Why is it possibly not enough to share "just" your code? 36 | What problems can you anticipate 2-5 years from now?** 37 | 38 | We form small groups (4-5 persons) and discuss in groups. If the workshop is 39 | online, each group will join a breakout room. 40 | If joining a group is not possible or practical, we use the shared document 41 | to discuss this collaboratively. 42 | 43 | Each group writes a summary (bullet points) of the discussion in the workshop 44 | shared document (the link will be provided by your instructors). 45 | 46 | 47 | Sharing a computing environment with Binder 48 | ------------------------------------------- 49 | 50 | `Binder `__ allows you to create 51 | custom computing environments that can be shared and used by many remote users. 52 | It uses `repo2docker `__ to 53 | create a container image (`docker `__ image) of a 54 | project using information contained in included configuration files. 55 | 56 | Repo2docker is a standalone package that you can install locally on your laptop 57 | but an `online Binder `__ service is freely available. 58 | This is what we will be using in the tutorial. 59 | 60 | The main objective of this exercise is to learn to fork a repository and add a 61 | requirements file to share the computational environment with Binder. 62 | 63 | .. image:: https://opendreamkit.org/public/images/use-cases/reproducible_logbook.png 64 | 65 | Credit: `Juliette Taka, Logilab and the OpenDreamKit project (2017) `_ 66 | 67 | 68 | Binder exercise/demo 69 | ~~~~~~~~~~~~~~~~~~~~ 70 | 71 | In an earlier episode (Data visualization with Matplotlib) we have created this notebook: 72 | 73 | .. 
code-block:: python 74 | 75 | import pandas as pd 76 | import matplotlib.pyplot as plt 77 | 78 | url = "https://raw.githubusercontent.com/plotly/datasets/master/gapminder_with_codes.csv" 79 | data = pd.read_csv(url) 80 | data_2007 = data[data["year"] == 2007] 81 | 82 | fig, ax = plt.subplots() 83 | 84 | ax.scatter(x=data_2007["gdpPercap"], y=data_2007["lifeExp"], alpha=0.5) 85 | 86 | ax.set_xscale("log") 87 | 88 | ax.set_xlabel("GDP (USD) per capita") 89 | ax.set_ylabel("life expectancy (years)") 90 | 91 | We will now first share it via `GitHub `__ "statically", 92 | then using `Binder `__. 93 | 94 | .. challenge:: Binder-2: Exercise/demo: Make your notebooks reproducible by anyone (15 min) 95 | 96 | Instructor demonstrates this. **This exercise (and all following) 97 | requires git/GitHub knowledge and accounts, which wasn't a 98 | prerequisite of this course. Thus, this is a demo (and might even 99 | be too fast for you to type-along). Watch the video if you 100 | are reading this later on**: 101 | 102 | - Creates a GitHub repository 103 | - Uploads the notebook file 104 | - Then we look at the statically rendered version of the notebook on GitHub 105 | - Create a ``requirements.txt`` file which contains: 106 | 107 | .. code-block:: none 108 | 109 | pandas==1.2.3 110 | matplotlib==3.4.2 111 | 112 | - Commit and push also this file to your notebook repository. 113 | - Visit https://mybinder.org and copy paste the code under "Copy the text below ..." into your `README.md`: 114 | 115 | .. image:: img/binder/binder.jpg 116 | 117 | - Check that your notebook repository now has a "launch binder" 118 | badge in your `README.md` file on GitHub. 119 | - Try clicking the button and see how your repository is launched 120 | on Binder (can take a minute or two). Your notebooks can now be explored and executed in the cloud. 121 | - Enjoy being fully reproducible! 122 | 123 | 124 | How can I get a DOI from Zenodo? 
125 | --------------------------------- 126 | 127 | `Zenodo `__ is a general purpose open-access 128 | repository built and operated by `CERN `__ and `OpenAIRE 129 | `__ that allows researchers to archive and get a 130 | `Digital Object Identifier (DOI) `__ to data that they 131 | share. 132 | 133 | .. challenge:: Binder-3: Link a GitHub repository with Zenodo (optional) 134 | 135 | **Everything you deposit on Zenodo is meant to be kept (long-term archive). 136 | Therefore we recommend practicing with the Zenodo "sandbox" (practice/test area) 137 | instead:** https://sandbox.zenodo.org 138 | 139 | 1. **Link GitHub with Zenodo**: 140 | 141 | - Go to https://sandbox.zenodo.org (or to https://zenodo.org for the real upload later, after practicing). 142 | - Log in to Zenodo with your GitHub account. Be aware that you may need to 143 | authorize the Zenodo application (Zenodo will redirect you back to GitHub for 144 | Authorization). 145 | - Choose the repository webhooks options. 146 | - From the drop-down menu next to your email address at the top of the page, select GitHub. 147 | - You will be presented with a list of all your GitHub repositories. 148 | 149 | 2. **Archiving a repo**: 150 | 151 | - Select a repository you want to archive on Zenodo. 152 | - Toggle the "on" button next to the repository you need to archive. 153 | - Click on the Repo that you want to reserve. 154 | - Click on the Create release button at the top of the page. Zenodo will redirect you back to GitHub’s repo page to generate a release. 155 | 156 | 3. **Trigger Zenodo to Archive your repository** 157 | 158 | - Go to GitHub and create a release. Zenodo will automatically download a .zip-ball of each new release and register a DOI. 159 | - If this is the first release of your code then you should give it a 160 | version number of v1.0.0. Add a description for your release then click the 161 | Publish release button. 
162 | - Zenodo takes an archive of your GitHub repository each time you create a new Release. 163 | 164 | 4. **To ensure that everything is working**: 165 | 166 | - Go to https://zenodo.org/account/settings/github/ (or the corresponding 167 | sandbox at https://sandbox.zenodo.org/account/settings/github/), or the 168 | Upload page (https://zenodo.org/deposit), you will find your repo is 169 | listed. 170 | - Click on the repo; Zenodo will redirect you to a page that contains a DOI for your repo with the information that you added to the repo. 171 | - You can edit the archive on Zenodo and/or publish a new version of your software. 172 | - It is recommended that you add a description for your repo and fill in other metadata in the edit page. Instead of editing metadata 173 | manually, you can also add a ``.zenodo.json`` or a ``CITATION.cff`` file to your repo and Zenodo will infer the metadata from this file. 174 | - Your code is now published in a public GitHub repository and archived on Zenodo. 175 | - Update the README file in your repository with the newly created Zenodo badge. 176 | 177 | 178 | Create a Binder link for your Zenodo DOI 179 | ---------------------------------------- 180 | 181 | Rather than specifying a GitHub repository when launching Binder, you can instead use a Zenodo DOI. 182 | 183 | .. challenge:: Binder-4: Link Binder with Zenodo (10 min) 184 | 185 | We will be using an existing Zenodo DOI `10.5281/zenodo.3886864 `_ to start Binder: 186 | 187 | - Go to `https://mybinder.org `__ and fill in the information using the Zenodo DOI (as shown in the animation below): 188 | 189 | .. image:: https://miro.medium.com/max/1050/1*xOABVY2hNtVmjV5-LXreFw.gif 190 | 191 | - You can also get a Binder badge and update the README file in the 192 | repository. It is good practice to add both the Zenodo badge and the 193 | corresponding Binder badge. 194 | 195 | .. 
keypoints:: 196 | 197 | - It is easy to share reproducible computational environments. 198 | - Binder provides a way for anyone to test and run code - without 199 | you needing to set up a dedicated server for it. 200 | - Zenodo provides permanent archives and a DOI. 201 | -------------------------------------------------------------------------------- /content/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Python for Scientific Computing' 21 | copyright = '2020-2024, The contributors' 22 | author = 'The contributors' 23 | github_user = 'AaltoSciComp' 24 | github_repo_name = 'python-for-scicomp' # auto-detected from dirname if blank 25 | github_version = 'master/content/' # with trailing slash 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | highlight_language = 'python3' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 
35 | extensions = [ 36 | 'sphinx_lesson', 37 | 'sphinx_rtd_theme_ext_color_contrast', 38 | 'sphinx.ext.todo', 39 | 'sphinx.ext.intersphinx', 40 | 'sphinx.ext.mathjax', 41 | 'sphinx_aaltoscicomp_branding', 42 | 'sphinxext.opengraph', 43 | 'sphinx_thebe', 44 | ] 45 | myst_enable_extensions = ['colon_fence'] 46 | 47 | thebe_config = { 48 | "selector": "div.highlight" 49 | } 50 | 51 | nb_execution_mode = "off" 52 | 53 | ogp_site_name = "Python for Scientific Computing" 54 | ogp_site_url = 'https://aaltoscicomp.github.io/python-for-scicomp/' 55 | import datetime 56 | if datetime.date.today() < datetime.date(2022,12,15): 57 | ogp_image = 'https://www.aalto.fi/sites/g/files/flghsv161/files/styles/o_914w_ah_n/public/2022-11/PFSC22_v2.png' 58 | ogp_image_alt = 'Python for Scientific Computing course logo with date of 22-25/11/2022, twitch.tv/coderefinery, and partner logos' 59 | 60 | copybutton_exclude = '.linenos, .gp' 61 | 62 | import os 63 | if ( 64 | 'GITHUB_ACTION' in os.environ 65 | and os.environ.get('GITHUB_REPOSITORY', '').lower() == 'aaltoscicomp/python-for-scicomp' 66 | and os.environ.get('GITHUB_REF') == 'refs/heads/master' 67 | ): 68 | html_js_files = [ 69 | ('https://plausible.cs.aalto.fi/js/script.js', {"data-domain": "aaltoscicomp.github.io/python-for-scicomp", "defer": "defer"}), 70 | ] 71 | 72 | 73 | # Add any paths that contain templates here, relative to this directory. 74 | #templates_path = ['_templates'] 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This pattern also affects html_static_path and html_extra_path. 79 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'venv', 'jupyter_execute'] 80 | 81 | 82 | # -- Options for HTML output ------------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 
86 | # 87 | html_theme = 'sphinx_rtd_theme' 88 | 89 | # Add any paths that contain custom static files (such as style sheets) here, 90 | # relative to this directory. They are copied after the builtin static files, 91 | # so a file named "default.css" will overwrite the builtin "default.css". 92 | #html_static_path = ['_static'] 93 | 94 | 95 | # HTML context: 96 | from os.path import dirname, realpath, basename 97 | html_context = {'display_github': True, 98 | 'github_user': github_user, 99 | # Auto-detect directory name. This can break, but 100 | # useful as a default. 101 | 'github_repo': github_repo_name or basename(dirname(realpath(__file__))), 102 | 'github_version': github_version, 103 | } 104 | 105 | 106 | intersphinx_mapping = { 107 | 'python': ('https://docs.python.org/3', None), 108 | 'numpy': ('https://numpy.org/doc/stable', None), 109 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 110 | 'matplotlib': ('https://matplotlib.org/stable', None), 111 | 'requests': ('https://requests.readthedocs.io/en/latest/', None), 112 | } 113 | -------------------------------------------------------------------------------- /content/exercises.md: -------------------------------------------------------------------------------- 1 | # List of exercises 2 | 3 | ## Full list 4 | 5 | This is a list of all exercises and solutions in this lesson, mainly 6 | as a reference for helpers and instructors. This list is 7 | automatically generated from all of the other pages in the lesson. 8 | Any single teaching event will probably cover only a subset of these, 9 | depending on their interests. 
10 | 11 | ```{exerciselist} 12 | ``` 13 | -------------------------------------------------------------------------------- /content/format_comparison_array.csv: -------------------------------------------------------------------------------- 1 | File format,File size [MB],Write time [ms],Read time [ms],Data matches exactly 2 | CSV,23.8,690,294,True 3 | npy,7.63,13.8,2.72,True 4 | HDF5,7.63,27,3.97,True 5 | NetCDF4,7.64,28.8,12.2,True 6 | -------------------------------------------------------------------------------- /content/format_comparison_tidy.csv: -------------------------------------------------------------------------------- 1 | File format,File size [MB],Write time [ms],Read time [ms],Data matches exactly 2 | CSV,4.57,360,81.2,False 3 | Feather,2.2,12.9,6.67,True 4 | Parquet,1.82,35.1,8.96,True 5 | HDF5,4.89,41.7,29.6,True 6 | NetCDF4,6.9,92.9,74.2,True 7 | -------------------------------------------------------------------------------- /content/guide.rst: -------------------------------------------------------------------------------- 1 | Instructor's guide 2 | ================== 3 | 4 | Learner personas 5 | ---------------- 6 | 7 | A is an early-career PhD researcher who has been using Python a bit, 8 | but is not sure what they know or don't know. They want to be able to 9 | do their research more efficiently and make sure that they are using 10 | the right tools. A may know that numpy exists, etc. and could 11 | theoretically read some about it themselves, but aren't sure if they 12 | are going in the right direction. 13 | 14 | A2 can use numpy and pandas, but has learned little bits here and 15 | there and hasn't had a comprehensive introduction. They want to 16 | ensure they are using best practices. (Baseline of high-level 17 | packages) 18 | 19 | B is a mid-to-late undergraduate student who has used Python in some 20 | classes. 
They have possibly learned the syntax and enough to use it 21 | in courses, but in a course-like manner where they are expected to 22 | create everything themselves. 23 | 24 | 25 | Prerequisites: 26 | - Knowing basic Python syntax 27 | - Watch the command line crash course, if you aren't familiar. 28 | 29 | Not prerequisites: 30 | - Any external libraries, e.g. numpy 31 | - Knowing how to make scripts or use Jupyter 32 | 33 | 34 | 35 | About each section 36 | ------------------ 37 | 38 | In general, "Python for Scientific Computing" could be a multi-year 39 | course. We can't even pretend to really teach even a small fraction 40 | of it. We can, however, introduce people to things that can very 41 | easily be missed in the typical academic career path. 42 | 43 | * **Python intro:** We can't really replace a Python tutorial, but 44 | here we try to outline some of the main points. We don't go over 45 | this in the course. 46 | 47 | * **Jupyter:** Jupyter is somewhat useful, but the main reason we go 48 | over it is that it provides a convenient user interface for the 49 | other programming lessons (it's easier to spend a bit of time with 50 | Jupyter than expect people to be able to use some 51 | editor/IDE/shell/etc). So, we do start from the beginning, so that 52 | people can do the other lessons, but also try to teach some advanced 53 | tips and tricks. 54 | 55 | * **Numpy:** The basis of much of the rest of scipy, so we need to 56 | cover it. We try to get the main principles out, but if someone 57 | already knows it this can be a bit boring. We try to make sure 58 | everyone comes out with an appreciation for vectorization and 59 | broadcasting. 60 | 61 | * **Pandas:** A lot of similar goals to the Numpy section, especially 62 | the concepts behind Dataframes that one needs to know in order to 63 | read other documentation. 64 | 65 | * **Visualization:** Matplotlib is getting a bit old, but is still the 66 | backbone of other plotting packages. 
We try to put forth the ideas 67 | of the matplotlib API that can be seen in other packages and the 68 | importance of scripted plots. 69 | 70 | * **Data formats:** Input/output/storage is a common task, and can 71 | easily either be a bottleneck or a huge mess. This lesson tries to 72 | show some best practices with data formats and, as usual, get the 73 | idea to not "do it yourself". Pandas is used as a common framework, 74 | but we should point out there are plenty of other options. 75 | 76 | * **Scripts:** The most important lesson here is to break out of 77 | Jupyter/run buttons of editors. If you can't make actual programs 78 | with an actual interface, you can't scale up. 79 | 80 | * This is the first lesson to introduce the command line. We 81 | recommend being as simple as possible: at least demonstrate the 82 | JupyterLab terminal and discuss the bigger picture behind what it 83 | means and why. 84 | 85 | * This is also the first lesson to use a non-Jupyter code editor. We 86 | recommend again being simple: use the JupyterLab code editor to 87 | start off, and carefully explain what is going on. 88 | 89 | * **Scipy:** We don't cover much here (this is super short), but the 90 | point is scipy exists and the concept of wrapping existing C/fortran 91 | libraries and so on. 92 | 93 | * **Library ecosystem:** This was an overview of the types of packages 94 | available in the "scipy ecosystem", which is a large and ill-defined 95 | thing. But there is another point: choosing what to use. Do you 96 | trust a half-done thing published on someone's personal webpage? If 97 | it's on Github? How do you make your code more reusable? When 98 | coming from academic courses, you get a "build it yourself" idea, 99 | which isn't sustainable in research. 100 | 101 | * **Parallel programming:** 102 | 103 | * **Dependencies:** The main point here is environments, another thing 104 | you often don't learn in courses. 105 | 106 | * There is a lot of material here. 
Consider what you will demo, 107 | what will be done as exercises, and what is advanced/optional. 108 | However, it is the fourth-day lesson that is most interactive, so 109 | it is OK if it take a while to go through everything. 110 | 111 | * If someone else installs Anaconda for a user (e.g. admin-managed 112 | laptop), the conda environment creations (with ``--name``, 113 | possibly with ``--prefix`` too?) may not work. Be prepared for 114 | this and mention it. You don't need to solve the problem but 115 | acknowledge that the lesson becomes a demo. The virtualenv part 116 | should hopefully work for them. 117 | 118 | * **Binder:** Binder exists and can help make code 119 | reproducible/reusable by others. 120 | 121 | * **Packaging:** How to make your code reusable by others. By the 122 | time we get here, people are tired and the topics get involved. We 123 | more explicitly say "you might want to watch and take this as a 124 | demo". 125 | 126 | -------------------------------------------------------------------------------- /content/img/binder/binder.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/binder/binder.jpg -------------------------------------------------------------------------------- /content/img/binder/python_unmasked.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/binder/python_unmasked.jpg -------------------------------------------------------------------------------- /content/img/installation/anaconda-navigator-jupyterlab.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/anaconda-navigator-jupyterlab.png -------------------------------------------------------------------------------- /content/img/installation/anaconda-prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/anaconda-prompt.png -------------------------------------------------------------------------------- /content/img/installation/jupyterlab-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/jupyterlab-notebook.png -------------------------------------------------------------------------------- /content/img/installation/jupyterlab-terminal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/installation/jupyterlab-terminal.png -------------------------------------------------------------------------------- /content/img/jupyter/main-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/jupyter/main-ui.png -------------------------------------------------------------------------------- /content/img/jupyter/notebook-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/jupyter/notebook-ui.png 
-------------------------------------------------------------------------------- /content/img/numpy-advanced/02_views.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 39 | 41 | 54 | 59 | 60 | 61 | 66 | 73 | 80 | 82 | 89 | 96 | 103 | 110 | 117 | 124 | 131 | 138 | 145 | 152 | 159 | 166 | 173 | 180 | 187 | 194 | 201 | 208 | 215 | 222 | 223 | 230 | array "a" 241 | array "b" 252 | .shape.strides 268 | .shape.strides 284 | 288 | 292 | 297 | 302 | data pointer 313 | data pointer 324 | memory buffer 335 | 336 | 337 | -------------------------------------------------------------------------------- /content/img/pandas/01_table_dataframe.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 43 | 45 | 46 | 48 | image/svg+xml 49 | 51 | 52 | 53 | 54 | 55 | 60 | 64 | 69 | 74 | 79 | 84 | 89 | 94 | 99 | 104 | 109 | 114 | 119 | 124 | 129 | 134 | 139 | 144 | 149 | 154 | 159 | 162 | 167 | 172 | 177 | 182 | 187 | 188 | 193 | 198 | 203 | 208 | 213 | column 224 | DataFrame 235 | 242 | 249 | row 260 | 261 | 262 | 263 | -------------------------------------------------------------------------------- /content/img/pandas/tidy_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/pandas/tidy_data.png -------------------------------------------------------------------------------- /content/img/xarray/xarray_1d_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_1d_plot.png -------------------------------------------------------------------------------- /content/img/xarray/xarray_2d_plot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_2d_plot.png -------------------------------------------------------------------------------- /content/img/xarray/xarray_dataset_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_dataset_image.png -------------------------------------------------------------------------------- /content/img/xarray/xarray_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/img/xarray/xarray_hist.png -------------------------------------------------------------------------------- /content/index.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Python for Scientific Computing 3 | =============================== 4 | 5 | .. admonition:: Attending the course 5-7 November, 2024? 6 | 7 | `See the course page here 8 | `__ 9 | and watch at https://twitch.tv/coderefinery. 10 | Whether you are or aren't, the course material is below. Videos 11 | will appear in `this playlist `__ (Last year's videos: `playlist `__). 12 | 13 | 14 | Python is a modern, object-oriented programming language, which has 15 | become popular in several areas of software development. This course 16 | discusses how Python can be utilized in scientific computing. The 17 | course starts by introducing some of the main Python tools for 18 | computing: Jupyter for interactive analysis, NumPy and SciPy for 19 | numerical analysis, Matplotlib for visualization, and so on. 
In 20 | addition, it talks about *how* python is used: 21 | related scientific libraries, reproducibility, and the broader 22 | ecosystem of science in Python, because your work is more than the raw 23 | code you write. 24 | 25 | This course (like any course) can't teach you Python... it can show 26 | you some examples, let you see how experts do things, and prepare you 27 | to learn yourself as you need to. 28 | 29 | .. _prerequisites: 30 | 31 | .. prereq:: 32 | 33 | - Knowing basic Python syntax. We assume that you can do some 34 | Python programming, but not much more than that. We don't cover 35 | standard Python programming. `Here is a short course on basic Python 36 | syntax, with further references `__. 37 | - Watch or read the `command line crash course 38 | `__, if you aren't 39 | familiar. 40 | - You should be able to use a text editor to edit files some. 41 | - The :doc:`software installation ` described below 42 | (basically, anaconda). 43 | 44 | These are not prerequisites: 45 | 46 | - Any external libraries, e.g. numpy 47 | - Knowing how to make scripts or use Jupyter 48 | 49 | 50 | .. admonition:: Videos and archived Q&A 51 | 52 | Videos and material from past instances: 53 | 54 | * 2021: `this YouTube playlist 55 | `__. 56 | * 2022: `here 57 | `__, 58 | Q&A: `days 1-2 59 | `__, `days 3-4 60 | `__ 61 | 62 | * 2023: `Videos 63 | `__ 64 | 65 | * 2024 (Please contact us if you would like to help to process the videos): `Videos `__ 66 | 67 | 68 | .. 
csv-table:: 69 | :widths: auto 70 | :delim: ; 71 | 72 | (prereq) ; :doc:`python` 73 | 30 min ; :doc:`jupyter` 74 | 60 min ; :doc:`numpy` or :doc:`numpy-advanced` 75 | 60 min ; :doc:`pandas` 76 | 30 min ; :doc:`xarray` 77 | 60 min ; :doc:`plotting-matplotlib` 78 | 60 min ; :doc:`plotting-vega-altair` 79 | 30 min ; :doc:`work-with-data` 80 | 60 min ; :doc:`scripts` 81 | 40 min ; :doc:`profiling` 82 | 20 min ; :doc:`productivity` 83 | 30 min ; :doc:`web-apis` 84 | 15 min ; :doc:`scipy` 85 | 30 min ; :doc:`libraries` 86 | 45 min ; :doc:`parallel` 87 | 45 min ; :doc:`dependencies` 88 | 30 min ; :doc:`binder` 89 | 60 min ; :doc:`packaging` 90 | 91 | 92 | .. toctree:: 93 | :maxdepth: 1 94 | :caption: The lesson 95 | :hidden: 96 | 97 | python 98 | jupyter 99 | numpy 100 | numpy-advanced 101 | pandas 102 | xarray 103 | plotting-matplotlib 104 | plotting-vega-altair 105 | work-with-data 106 | scripts 107 | profiling 108 | productivity 109 | scipy 110 | libraries 111 | dependencies 112 | binder 113 | parallel 114 | packaging 115 | web-apis 116 | 117 | .. toctree:: 118 | :maxdepth: 1 119 | :caption: Reference 120 | 121 | installation 122 | quick-reference 123 | exercises 124 | guide 125 | data-formats 126 | 127 | 128 | .. _learner-personas: 129 | 130 | Who is the course for? 131 | ====================== 132 | 133 | The course is targeted towards these learner personas: 134 | 135 | * A is an early-career PhD researcher who has been using Python a bit, 136 | but is not sure what they know or don't know. They want to be able 137 | to do their research more efficiently and make sure that they are 138 | using the right tools. A may know that numpy exists, etc. and could 139 | theoretically read some about it themselves, but aren't sure if they 140 | are going in the right direction. 141 | 142 | * A2 can use numpy and pandas, but has learned little bits here and 143 | there and hasn't had a comprehensive introduction. They want to 144 | ensure they are using best practices. 
(Baseline of high-level 145 | packages) 146 | 147 | * B is a mid-to-late undergraduate student who has used Python in some 148 | classes. They have possibly learned the syntax and enough to use it 149 | in courses, but in a course-like manner where they are expected to 150 | create everything themselves: they want to know how to reuse tools 151 | that already exist. 152 | 153 | 154 | Motivation 155 | ========== 156 | 157 | Why Python 158 | ---------- 159 | 160 | Python has become popular, largely due to good reasons. It's very easy 161 | to get started, there's lots of educational material, a huge amount of 162 | libraries for doing everything imaginable. Particularly in the 163 | scientific computing space, there is the Numpy, Scipy, and matplotlib 164 | libraries which form the basis of almost everything. Numpy and Scipy 165 | are excellent examples of using Python as a glue language, meaning to 166 | glue together battle-tested and well performing code and present them 167 | with an easy to use interface. Also machine learning and deep 168 | learning frameworks have embraced python as the glue language of 169 | choice. And finally, Python is open source, meaning that anybody can 170 | download and install it on their computer, without having to bother 171 | with acquiring a license or such. This makes it easier to distribute 172 | your code e.g. to collaborators in different universities. 173 | 174 | 175 | Why not Python for Scientific Computing 176 | --------------------------------------- 177 | 178 | While Python is extremely popular in scientific computing today, there 179 | are certainly things better left to other tools. 180 | 181 | - Implementing performance-critical kernels. Python is a **very** 182 | slow language, which often doesn't matter if you can offload the 183 | heavy lifting to fast compiled code, e.g. by using Numpy array 184 | operations. But if what you're trying to do isn't *vectorizable* 185 | then you're out of luck. 
An alternative to Python, albeit much less 186 | mature and with a smaller ecosystem, but which provides very fast 187 | generated code, is *Julia*. 188 | 189 | - Creating libraries that can be called from other languages. In this 190 | case you'll often want to create a library with a C interface, which 191 | can then be called from most languages. Suitable languages for this 192 | sort of task, depending on what you are doing, could be Rust, C, 193 | C++, or Fortran. 194 | 195 | - You really like static typing, or functional programming 196 | approaches. *Haskell* might be what you're looking for. 197 | 198 | 199 | Python 2 vs Python 3 200 | -------------------- 201 | 202 | Python 3.0 came out in December 2008 and was just slightly different 203 | enough that most code had to be changed, which meant that many 204 | projects ignored it for many years. It was about 3-5 years until the 205 | differences were reduced enough (and better transition plans came out, 206 | so that it was reasonable to use a single code for both versions) that 207 | it became more and more adopted in the scientific community. Python 2 208 | finally became unsupported in 2020, and by now Python 3 is the de facto 209 | standard. 210 | 211 | At this point, all new projects should use Python 3, and existing 212 | actively developed projects should be upgraded to use it. Still, you 213 | might find some old unmaintained tools that are only compatible with 214 | Python 2. 215 | 216 | 217 | 218 | Credits 219 | ======= 220 | 221 | This course was originally designed by Janne Blomqvist. 222 | 223 | In 2020 it was completely redesigned by a team of the following: 224 | 225 | * Authors: Radovan Bast, Richard Darst, Anne Fouilloux, Thor Wikfeldt, ... 
226 | * Editor: 227 | * Testers and advisors: Enrico Glerean 228 | 229 | We follow The Carpentries Code of Conduct: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html 230 | 231 | 232 | See also 233 | ======== 234 | 235 | * `High Performance Data Analytics in Python 236 | `__ is a logical follow-up to 237 | this lesson that goes more in-depth to tools of high-performance 238 | and large-scale Python. 239 | -------------------------------------------------------------------------------- /content/installation.rst: -------------------------------------------------------------------------------- 1 | Software installation 2 | ===================== 3 | 4 | This course is interactive and demonstrates many different tools. 5 | Thus, even beyond Python, extra software (Python libraries) needs to 6 | be installed. This page contains the instructions. 7 | 8 | **Once the course starts, we don't have time to stop for installing 9 | software.** 10 | 11 | Please make sure before the course that you have all the required 12 | software installed or some other way access to it. For example, the 13 | workshop could be done with a remote Jupyter server, as long as you 14 | can use the terminal from the Jupyter (you need to be able to access 15 | the command line for some lessons). 16 | 17 | .. admonition:: Do you need help? 18 | :class: important 19 | 20 | Participants from a partner institution are invited to install help 21 | sessions. (Hint: ask your institution to become a partner if it 22 | isn't already!) 23 | 24 | Otherwise, if you need installation help, show this page to someone 25 | around you and they can probably help. These are relatively 26 | standard tools. 27 | 28 | Don't be afraid to ask for help. Installing scientific software is 29 | *harder than it should be* and it helps to have someone guide you 30 | through it. 31 | 32 | .. 
highlight:: console 33 | 34 | 35 | 36 | Python 37 | ------ 38 | 39 | We expect you to have a working Python installation with some common 40 | libraries. **We currently recommend Miniforge, which includes the base and 41 | packages through a different, freely usable channel.** You can 42 | explore the options in the tabs below. 43 | 44 | .. admonition:: Python, conda, anaconda, miniforge, etc? 45 | :class: dropdown 46 | 47 | Unfortunately there's a lot of jargon. We'll go over this in the 48 | course but here is a crash course: 49 | 50 | * **Python** is a programming language very commonly used in 51 | science; it's the topic of this course. 52 | * **Conda** is a package manager: it allows distributing and 53 | installing packages, and is designed for complex scientific 54 | code. 55 | * **Mamba** is a re-implementation of Conda to be much faster with 56 | resolving dependencies and installing things. 57 | * An **Environment** is a self-contained collection of packages 58 | which can be installed separately from others. They are used so 59 | each project can install what it needs without affecting others. 60 | * **Anaconda** is a commercial distribution of Python+Conda+many 61 | packages that all work together. It used to be freely usable for 62 | research, but since ~2023-2024 it's more limited. Thus, we don't 63 | recommend it (even though it has a nice graphical user interface). 64 | * **conda-forge** is another channel of distributing packages that 65 | is maintained by the community, and thus can be used by anyone. 66 | (Anaconda's parent company also hosts conda-forge packages) 67 | * **miniforge** is a distribution of conda pre-configured for 68 | conda-forge. It operates via the command line. 69 | * **miniconda** is a distribution of conda pre-configured to use 70 | the Anaconda channels. 71 | 72 | .. tabs:: 73 | 74 | .. 
group-tab:: Miniforge 75 | 76 | This is our recommended method - it can be used for any purpose 77 | and makes a strong base for the future. 78 | 79 | Follow the `instructions on the miniforge web page 80 | `__. This installs 81 | the base, and from here other packages can be installed. 82 | 83 | .. 84 | You can read how to install miniconda from the `CodeRefinery 85 | installation instructions 86 | `__. 87 | 88 | Miniforge uses the command line - this gives you the most power 89 | but can feel unfamiliar. See the `command line crash course 90 | `__ for an intro. 91 | 92 | .. group-tab:: Anaconda 93 | 94 | Anaconda is easier to get started with, but may be more limiting 95 | in the future. The Anaconda Navigator provides a graphical 96 | interface to most of what you would need. 97 | 98 | The `Anaconda Python distribution 99 | `__ conveniently packages 100 | everything, but its license does not allow large organizations to 101 | use it for free (and has actually been enforced against 102 | universities). 103 | 104 | Note the license of Anaconda - there were recently issues with 105 | it being used by large universities for free, and this is not 106 | yet fully resolved. 107 | 108 | .. group-tab:: Other options 109 | 110 | There are many ways to install Python. Other methods can work, 111 | as long as you can install the libraries from the 112 | ``environment.yml`` file mentioned in the Miniforge 113 | instructions. 114 | 115 | We don't currently provide a ``requirements.txt`` for installing 116 | the required packages without Conda/Mamba, though. 117 | 118 | 119 | 120 | Starting Python 121 | --------------- 122 | 123 | You need to start Python in a way that activates conda/mamba. 124 | 125 | .. tabs:: 126 | 127 | .. group-tab:: Miniforge 128 | 129 | .. tabs:: 130 | 131 | .. group-tab:: Linux / MacOS 132 | 133 | Linux/MacOS: Each time you start a new command line terminal, 134 | you can activate Miniforge by running the command below. 
This is needed so that 135 | Miniforge is usable wherever you need, but doesn't affect any 136 | other software on your computer (this is not needed if you 137 | choose "Do you wish to update your shell profile to 138 | automatically initialize conda?", but then it will always be 139 | active):: 140 | 141 | $ source ~/miniforge3/bin/activate 142 | 143 | .. group-tab:: Windows 144 | 145 | Windows: Use the "Miniforge Prompt" to start Miniforge. This 146 | will set up everything so that ``conda`` and ``mamba`` are 147 | available. 148 | 149 | .. group-tab:: Anaconda 150 | 151 | The `Anaconda Navigator 152 | `__ provides a convenient 153 | way to access the software. It can be installed from that page. 154 | 155 | 156 | .. group-tab:: Other options 157 | 158 | You are on your own here. 159 | 160 | 161 | Python for SciComp software environment 162 | --------------------------------------- 163 | 164 | Once Python and conda/mamba are installed, you can use it to install 165 | an environment. An **environment** is a self-contained set of extra 166 | libraries - different projects can use different environments to not 167 | interfere with each other. This environment will have all of the 168 | software needed for this particular course. 169 | 170 | .. tabs:: 171 | 172 | .. group-tab:: Miniforge 173 | 174 | This `environment file 175 | `__ 176 | contains all packages needed for the course, and can be 177 | installed with. The following command will install an 178 | environment named ``python-for-scicomp`` (there may be lots of 179 | warning messages: this is OK if it still goes through): 180 | 181 | .. tabs:: 182 | 183 | .. group-tab:: Linux / MacOS 184 | 185 | :: 186 | 187 | $ mamba env create -n python-for-scicomp -f https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/master/software/environment.yml 188 | 189 | .. 
group-tab:: Windows 190 | 191 | :: 192 | 193 | $ mamba env create -n python-for-scicomp -f https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/master/software/environment.yml 194 | 195 | Each time you start a new command line, you need to activate 196 | miniforge and this environment: 197 | 198 | .. tabs:: 199 | 200 | .. group-tab:: Linux / MacOS 201 | 202 | :: 203 | 204 | $ source ~/miniforge3/bin/activate 205 | $ conda activate python-for-scicomp 206 | 207 | .. group-tab:: Windows 208 | 209 | :: 210 | 211 | $ # Start the Miniforge Prompt. 212 | $ conda activate python-for-scicomp 213 | 214 | .. group-tab:: Anaconda 215 | 216 | Anaconda includes most of the things needed for the course 217 | automatically, but as of 2024 not everything. You can use the 218 | navigator to create new environments from `this environment 219 | file 220 | `__. 221 | You'll have to download it and then `import it 222 | `__. 223 | 224 | When running this course's exercises, make sure the 225 | ``python-for-scicomp`` environment is activated before starting 226 | JupyterLab or any code. You need to start terminals or 227 | JupyterLab from the Anaconda Navigator for the 228 | ``python-for-scicomp`` environment to be used. 229 | 230 | .. group-tab:: Other options 231 | 232 | **Miniconda, Anaconda command line, other conda/mamba command 233 | line tools**: see "Miniforge" instructions. 234 | 235 | Virtual environments: we don't currently provide a 236 | ``requirements.txt`` but many package names can probably be 237 | copied from the ``environment.yml`` file. We really recommend 238 | conda/mamba based systems: it's designed for complex scientific 239 | software. 240 | 241 | Any other Python distribution which you can install libraries into 242 | would work, but because there are so many different ways to do this, 243 | we don't support them. You would need the extra libraries mentioned 244 | in the Miniforge instructions. 
245 | 246 | Remember you need to activate the environment each time you use it. 247 | 248 | 249 | 250 | JupyterLab 251 | ---------- 252 | 253 | We do most of the lessons from JupyterLab (and JupyterLab provides 254 | most of the other tools we need). 255 | 256 | .. tabs:: 257 | 258 | .. group-tab:: Miniforge 259 | 260 | JupyterLab was installed in the previous step. To run it, first, 261 | start the Miniforge command line interface. Remember, you may 262 | need to activate Miniforge and the environment first. 263 | 264 | .. tabs:: 265 | 266 | .. group-tab:: Linux / MacOS 267 | 268 | :: 269 | 270 | $ source ~/miniforge3/bin/activate 271 | $ conda activate python-for-scicomp 272 | $ jupyter-lab 273 | 274 | .. group-tab:: Windows 275 | 276 | :: 277 | 278 | $ # Start the Miniforge Prompt. 279 | $ conda activate python-for-scicomp 280 | $ jupyter-lab 281 | 282 | .. group-tab:: Anaconda 283 | 284 | If you install the full Anaconda distribution, this will be 285 | available and can be started either through Anaconda Navigator 286 | or command line. 287 | 288 | Make sure the ``python-for-scicomp`` environment is selected and 289 | you can start JupyterLab. 290 | 291 | 292 | 293 | Verification of Python and JupyterLab 294 | ------------------------------------- 295 | 296 | .. admonition:: Watch the video 297 | 298 | See this `verification in video form 299 | `__ - if you can do this, you are 300 | ready to go for day one. Your exact steps may be a bit different. 301 | 302 | Remember that you need to activate the environment first - see the 303 | step above. 304 | 305 | .. tabs:: 306 | 307 | .. group-tab:: Miniforge 308 | 309 | You can start JupyterLab from the command line:: 310 | 311 | $ jupyter-lab 312 | (... Jupyter starts in a web browser) 313 | 314 | 315 | .. group-tab:: Anaconda 316 | 317 | **You should be able to start JupyterLab.** You can do this from the 318 | `Anaconda Navigator `__ (recommended if you have it): 319 | 320 | .. 
figure:: img/installation/anaconda-navigator-jupyterlab.png 321 | :class: with-border 322 | 323 | Starting JupyterLab from the Anaconda Navigator. 324 | 325 | ... or you can start JupyterLab from the command line:: 326 | 327 | $ jupyter-lab 328 | (... Jupyter starts in a web browser) 329 | 330 | 331 | 332 | **Verify that you can start a Jupyter notebook.** We will learn how to 333 | do this in day 1, but you can try running ``print("Hello, world!")`` 334 | if you want. 335 | 336 | .. figure:: img/installation/jupyterlab-notebook.png 337 | :class: with-border 338 | 339 | Starting a Jupyter Notebook from JupyterLab. 340 | 341 | 342 | 343 | Text editor 344 | ----------- 345 | 346 | For one portion of the course, you will need a text editor. **If you 347 | don't know what to use, you can use the text editor that comes from 348 | JupyterLab and it will do everything you need - no extra installation 349 | needed.** 350 | 351 | .. admonition:: Other editors 352 | :class: toggle 353 | 354 | Because we need to be simple in our teaching, we only teach the 355 | most basic editors. We encourage you to try out more advanced ones 356 | yourself. 357 | 358 | For other editors, see the `CodeRefinery instructions 359 | `__. You don't 360 | exactly need a terminal editor - the graphical ones, such as VSCode or 361 | whatever you use now, will work as well. 362 | 363 | 364 | 365 | Command line 366 | ------------ 367 | 368 | **You need access to the command line for some lessons. JupyterLab 369 | includes it, so no extra installation is needed.** If you want to 370 | test in advance: 371 | 372 | * You can start it from JupyterLab (recommended): 373 | 374 | .. figure:: img/installation/jupyterlab-terminal.png 375 | :class: with-border 376 | :scale: 75% 377 | 378 | From the JupyterLab launcher, select "Terminal". 379 | 380 | .. admonition:: Other ways to access the command line 381 | :class: toggle 382 | 383 | * From the Anaconda Navigator: 384 | 385 | .. 
figure:: img/installation/anaconda-prompt.png 386 | :class: with-border 387 | 388 | From the Anaconda Navigator, you can select "environments" on the 389 | left, then click on one, then the arrow, then "Open terminal". 390 | 391 | * From your operating system's terminal applications, if you activate 392 | Anaconda. 393 | 394 | 395 | 396 | Verification of the command line 397 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 398 | 399 | To verify command line usage, type the following commands (without the 400 | ``$``), and you should see the corresponding output that lists the 401 | Python version: 402 | 403 | .. code-block:: console 404 | 405 | $ python3 -V 406 | Python 3.8.3 407 | 408 | ## Or python... if it's installed as that 409 | $ python -V 410 | Python 3.8.3 411 | 412 | Any recent version of Python 3 should work for the course (for example 413 | 3.8 or higher). 414 | 415 | 416 | 417 | Zoom 418 | ---- 419 | 420 | If this is an online workshop, it might use Zoom. You can see 421 | `CodeRefinery instructions for it 422 | `__. 423 | 424 | 425 | 426 | Need help? 427 | ---------- 428 | 429 | If you have access, come to one of the installation help sessions. 430 | Or, ask your colleagues: these are standard tools and you can 431 | definitely find someone who can help you get set up! 432 | 433 | 434 | 435 | See also 436 | -------- 437 | 438 | * `Research Software Hour on conda 439 | `__ 440 | * `Conda manual `__ (technical) 441 | * `Anaconda individual edition home 442 | `__ 443 | * `Anaconda getting started 444 | `__ 445 | -------------------------------------------------------------------------------- /content/libraries.rst: -------------------------------------------------------------------------------- 1 | Library ecosystem 2 | ================= 3 | 4 | .. questions:: 5 | 6 | - What happens when you need some method beyond what we discuss in this course, what is available? 7 | - How do you decide what to build on for your work? 8 | 9 | .. 
objectives:: 10 | 11 | - Know of some other available packages, but don't necessarily know 12 | how to use them. 13 | - Be able to evaluate what you should reuse and what you should 14 | develop yourself. 15 | 16 | You can't do everything yourself. In fact, once we heard a quote such 17 | as this: 18 | 19 | When you are a student, you are expected to do everything 20 | yourself, and that is how you are evaluated. When you become a 21 | researcher, you *have* to be able to reuse what others have done. 22 | We don't have much practice in doing this. 23 | -- A student 24 | 25 | In this lesson, we'll talk about the broader ecosystem in Python: all 26 | the resources you have available to you. Perhaps we can even classify 27 | this into two types: 28 | 29 | - Well-maintained libraries that are used by many others. 30 | - A wide variety of public code that might work but isn't necessarily 31 | well-maintained (for example, code from articles). 32 | 33 | We'll start with the first then go to the second. 34 | 35 | 36 | 37 | Glossary 38 | -------- 39 | 40 | Library 41 | A collection of code used by a program. 42 | 43 | Package 44 | A library that has been made easily installable and reusable. 45 | Often published on public repositories such as the `Python Package 46 | Index `__ 47 | 48 | Dependency 49 | A requirement of another program, not included in that program. 50 | 51 | 52 | 53 | The Python/SciPy ecosystem 54 | -------------------------- 55 | 56 | This section is nothing more than a tour of what exists in Python. 57 | You aren't expected to particularly remember any of these right now, 58 | but searching for these repositories is a starting point of a lot of 59 | future work. 60 | 61 | The "core" packages `could be considered 62 | `__. 
Many other packages build on 63 | these, and others that try to do similar things often try to conform 64 | to their interfaces (especially numpy): 65 | 66 | * Python 67 | * Numpy - arrays, everything builds on this 68 | * Scipy - scientific functions (not necessarily a lot builds on this) 69 | * matplotlib - plotting, many other plotting tools build on this 70 | * pandas - data structures 71 | * IPython / Jupyter: interactive work 72 | 73 | 74 | Core numerics libraries 75 | ~~~~~~~~~~~~~~~~~~~~~~~ 76 | 77 | * `numpy `__ - Arrays and array math. 78 | * `scipy `__ - Software 79 | for math, science, and engineering. 80 | 81 | 82 | Plotting 83 | ~~~~~~~~ 84 | 85 | * `matplotlib `__ - Base plotting package, 86 | somewhat low level but almost everything builds on it. 87 | * `seaborn `__ - Higher level plotting 88 | interface; statistical graphics. 89 | * `Vega-Altair `__ - Declarative Python 90 | plotting. 91 | * `mayavi `__ - 3D plotting 92 | * `Plotly `__ - Big graphing library. 93 | 94 | 95 | Data analysis and other important core packages 96 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 97 | 98 | * `pandas `__ - Columnar 99 | data analysis. 100 | * `polars `__ - Alternative to pandas that uses similar 101 | API, but is re-imagined for more speed. 102 | * `Vaex `__ - Alternative for pandas 103 | that uses similar API for lazy-loading and processing huge DataFrames. 104 | * `Dask `__ - Alternative to Pandas that uses 105 | similar API and can do analysis in parallel. 106 | * `xarray `__ - Framework for 107 | working with multi-dimensional arrays. 108 | * `statsmodels `__ - Statistical 109 | models and tests. 110 | * `SymPy `__ - Symbolic math. 111 | * `networkx `__ - Graph and network analysis. 112 | * `graph-tool `__ - Graph and network analysis 113 | toolkit implemented in C++.
114 | 115 | 116 | Interactive computing and human interface 117 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 118 | * Interactive computing 119 | 120 | * `IPython `__ - Nicer interactive interpreter 121 | * `Jupyter `__ - Web-based interface to IPython 122 | and other languages (includes projects such as jupyter notebook, 123 | lab, hub, ...) 124 | 125 | * Testing 126 | 127 | * `pytest `__ - Automated testing interface 128 | 129 | * Documentation 130 | 131 | * `Sphinx `__ - Documentation generator 132 | (also used for this lesson...) 133 | 134 | * Development environments 135 | 136 | * `Spyder `__ - Interactive Python 137 | development environment. 138 | * `Visual Studio Code `__ - Microsoft's 139 | flagship code editor. 140 | * `PyCharm `__ - JetBrains's 141 | Python IDE. 142 | 143 | * `Binder `__ - load any git repository in 144 | Jupyter automatically, good for reproducible research 145 | 146 | 147 | Data format support and data ingestion 148 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 149 | 150 | * `pillow `__ - Image manipulation. The 151 | original PIL is no longer maintained, the new "Pillow" is a drop-in 152 | replacement. 153 | * `h5py `__ and `PyTables `__ - 154 | Interfaces to the `HDF5 `__ 155 | file format. 156 | 157 | 158 | Speeding up code and parallelism 159 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 160 | 161 | * `MPI for Python (mpi4py) `__ - Message 162 | Passing Interface (MPI) in Python for parallelizing jobs. 163 | * `cython `__ - easily make C extensions for 164 | Python, also interface to C libraries 165 | * `numba `__ - just in time compiling of 166 | functions for speed-up 167 | * `PyPy `__ - Python written in Python so that 168 | it can internally optimize more. 169 | * `Dask `__ - Distributed array data structure for 170 | distributed computation 171 | * `Joblib `__ - Easy embarrassingly 172 | parallel computing 173 | * `IPyParallel `__ - Easy 174 | parallel task engine. 
175 | * `numexpr `__ - Fast evaluation of 176 | array expressions by automatically compiling the arithmetic. 177 | 178 | 179 | Machine learning 180 | ~~~~~~~~~~~~~~~~ 181 | 182 | * `nltk `__ - Natural language processing 183 | toolkit. 184 | * `scikit-learn `__ - Traditional 185 | machine learning toolkit. 186 | * `xgboost `__ - Toolkit for 187 | gradient boosting algorithms. 188 | 189 | 190 | Deep learning 191 | ~~~~~~~~~~~~~ 192 | 193 | * `tensorflow `__ - Deep learning 194 | library by Google. 195 | * `pytorch `__ - Currently the most popular 196 | deep learning library. 197 | * `keras `__ - Simple library for doing deep learning. 198 | * `huggingface `__ - Ecosystem for sharing 199 | and running deep learning models and datasets. Includes packages 200 | like ``transformers``, ``datasets``, ``accelerate``, etc. 201 | * `jax `__ - Google's 202 | Python library for running NumPy and automatic differentiation 203 | on GPUs. 204 | * `flax `__ - Neural network 205 | framework built on Jax. 206 | * `equinox `__ - Another neural 207 | network framework built on Jax. 208 | * `DeepSpeed `__ - Algorithms for running 209 | massive scale trainings. Included in many of the frameworks. 210 | * `PyTorch Lightning `__ - 211 | Framework for creating and training PyTorch models. 212 | * `Tensorboard `__ - Tool 213 | for visualizing model training on a web page. 214 | 215 | 216 | Other packages for special cases 217 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 218 | 219 | * `dateutil `__ and `pytz 220 | `__ - Date arithmetic and handling, 221 | timezone database and conversion. 222 | 223 | 224 | 225 | 226 | Connecting Python to other languages 227 | ------------------------------------ 228 | 229 | As we discussed with Scipy, very many of the above packages aren't 230 | written in Python: they are written in some other language and have a 231 | Python interface. Python is written in C, and thus has great C 232 | interfaces.
This contributes to two things: 233 | 234 | * **Extending Python** by writing your own modules in C. 235 | 236 | * It's actually common to first have (or write) an analysis package 237 | in C or C++, then make the Python interface. Then it can be 238 | supported by other languages, too. 239 | 240 | * Or one starts an analysis package in Python, and slowly moves bits 241 | of it to C over time as there is need. 242 | 243 | * **Embedding Python**, where you have another primary application 244 | that uses Python under the hood as an internal scripting language. 245 | 246 | These features aren't exactly unique to Python, but Python does 247 | support them very well. Read more: `Extending and embedding Python 248 | `__. 249 | 250 | 251 | Tools for interfacing with other languages 252 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 253 | 254 | These days, one rarely directly extends the Python interpreter, but uses 255 | 256 | * `cffi `__ and `ctypes 257 | `__ - interface to C 258 | and compatible libraries 259 | * `cython `__ - easily make C extensions for 260 | Python, also interface to C libraries 261 | * `f2py `__ - interface to Fortran 262 | code 263 | * `swig `__ - connect to a variety of programming languages. 264 | * ``Boost.python`` - Another Python/C++ interface 265 | * TODO: Julia modules for Python? 266 | 267 | 268 | 269 | Evaluating Python packages for reuse 270 | ------------------------------------ 271 | 272 | Above, we talked about well-maintained mainstream packages. **Do you 273 | trust random code you find online (for example included in a paper)?** 274 | 275 | Especially consider scientific results, which *have* to be correct. 276 | Still, you also *can't* build everything yourself, so you have to 277 | carefully evaluate the situation. 278 | 279 | Below are some things to consider: 280 | 281 | * Are there releases? Have they been going on for a while? 282 | 283 | * Are releases installable without copy-paste? 
284 | 285 | * Are dependencies handled well? 286 | 287 | * Does the code randomly change, so that it no longer works with your 288 | code? Is this relevant? 289 | 290 | * Is there good documentation, that tells not just how to use it but 291 | how it works? 292 | 293 | * Is there automated testing? What's your evaluation of the risk of 294 | undetectable scientific errors? 295 | 296 | * Is there a community, or is it one person? Is it backed by some 297 | organization? Does it have a permanent home? 298 | 299 | * Is it on a public hosting site (GitLab, GitHub, Bitbucket, etc) 300 | where a community *could* form? 301 | 302 | * Do others post issues and make contributions? Are these issues 303 | dealt with in a timely manner? Can you search past bug reports? 304 | 305 | * Is the software citeable? 306 | 307 | 308 | 309 | Is your work reuseable? 310 | ----------------------- 311 | 312 | Every small project you do contributes a little bit to the Python and 313 | SciPy ecosystem. This course has sort of started you on that path, 314 | and a `CodeRefinery workshop `__ will make 315 | sure you have the tools to produce high-quality, reusable code. 316 | 317 | 318 | 319 | What's next? 320 | ------------ 321 | 322 | * The `CodeRefinery workshop `__ mentioned 323 | above will prepare you for others to reuse your code and for you to 324 | contribute to other code. 325 | * The upcoming :doc:`dependencies` lesson will teach you how to 326 | record and manage dependencies so that anyone can seamlessly reuse 327 | your code. 328 | 329 | 330 | 331 | Exercises 332 | --------- 333 | 334 | .. exercise:: Libraries 1.1: Libraries in your work 335 | 336 | What libraries do you use in your work? What have you made, which 337 | you could have reused from some other source? What have you used 338 | from some other source that you wished you had re-created? 339 | 340 | Discuss in your groups or HackMD. 341 | 342 | .. solution:: Libraries 1.1 343 | 344 | ... is there anything to say here?
345 | 346 | 347 | .. exercise:: Libraries 1.2: Evaluating packages 348 | 349 | Below are some links to some packages, both public and made by the 350 | authors of this lesson. Evaluate them, considering "would I use 351 | this in my project?" 352 | 353 | a) https://github.com/networkx/networkx/ 354 | b) some code on webpage in a paper's footnote 355 | c) https://github.com/rkdarst/pcd 356 | d) https://github.com/dftlibs/numgrid 357 | e) https://github.com/rkdarst/dynbench 358 | f) https://vpython.org/ 359 | 360 | .. solution:: Libraries 1.2 361 | 362 | a) networkx: This seems to be a relatively large, active project 363 | using best practices. Probably usable. 364 | b) I would probably use it if I had to, but would prefer not to. 365 | c) This (written by one of the authors of this lesson) has no 366 | documenting, no community, no best practices, and is very old. 367 | Probably not a good idea to try to use it 368 | d) This project uses best practices, but doesn't seem to have a big 369 | community. It's probably fine to use, but who knows if it will 370 | be maintained 10 years from now. It does have automated tests 371 | via Github Actions (``.github/workflows`` and the green checks), 372 | so the authors have put some work into making it correct. 373 | e) This (also written by one of the authors) looks like it was made 374 | for a paper of some sort. It has some minimal documentation, 375 | but still is missing many best practices and is clearly not 376 | maintained anymore (look at the ancient pull request). Probably 377 | not a good idea to use unless you have to. 378 | f) This project has a pretty website, and some information. But 379 | seems to not be using best practices of an open repository, and 380 | custom locations which could disappear at any time. 381 | 382 | You notice that several of the older projects here were written by 383 | one of the authors of this lesson. 
It goes to show that everyone 384 | starts somewhere and improves over time - don't feel bad if your 385 | work isn't perfect, as long as you keep trying to get better! 386 | 387 | 388 | 389 | See also 390 | -------- 391 | 392 | * `Topical Software in the SciPy ecosystem 393 | `__ - relatively 394 | detailed (but not comprehensive) list of projects 395 | 396 | 397 | .. keypoints:: 398 | 399 | - Almost everything you need can already be found, except your 400 | incremental work. 401 | - When do you build on that other work, and when do you create 402 | things yourself? 403 | -------------------------------------------------------------------------------- /content/ndarray.dot: -------------------------------------------------------------------------------- 1 | strict digraph ndarray { 2 | graph [compound=true]; 3 | 4 | node [style = filled, color=cyan]; 5 | 6 | n [label="Variable n (lvalue)", color=gold]; 7 | nobj [label="PyObject n"]; 8 | ndesc [label="ndarray metadata"]; 9 | 10 | n -> nobj; 11 | nobj -> ndesc; 12 | 13 | subgraph cluster_n { 14 | label = "Data array for n"; 15 | color = aquamarine; 16 | style = filled; 17 | node [shape=box]; 18 | 19 | ndata_0 [label="3"]; 20 | ndata_1 [label="2"]; 21 | ndata_2 [label="1"]; 22 | } 23 | 24 | 25 | ndesc -> ndata_1 [lhead=cluster_n]; 26 | 27 | } 28 | -------------------------------------------------------------------------------- /content/ndarray.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | ndarray 11 | 12 | cluster_n 13 | 14 | Data array for n 15 | 16 | 17 | n 18 | 19 | Variable n (lvalue) 20 | 21 | 22 | nobj 23 | 24 | PyObject n 25 | 26 | 27 | n->nobj 28 | 29 | 30 | 31 | 32 | ndesc 33 | 34 | ndarray metadata 35 | 36 | 37 | nobj->ndesc 38 | 39 | 40 | 41 | 42 | ndata_1 43 | 44 | 2 45 | 46 | 47 | ndesc->ndata_1 48 | 49 | 50 | 51 | 52 | ndata_0 53 | 54 | 3 55 | 56 | 57 | ndata_2 58 | 59 | 1 60 | 61 | 62 | 63 | 
-------------------------------------------------------------------------------- /content/packaging-example-project/calculator/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example calculator package. 3 | """ 4 | 5 | from .adding import add 6 | from .subtracting import subtract 7 | from .integrating import integral 8 | 9 | __version__ = "0.1.0" 10 | -------------------------------------------------------------------------------- /content/packaging-example-project/calculator/adding.py: -------------------------------------------------------------------------------- 1 | def add(x, y): 2 | return x + y 3 | -------------------------------------------------------------------------------- /content/packaging-example-project/calculator/integrating.py: -------------------------------------------------------------------------------- 1 | from scipy import integrate 2 | 3 | 4 | def integral(function, lower_limit, upper_limit): 5 | return integrate.quad(function, lower_limit, upper_limit) 6 | -------------------------------------------------------------------------------- /content/packaging-example-project/calculator/subtracting.py: -------------------------------------------------------------------------------- 1 | def subtract(x, y): 2 | return x - y 3 | -------------------------------------------------------------------------------- /content/packaging-example-project/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "calculator-myname" 7 | description = "A small example package" 8 | version = "0.1.0" 9 | readme = "README.md" 10 | authors = [ 11 | { name = "Firstname Lastname", email = "firstname.lastname@example.org" } 12 | ] 13 | dependencies = [ 14 | "scipy" 15 | ] 16 | 
-------------------------------------------------------------------------------- /content/packaging-example-project/test.py: -------------------------------------------------------------------------------- 1 | from calculator import add, subtract, integral 2 | 3 | print("2 + 3 =", add(2, 3)) 4 | print("2 - 3 =", subtract(2, 3)) 5 | integral_x_squared, error = integral(lambda x: x * x, 0.0, 1.0) 6 | print(f"{integral_x_squared = }") 7 | -------------------------------------------------------------------------------- /content/packaging-example-project/test_editable.py: -------------------------------------------------------------------------------- 1 | from calculator import subtract 2 | 3 | print("2 - 3 =", subtract(2, 3)) 4 | -------------------------------------------------------------------------------- /content/packaging.rst: -------------------------------------------------------------------------------- 1 | Packaging 2 | ========= 3 | 4 | .. questions:: 5 | 6 | - How to organize Python projects larger than one script? 7 | - What is a good file and folder structure for Python projects? 8 | - How can you make your Python functions most usable by your collaborators? 9 | - How to prepare your code to make a Python package? 10 | - How to publish your Python package? 11 | 12 | .. objectives:: 13 | 14 | - Learn to identify the components of a Python package 15 | - Learn to create a Python package 16 | - Learn to publish a Python package 17 | 18 | 19 | Organizing Python projects 20 | -------------------------- 21 | 22 | Python projects often start as a single script or Jupyter notebook but 23 | they can grow out of a single file. 24 | 25 | In the :ref:`scripts` episode we have also learned how to import functions 26 | and objects from other Python files (modules). Now we will take it a step further. 27 | 28 | **Recommendations**: 29 | 30 | - Collect related functions into modules (files). 31 | - Collect related modules into packages (we will show how). 
32 | - Add a ``LICENSE`` file to your code from `choosealicense.com `__ 33 | (see `Software Licensing and Open source explained with cakes `__). 34 | - Write a ``README.md`` file describing what the code does and how to use it. 35 | - It is also recommended to `document your package `__. 36 | - When the project grows, you might need `automated testing `__. 37 | 38 | To have a concrete but still simple example, we will create a project 39 | consisting of 3 functions, each in its own file. We can then imagine that each 40 | file would contain many more functions. To make it more interesting, 41 | one of these functions will depend on an external library: ``scipy``. 42 | 43 | These are the 3 files: 44 | 45 | .. literalinclude:: packaging-example-project/calculator/adding.py 46 | :caption: adding.py 47 | 48 | .. literalinclude:: packaging-example-project/calculator/subtracting.py 49 | :caption: subtracting.py 50 | 51 | .. literalinclude:: packaging-example-project/calculator/integrating.py 52 | :caption: integrating.py 53 | 54 | We will add a fourth file: 55 | 56 | .. literalinclude:: packaging-example-project/calculator/__init__.py 57 | :caption: __init__.py 58 | 59 | This ``__init__.py`` file will be the interface of our package/library. 60 | It also holds the package docstring and the version string. 61 | Note how it imports functions from the various modules using *relative imports* 62 | (with the dot). 63 | 64 | This is how we will arrange the files in the project folder/repository: 65 | 66 | .. code-block:: none 67 | :emphasize-lines: 3-6 68 | 69 | project-folder 70 | ├── calculator 71 | │ ├── adding.py 72 | │ ├── __init__.py 73 | │ ├── integrating.py 74 | │ └── subtracting.py 75 | ├── LICENSE 76 | └── README.md 77 | 78 | Now we are ready to test the package. For this we need to be in the "root" 79 | folder, what we have called the *project-folder*. We also need to have 80 | ``scipy`` available in our environment: 81 | 82 | .. 
literalinclude:: packaging-example-project/test.py 83 | 84 | The package is not yet pip-installable, though. We will make this possible in 85 | the next section. 86 | 87 | 88 | Testing a local pip install 89 | --------------------------- 90 | 91 | To make our example package pip-installable we need to add one more file: 92 | 93 | .. code-block:: none 94 | :emphasize-lines: 9 95 | 96 | project-folder 97 | ├── calculator 98 | │ ├── adding.py 99 | │ ├── __init__.py 100 | │ ├── integrating.py 101 | │ └── subtracting.py 102 | ├── LICENSE 103 | ├── README.md 104 | └── pyproject.toml 105 | 106 | This is how ``pyproject.toml`` looks: 107 | 108 | .. literalinclude:: packaging-example-project/pyproject.toml 109 | :caption: pyproject.toml 110 | :emphasize-lines: 13-15 111 | 112 | Note how our package requires ``scipy`` and we decided to not pin the version 113 | here (see :ref:`version_pinning`). 114 | 115 | Now we have all the building blocks to test a local pip install. This is a good 116 | test before trying to upload a package to PyPI or test-PyPI 117 | (see :ref:`pypi`) 118 | 119 | .. note:: 120 | 121 | Sometime you need to rely on unreleased, development versions as 122 | dependencies and this is also possible. For example, to use the 123 | latest ``xarray`` you could add:: 124 | 125 | dependencies = [ 126 | "scipy", 127 | "xarray @ https://github.com/pydata/xarray/archive/main.zip" 128 | ] 129 | 130 | .. seealso:: 131 | - `pip requirement specifiers `__ 132 | - pyOpenSci tutorial on 133 | `pyproject.toml metadata `__ 134 | 135 | 136 | 137 | Exercise 1 138 | ---------- 139 | 140 | .. 
challenge:: Packaging-1 141 | 142 | To test a local pip install: 143 | 144 | - Create a new folder outside of our example project 145 | - Create a new virtual environment (:ref:`dependency_management`) 146 | - Install the example package from the project folder 147 | into the new environment:: 148 | 149 | pip install --editable /path/to/project-folder/ 150 | 151 | - Test the local installation: 152 | 153 | .. literalinclude:: packaging-example-project/test.py 154 | 155 | - Make a change in the ``subtract`` function above such that it always 156 | returns a float ``return float(x - y)``. 157 | 158 | - Open a new Python console and test the following lines. Compare it with 159 | the previous output. 160 | 161 | .. literalinclude:: packaging-example-project/test_editable.py 162 | 163 | Sharing packages via PyPI 164 | ------------------------- 165 | 166 | .. demo:: 167 | 168 | Most people will watch and observe this, due to the speed with which we will 169 | move. 170 | 171 | Once we are able to pip-install the example package locally, we are ready for 172 | upload. 173 | 174 | We exercise by uploading to test-PyPI_, not the 175 | real `PyPI `__, so that if we mess things up, nothing bad 176 | happens. 177 | 178 | We need two more things: 179 | 180 | - We will do this using `Twine `__ so you need 181 | to pip install that, too. 182 | - You need an account on test-PyPI_ 183 | 184 | .. _test-PyPI: https://test.pypi.org/ 185 | 186 | .. highlight:: console 187 | 188 | Let's try it out. First we create the distribution package:: 189 | 190 | $ python3 -m build 191 | 192 | We need twine:: 193 | 194 | $ pip install twine 195 | 196 | And use twine to upload the distribution files to test-PyPI:: 197 | 198 | $ twine upload -r testpypi dist/* 199 | 200 | Uploading distributions to https://test.pypi.org/legacy/ 201 | Enter your API token: 202 | 203 | 204 | .. _Create API token: https://test.pypi.org/manage/account/token/ 205 | 206 | .. 
note:: 207 | 208 | To generate an API token, proceed to the `Create API token`_ page in test-PyPI. 209 | You will be prompted for your password. 210 | 211 | .. solution:: The long-version for finding the *Create API token* page 212 | 213 | 1. Log on to test-PyPI_ at https://test.pypi.org 214 | 2. In the top-right corner, click on the drop-down menu and click **Account settings** or 215 | follow this `link `__. 216 | 3. Scroll down to the section **API tokens** and click the button **Add API token**, 217 | which opens up the 218 | `Create API token`_ page. 219 | 220 | 221 | #. Under **Token name** write something memorable. 222 | It should remind you of the *purpose* 223 | or the *name of the computer*, such that when you are done 224 | using it, you can safely delete it. 225 | #. Under **Scope** select ``Entire account (all projects)``. 226 | #. Click on **Create token**. 227 | #. Click on **Copy token** once a long string which starts 228 | with ``pypi-`` is generated. 229 | 230 | Paste that token back into the terminal where ``twine upload ...`` is running and press ENTER. 231 | 232 | Once this is done, create yet another virtual environment and try to install from test-PyPI (adapt ``myname``). 233 | 234 | .. tabs:: 235 | 236 | .. tab:: Linux / macOS 237 | 238 | .. code-block:: console 239 | :emphasize-lines: 4-7 240 | 241 | $ python3 -m venv venv-calculator 242 | $ source venv-calculator/bin/activate 243 | $ which python 244 | $ python3 -m pip install \ 245 | -i https://test.pypi.org/simple/ \ 246 | --extra-index-url https://pypi.org/simple/ \ 247 | calculator-myname 248 | $ deactivate 249 | 250 | .. tab:: Windows 251 | 252 | ..
code-block:: console 253 | :emphasize-lines: 4 254 | 255 | $ python3 -m venv venv-calculator 256 | $ venv-calculator\Scripts\activate 257 | $ where python 258 | $ python3 -m pip install -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ calculator-myname 259 | $ deactivate 260 | 261 | Tools that simplify sharing via PyPI 262 | ------------------------------------ 263 | 264 | The solution that we have used to create the example package (using 265 | ``setuptools`` and ``twine``) is not the only approach. There are many ways to 266 | achieve this and we avoided going into too many details and comparisons to not 267 | confuse too much. If you web-search this, you will also see that recently the 268 | trend goes towards using ``pyproject.toml`` as a more general 269 | alternative to the previous ``setup.py``. 270 | 271 | There are at least two tools which try to make the packaging and PyPI interaction easier: 272 | 273 | - `Poetry `__ 274 | - `Flit `__ 275 | 276 | If you upload packages to PyPI or test PyPI often you can create an API token and 277 | `save it in the .pypirc file `__. 278 | 279 | Building a conda package and share it 280 | ------------------------------------- 281 | 282 | 283 | .. callout:: Prerequisites 284 | 285 | To generate a conda build recipe, the package ``grayskull`` and 286 | to build it, the package ``conda-build`` are required. 287 | You may install these with **Anaconda Navigator** or from the command line:: 288 | 289 | $ conda install -n base grayskull conda-build 290 | 291 | 292 | The simplest way for creating a conda package for your python script is to 293 | first publish it in `PyPI `__ following the steps explained 294 | above. 295 | 296 | 297 | Building a python package with grayskull and conda-build 298 | ******************************************************** 299 | 300 | Once built, the conda package can be installed locally. For this example, we 301 | will use `runtest `__.
`runtest 302 | `__ is a numerically tolerant end-to-end test 303 | library for research software. 304 | 305 | 1. Generate the *recipe* by executing (``grayskull`` or ``conda grayskull``):: 306 | 307 | $ conda grayskull pypi runtest 308 | 309 | The command above will create a new folder called `runtest` containing a file `meta.yaml`, 310 | the conda recipe for building the `runtest` package. 311 | 312 | 2. View the contents of `meta.yaml` and check the requirements: 313 | 314 | .. code-block:: yaml 315 | 316 | requirements: 317 | host: 318 | - python 319 | - flit-core >=2,<4 320 | - pip 321 | run: 322 | - python 323 | 324 | In the requirements above, we specified what is required for the `host `__ and for `running `__ the package. 325 | 326 | .. callout:: Remark 327 | 328 | For pure python recipes, this is all you need for building a python package with conda. 329 | If your package needs to be built (for instance compilation), you would need additional files e.g. `build.sh` (to build on Linux/Mac-OSX) and `bld.bat` (to build on Windows systems). You can also add test scripts for testing your package. See `documentation `__ 330 | 331 | 332 | 3. Build your package with conda 333 | 334 | Your package is now ready to be built with conda:: 335 | 336 | $ conda build runtest 337 | 338 | 339 | .. callout:: Conda package location 340 | 341 | Look at the messages produced while building. The location of the local conda package is given (search for `anaconda upload`): 342 | 343 | .. code-block:: none 344 | 345 | /home/username/miniforge3/conda-bld/noarch/runtest-2.3.4-py_0.tar.bz2 346 | 347 | The prefix ``/home/username/miniforge3/`` may be different on your machine, 348 | depending on your operating system (Linux, Mac-OSX or Windows). The sub-folder is named ``noarch`` since 349 | it is a pure-python package and the recipe indicates the same. 350 | 351 | If the package contained compiled code then the sub-folder would have been named ``win-64`` or ``linux-64``.
352 | It could then be converted to other platforms using 353 | `conda convert `__. 354 | 355 | 4. Check within new environment 356 | 357 | It is not necessary to create a new conda environment to install it but as explained in previous episode, it is good practice to have isolated environments. 358 | 359 | :: 360 | 361 | $ conda create -n local-runtest --use-local runtest 362 | 363 | We can then check `runtest` has been successfully installed in `local-runtest` conda environment. Open a new Terminal with `local-runtest` environment (either from the command line:: 364 | 365 | $ conda activate local-runtest 366 | 367 | or via **Anaconda Navigator** (Open Terminal), import runtest and 368 | check its version: 369 | 370 | .. code-block:: python 371 | 372 | import runtest 373 | print(runtest.__version__) 374 | 375 | 376 | .. callout:: Building a conda package from scratch 377 | 378 | It is possible to build a conda package from scratch without using conda grayskull. 379 | We recommend you to check the 380 | `conda-build documentation `__ 381 | for more information. 382 | 383 | To be able to share and install your local conda package anywhere (on other platforms), you would need to upload it to a `conda channel `__ (see below). 384 | 385 | 386 | 387 | Publishing a python package 388 | *************************** 389 | 390 | - Upload your package to `conda-forge `__: 391 | conda-forge is a conda channel: it contains community-led collection of 392 | recipes, build infrastructure and distributions for the conda package 393 | manager. Anyone can 394 | `publish conda packages to conda-forge `__ 395 | if certain 396 | `guidelines `__ are respected. 397 | 398 | - Upload your package to `bioconda `_: bioconda is 399 | a very popular channel for the conda package manager specializing in 400 | bioinformatics software. As for conda-forge, you need to follow their 401 | `guidelines `__ when 402 | building conda recipes. 
403 | 404 | You can also `create your own conda channel 405 | `__ 406 | for publishing your packages. 407 | 408 | 409 | .. keypoints:: 410 | 411 | - It is worth it to organize your code for publishing, even if only 412 | you are using it. 413 | - PyPI is a place for Python packages 414 | - conda is similar but is not limited to Python 415 | -------------------------------------------------------------------------------- /content/parallel-pi-multiprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python multithreading solution\n", 8 | "Here, we will create a simple stochastic calculation of pi, and then parallelize it using multiprocessing (and multithreading to compare)." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import random" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "def sample(n):\n", 27 | " \"\"\"Make n trials of points in the square. Return (n, number_in_circle)\n", 28 | " \n", 29 | " This is our basic function. By design, it returns everything it\\\n", 30 | " needs to compute the final answer: both n (even though it is an input\n", 31 | " argument) and n_inside_circle. 
To compute our final answer, all we\n", 32 | " have to do is sum up the n:s and the n_inside_circle:s and do our\n", 33 | " computation\"\"\"\n", 34 | " n_inside_circle = 0\n", 35 | " for i in range(n):\n", 36 | " x = random.random()\n", 37 | " y = random.random()\n", 38 | " if x**2 + y**2 < 1.0:\n", 39 | " n_inside_circle += 1\n", 40 | " return n, n_inside_circle" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "598 ms ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "%%timeit\n", 58 | "# Do it just for timing\n", 59 | "n, n_inside_circle = sample(10**6)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Do the actual calculation (the previous result doesn't get saved)\n", 69 | "n, n_inside_circle = sample(10**6)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "This is the \"calculate answer\" phase." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "3.144548" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "pi = 4.0 * (n_inside_circle / n)\n", 97 | "pi" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Do it in parallel with multiprocessing\n", 105 | "This divides the calculation into 10 tasks and runs `sample` on each of them. Then it re-combines the results." 
106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "import multiprocessing.pool\n", 115 | "pool = multiprocessing.pool.Pool()\n", 116 | "# The default pool makes one process per CPU" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "320 ms ± 38.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "%%timeit\n", 134 | "# Do it once to time it\n", 135 | "results = pool.map(sample, [10**5] * 10)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 8, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Do it again to get the results, since the results of the above\n", 145 | "# cell aren't accessible because of the %%timeit magic.\n", 146 | "results = pool.map(sample, [10**5] * 10)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 9, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "pool.close()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 10, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "3.140768" 167 | ] 168 | }, 169 | "execution_count": 10, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "n_sum = sum(x[0] for x in results)\n", 176 | "n_inside_circle_sum = sum(x[1] for x in results)\n", 177 | "pi = 4.0 * (n_inside_circle_sum / n_sum)\n", 178 | "pi" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## Do it in \"parallel\" with threads\n", 186 | "To compare. This should not be any faster, because the multiple Python functions can not run at the same time in the same process." 
187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 11, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "threadpool = multiprocessing.pool.ThreadPool()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 12, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "635 ms ± 28.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 208 | ] 209 | }, 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "" 214 | ] 215 | }, 216 | "execution_count": 12, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "%%timeit -o\n", 223 | "# Do it once to time it\n", 224 | "threadpool.map(sample, [10**5] * 10)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 13, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# Do it again to get the results, since the results of the above\n", 234 | "# cell aren't accessible because of the %%timeit magic.\n", 235 | "results = threadpool.map(sample, [10**5] * 10)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 14, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "threadpool.close()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 15, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "3.142388" 256 | ] 257 | }, 258 | "execution_count": 15, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "n_sum = sum(x[0] for x in results)\n", 265 | "n_inside_circle_sum = sum(x[1] for x in results)\n", 266 | "pi = 4.0 * (n_inside_circle_sum / n_sum)\n", 267 | "pi" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Future ideas\n", 275 | "\n", 276 | "You could make a separate `calculate` function that take a 
list of results and returns pi. This can be used regardless of if it is done with multiprocessing or without.\n", 277 | "\n", 278 | "Notice the similarity to [split-apply-combine](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html) or [map-reduce](https://en.wikipedia.org/wiki/MapReduce) which is a specialization of split-apply-combine." 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.8.5" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 4 303 | } 304 | -------------------------------------------------------------------------------- /content/plotting-matplotlib/customizing/gapminder-larger-font.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/customizing/gapminder-larger-font.png -------------------------------------------------------------------------------- /content/plotting-matplotlib/customizing/gapminder-linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/customizing/gapminder-linear.png -------------------------------------------------------------------------------- /content/plotting-matplotlib/customizing/gapminder-log.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/customizing/gapminder-log.png -------------------------------------------------------------------------------- /content/plotting-matplotlib/first-plot/exercise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/first-plot/exercise.png -------------------------------------------------------------------------------- /content/plotting-matplotlib/first-plot/getting-started.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/plotting-matplotlib/first-plot/getting-started.png -------------------------------------------------------------------------------- /content/plotting-vega-altair/temperature-ranges-combined.svg: -------------------------------------------------------------------------------- 1 | Oct 2022Nov 2022Dec 2022Jan 2023Feb 2023Mar 2023Apr 2023May 2023Jun 2023Jul 2023Aug 2023Sep 2023Oct 2023date (year-month)−20−15−10−505101520253035max temperature, min temperatureOslo - BlindernTromso - Langnesname -------------------------------------------------------------------------------- /content/productivity.md: -------------------------------------------------------------------------------- 1 | # Productivity tools 2 | 3 | :::{objectives} 4 | - Know about tools that can help you **spot code problems** and help you following 5 | a **consistent code style** without you having to do it manually. 6 | - Get an overview of **AI-based tools** and how they can help you 7 | writing code. 
8 | ::: 9 | 10 | :::{instructor-note} 11 | - Demo/discussion: 20 min 12 | ::: 13 | 14 | 15 | ## Linters and formatters 16 | 17 | **Linter**: Tool that analyzes source code to detect potential errors, unused 18 | imports, unused variables, code style violations, and to improve readability. 19 | - Popular linters: 20 | - [Autoflake](https://pypi.org/project/autoflake/) 21 | - [Flake8](https://flake8.pycqa.org/) 22 | - [Pyflakes](https://pypi.org/project/pyflakes/) 23 | - [Pycodestyle](https://pycodestyle.pycqa.org/) 24 | - [Pylint](https://pylint.readthedocs.io/) 25 | - [Ruff](https://docs.astral.sh/ruff/) 26 | 27 | **Formatter**: Tool that automatically formats your code to a consistent style, 28 | for instance following [PEP 8](https://peps.python.org/pep-0008/). 29 | 30 | - Popular formatters: 31 | - [Black](https://black.readthedocs.io/) 32 | - [YAPF](https://github.com/google/yapf) 33 | - [Ruff](https://docs.astral.sh/ruff/) 34 | 35 | In this course we will focus on [Ruff](https://docs.astral.sh/ruff/) since it 36 | can do **both checking and formatting** and you don't have to switch between 37 | multiple tools. 38 | 39 | :::{discussion} Linters and formatters can be configured to your liking 40 | These tools typically have good defaults. But if you don't like the defaults, 41 | you can configure what they should ignore or how they should format or not format. 
42 | ::: 43 | 44 | 45 | ## Examples 46 | 47 | This code example (which we possibly recognize from the previous section about 48 | {ref}`profiling`) 49 | has few problems (highlighted): 50 | ```{code-block} python 51 | --- 52 | emphasize-lines: 2, 7, 10 53 | --- 54 | import re 55 | import requests 56 | 57 | 58 | def count_unique_words(file_path: str) -> int: 59 | unique_words = set() 60 | forgotten_variable = 13 61 | with open(file_path, "r", encoding="utf-8") as file: 62 | for line in file: 63 | words = re.findall(r"\b\w+\b", line.lower())) 64 | for word in words: 65 | unique_words.add(word) 66 | return len(unique_words) 67 | ``` 68 | 69 | Please try whether you can locate these problems using Ruff: 70 | ```console 71 | $ ruff check 72 | ``` 73 | 74 | Next, let us try to auto-format a code example which is badly formatted and also difficult 75 | to read: 76 | :::::{tabs} 77 | ::::{tab} Badly formatted 78 | ```python 79 | import re 80 | def count_unique_words (file_path : str)->int: 81 | unique_words=set() 82 | with open(file_path,"r",encoding="utf-8") as file: 83 | for line in file: 84 | words=re.findall(r"\b\w+\b",line.lower()) 85 | for word in words: 86 | unique_words.add(word) 87 | return len( unique_words ) 88 | ``` 89 | :::: 90 | 91 | ::::{tab} Auto-formatted 92 | ```python 93 | import re 94 | 95 | 96 | def count_unique_words(file_path: str) -> int: 97 | unique_words = set() 98 | with open(file_path, "r", encoding="utf-8") as file: 99 | for line in file: 100 | words = re.findall(r"\b\w+\b", line.lower()) 101 | for word in words: 102 | unique_words.add(word) 103 | return len(unique_words) 104 | ``` 105 | 106 | This was done using: 107 | ```console 108 | $ ruff format 109 | ``` 110 | :::: 111 | ::::: 112 | 113 | 114 | ## Type checking 115 | 116 | A (static) type checker is a tool that checks whether the types of variables in your 117 | code match the types that you have specified. 
118 | - Tools: 119 | - [Mypy](https://mypy.readthedocs.io/) 120 | - [Pyright](https://github.com/microsoft/pyright) (Microsoft) 121 | - [Pyre](https://pyre-check.org/) (Meta) 122 | 123 | 124 | ## Integration with editors 125 | 126 | Many/most of the above tools can be integrated with your editor. For instance, 127 | you can configure your editor to automatically format your code when you save 128 | the file. However, this only makes sense when all team members agree to follow 129 | the same style; otherwise saving and possibly committing changes to version 130 | control will introduce changes to code written by others which you possibly 131 | didn't intend to make. 132 | 133 | 134 | ## Integration with Jupyter notebooks 135 | 136 | It is possible to automatically format your code in Jupyter notebooks! 137 | For this to work you need 138 | the following three dependencies installed: 139 | - `jupyterlab-code-formatter` 140 | - `black` 141 | - `isort` 142 | 143 | More information and a screen-cast of how this works can be found at 144 | <https://jupyterlab-code-formatter.readthedocs.io/>. 145 | 146 | 147 | ## Integration with version control 148 | 149 | If you use version control and like to have your code checked or formatted 150 | **before you commit the change**, you can use tools like [pre-commit](https://pre-commit.com/). 151 | 152 | 153 | ## AI-assisted coding 154 | 155 | We can use AI as an assistant/apprentice: 156 | - Code completion 157 | - Write a test based on an implementation 158 | - Write an implementation based on a test 159 | 160 | Or we can use AI as a mentor: 161 | - Explain a concept 162 | - Improve code 163 | - Show a different (possibly better) way of implementing the same thing 164 | 165 | 166 | :::{figure} productivity/chatgpt.png 167 | :alt: Screenshot of ChatGPT 168 | :width: 100% 169 | 170 | Example for using a chat-based AI tool. 
171 | ::: 172 | 173 | :::{figure} productivity/code-completion.gif 174 | :alt: Screen-cast of working with GitHub Copilot 175 | :width: 100% 176 | 177 | Example for using AI to complete code in an editor. 178 | ::: 179 | 180 | :::{admonition} AI tools open up a box of questions 181 | - Legal 182 | - Ethical 183 | - Privacy 184 | - Lock-in/ monopolies 185 | - Lack of diversity 186 | - Will we still need to learn programming? 187 | - How will it affect learning and teaching programming? 188 | ::: 189 | -------------------------------------------------------------------------------- /content/productivity/chatgpt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/productivity/chatgpt.png -------------------------------------------------------------------------------- /content/productivity/code-completion.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/productivity/code-completion.gif -------------------------------------------------------------------------------- /content/profiling.md: -------------------------------------------------------------------------------- 1 | # Profiling 2 | 3 | :::{objectives} 4 | - Understand when improving code performance is worth the time and effort. 5 | - Knowing how to find performance bottlenecks in Python code. 6 | - Try `scalene` as one of many tools to profile Python code. 7 | ::: 8 | 9 | :::{instructor-note} 10 | - Discussion: 20 min 11 | - Exercise: 20 min 12 | ::: 13 | 14 | 15 | ## Should we even optimize the code? 16 | 17 | Classic quote to keep in mind: "Premature optimization is the root of all evil." [Donald Knuth] 18 | 19 | :::{discussion} 20 | It is important to ask ourselves whether it is worth it. 
21 | - Is it worth spending e.g. 2 days to make a program run 20% faster? 22 | - Is it worth optimizing the code so that it uses 90% less memory? 23 | 24 | Depends. What does it depend on? 25 | ::: 26 | 27 | 28 | ## Measure instead of guessing 29 | 30 | Before doing code surgery to optimize the run time or lower the memory usage, 31 | we should **measure** where the bottlenecks are. This is called **profiling**. 32 | 33 | Analogy: Medical doctors don't start surgery based on guessing. They first measure 34 | (X-ray, MRI, ...) to know precisely where the problem is. 35 | 36 | Without measuring, not only programming beginners but also experienced 37 | programmers can guess wrong and be surprised by the results of profiling. 38 | 39 | 40 | ## One of the simplest tools is to insert timers 41 | 42 | Below we will list some tools that can be used to profile Python code. 43 | But even without these tools you can find **time-consuming parts** of your code 44 | by inserting timers: 45 | 46 | 47 | 48 | ```{code-block} python 49 | --- 50 | emphasize-lines: 1,8,10 51 | --- 52 | import time 53 | 54 | 55 | # ... 56 | # code before the function 57 | 58 | 59 | start = time.time() 60 | result = some_function() 61 | print(f"some_function took {time.time() - start} seconds") 62 | 63 | 64 | # code after the function 65 | # ... 66 | ``` 67 | 68 | 69 | ## Many tools exist 70 | 71 | The list below is probably not complete, but it gives an overview of the 72 | different tools available for profiling Python code. 
73 | 74 | CPU profilers: 75 | - [cProfile and profile](https://docs.python.org/3/library/profile.html) 76 | - [line_profiler](https://kernprof.readthedocs.io/) 77 | - [py-spy](https://github.com/benfred/py-spy) 78 | - [Yappi](https://github.com/sumerc/yappi) 79 | - [pyinstrument](https://pyinstrument.readthedocs.io/) 80 | - [Perfetto](https://perfetto.dev/docs/analysis/trace-processor-python) 81 | 82 | Memory profilers: 83 | - [memory_profiler](https://pypi.org/project/memory-profiler/) (not actively maintained) 84 | - [Pympler](https://pympler.readthedocs.io/) 85 | - [tracemalloc](https://docs.python.org/3/library/tracemalloc.html) 86 | - [guppy/heapy](https://github.com/zhuyifei1999/guppy3/) 87 | 88 | Both CPU and memory: 89 | - [Scalene](https://github.com/plasma-umass/scalene) 90 | 91 | In the exercise below, we will use Scalene to profile a Python program. Scalene 92 | is a sampling profiler that can profile CPU, memory, and GPU usage of Python code. 93 | 94 | 95 | ## Tracing profilers vs. sampling profilers 96 | 97 | **Tracing profilers** record every function call and event in the program, 98 | logging the exact sequence and duration of events. 99 | - **Pros:** 100 | - Provides detailed information on the program's execution. 101 | - Deterministic: Captures exact call sequences and timings. 102 | - **Cons:** 103 | - Higher overhead, slowing down the program. 104 | - Can generate a larger amount of data. 105 | 106 | **Sampling profilers** periodically sample the program's state (where it is 107 | and how much memory is used), providing a statistical view of where time is 108 | spent. 109 | - **Pros:** 110 | - Lower overhead, as it doesn't track every event. 111 | - Scales better with larger programs. 112 | - **Cons:** 113 | - Less precise, potentially missing infrequent or short calls. 114 | - Provides an approximation rather than exact timing. 
115 | 116 | :::{discussion} Analogy: Imagine we want to optimize the London Underground (subway) system 117 | We wish to detect bottlenecks in the system to improve the service and for this we have 118 | asked a few passengers to help us by tracking their journeys. 119 | - **Tracing**: We follow every train and passenger, recording every stop 120 | and delay. When passengers enter and exit the train, we record the exact time 121 | and location. 122 | - **Sampling**: Every 5 minutes the phone notifies the passenger to note 123 | down their current location. We then use this information to estimate 124 | the most crowded stations and trains. 125 | ::: 126 | 127 | 128 | ## Choosing the right system size 129 | 130 | Sometimes we can configure the system size (for instance the time step in a simulation 131 | or the number of time steps or the matrix dimensions) to make the program finish sooner. 132 | 133 | For profiling, we should choose a system size that is **representative of the real-world** 134 | use case. If we profile a program with a small input size, we might not see the same 135 | bottlenecks as when running the program with a larger input size. 136 | 137 | Often, when we scale up the system size, or scale the number of processors, new bottlenecks 138 | might appear which we didn't see before. This brings us back to: "measure instead of guessing". 139 | 140 | 141 | ## Exercises 142 | 143 | ::::{exercise} Exercise: Practicing profiling 144 | In this exercise we will use the Scalene profiler to find out where most of the time is spent 145 | and most of the memory is used in a given code example. 146 | 147 | Please try to go through the exercise in the following steps: 148 | 1. Make sure `scalene` is installed in your environment (if you have followed 149 | this course from the start and installed the recommended software 150 | environment, then it is). 151 | 1. 
Download Leo Tolstoy's "War and Peace" from the following link (the text is 152 | provided by [Project Gutenberg](https://www.gutenberg.org/)): 153 | <https://www.gutenberg.org/cache/epub/2600/pg2600.txt> 154 | (right-click and "save as" to download the file and **save it as "book.txt"**). 155 | 1. **Before** you run the profiler, try to predict in which function the code 156 | (the example code is below) 157 | will spend most of the time and in which function it will use most of the 158 | memory. 159 | 1. Save the example code as `example.py` and 160 | run the `scalene` profiler on the following code example and browse the 161 | generated HTML report to find out where most of the time is spent and where 162 | most of the memory is used: 163 | ```console 164 | $ scalene example.py 165 | ``` 166 | Alternatively you can do this (and then open the generated file in a browser): 167 | ```console 168 | $ scalene example.py --html > profile.html 169 | ``` 170 | You can find an example of the generated HTML report in the solution below. 171 | 1. Does the result match your prediction? Can you explain the results? 172 | 173 | Example code (`example.py`): 174 | :::{literalinclude} profiling/exercise.py 175 | ::: 176 | 177 | :::{solution} 178 | ```{figure} profiling/exercise.png 179 | :alt: Result of the profiling run for the above code example. 180 | :width: 100% 181 | 182 | Result of the profiling run for the above code example. You can click on the image to make it larger. 183 | ``` 184 | 185 | Results: 186 | - Most time is spent in the `count_unique_words2` function. 187 | - Most memory is used in the `count_unique_words1` function. 188 | 189 | Explanation: 190 | - The `count_unique_words2` function is the slowest because it **uses a list** 191 | to store unique words and checks if a word is already in the list before 192 | adding it. 193 | Checking whether a list contains an element might require traversing the 194 | whole list, which is an O(n) operation. 
As the list grows in size, 195 | the lookup time increases with the size of the list. 196 | - The `count_unique_words1` and `count_unique_words3` functions are faster 197 | because they **use a set** to store unique words. 198 | Checking whether a set contains an element is an O(1) operation. 199 | - The `count_unique_words1` function uses the most memory because it **creates 200 | a list of all words** in the text file and then **creates a set** from that 201 | list. 202 | - The `count_unique_words3` function uses less memory because it traverses 203 | the text file line by line instead of reading the whole file into memory. 204 | 205 | What we can learn from this exercise: 206 | - When processing large files, it can be good to read them line by line 207 | or in batches 208 | instead of reading the whole file into memory. 209 | - It is good to get an overview over standard data structures and their 210 | advantages and disadvantages (e.g. adding an element to a list is fast but checking whether 211 | it already contains the element can be slow). 212 | ::: 213 | :::: 214 | 215 | 216 | ## Additional resources 217 | 218 | - [Python performance workshop (by ENCCS)](https://enccs.github.io/python-perf/profile/) 219 | -------------------------------------------------------------------------------- /content/profiling/exercise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/content/profiling/exercise.png -------------------------------------------------------------------------------- /content/profiling/exercise.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code below reads a text file and counts the number of unique words in it 3 | (case-insensitive). 
import re


def count_unique_words1(file_path: str) -> int:
    """Return the number of distinct words (case-insensitive) in a text file.

    The file is opened as UTF-8 and read into a single string, which is
    lower-cased and scanned with the regular expression ``\\b\\w+\\b``.
    The resulting list of words is turned into a set and its size returned.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
    words = re.findall(r"\b\w+\b", text.lower())
    return len(set(words))


def count_unique_words2(file_path: str) -> int:
    """Return the number of distinct words (case-insensitive) in a text file.

    The file is opened as UTF-8 and iterated line by line. Each lower-cased
    line is scanned with ``\\b\\w+\\b``; a word is appended to a list only if
    the list does not already contain it. The length of the list is returned.
    """
    unique_words = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            words = re.findall(r"\b\w+\b", line.lower())
            for word in words:
                # Membership test on the list keeps entries unique.
                if word not in unique_words:
                    unique_words.append(word)
    return len(unique_words)


def count_unique_words3(file_path: str) -> int:
    """Return the number of distinct words (case-insensitive) in a text file.

    The file is opened as UTF-8 and iterated line by line. Each lower-cased
    line is scanned with ``\\b\\w+\\b`` and every word is added to a set.
    The size of the set is returned.
    """
    unique_words = set()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            words = re.findall(r"\b\w+\b", line.lower())
            for word in words:
                unique_words.add(word)
    return len(unique_words)


def main():
    """Run the three counting functions once each on ``book.txt``.

    Each return value is assigned to ``_result`` and not used afterwards.
    """
    # book.txt is downloaded from https://www.gutenberg.org/cache/epub/2600/pg2600.txt
    _result = count_unique_words1("book.txt")
    _result = count_unique_words2("book.txt")
    _result = count_unique_words3("book.txt")


if __name__ == "__main__":
    main()
17 | 18 | If you are not familiar with Python, here is a *very* short 19 | introduction. It will not be enough to do everything in this course, 20 | but you will be able to follow along a bit more than you would otherwise. 21 | 22 | .. seealso:: 23 | 24 | This page contains an overview of the basics of Python. You can 25 | also refer to `This Python overview from a different lesson 26 | `__ 27 | which is slightly more engaging. 28 | 29 | 30 | 31 | Scalars 32 | ------- 33 | 34 | Scalar types, that is, single elements of various types: 35 | 36 | :: 37 | 38 | i = 42 # integer 39 | i = 2**77 # Integers have arbitrary precision 40 | g = 3.14 # floating point number 41 | c = 2 - 3j # Complex number 42 | b = True # boolean 43 | s = "Hello!" # String (Unicode) 44 | q = b'Hello' # bytes (8-bit values) 45 | 46 | Read more: :class:`int`, :class:`float`, :class:`complex`, 47 | :class:`bool`, :class:`str`, :class:`bytes`. 48 | 49 | 50 | Collections 51 | ----------- 52 | 53 | Collections are data structures capable of storing multiple values. 54 | 55 | :: 56 | 57 | l = [1, 2, 3] # list 58 | l[1] # lists are indexed by int 59 | l[1] = True # list elements can be any type 60 | d = {"Janne": 123, "Richard": 456} # dictionary 61 | d["Janne"] 62 | s = set(("apple", "cherry", "banana", "apple")) # Set of unique values 63 | s 64 | 65 | Read more: :class:`list`, :class:`tuple`, :class:`dict`, :class:`set`. 66 | 67 | 68 | Control structures 69 | ------------------ 70 | 71 | Python has the usual control structures, that is conditional 72 | statements and loops. 
For example, the :ref:`if` statement: 73 | 74 | :: 75 | 76 | x = 2 77 | if x == 3: 78 | print('x is 3') 79 | elif x == 2: 80 | print('x is 2') 81 | else: 82 | print('x is something else') 83 | 84 | :ref:`While ` loops loop until some condition is met: 85 | 86 | :: 87 | 88 | x = 0 89 | while x < 42: 90 | print('x is ', x) 91 | x += 0.2 92 | 93 | :ref:`For ` loops loop over some collection of values: 94 | 95 | :: 96 | 97 | xs = [1, 2, 3, 4] 98 | for x in xs: 99 | print(x) 100 | 101 | 102 | Often you want to loop over a sequence of integers, in that case the 103 | :class:`range` function is useful: 104 | 105 | :: 106 | 107 | for x in range(9): 108 | print(x) 109 | 110 | Another common need is to iterate over a collection, but at the same 111 | time also have an index number. For this there is the :func:`enumerate` 112 | function: 113 | 114 | :: 115 | 116 | xs = [1, 'hello', 'world'] 117 | for ii, x in enumerate(xs): 118 | print(ii, x) 119 | 120 | 121 | Functions and classes 122 | --------------------- 123 | 124 | Python functions are defined by the :ref:`def` keyword. They take a 125 | number of arguments, and return a number of return values. 126 | 127 | :: 128 | 129 | def hello(name): 130 | """Say hello to the person given by the argument""" 131 | print('Hello', name) 132 | return 'Hello ' + name 133 | 134 | hello("Anne") 135 | 136 | Classes are defined by the :ref:`class` keyword: 137 | 138 | :: 139 | 140 | class Hello: 141 | def __init__(self, name): 142 | self._name = name 143 | def say(self): 144 | print('Hello', self._name) 145 | 146 | h = Hello("Richard") 147 | h.say() 148 | 149 | 150 | Python type system 151 | ------------------ 152 | 153 | Python is strongly and dynamically typed. 154 | 155 | Strong here means, roughly, that it's not possible to circumvent the 156 | type system (at least, not easily, and not without invoking undefined 157 | behavior). 
158 | 159 | :: 160 | 161 | x = 42 162 | type(x) 163 | x + "hello" 164 | 165 | Dynamic typing means that types are determined at runtime, and a 166 | variable can be redefined to refer to an instance of another type: 167 | 168 | :: 169 | 170 | x = 42 171 | x = "hello" 172 | 173 | 174 | *Jargon*: Types are associated with rvalues, not lvalues. In 175 | statically typed language, types are associated with lvalues, and are 176 | (typically) reified during compilation. 177 | 178 | 179 | ??? (lesson here) 180 | 181 | 182 | 183 | .. keypoints:: 184 | 185 | - Python offers a nice set of basic types as many other programming languages 186 | - Python is strongly typed and dynamically typed 187 | -------------------------------------------------------------------------------- /content/quick-reference.rst: -------------------------------------------------------------------------------- 1 | Quick reference 2 | =============== 3 | 4 | * `Pandas cheatsheet 5 | `__ (pandas.pydata.org) 6 | 7 | * `Pandas cheatsheet 8 | `__ 9 | (via `Datacamp 10 | `__) 11 | 12 | * `Numpy cheatsheet 13 | `__ 14 | (via `Datacamp 15 | `__) 16 | 17 | * `JupyterLab cheatsheet 18 | `__ 19 | 20 | * `Matplotlib cheatsheet 21 | `__ 22 | (via `Datacamp 23 | `__) 24 | 25 | * `Numpy, Pandas, Matplotlib, Scikit-learn all together 26 | `__ 27 | -------------------------------------------------------------------------------- /content/scipy.rst: -------------------------------------------------------------------------------- 1 | SciPy 2 | ===== 3 | 4 | .. questions:: 5 | 6 | - When you need more advanced mathematical functions, where do you 7 | look? 8 | 9 | .. objectives:: 10 | 11 | - Understand that SciPy exists and what kinds of things it has. 12 | - Understand the importance of using external libraries and how to 13 | use them. 14 | - Understand the purpose of wrapping existing C/Fortran code. 15 | - Non-objective: know details of everything (or anything) in SciPy. 16 | 17 | .. 
seealso:: 18 | 19 | * Main article: `SciPy documentation `__ 20 | 21 | 22 | 23 | SciPy is a library that builds on top of NumPy. It contains a lot of 24 | interfaces to battle-tested numerical routines written in Fortran or 25 | C, as well as Python implementations of many common algorithms. 26 | 27 | 28 | 29 | What's in SciPy? 30 | ---------------- 31 | 32 | Briefly, it contains functionality for 33 | 34 | - Special functions (Bessel, Gamma, etc.) 35 | - Numerical integration 36 | - Optimization 37 | - Interpolation 38 | - Fast Fourier Transform (FFT) 39 | - Signal processing 40 | - Linear algebra (more complete than in NumPy) 41 | - Sparse matrices 42 | - Statistics 43 | - More I/O routines, e.g. Matrix Market format for sparse matrices, 44 | MATLAB files (.mat), etc. 45 | 46 | Many (most?) of these are not written specifically for SciPy, but use 47 | the best available open source C or Fortran libraries. Thus, you get 48 | the best of Python and the best of compiled languages. 49 | 50 | Most functions are documented ridiculously well from a scientific 51 | standpoint: you aren't just using some unknown function, but have a 52 | full scientific description and citation to the method and 53 | implementation. 54 | 55 | 56 | 57 | Exercises: use SciPy 58 | -------------------- 59 | 60 | These exercises do not exist because *you* might need *these* 61 | functions someday. They exist because *you* will need to *read 62 | and understand the documentation of an external library* 63 | eventually. 64 | 65 | 1: Numerical integration 66 | ~~~~~~~~~~~~~~~~~~~~~~~~ 67 | 68 | .. challenge:: 69 | 70 | Do the following exercise **or** read the documentation and 71 | understand the relevant functions of SciPy: 72 | 73 | Define a function of one variable and using 74 | `scipy.integrate.quad `__ 75 | calculate the integral of your function in the 76 | interval ``[0.0, 4.0]``. Then vary the interval and also modify the function and check 77 | whether SciPy can integrate it. 
78 | 79 | 80 | .. solution:: 81 | 82 | .. code-block:: python 83 | 84 | from scipy import integrate 85 | 86 | def myfunction(x): 87 | # you need to define result 88 | return result 89 | 90 | integral = integrate.quad(myfunction, 0.0, 4.0) 91 | print(integral) 92 | 93 | `quad 94 | `__ 95 | uses the Fortran library QUADPACK, which one can assume is pretty 96 | good. You can also see a whole lot of scientific information about 97 | the function on the docs page - including the scientific names of 98 | the methods used. 99 | 100 | 101 | 102 | 2: Sparse matrices 103 | ~~~~~~~~~~~~~~~~~~ 104 | 105 | .. challenge:: 106 | 107 | Do the following exercise **or** read the documentation and 108 | understand the relevant functions of SciPy: 109 | 110 | Use the SciPy sparse matrix functionality to create a random sparse 111 | matrix with a probability of non-zero elements of 0.05 and size 10000 112 | x 10000. Then use the SciPy sparse linear algebra support to calculate 113 | the matrix-vector product of the sparse matrix you just created and a 114 | random vector. Use the ``%timeit`` magic to measure how long it 115 | takes. Does the optional ``format`` argument when you create the 116 | sparse matrix make a difference? 117 | 118 | Then, compare to how long it takes if you'd instead first convert the 119 | sparse matrix to a normal NumPy dense array, and use the NumPy ``dot`` 120 | method to calculate the matrix-vector product. 121 | 122 | Can you figure out a quick rule of thumb when it's worth using a 123 | sparse matrix representation vs. a dense representation? 124 | 125 | .. solution:: 126 | 127 | The basic code to do the test is: 128 | 129 | .. 
code-block:: 130 | 131 | import numpy 132 | import scipy.sparse 133 | 134 | vector = numpy.random.random(10000) 135 | matrix = scipy.sparse.rand(10000, 10000, density=.05, format='csc') 136 | 137 | # We time this line 138 | matrix.dot(vector) 139 | 140 | From the top of the `sparse matrix module documentation 141 | `__, we can 142 | see there are a variety of different available sparse matrix types: 143 | ``bsr``, ``coo``, ``csr``, ``csc``, etc. These each represent a 144 | different way of storing the matrices. 145 | 146 | It seems that ``csr`` and ``csc`` are fairly fast. ``lil`` and 147 | ``dok`` are slow but the documentation says that these are good for creating 148 | matrices with random insertions. 149 | 150 | For example, ``csr`` takes 7ms, ``lil`` 42ms, ``dok`` 1600ms, and 151 | converting to a non-sparse array ``matrix.toarray()`` and 152 | multiplying takes 64ms on one particular computer. 153 | 154 | This code allows us to time the performance at different 155 | densities. It seems that with the ``csr`` format, sparse is better 156 | below densities of around .4 to .5: 157 | 158 | .. code-block:: 159 | 160 | for density in [.01, .05, .1, .2, .3, .4, .5]: 161 | matrix = scipy.sparse.rand(10000, 10000, density=density, format='csr') 162 | time_sparse = timeit.timeit('matrix.dot(vector)', number=10, globals=globals()) 163 | matrix2 = matrix.toarray() 164 | time_full = timeit.timeit('matrix2.dot(vector)', number=10, globals=globals()) 165 | print(f"{density} {time_sparse:.3f} {time_full:.3f}") 166 | 167 | 168 | 169 | See also 170 | -------- 171 | 172 | * `SciPy general introduction `__ 173 | * `SciPy documentation 174 | `__ 175 | 176 | 177 | 178 | .. keypoints:: 179 | 180 | - When you need advanced math or scientific functions, let's just 181 | admit it: you do a web search first. 182 | - But when you see something in SciPy come up, you know your 183 | solutions are in good hands. 
184 | -------------------------------------------------------------------------------- /content/work-with-data.rst: -------------------------------------------------------------------------------- 1 | Working with Data 2 | ================= 3 | 4 | .. questions:: 5 | 6 | - How do you store your data right now? 7 | - Are you doing data cleaning / preprocessing every time you load the data? 8 | 9 | .. objectives:: 10 | 11 | - Learn benefits/drawbacks of common data formats. 12 | - Learn how you can read and write data in a variety of formats. 13 | 14 | 15 | .. figure:: https://imgs.xkcd.com/comics/norm_normal_file_format.png 16 | 17 | Source: `xkcd #2116 `__ 18 | 19 | 20 | What is a data format? 21 | ---------------------- 22 | 23 | Data format can mean two different things 24 | 25 | 1. `data structure `__ or how 26 | you're storing the data in memory while you're working on it; 27 | 2. `file format `__ or the way you're 28 | storing the data in the disk. 29 | 30 | Let's consider this randomly generated DataFrame with various columns:: 31 | 32 | import pandas as pd 33 | import numpy as np 34 | 35 | n_rows = 100000 36 | 37 | dataset = pd.DataFrame( 38 | data={ 39 | 'string': np.random.choice(('apple', 'banana', 'carrot'), size=n_rows), 40 | 'timestamp': pd.date_range("20130101", periods=n_rows, freq="s"), 41 | 'integer': np.random.choice(range(0,10), size=n_rows), 42 | 'float': np.random.uniform(size=n_rows), 43 | }, 44 | ) 45 | 46 | dataset.info() 47 | 48 | This DataFrame is structured in the *tidy data* format. 49 | In tidy data we have multiple columns of data that are collected in a Pandas 50 | DataFrame, where each column represents a value of a specific type. 51 | 52 | .. image:: img/pandas/tidy_data.png 53 | 54 | Let's consider another example:: 55 | 56 | n = 1000 57 | 58 | data_array = np.random.uniform(size=(n,n)) 59 | np.info(data_array) 60 | 61 | 62 | Here we have a different data structure: we have a two-dimensional array of numbers. 
63 | This is different to a Pandas DataFrame as data is stored as one contiguous block 64 | instead of individual columns. This also means that the whole array must have one 65 | data type. 66 | 67 | 68 | .. figure:: https://github.com/elegant-scipy/elegant-scipy/raw/master/figures/NumPy_ndarrays_v2.png 69 | 70 | Source: `Elegant Scipy `__ 71 | 72 | Now the question is: **Can the data be saved to the disk without changing the 73 | data format?** 74 | 75 | For this we need a **file format** that can easily store our **data structure**. 76 | 77 | .. admonition:: Data type vs. data structure vs. file format 78 | :class: dropdown 79 | 80 | - **Data type:** Type of a single piece of data (integer, string, 81 | float, ...). 82 | - **Data structure:** How the data is organized in memory (individual 83 | columns, 2D-array, nested dictionaries, ...). 84 | - **File format:** How the data is organized when it is saved to the disk 85 | (columns of strings, block of binary data, ...). 86 | 87 | For example, a black and white image stored as a .png-file (**file format**) 88 | might be stored in memory as an NxM array (**data structure**) of integers 89 | (**data type**) with each entry representing the color value of the pixel. 90 | 91 | What to look for in a file format? 92 | ---------------------------------- 93 | 94 | When deciding which file format you should use for your program, you should 95 | remember the following: 96 | 97 | **There is no file format that is good for every use case.** 98 | 99 | and 100 | 101 | **It is very likely, that a good format already exists for your use case.** 102 | 103 | There are, indeed, various standard file formats for various use cases: 104 | 105 | .. figure:: https://imgs.xkcd.com/comics/standards.png 106 | 107 | Source: `xkcd #927 `__. 108 | 109 | Usually, you'll want to consider the following things when choosing a file 110 | format: 111 | 112 | 1. 
Is the file format good for my data structure (is it fast/space 113 | efficient/easy to use)? 114 | 2. Is everybody else / leading authorities in my field recommending a certain 115 | format? 116 | 3. Do I need a human-readable format or is it enough to work on it using code? 117 | 4. Do I want to archive / share the data or do I just want to store it while 118 | I'm working? 119 | 120 | Pandas supports 121 | `many file formats `__ 122 | for tidy data and Numpy supports 123 | `some file formats `__ 124 | for array data. However, there are many other file formats that can be used 125 | through other libraries. 126 | 127 | Table below describes some data formats: 128 | 129 | .. list-table:: 130 | :header-rows: 1 131 | 132 | * - | Name: 133 | - | Human 134 | | readable: 135 | - | Space 136 | | efficiency: 137 | - | Arbitrary 138 | | data: 139 | - | Tidy 140 | | data: 141 | - | Array 142 | | data: 143 | - | Long term 144 | | storage/sharing: 145 | 146 | * - :ref:`Pickle ` 147 | - ❌ 148 | - 🟨 149 | - ✅ 150 | - 🟨 151 | - 🟨 152 | - ❌ 153 | 154 | * - :ref:`CSV ` 155 | - ✅ 156 | - ❌ 157 | - ❌ 158 | - ✅ 159 | - 🟨 160 | - ✅ 161 | 162 | * - :ref:`Feather ` 163 | - ❌ 164 | - ✅ 165 | - ❌ 166 | - ✅ 167 | - ❌ 168 | - ❌ 169 | 170 | * - :ref:`Parquet ` 171 | - ❌ 172 | - ✅ 173 | - 🟨 174 | - ✅ 175 | - 🟨 176 | - ✅ 177 | 178 | * - :ref:`npy ` 179 | - ❌ 180 | - 🟨 181 | - ❌ 182 | - ❌ 183 | - ✅ 184 | - ❌ 185 | 186 | * - :ref:`HDF5 ` 187 | - ❌ 188 | - ✅ 189 | - ❌ 190 | - ❌ 191 | - ✅ 192 | - ✅ 193 | 194 | * - :ref:`NetCDF4 ` 195 | - ❌ 196 | - ✅ 197 | - ❌ 198 | - ❌ 199 | - ✅ 200 | - ✅ 201 | 202 | * - :ref:`JSON ` 203 | - ✅ 204 | - ❌ 205 | - 🟨 206 | - ❌ 207 | - ❌ 208 | - ✅ 209 | 210 | * - :ref:`Excel ` 211 | - ❌ 212 | - ❌ 213 | - ❌ 214 | - 🟨 215 | - ❌ 216 | - 🟨 217 | 218 | * - :ref:`Graph formats ` 219 | - 🟨 220 | - 🟨 221 | - ❌ 222 | - ❌ 223 | - ❌ 224 | - ✅ 225 | 226 | .. 
important:: 227 | 228 | - ✅ : Good 229 | - 🟨 : OK / depends on the case 230 | - ❌ : Bad 231 | 232 | 233 | A more in-depth analysis of the file formats mentioned above can be found 234 | :doc:`here `. 235 | 236 | Pros and cons 237 | ------------- 238 | 239 | Let's have a general look at pros and cons of some types of file formats 240 | 241 | Binary File formats 242 | ~~~~~~~~~~~~~~~~~~~ 243 | 244 | Good things 245 | +++++++++++ 246 | 247 | - Can represent floating point numbers with full precision. 248 | - Can potentially save lots of space, especially when storing numbers. 249 | - Data reading and writing is usually much faster than loading from text files, 250 | since the format contains information about the data structure, and thus 251 | memory allocation can be done more efficiently. 252 | - More explicit specification for storing multiple data sets and metadata in 253 | the same file. 254 | - Many binary formats allow for partial loading of the data. 255 | This makes it possible to work with datasets that are larger than your 256 | computer's memory. 257 | 258 | Bad things 259 | ++++++++++ 260 | 261 | - Commonly requires the use of a specific library to read and write the data. 262 | - Library specific formats can be version dependent. 263 | - Not human readable. 264 | - Sharing can be more difficult (requires some expertise to be able to 265 | read the data). 266 | - Might require more documentation efforts. 267 | 268 | Textual formats 269 | ~~~~~~~~~~~~~~~ 270 | 271 | Good things 272 | +++++++++++ 273 | 274 | - Human readable. 275 | - Easy to check for (structural) errors. 276 | - Supported by many tools out of the box. 277 | - Easily shared. 278 | 279 | Bad things 280 | ++++++++++ 281 | 282 | - Can be slow to read and write. 283 | - High potential to increase required disk space substantially (e.g. when 284 | storing floating point numbers as text). 285 | - Prone to losing precision when storing floating point numbers. 
286 | - Multi-dimensional data can be hard to represent. 287 | - While the data format might be specified, the data structure might not be 288 | clear when starting to read the data. 289 | 290 | Further considerations 291 | ~~~~~~~~~~~~~~~~~~~~~~ 292 | 293 | - The closer your stored data is to the code, the more likely it depends on the 294 | environment you are working in. If you ``pickle``, e.g. a generated model, 295 | you can only be sure that the model will work as intended if you load it in 296 | an environment that has the same versions of all libraries the model depends 297 | on. 298 | 299 | 300 | Exercise 301 | -------- 302 | 303 | .. challenge:: 304 | 305 | You have a model that you have been training for a while. 306 | Let's assume it's a relatively simple neural network (consisting of a 307 | network structure and its associated weights). 308 | 309 | Let's consider 2 scenarios: 310 | 311 | A: You have a different project that is supposed to take this model, and 312 | do some processing with it to determine its efficiency after different 313 | times of training. 314 | 315 | B: You want to publish the model and make it available to others. 316 | 317 | What are good options to store the model in each of these scenarios? 318 | 319 | .. solution:: 320 | 321 | A: 322 | 323 | Some export into a binary format that can be easily read. E.g. pickle 324 | or a specific export function from the library you use. 325 | 326 | It also depends on whether you intend to make the intermediary steps 327 | available to others. If you do, you might also want to consider storing 328 | structure and weights separately or use a format specific for the 329 | type of model you are training to keep the data independent of the 330 | library. 331 | 332 | B: 333 | 334 | You might want to consider a more general format that is supported by 335 | many libraries, e.g. ONNX, or a format that is specifically designed 336 | for the type of model you are training. 
337 | 338 | You might also want to consider additionally storing the model in a way 339 | that is easily readable by humans, to make it easier for others to 340 | understand the model. 341 | 342 | 343 | Case study: Converting untidy data to tidy data 344 | ----------------------------------------------- 345 | 346 | Many data analysis tools (like Pandas) are designed to work with tidy data, 347 | but some data is not in a suitable format. What we have seen often in the 348 | past is people then not using the powerful tools, but write complicated 349 | scripts that extract individual pieces from the data each time they need 350 | to do a calculation. 351 | 352 | As an example, let's see how we can use country data from an example REST API 353 | endpoint (for more information on how to work with web APIs, see 354 | :doc:`this page `). Let's get the data with the following piece 355 | of code: 356 | 357 | .. code-block:: python 358 | 359 | import json 360 | import requests 361 | 362 | url = 'https://api.sampleapis.com/countries/countries' 363 | 364 | response = requests.get(url) 365 | 366 | countries_json = json.loads(response.content) 367 | 368 | Let's try to find the country with the largest population. 369 | 370 | An example of a "questionable" way of solving this problem would be something 371 | like the following piece of code that is written in pure Python: 372 | 373 | .. code-block:: python 374 | 375 | max_population = 0 376 | top_population_country = '' 377 | 378 | for country in countries_json: 379 | if country.get('population', 0) > max_population: 380 | top_population_country = country['name'] 381 | max_population = country.get('population', 0) 382 | 383 | print(top_population_country) 384 | 385 | This is a very natural way of writing a solution for the problem, but it has 386 | major caveats: 387 | 388 | 1. We throw all of the other data out so we cannot answer any 389 | follow up questions. 390 | 2. For bigger data, this would be very slow and ineffective. 
391 | 3. We have to write lots of code to do a simple thing. 392 | 393 | Another typical solution would be something like the following code, 394 | which picks some of the data and creates a Pandas dataframe out of it: 395 | 396 | .. code-block:: python 397 | 398 | import pandas as pd 399 | 400 | countries_list = [] 401 | 402 | for country in countries_json: 403 | countries_list.append([country['name'], country.get('population',0)]) 404 | 405 | countries_df = pd.DataFrame(countries_list, columns=['name', 'population']) 406 | 407 | print(countries_df.nlargest(1, 'population')['name'].values[0]) 408 | 409 | This solution has many of the same problems as the previous one, but now we can 410 | use Pandas to do follow up analysis. 411 | 412 | A better solution would be to use Pandas' 413 | `pandas.DataFrame.from_dict `__ 414 | or `pandas.json_normalize `__ 415 | to read the full data in: 416 | 417 | .. code-block:: python 418 | 419 | countries_df = pd.DataFrame.from_dict(countries_json) 420 | print(countries_df.nlargest(1, 'population')['name'].values[0]) 421 | 422 | countries_df = pd.json_normalize(countries_json) 423 | print(countries_df.nlargest(1, 'population')['name'].values[0]) 424 | 425 | .. admonition:: Key points 426 | 427 | - Convert your data to a format where it is easy to do analysis on it. 428 | - Check whether the tools you're using have an existing feature that can help 429 | you read the data in. 430 | 431 | 432 | Things to remember 433 | ------------------ 434 | 435 | 1. **There is no file format that is good for every use case.** 436 | 2. Usually, your research question determines which libraries you want to use 437 | to solve it. Similarly, the data format you have determines the file format you 438 | want to use. 439 | 3. However, if you're using a previously existing framework or tools or you 440 | work in a specific field, you should prioritize using the formats that are 441 | used in said framework/tools/field. 442 | 4. 
When you're starting your project, it's a good idea to take your initial 443 | data, clean it, and store the results in a good binary format that works as 444 | a starting point for your future analysis. If you've written the cleaning 445 | procedure as a script, you can always reproduce it. 446 | 5. Throughout your work, you should use code to turn important data to 447 | a human-readable format (e.g. plots, averages, 448 | :meth:`pandas.DataFrame.head`), not to keep your full data in a 449 | human-readable format. 450 | 6. Once you've finished, you should store the data in a format that can be 451 | easily shared to other people. 452 | 453 | 454 | See also 455 | -------- 456 | 457 | - `Pandas' IO tools `__ 458 | - `Tidy data comparison notebook `__ 459 | - `Array data comparison notebook `__ 460 | 461 | 462 | .. keypoints:: 463 | 464 | - Pandas can read and write a variety of data formats. 465 | - There are many good, standard formats, and you don't need to create your own. 466 | - There are plenty of other libraries dedicated to various formats. 467 | -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=content 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | sphinx_rtd_theme 3 | sphinx_rtd_theme_ext_color_contrast 4 | myst_nb 5 | sphinx-lesson 6 | https://github.com/aaltoscicomp/sphinx-aaltoscicomp-branding/archive/master.zip 7 | sphinxext-opengraph 8 | sphinx-thebe 9 | 10 | # for web-apis execution 11 | jsonlines 12 | bs4 13 | -------------------------------------------------------------------------------- /resources/code/scripts/__pycache__/optionsparser.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/__pycache__/optionsparser.cpython-38.pyc -------------------------------------------------------------------------------- /resources/code/scripts/__pycache__/weather_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AaltoSciComp/python-for-scicomp/80042c08efd912d18034a3d977c31288922b93cf/resources/code/scripts/__pycache__/weather_functions.cpython-38.pyc -------------------------------------------------------------------------------- /resources/code/scripts/__pycache__/weather_functions_config.cpython-38.pyc: -------------------------------------------------------------------------------- 
def get_parameters(config_file, required, defaults):
    '''
    Read a YAML option file and expose its settings as attributes of an object.

    Parameters:
        config_file: File name of the yaml file containing the options
        required: Dict of required argument names and their object types.
        defaults: Dict of default parameters mapping to their default values

    Returns: An object with fields named according to required and optional values.

    Raises:
        Exception: If the file does not contain a mapping, if a required
            argument is missing, or if a value has an unexpected type.
    '''
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(config_file) as f:
        options = yaml.safe_load(f)
    # safe_load returns None for an empty document; treat that as "no options
    # given" so that the checks below report what is actually missing.
    if options is None:
        options = {}
    if not isinstance(options, dict):
        raise Exception("Expected a mapping of options in " + str(config_file) + " but got " + str(type(options)))
    # create a parameters object that allows setting attributes.
    parameters = type('Options', (), {})()
    # check required arguments
    for arg in required:
        if arg not in options:
            raise Exception("Could not find required Argument " + arg + " aborting...")
        if not isinstance(options[arg], required[arg]):
            raise Exception("Expected input of type " + str(required[arg]) + " but got " + str(type(options[arg])))
        print("Setting " + arg + " to " + str(options[arg]))
        setattr(parameters, arg, options[arg])
    # check the default values.
    for arg in defaults:
        if arg in options:
            # A value from the file must have the same type as its default.
            if not isinstance(options[arg], type(defaults[arg])):
                raise Exception("Expected input of type " + str(type(defaults[arg])) + " but got " + str(type(options[arg])))
            print("Setting " + arg + " to " + str(options[arg]))
            setattr(parameters, arg, options[arg])
        else:
            print( arg + " not found in option file. Using default: " +str(defaults[arg]))
            setattr(parameters, arg, defaults[arg])
    return parameters
def preprocessing(dataset, start_date, end_date):
    """Return the rows of ``dataset`` whose 'Local time' lies in a date range.

    Parameters:
        dataset: DataFrame with a 'Local time' column of day-first dates.
        start_date: First timestamp to keep (the bound is inclusive).
        end_date: Last timestamp to keep (the bound is inclusive).

    Returns: A new DataFrame with 'Local time' converted to datetimes and
    filtered to the requested range. The DataFrame passed in is not modified.
    """
    # The date format in the file is in a day-first format, which matplotlib does not understand,
    # so we need to convert it. ``assign`` works on a copy, so the caller's
    # frame keeps its original 'Local time' column.
    dataset = dataset.assign(**{'Local time': pd.to_datetime(dataset['Local time'], dayfirst=True)})
    dataset = dataset[dataset['Local time'].between(start_date, end_date)]
    return dataset


def plot_data(dates, values, labels):
    """Plot ``values`` against ``dates`` on a new figure.

    Parameters:
        dates: x values passed to ``ax.plot``.
        values: y values passed to ``ax.plot``.
        labels: Object with ``xlabel``, ``ylabel`` and ``title`` attributes
            used to label the plot.

    Returns: The tuple ``(ax, fig)`` -- note that the axes come first.
    """
    fig, ax = plt.subplots()
    ax.plot(dates, values)
    # label the axes
    ax.set_xlabel(labels.xlabel)
    ax.set_ylabel(labels.ylabel)
    ax.set_title(labels.title)
    # adjust tick labels
    fig.autofmt_xdate()
    return ax,fig
plot 11 | start_date=pd.to_datetime('01/06/2021', dayfirst=True) 12 | end_date=pd.to_datetime('01/10/2021', dayfirst=True) 13 | #Preprocess the data 14 | weather['Local time'] = pd.to_datetime(weather['Local time'], dayfirst=True) 15 | # select the data 16 | weather = weather[weather['Local time'].between(start_date,end_date)] 17 | 18 | # Now, we have the data loaded, and adapted to our needs. So lets get plotting 19 | import matplotlib.pyplot as plt 20 | # start the figure. 21 | fig, ax = plt.subplots() 22 | ax.plot(weather['Local time'], weather['T']) 23 | # label the axes 24 | ax.set_xlabel("Date of observation") 25 | ax.set_ylabel("Temperature in Celsius") 26 | ax.set_title("Temperature Observations") 27 | # adjust the date labels, so that they look nicer 28 | fig.autofmt_xdate() 29 | # save the figure 30 | 31 | # save the figure 32 | fig.savefig('weather.png') 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /resources/code/scripts/weather_observations_argparse.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("input", type=str, help="Input data file") 6 | parser.add_argument("output", type=str, help="Output plot file") 7 | parser.add_argument("-s", "--start", default="01/01/2019", type=str, help="Start date in DD/MM/YYYY format") 8 | parser.add_argument("-e", "--end", default="16/10/2021", type=str, help="End date in DD/MM/YYYY format") 9 | 10 | args = parser.parse_args() 11 | 12 | # load the data 13 | weather = pd.read_csv(args.input,comment='#') 14 | 15 | # define the start and end time for the plot 16 | start_date=pd.to_datetime(args.start, dayfirst=True) 17 | end_date=pd.to_datetime(args.end, dayfirst=True) 18 | 19 | # preprocess the data 20 | weather['Local time'] = pd.to_datetime(weather['Local time'], dayfirst=True) 21 | # select the data 22 | weather = 
#!/usr/bin/env python
# coding: utf-8

# Plot one column of a weather observation file, with every setting (input
# file, date range, column, labels and output file) read from a YAML config.

import pandas as pd
from optionsparser import get_parameters
import argparse

# Let's start by reading our config file. We'll use argparse to get the config file name.
parser = argparse.ArgumentParser()
parser.add_argument('input', type=str,
                    help="Config File name ")
args = parser.parse_args()

# Set optional parameters with default values and required parameter values with their type
defaults = {
    "xlabel" : "Date of observation",
    "title" : "Weather Observations",
    "start" : "01/06/2021",
    "end" : "01/10/2021",
    "output" : "weather.png",
    "ylabel" : "Temperature in Celsius",
    "data_column" : "T",
}

required = {
    "input" : str
}

# now, parse the config file
parameters = get_parameters(args.input, required, defaults)

# load the data
weather = pd.read_csv(parameters.input,comment='#')

# obtain start and end date
start_date=pd.to_datetime(parameters.start, dayfirst=True)
end_date=pd.to_datetime(parameters.end, dayfirst=True)

# Data preprocessing
weather['Local time'] = pd.to_datetime(weather['Local time'], dayfirst=True)
# select the data
weather = weather[weather['Local time'].between(start_date,end_date)]

# Data plotting
import matplotlib.pyplot as plt
# start the figure.
fig, ax = plt.subplots()
# Plot the configured column rather than a hard-coded 'T', so that e.g.
# ``data_column: RRR`` in the config really plots that column.
ax.plot(weather['Local time'], weather[parameters.data_column])
# label the axes with the configured texts (defaults apply when not given)
ax.set_xlabel(parameters.xlabel)
ax.set_ylabel(parameters.ylabel)
ax.set_title(parameters.title)
# adjust the date labels, so that they look nicer
fig.autofmt_xdate()


# save the figure
fig.savefig(parameters.output)
9 | -------------------------------------------------------------------------------- /resources/data/plotting/exercise-2.csv: -------------------------------------------------------------------------------- 1 | xval,yval 2 | 01,7.7 3 | 02,6.6 4 | 03,4.5 5 | 04,9.8 6 | 05,17.7 7 | 06,25.4 8 | 07,26.7 9 | 08,25.1 10 | 09,19.3 11 | 10,9.8 12 | -------------------------------------------------------------------------------- /resources/data/plotting/oslo-monthly.csv: -------------------------------------------------------------------------------- 1 | name,station,date,max temperature,precipitation,min temperature 2 | Oslo - Blindern,SN18700,10.2022,17.1,82.9,-0.4 3 | Oslo - Blindern,SN18700,11.2022,15.1,83.4,-2.1 4 | Oslo - Blindern,SN18700,12.2022,6.5,85.5,-14.6 5 | Oslo - Blindern,SN18700,01.2023,7.2,100.5,-13.4 6 | Oslo - Blindern,SN18700,02.2023,10.2,46,-9.4 7 | Oslo - Blindern,SN18700,03.2023,9.8,72.6,-12.6 8 | Oslo - Blindern,SN18700,04.2023,19.8,99.7,-4.7 9 | Oslo - Blindern,SN18700,05.2023,24.2,17,-0.8 10 | Oslo - Blindern,SN18700,06.2023,31.8,39.9,4.6 11 | Oslo - Blindern,SN18700,07.2023,28.4,146.9,8.6 12 | Oslo - Blindern,SN18700,08.2023,24.5,259.8,9.8 13 | Oslo - Blindern,SN18700,09.2023,25.1,105.8,5.3 14 | Oslo - Blindern,SN18700,10.2023,17.1,7.3,-0.7 15 | -------------------------------------------------------------------------------- /resources/data/plotting/tromso-monthly.csv: -------------------------------------------------------------------------------- 1 | name,station,date,max temperature,precipitation,min temperature 2 | Tromso - Langnes,SN90490,10.2022,10.7,187,-4.2 3 | Tromso - Langnes,SN90490,11.2022,8.5,41.5,-7 4 | Tromso - Langnes,SN90490,12.2022,5.6,88.8,-11.7 5 | Tromso - Langnes,SN90490,01.2023,7.7,111.4,-13.9 6 | Tromso - Langnes,SN90490,02.2023,6.6,171.3,-10.7 7 | Tromso - Langnes,SN90490,03.2023,4.5,157,-15.1 8 | Tromso - Langnes,SN90490,04.2023,9.8,85,-7.1 9 | Tromso - Langnes,SN90490,05.2023,17.7,101.2,-4.6 10 | Tromso - 
Langnes,SN90490,06.2023,25.4,43.4,-0.4 11 | Tromso - Langnes,SN90490,07.2023,26.7,14,6 12 | Tromso - Langnes,SN90490,08.2023,25.1,43.4,5.4 13 | Tromso - Langnes,SN90490,09.2023,19.3,163.7,0.3 14 | Tromso - Langnes,SN90490,10.2023,9.8,64.8,-0.6 15 | -------------------------------------------------------------------------------- /software/environment.yml: -------------------------------------------------------------------------------- 1 | name: python-for-scicomp 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - jsonlines 6 | - jupyterlab 7 | - notebook 8 | - ipywidgets 9 | - requests 10 | - numpy 11 | - scipy 12 | - matplotlib 13 | - seaborn 14 | - mpi4py 15 | - dask 16 | - setuptools 17 | - twine 18 | - poetry 19 | - flit 20 | - scikit-learn 21 | - scalene 22 | - ruff 23 | - altair-all 24 | - vega_datasets 25 | - xarray 26 | - netcdf4 27 | - yfinance 28 | - pip 29 | - pip: 30 | - pythia_datasets 31 | --------------------------------------------------------------------------------