├── .static ├── .devcontainer ├── noop.txt ├── devcontainer.json └── Dockerfile ├── Makefile ├── requirements.txt ├── .buildpacks ├── app.json ├── docs ├── machine-learning │ ├── comp-bayes-figures │ │ ├── mcmc-trace.png │ │ └── mcmc-trace-burn-in.png │ ├── markov-models-figures │ │ ├── output_20_0.png │ │ ├── output_28_0.png │ │ ├── output_33_0.png │ │ ├── output_37_0.png │ │ ├── output_41_0.png │ │ ├── output_43_0.png │ │ ├── output_45_0.png │ │ ├── output_46_0.png │ │ ├── output_50_0.png │ │ ├── output_52_0.png │ │ ├── output_56_0.png │ │ ├── output_95_0.png │ │ ├── markov-models.zip │ │ ├── output_101_0.png │ │ ├── output_102_0.png │ │ ├── output_103_0.png │ │ ├── output_106_0.png │ │ ├── output_112_0.png │ │ ├── output_113_0.png │ │ ├── output_115_0.png │ │ ├── output_116_0.png │ │ ├── output_118_0.png │ │ ├── 02-gaussian-emissions.png │ │ ├── 01-markov-chain-example.png │ │ └── 03-autoregressive-emissions.png │ ├── message-passing-figures │ │ ├── figure-msg-passing-water.pdf │ │ ├── figure-msg-passing-water.png │ │ ├── figure-message-passing-sparse.pdf │ │ ├── figure-message-passing-sparse.png │ │ ├── figure-message-passing-batched.pdf │ │ ├── figure-message-passing-batched.png │ │ ├── figure-message-passing-graph-size.pdf │ │ ├── figure-message-passing-graph-size.png │ │ ├── figure-msg-passing-carbon-methane.pdf │ │ └── figure-msg-passing-carbon-methane.png │ ├── reimplementing-models.md │ ├── message-passing.md │ └── computational-bayesian-stats.md ├── supporters.md ├── index.md ├── software-skills │ ├── index.md │ ├── code-formatting.md │ ├── documentation.md │ ├── refactoring.md │ ├── testing.md │ └── environment-variables.md ├── computing │ └── recursion.md ├── terminal │ ├── cli-tools.md │ └── pre-commits.md ├── newsletter │ └── 2020 │ │ ├── 05-may.md │ │ ├── 09-september.md │ │ ├── 08-august.md │ │ ├── 07-july.md │ │ └── 06-june.md ├── workflow │ ├── code-review.md │ ├── effective-commit-messages.md │ └── gitflow.md └── miscellaneous │ ├── learning-to-learn.md │ └── dashboarding-landscape.md ├── environment.yml ├── README.md ├── .pre-commit-config.yaml ├── .travis.yml ├── draft-ideas.md ├── mkdocs.yml └── .gitignore /.static: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.devcontainer/noop.txt: -------------------------------------------------------------------------------- 1 | Created just in case the environment.yml build step fails. 
2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | serve: 2 | mkdocs serve -a 0.0.0.0:8001 3 | 4 | deploy: 5 | mkdocs gh-deploy 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs-minify-plugin 2 | pymdown-extensions 3 | mknotebooks 4 | mkdocs 5 | mkdocs-material 6 | -------------------------------------------------------------------------------- /.buildpacks: -------------------------------------------------------------------------------- 1 | https://github.com/heroku/heroku-buildpack-python.git 2 | https://github.com/dokku/buildpack-nginx.git 3 | -------------------------------------------------------------------------------- /app.json: -------------------------------------------------------------------------------- 1 | { 2 | "scripts": { 3 | "dokku": { 4 | "predeploy": "cd /app/www && mkdocs build" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /docs/machine-learning/comp-bayes-figures/mcmc-trace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/comp-bayes-figures/mcmc-trace.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_20_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_20_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_28_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_28_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_33_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_33_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_37_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_37_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_41_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_41_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_43_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_43_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_45_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_45_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_46_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_46_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_50_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_50_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_52_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_52_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_56_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_56_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_95_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_95_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/markov-models.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/markov-models.zip -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_101_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_101_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_102_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_102_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_103_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_103_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_106_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_106_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_112_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_112_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_113_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_113_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_115_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_115_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_116_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_116_0.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/output_118_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/output_118_0.png -------------------------------------------------------------------------------- /docs/machine-learning/comp-bayes-figures/mcmc-trace-burn-in.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/comp-bayes-figures/mcmc-trace-burn-in.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/02-gaussian-emissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/02-gaussian-emissions.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/01-markov-chain-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/01-markov-chain-example.png 
-------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-msg-passing-water.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-msg-passing-water.pdf -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-msg-passing-water.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-msg-passing-water.png -------------------------------------------------------------------------------- /docs/machine-learning/markov-models-figures/03-autoregressive-emissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/markov-models-figures/03-autoregressive-emissions.png -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-message-passing-sparse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-message-passing-sparse.pdf -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-message-passing-sparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-message-passing-sparse.png -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-message-passing-batched.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-message-passing-batched.pdf -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-message-passing-batched.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-message-passing-batched.png -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-message-passing-graph-size.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-message-passing-graph-size.pdf -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-message-passing-graph-size.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-message-passing-graph-size.png -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-msg-passing-carbon-methane.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-msg-passing-carbon-methane.pdf -------------------------------------------------------------------------------- /docs/machine-learning/message-passing-figures/figure-msg-passing-carbon-methane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/essays-on-data-science/master/docs/machine-learning/message-passing-figures/figure-msg-passing-carbon-methane.png -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: essays-on-data-science 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - pygments=2.4.2 7 | - pre-commit=1.20.0 8 | - pip 9 | - pip: 10 | - mkdocs-minify-plugin 11 | - pymdown-extensions 12 | - mknotebooks 13 | - mkdocs 14 | - mkdocs-material 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Essays on Data Science 2 | 3 | In which I put together my thoughts on the practice of data science. 4 | 5 | This is a curated and edited collection of my blog posts. 6 | 7 | Other links: 8 | 9 | - Development Docker container: https://hub.docker.com/repository/docker/ericmjl/essays-on-data-science 10 | -------------------------------------------------------------------------------- /docs/supporters.md: -------------------------------------------------------------------------------- 1 | # A big thank you... 2 | 3 | ...to my Patreon supporters! 4 | 5 | 1. Eddie Janowicz 6 | 1. Carol Willing 7 | 1. Hector Munoz 8 | 1. Mridul Seth 9 | 1. Kapil Jain 10 | 1. Brian Gue 11 | 1. Brice Paris 12 | 13 | Your support keeps me caffeinated, 14 | so I can continue to make educational material 15 | for the data science and Python communities! 16 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.4.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-added-large-files 10 | # - repo: https://github.com/jumanjihouse/pre-commit-hooks 11 | # rev: 1.11.0 12 | # hooks: 13 | # - id: markdownlint 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | install: 4 | # We do this conditionally because it saves us some downloading if the 5 | # version is the same. 
6 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 7 | - bash miniconda.sh -b -p $HOME/miniconda 8 | - export PATH="$HOME/miniconda/bin:$PATH" 9 | - hash -r 10 | - conda config --set always_yes yes --set changeps1 no 11 | - conda update -q conda 12 | - conda config --add channels conda-forge 13 | 14 | # Useful for debugging any issues with conda 15 | - conda info -a 16 | 17 | # Install Python, py.test, and required packages. 18 | - conda env create -f environment.yml 19 | - source activate essays-on-data-science 20 | 21 | script: true 22 | 23 | before_deploy: 24 | - mkdocs build --verbose --clean --strict 25 | 26 | deploy: 27 | provider: pages 28 | skip_cleanup: true 29 | github_token: $github_token 30 | local_dir: site 31 | on: 32 | branch: master 33 | 34 | notifications: 35 | email: true 36 | -------------------------------------------------------------------------------- /draft-ideas.md: -------------------------------------------------------------------------------- 1 | # Ideas for further essays 2 | 3 | ## Computing 4 | 5 | ### Reasoning about algorithmic complexity 6 | 7 | Basically covering 8 | "how to read your Python code 9 | to reason about algorithmic complexity". 10 | I would like to reference Ned Batchelder's talk and blog post 11 | on this topic: 12 | https://nedbatchelder.com/text/bigo.html 13 | 14 | ### Practical selection of appropriate data structures 15 | 16 | Basically covering 17 | where and when to use: 18 | 19 | - lists: sequences of identically-typed objects 20 | - dictionaries: simple mappings from hashable type to arbitrary objects 21 | - (named) tuples: more complex mappings 22 | - objects: when more complexity is needed 23 | 24 | ### Functional and object-oriented programming 25 | 26 | Basically covering the tell-tale signs 27 | of when functional programming may be a better choice, 28 | and when object-oriented programming may be a better choice, 29 | and what to do when the choice isn't crystal clear. 30 | (In my biased opinion, stick with functional programming as much as possible.) 31 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Essays on Data Science 2 | 3 | In which I put together my thoughts on the practice of data science. 4 | 5 | This is a curated and edited collection of my blog posts, 6 | as well as essays specially written for the broader Python community. 7 | 8 | ## Support this project 9 | 10 | If you find this collection of essays useful, 11 | please [star the repository on GitHub](https://github.com/ericmjl/essays-on-data-science)! 12 | 13 | If you enjoyed this essay collection and would like to receive early-bird access to more, 14 | [please support me on Patreon][patreon]! 15 | A coffee a month sent my way gets you _early_ access to my essays 16 | on a private URL exclusively for my supporters 17 | as well as shoutouts on every single essay that I put out. 18 | 19 | [patreon]: https://patreon.com/ericmjl 20 | 21 | Also, I have a free monthly newsletter that I use as an outlet 22 | to share programming-oriented data science tips and tools. 23 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 
24 | 25 | [tinyletter]: https://tinyletter.com/ericmjl 26 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.117.1/containers/python-3-miniconda 3 | { 4 | "name": "Essays on Data Science", 5 | "context": "..", 6 | "image": "registry.hub.docker.com/ericmjl/essays-on-data-science:latest", 7 | // Set *default* container specific settings.json values on container create. 8 | "settings": { 9 | "terminal.integrated.shell.linux": "/bin/bash", 10 | "python.pythonPath": "/opt/conda/bin/python", 11 | "python.linting.enabled": true, 12 | "python.linting.pylintEnabled": true, 13 | "python.linting.pylintPath": "/opt/conda/bin/pylint" 14 | }, 15 | // Add the IDs of extensions you want installed when the container is created. 16 | "extensions": [ 17 | "ms-python.python" 18 | ], 19 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 20 | "forwardPorts": [ 21 | 8000, 22 | 8001, 23 | 8002, 24 | 8003, 25 | 8004, 26 | 8005 27 | ], 28 | // Use 'postCreateCommand' to run a command in a shell after the container is created. 29 | "postCreateCommand": "pre-commit install" 30 | 31 | 32 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 33 | // "remoteUser": "vscode" 34 | } 35 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Essays on Data Science 2 | theme: 3 | name: 'material' 4 | palette: 5 | primary: 'light blue' 6 | accent: 'light blue' 7 | icon: 8 | logo: 'library_books' 9 | features: 10 | - tabs 11 | 12 | plugins: 13 | - search: 14 | separator: '[\s\-\.]+' 15 | lang: 16 | - en 17 | - mknotebooks 18 | 19 | # Taken from here: https://squidfunk.github.io/mkdocs-material/extensions/codehilite/ 20 | markdown_extensions: 21 | - codehilite 22 | - pymdownx.arithmatex 23 | - pymdownx.details 24 | - pymdownx.superfences: 25 | custom_fences: 26 | - name: mermaid 27 | class: mermaid 28 | format: !!python/name:pymdownx.superfences.fence_div_format 29 | - markdown.extensions.footnotes 30 | 31 | extra_css: 32 | - https://unpkg.com/mermaid@7.1.2/dist/mermaid.css 33 | 34 | extra_javascript: 35 | - https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML 36 | - https://unpkg.com/mermaid@7.1.2/dist/mermaid.min.js 37 | 38 | google_analytics: 39 | - 'UA-12498603-3' 40 | - 'auto' 41 | 42 | repo_name: 'ericmjl/essays-on-data-science' 43 | repo_url: 'https://github.com/ericmjl/essays-on-data-science' 44 | 45 | extra: 46 | social: 47 | - icon: 'material/email' 48 | link: 'http://www.shortwhale.com/ericmjl' 49 | - icon: 'fontawesome/brands/github' 50 | link: 'https://github.com/ericmjl' 51 | - icon: 'fontawesome/brands/twitter' 52 | link: 'https://twitter.com/ericmjl' 53 | - icon: 'fontawesome/brands/linkedin' 54 | link: 'https://linkedin.com/in/ericmjl' 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 |
9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .vscode/settings.json 106 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 4 | #------------------------------------------------------------------------------------------------------------- 5 | 6 | FROM continuumio/miniconda3 7 | 8 | # Avoid warnings by switching to noninteractive 9 | ENV DEBIAN_FRONTEND=noninteractive 10 | 11 | # This Dockerfile adds a non-root user with sudo access. Use the "remoteUser" 12 | # property in devcontainer.json to use it. On Linux, the container user's GID/UIDs 13 | # will be updated to match your local UID/GID (when using the dockerFile property). 14 | # See https://aka.ms/vscode-remote/containers/non-root-user for details. 15 | ARG USERNAME=vscode 16 | ARG USER_UID=1000 17 | ARG USER_GID=$USER_UID 18 | 19 | # Copy environment.yml (if found) to a temp location so we can update the environment. Also 20 | # copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists.
21 | COPY environment.yml* .devcontainer/noop.txt /tmp/conda-tmp/ 22 | 23 | # Configure apt and install packages 24 | RUN apt-get update \ 25 | && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ 26 | # 27 | # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed 28 | && apt-get -y install git openssh-client less iproute2 procps lsb-release \ 29 | # 30 | # Install pylint 31 | && /opt/conda/bin/pip install pylint \ 32 | # 33 | # Update Python environment based on environment.yml (if present) 34 | && if [ -f "/tmp/conda-tmp/environment.yml" ]; then /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \ 35 | && rm -rf /tmp/conda-tmp \ 36 | # 37 | # Create a non-root user to use if preferred - see https://aka.ms/vscode-remote/containers/non-root-user. 38 | && groupadd --gid $USER_GID $USERNAME \ 39 | && useradd -s /bin/bash --uid $USER_UID --gid $USER_GID -m $USERNAME \ 40 | # [Optional] Add sudo support for the non-root user 41 | && apt-get install -y sudo nano emacs vim \ 42 | && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ 43 | && chmod 0440 /etc/sudoers.d/$USERNAME \ 44 | # 45 | # Clean up 46 | && apt-get autoremove -y \ 47 | && apt-get clean -y \ 48 | && rm -rf /var/lib/apt/lists/* 49 | 50 | # Switch back to dialog for any ad-hoc use of apt-get 51 | ENV DEBIAN_FRONTEND=dialog 52 | -------------------------------------------------------------------------------- /docs/software-skills/index.md: -------------------------------------------------------------------------------- 1 | # Software Skills 2 | 3 | Because our day-to-day involves writing code, 4 | I am convinced that we data scientists 5 | need to be equipped with basic software engineering skills. 6 | Being equipped with these skills 7 | will help us write code that is, in the long run, 8 | easy to recap, remember, reference, review, and rewrite. 9 | 10 | In this collection of short essays, 11 | I will highlight the basic software skills that, 12 | if we master them, 13 | will increase our efficiency and effectiveness in the long run. 14 | 15 | ## Common Objections 16 | 17 | If you have heard these suggestions before, 18 | then you might have also heard some of the common objections 19 | to learning these software practices. 20 | I wish to address them here in bulk, 21 | so I do not have to address them in-depth in the individual essays. 22 | 23 | ### I don't have enough time 24 | 25 | This objection is one I am sympathetic to, 26 | as I operate under time constraints myself. 27 | 28 | This is the nature of code: written once, used many times. 29 | Hence, the best response that I can give is that the time saved by cutting corners now 30 | is repaid as multiples of others' (including your future self's) time wasted 31 | navigating an undocumented, poorly structured, 32 | spaghetti-code codebase. 33 | Cutting out these software practices now 34 | makes the code much more difficult to maintain and improve 35 | when it goes into production. 36 | 37 | ### My code is only going to be written and read by myself 38 | 39 | At some point, though, there is a high probability 40 | that you will end up writing code 41 | that someone else has to read and use. 42 | The time invested in making the code read well _now_, 43 | even on code that does not have to be read by others, 44 | will reduce the learning curve pain 45 | when you eventually do have to write code for others.
46 | You might as well invest the time to practice your software skills now, 47 | while there's less formal scrutiny. 48 | When the stakes are higher, being ready can only be helpful. 49 | 50 | ### I don't know how to get started; there are so many places to begin 51 | 52 | Pick any one skill, say, refactoring, and work on it first. 53 | You can always add more skills to your toolkit as you go along. 54 | 55 | ## Thank you for reading! 56 | 57 | If you enjoyed this essay and would like to receive early-bird access to more, 58 | [please support me on Patreon][patreon]! 59 | A coffee a month sent my way gets you _early_ access to my essays 60 | on a private URL exclusively for my supporters 61 | as well as shoutouts on every single essay that I put out. 62 | 63 | [patreon]: https://patreon.com/ericmjl 64 | 65 | Also, I have a free monthly newsletter that I use as an outlet 66 | to share programming-oriented data science tips and tools. 67 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 68 | 69 | [tinyletter]: https://tinyletter.com/ericmjl 70 | -------------------------------------------------------------------------------- /docs/computing/recursion.md: -------------------------------------------------------------------------------- 1 | # Recursion 2 | 3 | Recursion is an incredibly useful concept to know. 4 | To be clear, it is distinct from looping, but is related. 5 | I think it's helpful for data scientists 6 | to have recursion as a programming trick in their back pocket. 7 | In this essay, let's take an introductory look at recursion, 8 | and where it can come in handy. 9 | 10 | ## What recursion looks like 11 | 12 | Recursion happens when we have a function 13 | that calls itself, 14 | returning a result only once some stopping criterion is reached. 15 | 16 | A classic example of recursion 17 | is in finding the root of a tree from a given node. 18 | Here, we essentially want to follow every node's predecessor 19 | until we reach a node that has no predecessor. 20 | 21 | In code form, this looks something like this: 22 | 23 | ```python linenums="1" 24 | def find_root(G, n): 25 | predecessor = G.predecessor(n) 26 | if predecessor: 27 | return find_root(G, predecessor) 28 | else: 29 | return n 30 | ``` 31 | 32 | Generally, we first compute something on the basis of the inputs (line 2). 33 | This is usually some form of finding a new substitute input 34 | on which we can check a condition (line 3). 35 | Under one condition, we return the function call with a new input (line 4), 36 | and under the other condition, we return the desired output (line 6). 37 | 38 | ## Why you would use recursion 39 | 40 | Recursion is essentially a neat way to write a loop concisely, 41 | and can be useful, say, 42 | under circumstances where we do not know 43 | the exact number of loop iterations needed 44 | before we encounter the stopping condition. 45 | 46 | While I do find recursion useful in certain applied settings, 47 | I will also clarify that I don't use recursion on a daily basis. 48 | As such, I recommend this as a back-pocket trick that one should have, 49 | but won't necessarily use all the time. 50 | 51 | ## Where recursion shows up in a real-life situation 52 | 53 | I can speak to one situation at work 54 | where I was benchmarking some deep neural network models, 55 | and also testing hyperparameters on a grid.
56 | There, I used YAML files to keep track of parameters and experiments, 57 | and to keep things concise, 58 | I implemented a very lightweight YAML inheritance scheme: 59 | I would have a master "template" experiment, 60 | plus child YAML files that inherited from the "master" template 61 | and changed certain parts of the experiment parameters. 62 | (An example might be one where the master template 63 | specified the use of the Adam optimizer with a particular learning rate, 64 | while the child templates simply modified the learning rate.) 65 | 66 | As the experiments got deeper and varied more parameters, 67 | things became more tree-like, 68 | and so I had to navigate the parameter tree from the child templates 69 | up to the root template, which by definition had no parents. 70 | After finding the root template, 71 | I could then travel back down from the root template, 72 | iteratively updating the parameters 73 | until I reached the child template of interest. 74 | 75 | The more general scenario to look out for is graph traversal problems. 76 | If your problem can be cast in terms of a graph data structure 77 | that you need to program your computer to take a walk over, 78 | then that is a prime candidate for trying your hand at recursion. 79 | 80 | ## Thank you for reading! 81 | 82 | If you enjoyed this essay and would like to receive early-bird access to more, 83 | [please support me on Patreon][patreon]! 84 | A coffee a month sent my way gets you _early_ access to my essays 85 | on a private URL exclusively for my supporters 86 | as well as shoutouts on every single essay that I put out. 87 | 88 | [patreon]: https://patreon.com/ericmjl 89 | 90 | Also, I have a free monthly newsletter that I use as an outlet 91 | to share programming-oriented data science tips and tools. 92 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 93 | 94 | [tinyletter]: https://tinyletter.com/ericmjl 95 | -------------------------------------------------------------------------------- /docs/terminal/cli-tools.md: -------------------------------------------------------------------------------- 1 | # Tools and Upgrades for your CLI 2 | 3 | In this short essay, 4 | I would like to introduce you 5 | to a list of awesome command-line tools 6 | that I have found on the internet. 7 | 8 | Most of the tools listed here do one thing really well: 9 | they add visual clarity to the text that we are looking at. 10 | This is mostly done by colorizing the terminal 11 | with syntax highlighting. 12 | 13 | Without further ado, let's get to the list. 14 | 15 | ## `exa` 16 | 17 | [`exa`](https://the.exa.website/) is a favourite of mine, 18 | because it is an almost drop-in replacement for `ls`, 19 | except with saner defaults. 20 | It also comes with a saner set of defaults for the `tree` command. 21 | 22 | After installing, you can replace `ls` and `tree` with `exa` by aliasing: 23 | 24 | ```bash 25 | alias ls='exa --long --git -a --header --group' 26 | alias tree='exa --tree --level=2 --long -a --header --git' 27 | ``` 28 | 29 | ## `tmux` 30 | 31 | [`tmux`](https://github.com/tmux/tmux) is another daily driver of mine. 32 | I use it to keep remote terminal sessions persistent, 33 | and use it effectively as a workspace manager between projects.
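To make this concrete, here is a minimal sketch of the session-management workflow I have in mind (the session name is purely illustrative):

```bash
# Start a new named session for a project.
tmux new -s project-a

# Detach with Ctrl-b d; the session keeps running in the background,
# surviving dropped SSH connections.

# Reattach later, picking up exactly where you left off:
tmux attach -t project-a

# List all running sessions, to hop between project workspaces:
tmux ls
```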
34 | 35 | ## `nanorc` 36 | 37 | If you're like me, and are accustomed to the `nano` text editor 38 | rather than `vim` or `emacs`, 39 | then [`nanorc`](https://github.com/scopatz/nanorc), 40 | a set of syntax highlighting configurations 41 | provided by [Anthony Scopatz](https://twitter.com/scopatz), 42 | is an awesome addition to your `nano` toolkit. 43 | 44 | (For what it's worth, I wrote this short essay in `nano`, 45 | and `nanorc` played no small role in making the text readable!) 46 | 47 | ## `diff-so-fancy` 48 | 49 | [`diff-so-fancy`](https://github.com/so-fancy/diff-so-fancy) 50 | is a drop-in replacement for `diff`, 51 | and makes it so much easier to read diffs between two files. 52 | 53 | After installation, you can easily replace `diff` with `diff-so-fancy` through aliasing: 54 | 55 | ```bash 56 | alias diff="diff-so-fancy" 57 | ``` 58 | 59 | ## `bat` 60 | 61 | [`bat`](https://github.com/sharkdp/bat) is another one of those instant favourites. 62 | I use `cat` and `less` often to look through files, 63 | but `bat` takes things to another level. 64 | It is basically a mash-up between `cat` and `less`, 65 | allowing you to scroll through your files in a `less`-like fashion, 66 | while also providing syntax highlighting for the files you open. 67 | 68 | At the same time, it'll let you concatenate two files together (just like `cat`) 69 | and display them to the screen. 70 | 71 | After installing, you can replace `cat` with `bat` by aliasing as well: 72 | 73 | ```bash 74 | alias cat="bat" 75 | ``` 76 | 77 | ## `fd` 78 | 79 | [`fd`](https://github.com/sharkdp/fd) 80 | is another tool that provides saner syntax than the default `find`. 81 | 82 | After installing, you can replace `find` with `fd` by aliasing: 83 | 84 | ```bash 85 | alias find="fd" 86 | ``` 87 | 88 | ## `ripgrep` 89 | 90 | [`ripgrep`](https://github.com/BurntSushi/ripgrep) 91 | is a tool that will let you search directories recursively for a particular pattern. 92 | This can help you quickly find text inside files anywhere in the file tree. 93 | 94 | ## References 95 | 96 | [Vim From Scratch](https://www.vimfromscratch.com/articles/awesome-command-line-tools/) 97 | introduced many of the tools shown here, 98 | and I want to make sure that the author gets credit 99 | for finding and sharing these awesome tools! 100 | 101 | [James Weis](https://www.linkedin.com/in/jameswweis/) introduced me to `tmux` 102 | while in grad school, and I've been hooked ever since. 103 | 104 | ## Thank you for reading! 105 | 106 | If you enjoyed this essay and would like to receive early-bird access to more, 107 | [please support me on Patreon][patreon]! 108 | A coffee a month sent my way gets you _early_ access to my essays 109 | on a private URL exclusively for my supporters 110 | as well as shoutouts on every single essay that I put out. 111 | 112 | [patreon]: https://patreon.com/ericmjl 113 | 114 | Also, I have a free monthly newsletter that I use as an outlet 115 | to share programming-oriented data science tips and tools. 116 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 117 | 118 | [tinyletter]: https://tinyletter.com/ericmjl 119 | -------------------------------------------------------------------------------- /docs/software-skills/code-formatting.md: -------------------------------------------------------------------------------- 1 | # Formatting your code 2 | 3 | One key insight from the Python programming language 4 | is that code is read more often than it is written.
5 | Hence, writing code in a fashion that makes it easy to read 6 | is something that can only be beneficial. 7 | 8 | But formatting code is a nit-picky and tedious matter, isn't it? 9 | Moreover, code style is one of those things that isn't substantive enough 10 | to be worth a flame war over. 11 | It really is one of those things we should just get over with, right? 12 | 13 | Yes, and it is possible to be "just over and done with it" 14 | if we use automation tools to help us take care of code formatting 15 | so that we don't have to think about it. 16 | 17 | ## Introducing `black` 18 | 19 | `black` is an opinionated code formatter for the Python programming language. 20 | It comes with sane defaults, 21 | and produces consistently formatted code with a single command at the terminal. 22 | 23 | ### Installing `black` 24 | 25 | To install it, we can either use `pip` or `conda`: 26 | 27 | ```bash 28 | # for pip users 29 | pip install black 30 | # for conda users 31 | conda install black 32 | ``` 33 | 34 | ### Using `black` 35 | 36 | We can run `black` directly at the command line in our project directory, 37 | with configuration options passed at the command line for convenience. 38 | 39 | ```bash 40 | # Format all .py files within and underneath current working directory. 41 | black -l 79 . 42 | ``` 43 | 44 | ## Introducing `isort` 45 | 46 | `isort` is a package for sorting your imports in a source `.py` file. 47 | Once again, this is the sort of thing 48 | you definitely don't want to do by hand. 49 | 50 | ### Installing `isort` 51 | 52 | `isort` is also conda- and pip-installable. 53 | 54 | ```bash 55 | # pip users 56 | pip install isort 57 | # conda users 58 | conda install isort 59 | ``` 60 | 61 | ### Using `isort` 62 | 63 | Just like with `black`, we can use `isort` to automagically sort our imports. 64 | As an example, we will call it at the command line with certain options enabled. 65 | 66 | ```bash 67 | # -r: recurses down below the current working directory. 68 | # -y: automatically overwrite original source file with sorted imports. 69 | isort -r -y . 70 | ``` 71 | 72 | ## Building automation for code formatting 73 | 74 | Automatically executing automagic commands is pretty awesome. 75 | Let's see how we can enable this. 76 | 77 | ### Makefiles 78 | 79 | I also place `black` as part of a series of code style checking commands 80 | in a Makefile, so that we can run all of those commands together. 81 | 82 | ```makefile 83 | format: 84 | isort -r -y . 85 | black -l 79 . 86 | ``` 87 | 88 | With that Makefile command, 89 | we can now execute all code formatting commands with a single call. 90 | 91 | _Side note:_ I usually run `isort` first 92 | because `black` may detect `isort`-ed code as not properly formatted; 93 | hence, I defer to `black` to make the final changes. 94 | 95 | ### Pre-commit hooks 96 | 97 | We can also use pre-commit hooks to catch improperly formatted code 98 | and run the code formatters over it, 99 | blocking the commit if any reformatting has to take place. 100 | This ensures that we never commit code that is incorrectly formatted. 101 | 102 | Getting set up with pre-commit hooks is another topic, 103 | but there are already great resources online 104 | that show how to get set up. 105 | 106 | ## Concluding words 107 | 108 | I hope this short essay gives you an overview 109 | of the tools that you can use to format your code automatically. 110 | Code formatting is important for readability, 111 | but isn't worth the tedium.
112 | Letting automation save your time is the wise thing to do. 113 | 114 | ## Thank you for reading! 115 | 116 | If you enjoyed this essay and would like to receive early-bird access to more, 117 | [please support me on Patreon][patreon]! 118 | A coffee a month sent my way gets you _early_ access to my essays 119 | on a private URL exclusively for my supporters 120 | as well as shoutouts on every single essay that I put out. 121 | 122 | [patreon]: https://patreon.com/ericmjl 123 | 124 | Also, I have a free monthly newsletter that I use as an outlet 125 | to share programming-oriented data science tips and tools. 126 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 127 | 128 | [tinyletter]: https://tinyletter.com/ericmjl 129 | -------------------------------------------------------------------------------- /docs/newsletter/2020/05-may.md: -------------------------------------------------------------------------------- 1 | # Data Science Programming May 2020 Newsletter 2 | 3 | Hello fellow datanistas! 4 | 5 | Here’s the May 2020 edition of my newsletter. 6 | I’m trying out a slightly different format; 7 | as always, though, I hope you find it useful. 8 | If you have feedback, [do send it my way][shortmail]! 9 | 10 | [shortmail]: http://shortmail.ericmjl.com/ 11 | 12 | ## Multi-Task Learning in the Wild 13 | 14 | I recently watched Andrej Karpathy’s [talk on multi-task learning][karpathy], and I learned a ton. 15 | When you’re faced with hardware constraints, 16 | how do you tweak your ML model to get better at more tasks? 17 | Check out his talk to learn more. 18 | 19 | [karpathy]: https://slideslive.com/38917690/multitask-learning-in-the-wilderness 20 | 21 | ## Gaussian processes explained quite simply 22 | 23 | I’ve been a fan of Gaussian Processes for non-linear predictive modeling tasks, 24 | especially when writing a neural network feels like too much 25 | for the small-ish data that I have. 26 | Learning about them wasn’t easy though. 27 | That said, there’s a wonderful blog post from Yuge Shi, a PhD student at Oxford, explaining GPs in a pretty intuitive fashion. 28 | She put in enough pictures to accompany the math that you should find it [an enjoyable read][gp]! 29 | 30 | [gp]: https://yugeten.github.io/posts/2019/09/GP/ 31 | 32 | ## Preview of an exciting development using Dask 33 | 34 | If you’re a Dask user, this next video preview is going to be music to your ears. 35 | Matthew Rocklin, 36 | lead developer of Dask and founder of Coiled Computing 37 | (which is providing support for Dask and more) 38 | just showed us how to spin up Dask clusters in the cloud from a laptop, 39 | getting us to interactive-scale compute. 40 | This is the dream: 41 | burstable, interactive-time, portable, large-scale computing 42 | from my laptop to the cloud with minimal config! 43 | Check out [the screencast][dask] he made for a preview! 44 | 45 | [dask]: https://www.youtube.com/watch?v=qaJcAvhgLy4 46 | 47 | ## SciPy 2020 48 | 49 | Virtual SciPy 2020’s schedule has been released! 50 | The conference has been a wonderful place 51 | to learn about the latest in data science and scientific computing tooling. 52 | Come check out the schedule [here][scipy]. 53 | I will be there presenting a tutorial on Bayesian statistics; 54 | hope to see you there! 55 | 56 | [scipy]: https://www.scipy2020.scipy.org/schedule 57 | 58 | ## Faster pandas applies with swifter 59 | 60 | While seeking out faster ways to do `pandas` applies, 61 | I learned about a new tool, called [`swifter`][swifter]!
62 | It automatically finds the fastest way to apply a pandas function. 63 | It fits very nicely into the paradigm 64 | of “do one thing and only one thing well”. 65 | Check out [the GitHub repository][swifter] 66 | and let me know what you think of it! 67 | I will be experimenting with it on `pyjanitor` 68 | to see whether it does a better job 69 | of speeding up some of the functions in there. 70 | 71 | [swifter]: http://github.com/jmcarpenter2/swifter 72 | 73 | ## From my collection 74 | 75 | ### Sane path management in your project directory 76 | 77 | I recently wrote [a little post][blog] 78 | about how we can use Python’s pathlib to make file paths 79 | a little more sane in our projects. 80 | [`pyprojroot`][pyprojroot], the tool I feature in the post, 81 | was developed by one of my Python conference doppelgangers, [Daniel Chen][danchen], 82 | who has this wonderfully ironic habit of doing book giveaways and signings 83 | of his Pandas book at R conferences :). 84 | 85 | [blog]: https://ericmjl.github.io/blog/2020/4/21/use-pyprojroot-and-pythons-pathlib-to-manage-your-data-paths/ 86 | 87 | [pyprojroot]: https://github.com/chendaniely/pyprojroot/ 88 | 89 | [danchen]: https://chendaniely.github.io/ 90 | 91 | ### Updates to our network analysis tutorial! 92 | 93 | Finally, with my collaborator [Mridul Seth][mridul] 94 | (who runs the GSoC program with NumFOCUS), 95 | we’ve been updating our Network Analysis Made Simple repository! 96 | My Patreon supporters will get early access to 97 | the tutorial repository before its public launch later in the year, 98 | so if you like it, 99 | [please consider sending a cup of coffee each month][patreon]! 100 | Your support would go a long way toward supporting 101 | the creation and maintenance of this teaching material! 102 | (Up next will be Bayesian content - on probabilistic programming - 103 | just in time for SciPy 2020!) 104 | 105 | [mridul]: https://mriduls.github.io/ 106 | [patreon]: https://patreon.com/ericmjl 107 | 108 | Stay safe and enjoy the sunshine! 109 | Eric 110 | -------------------------------------------------------------------------------- /docs/newsletter/2020/09-september.md: -------------------------------------------------------------------------------- 1 | # Data Science Programming September 2020 Newsletter 2 | 3 | Hello fellow datanistas! 4 | 5 | Welcome to the September edition 6 | of the programming-oriented data science newsletter. 7 | I hope you've all been staying safe amid the COVID-19 outbreak. 8 | 9 | There's no special theme this month, 10 | just a smattering of cool tools and articles 11 | that I think will improve your productivity! 12 | 13 | ## [Setting up VSCode for Python Development like RStudio](https://stevenmortimer.com/setting-up-vs-code-for-python-development-like-rstudio/) 14 | 15 | Firstly, 16 | a blog post by Steven Mortimer on how to set up VSCode, 17 | which is a really awesome IDE, 18 | in such a way that it behaves like RStudio. 19 | For R users who have to transition over to Python 20 | (e.g. for work, or for personal interest), 21 | this should help bridge the gap a bit! 22 | 23 | ## [Pylance in VSCode](https://devblogs.microsoft.com/python/announcing-pylance-fast-feature-rich-language-support-for-python-in-visual-studio-code/) 24 | 25 | Speaking of VSCode, 26 | I have been test-driving Pylance in my workflow at work, 27 | and it's blazing fast and performant for code checking!
28 | As I was writing my code, 29 | the Pylance VSCode extension continually checked my code, 30 | helping me to catch execution errors before I even ran the code. 31 | Amazing stuff, Microsoft, I like what you've become now :). 32 | 33 | ## [ECDFs are in Seaborn!](https://github.com/mwaskom/seaborn/pull/2141#event-3453087212) 34 | 35 | Since learning about ECDFs a few years ago, 36 | I have advocated for visualizing distributions of data 37 | [using ECDFs rather than histograms](http://ericmjl.com/blog/2018/7/14/ecdfs/). 38 | Well, nothing beats having best practices conveniently available, 39 | so I'm super happy to see ECDFs now available in seaborn! 40 | 41 | ## [Stupid Simple Kubernetes](https://levelup.gitconnected.com/stupid-simple-kubernetes-e509355fba3d) 42 | 43 | From experience at work, 44 | I can vouch for the idea 45 | that it's completely worthwhile for a data scientist 46 | to learn the ideas around containers, 47 | Kubernetes included. 48 | To help you get up to speed, 49 | my colleague Zach Barry found an awesome article, 50 | titled "[Stupid Simple Kubernetes](https://levelup.gitconnected.com/stupid-simple-kubernetes-e509355fba3d)". 51 | Lots of terms in the K8s world get clarified in that article. 52 | I hope you enjoy it! 53 | 54 | ## [Learn in Public](https://www.swyx.io/writing/learn-in-public/) 55 | 56 | This is an article that resonated deeply with me. 57 | Learning in public has been, for me, 58 | the biggest career hack that I have experienced. 59 | Now, Shawn Wang has articulated clearly the benefits of doing so! 60 | The biggest is being able to build a public-facing portfolio 61 | that you can point to, one that demonstrates your skill set. 62 | 63 | ## From my collection 64 | 65 | Some things I recently wrote about: 66 | 67 | - [Software skills are important, for they help us data scientists think _clearly_.](https://ericmjl.github.io/blog/2020/8/21/software-engineering-as-a-research-practice/) 68 | - [Some early thoughts test-driving `pandera` for data validation.](https://ericmjl.github.io/blog/2020/8/30/pandera-data-validation-and-statistics/) 69 | - [`.also()`, which comes from the Kotlin programming language, proposed in `pyjanitor` as a new feature](https://github.com/ericmjl/pyjanitor/issues/731) - I'm excited to see where this one goes! 70 | - I'll be speaking at JupyterCon 2020 this year! 71 | Super excited to release a talk on how we compiled Network Analysis Made Simple 72 | into our [eBook](http://leanpub.com/nams) and [website](https://ericmjl.github.io/Network-Analysis-Made-Simple/)! 73 | 74 | ## A plug for an awesome open source contributor 75 | 76 | The final thing I'd like to include in this newsletter 77 | is a completely unsolicited but heartfelt advertisement for 78 | [Samuel Oranyeli](https://samukweku.github.io/data-wrangling-blog/). 79 | He's been a consistent contributor to the `pyjanitor` project, 80 | and I have witnessed his skills grow 81 | over the past few months of contributions. 82 | The most important quality he possesses is consistent learning! 83 | If you're hiring for a Python developer in the Sydney, 84 | Australia area or remotely, 85 | do consider putting him on your list! 86 | 87 | ## Thank you for reading! 88 | 89 | As always, 90 | let me know on [Twitter](https://twitter.com/ericmjl) if you've enjoyed the newsletter, 91 | and I'm always open to hearing about the new things you've learned from it.

Meanwhile, if you'd like to get early access to new written tutorials,
essays, 1-on-1 consulting,
and complimentary access to the Skillshare workshops that I make,
I'd appreciate your support on [Patreon](https://patreon.com/ericmjl)!

Stay safe, stay indoors, and keep hacking!

Cheers,
Eric

--------------------------------------------------------------------------------
/docs/workflow/code-review.md:
--------------------------------------------------------------------------------

# Practicing Code Review

The practice of code review is extremely beneficial to software engineering.
I believe it has its place in data science as well.

## What code review is

Code review is the process by which a contributor's newly committed code
is reviewed by one or more teammate(s).
During the review process, the teammate(s) are tasked with ensuring that they

- understand the code and are able to follow the logic,
- find potential flaws in the newly contributed code,
- identify poorly documented code and confusing use of variable names,
- raise constructive questions and provide constructive feedback

on the codebase.

If you have practiced scientific research before,
you will find it essentially identical to peer review,
except that code is the thing under review.

## What code review _isn't_

Code review is not the time
for a senior person to slam the contributions of a junior person,
nor vice versa.

## Why data scientists should do code review

### Reason 1: Sharing Knowledge

The first reason is to ensure that project knowledge
is shared amongst teammates.
By doing this, we ensure that
in case the original code creator needs to be offline for whatever reason,
others on the team can cover for that person and pick up the analysis.
When N people review the code, N+1 people know what went on.
(It does not necessarily have to be N == number of people on the team.)

In the context of notebooks, this is even more important.
An analysis is complex,
and involves multiple modelling decisions and assumptions.
Raising these questions,
and pointing out where those assumptions should be documented
(particularly in the notebook),
is a good way of ensuring
that N+1 people know those implicit assumptions that go into the model.

### Reason 2: Catching Mistakes

The second reason is that
even so-called "senior" data scientists are human,
and will make mistakes.
With my interns and less-experienced colleagues,
I will invite them to constructively raise queries about my code
where it looks confusing to them.
Sometimes, their lack of experience gives me an opportunity to explain
and share design considerations during the code review process,
but at other times, they are correct, and I have made a mistake in my code
that should be rectified.

### Reason 3: Social Networking

If your team is remote,
then code review can be an incredibly powerful way
of interacting with one another
in a professional and constructive fashion.

Because of code review,
even in the absence of in-person chats,
we still know someone else is looking at the product of our work.
The constructive feedback
and the mark of approval at the end of the code review session
are little plus points that add up to a great working relationship
in the long run,
and reduce the sense of loneliness in working remotely.

## What code review can be

Code review can become a very productive time of learning for all parties.
What it takes is the willingness to listen to the critique provided,
and the willingness to raise issues on the codebase in a constructive fashion.

## How code review happens

Code review usually happens in the context of a pull request
to merge contributed code into the master branch.
The major version control system hosting platforms (GitHub, BitBucket, GitLab)
all provide an interface to show the "diff"
(i.e. newly contributed or deleted code)
and comment directly on the code, in context.

As such, code review can happen entirely asynchronously, across time zones,
and without needing much in-person interaction.

Of course, being able to sync up either via a video call,
or by meeting up in person,
has numerous advantages by allowing non-verbal communication to take place.
This helps with building trust between teammates,
and hence doing even "virtual" in-person reviews
can be a way of being inclusive towards remote colleagues.

## Parting words

If your firm is set up to use a version control system,
then you probably have the facilities to do code review available.
I hope this essay encourages you to give it a try.

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/terminal/pre-commits.md:
--------------------------------------------------------------------------------

# Using `pre-commit` git hooks to automate code checks

Git hooks are an awesome way
to automate checks on your codebase locally
before committing changes to your code repository.

That said, setting them up
involves digging into the `.git` folder of your repository,
and can feel intimidating to set up and replicate
across multiple local clones of repositories.

Thankfully, there is an easier way.

The developers of the `pre-commit` framework have given us a wonderful tool
to standardize and automate the replication of pre-commit git hooks.

## What git hooks are

Git hooks are basically commands that are run just before or after
git commands are executed.
In this essay's context, I basically consider them
a great way to run automated checks on our code before we commit it.

## Getting started with `pre-commit`

First off, you should follow the `pre-commit` instructions for getting set up.
These instructions are available
on the [`pre-commit`](https://pre-commit.com/) website.
For those of you who know what you are doing
and just want something to copy/paste:

```bash
conda install -c conda-forge pre-commit
pre-commit sample-config > .pre-commit-config.yaml
pre-commit install
pre-commit run --all-files
```

## Configuring your `pre-commit`

While the default set is nice, you might want to install other hooks.

For example, a Python project
might want to default to using `black` as the code formatter.
To enable automatic `black` formatting _and_ checking before committing code,
we need to add `black` to the configuration file that was produced
(`.pre-commit-config.yaml`).

```yaml
- repo: https://github.com/psf/black
  rev: 19.3b0
  hooks:
    - id: black
```

A classic mistake that I made was to add black directly underneath the defaults:

```yaml
# THIS IS WRONG!!!
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v2.3.0
  hooks:
    - id: check-yaml
    - id: end-of-file-fixer
    - id: trailing-whitespace
    - id: black  # THIS IS WRONG!!!
```

You will get an error if you do this. Be forewarned!

## Updating your pre-commit after updating `.pre-commit-config.yaml`

If you forgot to add a hook but have just edited the YAML file to do so,
you will need to run the command to install the hooks.

```bash
pre-commit install-hooks
# Optional
pre-commit run --all-files
```

Now, the new hooks will be installed.

## What happens when you use pre-commit

As soon as you commit your source files,
just before the commit happens,
your installed pre-commit hooks execute.
If the hooks modify any files,
then the commit is halted,
and the files that were modified will show up as being "modified"
or "untracked"
in your `git status`.

At this point, add the files that were modified by your pre-commit hooks,
commit those files,
and re-enter your commit message.
In this way, you will prevent yourself from committing code
that does not pass your code checks.

## Good pre-commit hooks for Python projects

My opinionated list of nice hooks to have can be found below;
a configuration sketch for wiring them up appears at the end of this essay.

- black
- pydocstyle
- isort

## Benefits of setting up pre-commit (and hooks)

By setting up a standard configuration that gets checked into source control,
we are setting our team up for success in working together.
Opinionated checks are now delegated to automated machinery
rather than requiring human intervention,
hence freeing us up to discuss higher order issues
rather than nitpicking on code style.

Moreover, by using the `pre-commit` framework, we take a lot of the tedium out
of setting up the pre-commit git hooks correctly.
I've tried to do that before, and found writing the bash script to be
a fragile task to execute.
It's fragile because I'm not very proficient in Bash,
and I have no other way of testing the git pre-commit hooks
apart from actually making a commit.
Yet, it seems like we should be able to modularize our hooks,
such that they are distributed, installed, and executed in a standard fashion.
This is what the pre-commit framework gives us.
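
To make that list concrete, here is a minimal sketch
of what the corresponding `.pre-commit-config.yaml` entries might look like,
assuming each project publishes a pre-commit hook definition (as `black` does).
The `rev` pins below are illustrative placeholders, not recommendations;
check each project's releases for a current tag.

```yaml
# Illustrative sketch only: pin `rev` to real, current release tags.
- repo: https://github.com/psf/black
  rev: 19.3b0
  hooks:
    - id: black
- repo: https://github.com/pycqa/pydocstyle
  rev: 4.0.0
  hooks:
    - id: pydocstyle
- repo: https://github.com/pycqa/isort
  rev: 4.3.21
  hooks:
    - id: isort
```

Note how each tool lives under its own `repo` entry,
which is exactly the point of the "classic mistake" example earlier.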

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/newsletter/2020/08-august.md:
--------------------------------------------------------------------------------

# Data Science Programming August 2020 Newsletter

Hello fellow datanistas!

Welcome to the August edition
of the programming-oriented data science newsletter!

This edition of the newsletter has a large dose of SciPy 2020 inside it.
I participated in the conference as a tutorial instructor
(and as one of the Financial Aid chairs),
though I did miss the Austin, TX food!
(Seriously, Texan BBQ is one of the best!)
If you're interested in seeing the whole playlist,
[check it out on YouTube](https://www.youtube.com/enthought)!
If not, come check out the talks I've watched and liked,
a subset curated just for you!

## [Frictionless Data for Reproducible Biology by Lilly Winfree](https://youtu.be/vZAi4OnfH-Q)

The reason I like this talk is primarily
because of the idea of "Data Packages",
where raw data and its metadata are packaged in a machine-readable format.
In my mind, I'm contrasting this idea against
large-scale data collection efforts;
in biosciences, many datasets are small and designed for one question,
but may be useful for other problems by providing,
for example, useful priors on parameters.
Here, a data package helps users ship and
distribute a self-contained unit of data
that others can build on top of.
I'm imagining many cool use cases,
both in public-facing research and in internal-facing workflows!

## [Continuous Integration for Scientific Python Projects by Stanley Seibert](https://youtu.be/OAlr9vY5XLU)

In this talk, Stan Seibert (one of the Numba core developers)
speaks about the advantages of standing up
a continuous integration pipeline for your code,
as well as challenges that you'll encounter along the way.
I find this to be a useful video for data scientists,
because in it Stan gives a good overview of what to look out for.

## [Awkward Array: Manipulating JSON like Data with NumPy like Idioms by Jim Pivarski](https://youtu.be/WlnUF3LRBj4)

This one has to be one of my favourite talks,
because the package featured in it has an awesome name,
and brings NumPy idioms and semantics into the world of nested
and "awkwardly"-structured data.
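
If you're curious what that looks like,
here's a tiny, hypothetical sketch, assuming the `awkward` (version 1) API;
the data and variable names are made up for illustration:

```python
import awkward as ak

# A "ragged" array: rows of unequal length, which plain NumPy cannot express.
events = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]])

print(events * 10)         # elementwise math, NumPy-style
print(ak.num(events))      # row lengths: [3, 0, 2]
print(events[events > 3])  # boolean masking, also NumPy-style
```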

## [JAX: Accelerated Machine Learning Research by Jake Vanderplas](https://youtu.be/z-WSrQDXkuM)

I'm a fan of the NumPy API
because it's the array-computing _lingua franca_ of the Python world,
and I strongly believe that targeting a common API
(and evolving it in a community-oriented fashion)
is the right way to build the PyData ecosystem.
JAX does this by making array-oriented automatic differentiation,
GPU/TPU acceleration,
just-in-time compilation,
and vectorized mapping all first-class citizens
alongside the idiomatic NumPy API.
I totally dig it!
And I use it for research and production at work.
I'd encourage you to try it out too!

## [`matplotlib` Data Model by Hannah Aizenman](https://youtu.be/XC0M76CmzHg)

If you use `matplotlib`, then this Maintainers track talk by Hannah Aizenman
is going to make your eyes light up!
In it, she talks about CZI-funded work
to refactor the data model underneath `matplotlib`,
which will enable a _ton_ of really cool things downstream.
I'm not going to spoil it for you; check it out!
(And also check out the other _cool talks_ by the other maintainers!)

## [Interactive Supercomputing with Jupyter at NERSC by Rollin Thomas](https://youtu.be/nU-FDFrtOvM)

I think this is a great case study talk that shows how JupyterHub is used
at a research institution to help facilitate computational research.
If your organization is thinking about setting something up,
I think this talk will give you valuable insights and lessons!

## [Tutorials](https://www.youtube.com/playlist?list=PLYx7XA2nY5Gde-6QO98KUJ9iL_WW4rgYf)

If I _really_ wanted to, I would have listed all 10 tutorials down here amongst my recommendations,
but I know you came for a curation.
Here are the two that I think are most generally useful for data scientists:

- [Introduction to Conda for (Data) Scientists](https://youtu.be/qn5zfdJtcYc): This being such a foundational tool for distributing data science packages, I think it's worth getting our mental models straightened out!
- [Jupyter Interactive Widget Ecosystem](https://youtu.be/8IYbdshUd9c): With Jupyter notebooks being so idiomatic, and with widgets being so useful for dashboarding, pedagogy and more, this one is an easy recommendation!

But seriously, check out all 10 of them!

## From my collection

Here are a few snippets of my participation this year at SciPy!

- [Call prediction, prediction, not inference!](https://youtu.be/VzRj55pas3I?t=435) (My ~~rant~~ lightning talk at SciPy.)
- [Bayesian Data Science by Simulation](https://youtu.be/8eh5A72hIWM) (a tutorial I led, based on material I co-developed with Hugo Bowne-Anderson!)

In some other news, the [Network Analysis Made Simple eBook has launched](https://leanpub.com/nams)!
In line with my personal philosophy
of democratizing access to learning material,
everything is [freely available online](https://ericmjl.github.io/Network-Analysis-Made-Simple/index.html),
but if you'd like to support us (mostly by keeping us caffeinated)
or would like an offline copy that will be kept up-to-date for life,
please consider purchasing a copy!

## Thank you for reading!

Alrighty, I shan't toot my own horn anymore.
I hope you enjoyed this special SciPy curation edition of the programming-oriented data science newsletter!
As always, let me know on [Twitter](https://twitter.com/ericmjl)
if you've enjoyed the newsletter,
and I'm always open to hearing about the new things you've learned from it.
Next month, we resume regularly scheduled, ahem, programming!

Meanwhile, if you'd like to get early access to new written tutorials,
essays, 1-on-1 consulting and complimentary access to the Skillshare workshops that I make,
I'd appreciate your support on [Patreon](https://patreon.com/ericmjl)!

Stay safe, stay indoors, and keep hacking!

Cheers,
Eric

--------------------------------------------------------------------------------
/docs/software-skills/documentation.md:
--------------------------------------------------------------------------------

# Documenting your code

Writing lightweight documentation is a practice that I have found
sorely lacking in data science.
In this essay, I will show you how to introduce lightweight documentation
into your code.

## Why document your code

There are a few good reasons to document your code.

Firstly, your future self will thank you
for having a plain English translation of what you _intended_ to do
with that block of code.
Oftentimes, the _intent_ behind the code
is lost in the translation from our heads to actual code.

Secondly, other readers of your code
will also thank you.

Thirdly, by clarifying what exactly you intended to accomplish with a block of code,
as well as the major steps taken towards accomplishing those goals,
you will often end up with a much cleaner implementation.

## When to document your code

A pragmatic choice would be once you have accomplished
a logical chunk of work.

I usually do it as soon as I define a Python function.

## Where your code documentation should go

As a general rule of thumb,
keeping code documentation as close to the actual source code as possible
is probably the best way to approach this.

For Python programmers, this would imply taking advantage of __docstrings__!

**Docstrings** occur in the following places:

1. Right after a function or class method definition.
1. Right inside a class definition.
1. Right at the top of a `.py` module.

An anti-pattern here
would be writing your documentation in an external system,
such as a Wiki.
(Woe betide the code developer who writes code docs in Confluence...)
This is because the documentation is not proximal to the source code.
I have found myself forgetting to update the docstrings
after updating the source code.
If it's easy to forget to update the docs
when the docs are right next to the source,
imagine how much easier it is to forget to update external docs!

Where, then, would documentation on how the code is organized live?
I would argue it should be pushed as close to the source code as possible.
For example, we can use the `.py` module docstrings
to describe why certain modules exist.

## An example

Here is a skeleton to follow:

```python
"""
This module houses all functions that cannot be neatly categorized
in other places.
70 | """ 71 | 72 | def my_function(arg1, arg2): 73 | """ 74 | Calculates something based on arg1 and arg2. 75 | 76 | This calculated thing is intended to be used 77 | by `this_other_function`, 78 | so the return type should not be changed. 79 | 80 | :param arg1: Describe arg1 81 | :param arg2: Describe arg2 82 | :returns: ``the_thing_being_returned``, a pandas DataFrame (for example). 83 | """ 84 | the_thing_being_returned = ... # implement the function 85 | return the_thing_being_returned 86 | ``` 87 | 88 | Now, let's see this in action with a function 89 | that returns a snake-cased version of a string 90 | with all punctuation also removed. 91 | (This is a simplified implementation of what is implemented in `pyjanitor`'s 92 | `clean_names` function.) 93 | 94 | ```python 95 | import string 96 | 97 | def clean_string(s): 98 | """ 99 | Remove all punctuation from string, and convert to lower_snake_case. 100 | 101 | An example of the input and output: 102 | 103 | "My string!" -> "my_string" 104 | 105 | :param s: String to clean. 106 | """ 107 | s = s.replace(string.punctuation, "_").replace(" ", "_").strip("_").lower() 108 | return s 109 | ``` 110 | 111 | You may notice that the docstring is longer than the implementation. 112 | Frequently (though not always), 113 | I have found that when docstring length exceeds implementation length, 114 | it is a sign that the author(s) of the code 115 | have been thoughtful about its implementation. 116 | This bodes well for working in a team, 117 | especially when a data scientist hands over a prototype 118 | to the engineering team. 119 | 120 | ## Addressing objections 121 | 122 | The main objections to injecting "basic software engineering" 123 | into a data scientist's workflow 124 | usually center around not having enough time. 125 | 126 | As always, I am sympathetic to this objection, 127 | because I also operate under time constraints. 128 | 129 | One thing I will offer is that docs are an investment of time 130 | for the team, rather than for the individual. 131 | We save multiples of time downstream 132 | when we write good docs. 133 | One way to conceptualize this is the number of person-hours saved 134 | down the road by oneself and one's teammates when good docs exist. 135 | We minimize the amount of time spent reading code 136 | to grok what it is about. 137 | 138 | At the same time, 139 | the practice of clarifying what we intend to accomplish with the function 140 | can help bring clarity to the implementation. 141 | This I have mentioned above. 142 | Having a clean implementation makes things easier to maintain later on. 143 | Hence, time invested now on good docs 144 | also helps us later on. 145 | 146 | As with other software engineering skills, 147 | this is a skill that can be picked up, refined, and honed. 148 | We get more efficient at writing docs the more we do it. 149 | 150 | ## Parting words 151 | 152 | I hope this essay has helped you get a feel 153 | for how you can write well-documented code. 154 | At the same time, I hope that by showing you a simple anchoring example 155 | that you will be able to replicate the pattern in your own work. 156 | 157 | ## Thank you for reading! 158 | 159 | If you enjoyed this essay and would like to receive early-bird access to more, 160 | [please support me on Patreon][patreon]! 161 | A coffee a month sent my way gets you _early_ access to my essays 162 | on a private URL exclusively for my supporters 163 | as well as shoutouts on every single essay that I put out. 
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/software-skills/refactoring.md:
--------------------------------------------------------------------------------

# Refactoring your code

How many times have you found yourself copy/pasting code from one notebook to another?
If the answer is "many", then this essay probably has something for you.
We're going to look at the practice of "refactoring" code,
and how it applies in a data science context.

## Why refactor

When writing code, we _intend_ to have a block of code do one thing.
As such, every place it is applied should draw on a single source of truth.
However, the practice of copying and pasting code
gives us multiple sources of truth.
Refactoring code, thus,
gives us a way of establishing a single source of truth for our functions,
which can be called on in multiple situations.

## When to refactor

The short answer is "basically whenever you find yourself hitting copy+paste"
on your keyboard.

## How do we refactor

The steps involved are as follows.

1. Wrap the semi-complex block of code in a function.
1. Identify what you would consider to be an "input" and "output" for the function.
1. Take specific variable names and give them more general names.

## An example

Let's take the example of a chunk of code that takes a protein sequence,
compares it to a reference sequence,
and returns all of the mutations that it has.
(We will only implement a naive version for the sake of pedagogy.)

```python
sequence1 = ...
sequence2 = ...

mutations = []
for i, (letter1, letter2) in enumerate(zip(sequence1, sequence2)):
    # Record only the positions where the two sequences actually differ.
    if letter1 != letter2:
        mutations.append(f"{letter1}{i+1}{letter2}")
mutations = "; ".join(mutations)
```

This more or less should accomplish what we want.
Let's now apply the ideas behind refactoring to this code block.

```python
def mutation_string(reference, sequence, sep="; "):
    mutations = []
    for i, (letter1, letter2) in enumerate(zip(reference, sequence)):
        if letter1 != letter2:
            mutations.append(f"{letter1}{i+1}{letter2}")
    return sep.join(mutations)
```

You'll notice the three steps coming into play.

**Firstly**, we simply shifted the main logic of the code into a function definition.

**Secondly**, we then generalized the function a bit,
by renaming `sequence1` and `sequence2` to what we usually intend them to be:
a `sequence` of interest and a `reference` sequence.

**Finally**, we defined those two as inputs,
alongside a keyword argument called `sep`,
which defines the separator between each mutation.

## Bonus

On the basis of this function definition,
we can do some additional neat things!

For example, in protein sequence analysis,
our `reference` sequence is usually kept constant.
Hence, we can actually create a custom `mutation_string`
for our reference sequence using `functools.partial`
by fixing `reference` to a particular value,
thus eliminating the need to repetitively pass in the same reference string.

```python
from functools import partial

protein1 = ...  # define the string here.

prot1_mut_string = partial(mutation_string, reference=protein1)

protein2 = ...  # define the string here.

mutstring = prot1_mut_string(sequence=protein2)
```

## Where should this function be refactored to

You can choose to keep it in the notebook, and that would be fine
if the function is used only in a single notebook.

If you find yourself needing to call on that same function from another notebook,
do the right thing and create a `utils.py` (or analogous) Python module
that lives in the same directory as the notebook.
Then, import the refactored function from `utils.py`.

If you feel sophisticated, you can also create a custom Python library
for your project. I will address this in a separate essay.

An anti-pattern, though, would be to attempt to treat the notebook as source code
and import the function from one notebook into another.
Notebooks are great for one thing:
weaving functions together into an integrated analysis.
I'm of the opinion that we should use a tool the way it was intended,
and bring in other tools to do what we need.
In this respect, I think that Databricks notebooks do the wrong thing
by bowing to bad human first instincts rather than encouraging productive behaviours.

## Where do we find time to do this

I hear this concern, as I wrestled with the same concerns myself.

Isn't it faster to just copy/paste the code?
What if I don't end up reusing the code elsewhere?
Isn't the time then wasted?

In thinking back to my own habits, I realized early on
that doing this was not a matter of technical ability
but rather a matter of mindset.

Investing the time into doing simple refactoring alongside my analyses
does take immediate time away from the analysis.
However, the deliberate practice of refactoring early on
earns back multiples of the time spent as the project progresses.

Moreover, if and when the project gets handed over "in production",
or at least shared with others to use,
our colleagues can spend less time navigating a spaghetti-like codebase,
and more time building a proper mental model of the codebase
to build on top of.

On the possibility of not reusing the code elsewhere,
I would push back strongly.
Refactoring is not a common skill, while copy/pasting code is.
Every chance we get to refactor code is practice for the skill,
which only gets sharper and more refined as we do it more.
Hence, even just for the sake of getting more practice,
it is worthwhile to refactor at every chance.

## Concluding words

I hope this mini-essay demystifies the practice of code refactoring,
and gives you some ideas on how to make it part of your workflow.

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/newsletter/2020/07-july.md:
--------------------------------------------------------------------------------

# Data Science Programming July 2020 Newsletter

Hello datanistas!

Welcome to the July edition of the programming-oriented data science newsletter.

I usually try to send the newsletter on the first Monday of the month,
but this edition is a little bit later than usual,
and that’s because I was attending SciPy 2020’s virtual conference this month!
Be sure to [catch the videos on Enthought’s YouTube channel](https://www.youtube.com/c/enthought) next week,
when they are edited and uploaded!
(The talks are already up, check them out!)

Back to regularly scheduled programming (\*cough cough the SciPy puns cough\*),
this month’s newsletter focuses on production ML systems and everything around them!

## [On getting ML models into production](http://veekaybee.github.io/2020/06/09/ml-in-prod/)

Vicki Boykis has this very well-written article titled
"[Getting machine learning to production](http://veekaybee.github.io/2020/06/09/ml-in-prod/)".
In there, she details a lot of the struggle in getting an ML model into a production system.
I found it very instructive to read.
As it turns out, your ML model is kind of the least of your worries.
I won’t spoil it for you - take a good 10 minutes out of your day to read it!

## [MLOps](https://mlops-github.com/)

Related to ML in production is the term
that is quickly becoming "a thing": MLOps.
In the same vein as DevOps, DevSecOps, etc.,
it’s all about continuously running things
to check for the reproducibility of your analyses,
and at least ensuring that the code continues to run.
(Checking that everything is semantically correct
is still a human job that can’t be eliminated.)

GitHub has put together [a resource](https://mlops-github.com/)
to help you learn about some of the tooling that facilitates
automation, collaboration, and reproducibility in your ML workflows.

If anything, I have found at work that continuously executed pipelines
are the basic unit for engineering reliability into both my software and my models,
and I’d encourage you to do the same!

## [Approach Your Data with a Product Mindset](https://hbr.org/2020/05/approach-your-data-with-a-product-mindset)

This one comes from the Harvard Business Review.
Usually the HBR is a tad too suit-oriented for my tastes,
but having been involved in some data products at work,
I found that this article resonated with me.
Production systems usually imply something that directly impacts decision-making,
and "data products" are what help facilitate/accelerate that process.
Especially if there’s a focus on "unmet needs",
that’s when a data + model project can turn into something impactful.
Let me not spoil the read for you,
and instead [come check out the article here](https://hbr.org/2020/05/approach-your-data-with-a-product-mindset).
I hope it gives you inspiration for your work!

## [On Technical Debt...](https://matthewmcateer.me/blog/machine-learning-technical-debt/)

If you’ve read the paper titled
"[Hidden Technical Debt in Machine Learning Systems](https://papers.nips.cc/paper/5656-hidden-technical-debt-in-machine-learning-systems.pdf)",
then come read an article by Matthew McAteer,
in which he dissects the paper and teases out which points time has made obsolete.
[It’s an eye-opening read](https://matthewmcateer.me/blog/machine-learning-technical-debt/)!

## Assortments of Goodies

Some other things I have found to be important and informative include:

* [The proposal of a DataFrame protocol for the PyData ecosystem](https://discuss.ossdata.org/t/a-dataframe-protocol-for-the-pydata-ecosystem/267)
* [A succinct introduction to metamorphic testing](https://www.hillelwayne.com/post/metamorphic-testing/)
* [`pbcopy` and `pbpaste`](https://langui.sh/2010/11/14/pbpaste-pbcopy-in-mac-os-x-or-terminal-clipboard-fun/), a pair of macOS utilities for moving text between the terminal and the clipboard
* and what I would consider to be [Coiled Computing’s manifesto](https://medium.com/coiled-hq/distributed-computing-for-data-scientists-bfabc72d39da)! (To be clear, they did not pay me to put this link in here, I’m genuinely excited about what they’re building!)

## From my collection

Now for some things from my own collection that I’m excited to share!

### [Network Analysis Made Simple](http://ericmjl.github.io/Network-Analysis-Made-Simple/)

Each year, I submit Network Analysis Made Simple to PyCon, SciPy and PyData conferences,
where it gets recorded and shared with the world for free.
This year, I’m super happy to announce that my co-instructor and I
have [revamped the website](http://ericmjl.github.io/Network-Analysis-Made-Simple/)!
We spent some time restructuring the material,
adding a theme that provides search,
and adding a pipeline that reproducibly builds the notebook collection.
For those of you who like eBook artifacts to keep, we also compiled a book!
[If you’re interested in it, come tell us what you think the book is worth](https://leanpub.com/nams).
We’ll be officially launching next week, after the final chapter is added to the collection!

([Bayesian Data Science by Simulation and Probabilistic Programming](https://github.com/ericmjl/bayesian-stats-modelling-tutorial)
is also undergoing a similar rebuild, stay tuned!)

A few colleagues have also given me feedback
that the Python data science ecosystem
is kind of like "the Wild Wild West".
Reflecting on my experience thus far,
I can appreciate the sentiment,
and so I sat down and wrote
[a long essay that tries to linearize/untangle the ecosystem for newcomers](https://ericmjl.github.io/essays-on-data-science/miscellaneous/pydata-landscape/).
I hope it’s useful for you too :).
My [Patreon supporters](https://ericmjl.github.io/essays-on-data-science/supporters/)
have had early access to the article for a while,
so if you appreciate the work, I’d love to hear from you on Patreon!

## Moar Twitter

Have you tried to unsubscribe from an email list and gotten the response that it can "take a few days"?
Well... follow [this thread](https://twitter.com/Joe8Bit/status/1156312965265707013) to learn why!
(I’d love it if you’d stay with this newsletter though!)

## Thank you for reading!

Hope you enjoyed this edition of the programmer-oriented data science newsletter!
As always, let me know on [Twitter](https://twitter.com/ericmjl) if you've enjoyed the newsletter,
and I'm always open to hearing about the new things you've learned from it.
Next month will be a special SciPy 2020 edition,
as I find time to carefully catch up on and review the talks that have come by!

Meanwhile, if you'd like to get early access to new written tutorials, essays,
1-on-1 consulting and complimentary access to the Skillshare workshops that I make,
I'd appreciate your support on [Patreon](https://patreon.com/ericmjl)!

Stay safe, stay indoors, and keep hacking!

Cheers,
Eric

--------------------------------------------------------------------------------
/docs/miscellaneous/learning-to-learn.md:
--------------------------------------------------------------------------------

# How I Learned to Learn

In this essay, I'd like to reflect back on
how I learned to learn new things.
For a data scientist, it's impossible to know _everything_,
but I do think that having a broad knowledge base can be very handy.
Especially when confronted with a new problem class,
having a broad toolkit of methods to solve it
can give us a leg-up in terms of efficiency.
This set of reflections hopefully lights up some lightbulbs
for your own learning journey.

## Learning by doing/building/making

> "Carve out time to reinvent the wheel, to learn about the wheel."

One way that I think is very effective in learning new topics
is to learn by making things from scratch.
This trick, I believe, is particularly effective
when learning about the foundational topics that underlie
the API abstractions that we interact with
as data scientists.

For example, I learned quite a ton about architecting a deep learning library
by trying to make one myself.
The end result is [fundl], a deep learning framework that I wrote
to support my own learning
about the so-called fancy math that underlies deep learning models.
`fundl` fits in the **"model", "loss", "optimizer"** thought framework
that I rely on for reasoning about deep learning models,
and helps me focus on the "model" portion.

[fundl]: https://github.com/ericmjl/fundl

I have used it at work to re-implement models that I have seen
implemented in other frameworks (e.g. PyTorch and TensorFlow),
and translate them into the NumPy API.
In doing so, I not only build familiarity with the models,
but also gain familiarity with the other tensor library APIs,
helping me to keep pace with framework development
while also leveraging existing knowledge that I have (in the NumPy API).

Through the process of implementing deep learning models,
I have found that my mental model
of linear algebra and data transformations has grown too.
For example, I am no longer satisfied to think of a deep learning model
in terms of an amorphous black box.
Rather, thanks to reimplementation, I am much more inclined
to think about the model as doing some form of rotation and projection
in n-dimensional space,
which is exactly what dot products are all about.
Thinking this way, I believe, prevents a predisposition towards
_anthropomorphization_ of machine learning models,
which is just a fancy term for ascribing human properties to models.

## Learning by teaching

> "Having made the wheel, share how it was made."

Teaching something is also an incredibly effective method
to learn a new topic.
I was able to learn graph theory during graduate school
not only because I used it as a tool in my research,
but also because I made teaching material in Python
and brought it to conferences to share the knowledge.

I think one of the key reasons
why teaching is so unreasonably effective in learning
is that it forces us to demonstrate mastery over our knowledge
in a few ways.

Firstly, in preparing the teaching material,
we anticipate the questions that may arise from others.
To address those questions, in turn, we must be prepared
with knowledge deeper than what we have chosen to present.

Secondly, any presentation of the material
involves a linearization of a messy knowledge graph.
In my conception, when I present material,
I am tracing a path through the knowledge graph,
while sprinkling in edges that branch off a main knowledge trunk.

```mermaid
graph LR;
    A((A)) ==> B((B));
    A((A)) --> C((C));
    B((B)) ==> D((D));
    C((C)) ==> E((E));
    D((D)) ==> C((C));
    B((B)) --> E((E));
    D((D)) --> E((E));
```

The third point pertains to learning by teaching in quantitative topics.
By forcing myself to "teach" the ultimate dumb student
- a Python interpreter - to do math-y things,
I not only make an abstract topic concrete,
I also have to verify that the abstract topic is implemented correctly,
because a Python interpreter will definitely get it wrong
if I implemented it wrong.

I've been incredibly fortunate to have a few platforms to do teaching,
the primary one being the Python and data science conferences that I attend.
That said, there are many avenues for teaching
that you could take advantage of,
including at work (1-on-1 pair coding or workshops),
regional or international conferences,
e-learning platforms,
and more,
and I would encourage you to leverage the platform
that suits your situation best.

## Leveraging existing knowledge

> "Pick projects that are adjacent to what I know how to do."

Continuing the "knowledge graph" analogy referenced above,
I have made an effort in my learning journey
to leverage as much existing knowledge as I can.
It seems to me that knowledge is best picked up and made to stick
when I can use one topic to anchor another, and vice versa.

A few lightweight examples that have shown up in my learning journey:

- [Connecting graph message passing with linear algebra](/machine-learning/message-passing)
- [Implementing Bayesian models from scratch but leveraging Python](/machine-learning/computational-bayesian-stats/)
- [Digging into deep learning starting from linear regression](https://github.com/ericmjl/dl-workshop)

In the process of leveraging my existing knowledge to learn new things,
I find that tying the learning process
to the creation of "minimally complex examples"
greatly accelerates my own learning.


??? note "Minimally Complex Examples"

    These are examples that are simple to grok,
    but not trivial.
    For example, it's trivial to illustrate
    sampling from a (multivariate) Gaussian distribution,
    which is how [sampyl](https://github.com/mcleonard/sampyl)
    illustrates MCMC sampling on its docs page.
    However, it is non-trivial, and in fact quite illuminating,
    to illustrate sampling from a joint distribution
    of data, likelihood, and priors
    involving a Gaussian and its parameters.

## Seeking learning partners and teachers

> Learn and teach with others.

I also have to state that I have benefited much
from learning from others.
For example, my primary deep learning teacher was David Duvenaud,
back when he was a post-doc at Harvard.
(He is now a professor at the University of Toronto.)
It was from him that I gained the framework of
deep learning as "model + loss + optimizer",
and if I remember correctly,
he was the one that taught me how to think about linear regression
in that exact framework.

Additionally, a friend from amongst the PyMC developers, Colin Carroll,
has been particularly helpful and inspiring.
I read [his blog](https://colindcarroll.com),
in which he writes about his own learnings and insights.
In particular, I very much appreciate how he uses "minimal complex examples"
to illustrate how things work.
He was also the one who kept reminding me
that gradient descent doesn't happen in MCMC,
which inspired the essay
[on MCMC](/machine-learning/computational-bayesian-stats/).

More generally,
I find that identifying learning partners and teachers
with whom I can check my understanding
is a great "social" strategy for picking up ideas.
I generally try to find win-win scenarios,
where I can offer something in exchange,
as this helps balance out the learning partnership
for my fellow learner too.

## Asking the "dumb" questions

One thing I do know I'm infamous for is asking dumb questions.
By "dumb" questions, I mostly mean questions that clarify basic ideas
that I might have missed, or still have a gap on.

In my mind, there are very, very few dumb questions.
(I would probably classify
repetitively asking the same basic questions over and over
as not being particularly smart -
use a notebook for heaven's sake!)
In a more intimate learning situation, say, a 1-on-1 session,
clarifying basic questions as soon as they come up
is a wonderful way of ensuring that
our foundational knowledge is strengthened.
In larger settings,
I am almost always certain
that someone else shares the same basic questions that I do.

## Concluding Words

This was neither a comprehensive reflection on how exactly I learn
nor a comprehensive overview of how everybody learns.
Nonetheless, it is my hope that you find it useful to reflect on,
and that it gives you ideas for learning new technical topics.

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/workflow/effective-commit-messages.md:
--------------------------------------------------------------------------------

# Effective Git Commits in Data Science

Continuing on the theme of the use of Git in data science,
I thought I would write about how to use git commits effectively
in our day-to-day data science work.

## How `git` commits are intended to be used

Git commits are intended to be used as a running log
of what gets checked into a code repository.
In software engineering,
each commit is intended to be a “logical unit of work”.

One intent behind defining a commit as a “logical unit of work”
is that if that logical unit of work turns out to be faulty,
we can revert that unit of work and _only_ that unit of work
without touching other units of work.

Git commits can also help us track who made contributions to a repository,
as each commit also contains information about the committer
(e.g. name and email address).

We can view the commit history at the terminal
by typing the following incantation:

```bash
git log --decorate --graph
```

That will give us an interface to the commit log.
It will show a running log of the commits to the project,
as well as every commit message that was put in.
Writing commit messages
as if we're going to read them at a later date, in reverse sequential order,
can help us write better commit messages.

## `git` commits in analysis-heavy projects

In the software world, `git` commits are a logical way to work.
By comparison, in data analysis-heavy work,
it is seemingly more difficult to define
a “logical unit of work” than we might in software engineering.

After all,
what exactly constitutes a “logical unit” of work in data analytics?
Is it the answering of a question?
That might yield commits/changes that are very large.
Is it a software change?
That might yield commits/changes that are too small.
Admittedly, there is a bit of an art to getting this right.

Here, I think treating `git` commits more as a “log of work done”
and less as a “report of work done”
might be helpful in adapting `git` as a lab notebook-style log book.

### Effective `git` commits

But before we describe how, a few preliminaries are in order.
Let’s take a look at
what effective and informative commit messages accomplish:

**Firstly**, if we are committing something that is work-in-progress
(and yes, this should be permitted, because end-of-day always rolls by),
a commit message can mark the fact that there is still work to be done,
and provide enough prose to restore context the next day.

**Secondly**, when used in tandem with a timeline,
an informative commit message lets us quickly isolate when work was done,
thus allowing us to retrace the progression of the project.

**Finally**,
good commit messages allow others we collaborate with
to get a handle on the work that was already done.
Well-written `git` commit messages can help colleagues who review our work
get up to speed quickly on what was done, and what to review.

In other words,
effective commit messages act like documentation
for our future selves and for others.
Once again, the “social coding” paradigm comes back.

??? question "Social coding?"

    Social coding:
    where we aren’t programming something alone,
    but rather writing code in collaboration with others’ input.
    OSS development is a wonderful example of this.

## `git` commit messages: examples in data science contexts

Let’s see a few examples in action.

### The Trivial Change Message

Suppose we applied trivial changes,
such as code formatting.
Rather than writing a message that reads:

???+ failure "Don't do this"

    ```text
    black
    ```

Perhaps a more informative message might be:

???+ success "Do this"

    ```text
    Applied code formatting (make format).
    ```

We don’t need an extended message (unlike those we might see later), because it is a trivial change.

Now, I have been guilty of just writing `black` as the commit message,
but usually that is in the context where I am working on my own project alone.
Keeping in mind that commit messages are intended to be read by others,
the more informative version is clearer to read
and only takes practice to become second nature.

### The Work-In-Progress (WIP) Message

Sometimes, the end of the day rolls by just like that,
or we realize we have a mid-afternoon meeting to attend
(these are, _the wurst_ sausages!).
In those scenarios, putting in a WIP commit may be helpful.

So instead of writing a commit message that reads:

???+ failure "Don't do this"

    ```text
    WIP loaded data
    ```

We can instead write a commit message that reads:

???+ success "Do this"

    ```text
    WIP finished code that loads data into memory

    We still need to do the following:

    - Check statistical covariation between columns
      and remove correlated features.
    - Identify the best predictors.
    ```

Now, when we look at the `git log`,
we will see something that looks like this
right at the top of our development branch:

```
* commit abe3d2e8ed55711a57835d96e67207aa2f07f383 (HEAD -> feature-branch)
| Author: Me
| Date: Fri Nov 15 14:01:13 2019 -0500
|
|     WIP finished code that loads data into memory
|
|     We still need to do the following:
|
|     - Check statistical covariation between columns and remove correlated features.
|     - Identify the best predictors.
|
* commit ...
```

In this way, the `git` commit log gives us a way
to use it as a “lab notebook”-style running log of what we have done.

### The Report on Progress

Pedantically,
this is distinguished from the WIP message described above
by being a “final” (but not necessarily binding) message in the work log.

An uninformative commit message for this would look like:

???+ failure "Don't do this"

    ```text
    Finally done with model building
    ```

By contrast, an informative one might look something like this:

???+ success "Do this"

    ```text
    Model building (Issue #34) ready for review

    Finished:

    - Pipeline taking data from input (strings) to activity prediction.
    - Custom code for data pipeline has been stored in custom package.
      Tests and docs written.
    - Notebooks documenting work are also written.
      Static HTML version for archival also generated.

    Not done:

    - Hyperparameter selection.
      This is the logical next step,
      and as agreed at last meeting, of highest priority.
    ```

Admittedly, it can be tough to know when to write this one,
and I think it’s because it _feels_ like
we might want to be sure that this is _absolutely_ the place
that we _actually_ want to write such a message.

To this, I would suggest
simply committing (pun intended) to writing it when appropriate,
and worrying about minor things in later commits.

## Squashed commits

If we squash commits in our `git` workflow (e.g. when merging branches),
then writing such detailed commit messages might seem unnecessary.
To which my response is, yes indeed!
In the case of using squashed commits,
really only the final commit message
ends up being stored in the running log of what gets done.
Hence, it makes perfect sense to focus on writing good commit messages
only at the merge stage,
rather than at every single commit.

## Intentional adoption of better commit messages

As I have observed with my own and colleagues’ workflows,
we do not regularly write informative commit messages
because we don’t read the git log.
Then again, we don’t read the git log
because it doesn’t contain a lot of information.

Hold on, that sounds kind of circular, doesn’t it?

I think the chicken-and-egg cycle at some point has to be broken.
By starting at _some_ point,
we break a vicious cycle of uninformative logging,
and break into a virtuous cycle of good record-keeping.
And that really is what this essay is trying to encourage:
**better record-keeping!**

## Further Reading
1. [How to Write a Git Commit Message][git-chrisbeams]
   by [Chris Beams][chrisbeams].

[git-chrisbeams]: https://chris.beams.io/posts/git-commit/
[chrisbeams]: https://chris.beams.io/

??? note "A note to Chris"

    Thank you for writing a wonderful article.
    I'll be praying for a speedy recovery, Chris.

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters,
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/newsletter/2020/06-june.md:
--------------------------------------------------------------------------------

# Data Science Programming June 2020 Newsletter

Hello datanistas!

We're back with another edition of the programmer-oriented data science newsletter.
This month, I have so much that I've learned and want to share,
so I'm _thoroughly_ excited to be writing this newsletter edition!

## Python 3.9 Beta!

First things first, Python 3.9's latest beta has been released!
There are new language features in there, including:

1. New dictionary operators
2. A topological sorter class in the new `graphlib` module
3. A "least common multiple" (`lcm`) function in the math library,
4. And the best of them all: `string.removeprefix('prefix_goes_here')` and `string.removesuffix('suffix_goes_here')`!
   This is a serious convenience piece for those of us who work with files!

Check out [Martin Heinz' blog post on Medium][py39] to learn more!

[py39]: https://medium.com/@martin.heinz/new-features-in-python-3-9-you-should-know-about-14f3c647c2b4

## Learn Through The Data Science Design Manual

During this extraordinary COVID-19 time,
Springer did an extraordinary thing that I never expected:
they released a whole bucketload of books for free online!
One of them caught my eye: ["The Data Science Design Manual"][dsmanual].
Having browsed through the book PDF, I'm impressed by its coverage of the foundational topics
that I think _every_ data scientist should be equipped with:
statistical inference, data wrangling, linear algebra, and machine learning.
The author, Steven Skiena, also covers more in there.

Go [grab the PDF][dsmanual] while it's still free!

[dsmanual]: https://link.springer.com/book/10.1007%2F978-3-319-55444-0


## Easy `matplotlib` animations

Recently, [`celluloid`][celluloid] caught my eye: it's a package that lets you create `matplotlib` animations easily!
If you need a dead-simple example to convince you to check it out, here's one lifted straight from the repository:

```python
from matplotlib import pyplot as plt
from celluloid import Camera

fig = plt.figure()
camera = Camera(fig)
for i in range(10):
    plt.plot([i] * 10)
    camera.snap()
animation = camera.animate()
```

But seriously though, if you use the workhorse Python drawing package `matplotlib` for anything,
this package can be considered one of those "great tricks to have" in your bag!


[celluloid]: https://github.com/jwkvam/celluloid

## Better Design Skills: Points of View

Continuing the theme of visualization,
I wanted to share with you a resource from Nature Methods that has influenced
the entirety of how I approach data visualization and figure design.
This is the [Points of View series][pov],
written by Bang Wong and Martin Krzywinski and many other co-authors.
The entire series [is available online][pov], and is a valuable resource to read.

Two fun tidbits: I devoured the entire series while doing my doctoral training,
eagerly awaiting each new release _like a Netflix addict_.
And I was thoroughly thrilled when Bang decided to join the department I work in at NIBR!
Imagine getting to work with your grad school hero :).

[pov]: http://blogs.nature.com/methagora/2013/07/data-visualization-points-of-view.html

## Better Software Skills: Document your tests!

For those of you who know me, I am a strong proponent of data scientists
being equipped with good, basic software skills.
When we write code in a "basically good" way (refactored, documented, and tested),
we accelerate our productivity many-fold.
One of my interns reminded me of this when we realized
that something that would have otherwise taken days to get right in SQL
ended up being 10 minutes of work
because we documented and tested our pandas DataFrame caches.
(If you wish to read more about testing, I write about it [in my Essays on Data Science][test_code].)

[test_code]: https://ericmjl.github.io/essays-on-data-science/software-skills/testing/

Documenting code is important.
Turns out, your _test suite_ is also code!
So [in his blog post][doctest], Hynek Schlawack makes the argument that we ought to document our tests,
something that has become painfully obvious in some of the projects I have worked on.
His blog post, then, gets an absolutely strong recommendation from me!

[doctest]: https://hynek.me/articles/document-your-tests/

## Work Anywhere with Development Containers

For those of you who, like myself, moonlight as a software engineer because you develop tools,
this next piece might come as music to your ears:
Visual Studio Code has [superb support for developing a project _inside_ a Docker container][devcontainer].
If you try it out, I guarantee you
the convenience of never having to get someone else set up with development instructions
will be liberating.

Since finding out about it on Thursday (28 May),
I've enabled dev containers on [my personal website][ericmjl],
[my Essays collection][essays],
and the [pyjanitor project][pyjanitor].
In each case, Docker Hub automatically builds containers on every commit to the default branch,
and those containers are referenced in the dev container configuration file,
which means _your local machine never has to build the container_;
you only have to pull it down!
I also got everything working remotely,
so my puny little 12" MacBook now uses a remote GPU-enabled development server.
Speaking of which, if you're interested in making an open source contribution,
or wish to just test-drive dev containers on an actual project,
check out [the docs I wrote for the pyjanitor project][devdocs]!

[devcontainer]: https://code.visualstudio.com/docs/remote/containers
[ericmjl]: https://github.com/ericmjl/website
[essays]: https://github.com/ericmjl/essays-on-data-science
[pyjanitor]: https://github.com/ericmjl/pyjanitor
[devdocs]: https://pyjanitor.readthedocs.io/contributing.html#get-started

## Automate Workflow with Continuous X

I first saw what "Continuous X" meant
when I made my first pull requests to the `matplotlib` project,
and I have been hooked ever since.
Having a continuous pipeline runner
like Travis or Jenkins or Azure Pipelines
automatically run code and style checks on _every commit_
takes a ton of drudgery out of guaranteeing that our software works properly.
It's like having a Roomba go through your kitchen every time it knows you've finished a meal.
How does "continuous X" apply to data science projects, though?

Turns out, individuals far more experienced and much smarter than I am
have been thinking about this problem too.
In particular, I want to highlight two articles,
one by [Danilo Sato, Arif Wider and Christoph Windheuser][cd_fowler]
and one [on Booklet.ai][cd_booklet].
In both cases, they raise possible ways to integrate pipeline-based automation into data projects,
making them robust and reproducible.
Be sure to check the articles out!

[cd_fowler]: https://martinfowler.com/articles/cd4ml.html
[cd_booklet]: https://booklet.ai/blog/continuous-delivery-machine-learning-cd4ml/

## From My Collection

I have two articles from my own collection to share.

The first one is about [**how to set up a personal platform as a service (PaaS) called Dokku**][blog_dokku].
It's written for those who are especially cheap (like yours truly)
and don't want to pay $7/month to Heroku for each project that gets hosted there.
For those of you who do want to learn the basics of Heroku-based deployment,
I have [a class on Skillshare][skillshare] that you can use too,
which is being used by the Insight Data Science Fellows in Boston!

[blog_dokku]: https://ericmjl.github.io/essays-on-data-science/miscellaneous/static-sites-on-dokku/
[skillshare]: https://skl.sh/3dbXxNa

The second one is about [**a hack to speed up data loading**][blog_cachier],
using a package called [`cachier`][cachier].
It's a neat hack - especially if you wrap specific data queries from a database into a Python function!
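To give a flavour of the pattern, here's a minimal sketch
(the function, the fake timing, and the `stale_after` choice are all made up for illustration,
not lifted from the blog post):

```python
import datetime
import time

import pandas as pd
from cachier import cachier


@cachier(stale_after=datetime.timedelta(days=1))
def load_experiment_data(experiment_id: str) -> pd.DataFrame:
    """Stand-in for a slow database query or API call."""
    time.sleep(5)  # simulate the expensive part
    return pd.DataFrame({"experiment": [experiment_id], "value": [42]})


# The first call is slow; repeat calls with the same argument
# return the on-disk cached result until it goes stale after a day.
df = load_experiment_data("exp-001")
```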
[cachier]: https://github.com/shaypal5/cachier
[blog_cachier]: https://ericmjl.github.io/blog/2019/10/18/caching-long-running-function-results/

## Take a break, have a bit of humour

Let's close with some humorous stuff, if only to lighten the mood in these tumultuous times.

Firstly, _Hossein Siamaknejad actually did it_:
he automated a game using Python.
And the hack was _absolutely brilliant_:
"RGB detection and programmatically controlled mouse and keyboard".
Props to you, [Hossein][hossein]!

[hossein]: https://www.linkedin.com/posts/siamaknejad_python-ai-automation-ugcPost-6665159908478066688-JB8I/

Secondly, the prolifically-hilarious Kareem Carr writes about ["practicing safe.... modelling"][kareem].

[kareem]: https://twitter.com/kareem_carr/status/1245731021707976704

## Happy, ahem, modelling :)

Hope you enjoyed this edition of the programmer-oriented data science newsletter!
As always, let me know [on Twitter][twitter] if you've enjoyed the newsletter,
and I'm always open to hearing about the new things you've learned from it.

[twitter]: https://twitter.com/ericmjl

If you'd like to get early access to new written tutorials, essays,
1-on-1 consulting (I just did one session with one of my supporters!)
and complimentary access to the Skillshare workshops that I make,
I'd appreciate your support on [Patreon][patreon]!

[patreon]: https://patreon.com/ericmjl

Stay safe, stay indoors, and keep hacking!

Cheers,
Eric

--------------------------------------------------------------------------------
/docs/software-skills/testing.md:
--------------------------------------------------------------------------------

# Testing your code

Writing tests for code is a basic software skill.
Writing tests helps build confidence in the _stability_ of our code.

## When to write tests

There are two "time scales" at which I think this question can be answered.

The first time scale is "short-term".
As soon as we finish up a function, that first test should be written.
Doing so lets us immediately sanity-check our intuition
about the newly-written function.

The second time scale is "longer-term".
As soon as we discover bugs, new tests should be added to the test suite.
Those new tests should either cover that exact bug,
or cover the class of bugs together.

A general rule-of-thumb that has proven reliable
is to write an automated test for any function you come to rely on.

## How to get set up

In a Python project, first ensure that you have `pytest` installed.
If you follow recommended practice
and have one `conda` environment per project,
then you should be able to install `pytest` using `conda`:

```bash
# if you use conda:
conda install pytest
# if you use pip:
pip install pytest
```

## The anatomy of a test

When using `pytest`, your tests are written as functions
whose names are prefixed with `test_`,
usually followed by the name of the function under test:

```python
from custom_library import my_function

def test_my_function():
    """Test for my_function."""
    # set up test here.
    assert some_condition
```

We can then execute the test from the command line:

```bash
pytest .
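# pytest auto-discovers tests: by default, it collects files named
# test_*.py (or *_test.py) and runs the test_-prefixed functions inside.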
```

_Voila!_ The tests will be executed, and you will see them run one by one.

## The kinds of tests you could write

Let's go through the kinds of tests you might want to write.

### Execution tests

I start with this kind of test because
these are the simplest to understand:
we simply execute a function to make sure that it runs without breaking.

```python
from custom_lib import my_function

def test_my_function():
    """Execution test for my_function."""
    my_function()
```

This kind of test is useful when your function is not parameterized,
and simply calls on other functions inside your library.
It is also incredibly useful as a starter test
when you cannot think of a better test to write.

One place where I have used this test pattern
is when we built a project dashboard using Panel.
The dashboard is made from many complex layers of function calls,
involving database queries, data preprocessing, cached results, and more.
Sporadically, something would break,
and it was difficult to debug.
By wrapping the dashboard execution inside a Python function
and executing it by simply calling `dashboard()`,
we could discover bugs as soon as they showed up,
rather than so-called "in production".

### Example-based tests

An example-based test looks basically like this:

```python
from custom_lib import another_function

def test_another_function():
    arg1 = ...
    arg2 = ...
    result = another_function(arg1, arg2)

    expected_result = ...

    assert result == expected_result
```

Basically, we set up the test with an example,
and check that when given a set of pre-specified inputs,
a particular expected result is returned.

When writing code in the notebook,
I find myself writing example-based tests informally all the time.
They are those "sanity-check" function calls
where I manually check that the result looks correct.
I am sure you do too.

So rather than relying on manual checks,
it makes perfect sense to simply
copy and paste the code into a test function
and execute it.

### Advanced Testing

The above I consider to be the basic, bare-minimum testing
that a data scientist can do.
Of course, there are more complex forms of testing
that a QA engineer would engage in,
and I find it useful to know at least what they are
and what tools we have to do these forms of testing
in the Python ecosystem:

- Parameterized tests: [`pytest` has these capabilities](https://docs.pytest.org/en/latest/parametrize.html).
- Property-based tests: [`hypothesis` gives us these capabilities](https://hypothesis.readthedocs.io/en/latest/details.html).

## Tests for Data

Data are notoriously difficult to test,
because they are a snapshot of the stochastic state of the world.
Nonetheless, if we impose prior knowledge on our testing,
we can ensure that certain errors in our data never show up.

### Nullity Tests

For example, if we subject a SQL query to a series of transforms
that are supposed to guarantee a densely populated DataFrame,
then we can write a **nullity test**.
```python
import pandas as pd

def test_dataframe_function():
    """Ensure that the dataframe function returns no null values."""
    df = dataframe_function(*args, **kwargs)
    assert pd.isnull(df).sum().sum() == 0
```

### `dtype` Tests

We can also check that the dtypes of the dataframe are correct.

```python
def test_dataframe_dtypes():
    """Checks that the dtypes of the dataframe are correct."""
    dtypes = {
        "col1": "float32",
        "col2": int,
        "col3": object,
    }
    df = dataframe_function(*args, **kwargs)
    for col, dtype in dtypes.items():
        assert df[col].dtype == dtype
```

### Bounds Tests

We can also check to make sure that our dataframe-returning function
yields data in the correct bounds for each column.

```python
def test_dataframe_bounds():
    """Checks that the bounds of the data are correct."""
    df = dataframe_function(*args, **kwargs)
    # For a column that must be non-negative (>= 0).
    assert df["column1"].min() >= 0

    # For a column that must be strictly positive.
    assert df["column2"].min() > 0

    # For a column that must be strictly negative.
    assert df["column3"].max() < 0
```

DataFrame tests are a special one for data scientists,
because the dataframe is the idiomatic data structure
that we engage with on an almost daily basis.

### Column Name Tests

Having stable and consistent column names in the dataframes that we use
is extremely important;
the column names are like our API to the data.
Hence, checking that a suite of expected column names exists in the dataframe
can be very useful.

```python
def test_dataframe_names():
    """Checks that dataframe column names are correct."""
    expected_column_names = ["col1", "col2", "col3"]
    df = dataframe_function(*args, **kwargs)

    # Check that each of those column names is present.
    for c in expected_column_names:
        assert c in df.columns

    # (Optional) check that _only_ those columns are present.
    assert set(df.columns) == set(expected_column_names)
```

### Other statistical property tests

Testing the mean, median, and mode is difficult,
but under some circumstances,
such as when we know that the data are drawn from some distribution,
we might be able to write a test for the central tendencies of the data.

Placing an automated test
that checks
whether the data match a particular parameterized distribution
with some probability value
is generally not a good idea,
[because it can give a false sense of security](https://allendowney.blogspot.com/2013/08/are-my-data-normal.html).
However, if this is a key modelling assumption
and you need to keep an automated, rolling check on your data,
then having it as a test
can help you catch failures in downstream modelling early.
In practice, I rarely use this because the speed at which data come in
is slow relative to the time I need to check assumptions.
Additionally, the stochastic nature of data
means that this test would be a flaky one,
which is an undesirable property for tests.
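For completeness, here is a minimal sketch of what such a rolling distribution check might look like,
reusing the schematic `dataframe_function` placeholder from above
and assuming (hypothetically) that `column1` should be standard-normal;
the choice of test statistic and the deliberately loose threshold are mine, not a prescription:

```python
from scipy import stats

def test_column1_normality():
    """Flaky by nature: fails with probability ~alpha even when all is well."""
    df = dataframe_function(*args, **kwargs)
    # Kolmogorov-Smirnov test against a standard normal distribution.
    _, p_value = stats.kstest(df["column1"], "norm")
    alpha = 0.001  # deliberately loose, to limit false alarms
    assert p_value > alpha
```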
## Parting words

I hope this essay gives you some ideas
for implementing testing in your data science workflow.
As with other software skills,
these are skills that become muscle memory over time,
hence taking the time from our daily hustle
to practice them makes us more efficient in the long run.
In particular, the consistent practice of testing
builds confidence in our codebase,
not just for our future selves, but also for other colleagues
who might end up using the codebase too.

-----

## A Glossary of Testing in Data Science

**Manual testing**:
Basically where we use a Jupyter notebook
and manually inspect that the function works the way we expect.

**Automated testing**:
Where we provide a test suite and use a test runner (e.g. `pytest`)
to automatically execute all of the tests in the suite.

**Example-based testing**:
Where we provide one or more hard-coded examples in our test suite,
and test that our function works on those examples.

**Parameterized testing**:
Where we provide examples as parameters to our test functions,
helping us reduce code duplication in our test functions.
Not necessarily something distinct from example-based testing.

**Auto-manual testing**:
A not-so-tongue-in-cheek way of describing
automated testing using hard-coded examples.

**Property-based testing**:
Where we use an automatic generator of examples
that fulfill certain "properties".
For example, numbers with range constraints,
or strings generated from an alphabet of a certain length or less.
Property-based testing builds on top of parameterized testing.

**Data testing**:
Where we test the "correctness" of our data.
Property-based testing can be used here,
or we can hard-code checks on our data
that we know should be invariant over time.

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters,
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/miscellaneous/dashboarding-landscape.md:
--------------------------------------------------------------------------------

# A Review of the Python Data Science Dashboarding Landscape in 2019

## Introduction

As Pythonista data scientists,
we are spoiled for choice when it comes to developing front-ends
for our data apps.
We used to have to fiddle with HTML in Flask (or Plotly's Dash),
but now, there are tools in which
"someone wrote the HTML/JS so I didn't have to".

Let me give a quick tour of the landscape of tools
as I've experienced it in 2019.
### Beginnings: Voila

Previously, I had test-driven
[Voila](https://voila.readthedocs.io/en/latest/).
The key advantage I saw back then was in my workflow:
once I had the makings of a UI present in the Jupyter notebook,
and just needed a way to serve it up
independent of having my end-users run a Jupyter server,
Voila solved that use case.
By taking advantage of the existing `ipywidgets` ecosystem
and adding on a way to run and serve the HTML output of a notebook,
Voila solved that part of the dashboarding story quite nicely.
In many respects,
I regard Voila as the first proper dashboarding tool for Pythonistas.

That said, development in a Jupyter notebook
didn't necessarily foster best practices
(such as refactoring and testing code).
When my first project at work ended,
and I didn't have a need for further dashboarding,
I didn't touch Voila for a long time.

### Another player: Panel

Later, [Panel](http://panel.pyviz.org/) showed up.
Panel's development model allowed a more modular app setup,
including importing plotting functions defined inside `.py` files
that returned individual plots.
Panel also allowed me to prototype in a notebook and see the output live
before moving the dashboard code into a source `.py` file.

At work, we based a one-stop-shop dashboard for a project on Panel,
and in my personal life,
I also built a
[minimal panel app](https://github.com/ericmjl/minimal-panel-app)
that I also
[deployed to Heroku](https://minimal-panel-app.herokuapp.com/).
Panel was definitely developed
with both notebook and source-file use cases in mind,
and this shows through in its development model.

That said, Panel apps could be slow to load,
and without a "spinner" solution in place
(i.e. something to show the user
that the app is "doing something" in the background),
it sometimes made apps _feel_ slow
even though the slowness was not really Panel's fault.
(My colleagues and I pulled out all the tricks in our bag to speed things up.)

Additionally, any errors that show up don't get surfaced to the app's UI,
where developer eyeballs are;
instead, they get buried in the browser's JavaScript console
or in the Python terminal where the app is being served.
When deployed, this makes it difficult to see where errors show up,
and to debug them.

### Enter Streamlit

Now, Streamlit comes along, and some of its initial demos are pretty rad.
In order to test-drive it,
I put together this [little tutorial](https://minimal-streamlit.herokuapp.com/)
on the Beta probability distribution for my colleagues.

Streamlit definitely solves some of the pain points
that I've observed with Panel and Voila.

The most important one that I see is that errors are captured by Streamlit
and bubbled up to the UI,
where our eyeballs are going to be when developing the app.
For me, this is a very sensible decision, for two reasons:

Firstly, it makes debugging interactions that much easier.
Instead of needing to have two interfaces open,
the error message shows up right where the interaction fails,
in the same browser window as the UI elements.
Secondly, it makes it possible for us
to use the error messages as a UI "hack" to inform users
where their inputs (e.g. free text) might be invalid,
thereby giving them _informative error messages_.
(Try it out in the Beta distribution app:
it'll give you an error message right below
if you try to type something that can't be converted into a float!)

The other key thing that Streamlit provides as a UI nicety
is the ability to signal to end-users that a computation is happening.
Streamlit does this in three ways, two of which always come for free.
**Firstly**, if something is "running",
then in the top-right hand corner of the page,
the "Running" spinner will animate.
**Secondly**, anything that is re-rendering will automatically be greyed out.
**Finally**, we can use a special context manager
to provide a custom message on the front-end:

```python
import time

import streamlit as st

with st.spinner("Message goes here..."):
    # Long-running work happens inside the context manager;
    # time.sleep is just a stand-in for it here.
    time.sleep(2)
```

So all-in-all, Streamlit seems to have a solution of some kind
for the friction points that I have observed with Panel and Voila.

Besides that, Streamlit, I think, uses a procedural paradigm,
rather than a callback paradigm, for app construction.
We just have to think of the app as a linear sequence of actions
that happen from top to bottom.
State is never really an issue, because every code change
and interaction re-runs the source file from top to bottom, from scratch.
When building quick apps,
this paradigm really simplifies things compared to a callback-based paradigm.

Finally, Streamlit also provides a convenient way to add text to the UI
by automatically parsing as Markdown any raw strings unassigned to a variable
in a `.py` file and rendering them as HTML.
This opens the door to treating a `.py` file as a
[literate programming document](https://en.wikipedia.org/wiki/Literate_programming),
hosted by a Python-based server in the backend.
It'd be useful especially in teaching scenarios.
(With `pyodide` bringing the PyData stack to the browser,
I can't wait to see standalone `.py` files rendered to the DOM!)

Now, this isn't to say that Streamlit is problem-free.
There are still rough edges;
the most glaring (as of today) in the current release
is the inability to upload a file and operate on it.
This has been fixed in [a recent pull request](https://github.com/streamlit/streamlit/pull/488),
so I'm expecting it to show up in a new release any time soon.

The other not-so-big problem that I see with Streamlit at the moment
is the procedural paradigm:
by always re-running code from top to bottom afresh on every single change,
apps that rely on long compute may need a bit more thought to construct,
including the use of Streamlit's caching mechanism.
Being procedural does make things easier for development, though,
and on balance, I would not discount Streamlit's simplicity here.

## Where does Streamlit fit?

As I see it, Streamlit's devs are laser-focused on enabling devs
to _very quickly_ get to a somewhat good-looking app prototype.
In my experience, the development time for the Beta distribution app
took about 3 hours, 2.5 of which were spent on composing prose.
So effectively, I only spent half an hour writing code,
with a live and auto-reloading preview
greatly simplifying the development process.
(I conservatively estimate that this is about 1.5 times
as fast as I would be using Panel.)

Given Streamlit, I would use it to develop two classes of apps:
(1) very tightly-focused utility apps that do one lightweight thing well, and
(2) bespoke, single-document literate-programming education material.

I would be quite hesitant to build more complex things;
then again, for me, that statement would be true more generally anyway,
with whatever tool.
In any case, I think bringing UNIX-like thinking to the web
is probably a good idea:
we make little utilities/functional tools
that can pipe standard data formats from one to another.

## Common pain points across all three dashboarding tools

A design pattern I have desired is to be able to serve a fleet of small,
individual utilities from the same codebase,
each run by its own server process,
but all packaged within the same container.
The only way I can think of at the moment
is to build a custom Flask-based gateway
to redirect properly to each utility's process.
That said, I think this is probably out of scope
for the individual dashboarding projects.

## How do we go forward?

The ecosystem is ever-evolving, and,
rather than being left confused by the multitude of options available to us,
I find myself actually being very encouraged
by the development that has been happening.
There are competing ideas with friendly competition between the developers,
but they are also simultaneously listening to each other and their users,
and converging on similar things in the end.

That said, I think it would be premature to go "all-in" on a single solution
at this moment.
For the individual data scientist,
I would advise being able to build something
using each of the dashboarding frameworks.
My personal recommendations are to know how to use:

- Voila + `ipywidgets` in a Jupyter notebook
- Panel in Jupyter notebooks and standalone `.py` files
- Streamlit in `.py` files.

These recommendations stem mainly from
the ability to style and lay out content without needing much knowledge of HTML.
In terms of roughly when to use what,
my prior experience has been that
Voila and Streamlit are pretty good for quicker prototypes,
while Panel has been good for more complex ones,
though in all cases, we have to worry about speed impacting user experience.

From my experience at work,
being able to quickly hash out key visual elements in a front-end prototype
gives us the ability to better communicate with UI/UX designers and developers
on what we're trying to accomplish.
Knowing how to build front-ends ourselves
lowers the communication and engineering barrier
when taking a project to production.
It's a worthwhile skill to have;
be sure to have it in your toolbox!

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters,
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/machine-learning/reimplementing-models.md:
--------------------------------------------------------------------------------

# Reimplementing and Testing Deep Learning Models

At work, most deep learners I have encountered
have a tendency to take deep learning models
and treat them as black boxes that we should be able to wrangle.
While I see this as a pragmatic first step
to testing and proving out the value of a newly-developed deep learning model,
I think that stopping there
and not investing the time into understanding the nitty-gritty of the model
leaves us in a poor position
to know that model's
(1) applicability domain (i.e. where the model should be used),
(2) computational and statistical performance limitations, and
(3) possible engineering barriers to getting the model performant
in a "production" setting.

As such, with deep learning models,
I'm actually a fan of investing the time to re-implement the model
in a tensor framework that we all know and love:
NumPy (and, by extension, JAX).

## Benefits of re-implementing deep learning models

Re-implementing a model from a deep learning framework
in NumPy code has some real benefits for the time invested.

### Developing familiarity with deep learning frameworks

Firstly, doing so forces us to know the translation/mapping
from deep learning tensor libraries into NumPy.
One of the issues I have had with deep learning libraries
(PyTorch and TensorFlow being the main culprits here)
is that their APIs copy something like 90% of the NumPy API
without making the design considerations behind their deviations
easily accessible.
(By contrast, CuPy has an explicit API policy
that is well-documented and front-and-center in the docs,
while JAX strives to replicate the NumPy API.)

My gripes with tensor library APIs aside, though,
translating a model by hand from one API to another
forces growth in familiarity with both APIs,
much as translating between two languages
forces growth in familiarity with both languages.

### Developing a mechanistic understanding of the model

It is one thing to describe a deep neural network
as being "like the brain cell connections".
It is another thing to know that the math operations underneath the hood
are nothing more than dot products (or tensor operations, more generally).
Re-implementing a deep learning model
requires combing over every line of code,
which forces us to identify each math operation used.
No longer can we hide behind an unhelpfully vague abstraction.
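As a concrete (if tiny) illustration of this point,
here is a fully-connected layer written out in plain NumPy;
the names and sizes are arbitrary,
and the sketch is mine rather than from any particular framework:

```python
import numpy as np

def dense_layer(weights, biases, x):
    """A fully-connected layer: a dot product, a shift, and a nonlinearity."""
    return np.tanh(np.dot(x, weights) + biases)

# Arbitrary sizes: a batch of 4 samples, 3 input features, 2 output units.
rng = np.random.default_rng(42)
weights = rng.normal(size=(3, 2))
biases = np.zeros(2)
x = rng.normal(size=(4, 3))
output = dense_layer(weights, biases, x)  # shape: (4, 2)
```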
### Developing an ability to test and sanity-check the model

If we follow the workflow (that I will describe below)
for reimplementing the model
(or, as the reader should now see, translating the model between APIs),
we will develop confidence in the correctness of the model.
This is because the workflow I am going to propose
involves proper basic software engineering practice:
writing documentation for the model,
testing it,
and modularizing it into its logical components.
Doing each of these requires a mechanistic understanding
of how the model works,
and hence forms a useful way of building intuition about the model,
as well as confidence in its correctness.

### Reimplementing models is _not_ a waste of time

On the contrary, it is a highly beneficial practice
for gaining a deeper understanding of the inner workings
of a deep neural network.
The only price we pay is in person-hours;
yet under the assumption that the model is of strong commercial interest,
that price can only be considered an investment, and not a waste.

## A proposed workflow for reimplementing deep learning models

I will now propose a workflow for re-implementing deep learning models.

### Identify a coding partner

Pair programming is a productive way of teaching and learning.
Hence, I would start by identifying a coding partner
who has the requisite skillset and a shared incentive
to go deep on the model.

Doing so helps in a few ways.

Firstly, we have real-time peer review on our code,
making it easier for us to catch mistakes that show up.

Secondly, working together at the same time means that
both myself and my colleague will learn something about the neural network
that we are re-implementing.

### Pick out the "forward" step of the model

The "forward" pass of the model is where the structure of the model is defined:
basically, the mathematical operations
that transform the input data into the output observations.

A few keywords to look out for
are the `forward()` and `__call__()` class methods.

```python
class MyModel(nn.Module):
    # ...
    def forward(self, X):
        # Implementation of model happens here.
        something = ...
        return something
```

For models that involve an autoencoder,
somewhat more seasoned programmers
might create class methods called `encoder()` and `decoder()`,
which themselves reference another model
that would have a `forward()` or `__call__()` defined.

```python
class AutoEncoder(nn.Module):
    # ...
    def forward(self, X):
        something = self.encoder(X)
        output = self.decoder(something)
        return output
```

Re-implementing the `forward()` part of the model
is usually a good way of building a map
of the equations that are being used
to transform the input data into the output data.

### Inspect the shapes of the weights

While the equations give the model _structure_,
the weights and biases, or the _parameters_,
are the part of the model that is optimized.
(In Bayesian statistics, we would usually presume a model structure,
i.e. the set of equations used alongside the priors,
and fit the model parameters.)
Because much of deep learning hinges on linear algebra,
and because most of the transformations that happen
involve transforming the _input space_ into the _output space_,
getting the shapes of the parameters right is very important.

In a re-implementation exercise with my intern,
where we re-implemented
a specially-designed recurrent neural network layer in JAX,
we did a manual sanity check through our implementation
to identify what the shapes would need to be
for the inputs and outputs.
After that, we encoded that manual check into an automatic test.
Later on, after we built an integration test
that paradoxically failed on shapes,
we eventually uncovered that we were indexing into the wrong dimensions
in our implementation.
This led to us
(1) fixing the bug,
(2) writing a more comprehensive documentation and test suite, and
(3) writing better documentation for the semantic meaning
of each tensor axis.

### Write tests for the neural network components

Once we have the neural network model and its components implemented,
writing tests for those components is a wonderful way of making sure that
(1) the implementation is correct, to the best of our knowledge, and that
(2) we can catch when the implementation might have been broken inadvertently.

The shape test (as described above) is one way of doing this.

```python
import numpy as np

def test_layer_shapes():
    weights = np.random.normal(size=(input_dims, output_dims))
    data = np.random.normal(size=(batch_size, input_dims))
    output = nn_layer(weights, data)
    assert output.shape[1] == output_dims
```

If there are special elementwise transforms performed on the data,
such as a ReLU or exponential transform,
we can test that the numerical properties of the output are correct:

```python
def test_layer_nonlinearity():
    weights = np.random.normal(size=(input_dims, output_dims))
    data = np.random.normal(size=(batch_size, input_dims))

    output = nn_layer(weights, data, nonlinearity="relu")
    assert np.all(output >= 0)
```

### Write tests for the entire training loop

Once the model has been re-implemented in its entirety,
prepare a small set of training data,
pass it through the model,
and attempt to train it for a few epochs.

If the model, as implemented, is doing what we think it should be,
then after a dozen epochs or so,
the training loss should go down.
We can then test that the training loss at the end
is less than the training loss at the beginning.
If the loss does go down, it's necessary but not sufficient for knowing
that the model is implemented correctly.
However, if the loss _does not_ go down, then we will definitely know
that a problem exists somewhere in the code, and can begin to debug.
An example with pseudocode below might look like the following:

```python
from data import dummy_graph_data
from model import gnn_model
from params import make_gnn_params
from losses import mse_loss
from jax import grad
from jax.experimental.optimizers import adam

def test_gnn_training():
    # Prepare training data and initial parameters.
    x, y = dummy_graph_data(*args, **kwargs)
    params = make_gnn_params(*args, **kwargs)

    dloss = grad(mse_loss)
    init, update, get_params = adam(step_size=0.005)
    start_loss = mse_loss(params, gnn_model, x, y)

    state = init(params)
    for i in range(10):
        g = dloss(params, gnn_model, x, y)
        state = update(i, g, state)
        params = get_params(state)

    end_loss = mse_loss(params, gnn_model, x, y)

    assert end_loss < start_loss
```

A side benefit of this is that
if you commit to only judiciously changing the tests,
you will end up with a stable
and copy/paste-able
training loop that you know you can trust
on new learning tasks,
and hence only need to worry about swapping out the data.

### Build little tools for yourself that automate repetitive (boring) things

You may notice that in the above integration test,
we wrote a lot of other functions
that make testing much easier,
such as dummy data generators
and parameter initializers.

These are tools that make the parts of the entire training process
modular and easy to compose.
I strongly recommend writing these things,
and also backing them with more tests
(since we will end up relying on them anyway).

### Now run your deep learning experiments

Once we have the model re-implemented and tested,
the groundwork is present for us to conduct extensive experiments
with the confidence that we know
how to catch bugs in the model
in a fairly automated fashion.

## Concluding words

Re-implementing deep learning models can be a very fun and rewarding exercise,
because it serves as an excellent tool
to check our understanding of the models that we work with.

Without the right safeguards in place, though,
it can also very quickly metamorphose into a nightmare rabbithole of debugging.
Putting basic safeguards in place when re-implementing models
helps us avoid as many of these rabbitholes as possible.

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters,
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!
[tinyletter]: https://tinyletter.com/ericmjl

--------------------------------------------------------------------------------
/docs/software-skills/environment-variables.md:
--------------------------------------------------------------------------------

# A Data Scientist's Guide to Environment Variables

You might have encountered a piece of software asking you for permission to modify your `PATH` variable,
or another program's installation instructions cryptically telling you
that you have to "set your `LD_LIBRARY_PATH` variable correctly".

As a data scientist, you might encounter other environment variable issues
when interacting with your compute stack
(particularly if you don't have full control over it, as is the case for me).
This post is meant to demystify what an environment variable is,
and how it gets used in a data science context.

## What Is An Environment Variable?

First off, let me explain what an environment variable is,
by going in-depth into the `PATH` environment variable.
I'd encourage you to execute the commands here inside your bash terminal
(with appropriate modifications -- read the text to figure out what I'm doing!).

When you log into your computer system, say,
your local computer's terminal or your remote server via SSH,
your bash interpreter needs to know where to look for particular programs,
such as `nano` (the text editor), or `git` (your version control software),
or your Python executable. This is controlled by your `PATH` variable.
It specifies the paths to folders where your executable programs are found.

By historical convention, command line programs,
such as `nano`, `which`, and `top`,
are found in the directory `/usr/bin`,
and the `/bin` folder is for software binaries,
which is why these folders are named `bin`.
The programs in them are the ones that are bundled with your operating system,
and as such, need special permissions to upgrade.

Try it out in your terminal:

```
$ which which
/usr/bin/which
$ which top
/usr/bin/top
```

Other programs are installed (for whatever reason) into `/bin` instead. `ls` is one example:

```
$ which ls
/bin/ls
```

Yet other programs might be installed in other special directories:

```
$ which nano
/usr/local/bin/nano
```

How does your Bash terminal figure out where to go to look for stuff?
It uses the `PATH` environment variable.
It looks something like this:

```
$ echo $PATH
/usr/bin:/bin:/usr/local/bin
```

The most important thing to remember about the `PATH` variable is that it is "colon-delimited".
That is, each directory path is separated from the next using a colon (`:`) character.
The order in which your bash terminal looks for programs goes from left to right:

- `/usr/bin`
- `/bin`
- `/usr/local/bin`

On my particular computer, when I type in `ls`,
my bash interpreter will look inside the `/usr/bin` directory first.
It'll find that `ls` doesn't exist in `/usr/bin`,
and so it'll move to the next directory, `/bin`.
Since my `ls` exists under `/bin`,
it'll execute the `ls` program from there.
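If you prefer to poke at this from Python,
here's a quick snippet (the output will, of course, differ from machine to machine):

```python
import os

# Print the directories searched for executables, in priority order.
for directory in os.environ["PATH"].split(os.pathsep):
    print(directory)
```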
You can see, then, that this is simultaneously super flexible for customizing your compute environment,
yet also potentially super frustrating if a program modified your `PATH` variable without you knowing.

Wait, you can actually modify your `PATH` variable? Yep, and there are a few ways to do this.

## How To Modify the `PATH` variable

### Using a Bash Session

The first way is transient, or temporary, and only lasts for your particular bash session.
You can make a folder have higher priority than the existing paths by "pre-pending" it to the `PATH` variable:

```
$ export PATH=/path/to/my/folder:$PATH
$ echo $PATH
/path/to/my/folder:/usr/bin:/bin:/usr/local/bin
```

Or I can make it have a lower priority than existing paths by "appending" it to the `PATH` variable:

```
$ export PATH=$PATH:/path/to/my/folder
$ echo $PATH
/usr/bin:/bin:/usr/local/bin:/path/to/my/folder
```

The reason this is temporary is because I only export it during my current bash session.

### `.bashrc` or `.bash_profile` File

If I wanted to make my changes somewhat more permanent,
then I would include them inside my `.bashrc` or `.bash_profile` file.
(I recommend using the `.bashrc` file.)
The `.bashrc`/`.bash_profile` file lives inside your home directory
(your `$HOME` environment variable specifies this),
and is a file that your bash interpreter will execute when it first loads.
It will execute all of the commands inside there.
This means you can change your `PATH` variable by simply putting the following inside your `.bashrc`:

```
...other stuff above...
# Make /path/to/folder have higher priority
export PATH=/path/to/folder:$PATH

# Make /path/to/other/folder have lower priority
export PATH=$PATH:/path/to/other/folder
...other stuff below...
```

## Data Science and the `PATH` environment variable

Now, **how is this relevant to data scientists?**
Well, if you're a data scientist, chances are that you use Python,
and that your Python interpreter comes from the Anaconda Python distribution
(a seriously awesome thing, go get it!).
What the Anaconda Python installer does is prioritize
the `/path/to/anaconda/bin` folder in the `PATH` environment variable.
You might have other Python interpreters installed on your system
(for instance, Apple ships its own).
However, this `PATH` modification ensures that
each time you type `python` into your Bash terminal,
you execute the Python interpreter shipped with the Anaconda Python distribution.
In my case, after installing the Anaconda Python distribution, my `PATH` looks like:

```
$ echo $PATH
/Users/ericmjl/anaconda/bin:/usr/bin:/bin:/usr/local/bin
```

Even better, what conda environments do is
prepend the path to the conda environment's binaries folder
while the environment is activated.
For example, with my blog, I keep it in an environment named `lektor`.
Thus...
```
$ echo $PATH
/Users/ericmjl/anaconda/bin:/usr/bin:/bin:/usr/local/bin
$ which python
/Users/ericmjl/anaconda/bin/python
$ source activate lektor
$ echo $PATH
/Users/ericmjl/anaconda/envs/lektor/bin:/Users/ericmjl/anaconda/bin:/usr/bin:/bin:/usr/local/bin
$ which python
/Users/ericmjl/anaconda/envs/lektor/bin/python
```

Notice how the bash terminal now preferentially picks the Python inside the higher-priority `lektor` environment.

If you've gotten to this point, then you'll hopefully realize there are a few important concepts listed here. Let's recap them:

- `PATH` is an environment variable stored as a plain text string, used by the bash interpreter to figure out where to find executable programs.
- `PATH` is colon-delimited; higher-priority directories are to the left of the string, while lower-priority directories are to the right of the string.
- `PATH` can be modified by prepending or appending directories to the environment variable. It can be done transiently inside a bash session by running the `export` command at the command prompt, or it can be done permanently across bash sessions by adding an `export` line inside your `.bashrc` or `.bash_profile`.

## Other Environment Variables of Interest

Now, what other environment variables might a data scientist encounter? These are a sampling of them that you might see, and might have to fix, especially in contexts where your system administrators are off on vacation (or taking too long to respond).

### General Use

**For general use**, you'll definitely want to know where your `HOME` folder is -- on Linux systems, it's often `/home/username`, while on macOS systems, it's often `/Users/username`. You can figure out what `HOME` is by doing:

```
$ echo $HOME
/Users/ericmjl
```

### Python

**If you're a Python user**,
then `PYTHONPATH` is one variable that might be useful.
It is used by the Python interpreter,
and specifies where to find Python modules/packages.

### C++ libraries

**If you have to deal with C++ libraries**,
then knowing your `LD_LIBRARY_PATH` environment variable is going to be very important.
I'm not well-versed enough in this to espouse on it intelligently,
so I would defer to [this website](http://xahlee.info/UnixResource_dir/_/ldpath.html)
for more information on best practices for using the `LD_LIBRARY_PATH` variable.

### Spark

**If you're working with Spark**,
then the `PYSPARK_PYTHON` environment variable would be of interest.
It essentially tells Spark which Python to use for both its driver and its workers;
you can also set the `PYSPARK_DRIVER_PYTHON`
to be separate from the `PYSPARK_PYTHON` environment variable, if needed.

### Data science apps

**If you're developing data science apps**,
then according to the [12-factor app development principles](https://12factor.net),
your credentials to databases and other sensitive information
should be securely stored and dynamically loaded into the environment at runtime.
How then do you mimic this in a "local" environment (i.e. your computer)
without hard-coding sensitive information in your source `.py` files?
221 | 
222 | One way to handle this situation is as follows: 
223 | Firstly, create a `.env` file in your project directory. 
224 | In there, store your credentials: 
225 | 
226 | ```bash 
227 | SOME_PASSWORD="put_your_pw_here" 
228 | SOME_USERNAME="put_your_username_here" 
229 | ``` 
230 | 
231 | Next, add it to your `.gitignore`, so that you never add it to your version control system. 
232 | 
233 | ```bash 
234 | # other things 
235 | .env 
236 | ``` 
237 | 
238 | Finally, in your source `.py` files, use `python-dotenv` to load the environment variables at runtime. 
239 | 
240 | ```python 
241 | import os 
242 | 
243 | from dotenv import load_dotenv 
244 | 
245 | load_dotenv() 
246 | username = os.getenv("SOME_USERNAME") 
247 | password = os.getenv("SOME_PASSWORD") 
248 | ``` 
249 | 
250 | ## Hack Your Environment Variables 
251 | 
252 | This is where the most fun happens! 
253 | Follow along for some stuff you might be able to do 
254 | by hacking your environment variables. 
255 | 
256 | ### Hack #1: Enable access to PyPy. 
257 | 
258 | I occasionally keep up with the development of PyPy, 
259 | but because PyPy is not yet the default Python interpreter, 
260 | and is not yet `conda install`-able, 
261 | I have to put it in its own `$HOME/pypy/bin` directory. 
262 | To enable access to the PyPy interpreter, 
263 | I have to make sure that `$HOME/pypy/bin` is present 
264 | in the `PATH` environment variable, 
265 | but at a lower priority than my regular CPython interpreter. 
266 | 
267 | ### Hack #2: Enable access to other language interpreters/compilers. 
268 | 
269 | This is analogous to PyPy. 
270 | I once was trying out Lua's JIT interpreter to use Torch for deep learning, 
271 | and needed to add its location to my `PATH` in my `.bashrc`. 
272 | 
273 | ### Hack #3: Install Python packages to your home directory. 
274 | 
275 | On shared Linux compute systems that use the `modules` system 
276 | rather than `conda` environments, 
277 | a `modulefile` that you load might be configured 
278 | with a virtual environment that *you don't have permissions to modify*. 
279 | If you need to install a Python package, 
280 | you might want to `pip install --user my_pkg_name`. 
281 | This will install it to `$HOME/.local/lib/python[version]/site-packages/`. 
282 | Ensuring that your `PYTHONPATH` 
283 | includes `$HOME/.local/lib/python[version]/site-packages` 
284 | at a high enough priority is going to be important in this case. 
285 | 
286 | ### Hack #4: Debugging when things go wrong. 
287 | 
288 | In case something throws an error, 
289 | or you get unexpected behaviour -- 
290 | something I encountered before was my Python interpreter 
291 | not being found correctly after loading all of my Linux modules -- 
292 | one way to debug is to temporarily set your `PATH` environment variable 
293 | to some sensible "defaults" and source that, 
294 | effectively "resetting" your `PATH` variable, 
295 | so that you can manually prepend/append while debugging. 
296 | 
297 | To do this, place the following lines inside a file named `.path_default`, 
298 | inside your home directory: 
299 | 
300 | ``` 
301 | export PATH=""  # reset PATH to an empty string. 
302 | export PATH=/usr/bin:/bin:/usr/local/bin  # sensible defaults; customize as needed. 
303 | ``` 
304 | 
305 | After something goes wrong, 
306 | you can reset your PATH environment variable by using the "source" command: 
307 | 
308 | ``` 
309 | $ echo $PATH 
310 | /some/complicated/path:/more/complicated/paths:/really/complicated/paths 
311 | $ source ~/.path_default 
312 | $ echo $PATH 
313 | /usr/bin:/bin:/usr/local/bin 
314 | ``` 
315 | 
316 | Note - you can also execute the exact same commands inside your bash session; 
317 | the interactivity may be helpful while debugging. 
318 | 
319 | ## Conclusion 
320 | 
321 | I hope you enjoyed this article, and that it'll give you a, ahem, 
322 | path forward whenever you encounter these environment variables! 
323 | 
324 | ## Thank you for reading! 
325 | 
326 | If you enjoyed this essay and would like to receive early-bird access to more, 
327 | [please support me on Patreon][patreon]! 
328 | A coffee a month sent my way gets you _early_ access to my essays 
329 | on a private URL exclusively for my supporters 
330 | as well as shoutouts on every single essay that I put out. 
331 | 
332 | [patreon]: https://patreon.com/ericmjl 
333 | 
334 | Also, I have a free monthly newsletter that I use as an outlet 
335 | to share programming-oriented data science tips and tools. 
336 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 
337 | 
338 | [tinyletter]: https://tinyletter.com/ericmjl 
339 | 
--------------------------------------------------------------------------------
/docs/workflow/gitflow.md:
--------------------------------------------------------------------------------
1 | # Principled Git-based Workflow in Collaborative Data Science Projects 
2 | 
3 | [GitFlow] is an incredible branching model for working with code. 
4 | In this essay, I would like to introduce it to you, the data scientist, 
5 | and show how it might be useful in your context, 
6 | especially for working with multiple colleagues on the same project. 
7 | 
8 | [GitFlow]: https://datasift.github.io/gitflow/IntroducingGitFlow.html 
9 | 
10 | ## What GitFlow is 
11 | 
12 | GitFlow is a way of working with multiple collaborators on a git repository. 
13 | It originated in the software development world, 
14 | and gives software developers a way of keeping new development work 
15 | isolated from reviewed, documented, and stable code. 
16 | 
17 | At its core, we have a "source of truth" branch called `master`, 
18 | from which we make branches on which development work happens. 
19 | Development work basically means new code, 
20 | added documentation, 
21 | more tests, etc. 
22 | When the new code, documentation, tests, and more are reviewed, 
23 | a pull request is made to merge the new code back into the `master` branch. 
24 | 
25 | Usually, the act of making a branch 
26 | is paired with raising an issue on an issue tracker, 
27 | in which the problem and proposed solution are written down. 
28 | (In other words, the **deliverables** are explicitly sketched out.) 
29 | Merging into `master` is paired with a code review session, 
30 | in which another colleague (or the tech lead) reviews the code to be merged, 
31 | and approves (or denies) the code merger 
32 | based on whether the issue raised in the issue tracker has been resolved. 
33 | 
34 | From my time experimenting with GitFlow at work, 
35 | I think that GitFlow, when paired with other principled workflows 
36 | that don't directly interact with Git, 
37 | can be of great utility to data scientists. 
38 | It does, however, involve a bit of change in the common mode of working 
39 | that data scientists use. 
40 | 
41 | ??? question "Is GitFlow still confusing for you?" 
42 | 
43 |     If so, please check out [this article][GitFlow] on GitFlow. 
44 |     It includes the appropriate graphics that will make it much clearer. 
45 |     I felt that a detailed explanation here would be rather out of scope. 
46 | 
47 |     That said, nothing beats trying it out to get a feel for it, 
48 |     so if you're willing to pick it up, 
49 |     I would encourage you to find a software developer in your organization 
50 |     who has experience with GitFlow 
51 |     and ask them to guide you on it. 
52 | 
53 | ## GitFlow in a data science project 
54 | 
55 | Here is how I think GitFlow can be successfully deployed 
56 | in a data science project. 
57 | 
58 | Everything starts with the **unit of analysis** that we are trying to perform. 
59 | 
60 | We start by defining the question that we are trying to answer. 
61 | We then proceed forward by sketching out an analysis plan 
62 | (let's call this an **analysis sketch**), 
63 | which outlines the data sources that we need, 
64 | the strategy for analyzing the data 
65 | (roughly including: 
66 | models we think might be relevant to the scale of the problem, 
67 | the plots we think might be relevant to make, 
68 | and where we think future directions might lie). 
69 | 
70 | None of this is binding, 
71 | which makes the analysis sketch 
72 | less like a formal pre-registered analysis plan, 
73 | and more like a tool to help us 
74 | be more thoughtful about what we want to do when analyzing our data. 
75 | After all, one of the myths of data science 
76 | is that we can 
77 | ["stir the pile until the data start looking right"](https://xkcd.com/1838/). 
78 | 
79 | ??? tip "About stirring the pot..." 
80 | 
81 |     If you didn't click the URL to go to XKCD, 
82 |     here's the cartoon embedded below: 
83 | 
84 |     ![](https://imgs.xkcd.com/comics/machine_learning_2x.png) 
85 | 
86 | Once we are done with defining the analysis sketch in an issue, 
87 | we follow the rest of the GitFlow-based workflow: 
88 | We create a branch off from `master`, 
89 | execute on our work, 
90 | and submit a pull request with everything that we have done. 
91 | 
92 | We then invite a colleague to review our work, 
93 | in which the colleague explicitly checks 
94 | that we have delivered on our analysis sketch, 
95 | or, if we have changed course, 
96 | discusses the analysis with us in a formal setting. 
97 | Ideally this is done in person, 
98 | but by submitting a formal pull request, 
99 | our colleague can pull down our code 
100 | and check that things have been done correctly on their computer. 
101 | 
102 | ??? tip "Code review" 
103 | 
104 |     If you want to know more about code review, 
105 |     please check out [another essay][code-review] in this collection. 
106 | 
107 |     [code-review]: /workflow/code-review/ 
108 | 
109 | If your team has access to a [Binder]-like service, 
110 | then review can be done in an even simpler fashion: 
111 | simply create a Binder session for the colleague's fork, 
112 | and explore the analyses there in a temporary session. 
113 | 
114 | [Binder]: https://mybinder.org/ 
115 | 
116 | Once the formal review has finished 
117 | and both colleagues are on the same page with the analysis, 
118 | the analysis is merged back into the `master` branch, and considered done. 
119 | Both parties can now move on to the next analysis. 
120 | 
121 | ## Mindset changes needed to make GitFlow work 
122 | 
123 | In this section, 
124 | I am going to describe some common mindsets 
125 | that data scientists might hold 
126 | that prevent successful adoption of GitFlow, 
127 | and ways to adapt those mindsets to work with GitFlow. 
128 | 
129 | ### Jumping straight into exploratory data analysis (EDA) 
130 | 
131 | This is a common one that even I have done before. 
132 | The refrain in our mind is, 
133 | "Just give me the CSV file! 
134 | I will figure something out." 
135 | Famous last words, once we come to terms 
136 | with the horror that we experience 
137 | in looking through the data. 
138 | 
139 | It might seem, though, 
140 | that we shouldn't be able to sketch an analysis plan 
141 | for EDA, right? 
142 | 
143 | I think that mode of thinking might be a tad pessimistic. 
144 | What we are trying to accomplish with exploratory data analysis 
145 | is to establish our own working knowledge of: 
146 | 
147 | - The bounds of the data, 
148 | - The types of the data (ordinal, categorical, numeric), 
149 | - The possible definitions of a single sample in the dataset, 
150 | - Covariation between columns of data, 
151 | - Whether or not the data can answer our questions, and 
152 | - Further questions that come up while looking at the data. 
153 | 
154 | Hence, a good analysis sketch to raise for exploratory data analysis 
155 | would be to write a Jupyter notebook 
156 | that simply documents all of the above, 
157 | and then have a colleague review it. 
158 | 
159 | ### Endless modelling experiments 
160 | 
161 | This is another one of those traps that I fall into often, 
162 | so I am sympathetic towards others who might do the same. 
163 | 
164 | Scientists (of any type, not just data scientists) 
165 | usually come with an obsessive streak, 
166 | and the way it manifests in data science 
167 | is usually the quest for the best-performing model. 
168 | However, in most data science settings, 
169 | the goal we are trying to accomplish 
170 | requires first proving out the value of our work 
171 | using some form of prototype, 
172 | so we cannot afford to chase performance rabbits down their hole. 
173 | 
174 | One way to get around this is to think about the problem in two phases. 
175 | 
176 | The first phase is **model prototyping**. 
177 | As such, in the analysis sketch, 
178 | we define a deliverable 
179 | that is "a machine learning model that predicts Y from X", 
180 | _leaving out the performance metric for now_. 
181 | In other words, we are establishing a baseline model, 
182 | and building out the analysis framework for evaluating how good the model is 
183 | in the larger applied context. 
184 | 
185 | We do this in a quick and dirty fashion, 
186 | and invite a colleague to review our work 
187 | to ensure that we have not made any elementary statistical errors, 
188 | and that the framework is correct 
189 | with respect to the applied problem that we are tackling. 
190 | (See the note below for more detail.) 
191 | 
192 | ??? note "Note: statistical errors" 
193 |     For example, 
194 |     we need to get splitting done correctly in a time series setting, 
195 |     which does not have i.i.d. samples, 
196 |     compared to most other ML problems. 
197 |     And in a cheminformatics setting, 
198 |     random splits tend to over-estimate model performance 
199 |     when compared to a real-world setting 
200 |     where new molecules are often out-of-distribution. 
201 | 
202 |     If we focused on getting a good model right from the get-go, 
203 |     we may end up missing out on elementary details such as these. 
204 | 
205 | Once we are done with this, 
206 | we embark on the second phase, 
207 | which is **model improvement**. 
208 | Here, we define another analysis sketch 
209 | where we outline the models that we intend to try, 
210 | and for which the deliverable is now 
211 | a Jupyter notebook documenting the modelling experiments we tried. 
212 | As usual, once we are done, 
213 | we invite a colleague to review the work 
214 | to make sure that we have conducted it correctly. 
215 | 
216 | A key here is to define the task 
217 | in as _neutral_ 
218 | and relevant 
219 | terms as possible. 
220 | For example, nobody can guarantee an improved model. 
221 | However, we can promise a comprehensive, if not exhaustive, 
222 | search through model and parameter space. 
223 | We can also guarantee delivering recommendations for improvement 
224 | regardless of what model performance looks like. 
225 | 
226 | ??? note "Note: Neutral forms of goals" 
227 |     As expressed on Twitter before, 
228 |     "the most scary scientist is one with a hypothesis to prove". 
229 |     A data scientist who declares 
230 |     that a high-performing model will be the goal 
231 |     is probably being delusional. 
232 |     I wish I knew where exactly I saw the quote, 
233 |     and hence will not take credit for it. 
234 | 
235 | ### Endless ideation prototyping 
236 | 
237 | Another trap I have fallen into involves endless ideation prototyping, 
238 | which is very similar 
239 | to the "endless modelling experiments" problem described above. 
240 | 
241 | My proposal here, then, is two-fold. 
242 | Firstly, rather than running down rabbit holes endlessly, 
243 | we **trust our instincts** in evaluating the maturity of an idea. 
244 | Secondly, we ought also to define "kill/stop criteria" ahead of time, 
245 | and **move as quickly as possible to kill the idea** 
246 | while also documenting it in a Jupyter notebook. 
247 | If made part of an analysis sketch that is raised on the issue tracker, 
248 | then we can be kept accountable by our colleagues. 
249 | 
250 | ## Benefits of adopting GitFlow and associated practices 
251 | 
252 | At its core, adopting a workflow as described above 
253 | is really about intentionally slowing down our work a little bit 
254 | so that we are more thoughtful about the work we want to finish. 
255 | In work with my colleagues, 
256 | I have found this to be incredibly useful. 
257 | GitFlow and its associated practices bring a suite of benefits 
258 | to our projects, 
259 | and I think it is easy to see how. 
260 | 
261 | By spending a bit more time on thought and on execution, 
262 | we cut down on wasted hours exploring unproductive analysis avenues. 
263 | 
264 | By pre-defining deliverables expressed in a _neutral_ form, 
265 | we reduce stress and pressure on data scientists. 
266 | We also prevent endless rabbit-hole hacking 
267 | to achieve those non-neutrally-expressed goals. 
268 | And we receive a less biased analysis, 
269 | which I believe can only help with making better decisions. 
270 | 
271 | Finally, by inviting colleagues to review our work, 
272 | we also prevent the silo-ing of knowledge in one person, 
273 | and instead distribute expertise and knowledge. 
274 | 
275 | ## How to gradually adopt GitFlow in your data science teams 
276 | 
277 | I know that not every single data science team 
278 | will have adopted GitFlow from the get-go, 
279 | and so there will have to be some form of ramp-up 
280 | to get it going productively. 
281 | 
282 | Because this is a collaborative workflow, 
283 | and because adoption usually happens only in the presence of incentives, 
284 | I think that in order for GitFlow and associated practices to be adopted, 
285 | one or more champions for using GitFlow need to be empowered 
286 | with the authority to use this workflow on any project they embark on. 
287 | They also have to be sufficiently unpressured to deliver, 
288 | so that time and performance pressures do not compromise adoption. 
289 | Finally, they have to be able to teach `git` newcomers 
290 | and debug problems that show up in `git` branching, 
291 | and be able to handle the `git` workflow 
292 | for colleagues who might not have the time to pick it up. 
293 | 
294 | Tooling also has to be present. 
295 | A modern version control system and associated hosting software, 
296 | such as BitBucket, GitHub and GitLab, are necessary. 
297 | Issue trackers also need to be present for each repository 
298 | (or project, more generally). 
299 | 
300 | At my workplace, I have been fortunate to initiate two projects 
301 | on which we practice GitFlow, 
302 | bringing along an intern and a colleague one rank above me 
303 | who were willing to try this out. 
304 | This has led to much better sharing of the coding and knowledge load, 
305 | and has also allowed us to cover for one another much more effectively. 
306 | 
307 | While I may have sounded above as if there is resistance to adoption, 
308 | in practice I know that most data scientists instinctively recognize 
309 | that proper workflows are going to be highly beneficial, 
310 | but lack the time, space, and incentives to introduce them; 
311 | they would jump at the chance to do so 
312 | if given that time, space, and those incentives. 
313 | 
314 | ## Concluding words 
315 | 
316 | I hope that I have convinced you 
317 | that learning GitFlow, and its associated practices, 
318 | can be incredibly useful for the long-term health and productivity 
319 | of your data science team(s). 
320 | 
321 | ## Thank you for reading! 
322 | 
323 | If you enjoyed this essay and would like to receive early-bird access to more, 
324 | [please support me on Patreon][patreon]! 
325 | A coffee a month sent my way gets you _early_ access to my essays 
326 | on a private URL exclusively for my supporters 
327 | as well as shoutouts on every single essay that I put out. 
328 | 
329 | [patreon]: https://patreon.com/ericmjl 
330 | 
331 | Also, I have a free monthly newsletter that I use as an outlet 
332 | to share programming-oriented data science tips and tools. 
333 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 
334 | 
335 | [tinyletter]: https://tinyletter.com/ericmjl 
336 | 
--------------------------------------------------------------------------------
/docs/machine-learning/message-passing.md:
--------------------------------------------------------------------------------
1 | # Computational Representations of Message Passing 
2 | 
3 | **Abstract:** Message passing on graphs, 
4 | also known as graph convolution, 
5 | has become a popular research topic. 
6 | In this piece, 
7 | I aim to provide a short technical primer 
8 | on ways to implement message passing on graphs. 
9 | The goal is to provide clear pedagogy 
10 | on what message passing means mathematically, 
11 | and hopefully point towards cleaner computational implementations 
12 | of the key algorithmic pieces. 
13 | 
14 | **Assumed knowledge:** 
15 | We assume our reader has familiarity with elementary graph concepts. 
16 | More specifically, the terms “graph”, “nodes”, and “edges” 
17 | should be familiar terms. 
18 | Code examples in this technical piece will be written 
19 | using the Python programming language, 
20 | specifically using Python 3.7, NumPy 1.17 (in JAX), and NetworkX 2.2. 
21 | 
22 | ## Introduction to Message Passing 
23 | 
24 | ### Functions on Nodes 
25 | 
26 | Message passing starts with a “function defined over nodes”, 
27 | which we will denote here as $f(v)$ (for “function of node/vertex v”). 
28 | What is this, one might ask? 
29 | In short, 
30 | this is nothing more than a numeric value of some kind 
31 | attached to every node in a graph. 
32 | This value could be scalar, vector, matrix, or tensor. 
33 | 
34 | The semantic meaning of that value 
35 | is typically defined by the application domain 
36 | that the graph is being used in. 
37 | As a concrete example, 
38 | in molecules, a “function” defined over the molecular graph 
39 | could be the scalar-valued proton number. 
40 | Carbon would be represented by the function $f(v) = 6$. 
41 | Alternatively, it could be a vector of values 
42 | encompassing both the proton number and the number of valence electrons. 
43 | In this case, carbon would be represented by the function $f(v) = (6, 4)$. 
44 | 
45 | Visually, one might represent it as follows: 
46 | 
47 | 
48 | ![](message-passing-figures/figure-msg-passing-carbon-methane.png) 
49 | 
50 | ### Message Passing 
51 | 
52 | What then is message passing, or, 
53 | as the deep learning community has adopted, “graph convolution”? 
54 | At its core, message passing is nothing more 
55 | than a generic mathematical operation 
56 | defined between a node’s function value 
57 | and its neighbors’ function values. 
58 | 
59 | As an example, 
60 | one may define a message passing operation 
61 | to be the summation of the function evaluated at a node 
62 | with the function evaluated at its neighboring nodes. 
63 | Here is a simplistic example, 
64 | shown using a scalar on water: 
65 | 
66 | ![](message-passing-figures/figure-msg-passing-water.png) 
67 | 
68 | Summation is not the only message passing operation that can be defined. 
69 | In principle, 
70 | given any node (or vertex) $v$ and the values on its neighbors $N(v)$, 
71 | we may write down a generic function $f(v, N(v))$ 
72 | that defines how the function value on each node 
73 | is to be shared with its neighbors. 
74 | 
75 | ## Computational Implementations of Message Passing 
76 | 
77 | For simplicity, 
78 | let us stay with the particular case 
79 | where the message passing operation is defined as 
80 | the summation of a node’s own value with its neighbors’ values. 
81 | 
82 | ### Object-Oriented Implementation 
83 | 
84 | With this definition in place, 
85 | we may then define a message passing operation in Python as follows: 
86 | 
87 | ```python linenums="1" 
88 | def message_passing(G): 
89 |     """Object-oriented message passing operation.""" 
90 | 
91 |     G_new = G.copy() 
92 | 
93 |     for node, data in G.nodes(data=True): 
94 |         new_value = data["value"]  # assuming the value is stored under this key 
95 |         neighbors = G.neighbors(node) 
96 |         for neighbor in neighbors: 
97 |             new_value += G.nodes[neighbor]["value"] 
98 |         G_new.nodes[node]["value"] = new_value 
99 |     return G_new 
100 | ``` 
101 | 
102 | Thinking about computational considerations, 
103 | we would naturally consider this implementation to be slow, 
104 | because it involves a for-loop over Python objects. 
105 | If we had multiple graphs 
106 | over which we wanted message passing to be performed, 
107 | the type-checking overhead in Python will naturally accumulate, 
108 | and may even dominate. 
109 | 
110 | ### Linear Algebra Implementation 
111 | 
112 | How might we speed things up? As it turns out, linear algebra may be useful. 
113 | 
114 | We know that every graph may be represented as an adjacency matrix `A`, 
115 | whose shape is `(n_nodes, n_nodes)`. 
116 | As long as we maintain proper node ordering, 
117 | we may also define a compatibly-shaped matrix `F` for node function values, 
118 | whose shape is `(n_nodes, n_features)`. 
119 | 
120 | Taking advantage of this, 
121 | in order to define the “self plus neighbors” message passing operation 
122 | in terms of linear algebra operations, 
123 | we may then modify `A` by adding to it the identity matrix (a diagonal matrix of ones). 
124 | (In graph terminology, 
125 | this is equivalent to adding a self-loop to every node.) 
126 | 
127 | Then, message passing, 
128 | as defined above, 
129 | is trivially the dot product of `A` and `F`: 
130 | 
131 | ```python linenums="1" 
132 | def message_passing(A, F): 
133 |     """ 
134 |     Message passing done by linear algebra. 
135 | 
136 |     :param A: Adjacency-like matrix, whose shape is (n_nodes, n_nodes). 
137 |     :param F: Feature matrix, whose shape is (n_nodes, n_features). 
138 |     """ 
139 |     # Here, `np` is NumPy or jax.numpy (see "Assumed knowledge" above). 
140 |     return np.dot(A, F) 
141 | ``` 
142 | 
143 | In principle, variants on the adjacency matrix are possible. 
144 | The only hard requirement for the matrix `A` 
145 | is that it has the shape `(n_nodes, n_nodes)`. 
146 | 
147 | #### Adjacency Variant 1: N-degree adjacency matrix 
148 | 
149 | The adjacency matrix represents connectivity at one degree of separation. 
150 | If we take the second matrix power of the adjacency matrix, 
151 | we get back the connectivity of nodes 
152 | that are two degrees of separation apart. 
153 | More generically: 
154 | 
155 | ```python linenums="1" 
156 | def n_degree_adjacency(A, n: int): 
157 |     """ 
158 |     Return the n-degree of separation adjacency matrix. 
159 | 
160 |     :param A: Adjacency matrix, of shape (n_nodes, n_nodes) 
161 |     :param n: Number of degrees of separation. 
162 |     """ 
163 |     return np.linalg.matrix_power(A, n) 
164 | ``` 
165 | 
166 | Performing message passing using the N-degree adjacency matrix 
167 | effectively describes sharing of information 
168 | between nodes that are N degrees of separation apart, 
169 | skipping intermediate neighbors. 
170 | 
171 | #### Adjacency Variant 2: Graph Laplacian matrix 
172 | 
173 | The graph Laplacian matrix is defined as 
174 | the diagonal degree matrix `D` 
175 | (where the diagonal entries are the degree of each node) 
176 | minus the adjacency matrix `A`: `L = D - A`. 
177 | 
178 | This matrix is the discrete analog to the Laplacian operator, 
179 | and can give us information 
180 | about the discrete gradient between a node and its neighbors. 
181 | 
182 | ## Message Passing on Multiple Graphs 
183 | 
184 | Thus far, 
185 | we have seen an efficient implementation 
186 | of message passing on a single graph 
187 | using linear algebra. 
188 | 
189 | How would one perform message passing on multiple graphs, though? 
190 | 
191 | This is a question 
192 | that has applications in graph neural networks 
193 | (especially in cheminformatics). 
194 | For the learning task where one has a batch of graphs, 
195 | and the supervised learning task is 
196 | to predict a scalar (or vector) value per graph, 
197 | knowing how to efficiently message pass over multiple graphs 
198 | is crucial to developing a performant graph neural network model. 
199 | 
200 | The challenge here, though, 
201 | is that graphs generally are of variable size, 
202 | hence it is not immediately obvious how to “tensorify” the operations. 
203 | 
204 | Let us look at a few alternatives, 
205 | starting with the most obvious (but also most inefficient), 
206 | building towards more efficient solutions. 
207 | 
208 | ### Implementation 1: For-loops over pairs of adjacency and feature matrices 
209 | 
210 | If we have multiple graphs, 
211 | they may be represented as a list of feature matrices 
212 | and a list of adjacency matrices. 
213 | The message passing operation, then, 
214 | may be defined by writing a for-loop over pairs of these matrices. 
215 | 
216 | ```python linenums="1" 
217 | def message_passing(As, Fs): 
218 |     outputs = [] 
219 |     for A, F in zip(As, Fs): 
220 |         outputs.append(np.dot(A, F)) 
221 |     return outputs 
222 | ``` 
223 | 
224 | The obvious downside here 
225 | is the Python-level overhead induced 
226 | by looping over pairs of `A`s and `F`s. 
227 | 
228 | ### Implementation 2: Sparse Matrices 
229 | 
230 | Sparse matrices are an attractive alternative. 
231 | Instead of treating graphs as independent samples, 
232 | we may treat them as a single large graph on which we perform message passing. 
233 | If we order the nodes in our adjacency matrix and feature matrix correctly, 
234 | we will end up with a block diagonal adjacency matrix, 
235 | and vertically stacked feature matrices. 
236 | 
237 | ![](message-passing-figures/figure-message-passing-sparse.png) 
238 | 
239 | If we prepare the multiple graphs as a large disconnected graph, 
240 | then we will have a dense feature matrix of shape `(sum(n_nodes), n_feats)`, 
241 | and a sparse adjacency matrix of shape `(sum(n_nodes), sum(n_nodes))`. 
242 | Message passing then becomes a sparse-dense dot product: 
243 | 
244 | ```python linenums="1" 
245 | def message_passing(A, F): 
246 |     return A.dot(F)  # A: a scipy.sparse matrix; F: a dense array. 
247 | ``` 
248 | 
249 | The upside here is that 
250 | message passing has been returned back to its natural form (a dot product). 
251 | The downsides here are that the data must be prepared as a single large graph, 
252 | hence we effectively lose 
253 | what one would call the “sample” (or “batch”) dimension. 
254 | Additionally, the most widely used deep learning libraries 
255 | do not support automatic differentiation 
256 | on sparse-dense or dense-sparse dot products, 
257 | hence limiting the use of this implementation in deep learning. 
258 | 
259 | ### Implementation 3: Size-batched matrix multiplication 
260 | 
261 | An alternative way to conceptualize message passing 
262 | is to think of graphs of the same size as belonging to a “size batch”. 
263 | We may then stack the feature and adjacency matrices 
264 | of graphs of the same size together, 
265 | and perform a batched matrix multiplication, 
266 | ensuring that we preserve the sample/batch dimension in the final result. 
267 | 
268 | ![](message-passing-figures/figure-message-passing-graph-size.png) 
269 | 
270 | In terms of Python code, this requires special preparation of the graphs. 
271 | 
272 | ```python linenums="1" 
273 | from collections import defaultdict 
274 | from jax.lax import batch_matmul 
275 | 
276 | def feature_matrix(G): 
277 |     # ... 
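    # (Elided in the original. As one hypothetical sketch, assuming each
    # node stores its features under a "value" key as in the
    # object-oriented example above, one could build:
    # F = np.stack([d["value"] for _, d in G.nodes(data=True)]))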
278 |     return F 
279 | 
280 | def prep_data(Gs: list): 
281 |     adjacency_matrices = defaultdict(list) 
282 |     feature_matrices = defaultdict(list) 
283 |     for G in Gs: 
284 |         size = len(G) 
285 |         F = feature_matrix(G) 
286 |         A = nx.to_numpy_array(G) + np.eye(size)  # identity adds self-loops; assumes networkx imported as nx 
287 |         adjacency_matrices[size].append(A) 
288 |         feature_matrices[size].append(F) 
289 | 
290 |     for size, As in adjacency_matrices.items(): 
291 |         adjacency_matrices[size] = np.stack(As) 
292 |     for size, Fs in feature_matrices.items(): 
293 |         feature_matrices[size] = np.stack(Fs) 
294 |     return adjacency_matrices, feature_matrices 
295 | 
296 | def message_passing(As, Fs): 
297 |     result = dict() 
298 |     for size in As.keys(): 
299 |         F = Fs[size] 
300 |         A = As[size] 
301 | 
302 |         result[size] = batch_matmul(A, F) 
303 |     return result 
304 | ``` 
305 | 
306 | In this implementation, 
307 | we use `jax.lax.batch_matmul`, 
308 | which inherently assumes 
309 | that the first dimension is the sample/batch dimension, 
310 | and that the matrix multiplication happens on the subsequent dimensions. 
311 | 
312 | An advantage here is that the number of loop overhead calls in Python 
313 | is reduced to the number of unique graph sizes present in the batch. 
314 | The disadvantage, though, 
315 | is that we have a dictionary data structure that we have to deal with, 
316 | which makes data handling in Python less natural 
317 | when dealing with linear algebra libraries. 
318 | 
319 | ### Implementation 4: Batched padded matrix multiplication 
320 | 
321 | In this implementation, 
322 | we prepare the data in a different way. 
323 | Firstly, we must know the size of the largest graph ahead of time. 
324 | 
325 | ```python linenums="1" 
326 | size = ...  # largest graph size 
327 | ``` 
328 | 
329 | We then pad every graph’s feature matrix with zeros along the node axis 
330 | until the node axis is as long as the largest graph size. 
331 | 
332 | ```python linenums="1" 
333 | def prep_feats(F, size): 
334 |     # F is of shape (n_nodes, n_feats) 
335 |     return np.pad( 
336 |         F, 
337 |         [ 
338 |             (0, size - F.shape[0]), 
339 |             (0, 0) 
340 |         ], 
341 |     ) 
342 | ``` 
343 | 
344 | We do the same with every adjacency matrix. 
345 | 
346 | ```python linenums="1" 
347 | def prep_adjs(A, size): 
348 |     # A is of shape (n_nodes, n_nodes) 
349 |     return np.pad( 
350 |         A, 
351 |         [ 
352 |             (0, size - A.shape[0]), 
353 |             (0, size - A.shape[0]), 
354 |         ], 
355 |     ) 
356 | ``` 
357 | 
358 | Finally, we simply stack them into the data matrix: 
359 | 
360 | ```python linenums="1" 
361 | As = np.stack([prep_adjs(A, size) for A in As]) 
362 | Fs = np.stack([prep_feats(F, size) for F in Fs]) 
363 | ``` 
364 | 
365 | Now, the shapes of our matrices are as follows: 
366 | 
367 | - `F` takes on the shape `(n_graphs, n_nodes, n_feats)` 
368 | - `A` takes on the shape `(n_graphs, n_nodes, n_nodes)` 
369 | 
370 | If we desire to be semantically consistent with our shapes, 
371 | then we might, by convention, 
372 | assign the first dimension to be the sample/batch dimension. 
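As a quick sanity check on those shapes, here is a minimal sketch run on two hypothetical toy graphs of sizes 2 and 3 (`As_raw` and `Fs_raw` here are made-up stand-ins for the per-graph matrices described above):

```python
import jax.numpy as np

# Two toy graphs: adjacency matrices (with self-loops) and
# node feature matrices with n_feats = 4.
As_raw = [np.eye(2), np.eye(3)]
Fs_raw = [np.ones((2, 4)), np.ones((3, 4))]

size = 3  # largest graph size
As = np.stack([prep_adjs(A, size) for A in As_raw])
Fs = np.stack([prep_feats(F, size) for F in Fs_raw])

print(As.shape, Fs.shape)  # (2, 3, 3) (2, 3, 4)
```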
373 | 
374 | Finally, message passing is now trivially defined as a batch matrix multiply: 
375 | 
376 | ```python linenums="1" 
377 | def message_passing(A, F): 
378 |     return batch_matmul(A, F) 
379 | ``` 
380 | 
381 | Visually, this is represented as follows: 
382 | 
383 | ![](message-passing-figures/figure-message-passing-batched.png) 
384 | 
385 | To this author’s best knowledge, 
386 | this should be the most efficient implementation of batched message passing 
387 | across multiple graphs 
388 | that also supports automatic differentiation, 
389 | while also maintaining parity with the written equation form, 
390 | hence preserving readability. 
391 | The problems associated with for-loops, 
392 | sparse matrix multiplication, 
393 | and dictionary bookkeeping 
394 | are removed. 
395 | Moreover, the sample/batch dimension is preserved, 
396 | hence it is semantically easy to map each graph 
397 | to its corresponding output value. 
398 | And because no sparse matrix products are involved, 
399 | no machinery beyond what current automatic differentiation libraries provide is necessary. 
400 | 
401 | The only disadvantage that this author can think of 
402 | is that zero-padding may not be intuitive at first glance, 
403 | and that the data must still be specially prepared and stacked first. 
404 | 
405 | ## Concluding Words 
406 | 
407 | This essay was initially motivated 
408 | by the myriad difficult-to-read message passing implementations 
409 | present in the deep learning literature. 
410 | Frequently, 
411 | a for-loop of some kind is invoked, 
412 | or an undocumented list data structure is created, 
413 | in order to accomplish the message passing operation. 
414 | Moreover, the model implementation 
415 | is frequently not separated from the data preparation step, 
416 | which makes for convoluted 
417 | and mutually incompatible implementations 
418 | of message passing in neural networks. 
419 | 
420 | It is my hope that while the research field is still in vogue, 
421 | a technical piece that advises researchers 
422 | on easily readable and efficient implementations 
423 | of message passing on graphs 
424 | may help advance research practice. 
425 | In particular, 
426 | if our code can more closely match the equations listed in papers, 
427 | that will help facilitate 
428 | communication and verification of model implementations. 
429 | 
430 | To help researchers get started, 
431 | an example implementation of the full data preparation 
432 | and batched padded matrix multiplies in JAX 
433 | is available on GitHub, 
434 | archived on Zenodo. 
435 | 
436 | ## Acknowledgments 
437 | 
438 | I thank Rif. A. Saurous 
439 | for our discussion at the PyMC4 developer summit in Montreal, QC, 
440 | where his laser-like focus on “tensorify everything” 
441 | inspired many new thoughts in my mind. 
442 | 
443 | Many thanks to my wife, Nan Li, 
444 | who first pointed me to the linear algebra equivalents of graphs. 
445 | 
446 | I also thank David Duvenaud and Matthew J. Johnson 
447 | for their pedagogy while they were at Harvard. 
448 | 
449 | ## Appendix 
450 | 
451 | ### Equivalence between padded and non-padded message passing 
452 | 
453 | To readers who may need an example to be convinced 
454 | that matrix multiplying the padded matrices 
455 | is equivalent to matrix multiplying the originals, 
456 | we show the Python example below. 
457 | 
458 | Firstly, without padding (here, `np` is `jax.numpy`, hence the `DeviceArray` outputs): 
459 | 
460 | ```python 
461 | F = np.array([[1, 0], [1, 1]]) 
462 | A = np.array([[1, 0], [0, 1]]) 
463 | M = np.dot(A, F) 
464 | 
465 | # Value of M 
466 | # DeviceArray([[1, 0], 
467 | #              [1, 1]], dtype=int32) 
468 | ``` 
469 | 
470 | And now, with padding: 
471 | 
472 | ```python 
473 | pad_size = 2 
474 | F_pad = np.pad( 
475 |     F, 
476 |     pad_width=[ 
477 |         (0, pad_size), 
478 |         (0, 0), 
479 |     ] 
480 | ) 
481 | A_pad = np.pad( 
482 |     A, 
483 |     pad_width=[ 
484 |         (0, pad_size), 
485 |         (0, pad_size), 
486 |     ] 
487 | ) 
488 | 
489 | # F_pad: 
490 | # DeviceArray([[1, 0], 
491 | #              [1, 1], 
492 | #              [0, 0], 
493 | #              [0, 0]], dtype=int32) 
494 | 
495 | # A_pad: 
496 | # DeviceArray([[1, 0, 0, 0], 
497 | #              [0, 1, 0, 0], 
498 | #              [0, 0, 0, 0], 
499 | #              [0, 0, 0, 0]], dtype=int32) 
500 | 
501 | M_pad = np.dot(A_pad, F_pad) 
502 | # M_pad: 
503 | # DeviceArray([[1, 0], 
504 | #              [1, 1], 
505 | #              [0, 0], 
506 | #              [0, 0]], dtype=int32) 
507 | ``` 
508 | 
509 | ## Thank you for reading! 
510 | 
511 | If you enjoyed this essay and would like to receive early-bird access to more, 
512 | [please support me on Patreon][patreon]! 
513 | A coffee a month sent my way gets you _early_ access to my essays 
514 | on a private URL exclusively for my supporters 
515 | as well as shoutouts on every single essay that I put out. 
516 | 
517 | [patreon]: https://patreon.com/ericmjl 
518 | 
519 | Also, I have a free monthly newsletter that I use as an outlet 
520 | to share programming-oriented data science tips and tools. 
521 | If you'd like to receive it, sign up on [TinyLetter][tinyletter]! 
522 | 
523 | [tinyletter]: https://tinyletter.com/ericmjl 
524 | 
--------------------------------------------------------------------------------
/docs/machine-learning/computational-bayesian-stats.md:
--------------------------------------------------------------------------------
1 | # An Introduction to Probability and Computational Bayesian Statistics 
2 | 
3 | In Bayesian statistics, 
4 | we often say that we are "sampling" from a posterior distribution 
5 | to estimate what parameters could be, 
6 | given a model structure and data. 
7 | What exactly is happening here? 
8 | 
9 | Examples that I have seen on "how sampling happens" 
10 | tend to focus on an overly simple example 
11 | of sampling from a single distribution with known parameters. 
12 | I was wondering if I could challenge myself 
13 | to come up with a "simplest complex example" 
14 | that would illuminate ideas that were obscure to me before. 
15 | In this essay, I would like to share that knowledge with you, 
16 | and hopefully build up your intuition behind 
17 | what is happening in computational Bayesian inference. 
18 | 
19 | ## Probability Distributions 
20 | 
21 | We do need to have a working understanding 
22 | of what a probability distribution is before we can go on. 
23 | Without going down deep technical and philosophical rabbit holes 
24 | (I hear they are deep), 
25 | I'll start by proposing 
26 | that "a probability distribution is a Python object 
27 | that has a math function 
28 | that allocates credibility points onto the number line". 
29 | 
30 | Because we'll be using the normal distribution extensively in this essay, 
31 | we'll start off by examining that definition 
32 | in the context of the standard normal distribution. 
33 | 
34 | ### Base Object Implementation 
35 | 
36 | Since the normal distribution is an object, 
37 | I'm implying here that it can hold state. 
38 | What might that state be? 
39 | Well, we know from math that probability distributions have parameters, 
40 | and that the normal distribution 
41 | has the "mean" and "variance" parameters defined. 
42 | In Python code, we might write it as: 
43 | 
44 | ```python 
45 | class Normal: 
46 |     def __init__(self, mu, sigma): 
47 |         self.mu = mu 
48 |         self.sigma = sigma 
49 | ``` 
50 | 
51 | ### Probability Density Function 
52 | 
53 | Now, I also stated that the normal distribution has a math function 
54 | that we can use to allocate credibility points to the number line. 
55 | This function also has a name: 
56 | the "probability density function", or the "PDF". 
57 | Using this, we may then extend this object 
58 | with a method called `.pdf(x)` 
59 | that returns a number 
60 | giving the number of credibility points 
61 | assigned to the value of `x` passed in. 
62 | 
63 | ```python 
64 | import numpy as np 
65 | 
66 | class Normal: 
67 |     def __init__(self, mu, sigma): 
68 |         self.mu = mu 
69 |         self.sigma = sigma 
70 | 
71 |     def pdf(self, x): 
72 |         return ( 
73 |             1 / np.sqrt(2 * self.sigma ** 2 * np.pi) 
74 |             * np.exp( 
75 |                 - (x - self.mu) ** 2 
76 |                 / (2 * self.sigma ** 2) 
77 |             )) 
78 | ``` 
79 | 
80 | If we pass in a number `x` from the number line, 
81 | we will get back another number that tells us 
82 | the number of credibility points given to that value `x`, 
83 | under the state of the normal distribution instantiated. 
84 | We'll call this $P(x)$. 
85 | 
86 | To simplify the implementation used here, 
87 | we are going to borrow some machinery already available to us 
88 | in the Python scientific computing ecosystem, 
89 | particularly from the SciPy stats module, 
90 | which gives us reference implementations of probability distributions. 
91 | 
92 | ```python 
93 | from scipy.stats import norm 
94 | 
95 | class Normal: 
96 |     def __init__(self, mu, sigma): 
97 |         self.mu = mu 
98 |         self.sigma = sigma 
99 | 
100 |         # We instantiate the distribution object here. 
101 |         self.dist = norm(loc=mu, scale=sigma) 
102 | 
103 |     def pdf(self, x): 
104 |         # Now, our PDF class method is simplified to be just a wrapper. 
105 |         return self.dist.pdf(x) 
106 | ``` 
107 | 
108 | ### Log Probability 
109 | 
110 | A common task in Bayesian inference is computing the likelihood of data. 
111 | Let's assume that the data $\{X_1, X_2, ..., X_i\}$ generated 
112 | are independent and identically distributed 
113 | (the famous _i.i.d._ term comes from this). 
114 | This means, then, that the joint probability of the data that was generated 
115 | is equivalent to the product of the individual probabilities of each datum: 
116 | 
117 | $$P(X_1, X_2, ... X_i) = P(X_1) P(X_2) ... P(X_i)$$ 
118 | 
119 | (We have to know the rules of probability to know this result; 
120 | it is a topic for a different essay.) 
121 | 
122 | If you remember the notation above, 
123 | each $P(X_i)$ is an evaluation of $X_i$ 
124 | on the distribution's probability density function. 
125 | These density values are non-negative, and for typical data points they are small numbers. 
126 | Multiplying many of them together 
127 | usually will result in issues with underflow computationally, 
128 | so in evaluating likelihoods, 
129 | we usually stick with log-likelihoods instead. 
130 | By the usual rules of math, then: 
131 | 
132 | $$\log P(X_1, X_2, ..., X_i) = \sum_{j=1}^{i}\log P(X_j)$$ 
133 | 
134 | To our normal distribution class, 
135 | we can now add in another class method 
136 | that computes the sum of log likelihoods 
137 | evaluated at a bunch of i.i.d. data points. 
138 | 139 | ```python 140 | from scipy.stats import norm 141 | 142 | class Normal: 143 | def __init__(self, mu, sigma): 144 | self.mu = mu 145 | self.sigma = sigma 146 | 147 | # We instantiate the distribution object here. 148 | self.dist = norm(loc=mu, scale=sigma) 149 | 150 | def pdf(self, x): 151 | # Now, our PDF class method is simplified to be just a wrapper. 152 | return self.dist.pdf(x) 153 | 154 | def logpdf(self, x): 155 | return self.dist.logpdf(x) 156 | ``` 157 | 158 | ## Random Variables 159 | 160 | ### Definition 161 | 162 | Informally, a "random variable" is nothing more than 163 | a variable whose quantity is non-deterministic (hence random) 164 | but whose probability of taking on a certain value 165 | can be described by a probability distribution. 166 | 167 | According to the Wikipedia definition of a [random variable][rv]: 168 | 169 | > A random variable has a probability distribution, which specifies the probability of its values. 170 | 171 | [rv]: https://en.wikipedia.org/wiki/Random_variable 172 | 173 | As such, it may be tempting to conceive of a random variable 174 | as an object that has a probability distribution attribute attached to it. 175 | 176 | ### Realizations of a Random Variable 177 | 178 | On the other hand, it can also be convenient to invert that relationship, 179 | and claim that a probability distribution 180 | can generate realizations of a random variable. 181 | The latter is exactly how SciPy distributions are implemented: 182 | 183 | ```python 184 | from scipy.stats import norm 185 | 186 | # Normal distribution can generate realizations of an RV 187 | # The following returns a NumPy array of 10 draws 188 | # from a standard normal distribution. 189 | norm(loc=0, scale=1).rvs(10) 190 | ``` 191 | 192 | ??? note "Realizations of a Random Variable" 193 | 194 | A "realization" of a random variable is nothing more than 195 | generating a random number 196 | whose probability of being generated 197 | is defined by the random variable's probability density function. 198 | 199 | Because the generation of realizations of a random variable 200 | is equivalent to sampling from a probability distribution, 201 | we can extend our probability distribution definition 202 | to include a `.sample(n)` method: 203 | 204 | ```python 205 | from scipy.stats import norm 206 | 207 | class Normal: 208 | def __init__(self, mu, sigma): 209 | self.mu = mu 210 | self.sigma = sigma 211 | 212 | # We instantiate the distribution object here. 213 | self.dist = norm(loc=mu, scale=sigma) 214 | 215 | # ... 216 | 217 | def sample(self, n): 218 | return self.dist.rvs(n) 219 | ``` 220 | 221 | Now, if we draw 10 realizations of a normally distributed random variable, 222 | and the drawing of each realization has no dependence of any kind 223 | on the previous draw, 224 | then we can claim that each draw is **independent** 225 | and **identically distributed**. 226 | This is where the fabled "_iid_" term in undergraduate statistics classes 227 | comes from. 228 | 229 | ## Data Generating Process 230 | 231 | Now that we have covered what probability distributions are, 232 | we can now move on to other concepts 233 | that are important in Bayesian statistical modelling. 234 | 235 | Realizations of a random variable, 236 | or draws from its probability distribution, 237 | are how a Bayesian assumes data are generated. 
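In code, using the `Normal` class built above, a hypothetical data generating process could be as simple as the following sketch (the choice of `mu=3, sigma=1` is an illustrative assumption):

```python
# Assume a "true" process, then generate observed data from it.
true_process = Normal(mu=3, sigma=1)
y = true_process.sample(1000)  # 1,000 i.i.d. realizations
```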
238 | Describing how data are generated using probability distributions, 
239 | or in other words, writing down the "data generating process", 
240 | is a core activity in Bayesian statistical modelling. 
241 | 
242 | Viewed this way, data values generated by a random process 
243 | depend on the underlying random variable's probability distribution. 
244 | In other words, the random variable realizations are known, 
245 | given the probability distribution used to model it. 
246 | Keep this idea in mind: 
247 | it is going to be important shortly. 
248 | 
249 | ## Bayes' Rule 
250 | 
251 | Now that we've covered probability distributions, 
252 | we can move on to Bayes' rule. 
253 | You probably have seen the following equation: 
254 | 
255 | $$P(B|A) = \frac{P(A|B)P(B)}{P(A)}$$ 
256 | 
257 | Bayes' rule states nothing more than the fact that 
258 | the conditional probability of B given A is equal to 
259 | the conditional probability of A given B 
260 | times the probability of B 
261 | divided by the probability of A. 
262 | 
263 | When doing Bayesian statistical inference, 
264 | we commonly take a related but distinct interpretation: 
265 | 
266 | $$P(H|D) = \frac{P(D|H)P(H)}{P(D)}$$ 
267 | 
268 | It may look weird, 
269 | but didn't we say before that data are realizations from a random variable? 
270 | Why are we now treating data as a random variable? 
271 | Here, we are taking the not-so-intuitive but technically correct step 
272 | of treating the data $D$ as being part of this probabilistic model 
273 | (hence it "looks" like a random variable), 
274 | alongside our model parameters $H$. 
275 | There's a lot of measure theory that goes into this interpretation, 
276 | which at this point I have not yet mastered, 
277 | and so I will wave my hands in great arcs 
278 | and propose that this interpretation be accepted for now and move on. 
279 | 
280 | ??? note "Data are random variables?" 
281 | 
282 |     Notes from a chat with Colin gave me a lot to chew on, as usual: 
283 | 
284 |     > The answer is in how you define "event" as 
285 |     > "an element of a sigma algebra". 
286 |     > Intuitively, an "event" is just an abstraction, 
287 |     > so one event might be "the coin is heads", 
288 |     > or in another context the event might be 
289 |     > "the parameters are [0.2, 0.1, 0.2]". 
290 |     > And so analogously, "the data were configured as [0, 5, 2, 3]". 
291 |     > Notice also that the events are different 
292 |     > if the data being ordered vs unordered are different! 
293 | 
294 |     This was a logical leap that I had been asked about before, 
295 |     but did not previously have the knowledge to respond to. 
296 |     Thanks to Colin, I now do. 
297 | 
298 | 
299 | [colin]: https://colindcarroll.com/ 
300 | 
301 | With the data + hypothesis interpretation of Bayes' rule in hand, 
302 | the next question arises: 
303 | What math happens when we calculate posterior densities? 
304 | 
305 | ## Translating Bayes' Math to Python 
306 | 
307 | ### Defining Posterior Log-Likelihood 
308 | 
309 | To understand this, let's look at the simplest complex example 
310 | that I could think of: 
311 | estimating the $\mu$ and $\sigma$ parameters 
312 | of a normal distribution 
313 | conditioned on observing data points $y$. 
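One more piece of scaffolding before we dive in: the model below places an $Exponential(1)$ prior on $\sigma$, so we will want an `Exponential` distribution class alongside `Normal`. Here is a minimal sketch in the same style, wrapping `scipy.stats.expon` (note that SciPy parameterizes the exponential distribution by `scale` $= 1/\lambda$):

```python
from scipy.stats import expon

class Exponential:
    def __init__(self, lam):
        self.lam = lam

        # SciPy's exponential distribution is parameterized
        # by scale = 1 / lambda.
        self.dist = expon(scale=1 / lam)

    def pdf(self, x):
        return self.dist.pdf(x)

    def logpdf(self, x):
        return self.dist.logpdf(x)

    def sample(self, n):
        return self.dist.rvs(n)
```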
314 | 
315 | If we assume a data generating process that looks like the following 
316 | (with no probability distributions specified yet): 
317 | 
318 | ```mermaid 
319 | graph TD; 
320 |     μ((μ)) --> y(y); 
321 |     σ((σ)) --> y(y); 
322 | ``` 
323 | 
324 | We can write out the following probabilistic model 
325 | (now explicitly specifying probability distributions): 
326 | 
327 | $$\mu \sim Normal(0, 10)$$ 
328 | 
329 | $$\sigma \sim Exponential(1)$$ 
330 | 
331 | $$y \sim Normal(\mu, \sigma)$$ 
332 | 
333 | Let's now map the symbols onto Bayes' rule. 
334 | 
335 | - $H$ are the parameters, which are $\mu$ and $\sigma$ here. 
336 | - $D$ is the data that I will observe. 
337 | - $P(H|D)$ is the posterior, which we would like to compute. 
338 | - $P(D|H)$ is the likelihood, 
339 |   and is given by $y$'s probability distribution $Normal(\mu, \sigma)$, 
340 |   or in probability notation, $P(y|\mu, \sigma)$. 
341 | - $P(H)$ is the prior, and is given by $P(\mu, \sigma)$. 
342 | - $P(D)$ is a hard quantity to calculate, so we sort of cheat and don't use it, 
343 |   and merely claim that the posterior is proportional to the likelihood times the prior. 
344 | 
345 | If we look at the probability symbols again, 
346 | we should notice that $P(\mu, \sigma)$ 
347 | is the joint distribution between $\mu$ and $\sigma$. 
348 | However, from observing the graphical diagram, 
349 | we'll notice that $\mu$ and $\sigma$ have no bearing on one another: 
350 | we do not need to know $\mu$ to know the value of $\sigma$, 
351 | and vice versa. 
352 | Hence, they are independent of one another, 
353 | and so by the rules of probability, 
354 | 
355 | $$P(\mu, \sigma) = P(\mu | \sigma)P(\sigma) = P(\mu)P(\sigma) = P(H)$$ 
356 | 
357 | Now, by simply moving symbols around: 
358 | 
359 | $$P(H|D) \propto P(D|H)P(H)$$ 
360 | 
361 | $$ = P(y|\mu,\sigma)P(\mu, \sigma)$$ 
362 | 
363 | $$ = P(y|\mu, \sigma)P(\mu)P(\sigma)$$ 
364 | 
365 | This translates directly into Python code! 
366 | 
367 | ```python 
368 | def model_prob(mu, sigma, y): 
369 |     # Probability of mu under prior. 
370 |     normal_prior = Normal(0, 10) 
371 |     mu_prob = normal_prior.pdf(mu) 
372 | 
373 |     # Probability of sigma under prior. 
374 |     sigma_prior = Exponential(1) 
375 |     sigma_prob = sigma_prior.pdf(sigma) 
376 | 
377 |     # Likelihood of data given mu and sigma 
378 |     likelihood = Normal(mu, sigma) 
379 |     likelihood_prob = likelihood.pdf(y).prod() 
380 | 
381 |     # Joint likelihood 
382 |     return mu_prob * sigma_prob * likelihood_prob 
383 | ``` 
384 | 
385 | If you remember, multiplying so many probabilities together 
386 | can give us underflow issues when computing, 
387 | so it is common to take the log of both sides (up to an additive constant): 
388 | 
389 | $$\log P(H|D) = \log P(y|\mu, \sigma) + \log P(\mu) + \log P(\sigma) + \mathrm{const.}$$ 
390 | 
391 | This also translates directly into Python code! 
392 | 
393 | ```python 
394 | def model_log_prob(mu, sigma, y): 
395 |     # log-probability of mu under prior. 
396 |     normal_prior = Normal(0, 10) 
397 |     mu_log_prob = normal_prior.logpdf(mu) 
398 | 
399 |     # log-probability of sigma under prior. 
400 |     sigma_prior = Exponential(1) 
401 |     sigma_log_prob = sigma_prior.logpdf(sigma) 
402 | 
403 |     # log-likelihood given priors and data 
404 |     likelihood = Normal(mu, sigma) 
405 |     likelihood_log_prob = likelihood.logpdf(y).sum() 
406 | 
407 |     # Joint log-likelihood 
408 |     return mu_log_prob + sigma_log_prob + likelihood_log_prob 
409 | ``` 
410 | 
411 | ## Computing the Posterior with Sampling 
412 | 
413 | To identify what values $\mu$ and $\sigma$ 
414 | should take on given the data and priors, 
415 | we can turn to sampling to help us. 
416 | I am intentionally skipping over the integrals 
417 | which are used to compute expectations, 
418 | and which are what sampling replaces. 
419 | 
420 | ### Metropolis-Hastings Sampling 
421 | 
422 | An easy-to-understand sampler that we can start with 
423 | is the Metropolis-Hastings sampler. 
424 | I first learned it in a grad-level computational biology class, 
425 | but I expect most statistics undergrads to have 
426 | a good working knowledge of the algorithm. 
427 | 
428 | For the rest of us, check out the note below on how the algorithm works. 
429 | 
430 | ???+ note "The Metropolis-Hastings Algorithm" 
431 | 
432 |     Shamelessly copied (and modified) 
433 |     from the [Wikipedia article][mh]: 
434 | 
435 |     - For each parameter $p$, do the following. 
436 |     - Initialize an arbitrary point for the parameter (this is $p_t$, or $p$ at step $t$). 
437 |     - Define a probability density $P(p_t)$, from which we will draw new values of the parameters. Here, we will use $P(p) = Normal(p_{t-1}, 1)$. 
438 |     - For each iteration: 
439 |         - Generate a new candidate $p_t$ drawn from $P(p_t)$. 
440 |         - Calculate the likelihood of the data under the previous parameter value(s) $p_{t-1}$: $L(p_{t-1})$ 
441 |         - Calculate the likelihood of the data under the proposed parameter value(s) $p_t$: $L(p_t)$ 
442 |         - Calculate the acceptance ratio $r = \frac{L(p_t)}{L(p_{t-1})}$. 
443 |         - Generate a new random number on the unit interval: $s \sim U(0, 1)$. 
444 |         - Compare $s$ to $r$. 
445 |             - If $s \leq r$, accept $p_t$. 
446 |             - If $s \gt r$, reject $p_t$ and continue sampling again with $p_{t-1}$. 
447 | 
448 |     [mh]: https://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm 
449 | 
450 | In the algorithm described in the note above, 
451 | our parameters $p$ are actually $(\mu, \sigma)$. 
452 | This means that we have to propose two numbers, 
453 | and accept or reject them together, in each loop of the sampler. 
454 | 
455 | To make things simple for us, let's use a normal distribution 
456 | centered on the previous value, with scale $0.1$, 
457 | to propose new values for each. 
458 | 
459 | We can implement the algorithm in Python code: 
460 | 
461 | ```python linenums="1" 
462 | # Metropolis-Hastings sampling (assumes `import numpy as np` and data `y`). 
463 | mu_prev = np.random.normal() 
464 | sigma_prev = np.random.normal() 
465 | 
466 | # Keep a history of the parameter values and ratio. 
467 | mu_history = dict() 
468 | sigma_history = dict() 
469 | ratio_history = dict() 
470 | 
471 | for i in range(1000): 
472 |     mu_history[i] = mu_prev 
473 |     sigma_history[i] = sigma_prev 
474 |     mu_t = np.random.normal(mu_prev, 0.1) 
475 |     sigma_t = np.random.normal(sigma_prev, 0.1) 
476 | 
477 |     # Compute joint log likelihood 
478 |     LL_t = model_log_prob(mu_t, sigma_t, y) 
479 |     LL_prev = model_log_prob(mu_prev, sigma_prev, y) 
480 | 
481 |     # Calculate the difference in log-likelihoods 
482 |     # (a.k.a. the ratio of likelihoods) 
483 |     diff_log_like = LL_t - LL_prev 
484 |     if diff_log_like > 0: 
485 |         ratio = 1 
486 |     else: 
487 |         # We need to exponentiate to get the correct ratio, 
488 |         # since all of our calculations were in log-space 
489 |         ratio = np.exp(diff_log_like) 
490 | 
491 |     # Defensive programming check 
492 |     if np.isinf(ratio) or np.isnan(ratio): 
493 |         raise ValueError(f"LL_t: {LL_t}, LL_prev: {LL_prev}") 
494 | 
495 |     # Ratio comparison step 
496 |     ratio_history[i] = ratio 
497 |     p = np.random.uniform(0, 1) 
498 | 
499 |     if ratio >= p: 
500 |         mu_prev = mu_t 
501 |         sigma_prev = sigma_t 
502 | ``` 
503 | 
504 | Because of a desire for convenience, 
505 | we chose to use a single normal distribution to sample all values. 
Because of a desire for convenience,
we chose to use a single normal distribution to propose all values.
However, that distribution choice is going to bite us during sampling,
because the values that we propose for the $\sigma$ parameter
can be negative,
and when a negative $\sigma$ is passed
into the normally-distributed likelihood,
we are going to get computation errors!
This is because the scale parameter of a normal distribution
can only be positive; it cannot be negative or zero.
(If it were zero, there would be no randomness.)

### Transformations as a Hack

The key problem here is that the support of the Exponential distribution
is bounded to the positive real numbers only.
That said, we can get around this problem
simply by sampling in the unbounded real number space $(-\infty, +\infty)$,
and then transforming the sampled number by a math function
into the bounded space.

One way we can transform numbers from an unbounded space
to a positive-bounded space
is to use the exponential transform:

$$y = e^x$$

For any given value $x$, $y$ is guaranteed to be positive.

Knowing this, we can modify our sampling code
so that $\sigma$ is proposed in the unbounded space
and transformed before the log-likelihood is evaluated:

```python
# Initialize in unconstrained space.
sigma_prev_unbounded = np.random.normal(0, 1)
# ...
for i in range(1000):
    # ...
    # Propose in unconstrained space.
    sigma_t_unbounded = np.random.normal(sigma_prev_unbounded, 0.1)

    # Transform the sampled values into the constrained space.
    sigma_prev = np.exp(sigma_prev_unbounded)
    sigma_t = np.exp(sigma_t_unbounded)

    # ...

    # Pass the transformed values into the log-likelihood calculation.
    LL_t = model_log_prob(mu_t, sigma_t, y)
    LL_prev = model_log_prob(mu_prev, sigma_prev, y)

    # ...
```

And _voila_!
If you notice, the key trick here was
to **sample in unbounded space**,
but **evaluate the log-likelihood in bounded space**.
We call the "unbounded" space the _transformed_ space,
while the "bounded" space is the _original_ or _untransformed_ space.
(Strictly speaking, a change of variables like this
also calls for a Jacobian correction term in the log probability;
libraries such as PyMC3 apply it automatically,
but we will gloss over it here to keep the example simple.)
We have now implemented the necessary components
to compute posterior distributions over parameters!

### Samples from the Posterior

If we simulate 1000 data points from a $Normal(3, 1)$ distribution,
and pass them into the model log probability function defined above,
then after running the sampler,
we get a chain of values that the sampler has picked out
from regions of high joint probability of the data and the model.
This, by the way, is essentially the simplest version of
Markov Chain Monte Carlo (MCMC) sampling that exists
in modern computational Bayesian statistics.

Let's examine the trace from one run:

![](./comp-bayes-figures/mcmc-trace.png)

Notice how it takes about 200 steps before the trace becomes **stationary**,
that is, before it settles into a flat trend-line.
If we prune the trace to just the values after the 200th iteration
(the pruned portion is commonly called the "burn-in"),
we get the following trace:

![](./comp-bayes-figures/mcmc-trace-burn-in.png)

The samples drawn are an approximation to
the posterior distributions of $\mu$ and $\sigma$
given the data and priors specified.
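For reference, here is a sketch of how the pieces above
could be assembled into one end-to-end run.
The random seed, step count, and proposal scale are illustrative choices,
`model_log_prob` is the function defined earlier,
and the Jacobian correction is omitted, as discussed above:

```python
import numpy as np
from scipy import stats

# Simulate 1000 observations from Normal(3, 1).
y = stats.norm(3, 1).rvs(1000, random_state=42)

# Initialize mu directly, and sigma in the unconstrained space.
mu_prev = np.random.normal()
sigma_prev_unbounded = np.random.normal()

mu_history, sigma_history = dict(), dict()
for i in range(1000):
    # Propose new values centered on the previous ones.
    mu_t = np.random.normal(mu_prev, 0.1)
    sigma_t_unbounded = np.random.normal(sigma_prev_unbounded, 0.1)

    # Evaluate the log-likelihood in the constrained (original) space.
    LL_t = model_log_prob(mu_t, np.exp(sigma_t_unbounded), y)
    LL_prev = model_log_prob(mu_prev, np.exp(sigma_prev_unbounded), y)

    # Accept with probability min(1, exp(LL_t - LL_prev)).
    ratio = np.exp(min(0.0, LL_t - LL_prev))
    if np.random.uniform(0, 1) <= ratio:
        mu_prev = mu_t
        sigma_prev_unbounded = sigma_t_unbounded

    # Record sigma in its original (constrained) space.
    mu_history[i] = mu_prev
    sigma_history[i] = np.exp(sigma_prev_unbounded)
```

Plotting `mu_history` and `sigma_history` against the iteration index
reproduces trace plots like the ones shown above.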
???+ note "Random Variables and Sampling"

    A piece of wisdom directly quoted from my friend [Colin Carroll][colin],
    who is also a PyMC developer:

    > Random variables are *measures*,
    > and measures are only really defined under an integral sign.
    > *Sampling* is usually defined as the act of generating data
    > according to a certain measure.
    > This is confusing, because we invert this relationship
    > when we do computational statistics:
    > we generate the data,
    > and use that to approximate an integral or expectation.

## Topics We Skipped Over

We intentionally skipped over a number of topics.

One of them was why we used a normal distribution with a scale of 0.1
to propose new values, rather than some other scale.
As it turns out, the scale parameter is a tunable hyperparameter,
and PyMC3 performs this kind of tuning as well.
If you want to learn more about how tuning happens,
[Colin][colin] has a [great essay][tuning] on that too.

[tuning]: https://colcarroll.github.io/hmc_tuning_talk/

We also skipped over API design,
as that is a topic I will be exploring in a separate essay.
It will also serve as a tour through the PyMC3 API
as I understand it.

## An Anchoring Thought Framework for Learning Computational Bayes

Going through this exercise
has been extremely helpful in deciphering
what goes on behind the scenes in PyMC3
(and the in-development PyMC4,
which is built on top of TensorFlow Probability).

From digging through everything from scratch,
my thought framework for Bayesian modelling
has been updated (pun intended) to the following.

Firstly, we can view a Bayesian model
along the axes of **prior, likelihood, and posterior**.
Bayes' rule provides us the equation "glue"
that links those three components together.

Secondly, when doing _computational_ Bayesian statistics,
we should be able to modularly separate **sampling**
from **model definition**.
**Sampling** is computing the posterior distribution of the parameters
given the model and data.
**Model definition**, by contrast,
is all about providing the model structure
as well as a function that calculates the joint log likelihood
of the model and data.

In fact, based on the exercise above,
any "sampler" is only concerned with the model log probability
(though some also require the local gradient of the log probability
w.r.t. the parameters to find where to climb next),
and should only be required to accept a **model log probability** function
and a proposed set of initial parameter values,
and return a chain of sampled values.
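To make that interface concrete,
here is a sketch of what such a modular sampler could look like.
This is my own illustration of the idea,
not PyMC3's actual API,
and all of the names in it are invented:

```python
from typing import Callable, Dict

import numpy as np


def metropolis_sample(
    model_log_prob: Callable[..., float],
    initial_params: Dict[str, float],
    n_steps: int = 1000,
    proposal_scale: float = 0.1,
) -> Dict[str, np.ndarray]:
    """Sample from a model, given only its log-probability function.

    NOTE: an invented, illustrative API -- not from any library.
    """
    params = dict(initial_params)
    chain = {name: [] for name in params}
    ll_prev = model_log_prob(**params)

    for _ in range(n_steps):
        # Propose a new value for every parameter.
        proposal = {
            name: np.random.normal(value, proposal_scale)
            for name, value in params.items()
        }
        ll_t = model_log_prob(**proposal)

        # Accept with probability min(1, exp(ll_t - ll_prev)).
        if np.random.uniform() <= np.exp(min(0.0, ll_t - ll_prev)):
            params, ll_prev = proposal, ll_t

        for name, value in params.items():
            chain[name].append(value)

    return {name: np.array(values) for name, values in chain.items()}
```

Notice that the sampler knows nothing about priors or likelihoods:
the model enters only through the log-probability function it is handed,
which is exactly the modular separation described above.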
Finally, I hope the "simplest complex example"
of estimating the $\mu$ and $\sigma$ of a normal distribution
helps further your understanding of the math behind Bayesian statistics.

All in all, I hope this essay helps your learning, as writing it did for me!

## Thank you for reading!

If you enjoyed this essay and would like to receive early-bird access to more,
[please support me on Patreon][patreon]!
A coffee a month sent my way gets you _early_ access to my essays
on a private URL exclusively for my supporters,
as well as shoutouts on every single essay that I put out.

[patreon]: https://patreon.com/ericmjl

Also, I have a free monthly newsletter that I use as an outlet
to share programming-oriented data science tips and tools.
If you'd like to receive it, sign up on [TinyLetter][tinyletter]!

[tinyletter]: https://tinyletter.com/ericmjl