├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── data └── matrices.pkl ├── docs ├── README.md ├── figures │ ├── Design_by_contract.svg │ ├── Design_by_contract.svg.png │ ├── argparse.PNG │ ├── cka_example.png │ ├── dashboard.PNG │ ├── final.doc.gif │ ├── invariance_to_ortho.PNG │ ├── lifecycle_complex │ ├── lifecycle_complex.pdf │ ├── lifecycle_complex.png │ ├── lifecycle_simple │ ├── lifecycle_simple.pdf │ ├── lifecycle_simple.png │ ├── mary-kondo.jpg │ ├── mineault_et_al.png │ ├── nma.png │ ├── pcbi.1007358.g002.PNG_L.png │ ├── readme.PNG │ ├── reproducible_research │ ├── reproducible_research.pdf │ ├── reproducible_research.png │ ├── reversi.PNG │ ├── shablona.png │ ├── spaghetti-code.png │ ├── testing-trophy.png │ ├── tweet.png │ ├── wave_clus.png │ ├── wizard.png │ └── wm-federenko.png ├── notes │ └── how_packages_work.md ├── notion-notes.md └── slides │ ├── 01-intro.md │ ├── 02-decouple.md │ ├── 03-testing.md │ ├── 04-docs.md │ ├── 05-social.md │ ├── 99-standalone-testing.md │ ├── Makefile │ ├── README.md │ ├── pdf │ ├── 01-intro.pdf │ ├── 02-decouple.pdf │ ├── 03-testing.pdf │ ├── 04-docs.pdf │ ├── 05-social.pdf │ └── 99-standalone-testing.pdf │ └── preamble.tex ├── requirements.txt ├── research_code ├── __init__.py ├── cka_not_great.py ├── cka_step2.py ├── cka_step3.py ├── cka_step4.py ├── fib.py ├── fib_and_test.py ├── fib_monolithic.py └── tests │ ├── test_cka_step3.py │ ├── test_cka_step4.py │ └── test_fib.py ├── results ├── closeness.png └── closeness_sns.png ├── scripts ├── Draw dependency graph.ipynb └── Generate CKA matrices.ipynb └── setup.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Run Python Tests 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Install Python 3 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -r requirements.txt 23 | pip install -e . 24 | - name: Lint with pylint 25 | run: | 26 | # From https://medium.com/swlh/automate-python-testing-with-github-actions-7926b5d8a865 27 | pip install flake8 28 | # stop the build if there are Python syntax errors or undefined names 29 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | - name: Run tests, measure coverage with nose 33 | run: | 34 | pip install nose2 coverage 35 | nose2 --with-coverage --coverage research_code -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # VSCode 132 | .vscode/ 133 | data/cifar 134 | data/svhn -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Patrick Mineault 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Writing good research code 2 | 3 | This repo contains the slides and code for a presentation on writing research software I first gave in January 2021 to the PhD students in neuro at Harvard. It's a compendium of 5 lessons I learned the hard way about writing research code that won't bite back. 4 | 5 | * [The slides are here](https://github.com/patrickmineault/research_code/tree/main/docs/slides) 6 | * The rest of the repo contains supporting code in the format advocated in the first lesson. 7 | * You can see the full presentation recorded at [NMA 2021](https://www.crowdcast.io/e/nma2021/29) and a short version focused on testing recorded at [Brainhack MTL 2021](https://www.youtube.com/watch?v=gfPP2pQ8Rms&feature=youtu.be&ab_channel=OHBMOpenScienceSIG). 8 | 9 | For the book version of these slides, see [goodresearch.dev](https://goodresearch.dev/). 10 | 11 | ## Organization 12 | 13 | This repo follows the organization of [shablona](https://github.com/uwescience/shablona). All the code and tests are under `research_code`. `research_code` is itself a Python package. 14 | 15 | For the package, we use the same setup as this tutorial on [setuptools](https://python-packaging-user-guide.readthedocs.io/tutorials/packaging-projects/), and is compatible with it - this repo is publishable to PyPI directly! 
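If you are curious about what makes the `research_code` folder importable as a package, the key ingredient is the `setup.py` at the repo root. Below is a minimal sketch of such a file; the metadata values are illustrative placeholders, not a copy of this repo's actual `setup.py`.

```{python}
# Minimal sketch of a setup.py that makes the research_code folder installable.
# The name and version below are placeholders; see setup.py in this repo for
# the real values.
from setuptools import find_packages, setup

setup(
    name="research_code",
    version="0.0.1",
    packages=find_packages(),  # picks up research_code/ via its __init__.py
)
```

See `docs/notes/how_packages_work.md` for a longer explanation of how packaging and `pip install -e .` interact with Python's import path.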
16 | 17 | ## To install the package locally in development mode 18 | 19 | `cd` into this directory, then run: 20 | 21 | ``` 22 | pip install -e . 23 | ``` 24 | 25 | In Python: 26 | 27 | ```{python} 28 | import research_code 29 | ``` 30 | 31 | ## To test 32 | 33 | `cd` into the `research_code/tests` directory, then run each file individually, or run `nose2`. 34 | 35 | ## CI 36 | 37 | While shablona recommended the use of Jenkins for continuous integration (CI), we showcase instead Github actions, which don't require additional accounts/software. The workflow, which runs tests, is located in `.github/workflows/ci.yml`. 38 | 39 | -------------------------------------------------------------------------------- /data/matrices.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/data/matrices.pkl -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Docs 2 | 3 | This contains the documentation (including what I used to create slides) for my tutorial on writing good research software good. The good stuff is under the slides directory. -------------------------------------------------------------------------------- /docs/figures/Design_by_contract.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/Design_by_contract.svg.png -------------------------------------------------------------------------------- /docs/figures/argparse.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/argparse.PNG -------------------------------------------------------------------------------- /docs/figures/cka_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/cka_example.png -------------------------------------------------------------------------------- /docs/figures/dashboard.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/dashboard.PNG -------------------------------------------------------------------------------- /docs/figures/final.doc.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/final.doc.gif -------------------------------------------------------------------------------- /docs/figures/invariance_to_ortho.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/invariance_to_ortho.PNG -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex: -------------------------------------------------------------------------------- 1 | digraph { 2 | D [label="Create data"] 3 | T 
[label="Transform data"] 4 | F [label="Fit models"] 5 | H [label="Test Hypotheses"] 6 | P [label="Generate plots"] 7 | W [label="Write and publish paper"] 8 | B [label="Publish data"] 9 | C [label="Publish code"] 10 | D -> T [label=""] 11 | T -> F [label=""] 12 | F -> H [label=""] 13 | H -> D [label=""] 14 | H -> P [label=""] 15 | H -> T [label=""] 16 | H -> F [label=""] 17 | P -> W [label=""] 18 | D -> P [label=""] 19 | P -> T [label=""] 20 | P -> F [label=""] 21 | D -> B [label=""] 22 | W -> B [label=""] 23 | W -> C [label=""] 24 | F -> C [label=""] 25 | } 26 | -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_complex.pdf -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_complex.png -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple: -------------------------------------------------------------------------------- 1 | digraph { 2 | D [label="Create data"] 3 | T [label="Transform data"] 4 | F [label="Fit models"] 5 | H [label="Test Hypotheses"] 6 | P [label="Generate plots"] 7 | W [label="Write and publish paper"] 8 | D -> T [label=""] 9 | T -> F [label=""] 10 | F -> H [label=""] 11 | H -> D [label=""] 12 | H -> P [label=""] 13 | P -> W [label=""] 14 | } 15 | -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_simple.pdf -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_simple.png -------------------------------------------------------------------------------- /docs/figures/mary-kondo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/mary-kondo.jpg -------------------------------------------------------------------------------- /docs/figures/mineault_et_al.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/mineault_et_al.png -------------------------------------------------------------------------------- /docs/figures/nma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/nma.png -------------------------------------------------------------------------------- 
/docs/figures/pcbi.1007358.g002.PNG_L.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/pcbi.1007358.g002.PNG_L.png -------------------------------------------------------------------------------- /docs/figures/readme.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/readme.PNG -------------------------------------------------------------------------------- /docs/figures/reproducible_research: -------------------------------------------------------------------------------- 1 | digraph { 2 | O [label="Open code"] 3 | V [label="Version control"] 4 | C [label="Command line"] 5 | D [label="Open data"] 6 | R [label="Reproducible research"] 7 | E [label=Environments] 8 | L [label="Readable code"] 9 | T [label=Testing] 10 | I [label=CI] 11 | W [label="Code review"] 12 | S [label="Cloud storage"] 13 | U [label="Cloud computing"] 14 | M [label=Documentation] 15 | P [label=Packaging] 16 | V -> O [label=""] 17 | C -> V [label=""] 18 | O -> D [label=""] 19 | O -> R [label=""] 20 | D -> R [label=""] 21 | E -> R [label=""] 22 | C -> E [label=""] 23 | O -> E [label=""] 24 | O -> W [label=""] 25 | O -> L [label=""] 26 | C -> T [label=""] 27 | T -> I [label=""] 28 | O -> I [label=""] 29 | E -> I [label=""] 30 | W -> L [label=""] 31 | E -> U [label=""] 32 | T -> U [label=""] 33 | V -> U [label=""] 34 | S -> U [label=""] 35 | S -> D [label=""] 36 | M -> P [label=""] 37 | T -> P [label=""] 38 | W -> P [label=""] 39 | O -> P [label=""] 40 | I -> P [label=""] 41 | T -> R [label=""] 42 | L -> M [label=""] 43 | } 44 | -------------------------------------------------------------------------------- /docs/figures/reproducible_research.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/reproducible_research.pdf -------------------------------------------------------------------------------- /docs/figures/reproducible_research.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/reproducible_research.png -------------------------------------------------------------------------------- /docs/figures/reversi.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/reversi.PNG -------------------------------------------------------------------------------- /docs/figures/shablona.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/shablona.png -------------------------------------------------------------------------------- /docs/figures/spaghetti-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/spaghetti-code.png 
-------------------------------------------------------------------------------- /docs/figures/testing-trophy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/testing-trophy.png -------------------------------------------------------------------------------- /docs/figures/tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/tweet.png -------------------------------------------------------------------------------- /docs/figures/wave_clus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/wave_clus.png -------------------------------------------------------------------------------- /docs/figures/wizard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/wizard.png -------------------------------------------------------------------------------- /docs/figures/wm-federenko.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/wm-federenko.png -------------------------------------------------------------------------------- /docs/notes/how_packages_work.md: -------------------------------------------------------------------------------- 1 | # How packages actually work 2 | 3 | Pip and packages are wonderful, but they can obscure what's going on behind the 4 | scenes. How do packages actually work? 5 | 6 | ## Packages and modules 7 | 8 | Let's say we have a directory `mylib` with one module inside (a module is a normal Python file with functions): 9 | 10 | ``` 11 | mylib/ 12 | |- code.py 13 | ``` 14 | 15 | Inside `code.py`, there's a function: 16 | 17 | ```{.python} 18 | def the_fun(): 19 | print("Hello world") 20 | ``` 21 | 22 | Now let's assume the base directory `.` is on Python's search path. This creates the implicit package `mylib`. Hence, you can import code like so: 23 | 24 | ```{.python} 25 | from mylib.code import the_fun 26 | ``` 27 | 28 | ## Python's search path 29 | 30 | But wait, where does Python search for code? Several places: 31 | 32 | * The current directory `os.getcwd()` 33 | * Directories listed in `sys.path` 34 | * Directories listed in the environment variable `PYTHONPATH` 35 | 36 | When you install a package listed in PyPI with `pip`, it puts a copy of that folder somewhere that's on the path. For example, when I use the conda environment 37 | `py3`, it puts new packages in: 38 | 39 | `/home/pmin/anaconda3/envs/py3/lib/python3.8/site-packages` 40 | 41 | This location is listed in `sys.path`. Hence, if I `pip install seaborn`, I will find a copy of seaborn inside that directory: 42 | 43 | ```{.shell} 44 | (py3) pmin@desktop:~/anaconda3/envs/py3/lib/python3.8/site-packages/seaborn$ ls -al 45 | total 780 46 | drwxr-xr-x 6 pmin pmin 4096 Dec 28 16:21 . 47 | drwxr-xr-x 438 pmin pmin 20480 Jan 8 14:00 .. 
48 | -rw-r--r-- 1 pmin pmin 744 Dec 28 16:21 __init__.py 49 | drwxr-xr-x 2 pmin pmin 4096 Dec 28 16:21 __pycache__ 50 | -rw-r--r-- 1 pmin pmin 52671 Dec 28 16:21 _core.py 51 | -rw-r--r-- 1 pmin pmin 2126 Dec 28 16:21 _decorators.py 52 | -rw-r--r-- 1 pmin pmin 5861 Dec 28 16:21 _docstrings.py 53 | -rw-r--r-- 1 pmin pmin 14699 Dec 28 16:21 _statistics.py 54 | -rw-r--r-- 1 pmin pmin 2139 Dec 28 16:21 _testing.py 55 | -rw-r--r-- 1 pmin pmin 4483 Dec 28 16:21 algorithms.py 56 | ... 57 | ``` 58 | 59 | ## What if I want to use my homebrew library somewhere else? 60 | 61 | Let's say your `mylib` code is in `/path/to/mylib/code.py`. You want to import it from `/home/me/projecto/script.py`. You need to figure out a way to place it on Python's search path. Before we discuss the ideal solution, let's make sure we understand what's going on by discussing other partial solutions. 62 | 63 | ### (bad) copy `code.py` to `/home/me/projecto/mylib/code.py` 64 | 65 | This works, but then you have two copies of your code, and it can rapidly become a maintenance nightmare. 66 | 67 | ### (not great) create a symlink 68 | 69 | A better idea is to create a symlink from `/home/me/projecto/mylib -> /path/to/mylib`. You can do 70 | 71 | `ln -s /path/to/mylib /home/me/projecto/mylib` 72 | 73 | This works, but it can be a pain to manage if you use multiple computers or you're sharing your code with somebody else. 74 | 75 | ### (not great) Change `sys.path` during execution 76 | 77 | Add your library code to `sys.path` temporarily. At the top of your script, use: 78 | 79 | ```{.python} 80 | import sys 81 | # Assuming the code is in /my/great/library/code.py 82 | sys.path.append('/my/great/') 83 | 84 | from library import code 85 | ``` 86 | 87 | Note that once you exit the script, `sys.path` will return to its original value. 88 | 89 | This works, but it will mess up code completion and linting in your favorite editor, because the dependency is injected at runtime. 90 | 91 | ## Create a package and install it in development mode 92 | 93 | We can create a package and install it in development mode (`pip install -e`). For that, we need a few files: 94 | 95 | ``` 96 | setup.py 97 | mylib/ 98 | |- __init__.py 99 | |- code.py 100 | ``` 101 | 102 | * `setup.py` contains minimal code to set up a package. This will suffice: 103 | 104 | ```{.python} 105 | from setuptools import setup, find_packages 106 | 107 | setup( 108 | name='minipkg', 109 | version='0.0.1', 110 | author='An Awesome Coder', 111 | author_email='patty.mcgoo@example.com', 112 | packages=find_packages(), 113 | scripts=[], 114 | url='https://github.com/patrickmineault/minimal-package', 115 | license='LICENSE.txt', 116 | description='An awesome package that does nothing', 117 | long_description=open('README.md').read(), 118 | install_requires=[ 119 | ], 120 | ) 121 | ``` 122 | 123 | * Finally, the existence of an empty `__init__.py` tells setuptools that there's a package in that directory. 124 | 125 | When you `pip install -e .`, it will add the current directory `.` to sys.path. Hence, you can now import the package from anywhere. The name of the folder (in this case `mylib`) determines the name of the package. 126 | 127 | ## Removing one level from imports 128 | 129 | It may feel a bit unwieldy to have to write `from mylib.code import the_fun`. 
We can shorten that to `from mylib import the_fun` by changing the contents of `__init__.py` to: 130 | 131 | ```{.python} 132 | from .code import * 133 | ``` 134 | 135 | This will lift the symbols inside of code, including `the_fun`, to a package-level symbol. This is a [common pattern in Python packages](https://github.com/mwaskom/seaborn/blob/master/seaborn/__init__.py). 136 | 137 | ## `src` and all that 138 | 139 | Some authors prefer putting the package code two levels down, i.e. inside `src/mylib`. This prevents polluting the package namespace with unnecessary symbols; see [this blog post for an explanation](https://blog.ionelmc.ro/2014/05/25/python-packaging/). 140 | 141 | # Further reading 142 | 143 | * http://andrewsforge.com/article/python-new-package-landscape/ 144 | * https://blog.ionelmc.ro/2014/05/25/python-packaging/ -------------------------------------------------------------------------------- /docs/notion-notes.md: -------------------------------------------------------------------------------- 1 | # Writing good research code good 2 | 3 | TODO: Make a Jupyterbook out of this 4 | 5 | - Who am I? 6 | - Open source experiences 7 | - No formal CS training 8 | - Google SWE and Facebook research scientist 9 | - Organizer of NMA 10 | - Occasionally taught CS 11 | - Writing research code 12 | 13 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled.png) 14 | 15 | - Who is this for? 16 | - Not CS students 17 | - People who picked up programming more or less by accident 18 | - Inspirations 19 | - Zen of Python 20 | - data scientist as scientist 21 | - move fast and break things 22 | - 12 factors 23 | - full stack deep learning 24 | - anti-patterns 25 | - Why is research code hard to write 26 | - Endpoint is unclear 27 | - "correct" can be hard to define 28 | - Lots of exploration and dead ends 29 | - Sometimes, there are manual steps involving human judgement 30 | - Many people that do code for research are not trained in CS or programming 31 | - Low judgement zone 32 | - It's ok to write garbage code when you're in a rush 33 | - It's not ok to keep building more and more on top of garbage code 34 | - sure, there's the moral imperative to create replicable code to bring forward the shining light of science and truth... 35 | - ...but also, do you want to have to scrap 6 months of research because your awful code does something silly? 
36 | - "[https://en.wikipedia.org/wiki/Growth_in_a_Time_of_Debt](https://en.wikipedia.org/wiki/Growth_in_a_Time_of_Debt)" - Reinhart–Rogoff 37 | 38 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%201.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%201.png) 39 | 40 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%202.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%202.png) 41 | 42 | [https://www.bbc.com/news/magazine-22213219](https://www.bbc.com/news/magazine-22213219) 43 | 44 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%203.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%203.png) 45 | 46 | - I've wasted months of my life cursing my own bad code, don't be like me 47 | - When I say "you should" or "you shouldn't", those are just recommendations, maybe you know better, or maybe you have a deadline to hit 48 | - Clean things up later! 49 | 50 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%204.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%204.png) 51 | 52 | ## Grand scheme of things 53 | 54 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%205.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%205.png) 55 | 56 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%206.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%206.png) 57 | 58 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%207.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%207.png) 59 | 60 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%208.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%208.png) 61 | 62 | - Lifecycle of research code 63 | - Create data 64 | - Psychophysics, EEG, ECoG, fMRI, ephys, calcium imaging, human labeling, simulations, etc. 65 | - Towards the goal of testing a hypothesis or something 66 | - Ingest the data 67 | - Apply transformations to the data 68 | - Fit models to the data 69 | - Test hypotheses 70 | - Generate plots 71 | - Write the paper 72 | - (receive the reviews and rewrite the paper) 73 | - (pass down the code to the next grad student down the pipe) 74 | 75 | 0. Principles [Ev Federenko] 76 | 77 | Ivanova et al. 
[2020] 78 | 79 | CP: code programming 80 | 81 | SP: sentence programming 82 | 83 | SR: sentence reading 84 | 85 | NR: non-word reading 86 | 87 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%209.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%209.png) 88 | 89 | - You have to write for your future self in mind 90 | - Future you will have forgotten 90% of what you wrote 91 | - You have to be twice as smart to debug code as to write it 92 | - Never write code that's as a smart as you can make it 93 | - You need to conserve your working memory 94 | - Reduce the cognitive load of understanding your code 95 | - mental model for understanding and debugging code: 96 | - cognitive task in which you have to juggle lots of things in your WM 97 | - when there's too many pieces of information you have to keep in working memory, you start to lose track of other important pieces of information 98 | - you then have to refer to non-WM (i.e. stackoverflow, your codebase) to get back on track 99 | - eventually your productivity trends towards zero 100 | - e.g. [https://imgur.com/gallery/UNhWQiV](https://imgur.com/gallery/UNhWQiV) 101 | - simple is better than complex, but complex is better than complicated 102 | - cyclomatic complexity (number of linearly independent paths in a program) 103 | - Don't optimize code 104 | 105 | 1. Organize different code projects according to a convention 106 | 107 | I was given the swim test: everything at Google is one giant monorepo with billions of lines of code ([https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE](https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE)). Everything is organized according to strict (sometimes downright pedantic) conventions, such that "it's not that bad" to jump. [https://github.com/google/styleguide/blob/gh-pages/pyguide.md](https://github.com/google/styleguide/blob/gh-pages/pyguide.md). Also, to become a reviewer, you need to obtain readability ([https://www.pullrequest.com/blog/google-code-review-readability-certification/](https://www.pullrequest.com/blog/google-code-review-readability-certification/)), which is a kind of ritualized hazing in which one is taught the ways of Google-y-ness. To this day, I manually alphabetically sort my imports. 108 | 109 | Suggested by Turing Way: 110 | 111 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2010.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2010.png) 112 | 113 | Suggested by Research Software Engineering with Python (originally from Noble 2009): 114 | 115 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2011.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2011.png) 116 | 117 | [https://github.com/uwescience/shablona](https://github.com/uwescience/shablona) 118 | 119 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2012.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2012.png) 120 | 121 | - Convention over configuration 122 | - E.g. 
React project: 123 | - lots of code, always in the same folders 124 | - One project → one git repo 125 | - If you don't use git yet, start doing it now 126 | - Take a weekend to learn it 127 | - Analysis → Start with a Capital Letter.ipynb 128 | - Reusable functions and packages, etc. → lower letter with underscores .py 129 | - Tests under tests folder 130 | - Extends beyond just project organization 131 | - Consistency of style (PEP8) 132 | - flake8 vs. pylint (vscode) 133 | - Consistency of documentation style (Google vs. numpy) 134 | - Preference for Google 135 | - Checks off future you and WM 136 | - Exercise: let's create a project with the right structure 137 | 138 | 2. Avoid the great mush 139 | 140 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2013.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2013.png) 141 | 142 | - Maybe your code is written in a way where you're doing a little bit of everything all at once 143 | 144 | e.g `wave_clus` 145 | 146 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2014.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2014.png) 147 | 148 | This is a callback for a function in a GUI for spike sorting. 149 | 150 | - Does many things at once 151 | - Manipulates the GUI 152 | - Modifies data 153 | - Reads a jpg file? 154 | - Uses magic numbers and magic columns 155 | - Various string formatting and exec 156 | - Big function 157 | - Not complex, but it's complicated 158 | - That's bad because your code becomes really hard to reason about 159 | - Tightly coupled 160 | - Are the results weird because: 161 | - the data is bad 162 | - you're loading the data wrong 163 | - your model is incorrectly implemented 164 | - your model is inappropriate for the data 165 | - you statistical tests are inappropriate for the data distribution 166 | - Are your results good because... 167 | - Keep each of the boxes separate with minimal interface 168 | - Separation of concerns: 169 | - example: your data loading function should just load data 170 | - Your computation functions shouldn't load data, they should just compute 171 | - Make each of the boxes small 172 | - don't make giant monolithic functions 173 | - Make functions which are small 174 | - a screen's worth 175 | - 80 columns, 50 lines 176 | - Avoid side effects, prefer pure functions 177 | - What's a side effect? 178 | - In computer science, an operation, function or expression is said to have a side effect if it modifies some state variable value(s) outside its local environment, that is to say has an observable effect besides returning a value (the main effect) to the invoker of the operation. State data updated "outside" of the operation may be maintained "inside" a stateful object or a wider stateful system within which the operation is performed. Example side effects include modifying a non-local variable, modifying a static local variable, modifying a mutable argument passed by reference, performing I/O or calling other side-effect functions. 179 | - Example: fib 180 | - Learn more about your language 181 | - Sometimes (but not always!), code smells come from lack of knowledge 182 | - E.g. 
using magic column numbers in a raw numpy array rather than named columns in pandas because you don't know pandas 183 | - Using unnamed dimensions in numpy rather than xarray 184 | - Using + and bespoke code rather than the one true solution, the f-string 185 | - E.g. implementing CKA 186 | - Checks off WM 187 | 188 | 3. Build around testing 189 | 190 | Ariel Rokem 191 | 192 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2015.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2015.png) 193 | 194 | - Oftentimes we write code to convince ourselves that our other code works 195 | - E.g. I write a spiffy function that fits a GLM with Tikhonov regularization 196 | - I make up some test data 197 | - I run my model 198 | - It gives me the correct outputs 199 | - At the end, I either delete that code (if it's a tiny amount of tests) or I let it rot in a notebook somewhere. 200 | - Don't do that! 201 | - 70% of bugs will be old bugs that keep reappearing 202 | - Formalize how you write your code through tests 203 | - Unit tests 204 | - micro tests 205 | - inline `assert` 206 | - unit tests 207 | - unittest 208 | - pytest 209 | - nose 210 | - Integration tests 211 | - "big tests" 212 | - How to use a test runner 213 | - Run them periodically 214 | - Lesson: tested code is low-stress code 215 | - Just learned recently: 216 | - You can even test figures! pytest-mpl 217 | - Practical assignment: 218 | - 10 next times you comment out a print statement: transform it into an assert 219 | - Checks off WM 220 | 221 | 4. Make notes to future yourself 222 | 223 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2016.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2016.png) 224 | 225 | - Documentation 226 | - docstrings 227 | - Controversial opinion: docstrings are overrated. Tests often form better documentation. 228 | - README.md 229 | - tests 230 | - Keep a lab book 231 | - Notion 232 | - Checks off future self 233 | 234 | 5. Work with better people than you 235 | 236 | - Maybe you're the best coder in your lab so you don't have opportunities for growth 237 | - Contribute to open source projects 238 | - NMA & NMC are always happy to have more people! 239 | - Join a community or hackerspace 240 | - Maybe you're starting out 241 | - Pair programming! 242 | - Actively seek resources 243 | - Julia Evans @b0rk 244 | - Two anecdotes 245 | - Ctrl+R 246 | - Michael Waskom's CI for NMA 247 | - Checks off future self 248 | 249 | 6. Use good tools 250 | 251 | - You won't become proficient without actively seeking for it 252 | - E.g. 
navigational queries on Google 253 | - Take off days where you learn tool X 254 | 255 | # Examples list 256 | 257 | precision = 1000 258 | 259 | x0 = (self.wx[mask].reshape(-1, 1, 1) + 260 | 261 | torch.randn(len(mask), precision, 1) * self.wsigmax[mask].reshape(-1, 1, 1)) 262 | 263 | y0 = (self.wy[mask].reshape(-1, 1, 1) + 264 | 265 | torch.randn(len(mask), precision, 1) * self.wsigmax[mask].reshape(-1, 1, 1)) 266 | 267 | ## Advanced topics 268 | 269 | - Configuration and .env 270 | - Environments 271 | - Dockerfiles 272 | - CI 273 | - Packaging 274 | - Cloud stuff 275 | - Reproducibility 276 | 277 | # Resources 278 | 279 | - Data science in practice paper: [https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725](https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725) 280 | - Making packages and testing [https://education.molssi.org/python-package-best-practices/index.html](https://education.molssi.org/python-package-best-practices/index.html) 281 | - Carpentries testing Python: [http://carpentries-incubator.github.io/python-testing/](http://carpentries-incubator.github.io/python-testing/) 282 | - Software engineering for research: [https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title](https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title) 283 | - Computer code and the brain: [https://twitter.com/neuranna/status/1251589731932135425](https://twitter.com/neuranna/status/1251589731932135425) 284 | - Software engineering best practices: [http://www.bris.ac.uk/acrc/acrc-training/](http://www.bris.ac.uk/acrc/acrc-training/) 285 | - The Turing Way: [https://the-turing-way.netlify.app/reproducible-research/code-quality.html](https://the-turing-way.netlify.app/reproducible-research/code-quality.html) 286 | - Software engineering for data scientists: [http://uwseds.github.io/](http://uwseds.github.io/) 287 | - Test and code for scientists (podcast): [https://testandcode.com/140](https://testandcode.com/140) 288 | - Research software engineering: [https://merely-useful.github.io/py-rse/](https://merely-useful.github.io/py-rse/) 289 | - Shablona template: [https://github.com/uwescience/shablona](https://github.com/uwescience/shablona) 290 | 291 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2017.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2017.png) -------------------------------------------------------------------------------- /docs/slides/01-intro.md: -------------------------------------------------------------------------------- 1 | % Intro 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # 6 | 7 | Intro 8 | 9 | # Who is this lecture for? 10 | 11 | ![](../figures/tweet.png) 12 | 13 | # Who is this lecture for? 14 | 15 | * Most people who do coding-heavy research are not trained in CS or software engineering 16 | * You're probably in this bucket 17 | * Bad consequences: 18 | * You feel like you don't know what you're doing 19 | * Imposter syndrome 20 | * Low productivity 21 | * Bugs 22 | * You hate your code and you don't want to work on it 23 | * You never graduate 24 | * You have great sadness in your heart 25 | * It doesn't have to be all bad! 
26 | 27 | # My weird perspective 28 | 29 | * Patrick Mineault, PhD in neuroscience 30 | * (wildly underqualified) software engineer at Google 31 | * [Research scientist at Facebook Reality Labs on brain-computer interfaces](https://tech.fb.com/imagining-a-new-interface-hands-free-communication-without-saying-a-word/) 32 | * [Helped build NMA as first year CTO](https://xcorr.net/2021/03/25/building-neuromatch-academy/) 33 | * [Independent researcher](https://xcorr.net/) and technologist 34 | * Occasionally taught CS 35 | 36 | # Regrets, I've had a few 37 | 38 | * Mostly self-taught in programming 39 | * Didn't study CS until very late 40 | * Wasted months working with bad code of my own making 41 | * Not a great coder, but better than in grad school 42 | * I think you might be curious 43 | 44 | # Organization 45 | 46 | * Assume that you know a little bit about [Python](https://swcarpentry.github.io/python-novice-inflammation/), [git](http://swcarpentry.github.io/git-novice/) and the [command line](http://swcarpentry.github.io/shell-novice/) 47 | * You can catch up on these topics via Software Carpentries 48 | * If you don't, that's ok! This is vertically integrated advice. Get inspired, follow more detailed tutorials after, and come back to this. 49 | * 5 practical tips to better code 50 | * Concrete examples 51 | * 5-minute action items 52 | * Everybody leaves having learned an actionable thing 53 | * Interrupt me and chat! 54 | * But first, I will indulge in theory... 55 | 56 | # Open question 57 | 58 | Q: What does coding look like in the brain? 59 | 60 | # Coding is very working-memory intensive 61 | 62 | ![Code and working memory in the brain, Ivanova et al. (2020)](../figures/wm-federenko.png) 63 | 64 | # Coding is very working-memory intensive 65 | 66 | * MD: Multiple-demand system 67 | * CP: code programming 68 | * SP: sentence programming 69 | * SR: sentence reading 70 | * NR: non-word reading 71 | 72 | # Consequence 73 | 74 | You will get [overloaded](https://imgur.com/gallery/UNhWQiV). 75 | 76 | # Principle 1: conserve your WM 77 | 78 | - Reduce the cognitive load of understanding your code 79 | - [Simple is better than complex. Complex is better than complicated.](https://zen-of-python.info/simple-is-better-than-complex.html#3) 80 | 81 | # Research code is very LTM-intensive 82 | 83 | ![Theory](../figures/lifecycle_simple.png){height=250px} 84 | 85 | # Research code is very LTM-intensive 86 | 87 | ![Practice](../figures/lifecycle_complex.png){height=250px} 88 | 89 | # Research code 90 | 91 | - Endpoint is unclear 92 | - Correct can be hard to define 93 | - Lots of exploration and dead ends 94 | - Sometimes, there are manual steps involving human judgement 95 | - You have to remember all the dead ends for the code to even make sense 96 | 97 | # Principle 2: write for your future self in mind 98 | 99 | - Future you will have forgotten 90% of what you wrote 100 | - Kernighan's Law - Debugging is twice as hard as writing the code in the first place. Therefore, if you write the code as cleverly as possible, you are, by definition, not smart enough to debug it. 101 | 102 | # Thesis 103 | 104 | Writing good research code boils down to saving your memory - both working and long-term. 
105 | 106 | --- 107 | 108 | # 109 | 110 | Practical Lessons 111 | 112 | # Disclaimer: this is a low-judgement zone 113 | 114 | - It's ok to write garbage code when you're in a rush 115 | - It's not ok to keep building more and more on top of garbage code 116 | - sure, there's the moral imperative to create replicable code to bring forward the shining light of science and truth... 117 | - [and yes, people have messed up the world real bad](https://www.nytimes.com/2013/04/19/opinion/krugman-the-excel-depression.html) by doing things fast and loose 118 | - but also, do you want to scrap 6 months of research because you forgot to transpose a matrix? 119 | - you will get bitten back 120 | - Guidelines not rules 121 | 122 | # Lesson 1: keep things tidy 123 | 124 | ![](../figures/mary-kondo.jpg) 125 | 126 | # What needs to be tidy 127 | 128 | * Project folder structure 129 | * Code style 130 | * Notebooks 131 | * Scripts 132 | * Prereq: Git & Github: if you're going to keep things clean, you will mess up and need a time machine. 133 | 134 | # Project folder structure 135 | 136 | * Consensus: one repo = one project $\approx$ one paper 137 | * Lots of templates around: 138 | * [Turing Way](https://the-turing-way.netlify.app/reproducible-research/compendia.html#executable-compendium) 139 | * [Research Software Engineering with Python](https://merely-useful.github.io/py-rse/getting-started.html#getting-started-structure) 140 | * [Data science cookiecutter](https://drivendata.github.io/cookiecutter-data-science/) 141 | * [Shablona](https://github.com/uwescience/shablona) 142 | 143 | # Shablona 144 | 145 | ![Shablona](../figures/shablona.png){height=220px} 146 | 147 | # Shablona 148 | 149 | * Lightweight, good starter template 150 | * Keeps docs, data, scripts and code tidy and in their own little box 151 | * You can `import shablona` to access the code in the packages 152 | * [Use as a template to start a new project via big green button](https://github.com/uwescience/shablona) 153 | * Or build it from scratch to understand the moving pieces 154 | * **Important**: Is compatible with Python packaging. That means you can install locally with `pip install -e .`, and the code inside the special folder (placeholder: `shablona`) becomes a package `shablona` 155 | 156 | # 157 | 158 | Live demo 159 | 160 | # Packages, how do they work? 161 | 162 | Whatever template you use, make sure it makes a local package for your code that you can `pip install`. That will make it easier to re-use your code in other places. 163 | 164 | [If you're curious, I wrote a long-form note on how packages really work](../notes/how_packages_work.md). 165 | 166 | # Other conventions 167 | 168 | - Notebooks → `Start with a Capital Letter.ipynb` 169 | - Reusable functions and packages, etc. → `snake_case.py` 170 | - Tests under `tests` folder 171 | 172 | # Organizing scripts 173 | 174 | ![From [Van Vliet (2020)](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007358)](../figures/pcbi.1007358.g002.PNG_L.png){height=220px} 175 | 176 | # Organizing scripts 177 | 178 | * Use filenames that indicate hierarchy, e.g. `00_fetch_data.py` 179 | * One issue: you can't `import` these scripts because you can't start a module name with a digit. 180 | * Start with an underscore, `_00_fetch_data.py`, or with a prefix, `step_00_fetch_data.py`, those are valid module names 181 | * Figure code separate from processing steps code, e.g. 
`figure_csd.py` 182 | * Use a master script to bind everything together 183 | * Plain Python 184 | * Bash files 185 | * Build tools: `doit`, `make` 186 | * Specialized tools like `nipype` 187 | 188 | # Code style 189 | 190 | * Use a consistent style 191 | * [Python has a style guide - PEP8](https://www.google.com/search?channel=crow2&client=firefox-b-d&q=pep8). 192 | * Indentation 193 | * Line length 194 | * Spaces 195 | * Variable names 196 | * imports 197 | * [Orgs like Google have their even more pedantic style guides](https://google.github.io/styleguide/pyguide.html). 198 | * There are linters and auto-formatters which will catch style issues 199 | * flake8 200 | * pylint 201 | * black 202 | * Install them in VSCode 203 | 204 | # Docstrings 205 | 206 | [Numpy style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html) or [Google style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). 207 | 208 | ```{.python} 209 | def my_doubler(x): 210 | """Doubles x. 211 | 212 | Args: 213 | x: the number to double 214 | 215 | Returns: 216 | Twice x 217 | """ 218 | return x * 2 219 | ``` 220 | 221 | # IPython notebooks 222 | 223 | > If you use notebooks to develop software, you are probably using the wrong tool. -- [Yihui Xie](https://yihui.org/en/2018/09/notebook-war/) 224 | 225 | * Notebooks are hard to keep tidy because of nonlinear execution 226 | * Restart and Run All is your friend 227 | * If your notebook doesn't run top to bottom - it's not reproducible 228 | * It's ok to write plotting code in a notebook, but don't write real functions. 229 | * Import the code from your installable package (see `shablona` above) 230 | * You can auto-reload your package code when it changes, makes development easier. In a cell: 231 | 232 | ```{.python} 233 | %load_ext autoreload 234 | %autoreload 2 235 | ``` 236 | 237 | # Why does this matter? 238 | 239 | You don't have to constantly ask yourself where stuff is, how you should do thing X, etc. and that allows you to focus on the stuff that matters. 240 | 241 | # Aside: day 3 242 | 243 | Everything at Google is one giant monorepo with [billions of lines of code](https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE). By ~day 3, it was time to go do a code. Everything is organized according to strict [conventions](https://github.com/google/styleguide/blob/gh-pages/pyguide.md), so it's not *that bad* to jump in. 244 | 245 | # Lesson 1 246 | 247 | * Keep things tidy 248 | * Free your W<M from having to remember where stuff is 249 | * Your 5-minute exercise: use the `shablona` template for a project 250 | 251 | --- -------------------------------------------------------------------------------- /docs/slides/02-decouple.md: -------------------------------------------------------------------------------- 1 | % Decoupled code 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Lesson 2 6 | 7 | Keep things decoupled 8 | 9 | # Spaghetti code 10 | 11 | ![e.g [^1]](../figures/spaghetti-code.png) 12 | 13 | [^1] Brown et al. AntiPatterns, 1998 14 | 15 | # Do you know when your code smells? 16 | 17 | - Maybe your code is written in a way where you're doing a little bit of everything all at once 18 | - e.g. `wave_clus` 19 | - very useful software to sort spikes 20 | - has a GUI in Matlab GUIDE 21 | - GUIDE makes it exceptionally hard to write good code 22 | - Picked it because it's real code 23 | - This stuff can happen in Matlab or in Python! 
24 | 25 | # Sample code 26 | 27 | [Link](https://github.com/csn-le/wave_clus/blob/master/wave_clus.m#L964). 28 | 29 | # What's going here? 30 | 31 | This is a callback for a function in a GUI for spike sorting. 32 | 33 | - Does many things at once 34 | - Manipulates the GUI 35 | - Modifies data 36 | - Reads a jpg file? 37 | - Uses magic numbers and magic columns 38 | - Uses various string formatting functions and `eval` 39 | - Big function 40 | - Not complex, but it's complicated 41 | 42 | # Tightly coupled 43 | 44 | - When code does a lot of unrelated things at once, it becomes very hard to reason about. 45 | - Let's say your results are weird, are they weird because... 46 | - the data is bad? 47 | - you're loading the data wrong? 48 | - your model is incorrectly implemented? 49 | - your model is inappropriate for the data? 50 | - you statistical tests are inappropriate for the data distribution? 51 | 52 | # Uncouple and simplify 53 | 54 | - Keep each of the boxes separate with minimal interface 55 | - Separation of concerns: 56 | - Example: your data loading function should just load data 57 | - Your computation functions shouldn't load data, they should just compute 58 | - Make each of the boxes small 59 | - Don't make giant monolithic functions 60 | - Make functions which are small 61 | - A screen's worth, 80 columns, 50 lines 62 | - Avoid side effects, prefer pure functions 63 | 64 | # What's a side effect? 65 | 66 | > In computer science, an operation, function or expression is said to have a side effect if it modifies some state variable value(s) outside its local environment, that is to say has an observable effect besides returning a value (the main effect) to the invoker of the operation. State data updated "outside" of the operation may be maintained "inside" a stateful object or a wider stateful system within which the operation is performed. Example side effects include modifying a non-local variable, modifying a static local variable, modifying a mutable argument passed by reference, performing I/O or calling other side-effect functions. (Wikipedia) 67 | 68 | # Side effects 69 | 70 | ![From Wikipedia](../figures/Design_by_contract.svg.png){height=220px} 71 | 72 | # A function with side effects 73 | 74 | Q: what will be printed? 75 | 76 | ```{.python} 77 | def reversi(arr): 78 | """Reverses a list.""" 79 | for i in range(len(arr) // 2): 80 | arr[-i - 1], arr[i] = arr[i], arr[-i - 1] 81 | return arr 82 | 83 | >>> a = [0, 1, 2] 84 | >>> b = reversi(a) 85 | >>> print(b) 86 | >>> print(a) 87 | ``` 88 | 89 | # A function which changes its arguments 90 | 91 | ![This function mutates its arguments](../figures/reversi.PNG) 92 | 93 | # Side effects 94 | 95 | * Modifying arguments 96 | * Printing 97 | * Making API calls 98 | * Changing globals 99 | 100 | # Side effects are not the best 101 | 102 | * Stuff happens outside of the normal flow from arguments → return value 103 | * Need to know state of function to understand it 104 | * Hard to test 105 | * Let's box them 106 | * You can use closures or classes to encapsulate state 107 | 108 | # Demo 109 | 110 | * `fib.py` 111 | * Fibonacci sequence, $F(n) = F(n-1) + F(n-2)$ 112 | * Memoization 113 | 114 | # Learn more about your language 115 | 116 | - Sometimes (but not always!), code smells come from lack of knowledge 117 | - E.g. 
using magic column numbers in a raw numpy array rather than named columns in pandas because you don't know pandas 118 | - Using unnamed dimensions in numpy rather than xarray 119 | - Using + and bespoke casting for string formatting rather than the one true solution, the f-string 120 | - Take time to learn more about the language you use 121 | - Coming from Matlab? I have three tutorials: [[1]](https://xcorr.net/2020/02/21/transitioning-away-from-matlab/), [[2]](https://xcorr.net/2020/02/29/orienting-yourself-through-python/), [[3]](https://xcorr.net/2020/03/04/rewriting-matlab-code-in-python/) 122 | 123 | # Enough theory! 124 | 125 | Let's de-couple CKA! 126 | 127 | # Background on centered kernel alignment 128 | 129 | Q: how can we compare how different brain areas and artificial neural networks represent the world? 130 | 131 | A: Choose a standard battery of stimuli, measure responses across systems, compare the responses between the systems. Many approaches, including: 132 | 133 | * forward encoding models (e.g. ridge regression) 134 | * canonical correlation analysis (CCA) 135 | * representational similarity analysis (RSA). 136 | 137 | # CKA 138 | 139 | [Kornblith et al. (2019)](https://arxiv.org/abs/1905.00414) propose a new method to compare representations. You can think of it as a generalization of the (square of the) Pearson correlation coefficient, but with matrices instead of vectors. 140 | 141 | ![Alignment between layers of two neural nets initialized with different seeds](../figures/cka_example.png){height=100px} 142 | 143 | Importantly, CKA is not implemented in scipy or sklearn, github gives very few hits ^[1]... it's real research code! 144 | 145 | [1] [There is an implementation in a notebook from authors](https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb) 146 | 147 | # Centered kernel alignment 148 | 149 | * We collect the responses of each system to our battery of $n$ stimuli into matrices $\mathbf{X}, \mathbf{Y}$. 150 | * $\mathbf{X}, \mathbf{Y}$ have shape $n x k$, $n x l$, and $k$ and $l$ are not necessarily the same. 151 | * Center $\mathbf{X}, \mathbf{Y}$ so each column has 0 mean, then: 152 | 153 | $$CKA(\mathbf X, \mathbf Y) = \frac{||\mathbf X^T \mathbf Y||_2^2}{||\mathbf X^T \mathbf X||_2 ||\mathbf Y^T \mathbf Y||_2}$$ 154 | 155 | * Min 0, max 1 156 | * Check: if $\mathbf{X}$ and $\mathbf{Y}$ are one-dimensional, then $CKA = \rho( \mathbf X, \mathbf Y)^2$. 157 | 158 | 159 | # Open discussion 160 | 161 | Q: What's not ideal about this code? `research_code.cka_not_great.py` 162 | 163 | # Pain points 164 | 165 | * IO, computation and plotting are all in one big blob 166 | * Solution: isolate the computation in its own function independent of IO 167 | * Put the controller in the `main` function, hide behind `__name__ == "__main__"` 168 | * Avoids module variables in Python 169 | * Makes the code importable 170 | 171 | # Live coding! 172 | 173 | (the result is `cka_step2.py`) 174 | 175 | 176 | # You can apply this advice at a project-wide level as well 177 | 178 | Advice from [van Vliet (2020)](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007358): 179 | 180 | 1. **Each analysis step is one script** 181 | 2. **A script either processes a single experimental replicate, or aggregates across replicates, never both.** 182 | 3. One master script to run the entire analysis 183 | 4. **Save all intermediate results** 184 | 5. Visualize all intermediate results 185 | 6. 
**Each parameter and filename is defined only once** 186 | 7. Distinguish files that are a part of the official pipeline 187 | 188 | 189 | # Decoupling configuration 190 | 191 | * Keep your configuration out of your code 192 | * Use `argparse` to specify options via the command line 193 | * Keep configuration options located in an importable `config.py` file 194 | * Use `python-dotenv` to store secrets in a `.env` file 195 | 196 | 197 | # Lesson 2 198 | 199 | * Keep things decoupled 200 | * By keeping things decoupled, you can think about one part of your program at a time 201 | * Save your WM slots 202 | * Your 5-minute exercise: take existing code and wrap it in `main` -------------------------------------------------------------------------------- /docs/slides/03-testing.md: -------------------------------------------------------------------------------- 1 | % Testing 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Building around testing 6 | 7 | > Most scientists who write software constantly test their code. That is, if you are a scientist writing software, I am sure that you have tried to see how well your code works by running every new function you write, examining the inputs and the outputs of the function, to see if the code runs properly (without error), and to see whether the results make sense. Automated code testing takes this informal practice, makes it formal, and automates it, so that you can make sure that your code does what it is supposed to do, even as you go about making changes around it. --Ariel Rokem, Shablona README 8 | 9 | # Open discussion 10 | 11 | * Let's test `fib.py` 12 | * What can we test? 13 | 14 | # What can we test about `fib`? 15 | 16 | * Correctness, e.g. $F(4) = 5$ 17 | * Edge cases, e.g. $F(0) = 1$, $F(-1)$ → *error* 18 | * Functional goals are achieved, e.g. caching works 19 | * It's much easier to test decoupled code with no side effects 20 | * Forces you to write modular decoupled code 21 | 22 | # How can you decide what to test? 23 | 24 | * If something caused a bug, test it 25 | * 70% of bugs will be old bugs that keep reappearing 26 | * If you manually checked if procedure X yielded reasonable results, write a test for it. 27 | 28 | # How can we test? 29 | 30 | * `assert` 31 | * Hide code behind `if __name__ == '__main__'` 32 | * Test suite 33 | 34 | # `assert` 35 | 36 | * `assert` throws an error if the assertion is False 37 | 38 | ```assert -(7 // 2) == (-7 // 2)``` 39 | 40 | * Great for inline tests 41 | * e.g. check whether the shape of a matrix is correct after a permute op 42 | 43 | # Hide code behind `if __name__ == '__main__'` 44 | 45 | * Code behind `__name__ == '__main__'` is only run if you run the file as a script directly. 46 | * Use this for lightweight tests in combination with `assert`. 47 | 48 | ```{.python} 49 | if __name__ == '__main__': 50 | assert fib(4) == 5 51 | ``` 52 | 53 | # Use a test suite 54 | 55 | * Create a specialized file with tests that run with the help of a runner. 56 | * There's `pytest` and `unittest`. 57 | * I use `unittest` because that's what I learned, and it's built-in, but people like `pytest` a lot. (A minimal `pytest` sketch follows below.)
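# The same idea with `pytest`

If you go the `pytest` route, a minimal sketch looks like this (assuming `pip install pytest`): any function whose name starts with `test_` in a file named `test_*.py` is discovered and run automatically, with no class or runner boilerplate. The equivalent `unittest` template is on the next slide.

```{.python}
# test_something.py
def test_sample():
    # Plain asserts; pytest rewrites them to give informative failure messages.
    assert True
```

```{.shell}
$ pip install pytest
$ pytest
```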
58 | 59 | # Basic template 60 | 61 | ```{.python} 62 | # test_something.py 63 | import unittest 64 | 65 | class MyTest(unittest.TestCase): 66 | def sample_test(self): 67 | self.assertTrue(True) 68 | 69 | if __name__ == '__main__': 70 | unittest.main() 71 | ``` 72 | 73 | # Run it 74 | 75 | ```{.shell} 76 | $ python test_something.py 77 | ``` 78 | 79 | To run all tests within a directory, install nose via `pip install nose2`, then: 80 | 81 | ```{.shell} 82 | $ nose2 83 | ``` 84 | 85 | # Live coding 86 | 87 | Let's code up `fib.py` tests! 88 | 89 | # Points from live coding example 90 | 91 | * Paths! 92 | * Sometimes you can get away with hacking `sys.path` 93 | * Ideally, set up a package with `pip install -e .` 94 | * There's a lot of cruft in writing tests: no shame in copy and paste (but do it once from scratch)! 95 | 96 | # A hierarchy of tests can be run with a runner 97 | 98 | * Static tests (literally your editor parsing your code to figure out if it will crash) 99 | * Asserts 100 | * Unit tests (test one function = one unit; what we just saw) 101 | * Integration tests 102 | * Smoke tests (does it crash?) 103 | * Regression tests 104 | * E2E (literally a robot clicking buttons) 105 | 106 | # Write lots of tiny unit tests that run very quickly 107 | 108 | * Goal: each unit test should run in 1 ms. 109 | * The faster you iterate, the better for your WM. 110 | * If your test suite takes more than 5 seconds to run, you will be tempted to go do something else. 111 | 112 | # Open discussion 113 | 114 | Q: what do you think is the ratio of test code to real code in a real codebase? 115 | 116 | # Open discussion 117 | 118 | A: 1:1 to 3:1, but can be many, many times that in safety critical applications 119 | 120 | e.g. the aviation standard DO-178C requires 100% code coverage (percentage of lines of code called by the tests) at its third highest safety level (Level C). 121 | 122 | For more down-to-earth applications, 80% code coverage is a common target. [You can use the `Coverage.py` package to figure out your test coverage](https://coverage.readthedocs.io/en/coverage-5.3.1/). 123 | 124 | # Demo 125 | 126 | Let's code CKA tests. We will turn properties of CKA listed in the paper into tests. 127 | 128 | # What we know about CKA 129 | 130 | * Only makes sense if two matrices are the same size along the first dimension 131 | * Pearson correlation: If $\mathbf{X}$ and $\mathbf{Y}$ are one-dimensional, then $CKA = \rho( \mathbf X, \mathbf Y)^2$. 132 | * $CKA(\mathbf X, \mathbf X) = 1$ 133 | 134 | # Live coding 135 | 136 | Note: to follow at home, look at `cka_step3.py` and `tests/test_cka_step3.py`. 137 | 138 | # What else can we know about CKA? Let's read the paper! 139 | 140 | * 2.1 _not_ invariant to non-isotropic scaling 141 | * 2.2 invariant to rotations, $CKA(\alpha \mathbf{X U}, \beta \mathbf{Y V}) = CKA(\mathbf X, \mathbf Y)$ 142 | 143 | ![Invariance to rotation](../figures/invariance_to_ortho.PNG){height=85px} 144 | 145 | * 2.3 invariant to isotropic scaling, $CKA(\alpha \mathbf X, \beta \mathbf Y) = CKA(\mathbf X, \mathbf Y)$ 146 | 147 | # Live coding (II) 148 | 149 | 150 | 151 | 152 | # Points from live coding example 153 | 154 | * Your test code can be ugly, as long as it's functional! 155 | * Define boundary conditions, pathological examples 156 | * Test that bad inputs indeed raise errors! Your code should yell when you feed it bad inputs. 157 | * Lock in current behaviour for regression testing 158 | * E.g. 
we implement a different, faster implementation of CKA in `cka_step4.py` and regression test it in `test_cka_step4.py`. 159 | 160 | # Refactoring with confidence 161 | 162 | * Your code is ugly: time to refactor! 163 | 1. Your code is ugly, tests pass 164 | 2. Rewrite the code 165 | 3. Your code is clean, tests don't pass 166 | 4. Rewrite the code 167 | 5. Iterate until tests pass again 168 | * Much less stressful with tests and git 169 | * Focus on one test at a time with `python test_cka_step3.py TestCka.test_same` 170 | * Don't forget to run the whole suite at the end! 171 | 172 | 173 | # Advanced topics! 174 | 175 | Testing deterministic side-effect free computational code has a very high returns:effort ratio, but... 176 | 177 | * [You can also test data loaders for correctness](https://github.com/patrickmineault/brain-scorer/blob/main/tests/test_pvc4_loader.py). 178 | * [You can also test data for correctness](https://github.com/patrickmineault/phaco-meta/blob/master/read-data.R#L320) 179 | * [You can also test notebooks for correctness](https://github.com/NeuromatchAcademy/course-content/blob/master/ci/verify_exercises.py#L56) 180 | * [You can integrate your tests into Github](https://github.com/patrickmineault/research_code/runs/1647753165?check_suite_focus=true) 181 | * [This presentation's repo has CI](https://github.com/patrickmineault/research_code/actions)! It's completely unnecessary! 182 | * [You can test stochastic functions](https://softwareengineering.stackexchange.com/questions/133047/unit-testing-of-inherently-random-non-deterministic-algorithms?rq=1) 183 | 184 | # Lesson 3 185 | 186 | * Test your code 187 | * Free your WM from having to consider that a piece of code unrelated to the thing you care about is broken 188 | * From lesson 1: much simpler to refactor code to make it tidy when you know you have a test scaffold which catches mistakes 189 | * From lesson 2: you will have to decouple code to write tests 190 | * Your 5-minute assignment: find a commented-out `print` statement in your code and replace it with `assert` 191 | -------------------------------------------------------------------------------- /docs/slides/04-docs.md: -------------------------------------------------------------------------------- 1 | % Docs 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Documentation 6 | 7 | Write documentation 8 | 9 | # 10 | 11 | You will forget about 90% of what you worked on. If you write it down, you'll be in a good spot. 12 | 13 | # A word of warning 14 | 15 | * I covered testing before documentation 16 | * But why? 17 | 18 | # Testing before documentation 19 | 20 | * It's more important that your code works (is correct) than it is easy to use 21 | * Docs become stale, tests have a long shelf life 22 | * If tests run, you can always copy and paste code if you can't remember how to use the code 23 | * Relatedly: if something can be a check, a warning or an exception, it should be 24 | 25 | # Documented 26 | 27 | ```{.python} 28 | def conv(A, B, padding='valid'): 29 | """ 30 | Convolves the 1d signals A and B. 31 | 32 | Args: 33 | A: a 1d numpy array 34 | B: a 1d numpy array 35 | padding (str): padding type (valid, mirror) 36 | """ 37 | pass 38 | ``` 39 | 40 | # Defensive inline checks 41 | 42 | ```{.python } 43 | def conv(A, B, padding='none'): 44 | assert A.ndim == 1 45 | assert B.ndim == 1 46 | if padding not in ('valid', 'mirror'): 47 | raise NotImplementedError( 48 | f"{padding} not implemented.") 49 | ``` 50 | 51 | # What should you document? 
52 | 53 | * References to papers 54 | * Why you wrote tricky code the way you did instead of the obvious way 55 | * TODOs (your Python editor will highlight these special comments) 56 | 57 | ```{.python} 58 | # TODO(pmin): refactor this mess 59 | ``` 60 | 61 | * Usage, especially if other people will use your code. 62 | * It's a gift from present you to future you 63 | 64 | # How should we document functions? 65 | 66 | * [Numpy style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html) or [Google style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). 67 | 68 | ```{.python} 69 | def my_doubler(x): 70 | """Doubles x. 71 | 72 | Args: 73 | x: the number to double 74 | 75 | Returns: 76 | Twice x 77 | """ 78 | return x * 2 79 | ``` 80 | 81 | # Package docs 82 | 83 | If you create a useful package, you can generate docs for it using [Sphinx](https://www.sphinx-doc.org/en/master/index.html) and publish them on [readthedocs](https://readthedocs.org/). 84 | 85 | # 86 | 87 | There are other many kinds of *documentation* 88 | 89 | # `README.md` 90 | 91 | ![NMC3: We survived](../figures/readme.PNG){height=220px} 92 | 93 | # Console usage 94 | 95 | ![NMC3: We survived](../figures/argparse.PNG){height=220px} 96 | 97 | # Console usage 98 | 99 | ```{.shell} 100 | (py3) $ python sendit.py 101 | usage: sendit.py [-h] {list,create,add,templates,test,remove,send} ... 102 | 103 | Manage sendgrid email batches with confidence 104 | 105 | positional arguments: 106 | {list,create,add,templates,test,remove,send} 107 | list List batches 108 | create Create a new batch 109 | add Adds a set of information to a batch 110 | templates List templates 111 | test Sends a test email 112 | remove Deletes an email batch 113 | send Sends an email batch 114 | 115 | optional arguments: 116 | -h, --help show this help message and exit 117 | ``` 118 | 119 | # Lab book & blogs 120 | 121 | * I like [notion.so](https://notion.so) as a labbook 122 | * Blog: jekyll hosted on Github pages or wordpress.com 123 | * I have had a [wordpress.com blog](https://xcorr.net) for the last 12 years. Two weeks ago I copied and pasted from a blog post that I wrote in 2009. 
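# Aside: that console usage comes from `argparse`

The `sendit.py` help text shown two slides back is the kind of message `argparse` generates for free. A minimal sketch — the subcommands and arguments here are invented for illustration, not `sendit.py`'s real interface:

```{.python}
# cli.py -- hypothetical example; run `python cli.py --help` to see the
# auto-generated usage message.
import argparse

parser = argparse.ArgumentParser(
    description="Manage email batches with confidence")
subparsers = parser.add_subparsers(dest="command")

subparsers.add_parser("list", help="List batches")
send = subparsers.add_parser("send", help="Sends an email batch")
send.add_argument("batch_id", help="Which batch to send")

args = parser.parse_args()
print(args)
```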
124 | 125 | # Dashboards 126 | 127 | * If you have a project that relies on tracking and improving a metric, use a dashboard 128 | * Lots of machine learning projects are set up this way 129 | * Not only acts as an LTM, it acts as an information radiator 130 | * Many ways to do this (most of these are commercial cloud offerings with a free tier): 131 | * [R Shiny](https://shiny.rstudio.com/) 132 | * [Streamlit](https://www.streamlit.io/) 133 | * [Panel](https://panel.holoviz.org/) 134 | * [Plotly dash](https://plotly.com/dash/) 135 | * [Google Data Studio](https://datastudio.google.com/u/0/) 136 | * [W&B](https://wandb.ai/) 137 | 138 | # Sample dashboard 139 | 140 | ![NMA dashboard](../figures/dashboard.PNG){height=220px} 141 | 142 | # Lesson 4 143 | 144 | * Write documentation 145 | * Write the right kind of documentation 146 | * Save your long-term memory and offload it to a digital store 147 | * 5-minute exercise: make a `README.md` file and push it to Github -------------------------------------------------------------------------------- /docs/slides/05-social.md: -------------------------------------------------------------------------------- 1 | % Improving your skillset 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Lesson 5 6 | 7 | Maybe the real good code is the friends we made along the way 8 | 9 | # Lesson 5 10 | 11 | Work with better people than you 12 | 13 | # Reality check 14 | 15 | * People think that programming is a solitary activity 16 | * Reality: at a place like Google, programming is very social 17 | * readability 18 | * code reviews 19 | * design reviews 20 | * pair programming 21 | * reading groups 22 | * retreats 23 | * performance reviews 24 | 25 | # Open discussion 26 | 27 | Q: what is pair programming? 28 | 29 | # Pair programming 30 | 31 | * Pair programming is a way for two programmers, potentially of different skill levels, to share knowledge through active practice 32 | * Traditional style: driver and navigator 33 | * The driver physically types the code into the terminal/editor. They think about the micro-issues (e.g. what goes in the body of the for loop) 34 | * The navigator tells the driver what to write. They typically focus on macro-issues (e.g. what should a function accomplish, how is it architected) 35 | 36 | # What you learn through pair programming 37 | 38 | * You practice your communication skills 39 | * You learn someone's productivity shortcuts. E.g. I learned about Ctrl+Shift+R (search in bash) through a pair programming session. 40 | * If one person has domain knowledge (e.g. neuroscience) and the other technical knowledge (e.g. Python), you will make faster progress than two people working separately 41 | 42 | # Tips on pair programming 43 | 44 | * Tools can make remote pair programming more comfortable 45 | * Zoom screensharing (can lead to embarrassment if e.g. you accidentally Alt-Tab to an ebay search for beanie babies) 46 | 47 | # Practical tips 48 | 49 | * Open an issue in an open source project 50 | * Open a PR in an open source project 51 | * Set up pair programming with people in your lab or study group 52 | * One person drives, one person co-pilots 53 | * Learn how at NMA2021! 54 | * [CoCalc](https://cocalc.com/) and [DeepNote](https://deepnote.com/) can do this remotely. 55 | * Set up a study group with fellow programmers ([event suggestions from Mozilla](http://mozillascience.github.io/studyGroupHandbook/event-types.html)).
56 | 57 | # Set up a review circle 58 | 59 | [You can use Github Pull Requests to give and receive line-by-line feedback on code](https://docs.github.com/en/enterprise-server@2.20/github/collaborating-with-issues-and-pull-requests/reviewing-proposed-changes-in-a-pull-request). 60 | 61 | 62 | # It doesn't have to be lonely 63 | 64 | - Maybe you're the best coder in your lab so you don't have opportunities for growth 65 | - Contribute to open source projects 66 | - [NMA](https://neuromatchacademy.org/) & [NMC](https://neuromatch.io/) are always happy to have more people! 67 | - Join a community or hackerspace 68 | - [BrainHack.org](https://brainhack.org/) 69 | - Meetup 70 | - [Hackerspaces](https://wiki.hackerspaces.org/w/index.php) 71 | - [PyLadies](https://www.pyladies.com/) 72 | 73 | # Become a wizard! 74 | 75 | ![zine by Julia Evans, released under CC-BY-NC-SA 4.0 license](../figures/wizard.png){height=220px} 76 | 77 | # Wizard! 78 | 79 | - Great zine about how to become a wizard from [Julia Evans](https://www.twitter.com/b0rk) 80 | - [Zine link](https://wizardzines.com/comics/take-on-hard-projects/) 81 | 82 | # You are never finished learning! 83 | 84 | ![](../figures/reproducible_research.png) 85 | 86 | # Acknowledgement 87 | 88 | Thanks to the reviewers, Tyler Sloan and Elizabeth DuPre who made this talk much better. 89 | 90 | # You can accomplish anything! 91 | 92 | ![](../figures/nma.png) 93 | 94 | # Lesson 5 95 | 96 | * Work with better people than you 97 | * It's a bit of a stretch to make this work with the theme of WM & LTM 98 | * The most important point: grow with people 99 | * Your 5-minute exercise: schedule one pair-programming session 100 | -------------------------------------------------------------------------------- /docs/slides/99-standalone-testing.md: -------------------------------------------------------------------------------- 1 | % Testing: standalone lecture 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # 6 | 7 | Intro 8 | 9 | # Who is this lecture for? 10 | 11 | ![](../figures/tweet.png) 12 | 13 | # Who is this lecture for? 14 | 15 | * Most people who do coding-heavy research are not trained in CS or software engineering 16 | * You're probably in this bucket 17 | * Bad consequences: 18 | * You feel like you don't know what you're doing 19 | * Imposter syndrome 20 | * Low productivity 21 | * Bugs 22 | * You hate your code and you don't want to work on it 23 | * You never graduate 24 | * You have great sadness in your heart 25 | * It doesn't have to be all bad! 
26 | 27 | # My weird perspective 28 | 29 | * Patrick Mineault, PhD in neuroscience 30 | * (wildly underqualified) software engineer at Google 31 | * Research scientist at Facebook on brain-computer interfaces 32 | * Technical chair of Neuromatch Academy 33 | * Independent researcher and technologist 34 | * Occasionally taught CS 35 | 36 | # Regrets, I've had a few 37 | 38 | * Mostly self-taught in programming 39 | * Didn't study CS until very late 40 | * Wasted months working with bad code of my own making 41 | * Not a great coder, but better than in grad school 42 | * I think you might be curious 43 | 44 | # The single most useful skill 45 | 46 | Testing 47 | 48 | # Organization 49 | 50 | * Assume that you know a little bit about [Python](https://swcarpentry.github.io/python-novice-inflammation/), [git](http://swcarpentry.github.io/git-novice/) and the [command line](http://swcarpentry.github.io/shell-novice/) 51 | * You can catch up on these topics via Software Carpentries 52 | * I don't expect you to have any experience in packaging code, distribution, working in groups. 53 | * This is a subset of a longer series of lectures which you can refer to 54 | * https://github.com/patrickmineault/research_code 55 | * One day I will record the whole lecture set and maybe run it live 56 | * Interrupt me and chat! 57 | * Learning objectives for this lecture 58 | * What is testing? 59 | * What should I test? 60 | * How can I test? 61 | * How can I integrate testing into a project I'm doing right now? 62 | 63 | 64 | # Bulding around testing 65 | 66 | > Most scientists who write software constantly test their code. That is, if you are a scientist writing software, I am sure that you have tried to see how well your code works by running every new function you write, examining the inputs and the outputs of the function, to see if the code runs properly (without error), and to see whether the results make sense. Automated code testing takes this informal practice, makes it formal, and automates it, so that you can make sure that your code does what it is supposed to do, even as you go about making changes around it. --Ariel Rokem, Shablona README 67 | 68 | # Open discussion 69 | 70 | * Let's say we have a function in `fib.py`: 71 | 72 | ```{.python} 73 | def fib(n): 74 | if n >= 2: 75 | return fib(n-2) + fib(n-1) 76 | else: 77 | return 1 78 | ``` 79 | 80 | * Let's test `fib.py` 81 | * What can we test? 82 | 83 | # What can we test about `fib`? 84 | 85 | * Correctness, e.g. $F(4) = 5$ 86 | * Edge cases, e.g. $F(0) = 1$, $F(-1)$ → *error* 87 | 88 | # How can you decide what to test? 89 | 90 | * If something caused a bug, test it 91 | * 70% of bugs will be old bugs that keep reappearing 92 | * If you manually checked if procedure X yielded reasonable results, write a test for it. 93 | 94 | # What will this give me? 95 | 96 | * Decrease bugs: You'll uncover bugs which you'll fix immediately 97 | * Peace of mind: You'll know that your code is correct 98 | * Easy refactors: If you change your code you can easily find out if it's still correct 99 | * Docs: You will know how to call your code long after you've stopped working on it actively 100 | * Better code: If you write your code to be testable you'll write better-organized code 101 | 102 | # How can we test? 
103 | 104 | * `assert` 105 | * Hide code behind `if __name__ == '__main__'` 106 | * Test suite 107 | 108 | # `assert` 109 | 110 | * `assert` throws an error if the assertion is False 111 | 112 | ```assert -(7 // 2) == (-7 // 2)``` 113 | 114 | * Great for inline tests 115 | * e.g. check whether the shape of a matrix is correct after a permute operation 116 | 117 | # Hide code behind `if __name__ == '__main__'` 118 | 119 | * Code behind `__name__ == '__main__'` is only run if you run the file as a script directly. 120 | * Use this for lightweight tests in combination with `assert`. 121 | 122 | ```{.python} 123 | if __name__ == '__main__': 124 | assert fib(4) == 5 125 | ``` 126 | 127 | # Use a test suite 128 | 129 | * Create a specialized file with tests that run with the help of a runner. 130 | * There's `pytest` and `unittest`. 131 | * I use `unittest` because that's what I learned, and it's built-in, but people like `pytest` a lot. 132 | 133 | # Basic template 134 | 135 | ```{.python} 136 | # test_something.py 137 | import unittest 138 | 139 | class MyTest(unittest.TestCase): 140 | def sample_test(self): 141 | self.assertTrue(True) 142 | 143 | if __name__ == '__main__': 144 | unittest.main() 145 | ``` 146 | 147 | # Run it 148 | 149 | ```{.shell} 150 | $ python test_something.py 151 | ``` 152 | 153 | To run all tests within a directory, install nose via `pip install nose2`, then: 154 | 155 | ```{.shell} 156 | $ nose2 157 | ``` 158 | 159 | # Live coding 160 | 161 | Let's code up `fib.py` tests! 162 | 163 | # Points from live coding example 164 | 165 | * Paths! 166 | * Sometimes you can get away with hacking `sys.path` 167 | * Ideally, set up a package with `pip install -e .` 168 | * There's a lot of cruft in writing tests: no shame in copy and paste (but do it once from scratch)! 169 | 170 | # A hierarchy of tests can be run with a runner 171 | 172 | * Static tests (literally your editor parsing your code to figure out if it will crash) 173 | * Asserts 174 | * Unit tests (test one function = one unit; what we just saw) 175 | * Integration tests 176 | * Smoke tests (does it crash?) 177 | * Regression tests 178 | * E2E (literally a robot clicking buttons) 179 | 180 | # Write lots of tiny unit tests that run very quickly 181 | 182 | * Goal: each unit test should run in 1 ms. 183 | * The faster you iterate, the better 184 | * If your test suite takes more than 5 seconds to run, you will be tempted to go do something else. 185 | 186 | # Open discussion 187 | 188 | Q: what do you think is the ratio of test code to real code in a real codebase? 189 | 190 | # Open discussion 191 | 192 | A: 1:1 to 3:1, but can be many, many times that in safety critical applications 193 | 194 | e.g. the aviation standard DO-178C requires 100% code coverage (percentage of lines of code called by the tests) at its third highest safety level (Level C). 195 | 196 | For more down-to-earth applications, 80% code coverage is a common target. [You can use the `Coverage.py` package to figure out your test coverage](https://coverage.readthedocs.io/en/coverage-5.3.1/). 197 | 198 | # Demo 199 | 200 | Let's code up a non-trivial set of tests for a real paper. 201 | 202 | # Background on centered kernel alignment 203 | 204 | Q: How can we compare how different brain areas and artificial neural networks represent the world? 205 | 206 | A: Choose a standard battery of stimuli, measure responses across systems, compare the responses between the systems. Many approaches, including: 207 | 208 | * forward encoding models (e.g. 
ridge regression) 209 | * canonical correlation analysis (CCA) 210 | * representational similarity analysis (RSA). 211 | 212 | # CKA 213 | 214 | [Kornblith et al. (2019)](https://arxiv.org/abs/1905.00414) propose a new method to compare representations. You can think of it as a generalization of the (square of the) Pearson correlation coefficient, but with matrices instead of vectors. 215 | 216 | ![Alignment between layers of two neural nets initialized with different seeds](../figures/cka_example.png){height=100px} 217 | 218 | Importantly, CKA is not implemented in scipy or sklearn, github gives very few hits ^[1]... it's real research code! 219 | 220 | [1] [There is an implementation in a notebook from authors](https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb) 221 | 222 | # What we know about CKA 223 | 224 | * Pearson correlation: If $\mathbf{X}$ and $\mathbf{Y}$ are one-dimensional, then $CKA = \rho( \mathbf X, \mathbf Y)^2$. 225 | * Only makes sense if two matrices are the same size along the first dimension 226 | * $CKA(\mathbf X, \mathbf X) = 1$ 227 | 228 | # Live coding 229 | 230 | Note: to follow at home, look at `cka_step3.py` and `tests/test_cka_step3.py`. 231 | 232 | 233 | # Points from live coding example 234 | 235 | * Your test code can be ugly, as long as it's functional! 236 | * Define boundary conditions, pathological examples 237 | * Test that bad inputs indeed raise errors! Your code should yell when you feed it bad inputs. 238 | * Lock in current behaviour for regression testing 239 | * E.g. we implement a different, faster implementation of CKA in `cka_step4.py` and regression test it in `test_cka_step4.py`. 240 | 241 | # Refactoring with confidence 242 | 243 | * Your code is ugly: time to refactor! 244 | 1. Your code is ugly, tests pass 245 | 2. Rewrite the code 246 | 3. Your code is clean, tests don't pass 247 | 4. Rewrite the code 248 | 5. Iterate until tests pass again 249 | * Much less stressful with tests and git 250 | * Focus on one test at a time with `python test_cka_step3.py TestCka.test_same` 251 | * Don't forget to run the whole suite at the end! 252 | 253 | 254 | # Advanced topics! 255 | 256 | Testing deterministic side-effect free computational code has a very high returns:effort ratio, but... 257 | 258 | * [You can also test data loaders for correctness](https://github.com/patrickmineault/brain-scorer/blob/main/tests/test_pvc4_loader.py). 259 | * [You can also test data for correctness](https://github.com/patrickmineault/phaco-meta/blob/master/read-data.R#L320) 260 | * [You can also test notebooks for correctness](https://github.com/NeuromatchAcademy/course-content/blob/master/ci/verify_exercises.py#L56) 261 | * [You can integrate your tests into Github](https://github.com/patrickmineault/research_code/runs/1647753165?check_suite_focus=true) 262 | * [This presentation's repo has CI](https://github.com/patrickmineault/research_code/actions)! It's completely unnecessary! 
263 | * [You can test stochastic functions](https://softwareengineering.stackexchange.com/questions/133047/unit-testing-of-inherently-random-non-deterministic-algorithms?rq=1) 264 | 265 | # Lesson 3 266 | 267 | * Test your code 268 | * Your 5-minute assignment: find a commented-out `print` statement in your code and replace it with `assert` -------------------------------------------------------------------------------- /docs/slides/Makefile: -------------------------------------------------------------------------------- 1 | OUTDIR := pdf 2 | 3 | # This pattern excludes README.md 4 | MD_FILES=$(wildcard *-*.md) 5 | OUT_FILES=$(patsubst %.md, $(OUTDIR)/%.pdf, $(MD_FILES)) 6 | 7 | all : directories $(OUT_FILES) 8 | 9 | .PHONY : clean 10 | clean : 11 | rm -f $(OUTDIR)/*.pdf 12 | 13 | $(OUTDIR)/%.pdf : %.md preamble.tex 14 | pandoc -t beamer -s $< -o $@ -H preamble.tex 15 | 16 | directories : $(OUTDIR) 17 | 18 | $(OUTDIR): 19 | mkdir -p $(OUTDIR) -------------------------------------------------------------------------------- /docs/slides/README.md: -------------------------------------------------------------------------------- 1 | # Slides for writing good research code good 2 | 3 | * [Introduction and keeping things tidy](pdf/01-intro.pdf) 4 | * [Decoupling code](pdf/02-decouple.pdf) 5 | * [Testing](pdf/03-testing.pdf) 6 | * [Documentation](pdf/04-docs.pdf) 7 | * [Make it social](pdf/05-social.pdf) 8 | 9 | # References 10 | 11 | ## Reading 12 | 13 | - Research software engineering: [https://merely-useful.github.io/py-rse/](https://merely-useful.github.io/py-rse/) 14 | - Making packages and testing [https://education.molssi.org/python-package-best-practices/index.html](https://education.molssi.org/python-package-best-practices/index.html) 15 | - Carpentries testing Python: [http://carpentries-incubator.github.io/python-testing/](http://carpentries-incubator.github.io/python-testing/) 16 | - Shablona template: [https://github.com/uwescience/shablona](https://github.com/uwescience/shablona) 17 | - The Turing Way: [https://the-turing-way.netlify.app/reproducible-research/code-quality.html](https://the-turing-way.netlify.app/reproducible-research/code-quality.html) 18 | - Software engineering best practices: [http://www.bris.ac.uk/acrc/acrc-training/](http://www.bris.ac.uk/acrc/acrc-training/) 19 | - Data science in practice paper: [https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725](https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725) 20 | - Software engineering for data scientists: [http://uwseds.github.io/](http://uwseds.github.io/) 21 | 22 | ## Media 23 | 24 | - Software engineering for research: [https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title](https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title) 25 | - Test and code for scientists (podcast): [https://testandcode.com/140](https://testandcode.com/140) 26 | 27 | ## Inspiration 28 | 29 | * The Zen of Python: https://zen-of-python.info/ 30 | 31 | ## Tools 32 | 33 | * [IDEs for scientific Python](https://xcorr.net/2013/04/17/evaluating-ides-for-scientific-python/) 34 | 35 | # Compiling these slides 36 | 37 | Slides can be compiled with `make all`. 
Requires pandoc: 38 | 39 | ``` 40 | sudo apt-get install pandoc texlive texlive-latex-extra 41 | pip install pandoc-latex-fontsize 42 | ``` -------------------------------------------------------------------------------- /docs/slides/pdf/01-intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/01-intro.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/02-decouple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/02-decouple.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/03-testing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/03-testing.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/04-docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/04-docs.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/05-social.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/05-social.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/99-standalone-testing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/99-standalone-testing.pdf -------------------------------------------------------------------------------- /docs/slides/preamble.tex: -------------------------------------------------------------------------------- 1 | %Found here: https://github.com/alexeygumirov/pandoc-beamer-how-to/blob/master/pandoc/templates/preamble.tex 2 | %%%%%%%%%%%%%color 3 | \definecolor{UBCblue}{rgb}{0.04706, 0.13725, 0.26667} % UBC Blue (primary) 4 | \definecolor{UBCgrey}{rgb}{0.3686, 0.5255, 0.6235} % UBC Grey (secondary) 5 | 6 | \definecolor{orange}{RGB}{244,167,66} 7 | 8 | \setbeamercolor{palette primary}{bg=UBCblue,fg=white} 9 | \setbeamercolor{palette secondary}{bg=UBCblue,fg=white} 10 | \setbeamercolor{palette tertiary}{bg=UBCblue,fg=white} 11 | \setbeamercolor{palette quaternary}{bg=UBCblue,fg=white} 12 | \setbeamercolor{structure}{fg=UBCblue} % itemize, enumerate, etc 13 | \setbeamercolor{section in toc}{fg=UBCblue} % TOC sections 14 | 15 | %% change circle miniframes color 16 | \setbeamercolor{mini frame}{fg=orange, bg=UBCblue} 17 | 18 | %% Change subsection in footer color (author and institute color) 19 | \setbeamercolor{subsection in head/foot}{bg=UBCgrey,fg=white} 20 | 21 | %change ilmenau section dot color 22 | \setbeamercolor{section in head/foot}{fg=orange} 23 | 24 | %% Change the circle in miniframes to a box 25 | %\setbeamertemplate{mini frame}[box] 26 | %\setbeamertemplate{mini frame in current subsection}[box] 27 | 28 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | %%%% Create framenumber in footer 30 | \newcommand{\frameofframes}{/} 31 | \newcommand{\setframeofframes}[1]{\renewcommand{\frameofframes}{#1}} 32 | 33 | \setframeofframes{of} 34 | \makeatletter 35 | \setbeamertemplate{footline} 36 | {% 37 | \begin{beamercolorbox}[colsep=1.5pt]{upper separation line foot} 38 | \end{beamercolorbox} 39 | \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,% 40 | leftskip=.3cm,rightskip=.3cm plus1fil]{author in head/foot}% 41 | \leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor}% 42 | \hfill% 43 | {\usebeamerfont{institute in head/foot}\usebeamercolor[fg]{institute in head/foot}\insertshortinstitute}% 44 | \end{beamercolorbox}% 45 | \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,% 46 | leftskip=.3cm,rightskip=.3cm plus1fil]{title in head/foot}% 47 | {\usebeamerfont{title in head/foot}\insertshorttitle}% 48 | \hfill% 49 | {\usebeamerfont{frame number}\usebeamercolor[fg]{frame number}\insertframenumber~\frameofframes~\inserttotalframenumber} 50 | \end{beamercolorbox}% 51 | \begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot} 52 | \end{beamercolorbox} 53 | } 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | %%% Remove subsection bar in mini frames 56 | \defbeamertemplate*{headline}{miniframes theme no subsection} 57 | {% 58 | \begin{beamercolorbox}[colsep=1.5pt]{upper separation line head} 59 | \end{beamercolorbox} 60 | \begin{beamercolorbox}{section in head/foot} 61 | \vskip2pt\insertnavigation{\paperwidth}\vskip2pt 62 | \end{beamercolorbox}% 63 | \begin{beamercolorbox}[colsep=1.5pt]{lower separation line head} 64 | \end{beamercolorbox} 65 | } 66 | 67 | \setbeamertemplate{footline}[miniframes theme no subsection] 68 | %%%%%%%%%%%%%%%%%%%%%%%%%%% 69 | \makeatother -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | matplotlib 3 | seaborn 4 | torch -------------------------------------------------------------------------------- /research_code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/research_code/__init__.py -------------------------------------------------------------------------------- /research_code/cka_not_great.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import os 4 | import pickle 5 | 6 | 7 | """Run a linear centered-kernel alignment (CKA) to find how close two 8 | latent representations of 9 | 10 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 11 | 12 | Load data from ../data and compare them. All the matrices in 13 | ../data/matrices.pkl have the same height (number of examplars) but 14 | potentially different numbers of columns. 
15 | """ 16 | f = open('../data/matrices.pkl', 'rb') 17 | data = pickle.load(f)['reps'] 18 | 19 | cka = np.zeros((5, 5)) 20 | for i in range(5): 21 | for j in range(i+1, 5): 22 | X, Y = data[i], data[j] 23 | X = (X - X.mean(0).reshape((1, -1))) 24 | Y = (Y - Y.mean(0).reshape((1, -1))) 25 | 26 | XTX = X.T.dot(X) 27 | YTY = Y.T.dot(Y) 28 | YTX = Y.T.dot(X) 29 | 30 | # Equation (4) 31 | cka[i, j] = (YTX ** 2).sum() / np.sqrt((XTX * XTX).sum() * (YTY * YTY).sum()) 32 | 33 | cka = cka + cka.T 34 | cka = cka + np.eye(cka.shape[0]) 35 | 36 | plt.figure() 37 | plt.imshow(cka) 38 | plt.colorbar() 39 | plt.xticks([0, 1, 2, 3, 4], ['baseline', 'rescaled', 'nuisance', 'truncated1', 'truncated2']) 40 | plt.yticks([0, 1, 2, 3, 4], ['baseline', 'rescaled', 'nuisance', 'truncated1', 'truncated2']) 41 | plt.title('Similarity of different representations (CKA)') 42 | 43 | # os.makedirs('../results') 44 | plt.savefig('../results/closeness.png') 45 | 46 | -------------------------------------------------------------------------------- /research_code/cka_step2.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import seaborn as sns 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | 10 | 11 | def multi_cka(reps: List[np.array]) -> np.array: 12 | """ 13 | Calculate CKA matrix for a list of matrices. 14 | 15 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 16 | 17 | Args: 18 | reps: a list of representations of the same data from different 19 | networks. All have the same height (number of examplars) but 20 | potentially different numbers of columns. 21 | 22 | Returns: 23 | the CKA matrix (larger values mean more similar). 24 | """ 25 | C = np.zeros((len(reps), len(reps))) 26 | for i in range(len(reps)): 27 | C[i, i] = 1.0 # by definition 28 | for j in range(i+1, len(reps)): 29 | X, Y = reps[i], reps[j] 30 | X = X - X.mean(0, keepdims=True) 31 | Y = Y - Y.mean(0, keepdims=True) 32 | 33 | XTX = X.T @ X 34 | YTY = Y.T @ Y 35 | YTX = Y.T @ X 36 | 37 | # Equation (4) 38 | top = (YTX ** 2).sum() 39 | bottom = np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 40 | c = top / bottom 41 | C[i, j] = c 42 | C[j, i] = c 43 | 44 | return C 45 | 46 | def main(): 47 | with open ('../data/matrices.pkl', 'rb') as f: 48 | data = pickle.load(f) 49 | 50 | C = multi_cka(data['reps']) 51 | 52 | df = pd.DataFrame(C) 53 | df.index = data['models'] 54 | df.columns = data['models'] 55 | 56 | ax = sns.heatmap(df, annot=True, fmt='.2f') 57 | try: 58 | os.makedirs('../results') 59 | except FileExistsError: 60 | pass 61 | 62 | plt.title('Similarity of different representations (CKA)') 63 | plt.savefig('../results/closeness_sns.png') 64 | 65 | if __name__ == "__main__": 66 | main() 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /research_code/cka_step3.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import seaborn as sns 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | 10 | def cka(X, Y): 11 | """ 12 | Calculate CKA for two matrices 13 | """ 14 | X = X - X.mean(0, keepdims=True) 15 | Y = Y - Y.mean(0, keepdims=True) 16 | 17 | XTX = X.T @ X 18 | YTY = Y.T @ Y 19 | YTX = Y.T @ X 20 | 21 | # Equation (4) 22 | top = (YTX ** 2).sum() 23 | bottom = np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 24 | c = top / bottom 
25 | 26 | return c 27 | 28 | 29 | def multi_cka(reps: List[np.array]) -> np.array: 30 | """ 31 | Calculate CKA matrix for a list of matrices. 32 | 33 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 34 | 35 | Args: 36 | reps: a list of representations of the same data from different 37 | networks. All have the same height (number of examplars) but 38 | potentially different numbers of columns. 39 | 40 | Returns: 41 | the CKA matrix (larger values mean more similar). 42 | """ 43 | C = np.zeros((len(reps), len(reps))) 44 | for i in range(len(reps)): 45 | C[i, i] = 1.0 # by definition 46 | for j in range(i+1, len(reps)): 47 | c = cka(reps[i], reps[j]) 48 | 49 | C[i, j] = c 50 | C[j, i] = c 51 | 52 | return C 53 | 54 | def main(): 55 | with open ('../data/matrices.pkl', 'rb') as f: 56 | data = pickle.load(f) 57 | 58 | C = multi_cka(data['reps']) 59 | 60 | df = pd.DataFrame(C) 61 | df.index = data['models'] 62 | df.columns = data['models'] 63 | 64 | ax = sns.heatmap(df, annot=True, fmt='.2f') 65 | try: 66 | os.makedirs('../results') 67 | except FileExistsError: 68 | pass 69 | 70 | plt.title('Similarity of different representations (CKA)') 71 | plt.savefig('../results/closeness_sns.png') 72 | 73 | if __name__ == "__main__": 74 | main() 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /research_code/cka_step4.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import seaborn as sns 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | 10 | 11 | def cka_wide(X, Y): 12 | """ 13 | Calculate CKA for two matrices. This algorithm uses a Gram matrix 14 | implementation, which is fast when the data is wider than it is 15 | tall. 16 | 17 | This implementation is inspired by the one in this colab: 18 | https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb#scrollTo=MkucRi3yn7UJ 19 | 20 | Note that we use center the features rather than the Gram matrix 21 | because we think the latter is tricky and mysterious. It only works for 22 | linear CKA though (we only implement linear CKA throughout). 23 | """ 24 | X = X - X.mean(0, keepdims=True) 25 | Y = Y - Y.mean(0, keepdims=True) 26 | 27 | XXT = X @ X.T 28 | YYT = Y @ Y.T 29 | 30 | # We use reshape((-1,)) instead of ravel() to ensure this is compatible 31 | # with numpy and pytorch tensors. 32 | top = (XXT.reshape((-1,)) * YYT.reshape((-1,))).sum() 33 | bottom = np.sqrt((XXT ** 2).sum() * (YYT ** 2).sum()) 34 | c = top / bottom 35 | 36 | return c 37 | 38 | 39 | def cka_tall(X, Y): 40 | """ 41 | Calculate CKA for two matrices. 42 | """ 43 | X = X - X.mean(0, keepdims=True) 44 | Y = Y - Y.mean(0, keepdims=True) 45 | 46 | XTX = X.T @ X 47 | YTY = Y.T @ Y 48 | YTX = Y.T @ X 49 | 50 | # Equation (4) 51 | top = (YTX ** 2).sum() 52 | bottom = np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 53 | c = top / bottom 54 | 55 | return c 56 | 57 | def cka(X, Y): 58 | """ 59 | Calculate CKA for two matrices. 60 | 61 | CKA has several potential implementations. The naive implementation is 62 | appropriate for tall matrices (more examples than features), but this 63 | implementation uses lots of memory and it slow when there are many more 64 | features than examples. In that case, which often happens with DNNs, we 65 | prefer the Gram matrix variant. 
66 | """ 67 | 68 | if X.shape[0] < X.shape[1]: 69 | return cka_wide(X, Y) 70 | else: 71 | return cka_tall(X, Y) 72 | 73 | 74 | def multi_cka(reps: List[np.array]) -> np.array: 75 | """ 76 | Calculate CKA matrix for a list of matrices. 77 | 78 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 79 | 80 | Args: 81 | reps: a list of representations of the same data from different 82 | networks. All have the same height (number of examplars) but 83 | potentially different numbers of columns. 84 | 85 | Returns: 86 | the CKA matrix (larger values mean more similar). 87 | """ 88 | C = np.zeros((len(reps), len(reps))) 89 | for i in range(len(reps)): 90 | C[i, i] = 1.0 # by definition 91 | for j in range(i+1, len(reps)): 92 | c = cka(reps[i], reps[j]) 93 | 94 | C[i, j] = c 95 | C[j, i] = c 96 | 97 | return C 98 | 99 | def main(): 100 | with open ('../data/matrices.pkl', 'rb') as f: 101 | data = pickle.load(f) 102 | 103 | C = multi_cka(data['reps']) 104 | 105 | df = pd.DataFrame(C) 106 | df.index = data['models'] 107 | df.columns = data['models'] 108 | 109 | ax = sns.heatmap(df, annot=True, fmt='.2f') 110 | try: 111 | os.makedirs('../results') 112 | except FileExistsError: 113 | pass 114 | 115 | plt.title('Similarity of different representations (CKA)') 116 | plt.savefig('../results/closeness_sns.png') 117 | 118 | if __name__ == "__main__": 119 | main() 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /research_code/fib.py: -------------------------------------------------------------------------------- 1 | def memoize(fun): 2 | """Memoizes a function of one argument.""" 3 | the_dict = {} 4 | def wrapper_decorator(*args, **kwargs): 5 | assert len(args) == 1, "Only works with one argument" 6 | if args[0] not in the_dict: 7 | the_dict[args[0]] = fun(args[0]) 8 | return the_dict[args[0]] 9 | return wrapper_decorator 10 | 11 | 12 | @memoize 13 | def fib(n): 14 | """Calculates the n'th fibonacci number (memo-ized version). 15 | 16 | Args: 17 | n: Which Fibonacci number to return 18 | 19 | Returns: the n'th Fibonacci number. 
20 | """ 21 | if n >= 2: 22 | return fib(n-2) + fib(n-1) 23 | else: 24 | return 1 25 | -------------------------------------------------------------------------------- /research_code/fib_and_test.py: -------------------------------------------------------------------------------- 1 | def fib(n): 2 | if n >= 2: 3 | return fib(n-1) + fib(n-2) 4 | else: 5 | assert n != 2 6 | return 1 7 | 8 | if __name__ == '__main__': 9 | print("Tests running") 10 | assert fib(0) == 1 # expect 1 11 | assert fib(2) == 2 # 2 12 | assert fib(4) == 5 # 5 13 | print("Tests passed") -------------------------------------------------------------------------------- /research_code/fib_monolithic.py: -------------------------------------------------------------------------------- 1 | memory = {} 2 | 3 | """ 4 | The N'th Fibonacci number is 5 | 6 | F(n) = F(n-1) + F(n-2) 7 | 8 | with F(0) = 1, F(1) = 1 9 | """ 10 | 11 | def fib(n): 12 | global memory 13 | if n not in memory: 14 | if n >= 2: 15 | memory[n] = fib(n-2) + fib(n-1) 16 | else: 17 | memory[n] = 1 18 | return memory[n] -------------------------------------------------------------------------------- /research_code/tests/test_cka_step3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | 4 | from research_code.cka_step3 import cka 5 | 6 | def _get_one(): 7 | X = np.cos(.1 * np.pi * np.arange(10)).reshape((-1, 1)) 8 | Y = np.cos(2 + .07 * np.pi * np.arange(10)).reshape((-1, 1)) 9 | return X, Y 10 | 11 | def _get_multi(): 12 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=3).reshape((1, -1))) 13 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=4).reshape((1, -1))) 14 | return X, Y 15 | 16 | def _get_wide(): 17 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=50).reshape((1, -1))) 18 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=47).reshape((1, -1))) 19 | return X, Y 20 | 21 | class TestCka(unittest.TestCase): 22 | 23 | @unittest.expectedFailure 24 | def test_wrong_dim(self): 25 | """It should throw an error if we have a different number of stimuli""" 26 | X = np.ones((8, 1)) 27 | Y = np.ones((10, 1)) 28 | cka(X, Y) 29 | 30 | def test_same(self): 31 | """The CKA of a matrix and itself is one""" 32 | X, _ = _get_one() 33 | self.assertAlmostEqual(cka(X, X), 1) 34 | 35 | def test_corr(self): 36 | """The CKA of two vectors is the square of the correlation coefficient""" 37 | X, Y = _get_one() 38 | c1 = cka(X, Y) 39 | c2 = np.corrcoef(X.squeeze(), Y.squeeze())[0, 1] ** 2 40 | self.assertAlmostEqual(c1, c2) 41 | 42 | def test_isoscaling(self): 43 | """CKA is insensitive to scaling by a scalar""" 44 | X, Y = _get_multi() 45 | c1 = cka(X, Y) 46 | c2 = cka(2.0 * X, - 1 * Y) 47 | self.assertAlmostEqual(c1, c2) 48 | 49 | def test_rotation(self): 50 | """CKA is insensitive to rotations""" 51 | X, Y = _get_multi() 52 | X0 = X[:, :2] 53 | X0p = X0 @ np.array([[1, -1], [1, 1]]) / np.sqrt(2) 54 | c1 = cka(X0, Y) 55 | c2 = cka(X0p, Y) 56 | self.assertAlmostEqual(c1, c2) 57 | 58 | def test_no_iso(self): 59 | """CKA is sensitive to column scaling""" 60 | X, Y = _get_multi() 61 | X0 = X[:, :2] 62 | X0p = X0 @ np.array([[1, 1], [10, 1]]) 63 | c1 = cka(X0, Y) 64 | c2 = cka(X0p, Y) 65 | self.assertGreater(abs(c1 - c2), .001) 66 | 67 | def test_value(self): 68 | """Regression test: for this particular input, check that the value 69 | is the same as it always 
was.""" 70 | X, Y = _get_multi() 71 | c1 = cka(X, Y) 72 | self.assertAlmostEqual(c1, 0.96577, places=4) 73 | 74 | def test_wide(self): 75 | """Smoke test.""" 76 | X, Y = _get_wide() 77 | c1 = cka(X, Y) 78 | 79 | if __name__ == '__main__': 80 | unittest.main() -------------------------------------------------------------------------------- /research_code/tests/test_cka_step4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | from research_code.cka_step4 import cka, cka_tall, cka_wide 4 | from research_code.cka_step3 import cka as old_cka 5 | 6 | def _get_one(): 7 | X = np.cos(.1 * np.pi * np.arange(10)).reshape((-1, 1)) 8 | Y = np.cos(2 + .07 * np.pi * np.arange(10)).reshape((-1, 1)) 9 | return X, Y 10 | 11 | def _get_multi(): 12 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=3).reshape((1, -1))) 13 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=4).reshape((1, -1))) 14 | return X, Y 15 | 16 | def _get_wide(): 17 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=50).reshape((1, -1))) 18 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=47).reshape((1, -1))) 19 | return X, Y 20 | 21 | class TestCka(unittest.TestCase): 22 | 23 | @unittest.expectedFailure 24 | def test_wrong_dim(self): 25 | """It should throw an error if we have a different number of stimuli""" 26 | X = np.ones((8, 1)) 27 | Y = np.ones((10, 1)) 28 | cka(X, Y) 29 | 30 | def test_same(self): 31 | """The CKA of a matrix and itself is one""" 32 | X, _ = _get_one() 33 | self.assertAlmostEqual(cka(X, X), 1) 34 | 35 | def test_corr(self): 36 | """The CKA of two vectors is the square of the correlation coefficient""" 37 | X, Y = _get_one() 38 | c1 = cka(X, Y) 39 | c2 = np.corrcoef(X.squeeze(), Y.squeeze())[0, 1] ** 2 40 | self.assertAlmostEqual(c1, c2) 41 | 42 | def test_isoscaling(self): 43 | """CKA is insensitive to scaling by a scalar""" 44 | X, Y = _get_multi() 45 | c1 = cka(X, Y) 46 | c2 = cka(2.0 * X, - 1 * Y) 47 | self.assertAlmostEqual(c1, c2) 48 | 49 | def test_rotation(self): 50 | """CKA is insensitive to rotations""" 51 | X, Y = _get_multi() 52 | X0 = X[:, :2] 53 | X0p = X0 @ np.array([[1, -1], [1, 1]]) / np.sqrt(2) 54 | c1 = cka(X0, Y) 55 | c2 = cka(X0p, Y) 56 | self.assertAlmostEqual(c1, c2) 57 | 58 | def test_no_iso(self): 59 | """CKA is sensitive to column scaling""" 60 | X, Y = _get_multi() 61 | X0 = X[:, :2] 62 | X0p = X0 @ np.array([[1, 1], [10, 1]]) 63 | c1 = cka(X0, Y) 64 | c2 = cka(X0p, Y) 65 | self.assertGreater(abs(c1 - c2), .001) 66 | 67 | def test_value(self): 68 | """Regression test: for this particular input, check that the value 69 | is the same as it always was.""" 70 | X, Y = _get_multi() 71 | c1 = cka(X, Y) 72 | self.assertAlmostEqual(c1, 0.96577, places=4) 73 | 74 | def test_wide(self): 75 | """Smoke test.""" 76 | X, Y = _get_wide() 77 | c1 = cka(X, Y) 78 | 79 | def test_consistent(self): 80 | """Regression test: check that the old implementation gives the same 81 | results as the new implementation.""" 82 | X, Y = _get_wide() 83 | c1 = cka(X, Y) 84 | c2 = old_cka(X, Y) 85 | 86 | self.assertNotEqual(c1, c2) 87 | self.assertAlmostEqual(c1, c2) 88 | 89 | def test_tall_wide(self): 90 | """Check that both implementations gives the same results""" 91 | X, Y = _get_wide() 92 | c1 = cka_tall(X, Y) 93 | c2 = cka_wide(X, Y) 94 | 95 | self.assertNotEqual(c1, c2) 96 | 
self.assertAlmostEqual(c1, c2) 97 | 98 | def test_torch(self): 99 | """Check that this also works if the input is a pytorch tensor""" 100 | # We put the import inside the function so the whole test suite doesn't 101 | # crash if we don't have pytorch installed. 102 | import torch 103 | 104 | X, Y = _get_wide() 105 | X, Y = torch.tensor(X), torch.tensor(Y) 106 | c1 = cka_tall(X, Y) 107 | c2 = cka_wide(X, Y) 108 | 109 | self.assertNotEqual(c1.item(), c2.item()) 110 | self.assertAlmostEqual(c1.item(), c2.item()) 111 | 112 | if __name__ == '__main__': 113 | unittest.main() -------------------------------------------------------------------------------- /research_code/tests/test_fib.py: -------------------------------------------------------------------------------- 1 | import time 2 | import unittest 3 | from research_code import fib 4 | 5 | class TestFib(unittest.TestCase): 6 | def test_fib(self): 7 | # 1, 1, 2, 3, 5, etc. 8 | self.assertEqual(fib.fib(0), 1) 9 | self.assertEqual(fib.fib(2), 2) 10 | self.assertEqual(fib.fib(4), 5) 11 | 12 | def test_fib_big(self): 13 | self.assertEqual(fib.fib(99), 354_224_848_179_261_915_075) 14 | 15 | def test_memoization(self): 16 | """Check that the memo-ized version is much faster than the naive.""" 17 | def _fib(n): 18 | if n >= 2: 19 | return _fib(n-2) + _fib(n-1) 20 | else: 21 | return 1 22 | 23 | t0 = time.time() 24 | val = fib.fib(15) 25 | dt = time.time() - t0 26 | 27 | t0 = time.time() 28 | val2 = _fib(15) 29 | dt2 = time.time() - t0 30 | 31 | self.assertEqual(val, val2) 32 | self.assertGreater(dt2, dt * 10) 33 | 34 | if __name__ == "__main__": 35 | unittest.main() -------------------------------------------------------------------------------- /results/closeness.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/results/closeness.png -------------------------------------------------------------------------------- /results/closeness_sns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/results/closeness_sns.png -------------------------------------------------------------------------------- /scripts/Draw dependency graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "source": [ 5 | "Note: graphviz is in requirements.txt, but to get the binaries you'll probably want to install via conda." 
6 | ], 7 | "cell_type": "markdown", 8 | "metadata": {} 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "output_type": "stream", 17 | "name": "stdout", 18 | "text": [ 19 | "digraph {\n\tO [label=\"Open code\"]\n\tV [label=\"Version control\"]\n\tC [label=\"Command line\"]\n\tD [label=\"Open data\"]\n\tR [label=\"Reproducible research\"]\n\tE [label=Environments]\n\tL [label=\"Readable code\"]\n\tT [label=Testing]\n\tI [label=CI]\n\tW [label=\"Code review\"]\n\tS [label=\"Cloud storage\"]\n\tU [label=\"Cloud computing\"]\n\tM [label=Documentation]\n\tP [label=Packaging]\n\tV -> O [label=\"\"]\n\tC -> V [label=\"\"]\n\tO -> D [label=\"\"]\n\tO -> R [label=\"\"]\n\tD -> R [label=\"\"]\n\tE -> R [label=\"\"]\n\tC -> E [label=\"\"]\n\tO -> E [label=\"\"]\n\tO -> W [label=\"\"]\n\tO -> L [label=\"\"]\n\tC -> T [label=\"\"]\n\tT -> I [label=\"\"]\n\tO -> I [label=\"\"]\n\tE -> I [label=\"\"]\n\tW -> L [label=\"\"]\n\tE -> U [label=\"\"]\n\tT -> U [label=\"\"]\n\tV -> U [label=\"\"]\n\tS -> U [label=\"\"]\n\tS -> D [label=\"\"]\n\tM -> P [label=\"\"]\n\tT -> P [label=\"\"]\n\tW -> P [label=\"\"]\n\tO -> P [label=\"\"]\n\tI -> P [label=\"\"]\n\tT -> R [label=\"\"]\n\tL -> M [label=\"\"]\n}\n" 20 | ] 21 | }, 22 | { 23 | "output_type": "execute_result", 24 | "data": { 25 | "text/plain": [ 26 | "'../docs/figures/reproducible_research.pdf'" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "execution_count": 7 31 | } 32 | ], 33 | "source": [ 34 | "from graphviz import Digraph\n", 35 | "\n", 36 | "dot = Digraph(format='png')\n", 37 | "dot.node('O', 'Open code')\n", 38 | "dot.node('V', 'Version control')\n", 39 | "dot.node('C', 'Command line')\n", 40 | "dot.node('D', 'Open data')\n", 41 | "dot.node('R', 'Reproducible research')\n", 42 | "dot.node('E', 'Environments')\n", 43 | "dot.node('L', 'Readable code')\n", 44 | "dot.node('T', 'Testing')\n", 45 | "dot.node('I', 'CI')\n", 46 | "dot.node('W', 'Code review')\n", 47 | "dot.node('S', 'Cloud storage')\n", 48 | "dot.node('U', 'Cloud computing')\n", 49 | "dot.node('M', 'Documentation')\n", 50 | "dot.node('P', 'Packaging')\n", 51 | "\n", 52 | "dot.edge('V', 'O', '')\n", 53 | "dot.edge('C', 'V', '')\n", 54 | "dot.edge('O', 'D', '')\n", 55 | "dot.edge('O', 'R', '')\n", 56 | "dot.edge('D', 'R', '')\n", 57 | "dot.edge('E', 'R', '')\n", 58 | "dot.edge('C', 'E', '')\n", 59 | "dot.edge('O', 'E', '')\n", 60 | "dot.edge('O', 'W', '')\n", 61 | "dot.edge('O', 'L', '')\n", 62 | "dot.edge('C', 'T', '')\n", 63 | "dot.edge('T', 'I', '')\n", 64 | "dot.edge('O', 'I', '')\n", 65 | "dot.edge('E', 'I', '')\n", 66 | "dot.edge('W', 'L', '')\n", 67 | "dot.edge('E', 'U', '')\n", 68 | "dot.edge('T', 'U', '')\n", 69 | "dot.edge('V', 'U', '')\n", 70 | "dot.edge('S', 'U', '')\n", 71 | "dot.edge('S', 'D', '')\n", 72 | "dot.edge('M', 'P', '')\n", 73 | "dot.edge('T', 'P', '')\n", 74 | "dot.edge('W', 'P', '')\n", 75 | "dot.edge('O', 'P', '')\n", 76 | "dot.edge('I', 'P', '')\n", 77 | "dot.edge('T', 'R', '')\n", 78 | "dot.edge('L', 'M', '')\n", 79 | "\n", 80 | "print(dot.source)\n", 81 | "dot.render('../docs/figures/reproducible_research', view=False)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "output_type": "stream", 91 | "name": "stdout", 92 | "text": [ 93 | "digraph {\n\tD [label=\"Create data\"]\n\tT [label=\"Transform data\"]\n\tF [label=\"Fit models\"]\n\tH [label=\"Test Hypotheses\"]\n\tP [label=\"Generate plots\"]\n\tW [label=\"Write and 
publish paper\"]\n\tD -> T [label=\"\"]\n\tT -> F [label=\"\"]\n\tF -> H [label=\"\"]\n\tH -> D [label=\"\"]\n\tH -> P [label=\"\"]\n\tP -> W [label=\"\"]\n}\n" 94 | ] 95 | }, 96 | { 97 | "output_type": "execute_result", 98 | "data": { 99 | "text/plain": [ 100 | "'../docs/figures/lifecycle_simple.pdf'" 101 | ] 102 | }, 103 | "metadata": {}, 104 | "execution_count": 8 105 | } 106 | ], 107 | "source": [ 108 | "from graphviz import Digraph\n", 109 | "\n", 110 | "dot = Digraph(format='png')\n", 111 | "dot.node('D', 'Create data')\n", 112 | "dot.node('T', 'Transform data')\n", 113 | "dot.node('F', 'Fit models')\n", 114 | "dot.node('H', 'Test Hypotheses')\n", 115 | "dot.node('P', 'Generate plots')\n", 116 | "dot.node('W', 'Write and publish paper')\n", 117 | "\n", 118 | "dot.edge('D', 'T', '')\n", 119 | "dot.edge('T', 'F', '')\n", 120 | "dot.edge('F', 'H', '')\n", 121 | "dot.edge('H', 'D', '')\n", 122 | "dot.edge('H', 'P', '')\n", 123 | "dot.edge('P', 'W', '')\n", 124 | "\n", 125 | "print(dot.source)\n", 126 | "dot.render('../docs/figures/lifecycle_simple', view=False)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "output_type": "stream", 136 | "name": "stdout", 137 | "text": [ 138 | "digraph {\n\tD [label=\"Create data\"]\n\tT [label=\"Transform data\"]\n\tF [label=\"Fit models\"]\n\tH [label=\"Test Hypotheses\"]\n\tP [label=\"Generate plots\"]\n\tW [label=\"Write and publish paper\"]\n\tB [label=\"Publish data\"]\n\tC [label=\"Publish code\"]\n\tD -> T [label=\"\"]\n\tT -> F [label=\"\"]\n\tF -> H [label=\"\"]\n\tH -> D [label=\"\"]\n\tH -> P [label=\"\"]\n\tH -> T [label=\"\"]\n\tH -> F [label=\"\"]\n\tP -> W [label=\"\"]\n\tD -> P [label=\"\"]\n\tP -> T [label=\"\"]\n\tP -> F [label=\"\"]\n\tD -> B [label=\"\"]\n\tW -> B [label=\"\"]\n\tW -> C [label=\"\"]\n\tF -> C [label=\"\"]\n}\n" 139 | ] 140 | }, 141 | { 142 | "output_type": "execute_result", 143 | "data": { 144 | "text/plain": [ 145 | "'../docs/figures/lifecycle_complex.pdf'" 146 | ] 147 | }, 148 | "metadata": {}, 149 | "execution_count": 9 150 | } 151 | ], 152 | "source": [ 153 | "from graphviz import Digraph\n", 154 | "\n", 155 | "dot = Digraph(format='png')\n", 156 | "dot.node('D', 'Create data')\n", 157 | "dot.node('T', 'Transform data')\n", 158 | "dot.node('F', 'Fit models')\n", 159 | "dot.node('H', 'Test Hypotheses')\n", 160 | "dot.node('P', 'Generate plots')\n", 161 | "dot.node('W', 'Write and publish paper')\n", 162 | "dot.node('B', 'Publish data')\n", 163 | "dot.node('C', 'Publish code')\n", 164 | "\n", 165 | "dot.edge('D', 'T', '')\n", 166 | "dot.edge('T', 'F', '')\n", 167 | "dot.edge('F', 'H', '')\n", 168 | "dot.edge('H', 'D', '')\n", 169 | "dot.edge('H', 'P', '')\n", 170 | "dot.edge('H', 'T', '')\n", 171 | "dot.edge('H', 'F', '')\n", 172 | "dot.edge('P', 'W', '')\n", 173 | "dot.edge('D', 'P', '')\n", 174 | "dot.edge('P', 'T', '')\n", 175 | "dot.edge('P', 'F', '')\n", 176 | "dot.edge('D', 'B', '')\n", 177 | "dot.edge('W', 'B', '')\n", 178 | "dot.edge('W', 'C', '')\n", 179 | "dot.edge('F', 'C', '')\n", 180 | "\n", 181 | "print(dot.source)\n", 182 | "dot.render('../docs/figures/lifecycle_complex', view=False)" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "name": "python3", 189 | "display_name": "Python 3.8.5 64-bit ('gpg': conda)", 190 | "metadata": { 191 | "interpreter": { 192 | "hash": "ce31f03000ec776b9ef690d2ab56011f6e9daa1694a64fadd5368c8c54192b7d" 193 | } 194 | } 195 | }, 196 | "language_info": { 197 | 
"codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython3", 206 | "version": "3.8.5-final" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 4 211 | } -------------------------------------------------------------------------------- /scripts/Generate CKA matrices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This calculates the final layer representations of 3 pretrained models (and pixels) on 100 images on imagenet." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 33, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Downloading http://ufldl.stanford.edu/housenumbers/train_32x32.mat to ../data/svhn/train_32x32.mat\n" 20 | ] 21 | }, 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "27.1%" 27 | ] 28 | }, 29 | { 30 | "ename": "KeyboardInterrupt", 31 | "evalue": "", 32 | "output_type": "error", 33 | "traceback": [ 34 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 35 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 36 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorchvision\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdataset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorchvision\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSVHN\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/svhn'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 37 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/site-packages/torchvision/datasets/svhn.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, split, transform, target_transform, download)\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_integrity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 38 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/site-packages/torchvision/datasets/svhn.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
117\u001b[0m \u001b[0mmd5\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 118\u001b[0;31m \u001b[0mdownload_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 119\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextra_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 39 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/site-packages/torchvision/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[0;34m(url, root, filename, md5)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Downloading '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0murl\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' to '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m urllib.request.urlretrieve(\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfpath\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0mreporthook\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgen_bar_updater\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 40 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/urllib/request.py\u001b[0m in \u001b[0;36murlretrieve\u001b[0;34m(url, filename, reporthook, data)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 41 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0;31m# Amount is given, implement using readinto\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 458\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 459\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 42 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 500\u001b[0m \u001b[0;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 502\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 503\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 43 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 44 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "import torch\n", 50 | "import torchvision\n", 51 | "\n", 52 | "dataset = torchvision.datasets.SVHN('../data/svhn', download=True)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 29, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import 
torchvision.models as models\n", 62 | "from torchvision import transforms\nimport numpy as np\n", 63 | "\n", 64 | "resnet18 = models.resnet18(pretrained=True)\n", 65 | "resnet34 = models.resnet34(pretrained=True)\n", 66 | "alexnet = models.alexnet(pretrained=True)\n", 67 | "\n", 68 | "models = [resnet18, resnet34, alexnet]\n", 69 | "\n", 70 | "reps = [[], [], [], []]\n", 71 | "n = 0\n", 72 | "\n", 73 | "transform = transforms.Compose([\n", 74 | " transforms.ToTensor(),\n", 75 | " transforms.Normalize(mean=[0.485, 0.456, 0.406],\n", 76 | " std=[0.229, 0.224, 0.225]),\n", 77 | " transforms.Resize([224, 224]),\n", 78 | "])\n", 79 | "\n", 80 | "for n, (img, _) in enumerate(dataset): # the SVHN images loaded above\n", 81 | " \n", 82 | " if n % 10 != 0:\n", 83 | " continue\n", 84 | " \n", 85 | " with torch.no_grad():\n", 86 | " im = transform(img).unsqueeze(0)\n", 87 | " for j, model in enumerate(models):\n", 88 | " reps[j].append(model(im).cpu().detach().numpy())\n", 89 | " \n", 90 | " reps[3].append(np.array(img).reshape((1, -1)))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 30, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import numpy as np\n", 100 | "\n", 101 | "for i, rep in enumerate(reps):\n", 102 | " rep = np.concatenate(rep, axis=0)\n", 103 | " reps[i] = rep" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 31, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "import pickle\n", 113 | "\n", 114 | "with open('../data/matrices.pkl', 'wb') as f:\n", 115 | " pickle.dump({'models': [\n", 116 | " 'resnet18', 'resnet34', 'alexnet', 'pixel'\n", 117 | " ], 'reps': reps}, f)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 37, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "X = np.random.randn(1000, 3)\n", 127 | "X2 = np.concatenate([X, np.random.randn(1000, 10)], axis=1)\n", 128 | "X3 = X[:, :2]\n", 129 | "X4 = X[:, :1]\n", 130 | "X5 = 3.0 * X\n", 131 | "\n", 132 | "import pickle\n", 133 | "\n", 134 | "with open('../data/matrices.pkl', 'wb') as f:\n", 135 | " pickle.dump({'models': [\n", 136 | " 'baseline', 'rescaled', 'nuisance', 'truncated1', 'truncated2'\n", 137 | " ], 'reps': [X, X5, X2, X3, X4]}, f)" 138 | ] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.8.5" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 4 162 | } 163 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="research_code-patrickmineault", 8 | version="0.0.1", 9 | author="Patrick Mineault", 10 | author_email="patrick@gmail.com", 11 | description="A small example package", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/patrickmineault/research_code", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 |
"Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.6', 22 | ) --------------------------------------------------------------------------------