├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── data └── matrices.pkl ├── docs ├── README.md ├── figures │ ├── Design_by_contract.svg │ ├── Design_by_contract.svg.png │ ├── argparse.PNG │ ├── cka_example.png │ ├── dashboard.PNG │ ├── final.doc.gif │ ├── invariance_to_ortho.PNG │ ├── lifecycle_complex │ ├── lifecycle_complex.pdf │ ├── lifecycle_complex.png │ ├── lifecycle_simple │ ├── lifecycle_simple.pdf │ ├── lifecycle_simple.png │ ├── mary-kondo.jpg │ ├── mineault_et_al.png │ ├── nma.png │ ├── pcbi.1007358.g002.PNG_L.png │ ├── readme.PNG │ ├── reproducible_research │ ├── reproducible_research.pdf │ ├── reproducible_research.png │ ├── reversi.PNG │ ├── shablona.png │ ├── spaghetti-code.png │ ├── testing-trophy.png │ ├── tweet.png │ ├── wave_clus.png │ ├── wizard.png │ └── wm-federenko.png ├── notes │ └── how_packages_work.md ├── notion-notes.md └── slides │ ├── 01-intro.md │ ├── 02-decouple.md │ ├── 03-testing.md │ ├── 04-docs.md │ ├── 05-social.md │ ├── 99-standalone-testing.md │ ├── Makefile │ ├── README.md │ ├── pdf │ ├── 01-intro.pdf │ ├── 02-decouple.pdf │ ├── 03-testing.pdf │ ├── 04-docs.pdf │ ├── 05-social.pdf │ └── 99-standalone-testing.pdf │ └── preamble.tex ├── requirements.txt ├── research_code ├── __init__.py ├── cka_not_great.py ├── cka_step2.py ├── cka_step3.py ├── cka_step4.py ├── fib.py ├── fib_and_test.py ├── fib_monolithic.py └── tests │ ├── test_cka_step3.py │ ├── test_cka_step4.py │ └── test_fib.py ├── results ├── closeness.png └── closeness_sns.png ├── scripts ├── Draw dependency graph.ipynb └── Generate CKA matrices.ipynb └── setup.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Run Python Tests 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Install Python 3 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: 3.8 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -r requirements.txt 23 | pip install -e . 24 | - name: Lint with pylint 25 | run: | 26 | # From https://medium.com/swlh/automate-python-testing-with-github-actions-7926b5d8a865 27 | pip install flake8 28 | # stop the build if there are Python syntax errors or undefined names 29 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | - name: Run tests, measure coverage with nose 33 | run: | 34 | pip install nose2 coverage 35 | nose2 --with-coverage --coverage research_code -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # VSCode 132 | .vscode/ 133 | data/cifar 134 | data/svhn -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Patrick Mineault 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Writing good research code 2 | 3 | This repo contains the slides and code for a presentation on writing research software I first gave in January 2021 to the PhD students in neuro at Harvard. It's a compendium of 5 lessons I learned the hard way about writing research code that won't bite back. 4 | 5 | * [The slides are here](https://github.com/patrickmineault/research_code/tree/main/docs/slides) 6 | * The rest of the repo contains supporting code in the format advocated in the first lesson. 7 | * You can see the full presentation recorded at [NMA 2021](https://www.crowdcast.io/e/nma2021/29) and a short version focused on testing recorded at [Brainhack MTL 2021](https://www.youtube.com/watch?v=gfPP2pQ8Rms&feature=youtu.be&ab_channel=OHBMOpenScienceSIG). 8 | 9 | For the book version of these slides, see [goodresearch.dev](https://goodresearch.dev/). 10 | 11 | ## Organization 12 | 13 | This repo follows the organization of [shablona](https://github.com/uwescience/shablona). All the code and tests are under `research_code`. `research_code` is itself a Python package. 14 | 15 | For the package, we use the same setup as this tutorial on [setuptools](https://python-packaging-user-guide.readthedocs.io/tutorials/packaging-projects/), and is compatible with it - this repo is publishable to PyPI directly! 
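If you are curious about what makes the `research_code` folder importable as a package, the key ingredient is the `setup.py` at the repo root. Below is a minimal sketch of such a file; the metadata values are illustrative placeholders, not a copy of this repo's actual `setup.py`.

```{python}
# Minimal sketch of a setup.py that makes the research_code folder installable.
# The name and version below are placeholders; see setup.py in this repo for
# the real values.
from setuptools import find_packages, setup

setup(
    name="research_code",
    version="0.0.1",
    packages=find_packages(),  # picks up research_code/ via its __init__.py
)
```

See `docs/notes/how_packages_work.md` for a longer explanation of how packaging and `pip install -e .` interact with Python's import path.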
16 | 17 | ## To install the package locally in development mode 18 | 19 | `cd` into this directory, then run: 20 | 21 | ``` 22 | pip install -e . 23 | ``` 24 | 25 | In Python: 26 | 27 | ```{python} 28 | import research_code 29 | ``` 30 | 31 | ## To test 32 | 33 | `cd` into the `research_code/tests` directory, then run each file individually, or run `nose2`. 34 | 35 | ## CI 36 | 37 | While shablona recommended the use of Jenkins for continuous integration (CI), we showcase instead Github actions, which don't require additional accounts/software. The workflow, which runs tests, is located in `.github/workflows/ci.yml`. 38 | 39 | -------------------------------------------------------------------------------- /data/matrices.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/data/matrices.pkl -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Docs 2 | 3 | This contains the documentation (including what I used to create slides) for my tutorial on writing good research software good. The good stuff is under the slides directory. -------------------------------------------------------------------------------- /docs/figures/Design_by_contract.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/Design_by_contract.svg.png -------------------------------------------------------------------------------- /docs/figures/argparse.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/argparse.PNG -------------------------------------------------------------------------------- /docs/figures/cka_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/cka_example.png -------------------------------------------------------------------------------- /docs/figures/dashboard.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/dashboard.PNG -------------------------------------------------------------------------------- /docs/figures/final.doc.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/final.doc.gif -------------------------------------------------------------------------------- /docs/figures/invariance_to_ortho.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/invariance_to_ortho.PNG -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex: -------------------------------------------------------------------------------- 1 | digraph { 2 | D [label="Create data"] 3 | T 
[label="Transform data"] 4 | F [label="Fit models"] 5 | H [label="Test Hypotheses"] 6 | P [label="Generate plots"] 7 | W [label="Write and publish paper"] 8 | B [label="Publish data"] 9 | C [label="Publish code"] 10 | D -> T [label=""] 11 | T -> F [label=""] 12 | F -> H [label=""] 13 | H -> D [label=""] 14 | H -> P [label=""] 15 | H -> T [label=""] 16 | H -> F [label=""] 17 | P -> W [label=""] 18 | D -> P [label=""] 19 | P -> T [label=""] 20 | P -> F [label=""] 21 | D -> B [label=""] 22 | W -> B [label=""] 23 | W -> C [label=""] 24 | F -> C [label=""] 25 | } 26 | -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_complex.pdf -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_complex.png -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple: -------------------------------------------------------------------------------- 1 | digraph { 2 | D [label="Create data"] 3 | T [label="Transform data"] 4 | F [label="Fit models"] 5 | H [label="Test Hypotheses"] 6 | P [label="Generate plots"] 7 | W [label="Write and publish paper"] 8 | D -> T [label=""] 9 | T -> F [label=""] 10 | F -> H [label=""] 11 | H -> D [label=""] 12 | H -> P [label=""] 13 | P -> W [label=""] 14 | } 15 | -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_simple.pdf -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/lifecycle_simple.png -------------------------------------------------------------------------------- /docs/figures/mary-kondo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/mary-kondo.jpg -------------------------------------------------------------------------------- /docs/figures/mineault_et_al.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/mineault_et_al.png -------------------------------------------------------------------------------- /docs/figures/nma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/nma.png -------------------------------------------------------------------------------- 
/docs/figures/pcbi.1007358.g002.PNG_L.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/pcbi.1007358.g002.PNG_L.png -------------------------------------------------------------------------------- /docs/figures/readme.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/readme.PNG -------------------------------------------------------------------------------- /docs/figures/reproducible_research: -------------------------------------------------------------------------------- 1 | digraph { 2 | O [label="Open code"] 3 | V [label="Version control"] 4 | C [label="Command line"] 5 | D [label="Open data"] 6 | R [label="Reproducible research"] 7 | E [label=Environments] 8 | L [label="Readable code"] 9 | T [label=Testing] 10 | I [label=CI] 11 | W [label="Code review"] 12 | S [label="Cloud storage"] 13 | U [label="Cloud computing"] 14 | M [label=Documentation] 15 | P [label=Packaging] 16 | V -> O [label=""] 17 | C -> V [label=""] 18 | O -> D [label=""] 19 | O -> R [label=""] 20 | D -> R [label=""] 21 | E -> R [label=""] 22 | C -> E [label=""] 23 | O -> E [label=""] 24 | O -> W [label=""] 25 | O -> L [label=""] 26 | C -> T [label=""] 27 | T -> I [label=""] 28 | O -> I [label=""] 29 | E -> I [label=""] 30 | W -> L [label=""] 31 | E -> U [label=""] 32 | T -> U [label=""] 33 | V -> U [label=""] 34 | S -> U [label=""] 35 | S -> D [label=""] 36 | M -> P [label=""] 37 | T -> P [label=""] 38 | W -> P [label=""] 39 | O -> P [label=""] 40 | I -> P [label=""] 41 | T -> R [label=""] 42 | L -> M [label=""] 43 | } 44 | -------------------------------------------------------------------------------- /docs/figures/reproducible_research.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/reproducible_research.pdf -------------------------------------------------------------------------------- /docs/figures/reproducible_research.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/reproducible_research.png -------------------------------------------------------------------------------- /docs/figures/reversi.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/reversi.PNG -------------------------------------------------------------------------------- /docs/figures/shablona.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/shablona.png -------------------------------------------------------------------------------- /docs/figures/spaghetti-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/spaghetti-code.png 
-------------------------------------------------------------------------------- /docs/figures/testing-trophy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/testing-trophy.png -------------------------------------------------------------------------------- /docs/figures/tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/tweet.png -------------------------------------------------------------------------------- /docs/figures/wave_clus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/wave_clus.png -------------------------------------------------------------------------------- /docs/figures/wizard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/wizard.png -------------------------------------------------------------------------------- /docs/figures/wm-federenko.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/figures/wm-federenko.png -------------------------------------------------------------------------------- /docs/notes/how_packages_work.md: -------------------------------------------------------------------------------- 1 | # How packages actually work 2 | 3 | Pip and packages are wonderful, but they can obscure what's going on behind the 4 | scenes. How do packages actually work? 5 | 6 | ## Packages and modules 7 | 8 | Let's say we have a directory `mylib` with one module inside (a module is a normal Python file with functions): 9 | 10 | ``` 11 | mylib/ 12 | |- code.py 13 | ``` 14 | 15 | Inside `code.py`, there's a function: 16 | 17 | ```{.python} 18 | def the_fun(): 19 | print("Hello world") 20 | ``` 21 | 22 | Now let's assume the base directory `.` is on Python's search path. This creates the implicit package `mylib`. Hence, you can import code like so: 23 | 24 | ```{.python} 25 | from mylib.code import the_fun 26 | ``` 27 | 28 | ## Python's search path 29 | 30 | But wait, where does Python search for code? Several places: 31 | 32 | * The current directory `os.getcwd()` 33 | * Directories listed in `sys.path` 34 | * Directories listed in the environment variable `PYTHONPATH` 35 | 36 | When you install a package listed in PyPI with `pip`, it puts a copy of that folder somewhere that's on the path. For example, when I use the conda environment 37 | `py3`, it puts new packages in: 38 | 39 | `/home/pmin/anaconda3/envs/py3/lib/python3.8/site-packages` 40 | 41 | This location is listed in `sys.path`. Hence, if I `pip install seaborn`, I will find a copy of seaborn inside that directory: 42 | 43 | ```{.shell} 44 | (py3) pmin@desktop:~/anaconda3/envs/py3/lib/python3.8/site-packages/seaborn$ ls -al 45 | total 780 46 | drwxr-xr-x 6 pmin pmin 4096 Dec 28 16:21 . 47 | drwxr-xr-x 438 pmin pmin 20480 Jan 8 14:00 .. 
48 | -rw-r--r-- 1 pmin pmin 744 Dec 28 16:21 __init__.py 49 | drwxr-xr-x 2 pmin pmin 4096 Dec 28 16:21 __pycache__ 50 | -rw-r--r-- 1 pmin pmin 52671 Dec 28 16:21 _core.py 51 | -rw-r--r-- 1 pmin pmin 2126 Dec 28 16:21 _decorators.py 52 | -rw-r--r-- 1 pmin pmin 5861 Dec 28 16:21 _docstrings.py 53 | -rw-r--r-- 1 pmin pmin 14699 Dec 28 16:21 _statistics.py 54 | -rw-r--r-- 1 pmin pmin 2139 Dec 28 16:21 _testing.py 55 | -rw-r--r-- 1 pmin pmin 4483 Dec 28 16:21 algorithms.py 56 | ... 57 | ``` 58 | 59 | ## What if I want to use my homebrew library somewhere else? 60 | 61 | Let's say your `mylib` code is in `/path/to/mylib/code.py`. You want to import it from `/home/me/projecto/script.py`. You need to figure out a way to place it on Python's search path. Before we discuss the ideal solution, let's make sure we understand what's going on by discussing other partial solutions. 62 | 63 | ### (bad) copy `code.py` to `/home/me/projecto/mylib/code.py` 64 | 65 | This works, but then you have two copies of your code, and it can rapidly become a maintenance nightmare. 66 | 67 | ### (not great) create a symlink 68 | 69 | A better idea is to create a symlink from `/home/me/projecto/mylib -> /path/to/mylib`. You can do 70 | 71 | `ln -s /path/to/mylib /home/me/projecto/mylib` 72 | 73 | This works, but it can be a pain to manage if you use multiple computers or you're sharing your code with somebody else. 74 | 75 | ### (not great) Change `sys.path` during execution 76 | 77 | Add your library code to `sys.path` temporarily. At the top of your script, use: 78 | 79 | ```{.python} 80 | import sys 81 | # Assuming the code is in /my/great/library/code.py 82 | sys.path.append('/my/great/') 83 | 84 | from library import code 85 | ``` 86 | 87 | Note that once you exit the script, `sys.path` will return to its original value. 88 | 89 | This works, but it will mess up code completion and linting in your favorite editor, because the dependency is injected at runtime. 90 | 91 | ## Create a package and install it in development mode 92 | 93 | We can create a package and install it in development mode (`pip install -e`). For that, we need a few files: 94 | 95 | ``` 96 | setup.py 97 | mylib/ 98 | |- __init__.py 99 | |- code.py 100 | ``` 101 | 102 | * `setup.py` contains minimal code to set up a package. This will suffice: 103 | 104 | ```{.python} 105 | from setuptools import setup, find_packages 106 | 107 | setup( 108 | name='minipkg', 109 | version='0.0.1', 110 | author='An Awesome Coder', 111 | author_email='patty.mcgoo@example.com', 112 | packages=find_packages(), 113 | scripts=[], 114 | url='https://github.com/patrickmineault/minimal-package', 115 | license='LICENSE.txt', 116 | description='An awesome package that does nothing', 117 | long_description=open('README.md').read(), 118 | install_requires=[ 119 | ], 120 | ) 121 | ``` 122 | 123 | * Finally, the existence of an empty `__init__.py` tells setuptools that there's a package in that directory. 124 | 125 | When you `pip install -e .`, it will add the current directory `.` to sys.path. Hence, you can now import the package from anywhere. The name of the folder (in this case `mylib`) determines the name of the package. 126 | 127 | ## Removing one level from imports 128 | 129 | It may feel a bit unwieldy to have to write `from mylib.code import the_fun`. 
We can shorten that to `from mylib import the_fun` by changing the contents of `__init__.py` to: 130 | 131 | ```{.python} 132 | from .code import * 133 | ``` 134 | 135 | This will lift the symbols inside of code, including `the_fun`, to a package-level symbol. This is a [common pattern in Python packages](https://github.com/mwaskom/seaborn/blob/master/seaborn/__init__.py). 136 | 137 | ## `src` and all that 138 | 139 | Some authors prefer putting the package code two levels down, i.e. inside `src/mylib`. This prevents polluting the package namespace with unnecessary symbols; see [this blog post for an explanation](https://blog.ionelmc.ro/2014/05/25/python-packaging/). 140 | 141 | # Further reading 142 | 143 | * http://andrewsforge.com/article/python-new-package-landscape/ 144 | * https://blog.ionelmc.ro/2014/05/25/python-packaging/ -------------------------------------------------------------------------------- /docs/notion-notes.md: -------------------------------------------------------------------------------- 1 | # Writing good research code good 2 | 3 | TODO: Make a Jupyterbook out of this 4 | 5 | - Who am I? 6 | - Open source experiences 7 | - No formal CS training 8 | - Google SWE and Facebook research scientist 9 | - Organizer of NMA 10 | - Occasionally taught CS 11 | - Writing research code 12 | 13 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled.png) 14 | 15 | - Who is this for? 16 | - Not CS students 17 | - People who picked up programming more or less by accident 18 | - Inspirations 19 | - Zen of Python 20 | - data scientist as scientist 21 | - move fast and break things 22 | - 12 factors 23 | - full stack deep learning 24 | - anti-patterns 25 | - Why is research code hard to write 26 | - Endpoint is unclear 27 | - "correct" can be hard to define 28 | - Lots of exploration and dead ends 29 | - Sometimes, there are manual steps involving human judgement 30 | - Many people that do code for research are not trained in CS or programming 31 | - Low judgement zone 32 | - It's ok to write garbage code when you're in a rush 33 | - It's not ok to keep building more and more on top of garbage code 34 | - sure, there's the moral imperative to create replicable code to bring forward the shining light of science and truth... 35 | - ...but also, do you want to have to scrap 6 months of research because your awful code does something silly? 
36 | - "[https://en.wikipedia.org/wiki/Growth_in_a_Time_of_Debt](https://en.wikipedia.org/wiki/Growth_in_a_Time_of_Debt)" - Reinhart–Rogoff 37 | 38 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%201.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%201.png) 39 | 40 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%202.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%202.png) 41 | 42 | [https://www.bbc.com/news/magazine-22213219](https://www.bbc.com/news/magazine-22213219) 43 | 44 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%203.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%203.png) 45 | 46 | - I've wasted months of my life cursing my own bad code, don't be like me 47 | - When I say "you should" or "you shouldn't", those are just recommendations, maybe you know better, or maybe you have a deadline to hit 48 | - Clean things up later! 49 | 50 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%204.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%204.png) 51 | 52 | ## Grand scheme of things 53 | 54 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%205.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%205.png) 55 | 56 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%206.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%206.png) 57 | 58 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%207.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%207.png) 59 | 60 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%208.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%208.png) 61 | 62 | - Lifecycle of research code 63 | - Create data 64 | - Psychophysics, EEG, ECoG, fMRI, ephys, calcium imaging, human labeling, simulations, etc. 65 | - Towards the goal of testing a hypothesis or something 66 | - Ingest the data 67 | - Apply transformations to the data 68 | - Fit models to the data 69 | - Test hypotheses 70 | - Generate plots 71 | - Write the paper 72 | - (receive the reviews and rewrite the paper) 73 | - (pass down the code to the next grad student down the pipe) 74 | 75 | 0. Principles [Ev Federenko] 76 | 77 | Ivanova et al. 
[2020] 78 | 79 | CP: code programming 80 | 81 | SP: sentence programming 82 | 83 | SR: sentence reading 84 | 85 | NR: non-word reading 86 | 87 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%209.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%209.png) 88 | 89 | - You have to write for your future self in mind 90 | - Future you will have forgotten 90% of what you wrote 91 | - You have to be twice as smart to debug code as to write it 92 | - Never write code that's as a smart as you can make it 93 | - You need to conserve your working memory 94 | - Reduce the cognitive load of understanding your code 95 | - mental model for understanding and debugging code: 96 | - cognitive task in which you have to juggle lots of things in your WM 97 | - when there's too many pieces of information you have to keep in working memory, you start to lose track of other important pieces of information 98 | - you then have to refer to non-WM (i.e. stackoverflow, your codebase) to get back on track 99 | - eventually your productivity trends towards zero 100 | - e.g. [https://imgur.com/gallery/UNhWQiV](https://imgur.com/gallery/UNhWQiV) 101 | - simple is better than complex, but complex is better than complicated 102 | - cyclomatic complexity (number of linearly independent paths in a program) 103 | - Don't optimize code 104 | 105 | 1. Organize different code projects according to a convention 106 | 107 | I was given the swim test: everything at Google is one giant monorepo with billions of lines of code ([https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE](https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE)). Everything is organized according to strict (sometimes downright pedantic) conventions, such that "it's not that bad" to jump. [https://github.com/google/styleguide/blob/gh-pages/pyguide.md](https://github.com/google/styleguide/blob/gh-pages/pyguide.md). Also, to become a reviewer, you need to obtain readability ([https://www.pullrequest.com/blog/google-code-review-readability-certification/](https://www.pullrequest.com/blog/google-code-review-readability-certification/)), which is a kind of ritualized hazing in which one is taught the ways of Google-y-ness. To this day, I manually alphabetically sort my imports. 108 | 109 | Suggested by Turing Way: 110 | 111 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2010.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2010.png) 112 | 113 | Suggested by Research Software Engineering with Python (originally from Noble 2009): 114 | 115 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2011.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2011.png) 116 | 117 | [https://github.com/uwescience/shablona](https://github.com/uwescience/shablona) 118 | 119 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2012.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2012.png) 120 | 121 | - Convention over configuration 122 | - E.g. 
React project: 123 | - lots of code, always in the same folders 124 | - One project → one git repo 125 | - If you don't use git yet, start doing it now 126 | - Take a weekend to learn it 127 | - Analysis → Start with a Capital Letter.ipynb 128 | - Reusable functions and packages, etc. → lower letter with underscores .py 129 | - Tests under tests folder 130 | - Extends beyond just project organization 131 | - Consistency of style (PEP8) 132 | - flake8 vs. pylint (vscode) 133 | - Consistency of documentation style (Google vs. numpy) 134 | - Preference for Google 135 | - Checks off future you and WM 136 | - Exercise: let's create a project with the right structure 137 | 138 | 2. Avoid the great mush 139 | 140 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2013.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2013.png) 141 | 142 | - Maybe your code is written in a way where you're doing a little bit of everything all at once 143 | 144 | e.g `wave_clus` 145 | 146 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2014.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2014.png) 147 | 148 | This is a callback for a function in a GUI for spike sorting. 149 | 150 | - Does many things at once 151 | - Manipulates the GUI 152 | - Modifies data 153 | - Reads a jpg file? 154 | - Uses magic numbers and magic columns 155 | - Various string formatting and exec 156 | - Big function 157 | - Not complex, but it's complicated 158 | - That's bad because your code becomes really hard to reason about 159 | - Tightly coupled 160 | - Are the results weird because: 161 | - the data is bad 162 | - you're loading the data wrong 163 | - your model is incorrectly implemented 164 | - your model is inappropriate for the data 165 | - you statistical tests are inappropriate for the data distribution 166 | - Are your results good because... 167 | - Keep each of the boxes separate with minimal interface 168 | - Separation of concerns: 169 | - example: your data loading function should just load data 170 | - Your computation functions shouldn't load data, they should just compute 171 | - Make each of the boxes small 172 | - don't make giant monolithic functions 173 | - Make functions which are small 174 | - a screen's worth 175 | - 80 columns, 50 lines 176 | - Avoid side effects, prefer pure functions 177 | - What's a side effect? 178 | - In computer science, an operation, function or expression is said to have a side effect if it modifies some state variable value(s) outside its local environment, that is to say has an observable effect besides returning a value (the main effect) to the invoker of the operation. State data updated "outside" of the operation may be maintained "inside" a stateful object or a wider stateful system within which the operation is performed. Example side effects include modifying a non-local variable, modifying a static local variable, modifying a mutable argument passed by reference, performing I/O or calling other side-effect functions. 179 | - Example: fib 180 | - Learn more about your language 181 | - Sometimes (but not always!), code smells come from lack of knowledge 182 | - E.g. 
using magic column numbers in a raw numpy array rather than named columns in pandas because you don't know pandas 183 | - Using unnamed dimensions in numpy rather than xarray 184 | - Using + and bespoke code rather than the one true solution, the f-string 185 | - E.g. implementing CKA 186 | - Checks off WM 187 | 188 | 3. Build around testing 189 | 190 | Ariel Rokem 191 | 192 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2015.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2015.png) 193 | 194 | - Oftentimes we write code to convince ourselves that our other code works 195 | - E.g. I write a spiffy function that fits a GLM with Tikhonov regularization 196 | - I make up some test data 197 | - I run my model 198 | - It gives me the correct outputs 199 | - At the end, I either delete that code (if it's a tiny amount of tests) or I let it rot in a notebook somewhere. 200 | - Don't do that! 201 | - 70% of bugs will be old bugs that keep reappearing 202 | - Formalize how you write your code through tests 203 | - Unit tests 204 | - micro tests 205 | - inline `assert` 206 | - unit tests 207 | - unittest 208 | - pytest 209 | - nose 210 | - Integration tests 211 | - "big tests" 212 | - How to use a test runner 213 | - Run them periodically 214 | - Lesson: tested code is low-stress code 215 | - Just learned recently: 216 | - You can even test figures! pytest-mpl 217 | - Practical assignment: 218 | - 10 next times you comment out a print statement: transform it into an assert 219 | - Checks off WM 220 | 221 | 4. Make notes to future yourself 222 | 223 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2016.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2016.png) 224 | 225 | - Documentation 226 | - docstrings 227 | - Controversial opinion: docstrings are overrated. Tests often form better documentation. 228 | - README.md 229 | - tests 230 | - Keep a lab book 231 | - Notion 232 | - Checks off future self 233 | 234 | 5. Work with better people than you 235 | 236 | - Maybe you're the best coder in your lab so you don't have opportunities for growth 237 | - Contribute to open source projects 238 | - NMA & NMC are always happy to have more people! 239 | - Join a community or hackerspace 240 | - Maybe you're starting out 241 | - Pair programming! 242 | - Actively seek resources 243 | - Julia Evans @b0rk 244 | - Two anecdotes 245 | - Ctrl+R 246 | - Michael Waskom's CI for NMA 247 | - Checks off future self 248 | 249 | 6. Use good tools 250 | 251 | - You won't become proficient without actively seeking for it 252 | - E.g. 
navigational queries on Google 253 | - Take off days where you learn tool X 254 | 255 | # Examples list 256 | 257 | precision = 1000 258 | 259 | x0 = (self.wx[mask].reshape(-1, 1, 1) + 260 | 261 | torch.randn(len(mask), precision, 1) * self.wsigmax[mask].reshape(-1, 1, 1)) 262 | 263 | y0 = (self.wy[mask].reshape(-1, 1, 1) + 264 | 265 | torch.randn(len(mask), precision, 1) * self.wsigmax[mask].reshape(-1, 1, 1)) 266 | 267 | ## Advanced topics 268 | 269 | - Configuration and .env 270 | - Environments 271 | - Dockerfiles 272 | - CI 273 | - Packaging 274 | - Cloud stuff 275 | - Reproducibility 276 | 277 | # Resources 278 | 279 | - Data science in practice paper: [https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725](https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725) 280 | - Making packages and testing [https://education.molssi.org/python-package-best-practices/index.html](https://education.molssi.org/python-package-best-practices/index.html) 281 | - Carpentries testing Python: [http://carpentries-incubator.github.io/python-testing/](http://carpentries-incubator.github.io/python-testing/) 282 | - Software engineering for research: [https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title](https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title) 283 | - Computer code and the brain: [https://twitter.com/neuranna/status/1251589731932135425](https://twitter.com/neuranna/status/1251589731932135425) 284 | - Software engineering best practices: [http://www.bris.ac.uk/acrc/acrc-training/](http://www.bris.ac.uk/acrc/acrc-training/) 285 | - The Turing Way: [https://the-turing-way.netlify.app/reproducible-research/code-quality.html](https://the-turing-way.netlify.app/reproducible-research/code-quality.html) 286 | - Software engineering for data scientists: [http://uwseds.github.io/](http://uwseds.github.io/) 287 | - Test and code for scientists (podcast): [https://testandcode.com/140](https://testandcode.com/140) 288 | - Research software engineering: [https://merely-useful.github.io/py-rse/](https://merely-useful.github.io/py-rse/) 289 | - Shablona template: [https://github.com/uwescience/shablona](https://github.com/uwescience/shablona) 290 | 291 | ![Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2017.png](Writing%20good%20research%20code%20good%20981f87fd9a8d4b6b9499195e98e55b08/Untitled%2017.png) -------------------------------------------------------------------------------- /docs/slides/01-intro.md: -------------------------------------------------------------------------------- 1 | % Intro 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # 6 | 7 | Intro 8 | 9 | # Who is this lecture for? 10 | 11 | ![](../figures/tweet.png) 12 | 13 | # Who is this lecture for? 14 | 15 | * Most people who do coding-heavy research are not trained in CS or software engineering 16 | * You're probably in this bucket 17 | * Bad consequences: 18 | * You feel like you don't know what you're doing 19 | * Imposter syndrome 20 | * Low productivity 21 | * Bugs 22 | * You hate your code and you don't want to work on it 23 | * You never graduate 24 | * You have great sadness in your heart 25 | * It doesn't have to be all bad! 
26 | 27 | # My weird perspective 28 | 29 | * Patrick Mineault, PhD in neuroscience 30 | * (wildly underqualified) software engineer at Google 31 | * [Research scientist at Facebook Reality Labs on brain-computer interfaces](https://tech.fb.com/imagining-a-new-interface-hands-free-communication-without-saying-a-word/) 32 | * [Helped build NMA as first year CTO](https://xcorr.net/2021/03/25/building-neuromatch-academy/) 33 | * [Independent researcher](https://xcorr.net/) and technologist 34 | * Occasionally taught CS 35 | 36 | # Regrets, I've had a few 37 | 38 | * Mostly self-taught in programming 39 | * Didn't study CS until very late 40 | * Wasted months working with bad code of my own making 41 | * Not a great coder, but better than in grad school 42 | * I think you might be curious 43 | 44 | # Organization 45 | 46 | * Assume that you know a little bit about [Python](https://swcarpentry.github.io/python-novice-inflammation/), [git](http://swcarpentry.github.io/git-novice/) and the [command line](http://swcarpentry.github.io/shell-novice/) 47 | * You can catch up on these topics via Software Carpentries 48 | * If you don't, that's ok! This is vertically integrated advice. Get inspired, follow more detailed tutorials after, and come back to this. 49 | * 5 practical tips to better code 50 | * Concrete examples 51 | * 5-minute action items 52 | * Everybody leaves having learned an actionable thing 53 | * Interrupt me and chat! 54 | * But first, I will indulge in theory... 55 | 56 | # Open question 57 | 58 | Q: What does coding look like in the brain? 59 | 60 | # Coding is very working-memory intensive 61 | 62 | ![Code and working memory in the brain, Ivanova et al. (2020)](../figures/wm-federenko.png) 63 | 64 | # Coding is very working-memory intensive 65 | 66 | * MD: Multiple-demand system 67 | * CP: code programming 68 | * SP: sentence programming 69 | * SR: sentence reading 70 | * NR: non-word reading 71 | 72 | # Consequence 73 | 74 | You will get [overloaded](https://imgur.com/gallery/UNhWQiV). 75 | 76 | # Principle 1: conserve your WM 77 | 78 | - Reduce the cognitive load of understanding your code 79 | - [Simple is better than complex. Complex is better than complicated.](https://zen-of-python.info/simple-is-better-than-complex.html#3) 80 | 81 | # Research code is very LTM-intensive 82 | 83 | ![Theory](../figures/lifecycle_simple.png){height=250px} 84 | 85 | # Research code is very LTM-intensive 86 | 87 | ![Practice](../figures/lifecycle_complex.png){height=250px} 88 | 89 | # Research code 90 | 91 | - Endpoint is unclear 92 | - Correct can be hard to define 93 | - Lots of exploration and dead ends 94 | - Sometimes, there are manual steps involving human judgement 95 | - You have to remember all the dead ends for the code to even make sense 96 | 97 | # Principle 2: write for your future self in mind 98 | 99 | - Future you will have forgotten 90% of what you wrote 100 | - Kernighan's Law - Debugging is twice as hard as writing the code in the first place. Therefore, if you write the code as cleverly as possible, you are, by definition, not smart enough to debug it. 101 | 102 | # Thesis 103 | 104 | Writing good research code boils down to saving your memory - both working and long-term. 
105 | 106 | --- 107 | 108 | # 109 | 110 | Practical Lessons 111 | 112 | # Disclaimer: this is a low-judgement zone 113 | 114 | - It's ok to write garbage code when you're in a rush 115 | - It's not ok to keep building more and more on top of garbage code 116 | - sure, there's the moral imperative to create replicable code to bring forward the shining light of science and truth... 117 | - [and yes, people have messed up the world real bad](https://www.nytimes.com/2013/04/19/opinion/krugman-the-excel-depression.html) by doing things fast and loose 118 | - but also, do you want to scrap 6 months of research because you forgot to transpose a matrix? 119 | - you will get bitten back 120 | - Guidelines not rules 121 | 122 | # Lesson 1: keep things tidy 123 | 124 | ![](../figures/mary-kondo.jpg) 125 | 126 | # What needs to be tidy 127 | 128 | * Project folder structure 129 | * Code style 130 | * Notebooks 131 | * Scripts 132 | * Prereq: Git & Github: if you're going to keep things clean, you will mess up and need a time machine. 133 | 134 | # Project folder structure 135 | 136 | * Consensus: one repo = one project $\approx$ one paper 137 | * Lots of templates around: 138 | * [Turing Way](https://the-turing-way.netlify.app/reproducible-research/compendia.html#executable-compendium) 139 | * [Research Software Engineering with Python](https://merely-useful.github.io/py-rse/getting-started.html#getting-started-structure) 140 | * [Data science cookiecutter](https://drivendata.github.io/cookiecutter-data-science/) 141 | * [Shablona](https://github.com/uwescience/shablona) 142 | 143 | # Shablona 144 | 145 | ![Shablona](../figures/shablona.png){height=220px} 146 | 147 | # Shablona 148 | 149 | * Lightweight, good starter template 150 | * Keeps docs, data, scripts and code tidy and in their own little box 151 | * You can `import shablona` to access the code in the packages 152 | * [Use as a template to start a new project via big green button](https://github.com/uwescience/shablona) 153 | * Or build it from scratch to understand the moving pieces 154 | * **Important**: Is compatible with Python packaging. That means you can install locally with `pip install -e .`, and the code inside the special folder (placeholder: `shablona`) becomes a package `shablona` 155 | 156 | # 157 | 158 | Live demo 159 | 160 | # Packages, how do they work? 161 | 162 | Whatever template you use, make sure it makes a local package for your code that you can `pip install`. That will make it easier to re-use your code in other places. 163 | 164 | [If you're curious, I wrote a long-form note on how packages really work](../notes/how_packages_work.md). 165 | 166 | # Other conventions 167 | 168 | - Notebooks → `Start with a Capital Letter.ipynb` 169 | - Reusable functions and packages, etc. → `snake_case.py` 170 | - Tests under `tests` folder 171 | 172 | # Organizing scripts 173 | 174 | ![From [Van Vliet (2020)](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007358)](../figures/pcbi.1007358.g002.PNG_L.png){height=220px} 175 | 176 | # Organizing scripts 177 | 178 | * Use filenames that indicate hierarchy, e.g. `00_fetch_data.py` 179 | * One issue: you can't `import` these scripts because you can't start a module name with a digit. 180 | * Start with an underscore, `_00_fetch_data.py`, or with a prefix, `step_00_fetch_data.py`, those are valid module names 181 | * Figure code separate from processing steps code, e.g. 
`figure_csd.py` 182 | * Use a master script to bind everything together 183 | * Plain Python 184 | * Bash files 185 | * Build tools: `doit`, `make` 186 | * Specialized tools like `nipype` 187 | 188 | # Code style 189 | 190 | * Use a consistent style 191 | * [Python has a style guide - PEP8](https://www.google.com/search?channel=crow2&client=firefox-b-d&q=pep8). 192 | * Indentation 193 | * Line length 194 | * Spaces 195 | * Variable names 196 | * imports 197 | * [Orgs like Google have their even more pedantic style guides](https://google.github.io/styleguide/pyguide.html). 198 | * There are linters and auto-formatters which will catch style issues 199 | * flake8 200 | * pylint 201 | * black 202 | * Install them in VSCode 203 | 204 | # Docstrings 205 | 206 | [Numpy style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html) or [Google style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). 207 | 208 | ```{.python} 209 | def my_doubler(x): 210 | """Doubles x. 211 | 212 | Args: 213 | x: the number to double 214 | 215 | Returns: 216 | Twice x 217 | """ 218 | return x * 2 219 | ``` 220 | 221 | # IPython notebooks 222 | 223 | > If you use notebooks to develop software, you are probably using the wrong tool. -- [Yihui Xie](https://yihui.org/en/2018/09/notebook-war/) 224 | 225 | * Notebooks are hard to keep tidy because of nonlinear execution 226 | * Restart and Run All is your friend 227 | * If your notebook doesn't run top to bottom - it's not reproducible 228 | * It's ok to write plotting code in a notebook, but don't write real functions. 229 | * Import the code from your installable package (see `shablona` above) 230 | * You can auto-reload your package code when it changes, makes development easier. In a cell: 231 | 232 | ```{.python} 233 | %load_ext autoreload 234 | %autoreload 2 235 | ``` 236 | 237 | # Why does this matter? 238 | 239 | You don't have to constantly ask yourself where stuff is, how you should do thing X, etc. and that allows you to focus on the stuff that matters. 240 | 241 | # Aside: day 3 242 | 243 | Everything at Google is one giant monorepo with [billions of lines of code](https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE). By ~day 3, it was time to go do a code. Everything is organized according to strict [conventions](https://github.com/google/styleguide/blob/gh-pages/pyguide.md), so it's not *that bad* to jump in. 244 | 245 | # Lesson 1 246 | 247 | * Keep things tidy 248 | * Free your W<M from having to remember where stuff is 249 | * Your 5-minute exercise: use the `shablona` template for a project 250 | 251 | --- -------------------------------------------------------------------------------- /docs/slides/02-decouple.md: -------------------------------------------------------------------------------- 1 | % Decoupled code 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Lesson 2 6 | 7 | Keep things decoupled 8 | 9 | # Spaghetti code 10 | 11 | ![e.g [^1]](../figures/spaghetti-code.png) 12 | 13 | [^1] Brown et al. AntiPatterns, 1998 14 | 15 | # Do you know when your code smells? 16 | 17 | - Maybe your code is written in a way where you're doing a little bit of everything all at once 18 | - e.g. `wave_clus` 19 | - very useful software to sort spikes 20 | - has a GUI in Matlab GUIDE 21 | - GUIDE makes it exceptionally hard to write good code 22 | - Picked it because it's real code 23 | - This stuff can happen in Matlab or in Python! 
24 | 25 | # Sample code 26 | 27 | [Link](https://github.com/csn-le/wave_clus/blob/master/wave_clus.m#L964). 28 | 29 | # What's going here? 30 | 31 | This is a callback for a function in a GUI for spike sorting. 32 | 33 | - Does many things at once 34 | - Manipulates the GUI 35 | - Modifies data 36 | - Reads a jpg file? 37 | - Uses magic numbers and magic columns 38 | - Uses various string formatting functions and `eval` 39 | - Big function 40 | - Not complex, but it's complicated 41 | 42 | # Tightly coupled 43 | 44 | - When code does a lot of unrelated things at once, it becomes very hard to reason about. 45 | - Let's say your results are weird, are they weird because... 46 | - the data is bad? 47 | - you're loading the data wrong? 48 | - your model is incorrectly implemented? 49 | - your model is inappropriate for the data? 50 | - you statistical tests are inappropriate for the data distribution? 51 | 52 | # Uncouple and simplify 53 | 54 | - Keep each of the boxes separate with minimal interface 55 | - Separation of concerns: 56 | - Example: your data loading function should just load data 57 | - Your computation functions shouldn't load data, they should just compute 58 | - Make each of the boxes small 59 | - Don't make giant monolithic functions 60 | - Make functions which are small 61 | - A screen's worth, 80 columns, 50 lines 62 | - Avoid side effects, prefer pure functions 63 | 64 | # What's a side effect? 65 | 66 | > In computer science, an operation, function or expression is said to have a side effect if it modifies some state variable value(s) outside its local environment, that is to say has an observable effect besides returning a value (the main effect) to the invoker of the operation. State data updated "outside" of the operation may be maintained "inside" a stateful object or a wider stateful system within which the operation is performed. Example side effects include modifying a non-local variable, modifying a static local variable, modifying a mutable argument passed by reference, performing I/O or calling other side-effect functions. (Wikipedia) 67 | 68 | # Side effects 69 | 70 | ![From Wikipedia](../figures/Design_by_contract.svg.png){height=220px} 71 | 72 | # A function with side effects 73 | 74 | Q: what will be printed? 75 | 76 | ```{.python} 77 | def reversi(arr): 78 | """Reverses a list.""" 79 | for i in range(len(arr) // 2): 80 | arr[-i - 1], arr[i] = arr[i], arr[-i - 1] 81 | return arr 82 | 83 | >>> a = [0, 1, 2] 84 | >>> b = reversi(a) 85 | >>> print(b) 86 | >>> print(a) 87 | ``` 88 | 89 | # A function which changes its arguments 90 | 91 | ![This function mutates its arguments](../figures/reversi.PNG) 92 | 93 | # Side effects 94 | 95 | * Modifying arguments 96 | * Printing 97 | * Making API calls 98 | * Changing globals 99 | 100 | # Side effects are not the best 101 | 102 | * Stuff happens outside of the normal flow from arguments → return value 103 | * Need to know state of function to understand it 104 | * Hard to test 105 | * Let's box them 106 | * You can use closures or classes to encapsulate state 107 | 108 | # Demo 109 | 110 | * `fib.py` 111 | * Fibonacci sequence, $F(n) = F(n-1) + F(n-2)$ 112 | * Memoization 113 | 114 | # Learn more about your language 115 | 116 | - Sometimes (but not always!), code smells come from lack of knowledge 117 | - E.g. 
using magic column numbers in a raw numpy array rather than named columns in pandas because you don't know pandas 118 | - Using unnamed dimensions in numpy rather than xarray 119 | - Using + and bespoke casting for string formatting rather than the one true solution, the f-string 120 | - Take time to learn more about the language you use 121 | - Coming from Matlab? I have three tutorials: [[1]](https://xcorr.net/2020/02/21/transitioning-away-from-matlab/), [[2]](https://xcorr.net/2020/02/29/orienting-yourself-through-python/), [[3]](https://xcorr.net/2020/03/04/rewriting-matlab-code-in-python/) 122 | 123 | # Enough theory! 124 | 125 | Let's de-couple CKA! 126 | 127 | # Background on centered kernel alignment 128 | 129 | Q: how can we compare how different brain areas and artificial neural networks represent the world? 130 | 131 | A: Choose a standard battery of stimuli, measure responses across systems, compare the responses between the systems. Many approaches, including: 132 | 133 | * forward encoding models (e.g. ridge regression) 134 | * canonical correlation analysis (CCA) 135 | * representational similarity analysis (RSA). 136 | 137 | # CKA 138 | 139 | [Kornblith et al. (2019)](https://arxiv.org/abs/1905.00414) propose a new method to compare representations. You can think of it as a generalization of the (square of the) Pearson correlation coefficient, but with matrices instead of vectors. 140 | 141 | ![Alignment between layers of two neural nets initialized with different seeds](../figures/cka_example.png){height=100px} 142 | 143 | Importantly, CKA is not implemented in scipy or sklearn, github gives very few hits ^[1]... it's real research code! 144 | 145 | [1] [There is an implementation in a notebook from authors](https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb) 146 | 147 | # Centered kernel alignment 148 | 149 | * We collect the responses of each system to our battery of $n$ stimuli into matrices $\mathbf{X}, \mathbf{Y}$. 150 | * $\mathbf{X}, \mathbf{Y}$ have shape $n x k$, $n x l$, and $k$ and $l$ are not necessarily the same. 151 | * Center $\mathbf{X}, \mathbf{Y}$ so each column has 0 mean, then: 152 | 153 | $$CKA(\mathbf X, \mathbf Y) = \frac{||\mathbf X^T \mathbf Y||_2^2}{||\mathbf X^T \mathbf X||_2 ||\mathbf Y^T \mathbf Y||_2}$$ 154 | 155 | * Min 0, max 1 156 | * Check: if $\mathbf{X}$ and $\mathbf{Y}$ are one-dimensional, then $CKA = \rho( \mathbf X, \mathbf Y)^2$. 157 | 158 | 159 | # Open discussion 160 | 161 | Q: What's not ideal about this code? `research_code.cka_not_great.py` 162 | 163 | # Pain points 164 | 165 | * IO, computation and plotting are all in one big blob 166 | * Solution: isolate the computation in its own function independent of IO 167 | * Put the controller in the `main` function, hide behind `__name__ == "__main__"` 168 | * Avoids module variables in Python 169 | * Makes the code importable 170 | 171 | # Live coding! 172 | 173 | (the result is `cka_step2.py`) 174 | 175 | 176 | # You can apply this advice at a project-wide level as well 177 | 178 | Advice from [van Vliet (2020)](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007358): 179 | 180 | 1. **Each analysis step is one script** 181 | 2. **A script either processes a single experimental replicate, or aggregates across replicates, never both.** 182 | 3. One master script to run the entire analysis 183 | 4. **Save all intermediate results** 184 | 5. Visualize all intermediate results 185 | 6. 
**Each parameter and filename is defined only once** 186 | 7. Distinguish files that are a part of the official pipeline 187 | 188 | 189 | # Decoupling configuration 190 | 191 | * Keep your configuration out of your code 192 | * Use `argparse` to specify options via the command line 193 | * Keep configuration options located in an importable `config.py` file 194 | * Use `python-dotenv` to store secrets in a `.env` file 195 | 196 | 197 | # Lesson 2 198 | 199 | * Keep things decoupled 200 | * By keeping things decoupled, you can think about one part of your program at a time 201 | * Save your WM slots 202 | * Your 5-minute exercise: take existing code and wrap it in `main` -------------------------------------------------------------------------------- /docs/slides/03-testing.md: -------------------------------------------------------------------------------- 1 | % Testing 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Building around testing 6 | 7 | > Most scientists who write software constantly test their code. That is, if you are a scientist writing software, I am sure that you have tried to see how well your code works by running every new function you write, examining the inputs and the outputs of the function, to see if the code runs properly (without error), and to see whether the results make sense. Automated code testing takes this informal practice, makes it formal, and automates it, so that you can make sure that your code does what it is supposed to do, even as you go about making changes around it. --Ariel Rokem, Shablona README 8 | 9 | # Open discussion 10 | 11 | * Let's test `fib.py` 12 | * What can we test? 13 | 14 | # What can we test about `fib`? 15 | 16 | * Correctness, e.g. $F(4) = 5$ 17 | * Edge cases, e.g. $F(0) = 1$, $F(-1)$ → *error* 18 | * Functional goals are achieved, e.g. caching works 19 | * It's much easier to test decoupled code with no side effects 20 | * Forces you to write modular decoupled code 21 | 22 | # How can you decide what to test? 23 | 24 | * If something caused a bug, test it 25 | * 70% of bugs will be old bugs that keep reappearing 26 | * If you manually checked if procedure X yielded reasonable results, write a test for it. 27 | 28 | # How can we test? 29 | 30 | * `assert` 31 | * Hide code behind `if __name__ == '__main__'` 32 | * Test suite 33 | 34 | # `assert` 35 | 36 | * `assert` throws an error if the assertion is False 37 | 38 | ```assert -(7 // 2) == (-7 // 2)``` 39 | 40 | * Great for inline tests 41 | * e.g. check whether the shape of a matrix is correct after a permute op 42 | 43 | # Hide code behind `if __name__ == '__main__'` 44 | 45 | * Code behind `__name__ == '__main__'` is only run if you run the file as a script directly. 46 | * Use this for lightweight tests in combination with `assert`. 47 | 48 | ```{.python} 49 | if __name__ == '__main__': 50 | assert fib(4) == 5 51 | ``` 52 | 53 | # Use a test suite 54 | 55 | * Create a specialized file with tests that run with the help of a runner. 56 | * There's `pytest` and `unittest`. 57 | * I use `unittest` because that's what I learned, and it's built-in, but people like `pytest` a lot. (A minimal `pytest` sketch follows below.)
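# The same idea with `pytest`

If you go the `pytest` route, a minimal sketch looks like this (assuming `pip install pytest`): any function whose name starts with `test_` in a file named `test_*.py` is discovered and run automatically, with no class or runner boilerplate. The equivalent `unittest` template is on the next slide.

```{.python}
# test_something.py
def test_sample():
    # Plain asserts; pytest rewrites them to give informative failure messages.
    assert True
```

```{.shell}
$ pip install pytest
$ pytest
```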
58 | 59 | # Basic template 60 | 61 | ```{.python} 62 | # test_something.py 63 | import unittest 64 | 65 | class MyTest(unittest.TestCase): 66 | def sample_test(self): 67 | self.assertTrue(True) 68 | 69 | if __name__ == '__main__': 70 | unittest.main() 71 | ``` 72 | 73 | # Run it 74 | 75 | ```{.shell} 76 | $ python test_something.py 77 | ``` 78 | 79 | To run all tests within a directory, install nose via `pip install nose2`, then: 80 | 81 | ```{.shell} 82 | $ nose2 83 | ``` 84 | 85 | # Live coding 86 | 87 | Let's code up `fib.py` tests! 88 | 89 | # Points from live coding example 90 | 91 | * Paths! 92 | * Sometimes you can get away with hacking `sys.path` 93 | * Ideally, set up a package with `pip install -e .` 94 | * There's a lot of cruft in writing tests: no shame in copy and paste (but do it once from scratch)! 95 | 96 | # A hierarchy of tests can be run with a runner 97 | 98 | * Static tests (literally your editor parsing your code to figure out if it will crash) 99 | * Asserts 100 | * Unit tests (test one function = one unit; what we just saw) 101 | * Integration tests 102 | * Smoke tests (does it crash?) 103 | * Regression tests 104 | * E2E (literally a robot clicking buttons) 105 | 106 | # Write lots of tiny unit tests that run very quickly 107 | 108 | * Goal: each unit test should run in 1 ms. 109 | * The faster you iterate, the better for your WM. 110 | * If your test suite takes more than 5 seconds to run, you will be tempted to go do something else. 111 | 112 | # Open discussion 113 | 114 | Q: what do you think is the ratio of test code to real code in a real codebase? 115 | 116 | # Open discussion 117 | 118 | A: 1:1 to 3:1, but can be many, many times that in safety critical applications 119 | 120 | e.g. the aviation standard DO-178C requires 100% code coverage (percentage of lines of code called by the tests) at its third highest safety level (Level C). 121 | 122 | For more down-to-earth applications, 80% code coverage is a common target. [You can use the `Coverage.py` package to figure out your test coverage](https://coverage.readthedocs.io/en/coverage-5.3.1/). 123 | 124 | # Demo 125 | 126 | Let's code CKA tests. We will turn properties of CKA listed in the paper into tests. 127 | 128 | # What we know about CKA 129 | 130 | * Only makes sense if two matrices are the same size along the first dimension 131 | * Pearson correlation: If $\mathbf{X}$ and $\mathbf{Y}$ are one-dimensional, then $CKA = \rho( \mathbf X, \mathbf Y)^2$. 132 | * $CKA(\mathbf X, \mathbf X) = 1$ 133 | 134 | # Live coding 135 | 136 | Note: to follow at home, look at `cka_step3.py` and `tests/test_cka_step3.py`. 137 | 138 | # What else can we know about CKA? Let's read the paper! 139 | 140 | * 2.1 _not_ invariant to non-isotropic scaling 141 | * 2.2 invariant to rotations, $CKA(\alpha \mathbf{X U}, \beta \mathbf{Y V}) = CKA(\mathbf X, \mathbf Y)$ 142 | 143 | ![Invariance to rotation](../figures/invariance_to_ortho.PNG){height=85px} 144 | 145 | * 2.3 invariant to isotropic scaling, $CKA(\alpha \mathbf X, \beta \mathbf Y) = CKA(\mathbf X, \mathbf Y)$ 146 | 147 | # Live coding (II) 148 | 149 | 150 | 151 | 152 | # Points from live coding example 153 | 154 | * Your test code can be ugly, as long as it's functional! 155 | * Define boundary conditions, pathological examples 156 | * Test that bad inputs indeed raise errors! Your code should yell when you feed it bad inputs. 157 | * Lock in current behaviour for regression testing 158 | * E.g. 
we implement a different, faster implementation of CKA in `cka_step4.py` and regression test it in `test_cka_step4.py`. 159 | 160 | # Refactoring with confidence 161 | 162 | * Your code is ugly: time to refactor! 163 | 1. Your code is ugly, tests pass 164 | 2. Rewrite the code 165 | 3. Your code is clean, tests don't pass 166 | 4. Rewrite the code 167 | 5. Iterate until tests pass again 168 | * Much less stressful with tests and git 169 | * Focus on one test at a time with `python test_cka_step3.py TestCka.test_same` 170 | * Don't forget to run the whole suite at the end! 171 | 172 | 173 | # Advanced topics! 174 | 175 | Testing deterministic side-effect free computational code has a very high returns:effort ratio, but... 176 | 177 | * [You can also test data loaders for correctness](https://github.com/patrickmineault/brain-scorer/blob/main/tests/test_pvc4_loader.py). 178 | * [You can also test data for correctness](https://github.com/patrickmineault/phaco-meta/blob/master/read-data.R#L320) 179 | * [You can also test notebooks for correctness](https://github.com/NeuromatchAcademy/course-content/blob/master/ci/verify_exercises.py#L56) 180 | * [You can integrate your tests into Github](https://github.com/patrickmineault/research_code/runs/1647753165?check_suite_focus=true) 181 | * [This presentation's repo has CI](https://github.com/patrickmineault/research_code/actions)! It's completely unnecessary! 182 | * [You can test stochastic functions](https://softwareengineering.stackexchange.com/questions/133047/unit-testing-of-inherently-random-non-deterministic-algorithms?rq=1) 183 | 184 | # Lesson 3 185 | 186 | * Test your code 187 | * Free your WM from having to consider that a piece of code unrelated to the thing you care about is broken 188 | * From lesson 1: much simpler to refactor code to make it tidy when you know you have a test scaffold which catches mistakes 189 | * From lesson 2: you will have to decouple code to write tests 190 | * Your 5-minute assignment: find a commented-out `print` statement in your code and replace it with `assert` 191 | -------------------------------------------------------------------------------- /docs/slides/04-docs.md: -------------------------------------------------------------------------------- 1 | % Docs 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Documentation 6 | 7 | Write documentation 8 | 9 | # 10 | 11 | You will forget about 90% of what you worked on. If you write it down, you'll be in a good spot. 12 | 13 | # A word of warning 14 | 15 | * I covered testing before documentation 16 | * But why? 17 | 18 | # Testing before documentation 19 | 20 | * It's more important that your code works (is correct) than it is easy to use 21 | * Docs become stale, tests have a long shelf life 22 | * If tests run, you can always copy and paste code if you can't remember how to use the code 23 | * Relatedly: if something can be a check, a warning or an exception, it should be 24 | 25 | # Documented 26 | 27 | ```{.python} 28 | def conv(A, B, padding='valid'): 29 | """ 30 | Convolves the 1d signals A and B. 31 | 32 | Args: 33 | A: a 1d numpy array 34 | B: a 1d numpy array 35 | padding (str): padding type (valid, mirror) 36 | """ 37 | pass 38 | ``` 39 | 40 | # Defensive inline checks 41 | 42 | ```{.python } 43 | def conv(A, B, padding='none'): 44 | assert A.ndim == 1 45 | assert B.ndim == 1 46 | if padding not in ('valid', 'mirror'): 47 | raise NotImplementedError( 48 | f"{padding} not implemented.") 49 | ``` 50 | 51 | # What should you document? 
52 | 53 | * References to papers 54 | * Why you wrote tricky code the way you did instead of the obvious way 55 | * TODOs (your Python editor will highlight these special comments) 56 | 57 | ```{.python} 58 | # TODO(pmin): refactor this mess 59 | ``` 60 | 61 | * Usage, especially if other people will use your code. 62 | * It's a gift from present you to future you 63 | 64 | # How should we document functions? 65 | 66 | * [Numpy style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html) or [Google style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html). 67 | 68 | ```{.python} 69 | def my_doubler(x): 70 | """Doubles x. 71 | 72 | Args: 73 | x: the number to double 74 | 75 | Returns: 76 | Twice x 77 | """ 78 | return x * 2 79 | ``` 80 | 81 | # Package docs 82 | 83 | If you create a useful package, you can generate docs for it using [Sphinx](https://www.sphinx-doc.org/en/master/index.html) and publish them on [readthedocs](https://readthedocs.org/). 84 | 85 | # 86 | 87 | There are other many kinds of *documentation* 88 | 89 | # `README.md` 90 | 91 | ![NMC3: We survived](../figures/readme.PNG){height=220px} 92 | 93 | # Console usage 94 | 95 | ![NMC3: We survived](../figures/argparse.PNG){height=220px} 96 | 97 | # Console usage 98 | 99 | ```{.shell} 100 | (py3) $ python sendit.py 101 | usage: sendit.py [-h] {list,create,add,templates,test,remove,send} ... 102 | 103 | Manage sendgrid email batches with confidence 104 | 105 | positional arguments: 106 | {list,create,add,templates,test,remove,send} 107 | list List batches 108 | create Create a new batch 109 | add Adds a set of information to a batch 110 | templates List templates 111 | test Sends a test email 112 | remove Deletes an email batch 113 | send Sends an email batch 114 | 115 | optional arguments: 116 | -h, --help show this help message and exit 117 | ``` 118 | 119 | # Lab book & blogs 120 | 121 | * I like [notion.so](https://notion.so) as a labbook 122 | * Blog: jekyll hosted on Github pages or wordpress.com 123 | * I have had a [wordpress.com blog](https://xcorr.net) for the last 12 years. Two weeks ago I copied and pasted from a blog post that I wrote in 2009. 
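# Aside: that console usage comes from `argparse`

The `sendit.py` help text shown two slides back is the kind of message `argparse` generates for free. A minimal sketch — the subcommands and arguments here are invented for illustration, not `sendit.py`'s real interface:

```{.python}
# cli.py -- hypothetical example; run `python cli.py --help` to see the
# auto-generated usage message.
import argparse

parser = argparse.ArgumentParser(
    description="Manage email batches with confidence")
subparsers = parser.add_subparsers(dest="command")

subparsers.add_parser("list", help="List batches")
send = subparsers.add_parser("send", help="Sends an email batch")
send.add_argument("batch_id", help="Which batch to send")

args = parser.parse_args()
print(args)
```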
124 | 125 | # Dashboards 126 | 127 | * If you have a project that relies on tracking and improving a metric, use a dashboard 128 | * Lots of machine learning projects are set up this way 129 | * Not only acts as an LTM, it acts as an information radiator 130 | * Many ways to do this (most of these are commercial cloud offerings with a free tier): 131 | * [R Shiny](https://shiny.rstudio.com/) 132 | * [Streamlit](https://www.streamlit.io/) 133 | * [Panel](https://panel.holoviz.org/) 134 | * [Plotly dash](https://plotly.com/dash/) 135 | * [Google Data Studio](https://datastudio.google.com/u/0/) 136 | * [W&B](https://wandb.ai/) 137 | 138 | # Sample dashboard 139 | 140 | ![NMA dashboard](../figures/dashboard.PNG){height=220px} 141 | 142 | # Lesson 4 143 | 144 | * Write documentation 145 | * Write the right kind of documentation 146 | * Save your long-term memory and offload it to a digital store 147 | * 5-minute exercise: make a `README.md` file and push it to Github -------------------------------------------------------------------------------- /docs/slides/05-social.md: -------------------------------------------------------------------------------- 1 | % Improving your skillset 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # Lesson 5 6 | 7 | Maybe the real good code is the friends we made along the way 8 | 9 | # Lesson 5 10 | 11 | Work with better people than you 12 | 13 | # Reality check 14 | 15 | * People think that programming is a solitary activity 16 | * Reality: at a place like Google, programming is very social 17 | * readability 18 | * code reviews 19 | * design reviews 20 | * pair programming 21 | * reading groups 22 | * retreats 23 | * performance reviews 24 | 25 | # Open discussion 26 | 27 | Q: what is pair programming? 28 | 29 | # Pair programming 30 | 31 | * Pair programming is a way for two programmers, potentially of different skill levels, to share knowledge through active practice 32 | * Traditional style: driver and navigator 33 | * The driver physically types the code into the terminal/editor. They think about the micro-issues (e.g. what goes in the body of the for loop) 34 | * The navigator tells the driver what to write. They typically focus on macro-issues (e.g. what should a function accomplish, how is it architected) 35 | 36 | # What you learn through pair programming 37 | 38 | * You practice your communication skills 39 | * You learn someone's productivity shortcuts. E.g. I learned about Ctrl+Shift+R (search in bash) through a pair programming session. 40 | * If one person has domain knowledge (e.g. neuroscience) and the other technical knowledge (e.g. Python), you will make faster progress than two people working separately 41 | 42 | # Tips on pair programming 43 | 44 | * Tools can make remote pair programming more comfortable 45 | * Zoom screensharing (can lead to embarrassment if e.g. you accidentally Alt-Tab to an ebay search for beanie babies) 46 | 47 | # Practical tips 48 | 49 | * Open an issue in an open source project 50 | * Open a PR in an open source project 51 | * Set up pair programming with people in your lab or study group 52 | * One person drives, one person co-pilots 53 | * Learn how at NMA2021! 54 | * [CoCalc](https://cocalc.com/) and [DeepNote](https://deepnote.com/) can do this remotely. 55 | * Set up a study group with fellow programmers ([event suggestions from Mozilla](http://mozillascience.github.io/studyGroupHandbook/event-types.html)).
56 | 57 | # Set up a review circle 58 | 59 | [You can use Github Pull Requests to give and receive line-by-line feedback on code](https://docs.github.com/en/enterprise-server@2.20/github/collaborating-with-issues-and-pull-requests/reviewing-proposed-changes-in-a-pull-request). 60 | 61 | 62 | # It doesn't have to be lonely 63 | 64 | - Maybe you're the best coder in your lab so you don't have opportunities for growth 65 | - Contribute to open source projects 66 | - [NMA](https://neuromatchacademy.org/) & [NMC](https://neuromatch.io/) are always happy to have more people! 67 | - Join a community or hackerspace 68 | - [BrainHack.org](https://brainhack.org/) 69 | - Meetup 70 | - [Hackerspaces](https://wiki.hackerspaces.org/w/index.php) 71 | - [PyLadies](https://www.pyladies.com/) 72 | 73 | # Become a wizard! 74 | 75 | ![zine by Julia Evans, released under CC-BY-NC-SA 4.0 license](../figures/wizard.png){height=220px} 76 | 77 | # Wizard! 78 | 79 | - Great zine about how to become a wizard from [Julia Evans](https://www.twitter.com/b0rk) 80 | - [Zine link](https://wizardzines.com/comics/take-on-hard-projects/) 81 | 82 | # You are never finished learning! 83 | 84 | ![](../figures/reproducible_research.png) 85 | 86 | # Acknowledgement 87 | 88 | Thanks to the reviewers, Tyler Sloan and Elizabeth DuPre who made this talk much better. 89 | 90 | # You can accomplish anything! 91 | 92 | ![](../figures/nma.png) 93 | 94 | # Lesson 5 95 | 96 | * Work with better people than you 97 | * It's a bit of a stretch to make this work with the theme of WM & LTM 98 | * The most important point: grow with people 99 | * Your 5-minute exercise: schedule one pair-programming session 100 | -------------------------------------------------------------------------------- /docs/slides/99-standalone-testing.md: -------------------------------------------------------------------------------- 1 | % Testing: standalone lecture 2 | % Good research code 3 | % Patrick Mineault 4 | 5 | # 6 | 7 | Intro 8 | 9 | # Who is this lecture for? 10 | 11 | ![](../figures/tweet.png) 12 | 13 | # Who is this lecture for? 14 | 15 | * Most people who do coding-heavy research are not trained in CS or software engineering 16 | * You're probably in this bucket 17 | * Bad consequences: 18 | * You feel like you don't know what you're doing 19 | * Imposter syndrome 20 | * Low productivity 21 | * Bugs 22 | * You hate your code and you don't want to work on it 23 | * You never graduate 24 | * You have great sadness in your heart 25 | * It doesn't have to be all bad! 
26 | 27 | # My weird perspective 28 | 29 | * Patrick Mineault, PhD in neuroscience 30 | * (wildly underqualified) software engineer at Google 31 | * Research scientist at Facebook on brain-computer interfaces 32 | * Technical chair of Neuromatch Academy 33 | * Independent researcher and technologist 34 | * Occasionally taught CS 35 | 36 | # Regrets, I've had a few 37 | 38 | * Mostly self-taught in programming 39 | * Didn't study CS until very late 40 | * Wasted months working with bad code of my own making 41 | * Not a great coder, but better than in grad school 42 | * I think you might be curious 43 | 44 | # The single most useful skill 45 | 46 | Testing 47 | 48 | # Organization 49 | 50 | * Assume that you know a little bit about [Python](https://swcarpentry.github.io/python-novice-inflammation/), [git](http://swcarpentry.github.io/git-novice/) and the [command line](http://swcarpentry.github.io/shell-novice/) 51 | * You can catch up on these topics via Software Carpentries 52 | * I don't expect you to have any experience in packaging code, distribution, working in groups. 53 | * This is a subset of a longer series of lectures which you can refer to 54 | * https://github.com/patrickmineault/research_code 55 | * One day I will record the whole lecture set and maybe run it live 56 | * Interrupt me and chat! 57 | * Learning objectives for this lecture 58 | * What is testing? 59 | * What should I test? 60 | * How can I test? 61 | * How can I integrate testing into a project I'm doing right now? 62 | 63 | 64 | # Bulding around testing 65 | 66 | > Most scientists who write software constantly test their code. That is, if you are a scientist writing software, I am sure that you have tried to see how well your code works by running every new function you write, examining the inputs and the outputs of the function, to see if the code runs properly (without error), and to see whether the results make sense. Automated code testing takes this informal practice, makes it formal, and automates it, so that you can make sure that your code does what it is supposed to do, even as you go about making changes around it. --Ariel Rokem, Shablona README 67 | 68 | # Open discussion 69 | 70 | * Let's say we have a function in `fib.py`: 71 | 72 | ```{.python} 73 | def fib(n): 74 | if n >= 2: 75 | return fib(n-2) + fib(n-1) 76 | else: 77 | return 1 78 | ``` 79 | 80 | * Let's test `fib.py` 81 | * What can we test? 82 | 83 | # What can we test about `fib`? 84 | 85 | * Correctness, e.g. $F(4) = 5$ 86 | * Edge cases, e.g. $F(0) = 1$, $F(-1)$ → *error* 87 | 88 | # How can you decide what to test? 89 | 90 | * If something caused a bug, test it 91 | * 70% of bugs will be old bugs that keep reappearing 92 | * If you manually checked if procedure X yielded reasonable results, write a test for it. 93 | 94 | # What will this give me? 95 | 96 | * Decrease bugs: You'll uncover bugs which you'll fix immediately 97 | * Peace of mind: You'll know that your code is correct 98 | * Easy refactors: If you change your code you can easily find out if it's still correct 99 | * Docs: You will know how to call your code long after you've stopped working on it actively 100 | * Better code: If you write your code to be testable you'll write better-organized code 101 | 102 | # How can we test? 
103 | 104 | * `assert` 105 | * Hide code behind `if __name__ == '__main__'` 106 | * Test suite 107 | 108 | # `assert` 109 | 110 | * `assert` throws an error if the assertion is False 111 | 112 | ```assert -(7 // 2) == (-7 // 2)``` 113 | 114 | * Great for inline tests 115 | * e.g. check whether the shape of a matrix is correct after a permute operation 116 | 117 | # Hide code behind `if __name__ == '__main__'` 118 | 119 | * Code behind `__name__ == '__main__'` is only run if you run the file as a script directly. 120 | * Use this for lightweight tests in combination with `assert`. 121 | 122 | ```{.python} 123 | if __name__ == '__main__': 124 | assert fib(4) == 5 125 | ``` 126 | 127 | # Use a test suite 128 | 129 | * Create a specialized file with tests that run with the help of a runner. 130 | * There's `pytest` and `unittest`. 131 | * I use `unittest` because that's what I learned, and it's built-in, but people like `pytest` a lot. 132 | 133 | # Basic template 134 | 135 | ```{.python} 136 | # test_something.py 137 | import unittest 138 | 139 | class MyTest(unittest.TestCase): 140 | def sample_test(self): 141 | self.assertTrue(True) 142 | 143 | if __name__ == '__main__': 144 | unittest.main() 145 | ``` 146 | 147 | # Run it 148 | 149 | ```{.shell} 150 | $ python test_something.py 151 | ``` 152 | 153 | To run all tests within a directory, install nose via `pip install nose2`, then: 154 | 155 | ```{.shell} 156 | $ nose2 157 | ``` 158 | 159 | # Live coding 160 | 161 | Let's code up `fib.py` tests! 162 | 163 | # Points from live coding example 164 | 165 | * Paths! 166 | * Sometimes you can get away with hacking `sys.path` 167 | * Ideally, set up a package with `pip install -e .` 168 | * There's a lot of cruft in writing tests: no shame in copy and paste (but do it once from scratch)! 169 | 170 | # A hierarchy of tests can be run with a runner 171 | 172 | * Static tests (literally your editor parsing your code to figure out if it will crash) 173 | * Asserts 174 | * Unit tests (test one function = one unit; what we just saw) 175 | * Integration tests 176 | * Smoke tests (does it crash?) 177 | * Regression tests 178 | * E2E (literally a robot clicking buttons) 179 | 180 | # Write lots of tiny unit tests that run very quickly 181 | 182 | * Goal: each unit test should run in 1 ms. 183 | * The faster you iterate, the better 184 | * If your test suite takes more than 5 seconds to run, you will be tempted to go do something else. 185 | 186 | # Open discussion 187 | 188 | Q: what do you think is the ratio of test code to real code in a real codebase? 189 | 190 | # Open discussion 191 | 192 | A: 1:1 to 3:1, but can be many, many times that in safety critical applications 193 | 194 | e.g. the aviation standard DO-178C requires 100% code coverage (percentage of lines of code called by the tests) at its third highest safety level (Level C). 195 | 196 | For more down-to-earth applications, 80% code coverage is a common target. [You can use the `Coverage.py` package to figure out your test coverage](https://coverage.readthedocs.io/en/coverage-5.3.1/). 197 | 198 | # Demo 199 | 200 | Let's code up a non-trivial set of tests for a real paper. 201 | 202 | # Background on centered kernel alignment 203 | 204 | Q: How can we compare how different brain areas and artificial neural networks represent the world? 205 | 206 | A: Choose a standard battery of stimuli, measure responses across systems, compare the responses between the systems. Many approaches, including: 207 | 208 | * forward encoding models (e.g. 
ridge regression) 209 | * canonical correlation analysis (CCA) 210 | * representational similarity analysis (RSA). 211 | 212 | # CKA 213 | 214 | [Kornblith et al. (2019)](https://arxiv.org/abs/1905.00414) propose a new method to compare representations. You can think of it as a generalization of the (square of the) Pearson correlation coefficient, but with matrices instead of vectors. 215 | 216 | ![Alignment between layers of two neural nets initialized with different seeds](../figures/cka_example.png){height=100px} 217 | 218 | Importantly, CKA is not implemented in scipy or sklearn, github gives very few hits ^[1]... it's real research code! 219 | 220 | [1] [There is an implementation in a notebook from authors](https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb) 221 | 222 | # What we know about CKA 223 | 224 | * Pearson correlation: If $\mathbf{X}$ and $\mathbf{Y}$ are one-dimensional, then $CKA = \rho( \mathbf X, \mathbf Y)^2$. 225 | * Only makes sense if two matrices are the same size along the first dimension 226 | * $CKA(\mathbf X, \mathbf X) = 1$ 227 | 228 | # Live coding 229 | 230 | Note: to follow at home, look at `cka_step3.py` and `tests/test_cka_step3.py`. 231 | 232 | 233 | # Points from live coding example 234 | 235 | * Your test code can be ugly, as long as it's functional! 236 | * Define boundary conditions, pathological examples 237 | * Test that bad inputs indeed raise errors! Your code should yell when you feed it bad inputs. 238 | * Lock in current behaviour for regression testing 239 | * E.g. we implement a different, faster implementation of CKA in `cka_step4.py` and regression test it in `test_cka_step4.py`. 240 | 241 | # Refactoring with confidence 242 | 243 | * Your code is ugly: time to refactor! 244 | 1. Your code is ugly, tests pass 245 | 2. Rewrite the code 246 | 3. Your code is clean, tests don't pass 247 | 4. Rewrite the code 248 | 5. Iterate until tests pass again 249 | * Much less stressful with tests and git 250 | * Focus on one test at a time with `python test_cka_step3.py TestCka.test_same` 251 | * Don't forget to run the whole suite at the end! 252 | 253 | 254 | # Advanced topics! 255 | 256 | Testing deterministic side-effect free computational code has a very high returns:effort ratio, but... 257 | 258 | * [You can also test data loaders for correctness](https://github.com/patrickmineault/brain-scorer/blob/main/tests/test_pvc4_loader.py). 259 | * [You can also test data for correctness](https://github.com/patrickmineault/phaco-meta/blob/master/read-data.R#L320) 260 | * [You can also test notebooks for correctness](https://github.com/NeuromatchAcademy/course-content/blob/master/ci/verify_exercises.py#L56) 261 | * [You can integrate your tests into Github](https://github.com/patrickmineault/research_code/runs/1647753165?check_suite_focus=true) 262 | * [This presentation's repo has CI](https://github.com/patrickmineault/research_code/actions)! It's completely unnecessary! 
263 | * [You can test stochastic functions](https://softwareengineering.stackexchange.com/questions/133047/unit-testing-of-inherently-random-non-deterministic-algorithms?rq=1) 264 | 265 | # Lesson 3 266 | 267 | * Test your code 268 | * Your 5-minute assignment: find a commented-out `print` statement in your code and replace it with `assert` -------------------------------------------------------------------------------- /docs/slides/Makefile: -------------------------------------------------------------------------------- 1 | OUTDIR := pdf 2 | 3 | # This pattern excludes README.md 4 | MD_FILES=$(wildcard *-*.md) 5 | OUT_FILES=$(patsubst %.md, $(OUTDIR)/%.pdf, $(MD_FILES)) 6 | 7 | all : directories $(OUT_FILES) 8 | 9 | .PHONY : clean 10 | clean : 11 | rm -f $(OUTDIR)/*.pdf 12 | 13 | $(OUTDIR)/%.pdf : %.md preamble.tex 14 | pandoc -t beamer -s $< -o $@ -H preamble.tex 15 | 16 | directories : $(OUTDIR) 17 | 18 | $(OUTDIR): 19 | mkdir -p $(OUTDIR) -------------------------------------------------------------------------------- /docs/slides/README.md: -------------------------------------------------------------------------------- 1 | # Slides for writing good research code good 2 | 3 | * [Introduction and keeping things tidy](pdf/01-intro.pdf) 4 | * [Decoupling code](pdf/02-decouple.pdf) 5 | * [Testing](pdf/03-testing.pdf) 6 | * [Documentation](pdf/04-docs.pdf) 7 | * [Make it social](pdf/05-social.pdf) 8 | 9 | # References 10 | 11 | ## Reading 12 | 13 | - Research software engineering: [https://merely-useful.github.io/py-rse/](https://merely-useful.github.io/py-rse/) 14 | - Making packages and testing [https://education.molssi.org/python-package-best-practices/index.html](https://education.molssi.org/python-package-best-practices/index.html) 15 | - Carpentries testing Python: [http://carpentries-incubator.github.io/python-testing/](http://carpentries-incubator.github.io/python-testing/) 16 | - Shablona template: [https://github.com/uwescience/shablona](https://github.com/uwescience/shablona) 17 | - The Turing Way: [https://the-turing-way.netlify.app/reproducible-research/code-quality.html](https://the-turing-way.netlify.app/reproducible-research/code-quality.html) 18 | - Software engineering best practices: [http://www.bris.ac.uk/acrc/acrc-training/](http://www.bris.ac.uk/acrc/acrc-training/) 19 | - Data science in practice paper: [https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725](https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725) 20 | - Software engineering for data scientists: [http://uwseds.github.io/](http://uwseds.github.io/) 21 | 22 | ## Media 23 | 24 | - Software engineering for research: [https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title](https://www.youtube.com/watch?v=SxoDCo9iNI0&feature=emb_title) 25 | - Test and code for scientists (podcast): [https://testandcode.com/140](https://testandcode.com/140) 26 | 27 | ## Inspiration 28 | 29 | * The Zen of Python: https://zen-of-python.info/ 30 | 31 | ## Tools 32 | 33 | * [IDEs for scientific Python](https://xcorr.net/2013/04/17/evaluating-ides-for-scientific-python/) 34 | 35 | # Compiling these slides 36 | 37 | Slides can be compiled with `make all`. 
Requires pandoc: 38 | 39 | ``` 40 | sudo apt-get install pandoc texlive texlive-latex-extra 41 | pip install pandoc-latex-fontsize 42 | ``` -------------------------------------------------------------------------------- /docs/slides/pdf/01-intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/01-intro.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/02-decouple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/02-decouple.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/03-testing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/03-testing.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/04-docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/04-docs.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/05-social.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/05-social.pdf -------------------------------------------------------------------------------- /docs/slides/pdf/99-standalone-testing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/docs/slides/pdf/99-standalone-testing.pdf -------------------------------------------------------------------------------- /docs/slides/preamble.tex: -------------------------------------------------------------------------------- 1 | %Found here: https://github.com/alexeygumirov/pandoc-beamer-how-to/blob/master/pandoc/templates/preamble.tex 2 | %%%%%%%%%%%%%color 3 | \definecolor{UBCblue}{rgb}{0.04706, 0.13725, 0.26667} % UBC Blue (primary) 4 | \definecolor{UBCgrey}{rgb}{0.3686, 0.5255, 0.6235} % UBC Grey (secondary) 5 | 6 | \definecolor{orange}{RGB}{244,167,66} 7 | 8 | \setbeamercolor{palette primary}{bg=UBCblue,fg=white} 9 | \setbeamercolor{palette secondary}{bg=UBCblue,fg=white} 10 | \setbeamercolor{palette tertiary}{bg=UBCblue,fg=white} 11 | \setbeamercolor{palette quaternary}{bg=UBCblue,fg=white} 12 | \setbeamercolor{structure}{fg=UBCblue} % itemize, enumerate, etc 13 | \setbeamercolor{section in toc}{fg=UBCblue} % TOC sections 14 | 15 | %% change circle miniframes color 16 | \setbeamercolor{mini frame}{fg=orange, bg=UBCblue} 17 | 18 | %% Change subsection in footer color (author and institute color) 19 | \setbeamercolor{subsection in head/foot}{bg=UBCgrey,fg=white} 20 | 21 | %change ilmenau section dot color 22 | \setbeamercolor{section in head/foot}{fg=orange} 23 | 24 | %% Change the circle in miniframes to a box 25 | %\setbeamertemplate{mini frame}[box] 26 | %\setbeamertemplate{mini frame in current subsection}[box] 27 | 28 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 29 | %%%% Create framenumber in footer 30 | \newcommand{\frameofframes}{/} 31 | \newcommand{\setframeofframes}[1]{\renewcommand{\frameofframes}{#1}} 32 | 33 | \setframeofframes{of} 34 | \makeatletter 35 | \setbeamertemplate{footline} 36 | {% 37 | \begin{beamercolorbox}[colsep=1.5pt]{upper separation line foot} 38 | \end{beamercolorbox} 39 | \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,% 40 | leftskip=.3cm,rightskip=.3cm plus1fil]{author in head/foot}% 41 | \leavevmode{\usebeamerfont{author in head/foot}\insertshortauthor}% 42 | \hfill% 43 | {\usebeamerfont{institute in head/foot}\usebeamercolor[fg]{institute in head/foot}\insertshortinstitute}% 44 | \end{beamercolorbox}% 45 | \begin{beamercolorbox}[ht=2.5ex,dp=1.125ex,% 46 | leftskip=.3cm,rightskip=.3cm plus1fil]{title in head/foot}% 47 | {\usebeamerfont{title in head/foot}\insertshorttitle}% 48 | \hfill% 49 | {\usebeamerfont{frame number}\usebeamercolor[fg]{frame number}\insertframenumber~\frameofframes~\inserttotalframenumber} 50 | \end{beamercolorbox}% 51 | \begin{beamercolorbox}[colsep=1.5pt]{lower separation line foot} 52 | \end{beamercolorbox} 53 | } 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | %%% Remove subsection bar in mini frames 56 | \defbeamertemplate*{headline}{miniframes theme no subsection} 57 | {% 58 | \begin{beamercolorbox}[colsep=1.5pt]{upper separation line head} 59 | \end{beamercolorbox} 60 | \begin{beamercolorbox}{section in head/foot} 61 | \vskip2pt\insertnavigation{\paperwidth}\vskip2pt 62 | \end{beamercolorbox}% 63 | \begin{beamercolorbox}[colsep=1.5pt]{lower separation line head} 64 | \end{beamercolorbox} 65 | } 66 | 67 | \setbeamertemplate{footline}[miniframes theme no subsection] 68 | %%%%%%%%%%%%%%%%%%%%%%%%%%% 69 | \makeatother -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | matplotlib 3 | seaborn 4 | torch -------------------------------------------------------------------------------- /research_code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/research_code/__init__.py -------------------------------------------------------------------------------- /research_code/cka_not_great.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import os 4 | import pickle 5 | 6 | 7 | """Run a linear centered-kernel alignment (CKA) to find how close two 8 | latent representations of 9 | 10 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 11 | 12 | Load data from ../data and compare them. All the matrices in 13 | ../data/matrices.pkl have the same height (number of examplars) but 14 | potentially different numbers of columns. 
15 | """ 16 | f = open('../data/matrices.pkl', 'rb') 17 | data = pickle.load(f)['reps'] 18 | 19 | cka = np.zeros((5, 5)) 20 | for i in range(5): 21 | for j in range(i+1, 5): 22 | X, Y = data[i], data[j] 23 | X = (X - X.mean(0).reshape((1, -1))) 24 | Y = (Y - Y.mean(0).reshape((1, -1))) 25 | 26 | XTX = X.T.dot(X) 27 | YTY = Y.T.dot(Y) 28 | YTX = Y.T.dot(X) 29 | 30 | # Equation (4) 31 | cka[i, j] = (YTX ** 2).sum() / np.sqrt((XTX * XTX).sum() * (YTY * YTY).sum()) 32 | 33 | cka = cka + cka.T 34 | cka = cka + np.eye(cka.shape[0]) 35 | 36 | plt.figure() 37 | plt.imshow(cka) 38 | plt.colorbar() 39 | plt.xticks([0, 1, 2, 3, 4], ['baseline', 'rescaled', 'nuisance', 'truncated1', 'truncated2']) 40 | plt.yticks([0, 1, 2, 3, 4], ['baseline', 'rescaled', 'nuisance', 'truncated1', 'truncated2']) 41 | plt.title('Similarity of different representations (CKA)') 42 | 43 | # os.makedirs('../results') 44 | plt.savefig('../results/closeness.png') 45 | 46 | -------------------------------------------------------------------------------- /research_code/cka_step2.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import seaborn as sns 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | 10 | 11 | def multi_cka(reps: List[np.array]) -> np.array: 12 | """ 13 | Calculate CKA matrix for a list of matrices. 14 | 15 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 16 | 17 | Args: 18 | reps: a list of representations of the same data from different 19 | networks. All have the same height (number of examplars) but 20 | potentially different numbers of columns. 21 | 22 | Returns: 23 | the CKA matrix (larger values mean more similar). 24 | """ 25 | C = np.zeros((len(reps), len(reps))) 26 | for i in range(len(reps)): 27 | C[i, i] = 1.0 # by definition 28 | for j in range(i+1, len(reps)): 29 | X, Y = reps[i], reps[j] 30 | X = X - X.mean(0, keepdims=True) 31 | Y = Y - Y.mean(0, keepdims=True) 32 | 33 | XTX = X.T @ X 34 | YTY = Y.T @ Y 35 | YTX = Y.T @ X 36 | 37 | # Equation (4) 38 | top = (YTX ** 2).sum() 39 | bottom = np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 40 | c = top / bottom 41 | C[i, j] = c 42 | C[j, i] = c 43 | 44 | return C 45 | 46 | def main(): 47 | with open ('../data/matrices.pkl', 'rb') as f: 48 | data = pickle.load(f) 49 | 50 | C = multi_cka(data['reps']) 51 | 52 | df = pd.DataFrame(C) 53 | df.index = data['models'] 54 | df.columns = data['models'] 55 | 56 | ax = sns.heatmap(df, annot=True, fmt='.2f') 57 | try: 58 | os.makedirs('../results') 59 | except FileExistsError: 60 | pass 61 | 62 | plt.title('Similarity of different representations (CKA)') 63 | plt.savefig('../results/closeness_sns.png') 64 | 65 | if __name__ == "__main__": 66 | main() 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /research_code/cka_step3.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import seaborn as sns 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | 10 | def cka(X, Y): 11 | """ 12 | Calculate CKA for two matrices 13 | """ 14 | X = X - X.mean(0, keepdims=True) 15 | Y = Y - Y.mean(0, keepdims=True) 16 | 17 | XTX = X.T @ X 18 | YTY = Y.T @ Y 19 | YTX = Y.T @ X 20 | 21 | # Equation (4) 22 | top = (YTX ** 2).sum() 23 | bottom = np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 24 | c = top / bottom 
25 | 26 | return c 27 | 28 | 29 | def multi_cka(reps: List[np.array]) -> np.array: 30 | """ 31 | Calculate CKA matrix for a list of matrices. 32 | 33 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 34 | 35 | Args: 36 | reps: a list of representations of the same data from different 37 | networks. All have the same height (number of examplars) but 38 | potentially different numbers of columns. 39 | 40 | Returns: 41 | the CKA matrix (larger values mean more similar). 42 | """ 43 | C = np.zeros((len(reps), len(reps))) 44 | for i in range(len(reps)): 45 | C[i, i] = 1.0 # by definition 46 | for j in range(i+1, len(reps)): 47 | c = cka(reps[i], reps[j]) 48 | 49 | C[i, j] = c 50 | C[j, i] = c 51 | 52 | return C 53 | 54 | def main(): 55 | with open ('../data/matrices.pkl', 'rb') as f: 56 | data = pickle.load(f) 57 | 58 | C = multi_cka(data['reps']) 59 | 60 | df = pd.DataFrame(C) 61 | df.index = data['models'] 62 | df.columns = data['models'] 63 | 64 | ax = sns.heatmap(df, annot=True, fmt='.2f') 65 | try: 66 | os.makedirs('../results') 67 | except FileExistsError: 68 | pass 69 | 70 | plt.title('Similarity of different representations (CKA)') 71 | plt.savefig('../results/closeness_sns.png') 72 | 73 | if __name__ == "__main__": 74 | main() 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /research_code/cka_step4.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import seaborn as sns 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | 10 | 11 | def cka_wide(X, Y): 12 | """ 13 | Calculate CKA for two matrices. This algorithm uses a Gram matrix 14 | implementation, which is fast when the data is wider than it is 15 | tall. 16 | 17 | This implementation is inspired by the one in this colab: 18 | https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb#scrollTo=MkucRi3yn7UJ 19 | 20 | Note that we use center the features rather than the Gram matrix 21 | because we think the latter is tricky and mysterious. It only works for 22 | linear CKA though (we only implement linear CKA throughout). 23 | """ 24 | X = X - X.mean(0, keepdims=True) 25 | Y = Y - Y.mean(0, keepdims=True) 26 | 27 | XXT = X @ X.T 28 | YYT = Y @ Y.T 29 | 30 | # We use reshape((-1,)) instead of ravel() to ensure this is compatible 31 | # with numpy and pytorch tensors. 32 | top = (XXT.reshape((-1,)) * YYT.reshape((-1,))).sum() 33 | bottom = np.sqrt((XXT ** 2).sum() * (YYT ** 2).sum()) 34 | c = top / bottom 35 | 36 | return c 37 | 38 | 39 | def cka_tall(X, Y): 40 | """ 41 | Calculate CKA for two matrices. 42 | """ 43 | X = X - X.mean(0, keepdims=True) 44 | Y = Y - Y.mean(0, keepdims=True) 45 | 46 | XTX = X.T @ X 47 | YTY = Y.T @ Y 48 | YTX = Y.T @ X 49 | 50 | # Equation (4) 51 | top = (YTX ** 2).sum() 52 | bottom = np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 53 | c = top / bottom 54 | 55 | return c 56 | 57 | def cka(X, Y): 58 | """ 59 | Calculate CKA for two matrices. 60 | 61 | CKA has several potential implementations. The naive implementation is 62 | appropriate for tall matrices (more examples than features), but this 63 | implementation uses lots of memory and it slow when there are many more 64 | features than examples. In that case, which often happens with DNNs, we 65 | prefer the Gram matrix variant. 
66 | """ 67 | 68 | if X.shape[0] < X.shape[1]: 69 | return cka_wide(X, Y) 70 | else: 71 | return cka_tall(X, Y) 72 | 73 | 74 | def multi_cka(reps: List[np.array]) -> np.array: 75 | """ 76 | Calculate CKA matrix for a list of matrices. 77 | 78 | Kornblith et al. (2019) https://arxiv.org/abs/1905.00414 79 | 80 | Args: 81 | reps: a list of representations of the same data from different 82 | networks. All have the same height (number of examplars) but 83 | potentially different numbers of columns. 84 | 85 | Returns: 86 | the CKA matrix (larger values mean more similar). 87 | """ 88 | C = np.zeros((len(reps), len(reps))) 89 | for i in range(len(reps)): 90 | C[i, i] = 1.0 # by definition 91 | for j in range(i+1, len(reps)): 92 | c = cka(reps[i], reps[j]) 93 | 94 | C[i, j] = c 95 | C[j, i] = c 96 | 97 | return C 98 | 99 | def main(): 100 | with open ('../data/matrices.pkl', 'rb') as f: 101 | data = pickle.load(f) 102 | 103 | C = multi_cka(data['reps']) 104 | 105 | df = pd.DataFrame(C) 106 | df.index = data['models'] 107 | df.columns = data['models'] 108 | 109 | ax = sns.heatmap(df, annot=True, fmt='.2f') 110 | try: 111 | os.makedirs('../results') 112 | except FileExistsError: 113 | pass 114 | 115 | plt.title('Similarity of different representations (CKA)') 116 | plt.savefig('../results/closeness_sns.png') 117 | 118 | if __name__ == "__main__": 119 | main() 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /research_code/fib.py: -------------------------------------------------------------------------------- 1 | def memoize(fun): 2 | """Memoizes a function of one argument.""" 3 | the_dict = {} 4 | def wrapper_decorator(*args, **kwargs): 5 | assert len(args) == 1, "Only works with one argument" 6 | if args[0] not in the_dict: 7 | the_dict[args[0]] = fun(args[0]) 8 | return the_dict[args[0]] 9 | return wrapper_decorator 10 | 11 | 12 | @memoize 13 | def fib(n): 14 | """Calculates the n'th fibonacci number (memo-ized version). 15 | 16 | Args: 17 | n: Which Fibonacci number to return 18 | 19 | Returns: the n'th Fibonacci number. 
20 | """ 21 | if n >= 2: 22 | return fib(n-2) + fib(n-1) 23 | else: 24 | return 1 25 | -------------------------------------------------------------------------------- /research_code/fib_and_test.py: -------------------------------------------------------------------------------- 1 | def fib(n): 2 | if n >= 2: 3 | return fib(n-1) + fib(n-2) 4 | else: 5 | assert n != 2 6 | return 1 7 | 8 | if __name__ == '__main__': 9 | print("Tests running") 10 | assert fib(0) == 1 # expect 1 11 | assert fib(2) == 2 # 2 12 | assert fib(4) == 5 # 5 13 | print("Tests passed") -------------------------------------------------------------------------------- /research_code/fib_monolithic.py: -------------------------------------------------------------------------------- 1 | memory = {} 2 | 3 | """ 4 | The N'th Fibonacci number is 5 | 6 | F(n) = F(n-1) + F(n-2) 7 | 8 | with F(0) = 1, F(1) = 1 9 | """ 10 | 11 | def fib(n): 12 | global memory 13 | if n not in memory: 14 | if n >= 2: 15 | memory[n] = fib(n-2) + fib(n-1) 16 | else: 17 | memory[n] = 1 18 | return memory[n] -------------------------------------------------------------------------------- /research_code/tests/test_cka_step3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | 4 | from research_code.cka_step3 import cka 5 | 6 | def _get_one(): 7 | X = np.cos(.1 * np.pi * np.arange(10)).reshape((-1, 1)) 8 | Y = np.cos(2 + .07 * np.pi * np.arange(10)).reshape((-1, 1)) 9 | return X, Y 10 | 11 | def _get_multi(): 12 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=3).reshape((1, -1))) 13 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=4).reshape((1, -1))) 14 | return X, Y 15 | 16 | def _get_wide(): 17 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=50).reshape((1, -1))) 18 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=47).reshape((1, -1))) 19 | return X, Y 20 | 21 | class TestCka(unittest.TestCase): 22 | 23 | @unittest.expectedFailure 24 | def test_wrong_dim(self): 25 | """It should throw an error if we have a different number of stimuli""" 26 | X = np.ones((8, 1)) 27 | Y = np.ones((10, 1)) 28 | cka(X, Y) 29 | 30 | def test_same(self): 31 | """The CKA of a matrix and itself is one""" 32 | X, _ = _get_one() 33 | self.assertAlmostEqual(cka(X, X), 1) 34 | 35 | def test_corr(self): 36 | """The CKA of two vectors is the square of the correlation coefficient""" 37 | X, Y = _get_one() 38 | c1 = cka(X, Y) 39 | c2 = np.corrcoef(X.squeeze(), Y.squeeze())[0, 1] ** 2 40 | self.assertAlmostEqual(c1, c2) 41 | 42 | def test_isoscaling(self): 43 | """CKA is insensitive to scaling by a scalar""" 44 | X, Y = _get_multi() 45 | c1 = cka(X, Y) 46 | c2 = cka(2.0 * X, - 1 * Y) 47 | self.assertAlmostEqual(c1, c2) 48 | 49 | def test_rotation(self): 50 | """CKA is insensitive to rotations""" 51 | X, Y = _get_multi() 52 | X0 = X[:, :2] 53 | X0p = X0 @ np.array([[1, -1], [1, 1]]) / np.sqrt(2) 54 | c1 = cka(X0, Y) 55 | c2 = cka(X0p, Y) 56 | self.assertAlmostEqual(c1, c2) 57 | 58 | def test_no_iso(self): 59 | """CKA is sensitive to column scaling""" 60 | X, Y = _get_multi() 61 | X0 = X[:, :2] 62 | X0p = X0 @ np.array([[1, 1], [10, 1]]) 63 | c1 = cka(X0, Y) 64 | c2 = cka(X0p, Y) 65 | self.assertGreater(abs(c1 - c2), .001) 66 | 67 | def test_value(self): 68 | """Regression test: for this particular input, check that the value 69 | is the same as it always 
was.""" 70 | X, Y = _get_multi() 71 | c1 = cka(X, Y) 72 | self.assertAlmostEqual(c1, 0.96577, places=4) 73 | 74 | def test_wide(self): 75 | """Smoke test.""" 76 | X, Y = _get_wide() 77 | c1 = cka(X, Y) 78 | 79 | if __name__ == '__main__': 80 | unittest.main() -------------------------------------------------------------------------------- /research_code/tests/test_cka_step4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import unittest 3 | from research_code.cka_step4 import cka, cka_tall, cka_wide 4 | from research_code.cka_step3 import cka as old_cka 5 | 6 | def _get_one(): 7 | X = np.cos(.1 * np.pi * np.arange(10)).reshape((-1, 1)) 8 | Y = np.cos(2 + .07 * np.pi * np.arange(10)).reshape((-1, 1)) 9 | return X, Y 10 | 11 | def _get_multi(): 12 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=3).reshape((1, -1))) 13 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=4).reshape((1, -1))) 14 | return X, Y 15 | 16 | def _get_wide(): 17 | X = np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.5, 1.5, num=50).reshape((1, -1))) 18 | Y = np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * np.linspace(.7, 1.3, num=47).reshape((1, -1))) 19 | return X, Y 20 | 21 | class TestCka(unittest.TestCase): 22 | 23 | @unittest.expectedFailure 24 | def test_wrong_dim(self): 25 | """It should throw an error if we have a different number of stimuli""" 26 | X = np.ones((8, 1)) 27 | Y = np.ones((10, 1)) 28 | cka(X, Y) 29 | 30 | def test_same(self): 31 | """The CKA of a matrix and itself is one""" 32 | X, _ = _get_one() 33 | self.assertAlmostEqual(cka(X, X), 1) 34 | 35 | def test_corr(self): 36 | """The CKA of two vectors is the square of the correlation coefficient""" 37 | X, Y = _get_one() 38 | c1 = cka(X, Y) 39 | c2 = np.corrcoef(X.squeeze(), Y.squeeze())[0, 1] ** 2 40 | self.assertAlmostEqual(c1, c2) 41 | 42 | def test_isoscaling(self): 43 | """CKA is insensitive to scaling by a scalar""" 44 | X, Y = _get_multi() 45 | c1 = cka(X, Y) 46 | c2 = cka(2.0 * X, - 1 * Y) 47 | self.assertAlmostEqual(c1, c2) 48 | 49 | def test_rotation(self): 50 | """CKA is insensitive to rotations""" 51 | X, Y = _get_multi() 52 | X0 = X[:, :2] 53 | X0p = X0 @ np.array([[1, -1], [1, 1]]) / np.sqrt(2) 54 | c1 = cka(X0, Y) 55 | c2 = cka(X0p, Y) 56 | self.assertAlmostEqual(c1, c2) 57 | 58 | def test_no_iso(self): 59 | """CKA is sensitive to column scaling""" 60 | X, Y = _get_multi() 61 | X0 = X[:, :2] 62 | X0p = X0 @ np.array([[1, 1], [10, 1]]) 63 | c1 = cka(X0, Y) 64 | c2 = cka(X0p, Y) 65 | self.assertGreater(abs(c1 - c2), .001) 66 | 67 | def test_value(self): 68 | """Regression test: for this particular input, check that the value 69 | is the same as it always was.""" 70 | X, Y = _get_multi() 71 | c1 = cka(X, Y) 72 | self.assertAlmostEqual(c1, 0.96577, places=4) 73 | 74 | def test_wide(self): 75 | """Smoke test.""" 76 | X, Y = _get_wide() 77 | c1 = cka(X, Y) 78 | 79 | def test_consistent(self): 80 | """Regression test: check that the old implementation gives the same 81 | results as the new implementation.""" 82 | X, Y = _get_wide() 83 | c1 = cka(X, Y) 84 | c2 = old_cka(X, Y) 85 | 86 | self.assertNotEqual(c1, c2) 87 | self.assertAlmostEqual(c1, c2) 88 | 89 | def test_tall_wide(self): 90 | """Check that both implementations gives the same results""" 91 | X, Y = _get_wide() 92 | c1 = cka_tall(X, Y) 93 | c2 = cka_wide(X, Y) 94 | 95 | self.assertNotEqual(c1, c2) 96 | 
self.assertAlmostEqual(c1, c2) 97 | 98 | def test_torch(self): 99 | """Check that this also works if the input is a pytorch tensor""" 100 | # We put the import inside the function so the whole test suite doesn't 101 | # crash if we don't have pytorch installed. 102 | import torch 103 | 104 | X, Y = _get_wide() 105 | X, Y = torch.tensor(X), torch.tensor(Y) 106 | c1 = cka_tall(X, Y) 107 | c2 = cka_wide(X, Y) 108 | 109 | self.assertNotEqual(c1.item(), c2.item()) 110 | self.assertAlmostEqual(c1.item(), c2.item()) 111 | 112 | if __name__ == '__main__': 113 | unittest.main() -------------------------------------------------------------------------------- /research_code/tests/test_fib.py: -------------------------------------------------------------------------------- 1 | import time 2 | import unittest 3 | from research_code import fib 4 | 5 | class TestFib(unittest.TestCase): 6 | def test_fib(self): 7 | # 1, 1, 2, 3, 5, etc. 8 | self.assertEqual(fib.fib(0), 1) 9 | self.assertEqual(fib.fib(2), 2) 10 | self.assertEqual(fib.fib(4), 5) 11 | 12 | def test_fib_big(self): 13 | self.assertEqual(fib.fib(99), 354_224_848_179_261_915_075) 14 | 15 | def test_memoization(self): 16 | """Check that the memo-ized version is much faster than the naive.""" 17 | def _fib(n): 18 | if n >= 2: 19 | return _fib(n-2) + _fib(n-1) 20 | else: 21 | return 1 22 | 23 | t0 = time.time() 24 | val = fib.fib(15) 25 | dt = time.time() - t0 26 | 27 | t0 = time.time() 28 | val2 = _fib(15) 29 | dt2 = time.time() - t0 30 | 31 | self.assertEqual(val, val2) 32 | self.assertGreater(dt2, dt * 10) 33 | 34 | if __name__ == "__main__": 35 | unittest.main() -------------------------------------------------------------------------------- /results/closeness.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/results/closeness.png -------------------------------------------------------------------------------- /results/closeness_sns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/research_code/c351d02e9d6a02c8b2157db3d06908f8a797ed56/results/closeness_sns.png -------------------------------------------------------------------------------- /scripts/Draw dependency graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "source": [ 5 | "Note: graphviz is in requirements.txt, but to get the binaries you'll probably want to install via conda." 
6 | ], 7 | "cell_type": "markdown", 8 | "metadata": {} 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "output_type": "stream", 17 | "name": "stdout", 18 | "text": [ 19 | "digraph {\n\tO [label=\"Open code\"]\n\tV [label=\"Version control\"]\n\tC [label=\"Command line\"]\n\tD [label=\"Open data\"]\n\tR [label=\"Reproducible research\"]\n\tE [label=Environments]\n\tL [label=\"Readable code\"]\n\tT [label=Testing]\n\tI [label=CI]\n\tW [label=\"Code review\"]\n\tS [label=\"Cloud storage\"]\n\tU [label=\"Cloud computing\"]\n\tM [label=Documentation]\n\tP [label=Packaging]\n\tV -> O [label=\"\"]\n\tC -> V [label=\"\"]\n\tO -> D [label=\"\"]\n\tO -> R [label=\"\"]\n\tD -> R [label=\"\"]\n\tE -> R [label=\"\"]\n\tC -> E [label=\"\"]\n\tO -> E [label=\"\"]\n\tO -> W [label=\"\"]\n\tO -> L [label=\"\"]\n\tC -> T [label=\"\"]\n\tT -> I [label=\"\"]\n\tO -> I [label=\"\"]\n\tE -> I [label=\"\"]\n\tW -> L [label=\"\"]\n\tE -> U [label=\"\"]\n\tT -> U [label=\"\"]\n\tV -> U [label=\"\"]\n\tS -> U [label=\"\"]\n\tS -> D [label=\"\"]\n\tM -> P [label=\"\"]\n\tT -> P [label=\"\"]\n\tW -> P [label=\"\"]\n\tO -> P [label=\"\"]\n\tI -> P [label=\"\"]\n\tT -> R [label=\"\"]\n\tL -> M [label=\"\"]\n}\n" 20 | ] 21 | }, 22 | { 23 | "output_type": "execute_result", 24 | "data": { 25 | "text/plain": [ 26 | "'../docs/figures/reproducible_research.pdf'" 27 | ] 28 | }, 29 | "metadata": {}, 30 | "execution_count": 7 31 | } 32 | ], 33 | "source": [ 34 | "from graphviz import Digraph\n", 35 | "\n", 36 | "dot = Digraph(format='png')\n", 37 | "dot.node('O', 'Open code')\n", 38 | "dot.node('V', 'Version control')\n", 39 | "dot.node('C', 'Command line')\n", 40 | "dot.node('D', 'Open data')\n", 41 | "dot.node('R', 'Reproducible research')\n", 42 | "dot.node('E', 'Environments')\n", 43 | "dot.node('L', 'Readable code')\n", 44 | "dot.node('T', 'Testing')\n", 45 | "dot.node('I', 'CI')\n", 46 | "dot.node('W', 'Code review')\n", 47 | "dot.node('S', 'Cloud storage')\n", 48 | "dot.node('U', 'Cloud computing')\n", 49 | "dot.node('M', 'Documentation')\n", 50 | "dot.node('P', 'Packaging')\n", 51 | "\n", 52 | "dot.edge('V', 'O', '')\n", 53 | "dot.edge('C', 'V', '')\n", 54 | "dot.edge('O', 'D', '')\n", 55 | "dot.edge('O', 'R', '')\n", 56 | "dot.edge('D', 'R', '')\n", 57 | "dot.edge('E', 'R', '')\n", 58 | "dot.edge('C', 'E', '')\n", 59 | "dot.edge('O', 'E', '')\n", 60 | "dot.edge('O', 'W', '')\n", 61 | "dot.edge('O', 'L', '')\n", 62 | "dot.edge('C', 'T', '')\n", 63 | "dot.edge('T', 'I', '')\n", 64 | "dot.edge('O', 'I', '')\n", 65 | "dot.edge('E', 'I', '')\n", 66 | "dot.edge('W', 'L', '')\n", 67 | "dot.edge('E', 'U', '')\n", 68 | "dot.edge('T', 'U', '')\n", 69 | "dot.edge('V', 'U', '')\n", 70 | "dot.edge('S', 'U', '')\n", 71 | "dot.edge('S', 'D', '')\n", 72 | "dot.edge('M', 'P', '')\n", 73 | "dot.edge('T', 'P', '')\n", 74 | "dot.edge('W', 'P', '')\n", 75 | "dot.edge('O', 'P', '')\n", 76 | "dot.edge('I', 'P', '')\n", 77 | "dot.edge('T', 'R', '')\n", 78 | "dot.edge('L', 'M', '')\n", 79 | "\n", 80 | "print(dot.source)\n", 81 | "dot.render('../docs/figures/reproducible_research', view=False)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "output_type": "stream", 91 | "name": "stdout", 92 | "text": [ 93 | "digraph {\n\tD [label=\"Create data\"]\n\tT [label=\"Transform data\"]\n\tF [label=\"Fit models\"]\n\tH [label=\"Test Hypotheses\"]\n\tP [label=\"Generate plots\"]\n\tW [label=\"Write and 
publish paper\"]\n\tD -> T [label=\"\"]\n\tT -> F [label=\"\"]\n\tF -> H [label=\"\"]\n\tH -> D [label=\"\"]\n\tH -> P [label=\"\"]\n\tP -> W [label=\"\"]\n}\n" 94 | ] 95 | }, 96 | { 97 | "output_type": "execute_result", 98 | "data": { 99 | "text/plain": [ 100 | "'../docs/figures/lifecycle_simple.pdf'" 101 | ] 102 | }, 103 | "metadata": {}, 104 | "execution_count": 8 105 | } 106 | ], 107 | "source": [ 108 | "from graphviz import Digraph\n", 109 | "\n", 110 | "dot = Digraph(format='png')\n", 111 | "dot.node('D', 'Create data')\n", 112 | "dot.node('T', 'Transform data')\n", 113 | "dot.node('F', 'Fit models')\n", 114 | "dot.node('H', 'Test Hypotheses')\n", 115 | "dot.node('P', 'Generate plots')\n", 116 | "dot.node('W', 'Write and publish paper')\n", 117 | "\n", 118 | "dot.edge('D', 'T', '')\n", 119 | "dot.edge('T', 'F', '')\n", 120 | "dot.edge('F', 'H', '')\n", 121 | "dot.edge('H', 'D', '')\n", 122 | "dot.edge('H', 'P', '')\n", 123 | "dot.edge('P', 'W', '')\n", 124 | "\n", 125 | "print(dot.source)\n", 126 | "dot.render('../docs/figures/lifecycle_simple', view=False)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "output_type": "stream", 136 | "name": "stdout", 137 | "text": [ 138 | "digraph {\n\tD [label=\"Create data\"]\n\tT [label=\"Transform data\"]\n\tF [label=\"Fit models\"]\n\tH [label=\"Test Hypotheses\"]\n\tP [label=\"Generate plots\"]\n\tW [label=\"Write and publish paper\"]\n\tB [label=\"Publish data\"]\n\tC [label=\"Publish code\"]\n\tD -> T [label=\"\"]\n\tT -> F [label=\"\"]\n\tF -> H [label=\"\"]\n\tH -> D [label=\"\"]\n\tH -> P [label=\"\"]\n\tH -> T [label=\"\"]\n\tH -> F [label=\"\"]\n\tP -> W [label=\"\"]\n\tD -> P [label=\"\"]\n\tP -> T [label=\"\"]\n\tP -> F [label=\"\"]\n\tD -> B [label=\"\"]\n\tW -> B [label=\"\"]\n\tW -> C [label=\"\"]\n\tF -> C [label=\"\"]\n}\n" 139 | ] 140 | }, 141 | { 142 | "output_type": "execute_result", 143 | "data": { 144 | "text/plain": [ 145 | "'../docs/figures/lifecycle_complex.pdf'" 146 | ] 147 | }, 148 | "metadata": {}, 149 | "execution_count": 9 150 | } 151 | ], 152 | "source": [ 153 | "from graphviz import Digraph\n", 154 | "\n", 155 | "dot = Digraph(format='png')\n", 156 | "dot.node('D', 'Create data')\n", 157 | "dot.node('T', 'Transform data')\n", 158 | "dot.node('F', 'Fit models')\n", 159 | "dot.node('H', 'Test Hypotheses')\n", 160 | "dot.node('P', 'Generate plots')\n", 161 | "dot.node('W', 'Write and publish paper')\n", 162 | "dot.node('B', 'Publish data')\n", 163 | "dot.node('C', 'Publish code')\n", 164 | "\n", 165 | "dot.edge('D', 'T', '')\n", 166 | "dot.edge('T', 'F', '')\n", 167 | "dot.edge('F', 'H', '')\n", 168 | "dot.edge('H', 'D', '')\n", 169 | "dot.edge('H', 'P', '')\n", 170 | "dot.edge('H', 'T', '')\n", 171 | "dot.edge('H', 'F', '')\n", 172 | "dot.edge('P', 'W', '')\n", 173 | "dot.edge('D', 'P', '')\n", 174 | "dot.edge('P', 'T', '')\n", 175 | "dot.edge('P', 'F', '')\n", 176 | "dot.edge('D', 'B', '')\n", 177 | "dot.edge('W', 'B', '')\n", 178 | "dot.edge('W', 'C', '')\n", 179 | "dot.edge('F', 'C', '')\n", 180 | "\n", 181 | "print(dot.source)\n", 182 | "dot.render('../docs/figures/lifecycle_complex', view=False)" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "name": "python3", 189 | "display_name": "Python 3.8.5 64-bit ('gpg': conda)", 190 | "metadata": { 191 | "interpreter": { 192 | "hash": "ce31f03000ec776b9ef690d2ab56011f6e9daa1694a64fadd5368c8c54192b7d" 193 | } 194 | } 195 | }, 196 | "language_info": { 197 | 
"codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython3", 206 | "version": "3.8.5-final" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 4 211 | } -------------------------------------------------------------------------------- /scripts/Generate CKA matrices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This calculates the final layer representations of 3 pretrained models (and pixels) on 100 images on imagenet." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 33, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Downloading http://ufldl.stanford.edu/housenumbers/train_32x32.mat to ../data/svhn/train_32x32.mat\n" 20 | ] 21 | }, 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "27.1%" 27 | ] 28 | }, 29 | { 30 | "ename": "KeyboardInterrupt", 31 | "evalue": "", 32 | "output_type": "error", 33 | "traceback": [ 34 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 35 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 36 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtorchvision\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdataset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorchvision\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSVHN\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../data/svhn'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 37 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/site-packages/torchvision/datasets/svhn.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, root, split, transform, target_transform, download)\u001b[0m\n\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 59\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 60\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_integrity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 38 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/site-packages/torchvision/datasets/svhn.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
117\u001b[0m \u001b[0mmd5\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit_list\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 118\u001b[0;31m \u001b[0mdownload_url\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmd5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 119\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextra_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 39 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/site-packages/torchvision/datasets/utils.py\u001b[0m in \u001b[0;36mdownload_url\u001b[0;34m(url, root, filename, md5)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Downloading '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0murl\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' to '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m urllib.request.urlretrieve(\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfpath\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0mreporthook\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgen_bar_updater\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 40 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/urllib/request.py\u001b[0m in \u001b[0;36murlretrieve\u001b[0;34m(url, filename, reporthook, data)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 41 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0;31m# Amount is given, implement using readinto\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0mb\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mbytearray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mamt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 458\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 459\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmemoryview\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtobytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 42 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 500\u001b[0m \u001b[0;31m# connection, and the user is reading more bytes than will be provided\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0;31m# (for example, reading in 1k chunks)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 502\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadinto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 503\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;31m# Ideally, we would raise IncompleteRead if the content-length\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 43 | "\u001b[0;32m~/anaconda3/envs/gpg/lib/python3.8/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 44 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "import torch\n", 50 | "import torchvision\n", 51 | "\n", 52 | "dataset = torchvision.datasets.SVHN('../data/svhn', download=True)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 29, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import 
torchvision.models as models\n", 62 | "from torchvision import transforms\nimport numpy as np\n", 63 | "\n", 64 | "resnet18 = models.resnet18(pretrained=True)\n", 65 | "resnet34 = models.resnet34(pretrained=True)\n", 66 | "alexnet = models.alexnet(pretrained=True)\n", 67 | "\n", 68 | "models = [resnet18, resnet34, alexnet]\n", 69 | "\n", 70 | "reps = [[], [], [], []]\n", 71 | "n = 0\n", 72 | "\n", 73 | "transform = transforms.Compose([\n", 74 | " transforms.ToTensor(),\n", 75 | " transforms.Normalize(mean=[0.485, 0.456, 0.406],\n", 76 | " std=[0.229, 0.224, 0.225]),\n", 77 | " transforms.Resize([224, 224]),\n", 78 | "])\n", 79 | "\n", 80 | "for n, (img, _) in enumerate(dataset): # the SVHN images loaded above\n", 81 | " \n", 82 | " if n % 10 != 0:\n", 83 | " continue\n", 84 | " \n", 85 | " with torch.no_grad():\n", 86 | " im = transform(img).unsqueeze(0)\n", 87 | " for j, model in enumerate(models):\n", 88 | " reps[j].append(model(im).cpu().detach().numpy())\n", 89 | " \n", 90 | " reps[3].append(np.array(img).reshape((1, -1)))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 30, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "import numpy as np\n", 100 | "\n", 101 | "for i, rep in enumerate(reps):\n", 102 | " rep = np.concatenate(rep, axis=0)\n", 103 | " reps[i] = rep" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 31, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "import pickle\n", 113 | "\n", 114 | "with open('../data/matrices.pkl', 'wb') as f:\n", 115 | " pickle.dump({'models': [\n", 116 | " 'resnet18', 'resnet34', 'alexnet', 'pixel'\n", 117 | " ], 'reps': reps}, f)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 37, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "X = np.random.randn(1000, 3)\n", 127 | "X2 = np.concatenate([X, np.random.randn(1000, 10)], axis=1)\n", 128 | "X3 = X[:, :2]\n", 129 | "X4 = X[:, :1]\n", 130 | "X5 = 3.0 * X\n", 131 | "\n", 132 | "import pickle\n", 133 | "\n", 134 | "with open('../data/matrices.pkl', 'wb') as f:\n", 135 | " pickle.dump({'models': [\n", 136 | " 'baseline', 'rescaled', 'nuisance', 'truncated1', 'truncated2'\n", 137 | " ], 'reps': [X, X5, X2, X3, X4]}, f)" 138 | ] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.8.5" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 4 162 | } 163 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="research_code-patrickmineault", 8 | version="0.0.1", 9 | author="Patrick Mineault", 10 | author_email="patrick@gmail.com", 11 | description="A small example package", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/patrickmineault/research_code", 15 | packages=setuptools.find_packages(), 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 |
"Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.6', 22 | ) --------------------------------------------------------------------------------