├── .netlify └── state.json ├── assets └── images │ ├── wsl.png │ ├── pic.jpeg │ ├── vscode.png │ ├── final.doc.gif │ ├── github-repo.png │ ├── google-day2.jpg │ ├── wm-federenko.png │ ├── spaghetti-code.png │ ├── python_environment_2x.png │ └── pcbi.1007358.g002.PNG_L.png ├── docs ├── _static │ ├── book.pdf │ ├── et-book │ │ ├── et-book-bold-line-figures │ │ │ ├── et-book-bold-line-figures.eot │ │ │ ├── et-book-bold-line-figures.ttf │ │ │ └── et-book-bold-line-figures.woff │ │ ├── et-book-roman-line-figures │ │ │ ├── et-book-roman-line-figures.eot │ │ │ ├── et-book-roman-line-figures.ttf │ │ │ └── et-book-roman-line-figures.woff │ │ ├── et-book-roman-old-style-figures │ │ │ ├── et-book-roman-old-style-figures.eot │ │ │ ├── et-book-roman-old-style-figures.ttf │ │ │ └── et-book-roman-old-style-figures.woff │ │ ├── et-book-semi-bold-old-style-figures │ │ │ ├── et-book-semi-bold-old-style-figures.eot │ │ │ ├── et-book-semi-bold-old-style-figures.ttf │ │ │ └── et-book-semi-bold-old-style-figures.woff │ │ └── et-book-display-italic-old-style-figures │ │ │ ├── et-book-display-italic-old-style-figures.eot │ │ │ ├── et-book-display-italic-old-style-figures.ttf │ │ │ └── et-book-display-italic-old-style-figures.woff │ └── tufte.css ├── figures │ ├── nma.png │ ├── pic.jpeg │ ├── wsl.png │ ├── readme.PNG │ ├── reversi.PNG │ ├── tweet.png │ ├── unicorn.png │ ├── vscode.png │ ├── wandb.png │ ├── wizard.png │ ├── argparse.PNG │ ├── dashboard.PNG │ ├── final.doc.gif │ ├── final.doc.png │ ├── oneoverf.png │ ├── shablona.png │ ├── wave_clus.png │ ├── cka_example.png │ ├── git-vscode.png │ ├── github-repo.png │ ├── google-day2.jpg │ ├── mary-kondo.jpg │ ├── wm-federenko.png │ ├── zipf-diagram.png │ ├── mineault_et_al.png │ ├── spaghetti-code.png │ ├── testing-trophy.png │ ├── lifecycle_complex.pdf │ ├── lifecycle_complex.png │ ├── lifecycle_simple.pdf │ ├── lifecycle_simple.png │ ├── regression-lines.PNG │ ├── invariance_to_ortho.PNG │ ├── zipf-diagram-coded.png │ ├── 
Design_by_contract.svg.png │ ├── python_environment_2x.png │ ├── reproducible_research.pdf │ ├── reproducible_research.png │ ├── pcbi.1007358.g002.PNG_L.png │ ├── lifecycle_simple │ ├── lifecycle_complex │ ├── reproducible_research │ ├── twitter.svg │ └── pure-impure.svg ├── images │ └── favicon.ico ├── _toc.yml ├── acknowledgements.md ├── _config.yml ├── front-print.md ├── roadmap.md ├── tools.md ├── brains.md ├── index.md ├── social.md ├── tidy.md ├── docs.md ├── cka.md ├── pipelines.md ├── zipf.md ├── testing.md └── setup.md ├── .gitignore ├── netlify.toml ├── requirements.txt ├── .pre-commit-config.yaml ├── environment.yml ├── curvenote.yml ├── strip_js.py ├── TODO.md ├── Makefile ├── README.md ├── assemble_book.py └── LICENSE /.netlify/state.json: -------------------------------------------------------------------------------- 1 | { 2 | "siteId": "f0bcb2b1-3782-4a4b-8611-94f5412b4f76" 3 | } 4 | -------------------------------------------------------------------------------- /assets/images/wsl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/wsl.png -------------------------------------------------------------------------------- /docs/_static/book.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/book.pdf -------------------------------------------------------------------------------- /docs/figures/nma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/nma.png -------------------------------------------------------------------------------- /docs/figures/pic.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/pic.jpeg 
-------------------------------------------------------------------------------- /docs/figures/wsl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/wsl.png -------------------------------------------------------------------------------- /assets/images/pic.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/pic.jpeg -------------------------------------------------------------------------------- /assets/images/vscode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/vscode.png -------------------------------------------------------------------------------- /docs/figures/readme.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/readme.PNG -------------------------------------------------------------------------------- /docs/figures/reversi.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/reversi.PNG -------------------------------------------------------------------------------- /docs/figures/tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/tweet.png -------------------------------------------------------------------------------- /docs/figures/unicorn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/unicorn.png 
-------------------------------------------------------------------------------- /docs/figures/vscode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/vscode.png -------------------------------------------------------------------------------- /docs/figures/wandb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/wandb.png -------------------------------------------------------------------------------- /docs/figures/wizard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/wizard.png -------------------------------------------------------------------------------- /docs/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/images/favicon.ico -------------------------------------------------------------------------------- /docs/figures/argparse.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/argparse.PNG -------------------------------------------------------------------------------- /docs/figures/dashboard.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/dashboard.PNG -------------------------------------------------------------------------------- /docs/figures/final.doc.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/final.doc.gif 
-------------------------------------------------------------------------------- /docs/figures/final.doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/final.doc.png -------------------------------------------------------------------------------- /docs/figures/oneoverf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/oneoverf.png -------------------------------------------------------------------------------- /docs/figures/shablona.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/shablona.png -------------------------------------------------------------------------------- /docs/figures/wave_clus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/wave_clus.png -------------------------------------------------------------------------------- /assets/images/final.doc.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/final.doc.gif -------------------------------------------------------------------------------- /assets/images/github-repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/github-repo.png -------------------------------------------------------------------------------- /assets/images/google-day2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/google-day2.jpg 
-------------------------------------------------------------------------------- /docs/figures/cka_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/cka_example.png -------------------------------------------------------------------------------- /docs/figures/git-vscode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/git-vscode.png -------------------------------------------------------------------------------- /docs/figures/github-repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/github-repo.png -------------------------------------------------------------------------------- /docs/figures/google-day2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/google-day2.jpg -------------------------------------------------------------------------------- /docs/figures/mary-kondo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/mary-kondo.jpg -------------------------------------------------------------------------------- /docs/figures/wm-federenko.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/wm-federenko.png -------------------------------------------------------------------------------- /docs/figures/zipf-diagram.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/zipf-diagram.png -------------------------------------------------------------------------------- /assets/images/wm-federenko.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/wm-federenko.png -------------------------------------------------------------------------------- /docs/figures/mineault_et_al.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/mineault_et_al.png -------------------------------------------------------------------------------- /docs/figures/spaghetti-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/spaghetti-code.png -------------------------------------------------------------------------------- /docs/figures/testing-trophy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/testing-trophy.png -------------------------------------------------------------------------------- /assets/images/spaghetti-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/spaghetti-code.png -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/lifecycle_complex.pdf -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/lifecycle_complex.png -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/lifecycle_simple.pdf -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/lifecycle_simple.png -------------------------------------------------------------------------------- /docs/figures/regression-lines.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/regression-lines.PNG -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *.Zone.Identifier 3 | # Local Netlify folder 4 | .netlify 5 | .DS_Store 6 | tmp 7 | exports 8 | conf.py 9 | -------------------------------------------------------------------------------- /docs/figures/invariance_to_ortho.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/invariance_to_ortho.PNG -------------------------------------------------------------------------------- /docs/figures/zipf-diagram-coded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/zipf-diagram-coded.png 
-------------------------------------------------------------------------------- /assets/images/python_environment_2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/python_environment_2x.png -------------------------------------------------------------------------------- /docs/figures/Design_by_contract.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/Design_by_contract.svg.png -------------------------------------------------------------------------------- /docs/figures/python_environment_2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/python_environment_2x.png -------------------------------------------------------------------------------- /docs/figures/reproducible_research.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/reproducible_research.pdf -------------------------------------------------------------------------------- /docs/figures/reproducible_research.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/reproducible_research.png -------------------------------------------------------------------------------- /assets/images/pcbi.1007358.g002.PNG_L.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/assets/images/pcbi.1007358.g002.PNG_L.png -------------------------------------------------------------------------------- /docs/figures/pcbi.1007358.g002.PNG_L.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/figures/pcbi.1007358.g002.PNG_L.png -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | [dev] 2 | framework = "#static" 3 | targetPort = 3998 4 | port = 9000 # The port that the netlify dev will be accessible on 5 | publish = "_build/html" # The path to your static content folder 6 | -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2==2.11.3 2 | MarkupSafe==2.0.1 3 | git+https://github.com/patrickmineault/sphinx-book-theme.git 4 | jupyter-book==0.11.3 5 | tqdm 6 | sphinxext-opengraph 7 | docutils==0.16 8 | sphinx-autobuild 9 | pre-commit 10 | -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.eot -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.ttf -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.woff -------------------------------------------------------------------------------- 
/docs/_static/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf -------------------------------------------------------------------------------- /docs/_static/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrickmineault/codebook/HEAD/docs/_static/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/psf/black 9 | rev: 22.10.0 10 | hooks: 11 | - id: black 12 | -------------------------------------------------------------------------------- /docs/figures/lifecycle_simple: -------------------------------------------------------------------------------- 1 | digraph { 2 | D [label="Create data"] 3 | T [label="Transform data"] 4 | F [label="Fit models"] 5 
| H [label="Test Hypotheses"] 6 | P [label="Generate plots"] 7 | W [label="Write and publish paper"] 8 | D -> T [label=""] 9 | T -> F [label=""] 10 | F -> H [label=""] 11 | H -> D [label=""] 12 | H -> P [label=""] 13 | P -> W [label=""] 14 | } 15 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: cb 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.8 7 | - pip 8 | - pip: 9 | - Jinja2==2.11.3 10 | - MarkupSafe==2.0.1 11 | - git+https://github.com/patrickmineault/sphinx-book-theme.git 12 | - jupyter-book==0.11.3 13 | - tqdm 14 | - sphinxext-opengraph 15 | - docutils==0.16 16 | - sphinx-autobuild 17 | -------------------------------------------------------------------------------- /docs/_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-book 2 | root: index 3 | parts: 4 | - caption: Intro 5 | chapters: 6 | - file: roadmap 7 | - file: brains 8 | - caption: Lessons 9 | chapters: 10 | - file: setup 11 | - file: tidy 12 | - file: decoupled 13 | - file: testing 14 | - file: docs 15 | - file: pipelines 16 | - file: social 17 | - file: zipf 18 | - caption: Extras 19 | chapters: 20 | - file: cka 21 | - file: tools 22 | - file: acknowledgements 23 | -------------------------------------------------------------------------------- /curvenote.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | site: 3 | title: Example project 4 | projects: 5 | - path: . 
6 | slug: example-project 7 | nav: 8 | - title: Example project 9 | url: /example-project 10 | - title: CENTER tech 11 | url: /center-tech 12 | - title: CENTER tech 13 | url: /center-tech 14 | actions: 15 | - title: Learn More 16 | url: https://curvenote.com/docs/web 17 | domains: 18 | - xcorr-example.curve.space 19 | logo: public/logo.svg 20 | logoText: Example project 21 | -------------------------------------------------------------------------------- /docs/figures/lifecycle_complex: -------------------------------------------------------------------------------- 1 | digraph { 2 | D [label="Create data"] 3 | T [label="Transform data"] 4 | F [label="Fit models"] 5 | H [label="Test Hypotheses"] 6 | P [label="Generate plots"] 7 | W [label="Write and publish paper"] 8 | B [label="Publish data"] 9 | C [label="Publish code"] 10 | D -> T [label=""] 11 | T -> F [label=""] 12 | F -> H [label=""] 13 | H -> D [label=""] 14 | H -> P [label=""] 15 | H -> T [label=""] 16 | H -> F [label=""] 17 | P -> W [label=""] 18 | D -> P [label=""] 19 | P -> T [label=""] 20 | P -> F [label=""] 21 | D -> B [label=""] 22 | W -> B [label=""] 23 | W -> C [label=""] 24 | F -> C [label=""] 25 | } 26 | -------------------------------------------------------------------------------- /strip_js.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from tqdm import tqdm 3 | 4 | 5 | def strip_js(filename, bad_lines): 6 | good_lines = [] 7 | with open(filename, "r") as f: 8 | for line in f: 9 | is_bad = sum([bad_line in line for bad_line in bad_lines]) 10 | if not is_bad: 11 | good_lines.append(line) 12 | 13 | text = "".join(good_lines) 14 | with open(filename, "w") as f: 15 | f.write(text) 16 | 17 | 18 | if __name__ == "__main__": 19 | # Go through _build/html, and remove all extraneous javascript. 
This is all 20 | # very silly, but upgrading sphinx-book-theme's version to reject the bad js 21 | # is very annoying 22 | files = pathlib.Path("_build/html").glob("*.html") 23 | for f in tqdm(files): 24 | strip_js(f, ["thebe.js"]) 25 | -------------------------------------------------------------------------------- /docs/figures/reproducible_research: -------------------------------------------------------------------------------- 1 | digraph { 2 | O [label="Open code"] 3 | V [label="Version control"] 4 | C [label="Command line"] 5 | D [label="Open data"] 6 | R [label="Reproducible research"] 7 | E [label=Environments] 8 | L [label="Readable code"] 9 | T [label=Testing] 10 | I [label=CI] 11 | W [label="Code review"] 12 | S [label="Cloud storage"] 13 | U [label="Cloud computing"] 14 | M [label=Documentation] 15 | P [label=Packaging] 16 | V -> O [label=""] 17 | C -> V [label=""] 18 | O -> D [label=""] 19 | O -> R [label=""] 20 | D -> R [label=""] 21 | E -> R [label=""] 22 | C -> E [label=""] 23 | O -> E [label=""] 24 | O -> W [label=""] 25 | O -> L [label=""] 26 | C -> T [label=""] 27 | T -> I [label=""] 28 | O -> I [label=""] 29 | E -> I [label=""] 30 | W -> L [label=""] 31 | E -> U [label=""] 32 | T -> U [label=""] 33 | V -> U [label=""] 34 | S -> U [label=""] 35 | S -> D [label=""] 36 | M -> P [label=""] 37 | T -> P [label=""] 38 | W -> P [label=""] 39 | O -> P [label=""] 40 | I -> P [label=""] 41 | T -> R [label=""] 42 | L -> M [label=""] 43 | } 44 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | * Add a capstone project with accompanying repository for the book 2 | * Give people a place to chat about the book 3 | 4 | Read these resources: 5 | 6 | * Come back to this: https://missing.csail.mit.edu/ 7 | 8 | Code along with an idea for an analysis: 9 | https://coderefinery.github.io/modular-type-along/instructor-guide/ 10 | 11 | === 
Feedback from reviewers === 12 | 13 | Some questions you might consider: 14 | 15 | 1. Was the advice consistent with what you consider best practices? If not, how so? 16 | 1. Were some critical subjects not covered by this book? 17 | 1. Was the narrative cohesive? 18 | 1. Were some subjects treated too superficially? Were some subjects treated with too much detail? How was the pacing overall? 19 | 1. Was the level appropriate for the target audience (2nd and 3rd year grad students focused on computational subjects)? If not, how so? 20 | 1. Did you feel that illustrations were appropriate? Where do you feel more illustrations would be helpful to the narrative? 21 | 1. Did you feel like the questions were appropriate? Would more comprehension questions help students learn? 22 | -------------------------------------------------------------------------------- /docs/figures/twitter.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 9 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | book: tmp/exports/book-complete.tex 2 | cd tmp/exports;pdflatex --shell-escape book-complete.tex;cd ../.. 
3 | cp tmp/exports/book-complete.pdf docs/_static/book.pdf 4 | 5 | tmp/exports/book-complete.tex: $(wildcard docs/*.md) docs/_toc.yml assemble_book.py tmp 6 | python assemble_book.py 7 | 8 | sphinx-book: 9 | jupyter-book build docs --all --builder latex 10 | 11 | tmp: 12 | mkdir tmp 13 | 14 | develop: 15 | jupyter-book config sphinx docs > docs/conf.py 16 | sphinx-autobuild docs _build/html -b html 17 | 18 | build: $(wildcard *.md) clean 19 | jupyter-book build docs --all 20 | mv docs/_build _build 21 | python strip_js.py 22 | 23 | clean: 24 | rm -rf _build 25 | 26 | deploy: 27 | netlify deploy 28 | 29 | .PHONY: clean build deploy develop book sphinx-book 30 | 31 | help: 32 | @echo "Please use \`make ' where is one of" 33 | @echo " clean to remove all build, test, coverage and Python artifacts" 34 | @echo " build to build the book to HTML" 35 | @echo " deploy to deploy the book to netlify" 36 | @echo " develop to build the book and watch for changes" 37 | @echo " book to build the book in PDF via manual pipeline" 38 | @echo " sphinx-book to build the book to PDF via sphinx-book pipeline" 39 | @echo " help to show this message" 40 | -------------------------------------------------------------------------------- /docs/acknowledgements.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Acknowledgements" 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/acknowledgements.tex 8 | --- 9 | 10 | # Acknowledgements 11 | 12 | I wouldn't have been able to write this book without the help of many people. They have made me a better programmer. Thanks first and foremost to Ella Batty for inviting me to give the workshop that inspired this book. Thanks to the reviewers, Tyler Sloan, Elizabeth DuPre and Martin Heroux who made the talk much better. Thanks to Ivan Savov for inspiring me to write in long form and for reviewing early versions. 
Thanks to everyone who gave me feedback on the book, including Kaytee Flick, Tyler Manning, Felix Taschbach, Konrad Kording and Russ Poldrack. 13 | 14 | ## References 15 | 16 | This book was inspired by many other long-form books, papers and tutorials. Check them out: 17 | 18 | - Felienne Hermans. [The Programmer's Brain](https://www.manning.com/books/the-programmers-brain). 19 | - Irving et al. [Research Software Engineering with Python](https://merely-useful.tech/py-rse/). 20 | - The Turing Way Community. [The Turing Way: A Handbook for Reproducible Data Science](https://the-turing-way.netlify.app/welcome). 21 | - [Software Carpentry lessons](https://software-carpentry.org/lessons/). 22 | - Greg Wilson et al. [Good enough practices in scientific computing](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005510). 23 | - The CodeRefinery Community. [Materials from CodeRefinery](https://coderefinery.org/). 24 | 25 | This book was generated by [jupyterbook](https://jupyterbook.org/), which builds on [Sphinx](https://www.sphinx-doc.org). The stylesheet is an adaptation of [`tufte.css`](https://edwardtufte.github.io/tufte-css/). The unicorn logo is from [twemoji](https://github.com/twitter/twemoji), released under a CC-BY 4.0 license. The PDF version of this book was generated via [CurveNote](https://curvenote.com). 26 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # in _config.yml 2 | title: "Good research code" 3 | author: "Patrick Mineault Twitter" 4 | copyright: "2021–2022" 5 | logo: figures/unicorn.png 6 | 7 | execute: 8 | execute_notebooks: "auto" 9 | 10 | repository: 11 | url : "https://github.com/patrickmineault/repl-test" # Online location of your book 12 | path_to_book : . 
# Optional path to your book, relative to the repository root 13 | branch : main # Which branch of the repository should be used when creating links (optional) 14 | 15 | html: 16 | google_analytics_id: G-3Q6LDVNS0X 17 | home_page_in_navbar: true 18 | use_edit_page_button: true 19 | use_repository_button: true 20 | use_issues_button: true 21 | baseurl: https://goodresearch.dev/ 22 | favicon: images/favicon.ico 23 | use_fullscreen_button: false 24 | comments: 25 | hypothesis: false 26 | extra_footer: "Licensed under CC-BY 4.0 [source].
Report issue." 27 | 28 | launch_buttons: 29 | thebe : true 30 | notebook_interface: classic 31 | 32 | thebe_config: 33 | repository_url : "https://github.com/patrickmineault/repl-test" # Online location of your book 34 | path_to_book : "docs/" # Optional path to your book, relative to the repository root 35 | branch : main # Which branch of the repository should be used when creating links (optional) 36 | 37 | parse: 38 | myst_enable_extensions: 39 | - colon_fence 40 | - substitution 41 | - dollarmath 42 | - deflist 43 | - html_image 44 | 45 | sphinx: 46 | extra_extensions: 47 | - sphinxext.opengraph 48 | config: 49 | ogp_site_url: "https://goodresearch.dev/" 50 | ogp_image: "https://goodresearch.dev/_images/unicorn.png" 51 | ogp_description_length: 200 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Good Research Code Handbook source 2 | 3 | This is the source for the Good Research Code Handbook, by Patrick Mineault. The website lives at [goodresearch.dev](https://goodresearch.dev). This book uses jupyterbook to build, with a highly customized theme based off of `tufte.css`. 4 | 5 | ## Reporting issues with the website 6 | 7 | Please report any issues via the Issues tab. 8 | 9 | ## Building the book 10 | 11 | [![Netlify Status](https://api.netlify.com/api/v1/badges/f0bcb2b1-3782-4a4b-8611-94f5412b4f76/deploy-status)](https://app.netlify.com/sites/fervent-carson-5a9d17/deploys) 12 | 13 | Recreate the conda environment with: 14 | 15 | `conda env create -n cb --file environment.yml` 16 | 17 | Then `conda activate cb`. 18 | 19 | For local development, I recommend using the auto-reloading `sphinx-autobuild` package. This will not only build the book and create a local server for you to preview the book, it will also rebuild and reload the browser whenever you make a change. Neat! Use `make develop` to get that going. 
20 | 21 | To build a version ready to be deployed: 22 | 23 | 1. `jupyter-book build docs --all` for a full rebuild 24 | 2. `python strip_js.py` to remove `thebe.js` includes, which would otherwise cause a 500KB javascript file to be loaded 25 | 26 | These two can be run via `make build`. 27 | 28 | I use the `netlify deploy -d _build/html --prod` or `make deploy` to manually deploy the book to `goodresearch.dev`. This command won't work for you unless you have my netlify credentials. 29 | 30 | _Note_: when you push a PR through Github, it will build a preview of your work through Netlify automatically. When I merge the PR, it will automatically deploy the built website to [goodresearch.dev](https://goodresearch.dev). See the badge above for build status. 31 | 32 | ## Building the book to PDF 33 | 34 | The PDF of this book is built using LaTeX via CurveNote's MyST to tex conversion. As this feature is in alpha stage, there's an elaborate translation stage in `assemble_book.py`. Because the CurveNote CLI is subject to change, make sure to use the exact version of the curvenote cli, `v0.8.2`, to build this. 35 | 36 | Run `git clone https://github.com/patrickmineault/plain_latex_book_chapter.git` to fetch a plain tex template. Then place this under `../templates/plain_latex_book_chapter`. 37 | 38 | Build using `make book`. This will create a number of temporary files and the final pdf will live in `tmp/exports/book-complete.pdf`. 39 | 40 | ## Citing this book 41 | 42 | 10.5281/zenodo.5796873 43 | 44 | Patrick J Mineault & The Good Research Code Handbook Community (2021). _The Good Research Code Handbook_. Zenodo. 
[doi:10.5281/zenodo.5796873](https://dx.doi.org/10.5281/zenodo.5796873) 45 | -------------------------------------------------------------------------------- /docs/front-print.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preface to the print edition 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/front-print.tex 8 | --- 9 | # Introduction 10 | 11 | ```{epigraph} 12 | Is my code fast? No. But is it well documented? No. But does it work? Also no. 13 | 14 | ---[@KyleMorgenstein](https://twitter.com/KyleMorgenstein) 15 | ``` 16 | 17 | This handbook is for grad students, postdocs and PIs who do a lot of programming as part of their research. It will teach you, in a practical manner, how to organize your code so that it is easy to understand and works reliably. 18 | 19 | Most people who write research code are not trained in computer science or software engineering. It can feel like an uphill battle when you have to write code without the right training. Do you ever: 20 | 21 | - Feel like you don't know what you're doing 22 | - Feel like an impostor 23 | - Write code with lots of bugs 24 | - Hate your code and don't want to work on it 25 | - Have trouble finishing projects 26 | - Contemplate buying a small organic farm in upstate Vermont and read far too much about goat husbandry 27 | 28 | Did I hit a nerve? Yes?! Then you're in for a treat! This book will help you get from 0 to 1 on good software engineering practices for research projects. 29 | 30 | ## Prerequisites 31 | 32 | I've tried to write this book in a progressive manner: some content is targeted at complete novices, other to programmers who are farther along on their journey. However, I generally assume that you have some familiarity with the following: 33 | 34 | - **Python**: this intro is Python-centric. 
You can write good code for Matlab, R, or Julia, but we won't cover that here. You don't need to be a Python expert, but you'll get the most out of this if you've been using Python on a regular basis for at least a month, and if you have some passing familiarity with the python data science ecosystem (numpy, matplotlib, pandas, etc.). 35 | - **Git & Github**: a lot of the practices introduced here will require you to change your code, which could cause existing functionality to break. You might even accidentally delete something important! Mastering git and github means you will have a time machine for your code, so you can revert to an earlier state. [There's a great intro to git for beginners from software carpentries](https://swcarpentry.github.io/git-novice/). 36 | - **The command line**: You will need to run some commands on the command line to implement some of the advice in this book. I'm going to assume that you have some familiarity with running commands from a Unix-style shell (e.g. bash). [There's a great intro to the unix shell from software carpentries](http://swcarpentry.github.io/shell-novice/). If you're using Windows, you will still be able to run many tools from the Windows command prompt. Long term, your life will be easier if you [install the Windows Subsystem for Linux (WSL)](wsl) which will give you access to a Unix-style shell. 37 | 38 | Some of the examples I use are neuroscience-inspired---but neuroscience background is absolutely not a requirement. Appreciation for unicorns is a big plus. 39 | 40 | ## Why did I make this? 41 | 42 | I'm [Patrick Mineault](https://xcorr.net). I did my PhD in computational neuroscience at McGill University, in Montreal. I wrote a lot of not very good code, mostly in Matlab. One time, my code was in a non-working state for an entire month---I would furiously type on the keyboard all day in the hopes it would eventually work, and it didn't. It made me sad. But I managed to graduate. Then I did a postdoc. 
More of the same. 43 | 44 | Eventually, I decided to properly study CS. I studied data structures, algorithms and software engineering practices, and I got a big-boy job as a software engineer at Google in California. It was then that I learned the error of my ways. I had lost time during my research days because I didn't know how to organize and write code that didn't self-destruct out of spite. But this is fixable! With knowledge! 45 | 46 | At the invitation of Ella Batty, I gave a workshop for the students in neuroscience at Harvard in January of 2021 on writing good research code. The feedback was overwhelmingly positive, so I decided to expand it into this handbook. I hope you enjoy it! 47 | 48 | ```{figure} figures/pic.jpeg 49 | It me! 50 | ``` 51 | 52 | ## About the print edition 53 | 54 | If you're reading this, congratulations! You're reading the bespoke, artisanal, small-batch print edition. A few folks have asked me for a print edition, and I was happy to oblige. It's an experiment that I hope to expand in the coming years. It was made possible by last-minute feature additions in CurveNote, the wonderful new project that aims to make MyST Markdown the standard for publishing scientific articles and books. 55 | -------------------------------------------------------------------------------- /docs/roadmap.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: How to read this book 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/roadmap.tex 8 | --- 9 | 10 | # Roadmap 11 | 12 | This handbook covers practices and tools from big to small. We'll discuss how to write and organize modular code, how to write docs, how to make your code robust. We'll finally reveal the correct number of spaces to use to indent code [^four]. At the end of the main lessons, we'll go through an example project step-by-step. 
13 | 14 | This roadmap shows some of the concepts we'll cover in the book. Take stock of where you are now. Come back to this figure after you've read parts of this book: you'll be surprised how much you can learn in a few hours! 15 | 16 | [^four]: [Four. Four is the correct number](https://www.python.org/dev/peps/pep-0008/). 17 | 18 | ```{figure} figures/concepts.svg 19 | --- 20 | figclass: boxed 21 | width: 100% 22 | --- 23 | Some concepts we'll cover in this book. I've highlighted in green and blue different concepts which are relevant to short-term and to long-term memory, respectively: we'll discuss what that means in the next section. How many concepts are you familiar with now? How many do you know well? 24 | ``` 25 | 26 | ## Conventions 27 | 28 | This book uses a number of conventions throughout. The command line is indicated by a `$` prompt: 29 | 30 | ``` 31 | $ ls -al 32 | ``` 33 | 34 | The Python command prompt is indicated by `>>>`: 35 | 36 | ``` 37 | $ python 38 | >>> import antigravity 39 | ``` 40 | 41 | Sometimes I will ask you a question, and will hide the answer behind a spoilers panel. For example, what is the answer to life, the universe, and everything? 42 | 43 | ```{dropdown} Spoilers 44 | 42 45 | ``` 46 | 47 | At the end of each main lesson, I ask you to put one of the lessons into practice through a 5-minute exercise. It looks like this: 48 | 49 | ```{admonition} 5-minute exercise 50 | Brush your teeth. 
51 | ``` 52 | 53 | ## Code 54 | 55 | This handbook refers to code in several repositories: 56 | 57 | - [True neutral cookie cutter](https://github.com/patrickmineault/true-neutral-cookiecutter): `github.com/patrickmineault/true-neutral-cookiecutter` 58 | - [CKA example](https://github.com/patrickmineault/codebook_examples/tree/main/cka): `github.com/patrickmineault/codebook_examples` 59 | - [Zipf's law example project](https://github.com/patrickmineault/zipf/): `github.com/patrickmineault/zipf` 60 | - [Source for the text of the book](https://github.com/patrickmineault/codebook): `github.com/patrickmineault/codebook` 61 | - [Tweaked sphinx book theme](https://github.com/patrickmineault/sphinx-book-theme/): `patrickmineault/sphinx-book-theme` 62 | 63 | You can use these as references when you're working on your own projects. 64 | 65 | ## Breaking the cycle of frustration 66 | 67 | Learning to code is a lifelong journey. The upper ceiling on programming skills is very high. Much like cooking, coding can be done on a purely utilitarian basis, or it can be elevated to high art. [Professional programmers with decades of experience go on months-long retreats to acquire new skills](https://www.recurse.com/not-a-bootcamp). _A good frame_ for getting better at coding is to think of it as a _craft_. Reading this book is a great way to refine your craft through focused practice [^calnewport]. 68 | 69 | [^calnewport]: CS professor Cal Newport explores these themes in his book [So Good They Can't Ignore You](https://www.calnewport.com/books/so-good/). 70 | 71 | When I talk to students about writing code, a lot of them describe feelings like: 72 | 73 | - guilt 74 | - shame 75 | - cringe 76 | - frustration 77 | 78 | ```{margin} 79 | Julia Evans [has a lovely zine](https://wizardzines.com/comics/attitude-matters/) on dealing with frustrating bugs. 
80 | ``` 81 | 82 | Much of that frustration comes from a mismatch between what you want to accomplish (a lot) and what you have the ability to accomplish (not as much as you'd like). Programming instruction often emphasizes _exploration_, embracing errors as learning opportunities, and encouraging you to let your imagination run free. This theory of change implies that you will learn a lot by simply programming a lot every day. You might then feel guilty and inadequate when you're not as proficient as you want after several years of daily programming. 83 | 84 | In fact, unstructured exploration is a very inefficient way to learn a complex skill like programming. It's like expecting a student to rediscover calculus by themselves after teaching them the rudiments of algebra! Research shows that _structured instruction_---like the one in this book---is far more effective at teaching programming skills [^felienne]. You're taking the right step by reading this book! 85 | 86 | [^felienne]: Felienne Hermans, [The Programmer's Brain](https://www.manning.com/books/the-programmers-brain) (2021). 87 | 88 | ## Our social contract 89 | 90 | You might have experience talking about computer stuff with somebody more experienced, and left feeling discouraged. Perhaps they were dismissive, or snooty, or just kind of a jerk. It's an unfortunate tendency in our profession, and I want to break that cycle, because I think that this lack of _psychological safety_ can make it hard to become proficient at coding. 91 | 92 | ```{important} 93 | This is a safe space. You're in a learning environment. There will be no tests. The advice I give here is non-binding and non-sanctimonious. 94 | ``` 95 | 96 | It's ok to write bad code when you're learning or you're in a hurry. You have deadlines! Remember to come back to the bad code and tidy it up after. Don't let the perfect be the enemy of the good. 
And remember, once you are empowered with this new knowledge, to be nice to beginners who are going through the same journey that you have. 97 | -------------------------------------------------------------------------------- /docs/tools.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Use good tools 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/tools.tex 8 | --- 9 | 10 | 11 | # Use good tools 12 | 13 | ## Use the tools introduced in each section 14 | 15 | For your convenience, here is a list of tools and packages discussed in each section of the book. 16 | 17 | ### Set up your project 18 | 19 | [git](https://git-scm.com/) 20 | : a command line tool for code versioning 21 | 22 | [github](https://github.com/) 23 | : a website where you can share code 24 | 25 | [conda](https://docs.conda.io/en/latest/) 26 | : a command line package manager and virtual environment manager 27 | 28 | [setuptools](https://setuptools.readthedocs.io/en/latest/userguide/quickstart.html) 29 | : a Python library to define pip installable packages 30 | 31 | [cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.3/) 32 | : a command line tool to create projects from templates 33 | 34 | ### Keep things tidy 35 | 36 | [flake8](https://flake8.pycqa.org/en/latest/) and [pylint](https://pylint.org/) 37 | : command line linters for Python 38 | 39 | [black](https://github.com/psf/black) 40 | : a command line auto-formatter for Python code, with plugins for popular IDEs 41 | 42 | [vulture](https://github.com/jendrikseipp/vulture) 43 | : a Python package to identify dead code in your codebase 44 | 45 | [jupytext](https://github.com/mwouts/jupytext) 46 | : a command line tool to seamlessly translate between regular jupyter notebooks and a markdown-based representation 47 | 48 | ### Write decoupled code 49 | 50 | [pandas](https://pandas.pydata.org/) 51 | : a Python library to 
represent columnar data 52 | 53 | [xarray](http://xarray.pydata.org/en/stable/) 54 | : a Python library to represent multidimensional tensors with named dimensions 55 | 56 | [collections](https://docs.python.org/3/library/collections.html) 57 | : a Python standard library of containers, including defaultdict, counter, etc. 58 | 59 | ### Test your code 60 | 61 | [pytest](https://docs.pytest.org/en/6.2.x/) 62 | : a Python library for unit testing, along with a command line utilities 63 | 64 | ### Write good docs 65 | 66 | [argparse](https://docs.python.org/3/library/argparse.html) 67 | : a Python library to parse command-line arguments, part of the Python standard library 68 | 69 | [shellcheck](https://github.com/koalaman/shellcheck) 70 | : a command-line tool that checks for common errors in bash scripts, with plugins for popular IDEs 71 | 72 | [make](https://www.gnu.org/software/make/) 73 | : a command-line tool to define and run directed acyclic graphs of computation 74 | 75 | [sphinx](https://docs.readthedocs.io/en/stable/intro/getting-started-with-sphinx.html) 76 | : a command-line tool to generate documentation from Python code and text files 77 | 78 | [readthedocs](https://readthedocs.org/) 79 | : a website to host documentation 80 | 81 | ### Make it social 82 | 83 | [vscode live share](https://marketplace.visualstudio.com/items?itemName=MS-vsliveshare.vsliveshare-pack) 84 | : IDE extension for code sharing 85 | 86 | [deepnote](https://deepnote.com/) and [cocalc](https://cocalc.com/) 87 | : collaborative jupyter notebooks in the browser 88 | 89 | [replit](https://replit.com) 90 | : collaborative IDE in the browser 91 | 92 | (vscode)= 93 | ## Choose an IDE 94 | 95 | Integrated development environments (IDE) can help you develop faster and make it easy to implement some of the productivity tips I've discussed previously. Preferred IDEs change from year to year, as new editors become favored while others are shunned. 
Don't be surprised if in three years you'll be using a different IDE. 96 | 97 | ```{figure} figures/vscode.png 98 | --- 99 | width: 90% 100 | --- 101 | Editing the Markdown source for this page in vscode. 102 | ``` 103 | 104 | [I've evaluated many IDEs](https://xcorr.net/2013/04/17/evaluating-ides-for-scientific-python/), and overall, I like [vscode](https://code.visualstudio.com/) best. It's open source, free, and fast. It has very good integrated Python development tools, and it has an impressive array of plugins for almost any imaginable use case. The git and github tools are particularly well integrated, which makes it easy to do source control outside of the command line. There is an integrated debugger, as well as a terminal, so it's a one-stop shop for development. 105 | 106 | Others recommend [PyCharm](https://www.jetbrains.com/pycharm/): it has best-in-class code understanding, and scales well to large codebases. It's free for academics. 107 | 108 | (wsl)= 109 | ## Use WSL on Windows 110 | 111 | ```{figure} figures/wsl.png 112 | --- 113 | width: 90% 114 | --- 115 | WSL running on my Windows laptop 116 | ``` 117 | 118 | Windows' basic terminal lacks basic features. Powershell is powerful but it is very different from other platforms. For a while, the best way to get a Unix-style shell on Windows was to use the git bash tool. In my opinion, these days the best-in-class terminal to use on Windows is *Windows subsystem for Linux* (WSL). 119 | 120 | *WSL* is an emulation layer that allows you to run a full Linux kernel inside of a Windows terminal window. [Once installed](https://docs.microsoft.com/en-us/windows/wsl/install-win10), you can install any Linux OS you like, Ubuntu being the *de facto* standard. 121 | 122 | You won't be able to run GUI applications. However, many tools you'll want to use run webservers, for example jupyter. You'll be able to access these through your normal Windows-based web browser, such as Chrome, Firefox or Edge. 
Your Linux installation will run in a virtual filesystem, which you can access through Windows explorer by typing `explorer` inside a WSL terminal. `code .` will fire up a version of vscode in your current directory. 123 | -------------------------------------------------------------------------------- /docs/brains.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The neuroscience of coding: a primer" 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/brains.tex 8 | --- 9 | 10 | # Brains & coding 11 | 12 | ```{admonition} Optional 13 | This section is optional---it's about the neuroscience of writing code. Feel free to skip to the next section if it doesn't speak to you. 14 | ``` 15 | 16 | What makes coding uniquely difficult? Coding is hard on your memory. This book's theme is that **writing good research code is about freeing your memory**. Neuroscientists distinguish different subtypes of memory, and coding strongly depends on two subtypes: 17 | 18 | - Working memory 19 | - Long-term memory 20 | 21 | Let's look at how these two types of memory are involved during programming. 22 | 23 | ## Working memory 24 | 25 | ```{margin} 26 | Felienne Hermans has a fantastic [talk](https://www.youtube.com/watch?v=g1ib43q3uXQ) and [book](https://www.manning.com/books/the-programmers-brain) on programming in the brain. 27 | ``` 28 | 29 | What do you think coding looks like in the brain? The term _programming language_ makes it seem that reading code is like reading natural language. Neuroscientists have looked at what circuits are engaged when people read code. [Ivanova and colleagues at MIT](https://pubmed.ncbi.nlm.nih.gov/33319744/) measured brain activity while people read Python in the scanner. They found that activations didn't overlap much with conventional language and speech areas. 
30 | 31 | ```{figure} figures/wm-federenko.png 32 | --- 33 | figclass: boxed 34 | width: 90% 35 | --- 36 | Code problems (CP; purple bars) created activations with higher overlap with the multiple demand system (left and center) than with the language system (right) compared with other tasks like sentence problems (SP), sentence reading (SR) or non-word reading (NR). From Ivanova et al. (2020), used under a CC-BY 4.0 license. 37 | ``` 38 | 39 | Instead, they found high overlap with the **multiple-demand system**, a network of areas typically recruited during math, logic and problem solving. The multiple-demand system is also involved in **working memory**, a type of memory that can hold information for a short amount of time. You may have heard that people can only hold approximately 7 items in working memory---for instance, the digits of a phone number. This idea was popularized by the Harvard psychologist George Miller in the 1950's. While there's debate about the exact number of items we can remember in the short term, neuroscientists generally agree that our working memory capacity is very limited. 40 | 41 | Programming involves juggling in your mind many different pieces of information: the definition of the function you want to write, the name of variables you defined, the boundary conditions of the `for` loop you're writing, etc. If you have too many details to juggle, eventually you will forget about one detail, and you will have to slowly reconstruct that detail from context. Do that enough times, and you will completely cease to make progress. Your working memory is very precious indeed. 42 | 43 | ### Saving our working memory 44 | 45 | One of our strongest weapons against overloading our working memory is _convention_. Conventions mean that you don't have to remember details in your working memory: you can rely on your long-term memory instead. For example, let's say you want to call a helper function that splits a url into its constituent parts. 
You might instinctively know that the function to call is `split_url` and not `splitUrl` or `splitURL` or `URLSplitterFactory().do`. That's because Python has a convention that says that you separate words in a variable name with underscores (snake case). If you abide by the convention, you've just saved yourself a working memory slot. To be clear, it's a completely arbitrary convention: JavaScript uses a different convention (camel case) and it works fine. 46 | 47 | We'll see many more examples of organizing code such that it saves our working memory: 48 | 49 | - Writing small functions 50 | - Writing functions with a low number of side effects 51 | - Writing decoupled functions 52 | - Following the Python style guide to the letter 53 | - Using an auto-formatter 54 | 55 | Your working memory is precious, save it! 56 | 57 | ## Long-term memory 58 | 59 | ```{epigraph} 60 | What is this? 61 | 62 | ---You, squinting at code you wrote a year ago 63 | ``` 64 | 65 | _Research_ code in particular is challenging on **long-term memory**. This is because: 66 | 67 | - The project's endpoint might be unclear 68 | - Correctness can be hard to define 69 | - There can be lots of exploration and dead ends before you produce a unit of research 70 | - Sometimes, manual steps are involved requiring human judgement 71 | - You have to remember all the dead ends for the code to even make sense 72 | 73 | ```{admonition} Has this ever happened to you? 74 | 75 | You work on a project for many months, and you submit a paper. You receive reviews months later requiring revisions. You sit down to code and it takes you several days to figure out how to run a supplementary analysis that ended up taking only a few lines of code. 76 | ``` 77 | 78 | While the scientific method, as traditionally taught, is fairly linear, real lab work often involves a highly nonlinear process of discovery. 
79 | 80 | ```{figure} figures/lifecycle_complex.svg 81 | --- 82 | width: 450px 83 | figclass: boxed 84 | --- 85 | Generating a research paper can be a messy process. 86 | ``` 87 | 88 | Future you will have forgotten 90% of that process. Code that sticks to convention, is tidy and well-documented will be far easier to use in the future than clever, obtuse code. Boring code is good code! 89 | 90 | ## Discussion 91 | 92 | ```{epigraph} 93 | Simple is better than complicated. Complicated is better than complex. 94 | 95 | ---The Zen of Python 96 | ``` 97 | 98 | We've seen that coding, especially research code, is difficult along two axes: 99 | 100 | - working memory 101 | - long-term memory 102 | 103 | We've discussed some of our overall strategies to deal with these limitations, namely: 104 | 105 | - using convention 106 | - keeping code tidy 107 | - documenting our code 108 | 109 | Let's see how you can implement these strategies in practice. Let's jump in! 110 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # The Good Research Code Handbook 2 | 3 | This handbook is for grad students, postdocs and PIs who do a lot of programming as part of their research. It will teach you, in a practical manner, how to organize your code so that it is easy to understand and works reliably. 4 | 5 | Most people who write research code are not trained in computer science or software engineering. It can feel like an uphill battle when you have to write code without the right training. Do you ever: 6 | 7 | - Feel like you don't know what you're doing 8 | - Feel like an impostor 9 | - Write code with lots of bugs 10 | - Hate your code and don't want to work on it 11 | - Have trouble finishing projects 12 | - Contemplate buying a small organic farm in upstate Vermont and read far too much about goat husbandry 13 | 14 | Did I hit a nerve? Yes?! 
Then you're in for a treat! This book will help you get from 0 to 1 on good software engineering practices for research projects. 15 | 16 | ```{epigraph} 17 | Is my code fast? No. But is it well documented? No. But does it work? Also no. 18 | 19 | ---[@KyleMorgenstein](https://twitter.com/KyleMorgenstein) 20 | ``` 21 | 22 | ## Prerequisites 23 | 24 | I've tried to write this book in a progressive manner: some content is targeted at complete novices, other to programmers who are farther along on their journey. However, I generally assume that you have some familiarity with the following: 25 | 26 | - **Python**: this intro is Python-centric. You can write good code for Matlab, R, or Julia, but we won't cover that here. You don't need to be a Python expert, but you'll get the most out of this if you've been using Python on a regular basis for at least a month, and if you have some passing familiarity with the python data science ecosystem (numpy, matplotlib, pandas, etc.). 27 | - **Git & Github**: a lot of the practices introduced here will require you to change your code, which could cause existing functionality to break. You might even accidentally delete something important! Mastering git and github means you will have a time machine for your code, so you can revert to an earlier state. [There's a great intro to git for beginners from software carpentries](https://swcarpentry.github.io/git-novice/). 28 | - **The command line**: You will need to run some commands on the command line to implement some of the advice in this book. I'm going to assume that you have some familiarity with running commands from a Unix-style shell (e.g. bash). [There's a great intro to the unix shell from software carpentries](http://swcarpentry.github.io/shell-novice/). If you're using Windows, you will still be able to run many tools from the Windows command prompt. 
Long term, your life will be easier if you [install the Windows Subsystem for Linux (WSL)](wsl) which will give you access to a Unix-style shell. 29 | 30 | Some of the examples I use are neuroscience-inspired---but neuroscience background is absolutely not a requirement. Appreciation for unicorns unicorns is a big plus. 31 | 32 | ## Why did I make this? 33 | 34 | I'm [Patrick Mineault](https://xcorr.net) Twitter. I did my PhD in computational neuroscience at McGill University, in Montreal. I wrote a lot of not very good code, mostly in Matlab. One time, my code was in a non-working state for an entire month---I would furiously type on the keyboard all day in the hopes it would eventually work, and it didn't. It made me sad. But I managed to graduate. Then I did a postdoc. More of the same. 35 | 36 | Eventually, I decided to properly study CS. I studied data structures, algorithms and software engineering practices, and I got a big-boy job as a software engineer at Google in California. It was then that I learned the error of my ways. I had lost time during my research days because I didn't know how to organize and write code that didn't self-destruct out of spite. But this is fixable! With knowledge! 37 | 38 | At the invitation of Ella Batty, I gave a workshop for the students in neuroscience at Harvard in January of 2021 on writing good research code. The feedback was overwhelmingly positive, so I decided to expand it into this handbook. I hope you enjoy it! 39 | 40 | ![It me](figures/pic.jpeg) 41 | 42 | ## Citing this handbook 43 | 44 | 45 | 46 | 10.5281/zenodo.5796873 47 | 48 | Patrick J Mineault & The Good Research Code Handbook Community (2021). _The Good Research Code Handbook_. Zenodo. [doi:10.5281/zenodo.5796873](https://dx.doi.org/10.5281/zenodo.5796873) 49 | 50 | ## Alternative formats 51 | 52 | You can download a PDF version of this book here. 
You can also take a look at a presentation I gave covering [the same materials](https://github.com/patrickmineault/research_code) [here](https://www.crowdcast.io/e/nma2021/29). [I discuss some of the practices in this book with Brian Okken on the Test & Code podcast](https://testandcode.com/193). Please contact me if you'd like to see a print edition of this book. 53 | 54 | ## Contact me 55 | 56 | I help businesses and labs implement the practices outlined in this book through my consulting firm, [xcorr consulting](https://xcorr.dev). Please get in touch at [hello@xcorr.dev](mailto:hello@xcorr.dev) to find out how we can work together. I'm also available for speaking engagements and collabs---reach out to the same address for that. 57 | 58 | Issues with the handbook can be reported on Github. I'm also active on Twitter. 59 | -------------------------------------------------------------------------------- /docs/figures/pure-impure.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 37 | 39 | 44 | 47 | 53 | 54 | 57 | 63 | 70 | 71 | 78 | Impure functions 89 | Not ideal 100 | Better 111 | 118 | Pure functions 129 | 130 | 131 | -------------------------------------------------------------------------------- /docs/social.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Make coding social 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/social.tex 8 | --- 9 | 10 | # Make it social 11 | 12 | ```{epigraph} 13 | Maybe the real *good research code* is the friends we made along the way. 14 | 15 | ---Patrick Mineault 16 | ``` 17 | 18 | People think that programming is a solitary activity. Engineers, and software engineers in particular, are caricatured as socially inept, basement-dwelling dweebs in popular culture. The reality is that programming is a highly social activity. 
At a place like Google, for instance, programmers are in constant contact with other programmers, through: 19 | 20 | * readability reviews 21 | * code reviews 22 | * design reviews 23 | * pair programming 24 | * reading groups 25 | * retreats 26 | * performance reviews 27 | 28 | This is how you get better at programming: by programming with people who are better at programming than you. 29 | 30 | ## Pair program 31 | 32 | *Pair programming* is a very effective method of sharing knowledge through active practice. It's commonly used in industry. Two programmers collaborate actively on a programming task. In the traditional driver-and-navigator style, two programmers are physically co-located. One of them---the *driver* or pilot -- physically types the code into the terminal or code editor. They think about micro-issues in the code (tactics): what goes into the body of a `for` loop, the syntax of the code, etc. 33 | 34 | The *navigator* or co-pilot assist the driver in telling them what to write. They typically focus on macro issues: what a function should accomplish, how the code is structured, etc. They can also perform other tasks on a second laptop, for example looking up documentation. 35 | 36 | ```{margin} 37 | I wrote this book in part to become better at the skills I'm now teaching you. Teaching is a great way to learn. 38 | ``` 39 | 40 | Pair programming forces you to hone your communication skills. Nothing quite reveals your gaps in knowledge than trying to explain to someone what is going on with a piece of code. Sometimes, the people you're pairing with can immediately fill your gap in knowledge; in other cases, you can both learn about a new subject at the same time. Pair programming is especially effective at transmitting knowledge about under-documented systems. You may have explained to a new student in your lab how to perform mysterious experimental procedure **X**. 
The best way to transmit that knowledge is to have the student attempt to perform the procedure in front of you: active practice enhances learning. 41 | 42 | ```{margin} 43 | I learned about `Ctrl+Shift+R` (reverse search in bash) through pair programming. 44 | ``` 45 | 46 | Finally, pair programming can help you learn someone's productivity shortcuts. Seeing somebody work comfortably in an unfamiliar environment is enlightening. I have seen programmers be productive in vim, and it is a sight to behold. 47 | 48 | To many, pair programming sounds like a nightmare. It's certainly *uncomfortable*. Things might go too slow or too fast for you, and it can be mentally draining. It's best to do it in short increments (e.g. one hour). In all cases, remember to turn your empathy up to 11, and be excellent to each other. 49 | 50 | Although pair programming was traditionally done by physically co-located programmers, many find remote pair programming more comfortable. Screen sharing in Zoom works but can feel intrusive: imagine accidentally showing your inbox or IMs. Instead, you can use an IDE where you can see the other person's cursor. Some environments to do this: 51 | 52 | * [VSCode Liveshare for in-IDE editing](https://code.visualstudio.com/blogs/2017/11/15/live-share) 53 | * [DeepNote](https://deepnote.com/) and [cocalc](https://cocalc.com/) for jupyter notebooks 54 | * [Replit](https://replit.com/) for pure Python in the browser 55 | 56 | ## Set up code review in your lab 57 | 58 | Reading and critiquing other people's code is a great way to learn. *Code review* is the practice of peer reviewing other people's code. You can use [Github pull requests to give and receive line-by-line feedback on code](https://docs.github.com/en/enterprise-server@2.20/github/collaborating-with-issues-and-pull-requests/reviewing-proposed-changes-in-a-pull-request) asynchronously. Alternatively, you can set up a code review circle in your lab. 
It works like a lab meeting, but instead of doing presentations, you all read code and comment on it at once. Again, it's uncomfortable and awkward, but you can learn a lot this way. 59 | 60 | ## Participate in open source 61 | 62 | Maybe your local environment isn't ideal for you to become better at programming. Perhaps you're the only person in your lab that programs. Becoming part of an open source project is a great way to find like-minded people you can learn from. 63 | 64 | Joining an open source project doesn't have to be an all or none affair. You can dip your toe in by opening an issue on a software project that you use. Are people responsive? Are they nice? If yes, then you can increase your involvement by starting a pull request to add a feature to a project. Generally, it's better to open an issue first to tell people about your plans; many larger projects are careful not to introduce new features, because it increases the amount of long-term maintenance people need to do. 65 | 66 | Sometimes, you can pick a moribund open source project and maintain it. And if something doesn't exist yet, you can start your own open source project. You will do everything wrong, but you will learn a ton. 67 | 68 | ## Find your community 69 | 70 | In addition to projects, there are communities of programers that support each other. [Hacker spaces](https://wiki.hackerspaces.org/w/index.php) promote self-reliance through learning technical skills, which include fabrication, woodworking, sewing and programming. Many meetups exist, focused on Python, data science, deep learning, or more. Some groups are designed as safe spaces for underrepresented people, for example [PyLadies](https://www.pyladies.com/). 71 | 72 | Finally, you can go on a programmer's retreat to deep dive into a tech, for example through the [Recurse Center](https://www.recurse.com/). This will put you in contact with many other like-minded people who are in different places in their learning journey. 
Creating that deep web of connections will be invaluable to making you feel connected to the community at large. 73 | 74 | ## Discussion 75 | 76 | As we've seen throughout this book, there are many ways you can improve how you write code. However, one of the highest leverage actions you can take to improve your craft is to immerse yourself in a community of practice. Be excellent to each other, learn from each other and give back to the community 🌠🌈. 77 | 78 | ```{admonition} 5-minute exercise 79 | Schedule a pair programming session with a lab mate. 80 | ``` 81 | -------------------------------------------------------------------------------- /assemble_book.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import yaml 3 | 4 | 5 | template = r""" 6 | \documentclass[letterpaper,twoside,openright]{scrbook} 7 | \usepackage{hyperref} 8 | \usepackage{datetime} 9 | \usepackage{graphicx} 10 | \usepackage{natbib} 11 | \usepackage{framed} 12 | \usepackage[utf8]{inputenc} 13 | \usepackage{svg} 14 | 15 | % Header stuff 16 | \usepackage{scrlayer-scrpage} 17 | 18 | \usepackage[top=80pt, bottom=80pt, left=80pt, right=80pt]{geometry} 19 | 20 | \clearscrheadfoot 21 | \ihead{\headmark} 22 | \ohead{\pagemark} 23 | 24 | \rofoot{Patrick Mineault} 25 | 26 | % Use a smaller verbatim font to prevent overfull hboxes 27 | \usepackage{etoolbox} 28 | \makeatletter 29 | \patchcmd{\@verbatim} 30 | {\verbatim@font} 31 | {\verbatim@font\small} 32 | {}{} 33 | \makeatother 34 | 35 | % Listing code 36 | \usepackage{courier} 37 | \usepackage[formats]{listings} 38 | \lstdefinestyle{mystyle}{ 39 | basicstyle=\ttfamily\footnotesize, 40 | breakatwhitespace=false, 41 | breaklines=true, 42 | captionpos=b, 43 | frame=tB, 44 | aboveskip=16pt, 45 | belowskip=16pt, 46 | keepspaces=true, 47 | showspaces=false, 48 | showstringspaces=false, 49 | showtabs=false, 50 | tabsize=2 51 | } 52 | 53 | \lstdefineformat{Python}{~=\( \sim \)} 54 | 55 | 56 | % 
def clean_input(md):
    """Pre-process MyST markdown so curvenote's tex exporter can handle it.

    Epigraph directives are flattened into plain blockquotes: the fenced
    directive lines are dropped and every inner line is prefixed with
    "> ". Dropdown, margin and tabbed directives are downgraded to
    admonitions, and the star/rainbow emoji that LaTeX cannot typeset are
    stripped from ordinary lines.

    Arguments:
        md (str): raw markdown source of one chapter.

    Returns:
        str: the transformed markdown.
    """
    out = []
    inside_epigraph = False
    for raw in md.split("\n"):
        if inside_epigraph:
            # A closing fence terminates the epigraph (and is dropped);
            # every other line becomes a blockquote line.
            if "```" in raw:
                inside_epigraph = False
            else:
                out.append("> " + raw)
            continue
        if "{epigraph}" in raw:
            # Drop the opening fence itself and start quoting.
            inside_epigraph = True
        elif "{dropdown}" in raw:
            out.append(raw.replace("dropdown", "admonition"))
        elif "{margin}" in raw:
            out.append(raw.replace("{margin}", "{admonition} Note"))
        elif "{tabbed}" in raw:
            out.append(raw.replace("{tabbed}", "{admonition}"))
        else:
            out.append(raw.replace("🌠", "").replace("🌈", ""))
    return "\n".join(out)
def clean_output(book):
    """Post-process curvenote's tex output so the assembled book compiles.

    Per line: swaps includegraphics for includesvg on .svg figures,
    normalizes starred captions, converts verbatim environments into
    Python-highlighted lstlisting environments, wraps quote environments
    in the custom quotebox environment, repairs hyphens the exporter
    broke inside code listings, and rewrites "testing.md" references
    outside of listings.

    Arguments:
        book (str): concatenated tex source.

    Returns:
        str: the cleaned tex source.
    """
    # Ordered (broken, fixed) pairs applied inside listings only.
    # Order matters: "- -" must collapse before " - " is handled.
    dash_fixes = (
        ("- -", "--"),
        (" - ", "-"),
        ("true -neutral -cookiecutter", "true-neutral-cookiecutter"),
        (" -forge", "-forge"),
        ("| --", "|--"),
        ("egg -info", "egg-info"),
        ("sphinx -quickstart", "sphinx-quickstart"),
        ("non -integer", "non-integer"),
        ("codebook -testbucket", "codebook-testbucket"),
    )
    cleaned = []
    in_listing = False
    for line in book.split("\n"):
        if ".svg" in line and "includegraphics" in line:
            cleaned.append(line.replace("includegraphics", "includesvg"))
        elif "caption*" in line:
            cleaned.append(line.replace("caption*", "caption"))
        elif r"\begin{verbatim}" in line:
            cleaned.append(
                line.replace(
                    r"\begin{verbatim}", r"\begin{lstlisting}[language=Python]"
                )
            )
            in_listing = True
        elif r"\end{verbatim}" in line:
            cleaned.append(line.replace(r"\end{verbatim}", r"\end{lstlisting}"))
            in_listing = False
        elif r"\begin{quote}" in line:
            cleaned.append(r"\begin{quotebox}{quote}")
        elif r"\end{quote}" in line:
            cleaned.append(r"\end{quotebox}")
        elif in_listing:
            for broken, fixed in dash_fixes:
                line = line.replace(broken, fixed)
            cleaned.append(line)
        else:
            cleaned.append(line.replace("testing.md", "testing"))
    return "\n".join(cleaned)
stdout, stderr = process.communicate() 214 | print(stderr.decode("utf-8")) 215 | 216 | book_parts = collections.defaultdict(str) 217 | 218 | book_parts["Intro"] += process_one("front-print") 219 | for part in toc["parts"]: 220 | for chapter in part["chapters"]: 221 | book_parts[part["caption"]] += process_one(chapter["file"]) 222 | 223 | the_map = { 224 | "Intro": "[-FRONTMATTER-]", 225 | "Lessons": "[-CONTENT-]", 226 | "Extras": "[-BACKMATTER-]", 227 | } 228 | 229 | complete = template 230 | for k, v in the_map.items(): 231 | part = clean_output(book_parts[k]) 232 | complete = complete.replace(v, part) 233 | 234 | with open("tmp/exports/book-complete.tex", "w") as f: 235 | f.write(complete) 236 | 237 | 238 | if __name__ == "__main__": 239 | assemble_onepager() 240 | -------------------------------------------------------------------------------- /docs/tidy.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Keep things tidy 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/tidy.tex 8 | --- 9 | 10 | # Keep things tidy 11 | 12 | ```{epigraph} 13 | Does it spark joy? 14 | 15 | ---Marie Kondo 16 | ``` 17 | 18 | Keeping things consistent and tidy will free your working memory from having to remember extraneous information like variable names and paths. 19 | 20 | ## Use the style guide 21 | 22 | Generally, Python uses: 23 | 24 | - Snake case for variables and module: `variable_name`, `my_module.py` 25 | - Camel case for class name: `MyClass` 26 | - Camel case with spaces for jupyter notebook: `Analyze Brain Data.ipynb` 27 | 28 | You might know that Python generally uses 4 spaces for indentation, and that files are expected to be at most 80 columns long. These and many other elements of style are written in [PEP 8](https://www.python.org/dev/peps/pep-0008/). Code that adheres to style tends to be easier to read. 
29 | 30 | ```{margin} 31 | Big software companies like Google [have their own coding style guide](https://google.github.io/styleguide/pyguide.html). Even Guido von Rossum, inventor of Python, had to follow Google's style guide when he was working at Google. 32 | ``` 33 | 34 | Reading style guides is nobody's idea of a good time, but thankfully tools exist to help you maintain good coding style. If you prefer to eventually learn the rules, you can install `flake8` or `pylint`. Both tools are _linters_---detectors of bad style---which allow you to find and correct common deviations from the style guide. The ideal place to use the linter is inside of an IDE, [for example VSCode](vscode). It's also possible to use linters from the command line. 35 | 36 | A more radical way to impose style is to use a _code formatter_. A linter suggests fixes which you implement yourself; a formatter fixes issues automatically whenever you save a file. `black` [imposes consistent Python style](https://github.com/psf/black), and has plugins for all the popular IDEs. `black` is particularly useful to run on old code with haphazard style; run it once to upgrade code to a standard format. 37 | 38 | ## Delete dead code 39 | 40 | ```{figure} figures/final.doc.png 41 | --- 42 | width: 450px 43 | --- 44 | Dead code is a liability. From "Piled Higher and Deeper" by Jorge Cham. [www.phdcomics.com](https://www.phdcomics.com) 45 | ``` 46 | 47 | Code that gets developed over time can accumulate lots of dead ends and odds and ends. After a while, the majority of the code in your project might be dead: code that never gets called. You know who hates dead code? You, in three months. Navigating a project that contains stale or incorrect code can waste a huge amount of time. Whenever you're about to put aside a project for a long time---for instance, after submitting a manuscript---_clean up dead code_. Delete dead code from the main branch. 
With git and github, you have access to a time machine, so you can always revert if you mess up. 48 | 49 | If you're not used to this workflow, you might be scared of messing something up. You can download an archive of the repo before the cleanup to reassure yourself. If you've been diligent about committing and pushing code to Github, however, deleting dead code is a safe process. [`vulture` can help you find dead code in your codebase](https://github.com/jendrikseipp/vulture). Unit tests can help you verify that your codebase will still run after you eliminate dead code---we will cover this in a later lesson. 50 | 51 | ## Keep jupyter notebooks tidy 52 | 53 | ```{epigraph} 54 | If you use notebooks to develop software, you are probably using the wrong tool. 55 | 56 | ---[Yihui Xie](https://yihui.org/en/2018/09/notebook-war/) 57 | ``` 58 | 59 | Jupyter notebooks present a special challenge to keep tidy because of their inherently nonlinear nature. It's commonplace for long-running notebooks to accumulate lots of cruft: dead cells, out-of-date analyses, inaccurate commentary. Moreover, it takes a lot of discipline to put imports and functions at the start of notebooks. They don't play well with source control, so it can be hard to track down what has changed from one version of a notebook to another. 60 | 61 | ```{figure} figures/jupyter.svg 62 | --- 63 | width: 500px 64 | figclass: boxed 65 | --- 66 | Jupyter notebooks are very good at literate programming -- play to their strengths by focusing your jupyter notebooks on mixing explanations, text and graphics. 67 | ``` 68 | 69 | My [_somewhat_ controversial advice](https://news.ycombinator.com/item?id=18336202) is to keep IO, long-running pipelines, and functions and classes out of jupyter notebooks. Jupyter notebooks excel at literate programming---mixing code, textual explanations, and graphics. 
If you focus the scope of your jupyter notebooks to literate programming, you'll reduce the amount of cruft that you will need to clean up. As a side benefit, you'll be able to develop more software inside of an IDE---like VSCode or PyCharm---which has a deeper understanding of your code and supports powerful multi-file search. As such, you'll be well on your way to develop testable code. 70 | 71 | ```{tip} 72 | When you start a jupyter notebook, write 3-5 bullet points on what you want the analysis to accomplish. You'd be surprised how much this prevents notebooks from getting out of hand. 73 | ``` 74 | 75 | ### Make sure your notebooks run top to bottom 76 | 77 | [Pimentel et al. (2021)](https://link.springer.com/article/10.1007/s10664-021-09961-9#Sec18) found that only about 25% of jupyter notebooks scraped from GitHub ran from top to bottom without error. When you have context, it might only take a couple of minutes to re-order the cells and fix the errors; in several months, you could waste days on this. 78 | 79 | Therefore, before you commit a notebook to git, get into the habit of _restarting the kernel and running all_. Often, you will find that the notebook will not run top to bottom; fix the underlying error, and _restart and run all_ until your notebook runs again. If it's impractical to restart a notebook because you have a long-running pipeline in a cell, and executing the whole notebook takes a long time, move the relevant code outside the notebook and into a separate script. As a rule of thumb, a jupyter notebook _should run from top to bottom in a minute or less_. 80 | 81 | ### Be productive mixing modules and notebooks 82 | 83 | It can be difficult to co-develop a notebook and a module side-by-side, because whenever you change the module you will need to reload the library, often by restarting the kernel. 
Running these two magics---special commands recognized by jupyter---at the top of your notebook will ensure that the module is automatically reloaded whenever you change it. 84 | 85 | ```python 86 | %load_ext autoreload 87 | %autoreload 2 88 | ``` 89 | 90 | Now, it will feel uncomfortable to move away from jupyter notebooks for some workflows. You might be used to writing small snippets of code and then interact with it immediately to see whether it works: moving the code to a module means you can't use it in this very immediate fashion. We'll fix this discomfort later as we learn about [testing](testing). 91 | 92 | ### Refactor comfortably 93 | 94 | Refactoring and cleaning up a notebook can be a pain in the jupyter environment: moving a cell across several screens is a pain. [`jupytext` can seamlessly translate between a regular jupyter notebook and a markdown-based textual representation](https://jupytext.readthedocs.io/en/latest/). In my opinion, refactoring and moving cells around is far easier in the text representation. Checking in the jupytext representation of a notebook to source control also makes it easy to compare different versions of the same analysis. 95 | 96 | Move imports and function definitions to the top of your notebook. Look at Markdown headers and verify that they meaningfully summarize the analysis presented in that section. I find that it's better to write good headings and little long-form text at the start of an analysis, when the analysis still has a lot of room to shift, and to fill in the text later. Read the descriptions and check that they're up-to-date. Delete cells with obsolete analyses from the bottom of notebook: you can always recover them with source control if you've checked in the jupytext representation. 97 | 98 | ## Discussion 99 | 100 | ```{epigraph} 101 | There should be one---and preferably only one---obvious way to do it. 
102 | 103 | ---[The Zen of Python](https://zen-of-python.info/there-should-be-one-and-preferably-only-one-obvious-way-to-do-it.html#13) 104 | ``` 105 | 106 | It's easy to mock style guides as pedantic nitpicking. After all, style, like spelling, is ultimately arbitrary. However, adhering to a standard style can help you preserve your working memory. Don't spend precious mental energy making lots of micro-decisions about variable names and how many spaces to put after a parenthesis: use the style guide. If there's an obvious way of doing things, do it that way. 107 | 108 | The short term advantage of using consistent structure compounds over time. Once you've put aside a project for long enough, you will need to reacquaint yourself with it anew, and cruft and dead ends will no longer make sense. Maintaining good code hygiene will make your future self happy. 109 | 110 | ```{admonition} 5-minute exercise 111 | Install `pylint` and run on a script you're currently working on. What did you learn? 112 | ``` 113 | -------------------------------------------------------------------------------- /docs/docs.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | title: "Write good documentation" 12 | exports: 13 | - format: tex 14 | logo: false 15 | template: ../templates/plain_latex_book_chapter 16 | output: exports/docs.tex 17 | --- 18 | 19 | # Document your code 20 | 21 | ```{epigraph} 22 | Documentation is a love letter that you write to your future self. 23 | 24 | ---Damian Conway 25 | ``` 26 | 27 | When I say documentation, what comes to mind? For many people, _documenting code_ is synonymous with _commenting code_. That's a narrow view of documentation. 
I take a larger view here---documentation is any meta-information that you write _about_ the code. In that larger view, all of these are documentation: 28 | 29 | - Single-line comments 30 | - Multi-line comments 31 | 32 | ```{margin} 33 | A unit test is meta-code which tells you the normative behavior of other code. It's a kind of documentation. Woah. 34 | ``` 35 | 36 | - Unit tests 37 | - Docstrings at the top of functions 38 | - Docstrings at the top of modules 39 | - Makefiles 40 | - Bash files 41 | - README.md 42 | - Usage documentation 43 | - Tutorial jupyter notebooks on using the code 44 | - Auto-generated documentation hosted on readthedocs 45 | - Websites with tutorials 46 | 47 | In this chapter, I'll talk about documenting small units of code---functions and modules. I will cover things that are not conventionally considered documentation, but that nevertheless clarify how to use code. In the next section, I'll discuss documenting larger units of code: programs, projects and pipelines. 48 | 49 | ## Raise errors 50 | 51 | ```{epigraph} 52 | Errors should never pass silently. 53 | 54 | ---The Zen of Python 55 | ``` 56 | 57 | People don't read manuals. That includes _you_. What people do read are _error messages_. Consider the following function stub, which is meant to convolve two vectors together: 58 | 59 | ```{code-cell} 60 | def conv(A, B, padding='valid'): 61 | """ 62 | Convolves the 1d signals A and B. 63 | 64 | Args: 65 | A (np.array): a 1d numpy array 66 | B (np.array): a 1d numpy array 67 | padding (str): padding type (valid, mirror) 68 | 69 | Returns: 70 | (np.array) The convolution of two vectors. 71 | """ 72 | pass 73 | ``` 74 | 75 | This is a fine docstring; it tells you how to use the code. 
Now consider the alternative function: 76 | 77 | ```{code-cell} 78 | def conv(A, B, padding='valid'): 79 | assert A.ndim == 1, "A must be one dimensional" 80 | assert B.ndim == 1, "B must be one dimensional" 81 | if padding not in ('valid', 'mirror'): 82 | raise NotImplementedError( 83 | f"{padding} not implemented.") 84 | pass 85 | ``` 86 | 87 | This code does not tell you how to use it: it _yells_ at you if you use it wrong. The first way relies on your good nature to read the docs; the second way _forces you_ to use the code as it was intended. I would argue that the second is better [^combine]. There are several ways to generate user errors: 88 | 89 | [^combine]: You can combine raising errors and write good docstrings. 90 | 91 | - `assert`: When an assert doesn't pass, it raises an `AssertionError`. You can optionally add an error message at the end. 92 | - `NotImplementedError`, `ValueError`, `NameError`: [Commonly used, generic errors](https://docs.python.org/3/library/exceptions.html) you can raise. I probably overuse `NotImplementedError` compared to other types. 93 | - Type hints: Python 3 has type hints, and you can optionally enforce type checking using decorators with [`enforce`](https://github.com/RussBaz/enforce) or [`pytypes`](https://pypi.org/project/pytypes/). Type checking is a bit controversial because it goes against Python's dynamic nature. It depends on your use case: if you like them, use them. 94 | 95 | The unit tests that we discussed last chapter are another mechanism through which you can raise errors: not in the main code path of your code, but in a secondary path that you run through the `pytest` command line. 96 | 97 | ## Write in-line comments 98 | 99 | ```{epigraph} 100 | Don't comment bad code---rewrite it. 
---Kernighan & Plauger, cited by Robert Martin in [Clean Code](https://www.oreilly.com/library/view/clean-code-a/9780136083238/)
There are three prevalent styles of docstrings: 135 | 136 | ```{margin} 137 | [Sphinx](https://www.sphinx-doc.org/en/master/) is the standard way to generate HTML documentation in Python. Sphinx is very powerful: this book is generated by jupyterbook, which uses sphinx to do its job! 138 | ``` 139 | 140 | - [reST (reStructuredText)](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html) 141 | - [Google style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) 142 | - [Numpy style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html) 143 | 144 | reST is more prevalent because it's the default in Sphinx, but I think the Google style is easier to read for humans and I prefer it. Here's how you would document a function which counts the number of occurrences of a line: 145 | 146 | ````{tabbed} Google-style 147 | ``` 148 | def count_line(f, line): 149 | """ 150 | Counts the number of times a line occurs. Case-sensitive. 151 | 152 | Arguments: 153 | f (file): the file to scan 154 | line (str): the line to count 155 | 156 | Returns: 157 | int: the number of times the line occurs. 158 | """ 159 | num_instances = 0 160 | for l in f: 161 | if l.strip() == line: 162 | num_instances += 1 163 | 164 | return num_instances 165 | ``` 166 | ```` 167 | 168 | ````{tabbed} Numpy-style 169 | ``` 170 | def count_line(f, line): 171 | """ 172 | Counts the number of times a line occurs. Case-sensitive. 173 | 174 | Parameters 175 | ---------- 176 | f: file 177 | the file to scan 178 | line: str 179 | the line to count 180 | 181 | Returns 182 | ------- 183 | int 184 | the number of times the line occurs. 185 | """ 186 | num_instances = 0 187 | for l in f: 188 | if l.strip() == line: 189 | num_instances += 1 190 | 191 | return num_instances 192 | ``` 193 | ```` 194 | 195 | ````{tabbed} reST 196 | ``` 197 | def count_word(f, line): 198 | """ 199 | Counts the number of times a line occurs. Case-sensitive. 
200 | 201 | :param f: the file to scan 202 | :type f: file 203 | :param line: the line to count 204 | :type line: str 205 | :returns: the number of times the line occurs. 206 | :rtype: int 207 | """ 208 | num_instances = 0 209 | for l in f: 210 | if l.strip() == line: 211 | num_instances += 1 212 | 213 | return num_instances 214 | ``` 215 | ```` 216 | 217 | Docstrings for this function will appear in the REPL and in jupyter notebook when you type `help(count_word)`. They will also be parsed and displayed in IDEs like vscode and PyCharm. 218 | 219 | See which style of docstring you prefer and stick to it. Autodocstring, an extension in vscode, can you help you [automatically generate a docstring stub](https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring). It uses the Google style by default. 220 | 221 | ```{warning} 222 | Docstrings can age poorly. When your arguments change, it's easy to forget to change the docstring accordingly. I prefer to wait until later in the development process when function interfaces are stable to start writing docstrings. 223 | ``` 224 | 225 | ## Publish docs on Readthedocs 226 | 227 | ```{margin} 228 | Sphinx can auto-generate docs from Google and Numpy-style docstrings [with a plugin](https://www.sphinx-doc.org/en/master/usage/extensions/napoleon.html). 229 | ``` 230 | 231 | You know those sweet static docs that you see on readthedocs? You can generate this kind of documentation from docstrings using Sphinx. [There's a great tutorial to get you started](https://docs.readthedocs.io/en/stable/intro/getting-started-with-sphinx.html), but in essence getting basic generated docs is a matter of typing 4 commands: 232 | 233 | ```shell 234 | pip install sphinx 235 | cd docs 236 | sphinx-quickstart 237 | make html 238 | ``` 239 | 240 | [Uploading the docs to readthedocs](https://docs.readthedocs.io/en/stable/tutorial/index.html#importing-the-project-to-read-the-docs) (or Github pages or netlify) is a one-command affair. 
Docs which focus exclusively on usage (what arguments to use, their types, the returns) are of pretty limited use by themselves. They're powerful when combined with high-level instructions, tutorials and walkthroughs. We'll cover how to write these in the next chapter. 241 | 242 | ## Discussion 243 | 244 | Good documentation helps maintain the long-term memory of a project. Very tricky code must be documented with care so that the memory of its intent and implementation is preserved. However, if you have a choice between documenting tricky code and refactoring the code so that it's less tricky, you'll often find that refactoring code pays off over the long term. Similarly, it's often more productive to write unit tests that lock in how the code works than to explain how the code _should_ work in words. Document code that needs to be documented, improve the code that can be improved, and develop the wisdom to tell them apart. 245 | 246 | ```{admonition} 5-minute exercise 247 | Write a docstring for a function you've worked on. 248 | ``` 249 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 4 | 5 | Section 1 – Definitions. 
6 | 7 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 8 | Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 9 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 10 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 11 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 12 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 
13 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 14 | Licensor means the individual(s) or entity(ies) granting rights under this Public License. 15 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 16 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 17 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 18 | 19 | Section 2 – Scope. 20 | 21 | License grant. 22 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 23 | reproduce and Share the Licensed Material, in whole or in part; and 24 | produce, reproduce, and Share Adapted Material. 25 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 26 | Term. The term of this Public License is specified in Section 6(a). 27 | Media and formats; technical modifications allowed. 
The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 28 | Downstream recipients. 29 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 30 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 31 | No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 32 | 33 | Other rights. 34 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 35 | Patent and trademark rights are not licensed under this Public License. 
36 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties. 37 | 38 | Section 3 – License Conditions. 39 | 40 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 41 | 42 | Attribution. 43 | 44 | If You Share the Licensed Material (including in modified form), You must: 45 | retain the following if it is supplied by the Licensor with the Licensed Material: 46 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 47 | a copyright notice; 48 | a notice that refers to this Public License; 49 | a notice that refers to the disclaimer of warranties; 50 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 51 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 52 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 53 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 54 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 55 | If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License. 
56 | 57 | Section 4 – Sui Generis Database Rights. 58 | 59 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 60 | 61 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database; 62 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and 63 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 64 | 65 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 66 | 67 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 68 | 69 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 
70 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 71 | 72 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 73 | 74 | Section 6 – Term and Termination. 75 | 76 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 77 | 78 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 79 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 80 | upon express reinstatement by the Licensor. 81 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 82 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 83 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 84 | 85 | Section 7 – Other Terms and Conditions. 86 | 87 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 
88 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 89 | 90 | Section 8 – Interpretation. 91 | 92 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 93 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 94 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 95 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 96 | -------------------------------------------------------------------------------- /docs/cka.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to test numerical code: CKA" 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/cka.tex 8 | --- 9 | 10 | # Test numerical code: CKA 11 | 12 | Let's look at an extended example of testing numerical code. This example implements a computational method called CKA which was introduced [in this paper](https://arxiv.org/abs/1905.00414). Importantly, CKA is not already implemented in scipy or sci-kit learn or in any other pip installable package: we're flying solo [^caveat]. 
13 | 14 | [^caveat]: [There is an implementation in a notebook from the authors](https://colab.research.google.com/github/google-research/google-research/blob/master/representation_similarity/Demo.ipynb). 15 | 16 | Does this make you nervous? It should! It's easy to make a mistake in a computational pipeline and get the wrong results. With the structured approach that I've introduced in this handbook, you can work with far more confidence. 17 | 18 | ## Background 19 | 20 | We're going to implement centered kernel alignment (CKA) and apply it on test data. Because I wanted the method not to be implemented already in a major Python package, it had to be pretty obscure. I don't expect anyone reading this to have already heard of CKA, so a quick intro is in order. In brief, _CKA is a way to compare two matrices, in the same way that Pearson's correlation can compare two vectors_. It has applications to studying the brain and neural nets. 21 | 22 | ### Comparing deep neural nets and brains 23 | 24 | Deep artificial neural nets perform fabulous feats, whether it's detecting objects in images or translating speech to text. Neuroscientists often wonder whether or not these neural nets do their work in ways similar to the brain. To answer this question, they rely on methods that follow the same core recipe: 25 | 26 | 1. pick a battery of stimuli 27 | 1. measure the response of a brain to the battery of stimuli (in a MRI scanner, let's say) 28 | 1. measure the response of a deep neural net to the battery of stimuli. 29 | 1. Compare the two sets of responses 30 | 31 | If the two sets of responses are similar, that means the brain and the deep neural net are aligned in some sense. We collect the responses into two matrices, $X$ and $Y$ which are of size *N*x*K* and *N*x*L*, respectively; _N_ is the number of stimuli. Then the game, in the final step, is to compare the two matrices $X$ and $Y$. CKA is a metric introduced in Kornblith et al. 
(2019) that can be used to compare two matrices and that has some nice properties.
74 | 75 | Indeed, a lot can go wrong in implementing this short function. Let's write down some tests to reassure ourselves that this function does what it needs to do. 76 | 77 | ## Writing our first test 78 | 79 | The first test is the identity test: the CKA of a matrix with itself is 1, just like with Pearson's correlation. Let's write the identity test as part of a test suite. 80 | 81 | Let's code CKA tests. We will turn properties of CKA listed in the paper into tests. In `cka_step1.py`, we write: 82 | 83 | ```python 84 | from cka import cka 85 | import numpy as np 86 | 87 | def test_identity(): 88 | # Create a random matrix, check it is perfectly correlated with itself. 89 | X = np.random.randn(100, 2) 90 | assert cka(X, X) == 1.0 91 | ``` 92 | 93 | Great! Now we can run our test suite with pytest: 94 | 95 | ```console 96 | (cb) ~/Documents/codebook_examples/cka$ pytest . 97 | ============================= test session starts ============================== 98 | platform linux -- Python 3.8.11, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 99 | rootdir: /home/pmin/Documents/codebook_examples/cka 100 | plugins: anyio-3.3.0 101 | collected 1 item 102 | 103 | test_cka.py F [100%] 104 | 105 | =================================== FAILURES =================================== 106 | ________________________________ test_identity _________________________________ 107 | 108 | def test_identity(): 109 | # Create a random matrix, check it is perfectly correlated with itself. 110 | X = np.random.randn(100, 2) 111 | > assert cka(X, X) == 1.0 112 | E assert 0.9999999999999994 == 1.0 113 | ``` 114 | 115 | Here we've run into one of the tricky bits about writing numerical code: numerical instability. 1.0 is very close to 0.9999999999999994, but it's not exactly equal. We can replace our test with a more lenient one. 
Numpy's `np.testing.assert_allclose` can test that two arrays are close enough to each other entry-wise: 116 | 117 | ```python 118 | def test_identity_lenient(): 119 | # Create a random matrix, check it is perfectly correlated with itself. 120 | X = np.random.randn(100, 2) 121 | np.testing.assert_allclose(cka_start(X, X), 1.0) 122 | ``` 123 | 124 | And now we find the tests pass. Let's add one more test to the mix: a matrix and itself are perfectly correlated, _regardless of the order of their columns_. We can make a new test for that. 125 | 126 | ```python 127 | def test_column_swaps(): 128 | # A matrix is perfectly correlated with itself even with column swaps. 129 | X = np.random.randn(100, 2) 130 | c = cka_start(X[:, [0, 1]], X[:, [1, 0]]) 131 | np.testing.assert_allclose(c, 1.0) 132 | ``` 133 | 134 | And now the tests pass: 135 | 136 | ```console 137 | (cb) ~/Documents/codebook_examples/cka$ pytest . 138 | ============================= test session starts ============================== 139 | platform linux -- Python 3.8.11, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 140 | rootdir: /home/pmin/Documents/codebook_examples/cka 141 | plugins: anyio-3.3.0 142 | collected 2 items 143 | 144 | test_cka_step1.py .. [100%] 145 | ``` 146 | 147 | ## Checking centering 148 | 149 | Now let's add another test, to verify that our CKA implementation if centering correctly. It shouldn't matter how columns are centered, so we can add an offset and verify that we obtain the same result: 150 | 151 | ```python 152 | def test_centering(): 153 | # Check that a matrix is perfectly correlated with itself even with adding 154 | # column offsets 155 | X = np.random.randn(100, 2) 156 | Xp = X.copy() 157 | Xp[:, 1] += 1.0 158 | 159 | c = cka(X, Xp) 160 | np.testing.assert_allclose(c, 1.0) 161 | ``` 162 | 163 | Run it in pytest---it works! That means we did the centering correctly. Indeed, we correctly removed `X.mean(axis=0)` from `X` and `Y.mean(axis=0)` from `Y`. 
But wait a minute---when we center in our function, do we change the original matrix? We can add a test to check that: 164 | 165 | ````console 166 | (cb) ~/Documents/codebook_examples/cka$ pytest . 167 | ============================= test session starts ============================== 168 | platform linux -- Python 3.8.11, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 169 | rootdir: /home/pmin/Documents/codebook_examples/cka 170 | plugins: anyio-3.3.0 171 | collected 4 items 172 | 173 | test_cka_step1.py ...F [100%] 174 | 175 | =================================== FAILURES =================================== 176 | __________________________________ test_pure ___________________________________ 177 | 178 | def test_pure(): 179 | # Check that a function doesn't change the original matrices 180 | X = np.random.randn(100, 2) 181 | Xp = X.copy() 182 | Xp[:, 1] += 1.0 183 | 184 | Xp_original = Xp.copy() 185 | c = cka(X, Xp) 186 | > np.testing.assert_allclose(Xp_original[:, 1], Xp[:, 1]) 187 | E AssertionError: 188 | E Not equal to tolerance rtol=1e-07, atol=0 189 | ```` 190 | 191 | We see that this function modifies its argument. If you scroll back up to the `cka` definition, you can see that we used the `-=` in-place assignment operator. This caused the original matrix to change. If this tripped you up: don't worry! I was very confused by this as well. This line changes the original array: 192 | 193 | ```python 194 | X -= X.mean(axis=0) 195 | ``` 196 | 197 | But this line returns a copy of the matrix: 198 | 199 | ```python 200 | X = X - X.mean(axis=0) 201 | ``` 202 | 203 | Who knew! This kind of subtle semantic difference can really trip you up. We can clarify the intent of the code using `copy()` to identicate that we don't want to change the original array: this way, the function's intent is very clear. 
In `cka_step2.py`, we write the function a different way: 204 | 205 | ```python 206 | import numpy as np 207 | 208 | def cka(X, Y): 209 | # Implements linear CKA as in Kornblith et al. (2019) 210 | X = X.copy() 211 | Y = Y.copy() 212 | 213 | # Center X and Y 214 | X -= X.mean(axis=0) 215 | Y -= Y.mean(axis=0) 216 | 217 | # Calculate CKA 218 | XTX = X.T.dot(X) 219 | YTY = Y.T.dot(Y) 220 | YTX = Y.T.dot(X) 221 | 222 | return (YTX ** 2).sum() / np.sqrt((XTX ** 2).sum() * (YTY ** 2).sum()) 223 | ``` 224 | 225 | Now we copy our old tests into `test_cka_step2.py`, and the issue is fixed: 226 | 227 | ``` 228 | (cb) ~/Documents/codebook_examples/cka$ pytest test_cka_step2.py 229 | ============================= test session starts ============================== 230 | platform linux -- Python 3.8.11, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 231 | rootdir: /home/pmin/Documents/codebook_examples/cka 232 | plugins: anyio-3.3.0 233 | collected 4 items 234 | 235 | test_cka_step2.py .... [100%] 236 | ``` 237 | 238 | ## More properties 239 | 240 | CKA has several more properties which we can test. Many of these are listed in the CKA paper: 241 | 242 | - the CKA is the square of the Pearson's correlation when $X$ and $Y$ are vectors 243 | - the CKA is insensitive to rotations 244 | - the CKA is insensitive to scaling the entire matrix 245 | - the CKA is sensitive to scaling different columns differently 246 | 247 | Here, it becomes useful to create a few helper functions to generate sample signals: matrices made of sinusoids of different frequencies in each column. We'll remove our reliance on random data: it's generally good practice to have _deterministic_ tests. Non-deterministic tests that sometimes work and sometimes don't are _flaky_, and they can be a pain. 
Let's put it all together: 248 | 249 | ```python 250 | from cka_step2 import cka 251 | import numpy as np 252 | import pytest 253 | 254 | def _get_one(): 255 | X = np.cos(.1 * np.pi * np.arange(10)).reshape((-1, 1)) 256 | Y = np.cos(2 + .07 * np.pi * np.arange(10)).reshape((-1, 1)) 257 | return X, Y 258 | 259 | def _get_multi(): 260 | X = (np.cos(.1 * np.pi * np.arange(10).reshape((-1, 1)) * 261 | np.linspace(.5, 1.5, num=3).reshape((1, -1)))) 262 | Y = (np.cos(.5 + .07 * np.pi * np.arange(10).reshape((-1, 1)) * 263 | np.linspace(.7, 1.3, num=4).reshape((1, -1)))) 264 | return X, Y 265 | 266 | def test_identity_lenient(): 267 | """Create a random matrix, check it is perfectly correlated with itself.""" 268 | X, _ = _get_multi() 269 | np.testing.assert_allclose(cka(X, X), 1.0) 270 | 271 | def test_column_swaps(): 272 | """A matrix is perfectly correlated with itself even with column swaps.""" 273 | X, _ = _get_multi() 274 | c = cka(X[:, [0, 1]], X[:, [1, 0]]) 275 | np.testing.assert_allclose(c, 1.0) 276 | 277 | def test_centering(): 278 | """A matrix is perfectly correlated with itself with column offsets.""" 279 | X, _ = _get_multi() 280 | Xp = X.copy() 281 | Xp[:, 1] += 1.0 282 | 283 | c = cka(X, Xp) 284 | np.testing.assert_allclose(c, 1.0) 285 | 286 | def test_pure(): 287 | """Check that a function doesn't change the original matrices.""" 288 | X, _ = _get_multi() 289 | Xp = X.copy() 290 | Xp[:, 1] += 1.0 291 | 292 | Xp_original = Xp.copy() 293 | c = cka(X, Xp) 294 | np.testing.assert_allclose(Xp_original[:, 1], Xp[:, 1]) 295 | 296 | def test_corr(): 297 | """The CKA of two vectors is the square of the correlation coefficient""" 298 | X, Y = _get_one() 299 | c1 = cka(X, Y) 300 | c2 = np.corrcoef(X.squeeze(), Y.squeeze())[0, 1] ** 2 301 | np.testing.assert_allclose(c1, c2) 302 | 303 | def test_isoscaling(): 304 | """CKA is insensitive to scaling by a scalar""" 305 | X, Y = _get_multi() 306 | c1 = cka(X, Y) 307 | c2 = cka(2.0 * X, - 1 * Y) 308 | 
np.testing.assert_allclose(c1, c2) 309 | 310 | def test_rotation(): 311 | """CKA is insensitive to rotations""" 312 | X, Y = _get_multi() 313 | X0 = X[:, :2] 314 | X0p = X0 @ np.array([[1, -1], [1, 1]]) / np.sqrt(2) 315 | c1 = cka(X0, Y) 316 | c2 = cka(X0p, Y) 317 | np.testing.assert_allclose(c1, c2) 318 | 319 | def test_no_iso(): 320 | """CKA is sensitive to column scaling""" 321 | X, Y = _get_multi() 322 | X0 = X[:, :2] 323 | X0p = X0 @ np.array([[1, 1], [10, 1]]) 324 | c1 = cka(X0, Y) 325 | c2 = cka(X0p, Y) 326 | assert abs(c1 - c2) > .001 327 | ``` 328 | 329 | It's starting to get quite long! For numeric code, it's not unusual that the test code should be several times longer than the code it is testing. Indeed, when you introduce a new numerical method, you might spend days testing it on different inputs to check that it gives reasonable outputs. Testing takes this common practice and formalizes it. Now, you can rest assured that the code works as intended. 330 | 331 | ## Dealing with wide matrices 332 | 333 | Before we add more features to the code, it's important to make sure that what is already there is correct. It's all too easy to build in a vacuum, and we are left debugging a giant chunk of code. 334 | 335 | In our case, a nice feature we might want is the ability to deal with wide matrices. The implementation we have works well for tall, skinny matrices. However, neural nets are generally over-parametrized and frequently have big intermediate representations. The paper introduces another method to compute the CKA with these wide matrices that is far more memory-efficient. We can change our implementation to deal with these larger matrices efficiently---and of course, add more tests to make sure we didn't mess up anything! Tests are what allow us to move with confidence. [Take a look at the final version of the code](https://github.com/patrickmineault/codebook_examples/tree/main/cka) to see how we can test that the code works as expected. 
336 | 337 | ## Final thoughts 338 | 339 | We've gone through a complex example of testing numerical code. We built infrastructure to test the code, found a gnarly bug, corrected it, and continued to build a large test suite. We were then able to expand our code to deal with new conditions. In the end, we could be confident that our code is correct. 340 | -------------------------------------------------------------------------------- /docs/pipelines.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Document your project" 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/pipelines.tex 8 | --- 9 | 10 | # Document your project 11 | 12 | Research code is written in spurts and fits. You will often put code aside for several months and focus your energy on experiments, passing qualifying exams or working on another project. When you come back to your original project, you will be lost. A little prep work by writing docs will help you preserve your knowledge over the long run. In the previous chapter, I showed how to document small units of your code. In this section, I talk about how to document entire projects. 13 | 14 | ## Document pipelines 15 | 16 | It's a common practice to use graphical tools (GUIs) to perform analyses. It also happens more often than most people are willing to admit that different variants of a pipeline are run by commenting and un-commenting code. Both of these practices make it hard to reproduce a result 6 months down the line. What was run, and when? 17 | 18 | One approach is to textually document in detail what piece of code was run to obtain results. This method can be tedious and error-prone. It's usually worth it to push as much computation as possible into reproducible pipelines which are self-documenting. That way, there's no ambiguity about how results were produced. 
19 | 20 | Manual steps involving GUI tools should produce results which can be ingested by text-based pipelines. For instance, a interactive GUI to define regions-of-interest (ROI) should export the ROI coordinates in a way that the pipeline can ingest it. 21 | 22 | ### Write console programs 23 | 24 | Instead of commenting and un-commenting code, we can have different code paths execute depending on flags passed as command line arguments. Console programs can be written through the `argparse` library, which is part of the Python standard library, or through external libraries like `click`. As a side benefit, these libraries document the intent of the flags and generate help for them. 25 | 26 | Let's say you create a command line program `train_net.py` that trains a neural net. This training script has four parameters you could change: model type, number of iterations, input directory, output directory. Rather than changing these variables in the source code, you can pass them as command line arguments. Here's how you can write that: 27 | 28 | ```{margin} 29 | [Real training scripts for neural nets can take dozens of parameters](https://github.com/patrickmineault/your-head-is-there-to-move-you-around/blob/main/train_net.py#L677). 30 | ``` 31 | 32 | ```python 33 | import argparse 34 | 35 | def main(args): 36 | # TODO(pmin): Implement a neural net here. 37 | print(args.model) # Prints the model type. 
38 | 39 | if __name__ == '__main__': 40 | parser = argparse.ArgumentParser(description="Train a neural net") 41 | 42 | parser.add_argument("--model", required=True, 43 | help="Model type (resnet or alexnet)") 44 | parser.add_argument("--niter", type=int, default=1000, 45 | help="Number of iterations") 46 | parser.add_argument("--in_dir", required=True, 47 | help="Input directory with images") 48 | parser.add_argument("--out_dir", required=True, 49 | help="Output directory with trained model") 50 | 51 | args = parser.parse_args() 52 | main(args) 53 | ``` 54 | 55 | A nice side benefit of using `argparse` is that it automatically generates help at the command line. 56 | 57 | ```shell 58 | $ python train_net.py -h 59 | usage: train_net.py [-h] --model MODEL [--niter NITER] --in_dir IN_DIR --out_dir OUT_DIR 60 | 61 | Train a neural net 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | --model MODEL Model type (resnet or alexnet) 66 | --niter NITER Number of iterations 67 | --in_dir IN_DIR Input directory with images 68 | --out_dir OUT_DIR Output directory with trained model 69 | ``` 70 | 71 | External flags thus allow you to run different versions of the same script in a standardized way. 72 | 73 | ### Commit shell files 74 | 75 | Once you've refactored your code to take configuration as command line flags, you should record the flags that you used when you invoke your code. You can do this using a _shell file_. A shell file contains multiple shell commands that are run one after the other. 76 | 77 | Consider a long-running pipeline involving `train_net.py`. This pipeline starts with downloading images from the internet stored in an AWS S3 bucket; trains a neural net; then generates plots. We can document this pipeline with a shell file. 
In `pipeline.sh`, we have: 78 | 79 | ```shell 80 | #!/bin/bash 81 | 82 | # This will cause bash to stop executing the script if there's an error 83 | set -e 84 | 85 | # Download files 86 | aws s3 cp s3://codebook-testbucket/images/ data/images --recursive 87 | 88 | # Train network 89 | python scripts/train_net.py --model resnet --niter 100000 --in_dir data/images \ 90 | --out_dir results/trained_model 91 | 92 | # Create output directory 93 | mkdir results/figures/ 94 | 95 | # Generate plots 96 | python scripts/generate_plots.py --in_dir data/images \ 97 | --out_dir results/figures/ --ckpt results/trained_model/model.ckpt 98 | ``` 99 | 100 | This shell file serves both as runnable code and as documentation for the pipeline. Now we know how our figure was generated! Don't forget to check in this shell file to git to have a record of this file. 101 | 102 | ```{danger} 103 | Bash is quirky: the syntax is awkward and it's pretty easy to shoot yourself in the foot. If you're going to write elaborate shell scripts, use [`shellcheck`](https://github.com/koalaman/shellcheck), which will point out common mistakes in your code. Your favorite editor probably has a plugin for `shellcheck`. 104 | 105 | Also, check out [Julia Evans' zine](https://wizardzines.com/zines/bite-size-command-line/) on bash. It's a life saver. 106 | ``` 107 | 108 | ### Document pipelines with make 109 | 110 | Scientific pipelines often take the shape of DAGs---directed acyclic graphs. This means, essentially, that programs flow from inputs to output in a directed fashion. The `train_net` pipeline above is a DAG with 4 steps. Other DAGs can be more elaborate, for example the 12-step DAG which generates figures shown in Van Vliet (2020) [^vanvliet]: 111 | 112 | [^vanvliet]: Van Vliet (2020). [Seven quick tips for analysis scripts in neuroimaging](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007358). PLOS Computational Biology. 
113 | 114 | ```{figure} figures/pcbi.1007358.g002.PNG_L.png 115 | --- 116 | figclass: boxed 117 | width: 90% 118 | --- 119 | DAG from Van Vliet (2020). CC-BY 4.0 120 | ``` 121 | 122 | You can document how all the steps of the DAG fit together with a bash file. As your pipelines grow, it can be painful to restart a pipeline that fails in the middle. This could lead to a proliferation of bash files corresponding to different subparts of your pipeline, and pretty soon you'll have a mess of shell scripts on your hands: basically recreating the commenting/uncommenting workflow we had in Python, this time in bash! 123 | 124 | You can use a more specialized build tool to build and document a pipeline. GNU `make` has been a standard tool for compiling code for several decades, and is becoming more adopted by the data science and research communities. A `Makefile` specifies both the inputs to each pipeline step and its outputs. To give you a flavor of what a `Makefile` looks like, this file implements the DAG to train a neural net and plot diagnostic plots: 125 | 126 | ```makefile 127 | .PHONY: plot 128 | plot: results/trained_model/model.ckpt results/figures 129 | python scripts/generate_plots.py --in_dir data/images --out_dir \ 130 | results/figures/ --ckpt results/trained_model/model.ckpt 131 | 132 | results/trained_model/model.ckpt: data/images 133 | python scripts/train_net.py --model resnet --niter 100000 \ 134 | --in_dir data/images --out_dir results/trained_model 135 | 136 | data/images: 137 | aws s3 cp s3://codebook-testbucket/images/ data/images --recursive 138 | 139 | results/figures: 140 | mkdir results/figures/ 141 | ``` 142 | 143 | ```{margin} 144 | `make` can be pretty finicky. For instance, you must use _tabs_, not spaces, to indent. Use an editor like vscode to spot issues early. 145 | ``` 146 | 147 | The plot can be created with `make plot`. 
The `Makefile` contains a complete description of the inputs and outputs to different scripts, and thus serves as a self-documenting artifact. [Software carpentry](https://swcarpentry.github.io/make-novice/) has an excellent tutorial on `make`. What's more, `make` only rebuilds what needs to be rebuilt. In particular, if the network is already trained, `make` will detect it and won't retrain the network again, skipping ahead to the plotting task. 148 | 149 | `make` uses a domain-specific language (DSL) to define a DAG. It might feel daunting to learn yet another language to document a pipeline. There are numerous alternatives to `make` that define DAGs in pure Python, including [`doit`](https://pydoit.org/). There are also more complex tools that can implement Python DAGs and run them in the cloud, including [`luigi`](https://github.com/spotify/luigi), [`airflow`](https://airflow.apache.org/), and [`dask`](https://docs.dask.org/en/latest/custom-graphs.html). 150 | 151 | ### Record the provenance of each figure and table 152 | 153 | Before you submit a manuscript, you should create a canonical version of the pipeline used to generate the figures and tables in the paper, and re-run the pipeline from scratch. That way, there will be no ambiguity as to the provenance of a figure in the final paper: it was generated by the canonical version of the pipeline. 154 | 155 | However, before you get to this final state, it's all too easy to lose track of which result was generated by which version of the pipeline. It is extremely frustrating when your results change and you can't figure out why. If you check in figures and results to source control as you generate them, in theory you have access to a time machine. However, there's information embedded in figures and tables about the state of the code _when the figures were generated_, only about the state of the code _when the figures were committed_, and there can be a significant lag between the two. 
156 | 157 | A lightweight workaround is to record the git hash with each result. The git hash is a long string of random digits corresponding to a git commit. You can see these hashes using `git log`: 158 | 159 | ```console 160 | $ git log 161 | commit 3b0c0665465a8ea4cd862058e107b76041acae0f (HEAD -> main, origin/main) 162 | Author: Patrick Mineault 163 | Date: Wed Sep 8 13:34:31 2021 -0400 164 | 165 | Clean up setup instructions 166 | 167 | commit 70deb0e7c9bffe4cdc73d813df8115d99606601c 168 | Author: Patrick Mineault 169 | Date: Wed Sep 8 00:41:10 2021 -0400 170 | ``` 171 | 172 | Here, `3b0c06...` is the git hash of my latest commit. You can read the current git hash using the `gitpython` library and append it to the name of file. For example, instead of recording `figure.png`, I can record `figure.3b0c066546.png` using: 173 | 174 | ```python 175 | import git 176 | import matplotlib.pyplot as plt 177 | 178 | repo = git.Repo(search_parent_directories=True) 179 | short_hash = repo.head.object.hexsha[:10] 180 | 181 | # Plotting code goes here... 182 | plt.savefig(f'figure.{short_hash}.png') 183 | ``` 184 | 185 | Now there's no ambiguity about how that figure was generated. You can repeat the same process with csv files and other results. 186 | 187 | ```{margin} 188 | Most services in this space are closed-source, cloud-based commercial services available for free to researchers. [Wandb](https://wandb.ai/) and [Neptune](https://neptune.ai/) record machine learning results. [Gigantum](https://gigantum.com) keeps a rich log of jupyter notebook executions. `datalad` is libre software that can [record and document dataset manipulations](http://docs.datalad.org/en/stable/generated/man/datalad-run.html). 189 | ``` 190 | 191 | A more full-featured way of doing this is to use a specialized tool to post results to a centralized database. Most of the offerings in this space are cloud-based commercial services. 
You post your results---whether scalars, whole tables, figures, etc.---to a centralized server through a bit of python code, and it takes care of versioning. In the screenshot below, I used [wandb.ai](https://wandb.ai/) to record the outcome of a machine learning pipeline. The record tells me what command that was run, the git hash, meta-information about which computer was used to run the pipeline, as well as the outcome. There is no ambiguity about provenance. 192 | 193 | ```{figure} figures/wandb.png 194 | --- 195 | width: 600px 196 | figclass: boxed 197 | --- 198 | Record of one deep-learning run in wandb.ai 199 | ``` 200 | 201 | ## Document projects 202 | 203 | In addition to documenting pipelines, it's important to write proper textual documentation for your project. The secret is that once you've written good unit tests and have documented your pipelines, you won't have a lot of text docs to write. 204 | 205 | ### Write a `README.md` file 206 | 207 | `README.md` is the often first entry point to your code that you and others will see. This is the file that's rendered when you navigate to your github repository. What are some of the elements in a good README? 208 | 209 | - A one-sentence description of your project 210 | - A longer description of your project 211 | - Installation instructions 212 | - General orientation to the codebase and usage instructions 213 | - Links to papers 214 | - Links to extended docs 215 | - License 216 | 217 | Importantly, keep your `README.md` up-to-date. A good `README.md` file helps strangers understand the value of your code: it's as important as a paper's abstract. 218 | 219 | ### Write Markdown docs 220 | 221 | ```{margin} 222 | Different environments support slightly different variants of Markdown: Remarkable, CommonMark, pandoc and MyST. This book is written in MyST Markdown. 223 | ``` 224 | 225 | Markdown has taken over the world of technical writing. 
Using the same format everywhere creates tremendous opportunities, so I highly recommend that you write your documentation in Markdown. With the same text, you can generate: 226 | 227 | - _Digital notes_. [Notion](https://notion.so/), [notable](https://notable.app/), [Obsidian](https://obsidian.md/), [HackMD](https://hackmd.io/), GitHub. 228 | - _Blogs_. [Wordpress](https://wordpress.com), [Jekyll](https://jekyllrb.com/) 229 | - _Wikis_. GitHub. 230 | - _Static sites_. [Jekyll](https://jekyllrb.com/), [eleventy](https://11ty.dev/), GitHub Pages 231 | - _Executable books_. [jupyterbook](https://jupyterbook.org/) generates this book. 232 | - _Papers_. [CurveNote](https://curvenote.com/) uses MyST Markdown under the hood. 233 | - _Slide decks_. [Pandoc](https://pandoc.org/) via conversion to LaTeX/Beamer. 234 | - _readthedocs-style documentation_. [Sphinx](https://www.sphinx-doc.org/en/master/) using MyST. 235 | 236 | The same Markdown can be deployed in different environments depending on what exactly you want to accomplish. For some projects, the `README.md` file will be all that is needed. Others will want a static site that shows highlights of the paper. Yet other projects will be well-served by blog posts which discuss in longer form the tradeoffs involved in the design decisions. 237 | 238 | Creating your documentation in Markdown you make it really easy for you to eventually migrate to another format. I tend to use a combination of all these tools. For instance, I write notes on papers in Notion; I then export those notes to markdown as stubs for my Wordpress blog, for pandoc slides or for jupyterbook. 239 | 240 | # Discussion 241 | 242 | ```{epigraph} 243 | Some things are in our control and others not. [...] If [a thing] concerns anything not in your control, be prepared to say that it is nothing to you. 244 | 245 | ---Epictetus 246 | ``` 247 | 248 | Documentation contains the long-term memory of a project. 
When you're writing documentation, you're ensuring that your code will remain useful for a long time. Code should be self-documenting to a large degree, and you should put effort into automating most of the steps involved in generating figures and tables in your paper. However, you will still need to write some textual documentation. 249 | 250 | You can take that occasion to reflect on your project. Sometimes, you'll find that it's more productive to rewrite bad code than to write complex explanations for it. At other times, especially at the very end of a project, refactoring will not be worth the effort, and you will have to let things go. As part of the documentation, you can write about how you could improve on your project. You can use a framing device device like _three lessons I learned in this project_. Learn from your mistakes and do better next time. 251 | 252 | ```{admonition} 5-minute exercise 253 | Add a README.md file to a project you're working on right now 254 | ``` 255 | -------------------------------------------------------------------------------- /docs/zipf.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A sample project: Zipf's law" 3 | exports: 4 | - format: tex 5 | logo: false 6 | template: ../templates/plain_latex_book_chapter 7 | output: exports/zipf.tex 8 | --- 9 | 10 | # A sample project: Zipf's law 11 | 12 | Let's look at how we can use the suggested organization in a real project. We use the example of calculating Zipf's law for a series of English texts, which was suggested in the book [Research Software Engineering in Python](https://merely-useful.tech/py-rse/), and was released under a CC-BY license. [You can see the completed project on Github](https://github.com/patrickmineault/zipf/) at `github.com/patrickmineault/zipf`. 13 | 14 | ## What is Zipf's law, anyway? 15 | 16 | Zipf's law comes from quantitative linguistics. 
It states that the most used word in a language is used twice as much as the second most used word, three times as much as the third, etc.
There's also a bit of overhead in writing both a script and module code for each subcomponent. 43 | 44 | ```{margin} 45 | The amount of cruft for command line interfaces could be reduced significantly using the [`click`](https://palletsprojects.com/p/click/) library. 46 | ``` 47 | 48 | _Approach 2_. We could make each box a separate function, held in a module. Then, we would glue these functions together with a Python script. This Python script would have its own command line arguments, which we could set in a bash script. One merit of the approach is that it has less overhead---fewer files and cruft---than the first approach. 49 | 50 | The tradeoff between these two approaches lies in the balance between generality and complexity. The first is a bit more flexible than the second. We can't run a single component of our analysis separately from the command line: we'll need to implement tests instead, which feel a little less intuitive. If we wanted to run an analysis of tens of thousands of books, parallelizing would also be easier with the first approach (e.g. [with `make`](https://www.gnu.org/software/make/manual/html_node/Parallel.html)). 51 | 52 | ```{margin} 53 | Different people---and sometimes the same person at different points in time---can disagree on the very best approach for a particular problem. In the end, what matters more is that the process through which the code was deliberate. If you put some thought into the organization: you're 90% of the way there. 54 | ``` 55 | 56 | For this example, however, I have a slight preference for the second approach, so that's the one we will implement. Keeping the number of command line tools to create to one means we'll worry less about cruft and more about the computations. Let's take a look at the DAG again to see how we'll split the job: 57 | 58 | ```{figure} figures/zipf-diagram-coded.svg 59 | --- 60 | figclass: boxed 61 | width: 100% 62 | --- 63 | The Zipf project split into different sub-components. 
64 | ``` 65 | 66 | We'll create one module to compute the distribution of words, and another to fit Zipf's law. We will create tests for each of those two modules. We'll wrap the two modules as well as glue code inside a command line tool. Plotting will take place in jupyter notebook, which makes it easy to change plots interactively. 67 | 68 | I encourage you to also have a look at the Python RSE github repo for an alternative implementation to examine the tradeoffs in different implementations. 69 | 70 | ```{note} 71 | Most of the code is taken verbatim from the [Py-RSE repo](https://github.com/merely-useful/py-rse/tree/book/zipf). Our emphasis here is on organizing project files. 72 | ``` 73 | 74 | ## Setup 75 | 76 | Let's proceed with setting up the project. `zipf` is a reasonable project name, so let's go ahead and create a new project with that name: 77 | 78 | ``` 79 | (base) ~/Documents $ cookiecutter gh:patrickmineault/true-neutral-cookiecutter 80 | ``` 81 | 82 | Use `zipf`, `zipf`, and `zipf` as the answers to the first 3 questions, and "Zipf's law project" as the description. Then proceed to create an environment and save it: 83 | 84 | ```bash 85 | cd zipf 86 | conda create --name zipf python=3.8 87 | conda activate zipf 88 | conda env export > environment.yml 89 | ``` 90 | 91 | Then we can sync to a Github remote. My favorite way of doing this is to use the GUI in vscode, which saves me from going to Github to create the remote _and_ then type on the command line to sync locally. We can fire up vscode using: 92 | 93 | ```console 94 | code . 95 | ``` 96 | 97 | Then, in the git panel, we hit "Publish to Github" to locally set up git and create the Github remote in one shot. 98 | 99 | Now we have a good looking project skeleton! We can set up `black` in vscode so that whenever a file is saved, it is formatted in a standard way. Time to add some code to it. 
100 | 101 | ```{note} 102 | If you prefer, you can instead go through github.com to create a new repo and follow the command line instructions there. 103 | ``` 104 | 105 | ## Download the texts 106 | 107 | We want to download three texts and put them in the `data` folder. Ideally, we'd do this automatically, for example with a bash file with calls to `wget`. However, the terms and conditions from Project Gutenberg state: 108 | 109 | ```{epigraph} 110 | The website is intended for human users only. Any perceived use of automated tools to access the Project Gutenberg website will result in a temporary or permanent block of your IP address. 111 | 112 | ---[Project Gutenberg](https://www.gutenberg.org/policy/robot_access.html) 113 | ``` 114 | 115 | ```{caution} 116 | Document manual steps in README.md! 117 | ``` 118 | 119 | Not a big deal! We can download the files manually and document the source in the `README.md` file. Let's download the following files manually and put them in the data folder: 120 | 121 | - [Dracula](https://www.gutenberg.org/files/345/345-0.txt) $\rightarrow$ `data/dracula.txt` 122 | - [Frankenstein](https://www.gutenberg.org/ebooks/42324.txt.utf-8) $\rightarrow$ `data/frankenstein.txt` 123 | - [Jane Eyre](https://www.gutenberg.org/files/1260/1260-0.txt) $\rightarrow$ `data/jane_eyre.txt` 124 | 125 | ## Count the words 126 | 127 | The next step is to calculate word counts for each text. We will create a module called `parse_text` in the `zipf` folder. It will contain a function `count_words` that takes in a text string and outputs a set of counts. This function will itself call a helper function `_clean_gutenberg_text`. Note that the function starts with an underscore to indicate that it is meant to be a private function used internally by the module. 128 | 129 | `_clean_gutenberg_text` will explicitly filter out boilerplate from project Gutenberg texts. 
There's a significant amount of boilerplate at the start of e-books and license information at the end, which might skew the word count distribution. Thankfully, there are phrases in the e-books which delimit the main text, which we can detect: 130 | 131 | - START OF THE PROJECT GUTENBERG EBOOK 132 | - END OF THE PROJECT GUTENBERG EBOOK 133 | 134 | ```{margin} 135 | [Ensuring data quality is huge chunk of the workload of data scientists](https://blog.ldodds.com/2020/01/31/do-data-scientists-spend-80-of-their-time-cleaning-data-turns-out-no/). 136 | ``` 137 | 138 | Because we only have three books in our collection, we can manually test that each book is processed correctly. However, if we ever process a fourth, unusually formatted text, and our detection didn't work, we want our pipeline to fail in a graceful way. Thus, we add in-line `assert` statements to make sure our filter finds the two delimiters in reasonable locations in the text. 139 | 140 | ```python 141 | def _clean_gutenberg_text(text): 142 | """ 143 | Find fences in a Gutenberg text and select the text between them. 144 | """ 145 | start_fence = "start of the project gutenberg ebook" 146 | end_fence = "end of the project gutenberg ebook" 147 | text = text.lower() 148 | start_pos = text.find(start_fence) + len(start_fence) + 1 149 | end_pos = text.find(end_fence) 150 | 151 | # Check that the fences are at reasonable positions within the text. 152 | assert 0.000001 < start_pos / len(text) <= 0.1 153 | assert 0.9 < end_pos / len(text) <= 1.0 154 | 155 | return text[start_pos:end_pos] 156 | ``` 157 | 158 | Now we can use call this function in another wrapper function that counts words ike so: 159 | 160 | ```python 161 | import string 162 | import collections 163 | 164 | 165 | def count_words(f, clean_text=False): 166 | """ 167 | Count words in a file. 
168 | 169 | Arguments: 170 | f: an open file handle 171 | clean_text (optional): a Boolean, if true, filters out boilerplate 172 | typical of a Gutenberg book. 173 | 174 | Returns: 175 | A dict keyed by word, with word counts 176 | """ 177 | text = f.read() 178 | if clean_text: 179 | text = _clean_gutenberg_text(text) 180 | 181 | chunks = text.split() 182 | npunc = [word.strip(string.punctuation) for word in chunks] 183 | word_list = [word.lower() for word in npunc if word] 184 | word_counts = collections.Counter(word_list) 185 | return dict(word_counts) 186 | ``` 187 | 188 | To test that the `count_words` function works as intended, we make a sample txt file in the `tests` folder which contains dummy data: 189 | 190 | ```text 191 | The 192 | 193 | *** START OF THE PROJECT GUTENBERG EBOOK *** 194 | OF 195 | TO to 196 | I i i 197 | and and and and 198 | THE The THE the thE 199 | [...] 200 | *** END OF THE PROJECT GUTENBERG EBOOK *** 201 | 202 | 203 | OF 204 | ``` 205 | 206 | If our function works correctly, it should ignore all the text outside of the start and end fences. It should return: 207 | 208 | ``` 209 | {'the': 5, 'and': 4, 'i': 3, 'to': 2, 'of': 1} 210 | ``` 211 | 212 | We set up a test function in `test_parse_text.py` that loads this test text and verifies that the word counts are correctly measured. The test can be run with: 213 | 214 | ```console 215 | $ pytest tests/test_parse_text.py 216 | ``` 217 | 218 | [The resulting module](https://github.com/patrickmineault/zipf/blob/master/zipf/parse_text.py) and [test](https://github.com/patrickmineault/zipf/blob/master/tests/test_parse_text.py) can be viewed on Github. 219 | 220 | ## Calculate Zipf's law 221 | 222 | Once we have word counts, we can calculate Zipf's law based on word counts. We use the method proposed in [Moreno-Sanchez et al. (2016)](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0147073#sec004), which is implemented in the book Research Software Engineering with Python. 
The method calculates the maximum likelihood $\alpha$ parameter for the empirical distribution of ranks. It starts with $\alpha = 1$ and refines the estimate through gradient descent. We implement this method in [`fit_distribution.py`](https://github.com/patrickmineault/zipf/blob/master/zipf/fit_distribution.py). 223 | 224 | It's quite easy to make a mistake in calculating the exponent by incorrectly implementing the error function. To protect ourselves against this, in our test suite, we generate data from a known distribution with $\alpha = 1$ and verify that the correct exponent is estimated. We put this test, [`test_fit_distribution.py`](https://github.com/patrickmineault/zipf/blob/master/tests/test_fit_distribution.py) under the tests folder. 225 | 226 | ## Write the command line interface 227 | 228 | We are now ready to write glue code to run our pipeline from the command line. We create a script, `run_analysis.py`, under the scripts folder. Our script takes in two different command line arguments, or flags: 229 | 230 | - `in_folder`: the input folder containing txt files. 231 | - `out_folder`: where we will output the results 232 | 233 | ```{margin} 234 | Keep your command line programs to less than 20 flags. Beyond that, they become a pain to maintain. 235 | ``` 236 | 237 | We create this command line interface using `argparse`. This looks like this: 238 | 239 | ```python 240 | if __name__ == "__main__": 241 | parser = argparse.ArgumentParser(description="Compute zipf distribution") 242 | parser.add_argument("--in_folder", help="the input folder") 243 | parser.add_argument("--out_folder", help="the output folder") 244 | args = parser.parse_args() 245 | 246 | main(args) 247 | ``` 248 | 249 | The `main` function first verifies that there are files to be processed and creates the output folder if necessary. To manipulate paths, it uses the `pathlib` module. The function then runs each of the stages of the pipeline in turn until completion. 
We don't write tests for this part of the code, as we've lifted the error-prone aspects of the code elsewhere.
269 | 270 | When I started this project, I was surprised by how many macro- and micro-decisions were needed to be made about how a pipeline works and how it's organized. A little bit of thinking ahead can avoid days of headaches later on. In this case, we chose a lightweight structure with one entry script calling module functions. These module functions are short and most of them are pure functions. Pure functions are easier to test. Indeed, we wrote unit tests to check that these functions worked as intended. That way, once the code was written and tests passed, it was crystal clear that the code worked as intended. We separated module code from glue code and plotting code. The resulting code is decoupled and easily maintainable. 271 | 272 | Writing code in this organized way is effortful. Over the long term, this methodical approach is more efficient, and more importantly, it's less stressful. 273 | 274 | ```{admonition} 5-minute exercise 275 | How would you change this pipeline so that it computes error bars for the parameters of the Zipf distribution through bootstrapping? Would you need to change existing functions? What function would you need to add? *If you're feeling adventurous, go ahead and implement it! It's definitely more than a 5-minute project!* 276 | ``` 277 | -------------------------------------------------------------------------------- /docs/testing.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | kernelspec: 8 | display_name: Python 3 9 | language: python 10 | name: python3 11 | title: "Testing your code" 12 | exports: 13 | - format: tex 14 | logo: false 15 | template: ../templates/plain_latex_book_chapter 16 | output: exports/testing.tex 17 | --- 18 | 19 | (test)= 20 | 21 | # Test your code 22 | 23 | ```{epigraph} 24 | Most scientists who write software constantly test their code. 
That is, if you are a scientist writing software, I am sure that you have tried to see how well your code works by running every new function you write, examining the inputs and the outputs of the function, to see if the code runs properly (without error), and to see whether the results make sense. Automated code testing takes this informal practice, makes it formal, and automates it, so that you can make sure that your code does what it is supposed to do, even as you go about making changes around it. 25 | 26 | ---[Ariel Rokem](https://github.com/uwescience/shablona) 27 | ``` 28 | 29 | Automated testing is one of the most powerful techniques that professional programmers use to make code robust. Having never used testing until I went to industry, it changed the way I write code for the better. 30 | 31 | ## Testing to maintain your sanity 32 | 33 | When you run an experiment and the results of the analysis don't make sense, you will go through a process of eliminating one potential cause after the other. You will investigate several hypotheses, including: 34 | 35 | - the data is bad 36 | - you're loading the data incorrectly 37 | - your model is incorrectly implemented 38 | - your model is inappropriate for the data 39 | - the statistical test you used is inappropriate for the data distribution 40 | 41 | Testing can help you maintain your sanity by decreasing the surface of things that might be wrong with your experiment. Good code yells loudly when something goes wrong. Imagine that you had an experimental setup that alerted you when you had a ground loop, or that would sound off when you use the wrong reagent, or that would text you when it's about to overheat: how many hours or days would you save? 42 | 43 | ## Unit testing by example 44 | 45 | Unit testing is the practice of testing a _unit_ of code, typically a single function. The easiest way to understand what that means is to illustrate it with a specific example. 
The Fibonacci sequence is defined as: 46 | 47 | $$F(x) \equiv F(x-1) + F(x-2)$$ 48 | $$F(0) \equiv 0 $$ 49 | $$F(1) \equiv 1 $$ 50 | 51 | [The first few items in the Fibonacci sequence](https://oeis.org/A000045) are: 52 | 53 | $$F = 0, 1, 1, 2, 3, 5, 8, 13, 21, \ldots$$ 54 | 55 | Let's write up a naive implementation of this. 56 | 57 | ```{code-cell} 58 | def fib(x): 59 | if x <= 2: 60 | return 1 61 | else: 62 | return fib(x - 1) + fib(x - 2) 63 | ``` 64 | 65 | Let's say that a colleague brings you this code and asks you to check that the code they've written up works. How would check whether this code works? 66 | 67 | ````{dropdown} Spoilers 68 | You could run this code on the command line with different inputs and check that the code works as expected. For instance, you expect that: 69 | 70 | ```pycon 71 | >>> fib(0) 72 | 0 73 | >>> fib(1) 74 | 1 75 | >>> fib(2) 76 | 1 77 | >>> fib(6) 78 | 8 79 | >>> fib(40) 80 | 102334155 81 | ``` 82 | 83 | You could also run the code with bad inputs, to check whether the code returns meaningful errors. For example, the sequence is undefined for negative numbers or non-integers. 84 | ```` 85 | 86 | Informal testing can be done in an interactive computing environment, like the `ipython` REPL or a jupyter notebook. Run the code, check the output, repeat until the code works right---it's a workflow you've probably used as well. 87 | 88 | ### Lightweight formal tests with `assert` 89 | 90 | One issue with informal tests is that they often have a short shelf life. Once the code is written and informal testing is over, you don't have a record of that testing. You might even discard the tests you wrote in jupyter! We can make our tests stick with `assert`. 91 | 92 | `assert` is a special statement in Python that throws an error whenever the statement is false. 
For instance, 93 | 94 | ``` 95 | >>> assert 1 == 0 96 | Traceback (most recent call last): 97 | File "", line 1, in 98 | AssertionError 99 | ``` 100 | 101 | Notice that there are no parentheses between `assert` and the statement. `assert` is great for inline tests, for example checking whether the shape or a matrix is as expected after permuting its indices. 102 | 103 | We can also assemble multiple assert operations to create a lightweight test suite. You can hide your asserts behind an `__name__ == '__main__'` statement, so that they will only run when you directly run a file. Let's write some tests in `fib.py`: 104 | 105 | ``` 106 | def fib(x): 107 | if x <= 2: 108 | return 1 109 | else: 110 | return fib(x - 1) + fib(x - 2) 111 | 112 | if __name__ == '__main__': 113 | assert fib(0) == 0 114 | assert fib(1) == 1 115 | assert fib(2) == 1 116 | assert fib(6) == 8 117 | assert fib(40) == 102334155 118 | print("Tests passed") 119 | ``` 120 | 121 | Now we can run the tests from the command line: 122 | 123 | ```console 124 | $ python fib.py 125 | Traceback (most recent call last): 126 | File "fib.py", line 8, in 127 | assert fib(0) == 0 128 | AssertionError 129 | ``` 130 | 131 | We see our test suite fail immediately for `fib(0)`. We can fix up the boundary conditions of the code, and run the code again. We repeat this process until all our tests pass. Let's look at the fixed up code: 132 | 133 | ``` 134 | def fib(x): 135 | if x == 0: 136 | return 0 137 | if x == 1: 138 | return 1 139 | else: 140 | return fib(x - 1) + fib(x - 2) 141 | 142 | if __name__ == '__main__': 143 | assert fib(0) == 0 144 | assert fib(1) == 1 145 | assert fib(2) == 1 146 | assert fib(6) == 8 147 | assert fib(40) == 102334155 148 | print("Tests passed") 149 | ``` 150 | 151 | While the first few tests pass, the last one hangs for a long time. What's going on here? 
152 | 153 | ### Refactoring with confidence with tests 154 | 155 | Our `fib(N)` function hangs for a large value of `N` because it spawns a lot of repeated computation. `fib(N)` calls both `fib(N-1)` and `fib(N-2)`. In turn, `fib(N-1)` calls `fib` twice, and so on and so forth. Therefore, the time complexity of this function scales exponentially as $O(2^N)$: it's very slow. 156 | 157 | We can re-implement this function so that it keeps a record of previously computed values. One straightforward way of doing this is with a global cache. **We keep our previously implemented tests**, and rewrite the function: 158 | 159 | ``` 160 | cache = {} 161 | def fib(x): 162 | global cache 163 | if x in cache: 164 | return cache[x] 165 | if x == 0: 166 | return 0 167 | elif x == 1: 168 | return 1 169 | else: 170 | val = fib(x - 1) + fib(x - 2) 171 | cache[x] = val 172 | return val 173 | 174 | if __name__ == '__main__': 175 | assert fib(0) == 0 176 | assert fib(1) == 1 177 | assert fib(2) == 1 178 | assert fib(6) == 8 179 | assert fib(40) == 102334155 180 | print("Tests passed") 181 | ``` 182 | 183 | Running this new and improved script, we see: 184 | 185 | ```console 186 | $ python fib.py 187 | Tests passed 188 | ``` 189 | 190 | Hurray! We can be confident that our code works as expected. What if we want to refactor our code so that it doesn't use globals? 
Not a problem, we keep the tests around, and we rewrite the code to use an inner function: 191 | 192 | ``` 193 | def fib(x): 194 | cache = {} 195 | def fib_inner(x): 196 | nonlocal cache 197 | if x in cache: 198 | return cache[x] 199 | if x == 0: 200 | return 0 201 | elif x == 1: 202 | return 1 203 | else: 204 | val = fib_inner(x - 1) + fib_inner(x - 2) 205 | cache[x] = val 206 | return val 207 | return fib_inner(x) 208 | 209 | if __name__ == '__main__': 210 | assert fib(0) == 0 211 | assert fib(1) == 1 212 | assert fib(2) == 1 213 | assert fib(6) == 8 214 | assert fib(40) == 102334155 215 | print("Tests passed") 216 | ``` 217 | 218 | Running the module again, our tests still pass! Testing helps us refactor with confidence because we can immediately tell whether we've introduced new bugs in our code. 219 | 220 | ### Testing pure functions 221 | 222 | With pure functions, such as `fib`, we can readily come up with ways to test whether the code works or not. We can check: 223 | 224 | - _Correctness for typical inputs_, e.g. $F(5) = 5$ 225 | - _Edge cases_, e.g. $F(0) = 0$ 226 | - _Errors_ with bad input, e.g. $F(-1)$ $\rightarrow$ _error_ 227 | - _Functional goals are achieved_, e.g. that the function works for large numbers 228 | 229 | Pure functions don't require elaborate setups to test properly, and indeed they have some of the highest _bang for your buck_ when it comes to testing. If in your current workflow, you would have manually checked whether a procedure yielded reasonable results, write a test for it. 230 | 231 | ```{tip} 232 | If something caused a bug, write a test for it. 70% of bugs are old bugs that keep reappearing. 233 | ``` 234 | 235 | ### Testing with a test suite 236 | 237 | Testing with `assert` hidden behind `__name__ == '__main__'` works great for small-scale testing. However, once you have a lot of tests, it starts to make sense to group them into a _test suite_ and run them with a _test runner_. 
There are two main frameworks to run unit tests in Python, `pytest` and `unittest`. `pytest` is the more popular of the two, so I'll cover that here. 238 | 239 | To install pytest on your system, first run: 240 | 241 | ```python 242 | pip install -U pytest 243 | ``` 244 | 245 | Writing a test suite for pytest is a matter of taking our previous unit tests and putting them in a separate file, wrapping them in functions which start with `test_`. In `tests/test_fib.py`, we write: 246 | 247 | ``` 248 | from src.fib import fib 249 | import pytest 250 | 251 | def test_typical(): 252 | assert fib(1) == 1 253 | assert fib(2) == 1 254 | assert fib(6) == 8 255 | assert fib(40) == 102334155 256 | 257 | def test_edge_case(): 258 | assert fib(0) == 0 259 | 260 | def test_raises(): 261 | with pytest.raises(NotImplementedError): 262 | fib(-1) 263 | 264 | with pytest.raises(NotImplementedError): 265 | fib(1.5) 266 | ``` 267 | 268 | Notice that pytest primarily relies on the `assert` statement to do the heavy lifting. `pytest` also offers extra functionality to deal with special test cases. `pytest.raises` creates a context manager to verify that a function raises an expected exception. 269 | 270 | Running the `pytest` utility from the command line, we find: 271 | 272 | ```console 273 | $ pytest test_fib.py 274 | ... 275 | def fib_inner(x): 276 | nonlocal cache 277 | if x in cache: 278 | return cache[x] 279 | > if x == 0: 280 | E RecursionError: maximum recursion depth exceeded in comparison 281 | 282 | ../src/fib.py:7: RecursionError 283 | ============================= short test summary info ========================== 284 | FAILED test_fib.py::test_raises - RecursionError: maximum recursion depth exceed 285 | =========================== 1 failed, 2 passed in 1.18s ======================== 286 | ``` 287 | 288 | Notice how informative the output of pytest is compared to our homegrown test suite. 
`pytest` informs us that two of our tests passed---`test_typical` and `test_edge_case`---while the last one failed. Calling our `fib` function with a negative argument or a non-integer argument will make the function call itself recursively with negative numbers - it never stops! Hence, Python eventually will generate a `RecursionError`. However, our tests are expecting a `NotImplementedError` instead! Our test correctly detected that the code has this odd behavior. We can fix it up like so: 289 | 290 | ```{code-cell} 291 | def fib(x): 292 | if x % 1 != 0 or x < 0: 293 | raise NotImplementedError('fib only defined on non-negative integers.') 294 | cache = {} 295 | def fib_inner(x): 296 | nonlocal cache 297 | if x in cache: 298 | return cache[x] 299 | if x == 0: 300 | return 0 301 | elif x == 1: 302 | return 1 303 | else: 304 | val = fib_inner(x - 1) + fib_inner(x - 2) 305 | cache[x] = val 306 | return val 307 | return fib_inner(x) 308 | ``` 309 | 310 | Now we can run tests again. 311 | 312 | ```console 313 | $ pytest test_fib.py 314 | =============================== test session starts ============================ 315 | platform linux -- Python 3.8.8, pytest-6.2.4, py-1.10.0, pluggy-0.13.1 316 | rootdir: /home/pmin/Documents/codebook 317 | plugins: anyio-3.1.0 318 | collected 3 items 319 | 320 | test_fib.py ... [100%] 321 | 322 | ================================ 3 passed in 0.02s ============================= 323 | ``` 324 | 325 | They pass! 326 | 327 | ## Testing non-pure functions and classes 328 | 329 | I claimed earlier that _pure functions_ are the easiest to test. Let's see what we need to do to test non-pure functions. For a _nondeterministic_ function, you can usually give the random seed or random variables needed by the function as arguments, turning the nondeterministic function into a deterministic one. 
For a _stateful_ function, we need to additionally test that: 330 | 331 | - _Postconditions are met_, that is, the internal state of the function or object is changed in the expected way by the code 332 | 333 | Classes are stateful, so we'll need to inspect their state after calling methods on them to make sure they work as expected. For example, consider this Chronometer class: 334 | 335 | ```{code-cell} 336 | import time 337 | 338 | class Chronometer: 339 | def start(self): 340 | self.t0 = time.time() 341 | 342 | def stop(self): 343 | return time.time() - self.t0 344 | ``` 345 | 346 | We might want to check that the `t0` variable is indeed set by the `start` method. 347 | 348 | For a function with _I/O side effects_, we'll need to do a little extra work to verify that it works. We might need to create mock files to check whether inputs are read properly and outputs are as expected. `io.StringIO` and the `tempfile` module can help you create these mock objects. For instance, suppose we have a function `file_to_upper` that takes in an input and an output filename, and turns every letter into an uppercase: 349 | 350 | ```{code-cell} 351 | def file_to_upper(in_file, out_file): 352 | fout = open(out_file, 'w') 353 | with open(in_file, 'r') as f: 354 | for line in f: 355 | fout.write(line.upper()) 356 | fout.close() 357 | ``` 358 | 359 | Writing a test for this is a little tortured: 360 | 361 | ```{code-cell} 362 | import tempfile 363 | import os 364 | 365 | def test_upper(): 366 | in_file = tempfile.NamedTemporaryFile(delete=False, mode='w') 367 | out_file = tempfile.NamedTemporaryFile(delete=False) 368 | out_file.close() 369 | in_file.write("test123\nthetest") 370 | in_file.close() 371 | file_to_upper(in_file.name, out_file.name) 372 | with open(out_file.name, 'r') as f: 373 | data = f.read() 374 | assert data == "TEST123\nTHETEST" 375 | os.unlink(in_file.name) 376 | os.unlink(out_file.name) 377 | ``` 378 | 379 | With remote calls and persistent storage, testing can 
rapidly become quite complex. 380 | 381 | ## A hierarchy of tests 382 | 383 | We've been focused so far on _unit tests_. However, there are many different kinds of tests that people use. 384 | 385 | - _Static tests_: your editor parses and runs your code as you write it to figure out if it will crash 386 | - _Inline asserts_: test whether intermediate computations are as expected 387 | - _Unit tests_: test whether one function or unit of code works as expected 388 | - _Docstring tests_: unit tests embedded in docstrings 389 | - _Integration tests_: test whether multiple functions work correctly together 390 | - _Smoke tests_: test whether a large piece of code crashes at an intermediate stage 391 | - _Regression tests_: tests whether your code is producing the same outputs that it used to in previous versions 392 | - _End-to-end tests_: literally a robot clicking buttons to figure out if your application works as expected 393 | 394 | The point is not to overwhelm you with the possibilities, but to give you a glossary of testing so you know what to look for when you're ready to dig deeper. 395 | 396 | ## Write lots of tiny unit tests 397 | 398 | My proposal to you is modest: 399 | 400 | 1. Isolate numeric code. 401 | 2. Make numeric functions pure if practical. 402 | 3. Write tests for the numeric code 403 | 4. Write tests for the critical IO code 404 | 405 | You're going to get a lot of bang for your buck by writing unit tests - inline asserts and regression tests are also high payoff-to-effort. Aim for each unit test to run in 1 ms. The faster each test runs, the better for your working memory. More than 5 seconds and you'll be tempted to check your phone. 406 | 407 | What do you think is the ideal ratio of test code to real code? 408 | 409 | ```{dropdown} Spoilers 410 | There's no ideal number per say, but 1:1 to 3:1 is a commonly quoted range for library code. For one-off code, you can usually get away with less test coverage. 
For more down-to-earth applications, 80% test coverage is a common target. [You can use the `Coverage.py` package to figure out your test coverage](https://coverage.readthedocs.io/en/coverage-5.3.1/). 411 | ``` 412 | 413 | ## Now you're playing with power 414 | 415 | Testing is the key to refactor with confidence. Let's say that your code looks ugly, and you feel like it's time to refactor. 416 | 417 | 1. Lock in the current behavior of your code with regression tests 418 | 1. Check that the tests pass 419 | 1. Rewrite the code to be tidy 420 | 1. Correct the code 421 | 1. Iterate until tests pass again 422 | 423 | You can call `pytest` with a specific filename to run one test suite. For a larger refactor, you can run all the tests in the current directory with: 424 | 425 | ``` 426 | $ pytest . 427 | ``` 428 | 429 | If you want, you can even integrate this workflow into github by running tests every time you push a commit! This is what's called _continuous integration_. It's probably overkill for a small-scale project, but know that it exists. 430 | 431 | ## Discussion 432 | 433 | Writing tests is not part of common scientific practice yet, but I think it deserves a higher place in scientific programming education. 434 | 435 | Testing allows you to decrease the uncertainty surface of your code. With the right tests, you can convince yourself that parts of your code are _correct_, and that allows you to concentrate your debugging efforts. Keeping that uncertainty out of your head saves your working memory, and debugging will be faster and more efficient. At the same time, code with tests is less stressful to refactor, so you will be able to continuously improve your code so that it doesn't slide towards an unmanageable mess of spaghetti. 436 | 437 | Testing is not an all-or-none proposition: you can start writing lightweight inline tests in your code today. Find a commented out `print` statement in your code. Can you figure out how to replace it with an `assert`? 
438 | 439 | ```{admonition} 5-minute exercise 440 | Find a commented out `print` statement in your code and transform it into an `assert`. 441 | ``` 442 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Setting up your project 3 | authors: 4 | - Patrick Mineault 5 | keywords: 6 | - code 7 | - academia 8 | exports: 9 | - format: tex 10 | template: ../templates/plain_latex_book_chapter 11 | output: exports/setup.tex 12 | --- 13 | 14 | # Set up your project 15 | 16 | Setting up an organized project will help you remain productive as your project grows. The broad steps involved are: 17 | 18 | 1. Pick a name and create a folder for your project 19 | 2. Initialize a git repository and sync to Github 20 | 3. Set up a virtual environment 21 | 4. Create a project skeleton 22 | 5. Install a project package 23 | 24 | The end result will be a logically organized project skeleton that's synced to version control. 25 | 26 | ```{warning} 27 | I will present most of the project setup in the terminal, but you can do many of these steps inside of an IDE or file explorer. 28 | ``` 29 | 30 | ## Pick a name and create a folder for your project 31 | 32 | When you start a project, you will need to decide how to structure it. As an academic, a project will tend to naturally map to a paper. Therefore, **one project = one paper = one folder = one git repository** is generally a good default structure. 33 | 34 | ```{margin} 35 | You might want to create extra standalone projects for tools you re-use across different papers. 36 | ``` 37 | 38 | Pick a short and descriptive name for your project and create a folder in your Documents folder. 
For instance, when I created the project for this book, the first step was to create the `codebook` folder: 39 | 40 | ```console 41 | ~/Documents$ mkdir codebook 42 | ``` 43 | 44 | ## Initialize a git repository and sync to Github 45 | 46 | Since git is such a core tool to manage code-heavy projects, I recommend that you set it up immediately. The way I prefer to do this is by going to [Github](https://github.com) and clicking the big green **New** button to create a new repository. I name the remote the same as my local folder and hit **Create Repository**. 47 | 48 | ```{figure} figures/github-repo.png 49 | --- 50 | width: 313px 51 | --- 52 | The big green New button. 53 | ``` 54 | 55 | I then follow Github's instructions to initialize the repo. In `~/Documents/codebook`, I run: 56 | 57 | ```{margin} 58 | I've never attempted to remember these commands. I always copy and paste. 59 | ``` 60 | 61 | ```{code} 62 | echo "# codebook" >> README.md 63 | git init 64 | git add README.md 65 | git commit -m "first commit" 66 | git branch -M main 67 | git remote add origin https://github.com/patrickmineault/codebook.git 68 | git push -u origin main 69 | ``` 70 | 71 | How often do you think you should commit to git? 72 | 73 | ```{dropdown} Spoilers 74 | Depending on your pace, you should aim to commit your code from *a few times a day* to *a few times per week*. Don't wait until the project is almost finished before you start to commit. 75 | ``` 76 | 77 | The general rule of thumb is that one commit should represent a unit of related work. For example, if you made changes in 3 files to add a new functionality, that should be _one_ commit. Splitting the commit into 3 would lose the relationship between the changes; combining these changes with 100 other changed files would make it very hard to track down what changed. Try to make your git commit messages meaningful, as it will help you track down bugs several months down the line. 
78 | 79 | If you don't use git very often, you might not like the idea of committing to git daily or multiple times per day. The git command line can feel like a formidable adversary; GUIs can ease you into it. I used to use the git command line exclusively. These days, I tend to prefer [the git panel in VSCode](vscode). 80 | 81 | ```{figure} figures/git-vscode.png 82 | --- 83 | width: 250px 84 | --- 85 | The git panel in VSCode. 86 | ``` 87 | 88 | ## Set up a virtual environment 89 | 90 | ```{epigraph} 91 | Why do I use virtual Python environments? So I don't fuck up all my local shit. 92 | 93 | ---[Nick Wan](https://twitter.com/nickwan) 94 | ``` 95 | 96 | ```{figure} figures/python_environment_2x.png 97 | --- 98 | width: 492px 99 | --- 100 | Python environments can be a real pain. From [xkcd.com](https://xkcd.com/1987/) by Randall Munroe. 101 | ``` 102 | 103 | Many novices starting out in Python use one big monolithic Python environment. Every package is installed in that one environment. The problem is that this environment is not documented anywhere. Hence, if they need to move to another computer, or they need to recreate the environment from scratch several months later, they're in for several hours or days of frustration. 104 | 105 | The solution is to use a _virtual environment_ to manage dependencies. Each virtual environment specifies which versions of software and packages a project uses. The specs can be different for different projects, and each virtual environment can be easily swapped, created, duplicated or destroyed. You can use software like `conda`, `pipenv`, `poetry`, `venv`, `virtualenv`, `asdf` or `docker`---among others---to manage dependencies. Which one you prefer is a matter of personal taste and [countless internet feuds](https://twitter.com/patrickmineault/status/1429560804869873664?s=20). Here I present the `conda` workflow, which is particularly popular among data scientists and researchers. 
106 | 107 | ### Conda 108 | 109 | Conda is the _de facto_ standard package manager for data science-centric Python. `conda` is both a package manager (something that installs package on your system) and a virtual environment manager (something that can swap out different combinations of packages and binaries---virtual environments---easily). 110 | 111 | [Once conda is installed](https://docs.conda.io/en/latest/miniconda.html)---for instance, through miniconda---you can create a new environment and activate it like so: 112 | 113 | ```console 114 | ~/Documents/codebook$ conda create --name codebook python=3.8 115 | ~/Documents/codebook$ conda activate codebook 116 | ``` 117 | 118 | From this point on, you can install packages through the conda installer like so: 119 | 120 | ```console 121 | (codebook) ~/Documents/codebook$ conda install pandas numpy scipy matplotlib seaborn 122 | ``` 123 | 124 | Now, you might ask yourself, can I use both pip and conda together? 125 | 126 | ```{dropdown} Spoilers 127 | 128 | **You can use pip inside of a conda environment**. A big point of confusion is how conda relates to `pip`. For conda: 129 | 130 | * Conda is both a package manager and a virtual environment manager 131 | * Conda can install big, complicated-to-install, non-Python software, like `gcc` 132 | * Not all Python packages can be installed through conda 133 | 134 | For pip: 135 | 136 | * pip is just a package manager 137 | * pip only installs Python packages 138 | * pip can install every package on PyPI in addition to local packages 139 | 140 | `conda` tracks which packages are pip installed and will include a special section in `environment.yml` for pip packages. [However, installing pip packages may negatively affect conda's ability to install conda packages correctly after the first pip install](https://www.anaconda.com/blog/using-pip-in-a-conda-environment). 
Therefore, people generally recommend installing **big conda packages first**, then installing **small pip packages second**. 141 | ``` 142 | 143 | ### Export your environment 144 | 145 | To export a list of dependencies so you can easily recreate your environment, use the `export env` command: 146 | 147 | ```console 148 | (codebook) ~/Documents/codebook$ conda env export > environment.yml 149 | ``` 150 | 151 | You can then commit `environment.yml` to document this environment. You can recreate this environment---when you move to a different computer, for example---using: 152 | 153 | ```console 154 | $ conda env create --name recoveredenv --file environment.yml 155 | ``` 156 | 157 | This `export` method will create a well-documented, perfectly _reproducible_ conda environment on your OS. However, it will document low-level, OS-specific packages, which means it won't be _portable_ to a different OS. If you need portability, you can instead write an `environment.yml` file manually. Here's an example file: 158 | 159 | ``` 160 | name: cb 161 | channels: 162 | - conda-forge 163 | - defaults 164 | dependencies: 165 | - python=3.8 166 | - numpy=1.21.2 167 | - pip 168 | - pip: 169 | - tqdm==4.62.3 170 | ``` 171 | 172 | `pip` and `conda` packages are documented separately. Note that `pip` package versions use `==` to identify the package number, while `conda` packages use `=`. If you need to add dependencies to your project, change the `environment.yml` file, then run this command to update your conda environment: 173 | 174 | ``` 175 | (cb) $ conda env update --prefix ./env --file environment.yml --prune 176 | ``` 177 | 178 | You can [read more about creating reproducible environments in the Carpentries tutorial on conda](https://carpentries-incubator.github.io/introduction-to-conda-for-data-scientists/04-sharing-environments/index.html). 
You can also [use the `environment.yml` file for this book's repo](https://github.com/patrickmineault/codebook/blob/main/environment.yml) as an inspiration. 179 | 180 | ## Create a project skeleton 181 | 182 | ```{margin} 183 | This project skeleton combines ideas from [shablona](https://github.com/uwescience/shablona) and [good enough practices in scientific computing](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005510). 184 | ``` 185 | 186 | Many different programming frameworks---Ruby on Rails, React, etc.---use a highly consistent directory structure from project to project, which makes it seamless to jump back into an old project. In Python, things are much less standardized. I went into a deep rabbit hole looking at different directory structures suggested by different projects. Here's a consensus structure you can use as inspiration: 187 | 188 | ```{code} 189 | |-- data 190 | |-- docs 191 | |-- results 192 | |-- scripts 193 | |-- src 194 | |-- tests 195 | -- .gitignore 196 | -- environment.yml 197 | -- README.md 198 | ``` 199 | 200 | Let's look at each of these components in turn. 201 | 202 | ### Folders 203 | 204 | - `data`: Where you put raw data for your project. You usually won't sync this to source control, unless you use very small, text-based datasets (< 10 MBs). 205 | - `docs`: Where you put documentation, including Markdown and reStructuredText (reST). Calling it `docs` makes it easy to publish documentation online through Github pages. 206 | - `results`: Where you put results, including checkpoints, hdf5 files, pickle files, as well as figures and tables. If these files are heavy, you won't put these under source control. 207 | - `scripts`: Where you put scripts---Python and bash alike---as well as .ipynb notebooks. 208 | - `src`: Where you put reusable Python modules for your project. This is the kind of python code that you `import`. 209 | - `tests`: Where you put tests for your code. 
We'll cover testing in a later lesson. 210 | 211 | You can create this project structure manually using `mkdir` on the command line: 212 | 213 | ```console 214 | $ mkdir {data,docs,results,scripts,src,tests} 215 | ``` 216 | 217 | ### Files 218 | 219 | - `.gitignore` contains a list of files that git should ignore. 220 | - `README.md` contains a description of your project, including installation instructions. This file is what people see by default when they navigate to your project on GitHub. 221 | - `environment.yml` contains the description of your conda environment. 222 | 223 | `.gitignore` can be initialized to the following: 224 | 225 | ``` 226 | *.egg-info 227 | data 228 | ``` 229 | 230 | A `README.md` should have already been created during the initial sync to Github. You can either create an `environment.yml` file manually or export an exhaustive list of the packages you are currently using: 231 | 232 | ```console 233 | $ conda env export > environment.yml 234 | ``` 235 | 236 | ## Install a project package 237 | 238 | ```{warning} 239 | Creating a project package is slightly annoying, but the payoff is quite substantial: your project structure will be clean, you won't need to change Python's path, and your project will be pip installable. 240 | ``` 241 | 242 | You might notice a flaw in the preceding project structure. Let's say you create a reusable `lib.py` under the `src` folder, with a function `my_very_good_function`. How would you reference that function in `scripts/use_lib.py`? This doesn't work: 243 | 244 | ```pycon 245 | >>> from ..src.lib import my_very_good_function 246 | Traceback (most recent call last): 247 | File "", line 1, in 248 | ImportError: attempted relative import with no known parent package 249 | ``` 250 | 251 | You need to tell Python where to look for your library code. You have two options, change your Python path, or create an installable package. 
I recommend the installable package route, but cover the Python path route first because you're likely to encounter it in other projects. 252 | 253 | ### Change your Python path (not recommended) 254 | 255 | You can put the `src` folder on your Python path. To do so, you can [append the `src` folder to the system variable PYTHONPATH when bash starts up (in `~/.bashrc`)](https://bic-berkeley.github.io/psych-214-fall-2016/using_pythonpath.html). You might alternatively dynamically append to the system path from Python, via: 256 | 257 | ```{code} 258 | import sys 259 | sys.path.append('/home/me/Documents/codebook/src') 260 | 261 | from src.lib import my_very_good_function 262 | ``` 263 | 264 | This pattern is also frequently used in jupyter notebooks---I often see it in code cells at the top of notebooks. 265 | 266 | The disadvantage of changing the path is that it tends to be pretty brittle. You have to hard-code the name of folders in multiple places. If they move, you will break your package. It won't work on another computer with different paths, so it will make it hard to share your project with colleagues. Furthermore, dynamic paths don't play well with IDEs like [vscode](vscode) that can only look in the static environment, so you won't get automatic code completion. 267 | 268 | ### Create a pip-installable package (recommended) 269 | 270 | This is a more scalable solution. [The packaging ecosystem in Python can feel frankly daunting](https://packaging.python.org/guides/), but a lot of it we don't need for our purposes. Creating a locally pip installable package actually only involves a few steps. 271 | 272 | #### 1. Create a `setup.py` file 273 | 274 | Create a `setup.py` file in the root of your project. Here's a minimal setup file: 275 | 276 | ``` 277 | from setuptools import find_packages, setup 278 | 279 | setup( 280 | name='src', 281 | packages=find_packages(), 282 | ) 283 | ``` 284 | 285 | #### 2. 
Create a `__init__.py` file 286 | 287 | Create an empty `__init__.py` file under the `src` directory. This will allow the `find_packages` function to find the package. 288 | 289 | ```console 290 | (codebook) ~/Documents/codebook $ touch src/__init__.py 291 | ``` 292 | 293 | Your files should now look like: 294 | 295 | ``` 296 | |-- data 297 | |-- doc 298 | |-- results 299 | |-- scripts 300 | |-- src 301 | |   -- __init__.py 302 | |-- tests 303 | -- .gitignore 304 | -- environment.yml 305 | -- README.md 306 | -- setup.py 307 | ``` 308 | 309 | #### 3. `pip install` your package 310 | 311 | Now comes the fun part, installing the package. You can do so using: 312 | 313 | ```console 314 | (codebook) ~/Documents/codebook $ pip install -e . 315 | ``` 316 | 317 | `.` indicates that we're installing the package in the current directory. `-e` indicates that the package should be editable. That means that if you change the files inside the `src` folder, you don't need to re-install the package for your changes to be picked up by Python. 318 | 319 | #### 4. Use the package 320 | 321 | Once the package is locally installed, it can be easily used _regardless of which directory you're in_. For instance: 322 | 323 | ```console 324 | (codebook) ~/Documents/codebook $ echo "print('hello world')" > src/helloworld.py 325 | (codebook) ~/Documents/codebook $ cd scripts 326 | (codebook) ~/Documents/codebook/scripts $ python 327 | >>> import src.helloworld 328 | hello world 329 | >>> exit() 330 | (codebook) ~/Documents/codebook/scripts $ cd ~ 331 | (codebook) ~ $ python 332 | >>> import src.helloworld 333 | hello world 334 | ``` 335 | 336 | How does this work? When you install a package in editable mode, Python essentially adds your code to its path. That makes it available from anywhere. The path is changed in such a way that `conda`, `vscode` and other tools are aware that your package is installed, so all these tools will know where to find your code. 
337 | 338 | ````{note} 339 | To find out where the code for an installed package is located, print the module info in the Python console: 340 | 341 | ```pycon 342 | >>> import src 343 | >>> src 344 | 345 | >>> import numpy as np 346 | >>> np 347 | 348 | ``` 349 | ```` 350 | 351 | #### 5. (optional) Change the name of the package 352 | 353 | Note that the name of the folder which contains the code, `src`, becomes the name of the package. If you'd like to rename the package, for example to `cb`, change the name of the folder: 354 | 355 | ``` 356 | (codebook) ~/Documents/codebook $ mv src cb 357 | ``` 358 | 359 | If Python doesn't pick up your changes for whatever reason, re-install your package like so: 360 | 361 | ``` 362 | (codebook) ~/Documents/codebook $ pip install -e . 363 | ``` 364 | 365 | ```{margin} 366 | `setuptools` knows which folder contains your package by looking for a `__init__.py` at the root of that folder. 367 | ``` 368 | 369 | ## Use the true-neutral cookiecutter 370 | 371 | If doing all this for every new project sounds like a lot of work, you can save yourself some time using the _true neutral_ cookiecutter, which creates the project skeleton outlined above automatically. `cookiecutter` is a Python tool which generates project folders from templates. You can install it in the base conda environment with: 372 | 373 | ``` 374 | (base) ~/Documents $ pip install cookiecutter 375 | ``` 376 | 377 | To create the `codebook` folder with all its subfolders and setup.py, use the following: 378 | 379 | ``` 380 | (base) ~/Documents $ cookiecutter gh:patrickmineault/true-neutral-cookiecutter 381 | ``` 382 | 383 | ```{margin} 384 | There are many other interesting cookiecutters. Check out the [data science cookiecutter](https://drivendata.github.io/cookiecutter-data-science/) for a more elaborate data science project template. 
385 | ``` 386 | 387 | This will create an instance of the `true-neutral-cookiecutter` project skeleton, which is hosted on my personal github. Follow the prompts and it will create the folder structure above, including the setup file. Next, pip install the package you've created for yourself, and sync to your own remote repository, following the github instructions. 388 | 389 | ## Discussion 390 | 391 | ```{margin} 392 | You can reorganize an existing project to align better with the guidelines here. **Make sure to back up everything!** 393 | ``` 394 | 395 | Using structured projects linked to git will help your long-term memory. You will be able to instantly understand how files are laid out months after you've last worked on that project. Using a virtual environment will allow you to recreate that environment in the far future. And git will give you a time machine to work with. 396 | 397 | Writing for your future self has an added bonus: it can make it easier for _other people_ to use your project. Consider this: everything at Google is in one giant repository with [billions of lines of code](https://cacm.acm.org/magazines/2016/7/204032-why-google-stores-billions-of-lines-of-code-in-a-single-repository/fulltext#FNE). As a new software engineer, you're invited to commit to that repository during your first week. Because everything is organized according to [strict conventions](https://github.com/google/styleguide/blob/gh-pages/pyguide.md), so it's not as _terrifying_ as it sounds to jump in. Structure is what enables sustainable growth. 398 | 399 | ```{admonition} 5-minute exercise 400 | Create an empty project with the true-neutral cookiecutter. 
401 | ``` 402 | -------------------------------------------------------------------------------- /docs/_static/tufte.css: -------------------------------------------------------------------------------- 1 | @charset "UTF-8"; 2 | 3 | /* Import ET Book styles 4 | adapted from https://github.com/edwardtufte/et-book/blob/gh-pages/et-book.css */ 5 | 6 | @font-face { 7 | font-family: "et-book"; 8 | src: url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot"); 9 | src: url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot?#iefix") 10 | format("embedded-opentype"), 11 | url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff") 12 | format("woff"), 13 | url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf") 14 | format("truetype"), 15 | url("et-book/et-book-roman-line-figures/et-book-roman-line-figures.svg#etbookromanosf") 16 | format("svg"); 17 | font-weight: normal; 18 | font-style: normal; 19 | font-display: swap; 20 | } 21 | 22 | @font-face { 23 | font-family: "et-book"; 24 | src: url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot"); 25 | src: url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot?#iefix") 26 | format("embedded-opentype"), 27 | url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff") 28 | format("woff"), 29 | url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf") 30 | format("truetype"), 31 | url("et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.svg#etbookromanosf") 32 | format("svg"); 33 | font-weight: normal; 34 | font-style: italic; 35 | font-display: swap; 36 | } 37 | 38 | @font-face { 39 | font-family: "et-book"; 40 | src: url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot"); 41 | src: 
url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot?#iefix") 42 | format("embedded-opentype"), 43 | url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff") 44 | format("woff"), 45 | url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf") 46 | format("truetype"), 47 | url("et-book/et-book-bold-line-figures/et-book-bold-line-figures.svg#etbookromanosf") 48 | format("svg"); 49 | font-weight: bold; 50 | font-style: normal; 51 | font-display: swap; 52 | } 53 | 54 | @font-face { 55 | font-family: "et-book-roman-old-style"; 56 | src: url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot"); 57 | src: url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot?#iefix") 58 | format("embedded-opentype"), 59 | url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff") 60 | format("woff"), 61 | url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf") 62 | format("truetype"), 63 | url("et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.svg#etbookromanosf") 64 | format("svg"); 65 | font-weight: normal; 66 | font-style: normal; 67 | font-display: swap; 68 | } 69 | 70 | /* Tufte CSS styles */ 71 | html { 72 | font-size: 15px; 73 | } 74 | 75 | body { 76 | width: 87.5%; 77 | margin-left: auto; 78 | margin-right: auto; 79 | padding-left: 12.5%; 80 | font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", 81 | "Book Antiqua", Georgia, serif; 82 | background-color: #fffff8; 83 | color: #111; 84 | max-width: 1400px; 85 | counter-reset: sidenote-counter; 86 | } 87 | 88 | /* Adds dark mode */ 89 | /* 90 | @media (prefers-color-scheme: dark) { 91 | body { 92 | background-color: #151515; 93 | color: #ddd; 94 | } 95 | } 96 | */ 97 | 98 | .heading-style, 99 | h1, 100 | h2, 101 | h3, 102 | h4, 103 | h5, 104 | h6 { 105 | font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", 106 | "Book Antiqua", 
Georgia, serif; 107 | } 108 | 109 | h1 { 110 | font-weight: 400; 111 | margin-top: 4rem; 112 | margin-bottom: 1.5rem; 113 | font-size: 3.2rem; 114 | line-height: 1; 115 | } 116 | 117 | h2 { 118 | font-style: italic; 119 | font-weight: 400; 120 | margin-top: 2.1rem; 121 | margin-bottom: 1.4rem; 122 | font-size: 2.2rem; 123 | line-height: 1; 124 | } 125 | 126 | h3 { 127 | font-style: italic; 128 | font-weight: 400; 129 | font-size: 1.7rem; 130 | margin-top: 2rem; 131 | margin-bottom: 1.4rem; 132 | line-height: 1; 133 | } 134 | 135 | hr { 136 | display: block; 137 | height: 1px; 138 | width: 55%; 139 | border: 0; 140 | border-top: 1px solid #ccc; 141 | margin: 1em 0; 142 | padding: 0; 143 | } 144 | 145 | p.subtitle { 146 | font-style: italic; 147 | margin-top: 1rem; 148 | margin-bottom: 1rem; 149 | font-size: 1.8rem; 150 | display: block; 151 | line-height: 1; 152 | } 153 | 154 | .numeral { 155 | font-family: et-book-roman-old-style; 156 | } 157 | 158 | .danger { 159 | color: red; 160 | } 161 | 162 | article { 163 | padding: 5rem 0rem; 164 | } 165 | 166 | section { 167 | padding-top: 1rem; 168 | padding-bottom: 1rem; 169 | } 170 | 171 | p, 172 | dl, 173 | ol, 174 | ul { 175 | font-size: 1.4rem; 176 | line-height: 2rem; 177 | } 178 | 179 | p { 180 | margin-top: 1.4rem; 181 | margin-bottom: 1.4rem; 182 | padding-right: 0; 183 | vertical-align: baseline; 184 | } 185 | 186 | /* Chapter Epigraphs */ 187 | div.epigraph { 188 | margin: 5em 0; 189 | } 190 | 191 | div.epigraph > blockquote { 192 | margin-top: 3em; 193 | margin-bottom: 3em; 194 | } 195 | 196 | div.epigraph > blockquote, 197 | div.epigraph > blockquote > p { 198 | font-style: italic; 199 | } 200 | 201 | div.epigraph > blockquote > footer { 202 | font-style: normal; 203 | } 204 | 205 | div.epigraph > blockquote > footer > cite { 206 | font-style: italic; 207 | } 208 | /* end chapter epigraphs styles */ 209 | 210 | blockquote { 211 | font-size: 1.4rem; 212 | } 213 | 214 | blockquote p { 215 | width: 55%; 216 | 
margin-right: 40px; 217 | } 218 | 219 | blockquote footer { 220 | width: 55%; 221 | font-size: 1.1rem; 222 | text-align: right; 223 | } 224 | 225 | section > p, 226 | section > footer, 227 | section > table { 228 | width: 55%; 229 | } 230 | 231 | /* 50 + 5 == 55, to be the same width as paragraph */ 232 | section > dl, 233 | section > ol, 234 | section > ul { 235 | width: 50%; 236 | -webkit-padding-start: 5%; 237 | } 238 | 239 | dt:not(:first-child), 240 | li:not(:first-child) { 241 | margin-top: 0.25rem; 242 | } 243 | 244 | figure { 245 | padding: 0; 246 | border: 0; 247 | font-size: 100%; 248 | font: inherit; 249 | vertical-align: baseline; 250 | max-width: 55%; 251 | -webkit-margin-start: 0; 252 | -webkit-margin-end: 0; 253 | margin: 0 0 3em 0; 254 | } 255 | 256 | figcaption { 257 | float: right; 258 | clear: right; 259 | margin-top: 0; 260 | margin-bottom: 0; 261 | font-size: 1.1rem; 262 | line-height: 1.6; 263 | vertical-align: baseline; 264 | position: relative; 265 | max-width: 40%; 266 | } 267 | 268 | figure.fullwidth figcaption { 269 | margin-right: 24%; 270 | } 271 | 272 | /* Links: replicate underline that clears descenders */ 273 | a:link, 274 | a:visited { 275 | color: inherit; 276 | } 277 | 278 | .no-tufte-underline:link { 279 | background: unset; 280 | text-shadow: unset; 281 | } 282 | 283 | #main-content a:link, 284 | .tufte-underline, 285 | .hover-tufte-underline:hover { 286 | text-decoration: none; 287 | background: -webkit-linear-gradient(#fffff8, #fffff8), 288 | -webkit-linear-gradient(#fffff8, #fffff8), 289 | -webkit-linear-gradient(currentColor, currentColor); 290 | background: linear-gradient(#fffff8, #fffff8), 291 | linear-gradient(#fffff8, #fffff8), 292 | linear-gradient(currentColor, currentColor); 293 | -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 294 | -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 295 | background-size: 0.05em 1px, 0.05em 1px, 1px 1px; 296 | background-repeat: no-repeat, no-repeat, repeat-x; 297 | 
text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 298 | 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, 299 | -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, 300 | -0.15em 0 #fffff8; 301 | background-position: 0% 93%, 100% 93%, 0% 93%; 302 | } 303 | 304 | @media screen and (-webkit-min-device-pixel-ratio: 0) { 305 | a:link, 306 | .tufte-underline, 307 | .hover-tufte-underline:hover { 308 | background-position-y: 87%, 87%, 87%; 309 | } 310 | } 311 | 312 | a:link::selection, 313 | a:link::-moz-selection { 314 | text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 315 | 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, 316 | -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, 317 | -0.15em 0 #b4d5fe; 318 | background: #b4d5fe; 319 | } 320 | 321 | /* Sidenotes, margin notes, figures, captions */ 322 | img { 323 | max-width: 100%; 324 | } 325 | 326 | .sidenote, 327 | .marginnote { 328 | float: right; 329 | clear: right; 330 | margin-right: -60%; 331 | width: 50%; 332 | margin-top: 0.3rem; 333 | margin-bottom: 0; 334 | font-size: 1.1rem; 335 | line-height: 1.3; 336 | vertical-align: baseline; 337 | position: relative; 338 | } 339 | 340 | .sidenote-number { 341 | counter-increment: sidenote-counter; 342 | } 343 | 344 | .sidenote-number:after, 345 | .sidenote:before { 346 | font-family: et-book-roman-old-style; 347 | position: relative; 348 | vertical-align: baseline; 349 | } 350 | 351 | .sidenote-number:after { 352 | content: counter(sidenote-counter); 353 | font-size: 1rem; 354 | top: -0.5rem; 355 | left: 0.1rem; 356 | } 357 | 358 | .sidenote:before { 359 | content: counter(sidenote-counter) " "; 360 | font-size: 1rem; 361 | top: -0.5rem; 362 | } 363 | 364 | blockquote .sidenote, 365 | blockquote .marginnote { 366 | margin-right: -82%; 367 | min-width: 59%; 368 | text-align: left; 369 | } 370 | 371 | div.fullwidth, 372 | table.fullwidth { 373 
| width: 100%; 374 | } 375 | 376 | div.table-wrapper { 377 | overflow-x: auto; 378 | font-family: "Trebuchet MS", "Gill Sans", "Gill Sans MT", sans-serif; 379 | } 380 | 381 | .sans { 382 | font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif; 383 | letter-spacing: 0.03em; 384 | } 385 | 386 | code, 387 | pre > code { 388 | font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace; 389 | font-size: 1rem; 390 | line-height: 1.42; 391 | -webkit-text-size-adjust: 100%; /* Prevent adjustments of font size after orientation changes in iOS. See https://github.com/edwardtufte/tufte-css/issues/81#issuecomment-261953409 */ 392 | } 393 | 394 | .sans > code { 395 | font-size: 1.2rem; 396 | } 397 | 398 | h1 > code, 399 | h2 > code, 400 | h3 > code { 401 | font-size: 0.8em; 402 | } 403 | 404 | .marginnote > code, 405 | .sidenote > code { 406 | font-size: 1rem; 407 | } 408 | 409 | pre > code { 410 | font-size: 0.9rem; 411 | width: 52.5%; 412 | margin-left: 2.5%; 413 | overflow-x: auto; 414 | display: block; 415 | } 416 | 417 | pre.fullwidth > code { 418 | width: 90%; 419 | } 420 | 421 | .fullwidth { 422 | max-width: 90%; 423 | clear: both; 424 | } 425 | 426 | span.newthought { 427 | font-variant: small-caps; 428 | font-size: 1.2em; 429 | } 430 | 431 | input.margin-toggle { 432 | display: none; 433 | } 434 | 435 | label.sidenote-number { 436 | display: inline-block; 437 | max-height: 2rem; /* should be less than or equal to paragraph line-height */ 438 | } 439 | 440 | label.margin-toggle:not(.sidenote-number) { 441 | display: none; 442 | } 443 | 444 | .iframe-wrapper { 445 | position: relative; 446 | padding-bottom: 56.25%; /* 16:9 */ 447 | padding-top: 25px; 448 | height: 0; 449 | } 450 | 451 | .iframe-wrapper iframe { 452 | position: absolute; 453 | top: 0; 454 | left: 0; 455 | width: 100%; 456 | height: 100%; 457 | } 458 | 459 | @media (max-width: 760px) { 460 | body { 461 | width: 84%; 462 | padding-left: 8%; 463 | padding-right: 8%; 464 | } 465 | 466 | hr, 
467 | section > p, 468 | section > footer, 469 | section > table { 470 | width: 100%; 471 | } 472 | 473 | pre > code { 474 | width: 97%; 475 | } 476 | 477 | section > dl, 478 | section > ol, 479 | section > ul { 480 | width: 90%; 481 | } 482 | 483 | figure { 484 | max-width: 90%; 485 | } 486 | 487 | figcaption, 488 | figure.fullwidth figcaption { 489 | margin-right: 0%; 490 | max-width: none; 491 | } 492 | 493 | blockquote { 494 | margin-left: 1.5em; 495 | margin-right: 0em; 496 | } 497 | 498 | blockquote p, 499 | blockquote footer { 500 | width: 100%; 501 | } 502 | 503 | label.margin-toggle:not(.sidenote-number) { 504 | display: inline; 505 | } 506 | 507 | .sidenote, 508 | .marginnote { 509 | display: none; 510 | } 511 | 512 | .margin-toggle:checked + .sidenote, 513 | .margin-toggle:checked + .marginnote { 514 | display: block; 515 | float: left; 516 | left: 1rem; 517 | clear: both; 518 | width: 95%; 519 | margin: 1rem 2.5%; 520 | vertical-align: baseline; 521 | position: relative; 522 | } 523 | 524 | label { 525 | cursor: pointer; 526 | } 527 | 528 | div.table-wrapper, 529 | table { 530 | width: 85%; 531 | } 532 | 533 | img { 534 | width: 100%; 535 | } 536 | } 537 | 538 | @media (min-width: 760px) { 539 | .fixed-top { 540 | z-index: 0; 541 | } 542 | } 543 | 544 | /* End tufte.css */ 545 | .dropdown-buttons-trigger, 546 | .full-screen-button { 547 | visibility: hidden; 548 | } 549 | 550 | #bd-toc-nav { 551 | background-color: transparent !important; 552 | } 553 | 554 | .bd-sidebar { 555 | border-right: 0px !important; 556 | } 557 | 558 | blockquote p { 559 | width: 100% !important; 560 | } 561 | 562 | body { 563 | width: 100% !important; 564 | padding-left: 0px !important; 565 | } 566 | 567 | .admonition-title { 568 | padding: 0.4rem 0.6rem 0.4rem 2.5rem !important; 569 | } 570 | 571 | code { 572 | font-size: 75% !important; 573 | } 574 | 575 | .admonition { 576 | background-color: white; 577 | } 578 | 579 | .boxed { 580 | padding: 12px; 581 | background-color: 
white; 582 | border: 1px solid black; 583 | border-radius: 2px; 584 | } 585 | 586 | h1 { 587 | margin-top: 0px !important; 588 | } 589 | 590 | summary { 591 | font-size: 125%; 592 | padding: 8px 10px !important; 593 | } 594 | 595 | .margin p { 596 | font-size: 1.2rem !important; 597 | } 598 | 599 | #site-title { 600 | display: none; 601 | } 602 | 603 | .navbar_extra_footer { 604 | display: none; 605 | } 606 | 607 | nav { 608 | font-size: 1.2rem; 609 | } 610 | 611 | .epigraph div::before { 612 | content: open-quote; 613 | position: absolute; 614 | left: 10px; 615 | font-size: 4rem; 616 | color: rgba(192 192 192); 617 | margin-top: -20px; 618 | quotes: "“" "”"; 619 | } 620 | 621 | .MathJax_Display { 622 | font-size: 1.2rem; 623 | } 624 | 625 | .bd-sidebar { 626 | position: relative; 627 | overflow-y: visible !important; 628 | } 629 | 630 | /* 631 | footer { 632 | visibility: hidden; 633 | } 634 | */ 635 | 636 | h4 { 637 | font-size: 1.5rem; 638 | font-weight: bold; 639 | } 640 | 641 | main.bd-content #main-content dl.simple dd:not(:last-child), 642 | main.bd-content #main-content dl.field-list dd:not(:last-child) { 643 | margin-top: -10px; 644 | margin-bottom: 20px; 645 | } 646 | 647 | :root { 648 | --pst-color-background: 255, 255, 247; 649 | } 650 | 651 | /* Adds dark mode */ 652 | @media (prefers-color-scheme: dark) { 653 | :root { 654 | --pst-color-primary: 221, 221, 221; 655 | --pst-color-base: var(--pst-color-primary); 656 | --pst-color-paragraph: var(--pst-color-primary); 657 | --pst-color-h1: var(--pst-color-primary); 658 | --pst-color-h2: var(--pst-color-primary); 659 | --pst-color-h3: var(--pst-color-text-base); 660 | --pst-color-h4: var(--pst-color-text-base); 661 | --pst-color-h5: var(--pst-color-text-base); 662 | --pst-color-h6: var(--pst-color-text-base); 663 | --pst-color-paragraph: var(--pst-color-primary); 664 | --pst-color-sidebar-link: var(--pst-color-primary); 665 | --pst-color-sidebar-caption: var(--pst-color-primary); 666 | --pst-color-link: 95, 
153, 191; 667 | --pst-color-admonition: 19, 6, 84; 668 | --pst-color-background: 32, 32, 32; 669 | } 670 | 671 | .search { 672 | color: rgba(var(--pst-color-primary)) !important; 673 | } 674 | 675 | main.bd-content #main-content h1, 676 | main.bd-content #main-content h2, 677 | main.bd-content #main-content h3, 678 | main.bd-content #main-content h4, 679 | main.bd-content #main-content h5, 680 | .extra_footer { 681 | color: rgba(var(--pst-color-primary)) !important; 682 | } 683 | 684 | #site-navigation nav ul.nav li a, 685 | #site-navigation nav ul.nav ul li a, 686 | .bd-toc div.onthispage, 687 | .bd-toc .toc-entry a { 688 | color: rgba(var(--pst-color-sidebar-link)); 689 | } 690 | 691 | nav.bd-links p.caption { 692 | color: rgba(var(--pst-color-sidebar-caption)); 693 | } 694 | 695 | #main-content a:link, 696 | .tufte-underline, 697 | .hover-tufte-underline:hover { 698 | text-shadow: 0.03em 0 #151515, -0.03em 0 #151515, 0 0.03em #151515, 699 | 0 -0.03em #151515, 0.06em 0 #151515, -0.06em 0 #151515, 0.09em 0 #151515, 700 | -0.09em 0 #151515, 0.12em 0 #151515, -0.12em 0 #151515, 0.15em 0 #151515, 701 | -0.15em 0 #151515; 702 | } 703 | 704 | .admonition .admonition-title, 705 | .admonition p, 706 | .card-text, 707 | .card-body li::marker, 708 | #main-content .card-body a:link, 709 | #main-content .admonition a:link { 710 | color: rgba(var(--pst-color-admonition)); 711 | text-shadow: none; 712 | } 713 | 714 | main .boxed .caption-text, 715 | main .boxed .caption-number { 716 | color: rgba(var(--pst-color-admonition)); 717 | } 718 | 719 | .MathJax_Display, 720 | li::marker, 721 | .brackets, 722 | main.bd-content #main-content .prev-next-bottom .right-next .prevnext-label, 723 | main.bd-content #main-content .prev-next-bottom .left-prev .prevnext-label, 724 | .margin .sidebar { 725 | color: rgba(var(--pst-color-paragraph)); 726 | } 727 | } 728 | 729 | #main-content a:link, 730 | .tufte-underline, 731 | .hover-tufte-underline:hover, 732 | div.sidebar { 733 | background: 
none !important; 734 | } 735 | 736 | p.card-text { 737 | margin-top: 0px; 738 | } 739 | 740 | #main-content a:link, 741 | .tufte-underline, 742 | .hover-tufte-underline:hover { 743 | background-repeat: no-repeat !important; 744 | } 745 | 746 | body, 747 | #site-navigation { 748 | background-color: rgba(var(--pst-color-background)) !important; 749 | } 750 | 751 | .topbar { 752 | background-color: rgba(var(--pst-color-background)) !important; 753 | box-shadow: 0 !important; 754 | } 755 | 756 | .scrolled .topbar { 757 | background-color: rgba(var(--pst-color-background)) !important; 758 | box-shadow: 0 0 0 0 rgba(var(--pst-color-background)) !important; 759 | } 760 | 761 | .bd-toc { 762 | margin-top: calc(45px + 1.5em); 763 | } 764 | 765 | .tocsection { 766 | display: none; 767 | } 768 | 769 | .navbar-brand { 770 | display: none; 771 | } 772 | 773 | .navbar-brand-box { 774 | display: none; 775 | } 776 | 777 | #site-navigation { 778 | padding-top: calc(1.5em + 35px); 779 | } 780 | 781 | footer { 782 | border: none; 783 | } 784 | 785 | footer p { 786 | font-size: 1rem; 787 | } 788 | 789 | body { 790 | padding-right: 0%; 791 | } 792 | 793 | .prev-next-bottom a.left-prev { 794 | float: left; 795 | } 796 | 797 | .prev-next-bottom a.right-next { 798 | float: right; 799 | } 800 | 801 | @media (max-width: 768px) { 802 | .bd-search { 803 | display: none !important; 804 | } 805 | 806 | #site-navigation { 807 | padding-top: 0px !important; 808 | margin-top: 0px !important; 809 | } 810 | 811 | .bd-sidenav:first-child { 812 | font-size: 1.4rem; 813 | font-weight: bold; 814 | } 815 | 816 | .topbar { 817 | display: none; 818 | } 819 | 820 | #main-content { 821 | padding-top: 1em !important; 822 | } 823 | 824 | main.bd-content { 825 | padding-top: 0px !important; 826 | } 827 | } 828 | --------------------------------------------------------------------------------