├── cookiecutter.json ├── hooks ├── post_gen_project.sh └── pre_gen_project.py ├── readme.md └── {{cookiecutter.project_slug}} ├── .export_rmarkdown.R ├── .first_install.py ├── .gitignore ├── .nbconvert_templates └── ap_report │ ├── ap.svg │ ├── conf.json │ ├── index.html.j2 │ └── static │ └── style.css ├── .set_kernel_path.sh ├── README.md ├── _quarto.yml ├── analysis ├── .gitkeep ├── archive │ └── .gitkeep └── notebook_templates │ └── ap_data_team │ ├── quarto.ipynb │ └── rmarkdown.ipynb ├── data ├── .gitignore ├── documentation │ └── .gitignore ├── handmade │ └── .gitignore ├── html_reports │ └── .gitignore ├── processed │ └── .gitignore ├── public │ └── .gitignore └── source │ └── .gitignore ├── etl └── .gitkeep ├── publish └── .gitkeep └── scratch └── .gitkeep /cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "full_name": "Firstname Lastname", 3 | "email": "", 4 | "project_name": "New Project", 5 | "project_slug": "{{ cookiecutter.project_name.lower().replace(' ', '-') }}", 6 | "project_short_description": "TK: short project description", 7 | "_copy_without_render": [ 8 | "analysis/*", 9 | ".nbconvert_templates/*" 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /hooks/post_gen_project.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## This post project generation script only runs if pipenv is on the machine 3 | command -v pipenv >/dev/null 2>&1 || { echo >&2 "pipenv not found. Aborting startup script."; exit 1; } 4 | 5 | ## Run first_install script 6 | #### This is meant to be run when people first clone the project. 7 | #### Running it here to add jupyter data directory env variable, to set the RETICULATE_PYTHON r env 8 | ###### variable, to set up the jupyter lab template directory/enable its server, 9 | ###### and to set up the git solution for changing cwd in an analysis file. 
10 | python ./.first_install.py 11 | -------------------------------------------------------------------------------- /hooks/pre_gen_project.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | 4 | SLUG_REGEX = r'^[a-zA-Z0-9][-_a-zA-Z0-9]+$' 5 | slug = '{{ cookiecutter.project_slug }}' 6 | 7 | if not re.match(SLUG_REGEX, slug): 8 | print(f'ERROR: {slug} is not a valid project slug!') 9 | sys.exit(1) 10 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # AP Python Cookiecutter 2 | 3 | This is a project template powered by [Cookiecutter](https://github.com/cookiecutter/cookiecutter) for use with [datakit-project](https://github.com/associatedpress/datakit-project/). 4 | 5 | **Structure** 6 | 7 | ``` 8 | . 9 | ├── README.md 10 | ├── analysis 11 | │ └── archive 12 | ├── data 13 | │ ├── documentation 14 | │ ├── html_reports 15 | │ ├── manual 16 | │ ├── processed 17 | │ ├── public 18 | │ └── source 19 | ├── etl 20 | ├── publish 21 | └── scratch 22 | ``` 23 | 24 | - `README.md` 25 | - Project-specific readme with boilerplate for data projects. 26 | - `analysis` 27 | - This is where we keep all of our jupyter ipython notebooks that contain analysis for the project. 28 | - Notebooks in this folder can ingest data from either `data/source` (if that data comes from the source in a workable format) or `data/processed` (if the data required some prep). 29 | - Dataframes from analysis notebooks should be written out to `data/processed` 30 | - `analysis/archive`: Notebooks that leave the scope of the project but should also remain in the project history will be placed here. 
31 | - Note that only `.Rmd` files linked to `.ipynb` via `Jupytext` are committed; `.ipynb` files are in the `.gitignore` because `.ipynb` metadata frequently disrupts version control whenever a notebook is opened or interacted with, while `.Rmd` files only keep track of code. 32 | - `data` 33 | - This is the directory used with our `datakit-data` plugin. 34 | - `data/documentation` 35 | - Documentation on data files should go here - data dictionaries, manuals, interview notes. 36 | - `data/html_reports` 37 | - Contains rendered html of our analysis notebooks, the results of calling `pipenv run export_rmarkdown` on a notebook. 38 | - `data/manual` 39 | - Contains data that has been manually altered (e.g. excel workbooks with inconsistent string errors requiring eyes on every row). 40 | - `data/processed` 41 | - Contains data that has either been transformed from an `etl` script or output from an `analysis` jupyter notebook. 42 | - Data that has been transformed from an `etl` script will follow a naming convention: `etl_{file_name}.[csv,json...]` 43 | - `data/public` 44 | - Public-facing data files go here - data files which are 'live'. 45 | - `data/source`: contains raw, untouched data. 46 | - `etl` 47 | - This is where we keep python scripts involved with collecting data and prepping it for analysis. 48 | - These files should be scripts; they should not be jupyter notebooks. 49 | - `publish` 50 | - This directory holds all the documents in the project that will be public facing (e.g. data.world documents). 51 | - `scratch` 52 | - This directory contains output that will not be used in the project in its final form. 53 | - Common cases are filtered tables or quick visualizations for reporters. 54 | - This directory is not git tracked. 
55 | 56 | **Our `.gitignore`** 57 | 58 | ``` 59 | *.vim 60 | .env 61 | .Renviron 62 | .venv 63 | .quarto 64 | .DS_Store 65 | .ipynb_checkpoints 66 | 67 | analysis/*.ipynb 68 | analysis/archive/*.ipynb 69 | !analysis/notebook_templates/*.ipynb 70 | 71 | data/ 72 | !data/source/.gitkeep 73 | !data/manual/.gitkeep 74 | !data/processed/.gitkeep 75 | !data/html_reports/.gitkeep 76 | !data/public/.gitkeep 77 | !data/documentation/.gitkeep 78 | 79 | scratch/ 80 | !scratch/.gitkeep 81 | ``` 82 | 83 | ## Usage 84 | 85 | These steps assume configuration for [datakit-project](https://github.com/associatedpress/datakit-project) is complete. 86 | 87 | - If you'd like to keep a local version of this template on your computer, git clone this repository to where your cookiecutters live: 88 | 89 | ``` 90 | cd path/to/.cookiecutters 91 | git clone git@github.com:associatedpress/cookiecutter-python-project.git 92 | ``` 93 | 94 | - Now, when starting a new project with `datakit-project`, reference the cookiecutter in your filesystem. This creates a `pipenv` virtual environment and an ipython kernel for jupyter notebooks that will have the name of the `project_slug`. 95 | 96 | ``` 97 | datakit project create --template path/to/.cookiecutters/cookiecutter-python-project 98 | ``` 99 | 100 | If you'd like to avoid specifying the template each time, you can edit `~/.datakit/plugins/datakit-project/config.json` to use this template by default: 101 | 102 | ``` 103 | {"default_template": "/path/to/.cookiecutters/cookiecutter-python-project"} 104 | ``` 105 | 106 | ### Full virtual environment setup. From package management to rendering analyses. 107 | 108 | This python template should get AP data journalists set up quickly with a virtual environment, allowing them to clone a project and quickly install all the packages required to run ETL and analysis files. 
109 | 110 | **Setup** 111 | 112 | *This is the required setup to get the full python package management functionality provided by this template:* 113 | 114 | - [Pyenv](https://github.com/pyenv/pyenv) to manage our python installations. `brew install pyenv` 115 | 116 | - We need to install a python with shared libraries via `pyenv` using the option `--enable-shared`. This gives us the ability to interact with our R install, should we ever wish to write R code in an R cell in Jupyter, or use R from a python instance using the python library `rpy2`. If we were to install version 3.9.13, for example: `env PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.9.13`. 117 | 118 | - [Pipenv](https://pipenv.pypa.io/en/latest/) to manage the python packages necessary for our project. We switch to our python with shared libraries we installed earlier (in this case version 3.9.13) with `pyenv global 3.9.13` and then pip install pipenv `python -m pip install pipenv`. It is possible to brew install pipenv, but those who maintain pipenv do not maintain that brew install of the software. They suggest pip installing. 119 | 120 | - [Quarto](https://quarto.org/) to render our analysis notebooks. To install, we use the [CLI installer](https://quarto.org/docs/get-started/) available on their site. 121 | 122 | - Finally, we install `datakit` on the pyenv python with shared libraries. The config files we set up when we first installed datakit will work with datakit installs across different versions of python. `python -m pip install datakit-gitlab datakit-project datakit-data`. 123 | 124 | **Workflow** 125 | 126 | *Starting a new project* 127 | - `datakit project create` will kick off the typical datakit cookiecutter project creation, but this template runs an additional script after constructing the AP analysis folder tree. Briefly, this script sets up the project for pipenv and installs our typical analysis packages. You can find this script in your project: `.first_install.py`. 
A more detailed description for this script will come with an update to the README. 128 | 129 | - Once the project is created, we `cd` into it and run `pipenv shell` before running `jupyter lab`. Or, we run `pipenv run jupyter lab`. It's up to you which commands to use here. Some people like to have a subshell running via `pipenv shell`, knowing that any command they run in that open subshell will make use of the pipenv environment. Other people like to type in the command every time they want to use the virtual environment with `pipenv run [terminal command]`. 130 | 131 | - Whenever we need to install a package, we use `pipenv install [some_package]`. 132 | 133 | - We don't git track `.ipynb` notebooks. Instead, we use [Jupytext](https://jupytext.readthedocs.io/en/latest/) to link our `.ipynb` files to git-tracked `.Rmd` files. This makes `git diff`s much more useful. `git status` shouldn't say our analysis changed because we ran a cell again. This makes sure it doesn't. 134 | 135 | - When we start an analysis notebook, we use the folder tree in Jupyter Lab to get to our analysis folder and open a new Launcher Window (`shift + command + L`). Under the "Notebook" section, we select the option called "Template". This brings up a dropdown selection menu. Select `ap_data_team` on the top dropdown, and `quarto.ipynb` on the bottom. This should bring up another option to select your ipython kernel. Select the kernel named after your project. 136 | - At this point, you have an analysis notebook file that is linked to an `.Rmd` with the same name. The first time you save your `.ipynb` file, you'll see that `.Rmd` appear alongside your `.ipynb` file. If you ever rename the `.ipynb`, the name of the `.Rmd` will change to match it. 137 | - You can still create a typical `.ipynb` analysis without the template (and without the paired `.Rmd`). 
Just keep in mind that without a paired `.Rmd` the analysis will not be git-tracked, unless you add an exception for the `.ipynb` file in the `.gitignore`. 138 | 139 | - While we are coding our analysis, we have the ability through Quarto to preview the rendered html file. Run `quarto preview path/to/analysis.ipynb`. 140 | 141 | - When we're ready to render and share our analysis, we make sure Quarto executes the cells in the notebook to render fresh output. Run `quarto render path/to/analysis.ipynb --to html --execute`. 142 | 143 | *Cloning a project* 144 | 145 | - When you're in the directory where you keep your analysis projects, clone the python project: `git clone git@some.git.domain:path/to/git_project.git` 146 | - `cd` into the project and run `python .first_install.py` 147 | - This step will create the project's virtual environment, install all necessary packages included in the `Pipfile` using the major python version defined in the `Pipfile`, and use the `.Rmd` files in the project to generate `.ipynb` files to work with. 148 | 149 | 150 | **Legacy rmarkdown rendering** 151 | 152 | Before we started using Quarto, this template generated R-style html reports via rmarkdown. We did this because rmarkdown generated better tables and more beautiful reports. To achieve it, we would actually pass the Jupytext-paired `.Rmd` file to rmarkdown via an Rscript. This required writing R cells in our analyses to get R-style tables. For Altair charts, we'd have to pass the chart json to an R library that knew how to deal with vega charts. These cells wouldn't run until we rendered the report. This is the main reason for switching to Quarto, which allows us to have notebook output that matches what we'll see in the rendered report, and the result is just as beautiful. However, there may come a time when we find rendering an `.Rmd` via rmarkdown useful. For that reason, we are keeping the rmarkdown rendering script. 
Keep in mind that to make use of it, you'll need to start an analysis with the Jupyter notebook template `rmarkdown.ipynb`. Then you can render an analysis using that template with `pipenv run export_rmarkdown path/to/analysis.Rmd`. 153 | 154 | ## Configuration 155 | 156 | You can set the default name, email, etc. for a project in the `cookiecutter.json` file. 157 | -------------------------------------------------------------------------------- /{{cookiecutter.project_slug}}/.export_rmarkdown.R: -------------------------------------------------------------------------------- 1 | main <- function() { 2 | # Exports Rmd as html from the command line. 3 | # 4 | # Takes one argument: 5 | # Rmd file to convert 6 | # 7 | library(rmarkdown) 8 | args <- commandArgs(trailingOnly = TRUE) 9 | rmarkdown_file <- args[1] 10 | render(rmarkdown_file, output_dir='data/html_reports') 11 | } 12 | 13 | main() -------------------------------------------------------------------------------- /{{cookiecutter.project_slug}}/.first_install.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import glob 3 | import argparse 4 | from subprocess import check_output 5 | from subprocess import run 6 | 7 | run(['mkdir', './.venv']) 8 | 9 | PYENV_VERSION = "".join(check_output(['pyenv', 'version-name']).decode('utf-8').split()) 10 | PYENV_PREFIX = "".join(check_output(['pyenv', 'prefix', f"{PYENV_VERSION}"]).decode('utf-8').split()) 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--python', help='Python version to use with the project') 14 | args = parser.parse_args() 15 | 16 | if os.path.isfile('./Pipfile'): 17 | if args.python: 18 | run(['pipenv', 'install', '--python', f"{args.python}", '--dev']) 19 | else: 20 | run(['pipenv', 'install', '--dev']) 21 | else: 22 | run(['pipenv', 'install', '--python', f"{PYENV_PREFIX}/bin/python", 'ipython', 'ipykernel', 'pandas', 'matplotlib', 'notebook', 'jupyterlab', 'pyarrow', 
'altair', 'jupytext', 'jupyterlab_templates', 'itables', 'ap-altair-theme']) 23 | ## Add this script to the Pipfile, along with the rmarkdown export script 24 | with open('Pipfile', 'a') as pipfile: 25 | pipfile.write('\n[scripts]\nexport_rmarkdown = "Rscript .export_rmarkdown.R"') 26 | 27 | VENV_DIR = "".join(check_output(['pipenv', '--venv']).decode('utf-8').split()) 28 | RETICULATE_PYTHON = check_output(['pipenv', 'run', 'which', 'python']).decode('utf-8') 29 | TEMPLATE_PATHS = glob.glob('analysis/notebook_templates/*') 30 | 31 | # Need to set the Jupyter data directory, this is where jupyter looks for kernels 32 | with open ('.env', 'w') as env_fi: 33 | env_fi.write(f"JUPYTER_DATA_DIR={VENV_DIR}/share/jupyter/\n") 34 | # Need to tell R which python executable to use. Necessary for exporting rmarkdown reports as html. 35 | with open ('.Renviron', 'w') as Renv_fi: 36 | Renv_fi.write(f"RETICULATE_PYTHON={RETICULATE_PYTHON}") 37 | 38 | # Generate ipynb for every markdown file in analysis 39 | run(['pipenv', 'run', 'jupytext', '--set-formats', 'Rmd,ipynb', 'analysis/*.Rmd']) 40 | # Install jupyter template extension and enable the template server 41 | run(['mkdir', f"{VENV_DIR}/share/jupyter/notebook_templates"]) 42 | for path in TEMPLATE_PATHS: 43 | run(['cp', '-r', path, f"{VENV_DIR}/share/jupyter/notebook_templates/"]) 44 | # Git solution for changing cwd in analysis files to root of project 45 | run(['pipenv', 'run', 'bash', '.set_kernel_path.sh']) 46 | -------------------------------------------------------------------------------- /{{cookiecutter.project_slug}}/.gitignore: -------------------------------------------------------------------------------- 1 | *.vim 2 | .env 3 | .Renviron 4 | .venv 5 | .quarto 6 | .DS_Store 7 | .ipynb_checkpoints 8 | 9 | analysis/*.ipynb 10 | analysis/archive/*.ipynb 11 | !analysis/notebook_templates/*.ipynb 12 | 13 | scratch/* 14 | !scratch/.gitkeep 15 | 
-------------------------------------------------------------------------------- /{{cookiecutter.project_slug}}/.nbconvert_templates/ap_report/ap.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /{{cookiecutter.project_slug}}/.nbconvert_templates/ap_report/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_template": "lab", 3 | "mimetypes": { 4 | "text/html": true 5 | }, 6 | "preprocessors": { 7 | "100-pygments": { 8 | "type": "nbconvert.preprocessors.CSSHTMLHeaderPreprocessor", 9 | "enabled": true 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /{{cookiecutter.project_slug}}/.nbconvert_templates/ap_report/index.html.j2: -------------------------------------------------------------------------------- 1 | {%- extends 'lab/index.html.j2' -%} 2 | 3 | {%- block html_head_css -%} 4 | {{ super() }} 5 | {{ resources.include_css("static/style.css") }} 6 | {%- endblock html_head_css -%} 7 | 8 | {% block body_header %} 9 |
10 |