├── .gitignore ├── ES-onboarding ├── 00-lecturas.md ├── 01-config-admin-pqt.md ├── 02-config-git.md ├── 03-config-python.md ├── 04-config-R.md ├── 05-bash.md ├── 06-tareas.md ├── 07-terminal.md ├── MakeFiles.rmd ├── README.md ├── Reproducibilidad.rmd └── símbolos.md ├── LICENSE ├── README.md ├── checklists ├── input.md ├── makefiles.md ├── new-tasks.md ├── output.md ├── repo-handling.md ├── routines.md └── scripts.md ├── demo-tasks └── record-linkage │ ├── .gitignore │ ├── README.md │ ├── individual │ ├── README.md │ ├── amazon │ │ ├── Makefile │ │ └── src │ │ │ └── import-amz.R │ └── itunes │ │ ├── Makefile │ │ └── src │ │ └── import-itunes.R │ ├── match │ ├── Makefile │ ├── README.md │ ├── TS-import │ │ ├── Makefile │ │ └── src │ │ │ └── import-pair-labels.R │ ├── blocking-features │ │ ├── Makefile │ │ └── src │ │ │ └── feats.R │ ├── blocking │ │ ├── Makefile │ │ ├── hand │ │ │ └── rules.yaml │ │ └── src │ │ │ ├── count-pairs.R │ │ │ └── generate-pairs.R │ ├── classify │ │ ├── Makefile │ │ └── src │ │ │ └── classify.R │ ├── cluster │ │ ├── Makefile │ │ └── src │ │ │ ├── cluster.R │ │ │ └── make-summary.R │ ├── compare │ │ ├── Makefile │ │ └── src │ │ │ └── compare.R │ ├── export │ │ └── Makefile │ ├── import │ │ ├── Makefile │ │ └── src │ │ │ └── concatenate.R │ └── merge │ │ ├── Makefile │ │ └── src │ │ └── choose-canonical.R │ └── setup │ └── Makefile ├── glossary.md ├── languages ├── R │ ├── Makefile │ ├── README.md │ ├── note │ │ └── stringdist.ipynb │ ├── output │ │ └── performance.md │ └── src │ │ └── performance.Rmd ├── julia │ ├── .gitignore │ ├── Makefile │ ├── Project.toml │ ├── README.md │ ├── input │ │ └── mtcars.csv │ ├── output │ │ ├── basics.md │ │ └── dfs.md │ └── src │ │ ├── basics.jl │ │ ├── compile-md.jl │ │ └── dfs.jl ├── python │ ├── missingness.md │ ├── note │ │ └── fuzzywuzzy.ipynb │ ├── scalability.md │ └── set-operations.md └── share │ ├── input │ └── string_tests.yaml │ ├── output │ ├── make_test_pairs.log │ └── test_pairs.parquet │ └── src │ └── make_test_pairs.py ├── notebooks ├── python │ ├── .ipynb_checkpoints │ │ ├── basic-stats-checkpoint.ipynb │ │ └── sets-checkpoint.ipynb │ ├── basic-stats.ipynb │ ├── here.ipynb │ ├── input │ │ └── set-operations.pdf │ ├── intro.ipynb │ ├── magics-general.ipynb │ ├── magics-performance.ipynb │ ├── missingness.ipynb │ ├── os.ipynb │ ├── sets.ipynb │ └── src │ │ ├── no_main.py │ │ └── with_main.py └── terminal │ └── basics.ipynb ├── onboarding ├── 00-reading-list.md ├── 01-pkg-manager-setup.md ├── 02-git-setup.md ├── 03-python-setup.md ├── 04-R-setup.md ├── 05-bash.md ├── 06-tasks.md ├── 07-living-in-the-terminal.md ├── 08-nvim.md ├── 09-parquet.md ├── 10-auditing.md ├── MakeFiles.rmd ├── README.md └── Reproducibility.rmd └── templates ├── Makefile ├── gitattributes ├── sample.R ├── sample.Rmd └── sample.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .Rproj.user 3 | .Rhistory 4 | -------------------------------------------------------------------------------- /ES-onboarding/00-lecturas.md: -------------------------------------------------------------------------------- 1 | # Lista de lecturas 2 | Recursos y lecturas sobre las herramientas utilizadas por HRDAG. 3 | 4 | 5 | ## Flujo de trabajo 6 | En HRDAG organizamos nuestro trabajo en tareas o "tasks" (ve `06-tareas.md`). 
Si tienes alguna duda sobre la organización de nuestros proyectos, [The Task Is A Quantum Of Workflow](https://hrdag.org/2016/06/14/the-task-is-a-quantum-of-workflow/) es una buena referencia. Para obtener una explicación de por qué HRDAG **no** utiliza la herramienta `.Rproj` en RStudio, consulta el siguiente texto: [.Rproj considered harmful](https://hrdag.org/tech-notes/harmful.html). 7 | 8 | ## Markdown 9 | Markdown es un lenguaje de programación pensado para la redacción (este documento está escrito en Markdown). Los archivos de Markdown terminan con `.md`. 10 | 11 | - La página [Sintaxis de escritura y formato básicos](https://docs.github.com/es/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) tiene toda la información que se necesita para aprender a escribir y formatear el texto en Markdown. 12 | 13 | 14 | ## `R` & R Markdown 15 | En HRDAG generalmente usamos las herramientas del [tidyverse](https://www.tidyverse.org/) para el análisis de datos en `R` tanto como sea posible. Nos gusta el tidyverse porque los paquetes del tidyverse están bien desarrollados y mantenidos. Estas herramientas utilizan una gramática común que simplifica el proceso de escribir código que sea fácil de leer y permite la creación de cadenas de comandos. 16 | 17 | - Para una explicación de los principios del tidyverse, se recomienda empezar con la siguiente página: [introduction to tidy data principles](https://github.com/jennybc/lotr-tidy/blob/master/01-intro.md). 18 | - [R para Ciencia de Datos](https://es.r4ds.hadley.nz/) es el libro preferido acerca del tidyverse. El libro abarca mucha información y si bien sería genial leerlo completo, sugerimos empezar con: 19 | - Capítulo 2 20 | - Capítulo 4 21 | - Capítulo 5 (¡no te olvides de los ejercicios!) 22 | - Capítulo 7 (¡no te olvides de los ejercicios!) 23 | 24 | 25 | - R Markdown es una variedad de Markdown pensada especialmente para el uso con `R`. Los archivos de R Markdown terminan con `.Rmd`. Utilizamos R Markdown (en vez de manualmente fijar cifras o gráficas) porque nos permite combinar prosa y código, y así podemos elaborar informes que se actualizan automáticamente en el caso de que nuestros resultados cambian. [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) es nuestro libro preferido acerca de este tema. R Markdown puede ser utilizado de diversas formas, pero los capítulos 1 y 2 son suficientes para empezar. 26 | 27 | ## Vim y el lenguaje de edición 28 | 29 | [_Practical Vim_](https://pragprog.com/titles/dnvim2/) es uno de los mejores libros de programación que yo (PB) he leído. Creo que algunos de los/as colegas del equipo de capacitación tienen una versión en PDF. Te aliento encarecidamente a que lo estudies con atención. 30 | 31 | ## Estilo de redacción 32 | Es útil contar con una referencia del estilo y calibre de los textos elaborados por HDAG. Estos son unos ejemplos seleccionados por nuestra Directora Ejecutiva, Megan Price. 33 | 34 | >[Esto](https://chance.amstat.org/2018/02/statistics-of-genocide/) es un artículo de una revista de estadística, así que se asume que la audiencia está familiarizada con la estadística, o por lo menos está interesada en este área. Otro ejemplo es [To Predict and Serve?](https://rss.onlinelibrary.wiley.com/doi/epdf/10.1111/j.1740-9713.2016.00960.x), elaborado por Kristian y William para la revista Significance. 
A lo largo de los años, hemos publicado textos en [Chance](https://chance.amstat.org/) y [Significance](https://www.significancemagazine.com/), y ambos son medios accesibles para consultar nuestro trabajo (en aras de la transparencia, comparto que formo parte del consejo editorial de Significance). [Violence in Blue](https://granta.com/violence-in-blue/) es un ejemplo bastante distinto de nuestras redacciones para una audiencia general (Granta también es amable con nosotros y publicó algunos de nuestros ensayos sobre el COVID, ¡pero técnicamente es una revista literaria!) 35 | > 36 | >[Este artículo](https://hrdag.org/wp-content/uploads/2019/09/2019-DemographicResearch-civilian-killings-el-salvador.pdf) en Demographic Research es un ejemplo de la escritura académica sometida a revisión de pares. Cada disciplina académica tiene su propio estilo de escritura, y como trabajamos interdisciplinariamente, terminamos escribiendo en una variedad de estilos. Esta revista de ciencia política y demografía prefiere artículos largos, y este texto es de 36 páginas. 37 | > 38 | > En 2019, Patrick escribió una serie de memorandos para distintos proyectos que utilizan métodos parecidos de estimación de múltiples sistemas (o EMS por sus siglas en inglés). Creo que estos son buenos ejemplos de los tipos de documentación técnica o memorandos que a veces elaboramos para nuestras organizaciones asociadas (y, cuando sea posible, publicamos en nuestra página web). [The Philippines](https://hrdag.org/wp-content/uploads/2019/07/2019-HRDAG-killings-philippines.pdf) [Indonesia](https://hrdag.org/wp-content/uploads/2018/12/KP-Palemban-ests.pdf) [Sri Lanka](https://hrdag.org/wp-content/uploads/2018/12/HRDAG-ITJPSL-2018-12-12-1.pdf) 39 | 40 | 41 | -------------------------------------------------------------------------------- /ES-onboarding/01-config-admin-pqt.md: -------------------------------------------------------------------------------- 1 | # Configuración del administrador de paquetes 2 | 3 | Los administradores de paquetes son útiles para instalar y actualizar las herramientas de la línea de comandos. En vez de tener que pasar por varios pasos complicados, puedes ejecutar un solo comando. Por ejemplo: 4 | 5 | Sin un administrador de paquetes: 6 | ``` 7 | curl -LO https://github.com/neovim/neovim/releases/download/nightly/nvim-macos.tar.gz 8 | tar xzf nvim-macos.tar.gz 9 | ./nvim-osx64/bin/nvim 10 | ``` 11 | 12 | Con un administrador de paquetes: 13 | ``` 14 | brew install neovim 15 | ``` 16 | 17 | ## MacOS: Instalación de homebrew 18 | 19 | Homebrew es el administrador de paquetes más popular para macOS. Para instalarlo, copia y pega la síguiente línea en el terminal y presiona Enter. 20 | 21 | ``` 22 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 23 | ``` 24 | 25 | ## Linux: TODO 26 | 27 | ## Windows: TODO 28 | 29 | 30 | -------------------------------------------------------------------------------- /ES-onboarding/02-config-git.md: -------------------------------------------------------------------------------- 1 | # Configuración de git y GitHub 2 | 3 | Git es una forma de registrar cambios dentro de un repositorio para que puedas volver a versiones anteriores de archivos y juntar tu trabajo con el de otros. No es completamente ideal para nuestro tipo de trabajo con datos, pero es la norma de la indústria así que lo usamos. (Lo complementamos con nuestro propio sistema de control de versiones llamado `snap`, pero ésa es otra lección. 
4 | 5 | GitHub es un sitio web que ofrece una interfaz bonita para git y facilita la colaboración con organizaciones. Lo usamos para la mayor parte de nuestros proyectos en HRDAG. 6 | 7 | ## Pasos de configuración 8 | 9 | Muy probablemente ya tienes git instalado en tu computador y solo tendrás que configurar GitHub. 10 | 11 | 1. Abre una cuenta de GitHub (https://github.com/join). 12 | 2. Conecta tu cuenta de GitHub con una clave SSH (https://docs.github.com/es/authentication/connecting-to-github-with-ssh). Tu clave SSH permite que GitHub reconozca quién eres cuando le pides archivos. Es como una contraseña más segura que no tienes que recordar. 13 | 14 | ¡Eso es todo! 15 | -------------------------------------------------------------------------------- /ES-onboarding/03-config-python.md: -------------------------------------------------------------------------------- 1 | # Configuración de python 2 | 3 | Python es uno de los dos lenguajes que usamos para la mayor parte de nuestro análisis de datos. Lo más probable es que ya tengas una versión de python en tu computadora, pero puede que sea anticuado o que falten los paquetes cruciales para el análisis de datos. Para resolver esto, usamos Anaconda, que administra python y sus paquetes. 4 | 5 | ## Pasos de configuración 6 | 7 | 1. Visita https://docs.anaconda.com/anaconda/install/ y selecciona tu sistema operativo. 8 | 2. Selecciona la versión más reciente de python que sea compatible con Anaconda. Al 18 de agosto de 2022, ésta es python 3.9.6. 9 | 3. Sigue los pasos indicados por el instalador. 10 | 11 | Si utilizas una shell no compatible con POSIX, como `fish`, tendrás que agregar anaconda a tu ruta de forma manual. (Si no sabes qué significa esto, no te preocupes.) 12 | 13 | ## jupyter 14 | 15 | `jupyter` es una interfaz interactiva para python que utilizamos para la exploración de datos y prototipación de código. Viene incluido con Anaconda-puedes simplemente ejecutar `jupyter notebook` en tu terminal y aparece. 16 | -------------------------------------------------------------------------------- /ES-onboarding/04-config-R.md: -------------------------------------------------------------------------------- 1 | # Configuración de R 2 | 3 | R es uno de los dos lenguajes que usamos para la mayor parte de nuestro análisis de datos. Fue creado para el uso con datos y es mejor que Python para la visualización de datos. 4 | 5 | # Pasos de configuración 6 | Para Mac 7 | 8 | Si utilizas Mac, asegúrate de haber instalado homebrew. 9 | Ejecuta "brew install r". 10 | [Aquí](https://www.r-bloggers.com/how-to-install-r-on-mac-ubuntu-and-windows/) 11 | se puede encontrar una buena guía para los usuarios de Mac. 12 | 13 | # Instalación de RStudio 14 | 15 | 1. Visita https://www.r-project.org/ 16 | 2. Selecciona la versión más reciente de R. 17 | 3. Sigue los pasos indicados por el instalador. 18 | 19 | Recuerda que hay que instalar R antes de instalar RStudio. R se puede instalar 20 | [aquí](https://cran.rstudio.com/). 21 | 22 | # RStudio a través de un navegador 23 | 24 | [Este](https://support.rstudio.com/hc/en-us/articles/234653607-Getting-Started-with-RStudio-Server) es un buen texto sobre el uso de RStudio Server, el cual te permite trabajar en RStudio a través de un navegador. 
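# Comprueba tu instalación

Para comprobar que R quedó instalado correctamente y dejar listos los paquetes que usamos con frecuencia, puedes ejecutar algo como lo siguiente en la consola de R. Es solo un ejemplo orientativo: los paquetes listados (tidyverse, arrow, janitor, argparse) son los que aparecen en otros materiales de este repositorio; ajusta la lista según tu proyecto.

```r
# ver qué versión de R quedó instalada
R.version.string

# instalar los paquetes que usamos con frecuencia (solo hace falta una vez)
install.packages(c("tidyverse", "arrow", "janitor", "argparse"))

# cargar tidyverse para confirmar que la instalación funciona
library(tidyverse)
```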
25 | 26 | # Lee el libro 27 | [_R para Ciencia de Datos_ en español](https://es.r4ds.hadley.nz/) 28 | 29 | 30 | -------------------------------------------------------------------------------- /ES-onboarding/05-bash.md: -------------------------------------------------------------------------------- 1 | # Bash 2 | 3 | Bash es un lenguaje para interactuar con el árbol de procesos y el sistema de archivos de Unix/Linux. Es un tipo de metalenguaje, dentro del cual vive todo lo demás en tu computadora. 4 | 5 | Para empezar a trabajar en Bash, intenta lo siguiente: 6 | 7 | Abre tu terminal y escribe `cd`. Esto cambiará directorios (las carpetas en Unix se llaman directorios) al directorio 'home', tu directorio personal, único para cada usuario. Ahora, si escribes `ls`, debe aparecer una lista de todos los archivos y directorios dentro de tu directorio 'home'. Esto debe incluir todos los archivos y directorios no ocultos dentro del directorio en el que estés. ¿Y los archivos ocultos? Los archivos ocultos son solamente los que empiezan con un punto (.). Para verlos, usa el comando `ls -a`. Estos 'dotfiles' ocultos serán relevantes después, cuando personalices el entorno de tu terminal. 8 | 9 | Puede que ya hayas notado el texto que aparece antes de tus comandos, conocido como el indicador (o 'prompt') del intérprete de comandos. El tuyo probablemente sea algo así: `: $`. No es recomendable usar un indicador así por varias razones: no tiene color, lo cual ayuda a los ojos a diferenciar entre valores; es difícil de leer; y no incluye tu ruta completa. 10 | 11 | Con una ruta completa, se puede saber quién eres, en qué computadora estás trabajando (en eleanor, por ejemplo) y en qué directorio te encuentras (`~/git/GT-fingerprints/individual`). Asímismo, se puede dejar el comando en su propia línea para que quepa aunque sea particularmente larga la ruta del directorio de trabajo. Recomiendo determinar qué tipo de indicador te gustaría tener y después programarlo en tus configuraciones. He aquí unas sugerencias básicas. 12 | 13 | 1. Asigna la fórmula que quieras a la variable `PS1` en bash. 14 | 2. Haz todo tu trabajo en `~/.bashrc`. Este archivo es uno de los 'dotfiles' antes mencionados y se ejecuta automáticamente para que bash pueda ver tus preferencias. Escribir una barra invertida antes de un carácter te permite poner variables en tu indicador. Consulta esta lista para conocer algunas combinaciones: https://www.tldp.org/HOWTO/Bash-Prompt-HOWTO/bash-prompt-escape-sequences.html 15 | 16 | Estos enlaces son útiles para empezar a trabajar con Bash: 17 | 18 | - http://tldp.org/HOWTO/Bash-Prog-Intro-HOWTO.html 19 | - http://cs.lmu.edu/~ray/notes/bash/ 20 | - https://programminghistorian.org/es/lecciones/introduccion-a-bash 21 | - https://www.digitalocean.com/community/tutorials/an-introduction-to-the-linux-terminal 22 | - https://www.tjhsst.edu/~dhyatt/superap/unixcmd.html (las listas "essential", "valuable" y "useful" son buenas-las otras son algo anticuadas) 23 | - https://softcover.s3.amazonaws.com/636/learn_enough_command_line/images/figures/anatomy.png (una imagen que explica la estructura de bash) 24 | - http://tldp.org/LDP/abs/html/ (demasiado detallado, enfocado en programas en vez de la línea de comandos. 
Podría ser útil como un diccionario) 25 | - https://www.youtube.com/watch?v=oxuRxtrO2Ag&t=3922s 26 | 27 | 28 | -------------------------------------------------------------------------------- /ES-onboarding/06-tareas.md: -------------------------------------------------------------------------------- 1 | # Tasks 2 | 3 | En HRDAG, nuestros proyectos frecuentemente se caracterizan por lo que nosotros llamamos "más de dos". En un mismo proyecto, hay más de dos programadores trabajando y se usan más de dos conjuntos de datos o más de dos lenguajes de programación. Para manejar esta complejidad, implementamos estructura en el proyecto en la forma de tareas, o "tasks". Una tarea representa un paso en el trayecto de los conjuntos de datos desde los datos originales hasta el análisis final. 4 | 5 | Como mínimo, las tareas tienen tres directorios: `input/`, `src/` y `output/`. `input/` incluye los datos iniciales, y debe ser leído solamente. `src/` es la forma que Unix escribe "source" ("fuente""), así que este directorio incluye el código fuente que procesa los datos. `output/` incluye los archivos de salida, y debe contener únicamente los archivos generados al ejecutar el código en `src/`. 6 | 7 | También hay otros directorios que a veces existen dentro de una tarea. Estos son: 8 | 9 | * `note/` incluye archivos para la prototipación del código fuente (código de jupyter o RStudio, por ejemplo) 10 | * `hand/` incluye archivos generados de forma manual, como un archivo .csv que convierte los nombres de municipios o departamentos en códigos de geolocalización. 11 | * `frozen/` incluye archivos que no caben en `input/` ni `output/`. Esto ocurre cuando los datos están tan defectuosos que nuestras herramientas de fuente abierta no los pueden manejar, entonces los tenemos que editar de forma manual o usar otro programa para abrir y guardarlos de nuevo. 12 | * `doc/` incluye documentación. 13 | 14 | Las tareas generalmente se conectan entre sí para que los archivos de salida generados por una tarea sirvan como los archivos de entrada de la siguiente tarea. Normalmente cada rama empieza con la tarea `import/`, que convierte el archivo al formato correcto para nuestras operaciones, y termina con la tarea `export/`, que convierte el archivo al formato correcto para el producto final que queramos. Esto quiere decir que cuando accedes al proyecto en seis meses, todavía sabes dónde empieza y dónde termina. 15 | 16 | Lee [The Task Is A Quantum of Workflow](https://hrdag.org/2016/06/14/the-task-is-a-quantum-of-workflow/) y ve [Patrick Ball: Principled Data Processing](https://www.youtube.com/watch?v=ZSunU9GQdcI) para obtener más información sobre la organización de las tareas. 17 | -------------------------------------------------------------------------------- /ES-onboarding/07-terminal.md: -------------------------------------------------------------------------------- 1 | # Cómo "vivir" en el terminal 2 | 3 | Unas sugerencias para ayudarte a trabajar en el terminal: 4 | 5 | ## `alias` 6 | 7 | Este comando te permite fácilmente crear atajos para otros comandos. Es particularmente útil para asegurarte de no olvidar una opción. Por ejemplo: 8 | 9 | `alias ll="ls -AlFGgh --color='always'"` crea el comando `ll` que funciona como una versión más detallada de `ls`. Este comando muestra los permisos de archivos, coloca el nombre de cada archivo o directorio en su propia línea, usa un código de colores para representar distintos tipos de archivos, ¡y más! 
10 | 11 | El resultado generado por `ls`: 12 | ``` 13 | README.md bash setup.md setup.sh vim 14 | ``` 15 | 16 | El resultado generado por `ll`: (imagina que tiene colores también) 17 | ``` 18 | total 28K 19 | -rw-r--r-- 1 8.1K Aug 4 2017 .DS_Store 20 | drwxr-xr-x 16 512 Aug 29 2019 .git/ 21 | -rw-r--r-- 1 35 Jun 21 2018 .gitignore 22 | -rw-r--r-- 1 27 Jun 26 2017 README.md 23 | drwxr-xr-x 7 224 Aug 29 2019 bash/ 24 | -rw-r--r-- 1 494 Jul 12 2018 setup.md 25 | -rw-r--r-- 1 3.2K Jul 28 2017 setup.sh 26 | drwxr-xr-x 11 352 Jul 22 2019 vim/ 27 | ``` 28 | 29 | También he encontrado útil el siguiente bloque de código: 30 | ``` 31 | alias gs="git status" 32 | alias gc="git commit -m" 33 | alias gA="git add -A && git status" 34 | ``` 35 | Esto refuerza buenas prácticas en git--no se puede hacer un commit al repositorio sin un mensaje de confirmación porque si se ejecuta `gc` solamente, se devolverá un error. 36 | 37 | Asegúrate de incluir estos comandos en tu archivo .bashrc para que se ejecuten cada vez que abres tu terminal. 38 | 39 | ## `tree` 40 | 41 | `tree` es un excelente comando que presenta una representación más visual de la estructura de tus archivos. Esto es lo que `tree` devuelve para el mismo directorio que usamos anteriormente: 42 | 43 | ``` 44 | . 45 | ├── README.md 46 | ├── bash 47 | │   ├── bashrc 48 | │   ├── featherhead.py 49 | │   ├── fromproj.py 50 | │   ├── projpath.py 51 | │   └── toproj.py 52 | ├── setup.md 53 | ├── setup.sh 54 | └── vim 55 | ├── UltiSnips 56 | │   ├── make.snippets 57 | │   ├── python.snippets 58 | │   └── yaml.snippets 59 | ├── ftplugin 60 | │   ├── make.vim 61 | │   ├── markdown.vim 62 | │   └── text.vim 63 | ├── hi-output 64 | ├── hi-presets.vim 65 | ├── parens.vim 66 | ├── plugs.vim 67 | ├── process-hi.py 68 | ├── vimconfigs.sh 69 | └── vimrc 70 | 71 | 4 directories, 21 files 72 | ``` 73 | Dos buenas opciones para `tree` son `-C`, que añade color, y `-L NÚMERO`, que hace que `tree` sólo devuelva los primeros NÚMERO niveles del directorio. Por ejemplo, para el mismo directorio, `tree -L 1` devuelve: 74 | 75 | ``` 76 | . 77 | ├── README.md 78 | ├── bash 79 | ├── setup.md 80 | ├── setup.sh 81 | └── vim 82 | 83 | 2 directories, 3 files 84 | ``` 85 | 86 | ## Cómo usar los atajos de teclado de vim en bash 87 | 88 | Añade la línea `set -o vi` a tu archivo .bashrc. Este comando utiliza vi en vez de vim, así que faltan algunas de las características a las cuales quizás estés acostumbrado/a (para mí, lo más notable es la falta de objetos de texto, entonces los comandos `ciw` y `da` no funcionan). Otro problema es que no hay una buena forma de saber en qué modo estás trabajando (por defecto es el modo de inserción), así que puede ser confuso a veces. 89 | 90 | ## `cd -` 91 | 92 | `cd -` te regresa al último directorio en el que estuviste. Entonces, si estás trabajando en el directorio `~/git/HRDAG-training` y ejecutas `cd /etc` y después `cd -`, estarás de vuelta en `~/git/HRDAG-training`. Este comando es útil si quieres brevemente saltar a otro directorio para hacer algo. 93 | 94 | ## Cómo buscar comandos de bash 95 | 96 | Escribir CTRL-r en la línea de comandos activa el modo de búsqueda. Este modo básicamente toma lo que escribes y encuentra el comando más reciente que ejecutaste que incluya esa cadena de caracteres. Es útil para ejecutar de nuevo un comando largo que ejecutaste hace poco. 
97 | 98 | 99 | -------------------------------------------------------------------------------- /ES-onboarding/MakeFiles.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MakeFile" 3 | output: pdf_document 4 | header-includes: 5 | - \usepackage{float} 6 | - \usepackage{hyperref} 7 | - \hypersetup{ 8 | colorlinks=true, 9 | linkcolor=black, 10 | citecolor = black, 11 | urlcolor=blue, 12 | } 13 | fontsize: 12pt 14 | --- 15 | 16 | # ¿Qué es un Makefile? 17 | 18 | Un Makefile es un archivo para compilar código. Tenemos Makefiles para cada tarea que creamos en nuestro análisis de datos. Se guardan dentro de una tarea pero fuera de los subdirectorios (input, output, src, hand, frozen). 19 | 20 | # ¿Cómo debe ser un Makefile? 21 | 22 | 1. Como cualquier archivo de texto que creamos, siempre debe incluir una cabecera que incluye: 23 | 24 | Author (autor/a): 25 | Maintainers (mantenedores): 26 | Date (fecha): Oct. 16 2019 27 | License (licencia): 28 | ------ 29 | ruta/del/archivo 30 | 31 | 2. Después, hay que seguir esta estructura: 32 | 33 | 2.1 Para ejecutar scripts escritos en R: 34 | ``` 35 | .PHONY: all clean 36 | 37 | all: ruta(s) de los archivos de salida 38 | 39 | clean: 40 | rm output/* 41 | 42 | ruta del primer archivo de salida: \ 43 | script para crear ese archivo 44 | Rscript --vanilla $< 45 | 46 | ruta del segundo archivo de salida: \ 47 | script para crear ese archivo 48 | Rscript --vanilla $< 49 | ``` 50 | 51 | 3. Ejemplo 52 | 53 | Digamos que queremos crear un Makefile para la tarea "append" (añadir) en un análisis de datos sobre los asesinatos de líderes sociales en Colombia. El archivo de salida será una base de datos con todos los años que tenemos en nuestros registros. También crearemos un archivo .log con información básica que usaremos después para nuestro informe. 54 | ``` 55 | Author: VRA 56 | Maintainers: VRA, PB, CAF 57 | Date: Oct. 16 2019 58 | License: GPL-2 or newer 59 | ------ 60 | CO-leaders/append/Makefile 61 | 62 | .PHONY: all clean 63 | 64 | all: output/allyears.rds \ 65 | output/logfileappend.log 66 | 67 | clean: 68 | rm output/* 69 | 70 | output/allyears.rds: \ 71 | src/appendyears.R 72 | cd ../clean && make 73 | Rscript --vanilla $< 74 | 75 | output/logfileappend.log: \ 76 | src/appendyears.R 77 | cd ../clean && make 78 | Rscript --vanilla $< 79 | 80 | done 81 | ``` 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /ES-onboarding/README.md: -------------------------------------------------------------------------------- 1 | # Materiales didácticos de HRDAG 2 | 3 | Este repositorio contiene materiales didácticos para los y las colegas de HRDAG. A medida que los vas leyendo, es probable que surjan preguntas y cuando intentes poner estos conceptos en práctica, puede que te topes con errores incomprensibles. 4 | 5 | Si esto ocurre, deberías pedir ayuda después de unos minutos de intentar resolver el error. Es probable que uno/a de tus colegas haya visto el error antes o esté familiarizado/a con problemas parecidos. 6 | 7 | ## Antes de pedir ayuda 8 | 9 | La razón por la que es una buena práctica intentar resolver un problema por tu cuenta (aparte del hecho de que tal vez soluciones el problema) es que te ayudará a aclarar tu pregunta. Pasar unos minutos haciendo pruebas podría revelar que lo que pensabas que era un problema con tus configuraciones de ssh en realidad era un problema con tu archivo .bashrc. 
Puede ser que todavía no sepas *cuál* es el problema, pero por lo menos será más fácil que alguien más te ayude. 10 | 11 | Unos consejos para corregir errores: 12 | 13 | 14 | * Copia y pega la parte relevante del mensaje de error en Google 15 | * Busca el problema en la página de "issues" de [este repositorio](https://github.com/HRDAG/training-docs/issues). Puede que ya haya un "issue" sobre el mismo problema. 16 | * Si el error ocurre dentro de un script, intenta comentar la línea errónea para ver si aparece otro error interesante. 17 | 18 | El tiempo que dedicas a intentar resolver tu problema antes de pedir ayuda debería aumentar a la medida que vas familiarizándote con estas herramientas. Al principio, no debes dedicar más de 5 minutos, pero después de un tiempo naturalmente dedicarás 15, 30 o 120 minutos. 19 | 20 | ## Cómo pedir ayuda 21 | 22 | Cuando pidas ayuda, es recomendable [crear un nuevo "issue"](https://github.com/HRDAG/training-docs/issues/new) en este repositorio. De esta forma, los/as futuros/as colegas de HRDAG se beneficiarán de tus conocimientos. 23 | 24 | 25 | Unos consejos para formular buenas preguntas: 26 | 27 | * explica por qué estás intentando hacer lo que quieres hacer. Puede que haya una herramienta más apta para solucionar el problema. 28 | * si estás intentando resolver un error, copia y pega el mensaje de error completo en la pregunta. Usa la función de bloques de código en Markdown (pon el mensaje de error entre `` ``` ``s) para mantener el formato del código. 29 | * describe las soluciones que ya has probado y cuáles fueron los resultados. 30 | 31 | ¡Diviértete y no dudes en hacernos preguntas! 32 | 33 | 34 | -------------------------------------------------------------------------------- /ES-onboarding/Reproducibilidad.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Algunas reflexiones sobre la reproducibilidad" 3 | output: pdf_document 4 | header-includes: 5 | - \usepackage{float} 6 | - \usepackage{hyperref} 7 | - \hypersetup{ 8 | colorlinks=true, 9 | linkcolor=black, 10 | citecolor = black, 11 | urlcolor=blue, 12 | } 13 | fontsize: 12pt 14 | --- 15 | 16 | # ¿Por qué es importante poder auditar los proyectos? 17 | 18 | Poder replicar los resultados de nuestros análisis de datos es clave para nuestra reputación. Por lo general, poder replicar algo significa que podemos obtener el mismo resultado una y otra vez utilizando el mismo script y los mismos datos de entrada. En las ciencias, también se habla de la *reproducibilidad*, que implica más variación experimental. La reproducibilidad y replicabilidad son parte de las mejores prácticas científicas, y es en parte por esto que recomendamos usar la línea de comandos en vez de los ambientes de desarrollo interactivos (como R Studio). 19 | 20 | Con resultados replicables, podemos compartir nuestros métodos con otros/as científicos/as, lo cual produce dos externalidades positivas. Primero, nuestros resultados podrán ser validados por otros/as investigadores/as, lo cual da credibilidad a nuestros hallazgos. Segundo, al poder compartir nuestros métodos con otros/as investigadores/as, se producen economías de escala y se minimiza el esfuerzo necesario para reproducir estas técnicas. 21 | 22 | 23 | El mundo se enfrenta a una crisis de reproducibilidad. 
La revista *Nature* realizó [una encuesta](https://www.nature.com/news/1-500-scientists-lift-the-lid-on-reproducibility-1.19970) sobre este tema y encontró que "más de 70% de los investigadores han intentado reproducir los resultados de otro/a científico/a sin lograrlo, y más de la mitad no han podido reproducir sus propios experimentos". 24 | 25 | Aunque esta crisis está presente en muchas disciplinas, en el área de los derechos humanos, la replicabilidad es particularmente crucial. Aunque nuestros métodos y resultados son imparciales y no manipulamos ninguna parte del proceso de esclarecer la verdad, es muy probable que nuestros hallazgos no les gustarán a ciertos actores. Por esto, cuando publicamos nuestros resultados para su uso por los/as defensores/as de derechos humanos, hay que asegurarnos de que los resultados no tengan errores. Garantizar la replicabilidad es clave para esto. 26 | 27 | # Algunas lecturas 28 | 29 | Puedes hacer clic [aquí](https://github.com/ropensci-archive/reproducibility-guide/blob/gh-pages/sections/references/index.md) para ver algunas lecturas sobre la reproducibilidad. 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /ES-onboarding/símbolos.md: -------------------------------------------------------------------------------- 1 | | símbolo | Spanish | English | 2 | | :-: | :-: | :-: | 3 | | . | período, punto | period, point | 4 | | , | coma | comma | 5 | | ; | punto y coma | semicolon | 6 | | : | dos puntos | colon | 7 | | ! | signo de exclamación | colon | 8 | | ? | signo de interrogación | colon | 9 | | - | guión | minus, dash | 10 | | _ | guión bajo | underscore | 11 | | + | más | plus | 12 | | ~ | tilda | tilde | 13 | | ` | acento grave | backtick | 14 | | / | slash hacia adelante | forward slash | 15 | | \ | slash hacia atrás | back slask | 16 | | "" | comillas | quotation marks | 17 | | () | paréntesis | parentheses | 18 | | [] | paréntesis cuadrados | hard brackets | 19 | | {} | paréntesis rizados | curly brackets | 20 | | \| | ? | pipe | 21 | | <> | ? | carrots, pointy brackets | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # training-docs 2 | This repo contains materials to study and learn about principled data processing, including: 3 | - `onboarding` materials to help you get setup with the right tools and procedures 4 | - `demo-tasks` to walk-through how to use some of those tools in practice 5 | - `templates` to serve as outlines for routine files like Makefiles and scripts in python or R 6 | - `checklists` to refer to as you work and contribute to projects (updating a repo, writing a script, adding a new task) 7 | - `languages` and language-specific tips to consider when writing scripts (like scalability, missingness) 8 | - `notebooks` to walk-through various topics in context of a specific language (ie. set operations in python) 9 | 10 | ## helpful repos 11 | There are a few repos outside of this one that house various tools and/or guidance that may be useful. 12 | 13 | - [sample-project](https://github.com/baileyb0t/sample_project) 14 | - This is a dummy repo to test out git functionality like cloning, pushing, and pull requesting 15 | - [resource-utils/faqs](https://github.com/HRDAG/resource-utils/tree/master/faqs) 16 | - There's a few help articles related to HRDAG workflow, in particular: 17 | 1. 
`data-hacking-on-server.md` includes instructions for making ssh keys and running Jupyter notebooks 18 | 2. `safe-logout.txt` instructions for safely disconnecting from eleanor and notebooks 19 | 3. `data-work-faq.txt` questions we've asked ourselves enough to write down for others 20 | - [resource-utils/notes](https://github.com/HRDAG/resource-utils/tree/master/notes) 21 | - There's a useful document from a previous intern 22 | 1. `internship_notes_2016.md` includes some walk-throughs, suggested tools, frequently used commands 23 | - [gnutools](https://github.com/tarakc02/gnutools/) 24 | - A useful guide to using GNU tools more effectively with examples 25 | - [record-hash-comparisons](https://github.com/tarakc02/record-hash-comparisons) 26 | - An introduction and overview of creating unique identifiers with hashes 27 | - [form-extraction](https://github.com/HRDAG/form-extraction) 28 | - A place for some common tools and code we use to extract info from different kinds of forms 29 | - [tool-suite](https://github.com/baileyb0t/tool-suite) 30 | - A home for some tools related to performance improvements and benchmarking 31 | - [dotfiles](https://github.com/tarakc02/dotfiles) 32 | - An example all kinds of dotfiles you might want to explore and use in your working environment, like `vimrc`, `bash_profile`, `zshrc`, and `gitconfig` 33 | 34 | ## more helpful topics 35 | on vim: 36 | - [Vim Cheat Sheet](https://vim.rtorr.com/) 37 | - [Vim screencast library](http://vimcasts.org/) 38 | - [**Interactive:** Vim adventures!](https://vim-adventures.com/) 39 | 40 | on git: 41 | - [Collaboration with version control](https://datasciencebook.ca/Getting-started-with-version-control.html) 42 | - [Version control for transparency and collaboration](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/02-version-control-1.html) 43 | - [**Interactive:** Learn Git Branching](https://learngitbranching.js.org/?locale=en_US) 44 | 45 | on workflow: 46 | - [Filenames and data science project organization, Integrated development environments](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/03-filenames-project-organization.html) 47 | - [Introduction to testing code for data science](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/06-intro-to-testing-code.html) 48 | - [Non-interactive scripts](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/07-scripts.html) 49 | - [data analysis pipelines](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/09-pipelines.html) 50 | - [Reproducible reports](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/08-reproducible-reports.html) 51 | - [Automated testing and continuous integration](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/materials/lectures/11-continuous-integration.html) 52 | 53 | ## other suggested reading 54 | - [Using Statistics to Assess Lethal Violence in Civil and Inter-State War](https://www.annualreviews.org/doi/pdf/10.1146/annurev-statistics-030718-105222#article-denial) (ask Megan for the PDF if you can't download this one for free!) 
55 | - [Processing scanned documents for investigations of police violence](https://hrdag.org/tech-notes/processing-scanned-docs-for-investigation-police-violence.html) 56 | 57 | ## books 58 | - [R for Data Science](https://r4ds.had.co.nz/) 59 | - [Advanced R](https://adv-r.hadley.nz/) 60 | - [R markdown: The definitive guide](https://bookdown.org/yihui/rmarkdown/) 61 | - [Practical Vim](https://bookarchive.net/pdf/practical-vim/) 62 | - [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/) 63 | - [Data Science: A First Introduction](https://datasciencebook.ca/) 64 | - [Reproducible and Trustworthy Workflows for Data Science](https://ubc-dsci.github.io/reproducible-and-trustworthy-workflows-for-data-science/README.html) 65 | 66 | # done. 67 | -------------------------------------------------------------------------------- /checklists/input.md: -------------------------------------------------------------------------------- 1 | # Input 2 | 3 | There are a couple different ways we encode the inputs to a task, depending on what's \ 4 | appropriate for the project. 5 | 6 | ### Everyone shares I/O 7 | Ideally, we'd all be able to push/pull updates to input files and be sure we're looking\ 8 | at the same content, which means tracking recreatable I/O with version control tools \ 9 | like `snap` or `git`. 10 | 11 | In general, we prefer to use `snap` for buildable input/ and \ 12 | output/ files when the project team shares access to our server, and we track our other \ 13 | dependencies like src/ code, Makefiles, and project framework with `git`. 14 | 15 | When everyone working in the repo has access to `snap`: 16 | - [] Upstream input is symbolically linked into `task/input/` 17 | - [] Activate symlink has been included in a `snap push` call 18 | 19 | ### Everyone recreates I/O 20 | When project visibility is flexible and/or the team has inconsistent server access, we \ 21 | can use `git` to track the project's **initial input/** files and setup downstream \ 22 | Makefiles to look for upstream outputs. 23 | 24 | When project structure is likely to change, relative paths are the least brittle: 25 | - [] Ex) via relative path 26 | ``` 27 | input := ../import/output/mentions.parquet 28 | ``` 29 | 30 | Sometimes, the dependency is sufficiently "far" away in the overall structure, and it's \ 31 | easier to point directly to it: 32 | - [] Ex) via project root 33 | ``` 34 | HERE := $(shell git rev-parse --show-toplevel) 35 | input := $(HERE)/import/output/mentions.parquet 36 | ``` 37 | 38 | # done. 39 | -------------------------------------------------------------------------------- /checklists/makefiles.md: -------------------------------------------------------------------------------- 1 | ### Makefiles 2 | 3 | #### Standard features 4 | - [] Makefile exists at task level (ie. `filter/Makefile`) 5 | - [] Makefile contains 6 | - [] standard script header and footer 7 | - [] `.PHONY all clean` 8 | - [] `all` target listing all targets needed to "build" task 9 | - [] `clean` target that removes existing output 10 | - [] all task input paths (ie. `../import/output/complaints.parquet`) 11 | - [] all task output paths (ie. `output/complaints.parquet`) 12 | - [] all task script paths (ie. 
`src/filter.py`) 13 | 14 | #### Standard functionality 15 | Your makefile works if: 16 | - [] `make clean` successfully clears `output` contents 17 | - [] `make all` successfully rebuilds `output` contents 18 | 19 | 20 | ### Sample 21 | ``` 22 | # vim: set ts=8 sts=0 sw=8 si fenc=utf-8 noet: 23 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 24 | # Authors: BP 25 | # Maintainers: BP 26 | # Copyright: 2023, HRDAG, GPL v2 or later 27 | # ========================================= 28 | # DPA/filter/Makefile 29 | 30 | # ---- dependencies {{{ 31 | input := ../import/output/complaints.parquet 32 | output := output/complaints.parquet 33 | # }}} 34 | 35 | # ---- standard {{{ 36 | .PHONY: all clean 37 | 38 | all: $(output) 39 | 40 | clean: 41 | -rm -r output/* 42 | # }}} 43 | 44 | # ---- task-specific {{{ 45 | $(output):\ 46 | src/filter.R \ 47 | $(input) 48 | -mkdir output 49 | Rscript --vanilla $< \ 50 | --input=$(input) \ 51 | --output=$@ 52 | # }}} 53 | 54 | # done. 55 | ``` 56 | 57 | # done. 58 | -------------------------------------------------------------------------------- /checklists/new-tasks.md: -------------------------------------------------------------------------------- 1 | ### Task directories 2 | 3 | #### Minimum requirements 4 | - [] `src` directory created 5 | - [] Makefile created 6 | - If tracking input, `input` directory should be created (else: not necessary) 7 | - Ideally, the first target in a makefile makes the `output` directory at runtime \ 8 | (usually "-mkdir output") 9 | 10 | #### Supplemental requirements 11 | _If documents are provided alongside partner data,_ 12 | - [] `docs` directory created 13 | - [] relevant doc(s) moved into `docs` 14 | 15 | _If hand-coded input referenced by code,_ 16 | - [] `hand` directory created 17 | - [] hand-coded files saved to `hand` 18 | 19 | _If input/output has been manually altered in task,_ 20 | - [] `frozen` directory created 21 | - [] manually altered data saved to `frozen` 22 | 23 | _If notebooks used in task,_ 24 | - [] `note` directory created 25 | - [] notebook(s) moved to `note` 26 | 27 | #### Task-by-task requirements 28 | _If task is first in series,_ 29 | - [] `import` task directory created 30 | 31 | _If task is last in series,_ 32 | - [] `export` created 33 | - [] `export/Makefile` symbolically links `last-actual-task/output` to `export/output/` 34 | 35 | # done. 36 | -------------------------------------------------------------------------------- /checklists/output.md: -------------------------------------------------------------------------------- 1 | ### Output 2 | 3 | #### General requirements 4 | - [] `task/output/` contents are written by `task/Makefile` 5 | - Ideally, the `output` directory is created during runtime by the first target built 6 | 7 | # done. 
8 | -------------------------------------------------------------------------------- /checklists/repo-handling.md: -------------------------------------------------------------------------------- 1 | ### Repo handling 2 | 3 | #### **Initial** 4 | If you intend to make contributions to a repo, we recommend cloning via ssh \ 5 | ([here's how to set that up](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account)) 6 | - [ ] `git clone git@github.com:HRDAG/resource-utils.git` 7 | 8 | #### **Maintenance** 9 | _Upon returning to repo, before making changes_ 10 | - [ ] `git checkout {your-branch-name}` to make sure you’re on the right branch 11 | - [ ] `git pull` and `snap pull` to make sure you have most recent updates 12 | 13 | _Before leaving repo, after making changes_ 14 | - [] `git add` the changed files along with a message about the changes in a `git commit` 15 | - [ ] `git push` changes to the remote repo 16 | - [ ] If tracking with snap, `snap add` with a commit message and then `snap push` to eleanor 17 | 18 | # done. 19 | -------------------------------------------------------------------------------- /checklists/routines.md: -------------------------------------------------------------------------------- 1 | # routines 2 | Before a leg of data processing or analysis, it's often useful to do a pass at cleaning 3 | and formatting the data for the work ahead. This checklist is meant to provide a sample 4 | outline of data processing steps from import to export. 5 | 6 | Reminder: Development is an iterative process and revisions should be expected as your 7 | skills and understanding of the dataset evolve. Most if not all of the work we create 8 | is added in a logical order overtime as a project progresses and we look for things to 9 | inform our next steps, but it's important to be critical about where a unit of code 10 | "belongs" in the workflow and revise earlier work when doing so would eliminate the 11 | need for corrective steps downstream. 12 | 13 | Repetitive assert statements and logging can seem redundant, and in some cases 14 | can be, but it's useful to rigorously vet and record the characteristics of a 15 | dataset to chart how it changes or doesn't change. In the event debugging is 16 | necessary, abundant asserting and logging habits can save us from an otherwise 17 | overwhelming investigation into the cause of an error between several tasks. It can 18 | also clue us in to a need to re-work a task whose input's changed since our last run, 19 | or catch a major flaw in a recent update that changed the output in unexpected ways. 20 | 21 | ### Import 22 | The import script is meant to read in upstream data before any other task in a series, 23 | acting as the starting point to a leg of processing. It should not do any data 24 | manipulation or formatting of content. 25 | - [] no changes to dataset contents 26 | - [] assert statements for known features (ie. shape, column names, etc) and select values 27 | - [] logging of unique features and a sample of unique values for a few random records 28 | - [] format column names to "clean" equivalent ([per janitor](https://www.rdocumentation.org/packages/janitor/versions/1.2.0/topics/clean_names)) 29 | - [] filetype to .parquet if not already 30 | 31 | ### Filter 32 | The filter script is not always necessary but it can be useful to formalize steps to 33 | drop records that are not suited for downstream work. 
For example, if the next steps 34 | are about cleaning narrative text and setting up indicator variables, then it would be 35 | appropriate to drop records missing narrative text before we do that. If we were 36 | specifically looking at narratives from incidents in the last 10 years, we could apply 37 | that cutoff here, too. 38 | 39 | With small datasets, these decisions don't make much of a change 40 | in processing time, but as a project scales being critical of what work is done on what 41 | records can improve performance and auditability. 42 | 43 | - [] records are not altered but might be dropped from input 44 | - [] assert statements for filtering methods, known features, sample of records 45 | - [] logging of unique features and a sample of unique values for a random sample 46 | 47 | ### Clean 48 | The clean script is the first run at cleaning up data with inconsistent or 49 | unreliable values. 50 | 51 | ##### missingness 52 | A lot of our datasets contain free text fields which are meant to represent some 53 | underlying artifact, like names, jurisdictions, etc. These artifacts can have 54 | boundaries we understand conceptually, like whether negative values are possible 55 | (think measurements of time), and we should do our best to apply the conceptual 56 | boundaries to our data processing. This might mean adding steps to recover salvageable 57 | records or marking them as lost for possible hand labeling at a later time. 58 | 59 | ##### datatypes 60 | We should know some logical boundaries for data fields as well as their ideal or 61 | expected data _type_. This is the place to convert fields to the appropriate type and 62 | make any other cleaning steps needed to do so. 63 | 64 | ##### simplifying 65 | We may want to apply some cleaning ahead of doing ML or analysis, where things like 66 | inconsistent capitalization, erroneous symbols, extra spaces, etc. can add superficial 67 | uniqueness and complicate results. 68 | 69 | Overall: 70 | - [] minimal changes to the data, content structure (ie. only what needs doing) 71 | - [] assert statements for support methods, known features, sample of records 72 | - [] logging of unique features and a sample of unique values for a random sample 73 | - [] fields are appropriate datatypes (missing values are missing, numeric fields numeric) 74 | 75 | ### Export 76 | Here we'd use a makefile to symlink clean/output to export/output. This is the final 77 | task in a series and the data coming out of it is "done" as far as this series of tasks 78 | is concerned. 79 | 80 | - [] if any source code is needed, it is not better suited in an upstream script 81 | 82 | 83 | ***note:*** This example specifies the task order import -> filter -> clean -> export, 84 | but we think critically about the order of steps and ensure they're appropriate for 85 | the data as it actually exists. What might be a reason to reverse the task order? 86 | 87 | done. 
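As a rough illustration of the import-stage asserts and logging described above, a minimal R sketch might look like the following. The file paths, column names, and expected shape here are placeholders for illustration, not taken from a real project.

```r
# sketch of import-stage checks; paths, columns, and shapes are illustrative only
library(arrow)
library(dplyr)
library(janitor)
library(readr)

raw <- read_csv("input/complaints.csv") %>% clean_names()

# assert known features of the input and select values
stopifnot(nrow(raw) > 0, ncol(raw) == 12)
stopifnot(all(c("complaint_id", "allegation_text") %in% names(raw)))
stopifnot(!any(duplicated(raw$complaint_id)))

# log unique features and a small random sample of records for later auditing
writeLines(c(paste("rows:", nrow(raw)),
             paste("unique complaint_id:", n_distinct(raw$complaint_id)),
             paste("sample ids:", paste(sample(raw$complaint_id, 3), collapse = ", "))),
           "output/import.log")

# no changes to dataset contents; just convert the filetype to parquet
write_parquet(raw, "output/complaints.parquet")
```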
88 | -------------------------------------------------------------------------------- /checklists/scripts.md: -------------------------------------------------------------------------------- 1 | ### Scripts 2 | 3 | #### Style requirements 4 | _Script contains:_ 5 | - [] Standard script header and footer 6 | - [] Double-spaces between functions and class definitions 7 | - [] Single-space within code 8 | - [] Logical function and variable names 9 | 10 | _Script does NOT contain:_ 11 | - [] Commented code 12 | - [] Space around variable names as arguments in parentheses 13 | - [] Unused modules 14 | - [] Remnants of abandoned approaches/methods 15 | 16 | #### Core logic requirements 17 | - [] Assert statements that trace I/O along function calls and manipulations 18 | - [] Logging of data characteristics, manipulations performed, and reports 19 | - [] Digestible and readily debuggable units of code 20 | - [] Multiple small scripts instead of a single long script 21 | - [] Ideal solutions per the corresponding language's docs 22 | 23 | ``` 24 | #!/usr/local/bin/python 25 | # -*- coding: utf-8 -*- 26 | # vim: set ts=4 sts=0 sw=4 si fenc=utf-8 et: 27 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 28 | # Authors: BP 29 | # Maintainers: BP 30 | # Copyright: 2023, HRDAG, GPL v2 or later 31 | # ========================================= 32 | # DPA/filter/src/find_officers.py 33 | 34 | # ---- dependencies {{{ 35 | from os import listdir 36 | from pathlib import Path 37 | from sys import stdout 38 | import argparse 39 | import logging 40 | import re 41 | import pandas as pd 42 | #}}} 43 | 44 | # ---- support methods {{{ 45 | def get_args(): 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument("--input", default="output/complaints.parquet") 48 | parser.add_argument("--output", default="output/named-officers.parquet") 49 | args = parser.parse_args() 50 | assert Path(args.input).exists() 51 | return args 52 | 53 | 54 | def get_logger(sname, file_name=None): 55 | logger = logging.getLogger(sname) 56 | logger.setLevel(logging.DEBUG) 57 | formatter = logging.Formatter("%(asctime)s - %(levelname)s " + 58 | "- %(message)s", datefmt='%Y-%m-%d %H:%M:%S') 59 | stream_handler = logging.StreamHandler(stdout) 60 | stream_handler.setFormatter(formatter) 61 | logger.addHandler(stream_handler) 62 | if file_name: 63 | file_handler = logging.FileHandler(file_name) 64 | file_handler.setFormatter(formatter) 65 | logger.addHandler(file_handler) 66 | return logger 67 | 68 | 69 | def find_officer_names(line): 70 | pattern = re.compile("(OFFICER\s[A-Z]+\s[A-Z]+\s[#][0-9]{4,})", flags=re.I|re.M) 71 | found = re.findall(pattern, line) 72 | if (not found) | (found == []): return None 73 | return found 74 | #}}} 75 | 76 | # ---- main {{{ 77 | if __name__ == '__main__': 78 | # setup logging 79 | logger = get_logger(__name__, "output/find_officers.log") 80 | 81 | # arg handling 82 | args = get_args() 83 | 84 | complaints = pd.read_parquet(args.input, columns=['complaint_id', 'allegation_text']) 85 | complaints['named_officers'] = complaints.allegation_text.apply(find_officer_names) 86 | complaints.to_parquet(args.output) 87 | 88 | logger.info("done.") 89 | 90 | #}}} 91 | # done. 92 | ``` 93 | 94 | # done. 
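The sample above is in Python. Many of our tasks are written in R instead; the Makefile sample in `makefiles.md` calls `src/filter.R` with `Rscript --vanilla`, and a rough sketch of how that script might follow the same checklist conventions is below. The filtering rule itself is only a placeholder.

```r
#!/usr/bin/env Rscript
# vim: set ts=4 sts=0 sw=4 si fenc=utf-8 et:
# vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4:
# Authors: BP
# Maintainers: BP
# Copyright: 2023, HRDAG, GPL v2 or later
# =========================================
# DPA/filter/src/filter.R

# ---- dependencies {{{
library(argparse)
library(arrow)
library(dplyr)
# }}}

# ---- support methods {{{
get_args <- function() {
    parser <- ArgumentParser()
    parser$add_argument("--input", default = "../import/output/complaints.parquet")
    parser$add_argument("--output", default = "output/complaints.parquet")
    args <- parser$parse_args()
    stopifnot(file.exists(args$input))
    args
}
# }}}

# ---- main {{{
args <- get_args()
complaints <- read_parquet(args$input)
stopifnot(nrow(complaints) > 0)

# placeholder filtering rule: drop records with no narrative text
out <- complaints %>% filter(!is.na(allegation_text))

# minimal logging of what was done
writeLines(c(paste("rows in:", nrow(complaints)),
             paste("rows out:", nrow(out))),
           "output/filter.log")

write_parquet(out, args$output)
# }}}

# done.
```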
95 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/.gitignore: -------------------------------------------------------------------------------- 1 | input 2 | output 3 | frozen 4 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/README.md: -------------------------------------------------------------------------------- 1 | # Blocking and record linkage (`match`) 2 | 3 | This subdirectory contains an example project structure for record linkage. The 4 | goal of record linkage is to take an input database where multiple records may 5 | refer to the same person (very common if you're integrating multiple data 6 | sources), and output a dataset that has one row per distinct person, by linking 7 | co-referent records and using their content to assemble a synthetic "entity" 8 | record. 9 | 10 | ## Context 11 | 12 | Record linkage will depend on data that is already cleaned and canonicalized. 13 | So the pipeline from raw data up through record linkage will look like: 14 | 15 | 16 | ``` 17 | . 18 | ├── individual 19 | │   ├── SOURCEA 20 | │   ├── SOURCEB 21 | │   └── ... 22 | ├── pool-records 23 | │   ├── import 24 | │   ├── canonicalize 25 | │   ├── ... 26 | │   └── export 27 | ├── match 28 | │   ├── import 29 | │   ├── ... 30 | │   └── export 31 | └── ... (rest of repo) 32 | 33 | ``` 34 | 35 | Where `pool-records` produces a dataframe that contains columns for `recordid` 36 | and `source`, along with the standardized and canonicalized record content. 37 | 38 | If there is nothing to do in the `pool-records` step besides concatenate data 39 | sets (this can happen e.g. when the cleaning and standardizing happened within 40 | each of the `individual` tasks), then that step can be rolled in to 41 | `match/import`. 42 | 43 | ## Parallel `match` tasks 44 | 45 | It's often the case that we need to run several different versions of record 46 | linkage, for example to produce preliminary estimates, or to produce estimates 47 | while including or excluding a specific dataset. In that case, we build 48 | multiple match pipelines alongside each other: 49 | 50 | ``` 51 | . 52 | ├── ... 53 | ├── match 54 | │   ├── phase1-SOURCEA+SOURCEB 55 | │ │   ├── import 56 | │ │   ├── ... 57 | │ │   └── export 58 | │   ├── phase2-SOURCEA+SOURCEB+SOURCEC 59 | │ │   ├── import 60 | │ │   ├── ... 61 | │ │   └── export 62 | │   └── ... 63 | └── ... 64 | ``` 65 | 66 | ## Anatomy of the match task 67 | 68 | ``` 69 | match 70 | ├── import 71 | ├── blocking-features 72 | ├── blocking 73 | ├── compare 74 | ├── classify 75 | ├── cluster 76 | ├── merge 77 | ├── export 78 | ├── TS-draw 79 | ├── TS-import 80 | ├── TS-compare 81 | ├── TS-integrate 82 | └── TS-train 83 | ``` 84 | 85 | There are four major steps to get from input data to de-duplicated entity-level 86 | data: 87 | 88 | - blocking: a coarse filtering for a set of "candidate pairs" that may contain 89 | non-matches, but should include all truly co-referent pairs. Directly 90 | comparing every pair of records in a database is not going to be practical 91 | for datasets that have more than a few thousand records. Reference: [Database 92 | Deduplication to Identify Victims of Human Rights Violations](https://hrdag.org/2016/01/08/a-geeky-deep-dive-database-deduplication-to-identify-victims-of-human-rights-violations/) 93 | 94 | - classify: we use a supervised binary classifier to classify blocked pairs as 95 | either co-referent or not. 
Since calculating features for large numbers of 96 | candidate pairs (example: for the CO project, blocking generates around 90 97 | million pairs) can get complicated and messy, we separate that step into its 98 | own task and call it `compare`. 99 | 100 | - cluster: If the classifier deems A to be co-referent with B, and B to be 101 | co-referent with C, but does not give a high score to the pair (A,C), then 102 | how do we group records into entities? The cluster step takes pairwise 103 | classification scores, and outputs a data frame with two columns, `recordid` 104 | and `entity_id`. All records with the same entity id are considered to refer 105 | to the same person. Reference: [Clustering and solving the right 106 | problem](https://hrdag.org/2016/07/28/clustering-and-solving-the-right-problem/) 107 | 108 | - merge: once we have grouped co-referent records together, we run into the 109 | problem of outputting a single canonical record representing that entity. 110 | Different records referring to the same entity may have conflicting values 111 | for some fields, and so this step needs to make decisions about how to 112 | resolve those discrepancies 113 | 114 | Depending on context and complexity, either or both of blocking and 115 | classification can rely on machine learning models, rather than hand-written 116 | code. The `TS-*` part of the pipeline manages the generation and collection of 117 | training samples to supervise these processes. 118 | 119 | ### Blocking 120 | 121 | ``` 122 | match 123 | ├── ... 124 | ├── blocking-features 125 | ├── blocking 126 | └── ... 127 | ``` 128 | 129 | ### Classification 130 | 131 | ``` 132 | match 133 | ├── ... 134 | ├── compare 135 | ├── classify 136 | └── ... 137 | ``` 138 | 139 | ### Clustering and creating entity records 140 | 141 | ``` 142 | match 143 | ├── ... 144 | ├── cluster 145 | ├── merge 146 | ├── export 147 | └── ... 148 | ``` 149 | 150 | ### Collecting hand-labeled examples: the TS-\* tasks 151 | 152 | ``` 153 | match 154 | ├── ... 155 | ├── TS-draw 156 | ├── TS-import 157 | ├── TS-compare 158 | ├── TS-integrate 159 | └── TS-train 160 | ``` 161 | 162 | **NOTE:** by convention, we represent a pair of records by their recordids with 163 | the smaller one coming first. For recordids `r1` and `r2`, that means `(r1,r2)` 164 | is considered the same pair as `(r2,r1)` 165 | 166 | - `TS-draw`: samples data for labeling. We can sample data in "block" or "pair" 167 | format, there are good reasons to include both types. Sampling might be more 168 | or less random, or might be targeted. All sampling code can live in 169 | `TS-draw`, which also includes code required to export human-reviewable 170 | spreadsheets. 171 | 172 | - `TS-integrate`: eventually we end up with training data from a variety of 173 | sources, including labeled pairs, labeled blocks, not to mention the negative 174 | pairs implied by labeled blocks. This task outputs the full set of labeled 175 | data used to train classifiers, search for blocking rules, and break up large 176 | clusters. There are usually two big outputs: `positive-pairs` is used in 177 | blocking, and can just be a table with two columns, `recordid1` and 178 | `recordid2`. `labeled-pairs` includes both positive and negative pairs, and 179 | importantly will include a large sample of implied negative pairs from 180 | labeled blocks. 181 | 182 | - `TS-compare`: this is the equivalent in the TS cycle to the `compare` task in 183 | the main match pipeline. 
We can usually re-use the same source code for both 184 | tasks, the only difference is the input to `TS-compare` is 185 | `TS-integrate/output/labeled-pairs.xyz` whereas the input to `compare` is 186 | `blocking/output/candidate-pairs.xyz` 187 | 188 | - `TS-train`: train a classifier, using features from `TS-compare` and labels 189 | from `TS-integrate`. 190 | 191 | ## About the example 192 | 193 | Though the techniques used here can be used in a variety of record linkage 194 | contexts, we most frequently use them for deduplicating databases of deaths or 195 | disappearances in conflicts. In that context, each record will have at least 196 | name, location, and date available to match on. 197 | 198 | For training purposes, this repo includes a demo match task across databases of 199 | music, where each row of data describes a single song. Our task is to 200 | deduplicate/match rows that correspond to the same song. In this case, each 201 | record has at least: artist name, song name, album name, release date, 202 | genre(s), and song length available to match on. Though the context is very 203 | different, we'll explore the same workflows and techniques that we use for 204 | deduplicating databases of deaths and disappearances. 205 | 206 | The data for the demo task comes from [CompERBench: Complementing Entity 207 | Matching Benchmark 208 | Tasks](http://data.dws.informatik.uni-mannheim.de/benchmarkmatchingtasks/), and 209 | the `setup` directory downloads the required files using `curl`. To run the 210 | setup task, enter the directory and type `make`: 211 | 212 | ```bash 213 | $ cd setup && make 214 | ``` 215 | 216 | Then, go into the `individual` directory, read the README, run the 217 | individual import tasks, and complete the exercises. 218 | 219 | Once the `indiviual` tasks are built, you can run the `match` pipeline: 220 | 221 | ```bash 222 | $ cd match && make 223 | ``` 224 | 225 | Make sure the pipeline runs without errors or debug as necessary. Then try out 226 | the exercises and further reading suggested in `match/README.md` 227 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/individual/README.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | To run these import tasks, `cd` into each directory and run the relevant 4 | Makefile: 5 | 6 | ```bash 7 | $ cd amazon && make 8 | $ cd itunes && make 9 | ``` 10 | 11 | ## Exercises 12 | 13 | 1. Matching depends on the quality of the data cleaning and standardization. 14 | What other ways would you process these records to improve matching? 15 | 2. Write tests to check whether times and dates are valid. 16 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/individual/amazon/Makefile: -------------------------------------------------------------------------------- 1 | setup := ../../setup/output 2 | input := $(setup)/2_amazon_music.csv 3 | output := output/amz.parquet 4 | src := src/import-amz.R 5 | 6 | .PHONY: all clean 7 | 8 | all: $(output) 9 | 10 | clean: 11 | -rm -r output/* 12 | 13 | $(output): $(src) $(input) 14 | -mkdir output 15 | Rscript --vanilla $< \ 16 | --input=$(input) \ 17 | --output=$@ 18 | 19 | # done. 
20 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/individual/amazon/src/import-amz.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(lubridate) 3 | library(tidyverse) 4 | library(fs) 5 | library(janitor) 6 | library(argparse) 7 | 8 | # args {{{ 9 | parser <- ArgumentParser() 10 | parser$add_argument("--input") 11 | parser$add_argument("--output") 12 | args <- parser$parse_args() 13 | # }}} 14 | 15 | amz <- read_csv(args$input, 16 | col_types = cols(.default = col_character(), 17 | Customer_Rating = col_number())) %>% 18 | janitor::clean_names() 19 | 20 | price2num <- function(price) str_replace_all(price, "\\$", "") %>% as.numeric 21 | 22 | out <- amz %>% 23 | mutate(released = lubridate::mdy(released)) %>% 24 | transmute(recordid = subject_id, 25 | album_name, 26 | artist_name, 27 | song_name, 28 | price = if_else(price == "FREE", "0", price) %>% price2num, 29 | time, 30 | yy_released = lubridate::year(released), 31 | mm_released = lubridate::month(released), 32 | dd_released = lubridate::day(released), 33 | copyright, 34 | genre) 35 | 36 | write_parquet(out, args$output) 37 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/individual/itunes/Makefile: -------------------------------------------------------------------------------- 1 | setup := ../../setup/output 2 | input := $(setup)/1_itunes.csv 3 | output := output/itunes.parquet 4 | src := src/import-itunes.R 5 | 6 | .PHONY: all clean 7 | 8 | all: $(output) 9 | 10 | clean: 11 | -rm -r output/* 12 | 13 | $(output): $(src) $(input) 14 | -mkdir output 15 | Rscript --vanilla $< 16 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/individual/itunes/src/import-itunes.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(fs) 3 | library(tidyverse) 4 | library(lubridate) 5 | library(janitor) 6 | 7 | itn <- read_csv("input/1_itunes.csv", 8 | col_types = cols(.default = col_character(), 9 | Customer_Rating = col_number())) %>% 10 | janitor::clean_names() 11 | 12 | out <- itn %>% 13 | mutate(year_only = str_detect(released, "^[0-9]{4}$"), 14 | dt_released = lubridate::dmy(released), 15 | yy_released = if_else(year_only, 16 | as.numeric(released), 17 | lubridate::year(dt_released))) %>% 18 | transmute(recordid = subject_id, 19 | album_name, 20 | artist_name, 21 | song_name, 22 | price = str_replace_all(price, "\\$", "") %>% as.numeric, 23 | time, 24 | yy_released, 25 | mm_released = lubridate::month(dt_released), 26 | dd_released = lubridate::day(dt_released), 27 | copyright = copy_right, 28 | genre) 29 | 30 | write_parquet(out, "output/itunes.parquet") 31 | 32 | # done. 
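The individual README above asks (exercise 2) for tests that check whether times and dates are valid. The repo's import scripts are written in R; the following is only a sketch of what such a check could look like, written in Python/pandas, with paths taken from the Makefiles above and a guessed `m:ss` / `h:mm:ss` format for the `time` column.

```python
# sketch only; not part of the repo. The time format and date rules are assumptions.
# run from the demo-tasks/record-linkage/individual/ directory
import pandas as pd

def check_imported(path):
    df = pd.read_parquet(path)
    # song lengths should look like "m:ss", "mm:ss", or "h:mm:ss"
    times = df["time"].dropna()
    bad_time = times[~times.str.match(r"^(\d{1,2}:)?\d{1,2}:\d{2}$")]
    # when year/month/day are all present, they should form a real calendar date
    have_ymd = df[["yy_released", "mm_released", "dd_released"]].notna().all(axis=1)
    ymd = (df.loc[have_ymd, ["yy_released", "mm_released", "dd_released"]]
             .astype(int)
             .rename(columns={"yy_released": "year",
                              "mm_released": "month",
                              "dd_released": "day"}))
    n_bad_dates = pd.to_datetime(ymd, errors="coerce").isna().sum()
    assert bad_time.empty, f"{path}: invalid time strings, e.g. {bad_time.iloc[0]}"
    assert n_bad_dates == 0, f"{path}: {n_bad_dates} rows with impossible dates"

for path in ["amazon/output/amz.parquet", "itunes/output/itunes.parquet"]:
    check_imported(path)
```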
33 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all \ 2 | import \ 3 | blocking-features \ 4 | blocking \ 5 | compare \ 6 | classify \ 7 | cluster \ 8 | merge \ 9 | export 10 | 11 | all: export 12 | 13 | export: merge 14 | cd $@ && make 15 | 16 | merge: import cluster 17 | cd $@ && make 18 | 19 | cluster: import classify 20 | cd $@ && make 21 | 22 | classify: compare 23 | cd $@ && make 24 | 25 | compare: import blocking 26 | cd $@ && make 27 | 28 | blocking: blocking-features 29 | cd $@ && make 30 | 31 | blocking-features: import 32 | cd $@ && make 33 | 34 | import: 35 | cd $@ && make 36 | 37 | # done. 38 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/README.md: -------------------------------------------------------------------------------- 1 | ## Instructions 2 | 3 | Update all tasks by running `make`. Review and address any errors if they 4 | occur. 5 | 6 | ## Specific exercises 7 | 8 | ### `blocking` and `blocking-features` 9 | 10 | 1. `TS-import` brings in labeled training data, that is currently not used 11 | anywhere in the repo. Incorporate the labeled data into the blocking 12 | pipeline by writing a script that reports how many of the known true matches 13 | are covered by `blocking/output/candidate-pairs.parquet`. 14 | 2. The reason we separate out the logic to count the number of pairs for a rule 15 | is because we often want to explore many possible blocking rules to see 16 | which would be the "best" in terms of both covering known matches (see 17 | exercise 1) and not generating too many pairs overall. Since we can 18 | calculate the number of pairs that would be generated much faster than we 19 | can produce the pairs, it helps to have a separate script that just does 20 | that. Use the count-pairs script, and your answer to exercise 1, to explore 21 | and compare different rules. Once you feel confident, try to set up a search 22 | like the one described in [Database Deduplication to Identify Victims of 23 | Human Rights 24 | Violations](https://hrdag.org/2016/01/08/a-geeky-deep-dive-database-deduplication-to-identify-victims-of-human-rights-violations/) 25 | 3. A search like the one in step 2 is bottlenecked by how long it takes to 26 | calculate the number of pairs generated by a given rule. Write a 27 | benchmarking script that measures the amount of time required to count the 28 | generated pairs for one or more rules, and then try to write your own 29 | count-pairs script that is more performant that the one provided. 30 | 31 | ## `compare` & `classify` 32 | 33 | 1. if data is big, loading the full pairs data and doing all of our string 34 | distance calculations might take too much memory/time. Modify the compare 35 | script to calculate features one at a time, and then add a final step that 36 | re-combines all of the feature-ized pair data. 37 | 2. similarly, given large data sets we can make better use of our computation 38 | resources by running `classify` on chunks of records, rather than the entire 39 | data set all at once. Update classify to run on chunks of 10,000 pairs at a 40 | time. 41 | 3. currently `classify` is based on some adhoc heuristics. Evaluate the quality 42 | of the solution by comparing to the known labels from `TS-import`. Then 43 | train a machine learning classifier to classify pairs. 
Does this work better 44 | than the heuristics? Feel free to create new/different features in 45 | `compare`, your solution to exercise 1 should allow you to add more 46 | features without running into memory limitations. 47 | 48 | ## `cluster` 49 | 50 | 1. Take a look at the diagnostics presented in 51 | `cluster/output/cluster-summary.yaml`. `degree_distribution` summarizes the 52 | distribution of number of records that a given record is linked to. 53 | `recs_per_entity` summarizes the distribution of unique records associated 54 | with each entity. What do you see there that would concern you about the 55 | match quality. Encode your expectations as a test. What other metrics would 56 | you include in your test? 57 | 58 | 2. Create a version of `cluster` that can pass your expectation tests, i.e. 59 | that does not result in any improbably large clusters. Hint: take a look at 60 | [Clustering and solving the right 61 | problem](https://hrdag.org/2016/07/28/clustering-and-solving-the-right-problem/) 62 | 63 | 64 | ## Further reading 65 | 66 | [string metric wikipedia entry](https://en.wikipedia.org/wiki/String_metric) 67 | 68 | To-do: other readings 69 | 70 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/TS-import/Makefile: -------------------------------------------------------------------------------- 1 | setupdir := ../../setup/output/downloaded 2 | 3 | inputs := $(setupdir)/gs_train.csv \ 4 | $(setupdir)/gs_val.csv \ 5 | $(setupdir)/gs_test.csv 6 | 7 | output := output/labeled-pairs.parquet 8 | 9 | .PHONY: all clean 10 | 11 | all: $(output) 12 | 13 | clean: 14 | -rm -r output/* 15 | 16 | $(output): src/import-pair-labels.R $(inputs) 17 | -mkdir output 18 | Rscript --vanilla $< 19 | 20 | # done. 21 | 22 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/TS-import/src/import-pair-labels.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(logger) 3 | library(arrow) 4 | 5 | read_training <- function(filename) { 6 | df <- read_csv(filename, col_types = 'ccl') 7 | log_info("file {basename(filename)} has {nrow(df)} rows") 8 | out <- df %>% 9 | transmute(recordid1 = pmin(source_id, target_id), 10 | recordid2 = pmax(source_id, target_id), 11 | matching) %>% 12 | distinct 13 | log_info("file {basename(filename)} processed, {nrow(out)} distinct pairs") 14 | return(out) 15 | } 16 | 17 | fnames <- c("gs_train", "gs_val", "gs_test") 18 | paths <- file.path("../../setup/output/downloaded", str_c(fnames, ".csv")) 19 | 20 | labs <- map_dfr(paths, read_training) 21 | pospairs <- filter(labs, matching) %>% distinct(recordid1, recordid2) 22 | 23 | write_parquet(labs, "output/labeled-pairs.parquet") 24 | write_parquet(pospairs, "output/positive-pairs.parquet") 25 | 26 | # done. 27 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/blocking-features/Makefile: -------------------------------------------------------------------------------- 1 | input := ../import/output/music.parquet 2 | output := output/blockfeats.parquet 3 | src := src/feats.R 4 | 5 | .PHONY: all clean 6 | 7 | all: $(output) 8 | 9 | clean: 10 | -rm -r output/* 11 | 12 | $(output): $(src) $(input) 13 | -mkdir output 14 | Rscript --vanilla $< 15 | 16 | # done. 
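For exercise 1 in the match README above (how many known true matches survive blocking), one possible starting point is sketched below. It is not part of the repo, and it uses Python/pandas even though the pipeline's own scripts are R; the paths come from the `TS-import` and `blocking` tasks.

```python
# sketch: pair-level recall of the blocking rules, run from the match/ directory
import pandas as pd

truth = pd.read_parquet("TS-import/output/positive-pairs.parquet")
candidates = pd.read_parquet("blocking/output/candidate-pairs.parquet")

# both tables already store pairs with the smaller recordid first,
# so a plain inner merge identifies the covered true matches
covered = truth.merge(candidates, on=["recordid1", "recordid2"], how="inner")
print(f"true pairs covered by blocking: {len(covered)}/{len(truth)} "
      f"({len(covered) / len(truth):.1%})")
```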
17 | 18 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/blocking-features/src/feats.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | library(stringdist) 4 | 5 | mz <- read_parquet("../import/output/music.parquet") 6 | 7 | calc_secs <- function(time_segments) { 8 | if (length(time_segments) > 3) return(NA_real_) 9 | segs <- as.numeric(time_segments) 10 | if (length(time_segments) == 1) return(segs) 11 | if (length(time_segments) == 2) return(60*segs[1] + segs[2]) 12 | 360*segs[1] + 60*segs[2] + segs[3] 13 | } 14 | 15 | time2secs <- function(timestring) { 16 | map_dbl(str_split(timestring, fixed(":")), calc_secs) 17 | } 18 | 19 | cleanstring <- function(string) { 20 | cln <- str_to_lower(string) %>% 21 | str_replace_all("[^a-z0-9 ]", " ") %>% 22 | str_squish 23 | if_else(cln == "", NA_character_, cln) 24 | } 25 | 26 | padnum <- function(num) str_pad(num, width=2, side = "left", pad = "0") 27 | sortgenre <- function(g) str_split(g, ",") %>% 28 | map(str_squish) %>% 29 | map(unique) %>% 30 | map(sort) 31 | 32 | out <- mz %>% 33 | mutate(genres = sortgenre(genre), 34 | glen = map_int(genres, length), 35 | genre_1 = map_chr(genres, 1), 36 | genre_n = map2_chr(genres, glen, `[`), 37 | across(c(album_name, artist_name, song_name, 38 | genre, genre_1, genre_n), 39 | cleanstring)) %>% 40 | transmute( 41 | recordid, 42 | album_name_first_5 = str_sub(album_name, 1, 5), 43 | artist_name_sx = phonetic(artist_name, method = "soundex"), 44 | song_name_first_5 = str_sub(song_name, 1, 5), 45 | genre_1, genre_n, 46 | time_secs = time2secs(time), 47 | time_secs_round10 = round(time_secs/10, digits=0) * 10, 48 | time_secs_round100 = round(time_secs/100, digits=0) * 100, 49 | released_yrmo = str_c(padnum(yy_released), padnum(mm_released)) 50 | ) 51 | 52 | write_parquet(out, "output/blockfeats.parquet") 53 | 54 | # done. 55 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/blocking/Makefile: -------------------------------------------------------------------------------- 1 | expected := output/expected-pair-counts.yaml 2 | pairs := output/candidate-pairs.parquet 3 | features := ../blocking-features/output/blockfeats.parquet 4 | 5 | .PHONY: all clean 6 | 7 | all: $(expected) $(pairs) 8 | 9 | clean: 10 | -rm -r output/* 11 | 12 | $(expected): src/count-pairs.R $(features) 13 | -mkdir output 14 | Rscript --vanilla $< 15 | 16 | $(pairs): src/generate-pairs.R $(features) 17 | -mkdir output 18 | Rscript --vanilla $< 19 | 20 | # done. 
21 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/blocking/hand/rules.yaml: -------------------------------------------------------------------------------- 1 | r1: 2 | - artist_name_sx 3 | - song_name_first_5 4 | - released_yrmo 5 | 6 | r2: 7 | - album_name_first_5 8 | - artist_name_sx 9 | - released_yrmo 10 | - time_secs_round10 11 | 12 | r3: 13 | - genre_1 14 | - released_yrmo 15 | - time_secs_round10 16 | - genre_n 17 | 18 | r4: 19 | - artist_name_sx 20 | - album_name_first_5 21 | - time_secs_round10 22 | - genre_1 23 | - genre_n 24 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/blocking/src/count-pairs.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | library(yaml) 4 | library(furrr) 5 | 6 | mz <- read_parquet("../blocking-features/output/blockfeats.parquet") 7 | all_rules <- read_yaml("hand/rules.yaml") 8 | 9 | count_pairs <- function(rules, data) { 10 | smry <- data %>% 11 | filter(if_all(all_of(rules), ~!is.na(.))) %>% 12 | count(across(all_of(rules))) 13 | ns <- smry$n 14 | sum(ns * (ns - 1) / 2) 15 | } 16 | 17 | plan(multicore) 18 | rule_pair_counts <- future_map_dbl(all_rules, count_pairs, data = mz) 19 | plan(sequential) 20 | out <- c(rule_pair_counts, "total" = sum(rule_pair_counts)) %>% 21 | as.list 22 | 23 | write_yaml(out, "output/expected-pair-counts.yaml", ) 24 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/blocking/src/generate-pairs.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(furrr) 3 | library(tidyverse) 4 | library(logger) 5 | library(yaml) 6 | 7 | mz <- read_parquet("../blocking-features/output/blockfeats.parquet") 8 | rules <- read_yaml("hand/rules.yaml") 9 | 10 | log_appender(appender_file("output/pair-generation.log")) 11 | 12 | topairs <- function(recordids) { 13 | expand_grid(recordid1 = recordids, 14 | recordid2 = recordids) %>% 15 | filter(recordid1 < recordid2) 16 | } 17 | 18 | generate_pairs <- function(rules, data) { 19 | groups <- data %>% 20 | filter(if_all(all_of(rules), ~!is.na(.))) %>% 21 | group_by(across(all_of(rules))) %>% 22 | group_split %>% keep(~nrow(.) 
> 1) 23 | out <- map(groups, pluck, "recordid") %>% 24 | future_map_dfr(topairs) 25 | log_info("{str_c(rules, collapse = ',')}: {nrow(out)}") 26 | return(out) 27 | } 28 | 29 | plan(multisession) 30 | pairs <- map_dfr(rules, generate_pairs, data = mz) %>% distinct 31 | plan(sequential) 32 | 33 | write_parquet(pairs, "output/candidate-pairs.parquet") 34 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/classify/Makefile: -------------------------------------------------------------------------------- 1 | features := ../compare/output/pair-classifier-features.parquet 2 | labels := output/match-predictions.parquet 3 | classifier := src/classify.R 4 | 5 | .PHONY: all clean 6 | 7 | all: $(labels) 8 | 9 | clean: 10 | -rm -r output/* 11 | 12 | $(labels): $(classifier) $(features) 13 | -mkdir output 14 | Rscript --vanilla $< 15 | 16 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/classify/src/classify.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | 4 | fname <- "../compare/output/pair-classifier-features.parquet" 5 | feats <- read_parquet(fname) 6 | 7 | predictions <- feats %>% 8 | mutate(matchscore = case_when( 9 | yrmo & album_name > .9 & song_name > .9 & artist_name > .9 ~ 1, 10 | album_name >= 1 & song_name >= 1 & artist_name >= 1 ~ 1, 11 | yrmo & 12 | genre_overlap > 2 & 13 | artist_name > .5 & 14 | album_name > .5 & 15 | song_name > .5 ~ 1, 16 | genre_overlap > 2 & song_name > .9 & artist_name > .7 ~ 1, 17 | yrmo & album_name > .8 & song_name > .8 ~ 1, 18 | TRUE ~ 0)) %>% 19 | select(recordid1, recordid2, matchscore) 20 | 21 | write_parquet(predictions, "output/match-predictions.parquet") 22 | 23 | # done. 
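The classify heuristics above can be scored against the hand labels brought in by `TS-import` (exercise 3 in the match README). A minimal sketch follows, in Python/pandas rather than the repo's R, assuming it is run from the `classify` task directory:

```python
# sketch only: precision/recall of the heuristic match scores on labeled pairs
import pandas as pd

preds = pd.read_parquet("output/match-predictions.parquet")
labels = pd.read_parquet("../TS-import/output/labeled-pairs.parquet")

both = labels.merge(preds, on=["recordid1", "recordid2"], how="inner")
pred_pos = both["matchscore"] >= 0.5
tp = (both["matching"] & pred_pos).sum()
fp = (~both["matching"] & pred_pos).sum()
fn = (both["matching"] & ~pred_pos).sum()
print(f"precision={tp / (tp + fp):.3f}, recall={tp / (tp + fn):.3f} "
      "(on labeled pairs that survived blocking)")
```

Note that this only evaluates pairs that blocking produced and that were hand labeled; true matches dropped by blocking never reach the classifier.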
24 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/cluster/Makefile: -------------------------------------------------------------------------------- 1 | recs := ../import/output/music.parquet 2 | labs := ../classify/output/match-predictions.parquet 3 | 4 | entities := output/entity-ids.parquet 5 | summary := output/cluster-summary.yaml 6 | 7 | .PHONY: all clean 8 | 9 | all: $(entities) $(summary) 10 | 11 | clean: 12 | -rm -r output/* 13 | 14 | $(entities): src/cluster.R $(recs) $(labs) 15 | -mkdir output 16 | Rscript --vanilla $< 17 | 18 | $(summary): src/make-summary.R $(entities) 19 | Rscript --vanilla $< 20 | 21 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/cluster/src/cluster.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(digest) # calculate hashes to create identifiers 3 | library(tidyverse) 4 | library(tidygraph) 5 | 6 | recs <- read_parquet("../import/output/music.parquet") 7 | labs <- read_parquet("../classify/output/match-predictions.parquet") 8 | 9 | 10 | threshold <- .5 11 | 12 | edges <- labs %>% 13 | filter(matchscore >= threshold) %>% 14 | transmute(from = recordid1, to = recordid2, matchscore) 15 | 16 | graph <- tbl_graph( 17 | nodes = recs %>% select(recordid), 18 | edges = edges, 19 | directed = FALSE, 20 | node_key = "recordid") 21 | 22 | connected_components <- graph %>% 23 | activate("nodes") %>% 24 | mutate(component = group_components(type = "weak"), 25 | degree = centrality_degree()) %>% 26 | as_tibble("nodes") 27 | 28 | make_id <- function(recordids) digest(sort(unique(recordids)), algo = "sha1") 29 | 30 | # now generate unique identifiers for the merged records: 31 | out <- connected_components %>% 32 | group_by(component) %>% 33 | mutate(entity_id = make_id(recordid), 34 | entity_nrecs = n_distinct(recordid)) %>% 35 | ungroup 36 | 37 | stopifnot(setequal(out$recordid, recs$recordid)) 38 | stopifnot(nrow(out) == length(unique(out$recordid))) 39 | 40 | write_parquet(out, "output/entity-ids.parquet") 41 | 42 | # done. 43 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/cluster/src/make-summary.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | library(yaml) 4 | 5 | ents <- read_parquet("output/entity-ids.parquet") 6 | 7 | degree_distribution <- ents %>% 8 | distinct(recordid, degree) %>% 9 | pluck("degree") %>% quantile(c(seq(.5,.9,.1), .95, .99, 1)) 10 | 11 | recs_per_entity <- ents %>% 12 | distinct(entity_id, entity_nrecs) %>% 13 | pluck("entity_nrecs") %>% 14 | quantile(c(seq(.5, .9, .1), .95, .99, 1)) 15 | 16 | n_entity <- length(unique(ents$entity_id)) 17 | 18 | smry <- list( 19 | degree_distribution = as.list(degree_distribution), 20 | recs_per_entity = as.list(recs_per_entity), 21 | n_entity = n_entity, 22 | n_recs = nrow(ents)) 23 | 24 | write_yaml(smry, "output/cluster-summary.yaml", ) 25 | 26 | # done. 
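Exercise 1 for `cluster` asks for the expectations about these diagnostics to be encoded as a test. A small sketch of what that could look like, reading the summary produced above; the thresholds are invented for illustration, and it is written in Python rather than the repo's R:

```python
# sketch: fail loudly if clustering produces implausibly large entities
import yaml

with open("output/cluster-summary.yaml") as f:
    smry = yaml.safe_load(f)

# with only two sources, an entity made of dozens of records is suspicious
assert smry["recs_per_entity"]["100%"] <= 10, "implausibly large cluster"
# most records should link to only a handful of other records
assert smry["degree_distribution"]["90%"] <= 5, "match graph looks too dense"
```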
27 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/compare/Makefile: -------------------------------------------------------------------------------- 1 | mz := ../import/output/music.parquet 2 | pairs := ../blocking/output/candidate-pairs.parquet 3 | output := output/pair-classifier-features.parquet 4 | 5 | .PHONY: all clean 6 | 7 | all: $(output) 8 | 9 | clean: 10 | -rm -r output/* 11 | 12 | $(output): src/compare.R $(mz) $(pairs) 13 | -mkdir output 14 | Rscript --vanilla $< 15 | 16 | # done. 17 | 18 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/compare/src/compare.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | library(stringdist) 4 | 5 | cleanstring <- function(string) { 6 | cln <- str_to_lower(string) %>% 7 | str_replace_all("[^a-z0-9 ]", " ") %>% 8 | str_squish 9 | if_else(cln == "", NA_character_, cln) 10 | } 11 | 12 | mz <- read_parquet("../import/output/music.parquet") 13 | pairs <- read_parquet("../blocking/output/candidate-pairs.parquet") 14 | 15 | processed <- mz %>% 16 | mutate(across(c(album_name, artist_name, song_name), 17 | cleanstring), 18 | genre = str_to_lower(genre) %>% str_squish %>% str_split(",")) %>% 19 | select(recordid, 20 | album_name, artist_name, song_name, 21 | yy_released, mm_released, dd_released, genre) 22 | 23 | similarity <- function(string1, string2, method = "cosine") { 24 | 1 - stringdist(string1, string2, method = method, nthread = 11) 25 | } 26 | 27 | feats <- pairs %>% 28 | inner_join(processed, by = c("recordid1" = "recordid")) %>% 29 | inner_join(processed, by = c("recordid2" = "recordid"), 30 | suffix = c("_1", "_2")) %>% 31 | mutate(genre_overlap = map2_dbl(genre_1, genre_2, ~length(intersect(.x, .y))), 32 | yrmo = yy_released_1 == yy_released_2 & mm_released_1 == mm_released_2, 33 | album_name = similarity(album_name_1, album_name_2), 34 | song_name = similarity(song_name_1, song_name_2), 35 | artist_name = similarity(artist_name_1, artist_name_2, method = "jw"), 36 | genre_overlap) %>% 37 | select(-ends_with("_1"), -ends_with("_2")) 38 | 39 | write_parquet(feats, "output/pair-classifier-features.parquet") 40 | 41 | # done. 42 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/export/Makefile: -------------------------------------------------------------------------------- 1 | all: output/music-entities.parquet 2 | 3 | output/music-entities.parquet: ../merge/output/music-entities.parquet 4 | -mkdir output 5 | cd output && ln -s ../$< 6 | 7 | # done. 
8 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/import/Makefile: -------------------------------------------------------------------------------- 1 | 2 | amz := ../../individual/amazon/output/amz.parquet 3 | itn := ../../individual/itunes/output/itunes.parquet 4 | output := output/music.parquet 5 | src := src/concatenate.R 6 | 7 | .PHONY: all clean 8 | 9 | all: $(output) 10 | 11 | clean: 12 | -rm -r output/* 13 | 14 | $(output): $(src) $(amz) $(itn) 15 | -mkdir output 16 | Rscript --vanilla $< 17 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/import/src/concatenate.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | library(tools) 4 | 5 | fname <- function(x) file_path_sans_ext(basename(x)) 6 | 7 | time2seconds <- function(string) { 8 | ms <- str_split(string, fixed(":")) 9 | } 10 | 11 | files <- c("../../individual/amazon/output/amz.parquet", 12 | "../../individual/itunes/output/itunes.parquet") %>% 13 | set_names(fname) 14 | 15 | music <- map_dfr(files, read_parquet, .id = "source") %>% 16 | select(recordid, source, everything()) 17 | 18 | write_parquet(music, "output/music.parquet") 19 | 20 | # done. 21 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/merge/Makefile: -------------------------------------------------------------------------------- 1 | recs := ../import/output/music.parquet 2 | ents := ../cluster/output/entity-ids.parquet 3 | output := output/music-entities.parquet 4 | 5 | .PHONY: all clean 6 | 7 | all: $(output) 8 | 9 | clean: 10 | -rm -r output/* 11 | 12 | $(output): src/choose-canonical.R $(recs) $(ents) 13 | -mkdir output 14 | Rscript --vanilla $< 15 | 16 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/match/merge/src/choose-canonical.R: -------------------------------------------------------------------------------- 1 | library(arrow) 2 | library(tidyverse) 3 | 4 | recs <- read_parquet("../import/output/music.parquet") 5 | ents <- read_parquet("../cluster/output/entity-ids.parquet") 6 | 7 | pick_value <- function(values) { 8 | # confusingly, this `mode` is looking for `character` vs. `integer` etc., 9 | # NOT the statistical mode. 
since we use names() to identify the modal 10 | # value below, we have to convert back to original data 11 | # type before returning 12 | original_type <- mode(values) 13 | if (all(is.na(values))) return(values[1]) 14 | frequencies <- table(values, useNA = "no") 15 | if (length(frequencies) == 1) { 16 | res <- names(frequencies)[1] 17 | } else { 18 | modal_value <- max(frequencies) 19 | remaining_candidates <- names(which(frequencies == modal_value)) 20 | res <- sample(remaining_candidates, 1) 21 | } 22 | as(res, original_type) 23 | } 24 | 25 | pad <- function(num) str_pad(num, width = 2, side = "left", pad = "0") 26 | 27 | cols_to_keep <- c("album_name", 28 | "song_name", 29 | "price", 30 | "time", 31 | "date_released", 32 | "genre") 33 | 34 | out <- ents %>% 35 | inner_join(recs, by = "recordid") %>% 36 | mutate(across(c(mm_released, dd_released), pad)) %>% 37 | mutate(date_released = str_c(yy_released, mm_released, dd_released, 38 | sep = "-")) %>% 39 | group_by(entity_id) %>% 40 | summarise(in_amz = max(source == "amz"), 41 | in_itn = max(source == "itunes"), 42 | across(all_of(cols_to_keep), pick_value), 43 | recordids = str_c(recordid, collapse=",")) 44 | 45 | stopifnot( 46 | setequal(out$entity_id, ents$entity_id), 47 | nrow(out) == length(unique(out$entity_id)) 48 | ) 49 | 50 | write_parquet(out, "output/music-entities.parquet") 51 | -------------------------------------------------------------------------------- /demo-tasks/record-linkage/setup/Makefile: -------------------------------------------------------------------------------- 1 | path := "http://data.dws.informatik.uni-mannheim.de/benchmarkmatchingtasks/data/music_(iTunes-Amazon)" 2 | filenames := gs_train.csv gs_val.csv gs_test.csv feature_vector.zip records.zip 3 | 4 | .PHONY: all clean 5 | 6 | all: $(patsubst %,output/downloaded/%,$(filenames)) \ 7 | output/2_amazon_music.csv \ 8 | output/1_itunes.csv 9 | 10 | clean: 11 | -rm -r output/* 12 | 13 | output/downloaded/%: 14 | curl $(path)/$* --create-dirs -o $@ 15 | 16 | output/%.csv: output/downloaded/records.zip 17 | unzip $< record_descriptions/$(shell basename $@) -d output 18 | mv output/record_descriptions/$*.csv output/ 19 | rm -r output/record_descriptions 20 | touch $@ 21 | 22 | # done. 23 | -------------------------------------------------------------------------------- /glossary.md: -------------------------------------------------------------------------------- 1 | # Glossary 2 | 3 | In training, troubleshooting, or general, we will use words that may be unfamiliar. 4 | In an effort to get us on the same page, here's what we mean. 5 | 6 | - **algorithm:** In very simple terms, an algorithm is a routine. Some algorithms are so common we know them by name, each one is unique and comes with pros, cons, and best use cases. If you've ever tried to search or sort data, an algorithm was deployed to perform that work for you. 7 | - **bayesian:** 8 | - **bias:** There are several specific types of bias that we consider and test for in our data, but at the core these refer to some factor having an effect on data or results. We might have an algorithm or model that introduces bias we have to account for, we might observe bias introduced by human data collection. 9 | - **build:** We want our work to be reproducible and _rebuildable_. We use Makefiles to setup rules for how to rebuild a particular output by synthesizing source code with input(s). 
10 | - **classify / classifier:** 11 | - **clean:** Data cleaning refers to the process of reading in data and applying some methods to get it into better shape for downstream analysis. This might include removing extra whitespace, converting numeric data to the appropriate datatype (i.e. integer, float, datetime, timedelta), splitting fields that contain multiple types of info, etc. Although some methods are more common than others, your cleaning routine is unique to your data. 12 | - **compiler:** There are functional distinctions between a compiler and an interpreter, but the important thing to know is which one the language(s) you're writing in use. Java and C use a compiler and require you to have a deeper understanding of how the computer receives instructions. These are referred to as _lower level_ languages because they are closer to the ground or machine level and there is no interpreter to meet you halfway. These tend to run quicker once compiled, and there are even some parts of interpreted language code that compile to C. 13 | - **complexity:** 14 | - **conditional probability:** 15 | - **confidence interval:** 16 | - **control vocabulary:** If I ask you to read a narrative and tell me whether a home invasion is mentioned, how can you be sure we are on the same page about what qualifies as a home invasion? We use control vocabulary to create a common language for tasks where we might accidentally operate on different interpretations of the same prompt. 17 | - **convenience sample:** A convenience sample is a sample taken conveniently. If I walk around my campus or office and ask anyone who walks by me a particular question, I took a convenience sample. If you respond to a survey on a website you were already on, you were part of a convenience sample. 18 | - **correlation:** Two things are correlated if they share some (potentially unknown) attribute and this common link affects both things. The classic statistics example is shark attacks and ice cream sales increasing at the same time. If you saw it as a headline, you might mistakenly assume _causation_ is being suggested, but if you think about it, these events more likely share a common feature: summertime, so it's more likely there is a correlation between shark attacks and ice cream sales. 19 | - **credible interval:** 20 | - **debug:** 21 | - **deduplication:** 22 | - **dependency:** 23 | - **deterministic:** 24 | - **disjoint:** 25 | - **distribution:** 26 | - **frequentist:** 27 | - **hashid:** 28 | - **heuristic:** 29 | - **imputation:** 30 | - **independent / dependent:** 31 | - **inference:** 32 | - **interpreter:** There are functional distinctions between a compiler and an interpreter, but the important thing to know is which one the language(s) you're writing in use. Python and R both use an interpreter, and this enables the programmer to mind fewer syntax rules and be less in touch with how the computer understands the work being done. These can be slower programs to run for exactly that reason, but that doesn't mean they aren't remarkably useful tools that can run effectively. Know that the extra work the interpreter is doing does not solve all the problems that can arise, and in no way should you slack on writing good code! 33 | - **interrater reliability:** 34 | - **language:** You write code in a programming language. There are many languages you could choose from, each has pros and cons and ideal use cases. 
35 | - **margin of error:** A margin of error is a window above and below an estimate that puts parameters on under- and over-counting that might've occurred and what the true number would be in that case. 36 | - **marginal probability:** 37 | - **mean/median/mode/stddev:** 38 | - **model / selection / evaluation / parameter:** 39 | - **multiple systems estimation (capture-recapture):** 40 | - **ocr:** 41 | - **pair programming:** Sometimes we end up doing this in code review, but in general it refers to a practice of sitting side-by-side and writing code together, usually for the same project. 42 | - **posterior:** 43 | - **prior:** 44 | - **probabilistic:** 45 | - **push:** When we talk about pushing code, we're talking about sending your code and/or changes to the remote place where the project lives. If your project is tracked by git (via GitHub, in our case), then you'll push to GitHub. If your project is tracked by snap, then you'll push to snap. 46 | - **refactor:** 47 | - **repo:** We use GitHub, so our projects live in GitHub repos. These are basically fancy directories or folders that give us useful functionality like version control 48 | - **sample:** 49 | - **set / set theory:** 50 | - **sparse / sparsity:** 51 | - **task:** 52 | - **transitive closure:** Suppose we have 3 sets, A, B, and C. I run a clustering algorithm to help me identify if these all refer to the same underlying set. The results suggest that A is the same as B, and B is the same as C, but _not_ that A and C are the same. What do I do? That's the problem of transitive closure. 53 | - **uncertainty:** 54 | - **variance:** 55 | - **version control:** What happens if we remove some code we thought we didn't need, only to discover we did need it? This is why we use a version control system (namely, git). If we've set this up, we can roll back to any previous version (or directly view the code we're missing on GitHub from the commit history) and restore our work. 56 | 57 | <---- done ----> 58 | -------------------------------------------------------------------------------- /languages/R/Makefile: -------------------------------------------------------------------------------- 1 | # vim: set ts=8 sts=0 sw=8 si fenc=utf-8 noet: 2 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 3 | # Authors: TS 4 | # Maintainers: TS 5 | # Copyright: 2022, HRDAG, GPL v2 or later 6 | # ========================================= 7 | 8 | .PHONY: all clean 9 | 10 | sections := performance 11 | 12 | all: $(patsubst %,output/%.md,$(sections)) 13 | 14 | clean: 15 | -rm -r output/* 16 | 17 | output/%.md: src/%.Rmd 18 | -mkdir output 19 | Rscript -e "rmarkdown::render('$<')" 20 | mv src/$*.md $@ 21 | 22 | -------------------------------------------------------------------------------- /languages/R/README.md: -------------------------------------------------------------------------------- 1 | Guide 2 | 3 | - [performance](output/performance.md) goes over some general advice for fixing 4 | performance bugs and writing performant enough R code. 
5 | -------------------------------------------------------------------------------- /languages/julia/.gitignore: -------------------------------------------------------------------------------- 1 | Manifest.toml 2 | -------------------------------------------------------------------------------- /languages/julia/Makefile: -------------------------------------------------------------------------------- 1 | # vim: set ts=8 sts=0 sw=8 si fenc=utf-8 noet: 2 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 3 | # Authors: TS 4 | # Maintainers: TS 5 | # Copyright: 2022, HRDAG, GPL v2 or later 6 | # ========================================= 7 | 8 | .PHONY: all clean 9 | 10 | all: \ 11 | output/dfs.md \ 12 | output/basics.md 13 | 14 | clean: 15 | -rm -r output/* 16 | 17 | Manifest.toml: Project.toml 18 | julia --project -e "using Pkg; Pkg.resolve(); Pkg.instantiate()" 19 | 20 | output/%.md: src/%.jl Manifest.toml 21 | -mkdir output 22 | julia --project --threads=auto src/compile-md.jl $< output 23 | 24 | # done. 25 | -------------------------------------------------------------------------------- /languages/julia/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" 3 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 4 | DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" 5 | Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 6 | Query = "1a8c2f83-1ff3-5112-b086-8aa67b057ba1" 7 | -------------------------------------------------------------------------------- /languages/julia/README.md: -------------------------------------------------------------------------------- 1 | # Guide 2 | 3 | - [basics](output/basics.md) 4 | - [dataframes](output/dfs.md). Uses the [mtcars 5 | dataset](https://rdrr.io/r/datasets/mtcars.html) dataset that comes with R, 6 | which is originally from: Henderson and Velleman (1981), Building multiple 7 | regression models interactively. *Biometrics*, 37, 391–411. 
8 | -------------------------------------------------------------------------------- /languages/julia/input/mtcars.csv: -------------------------------------------------------------------------------- 1 | car|mpg|cyl|disp|hp|drat|wt|qsec|vs|am|gear|carb 2 | Mazda RX4|21|6|160|110|3.9|2.62|16.46|0|1|4|4 3 | Mazda RX4 Wag|21|6|160|110|3.9|2.875|17.02|0|1|4|4 4 | Datsun 710|22.8|4|108|93|3.85|2.32|18.61|1|1|4|1 5 | Hornet 4 Drive|21.4|6|258|110|3.08|3.215|19.44|1|0|3|1 6 | Hornet Sportabout|18.7|8|360|175|3.15|3.44|17.02|0|0|3|2 7 | Valiant|18.1|6|225|105|2.76|3.46|20.22|1|0|3|1 8 | Duster 360|14.3|8|360|245|3.21|3.57|15.84|0|0|3|4 9 | Merc 240D|24.4|4|146.7|62|3.69|3.19|20|1|0|4|2 10 | Merc 230|22.8|4|140.8|95|3.92|3.15|22.9|1|0|4|2 11 | Merc 280|19.2|6|167.6|123|3.92|3.44|18.3|1|0|4|4 12 | Merc 280C|17.8|6|167.6|123|3.92|3.44|18.9|1|0|4|4 13 | Merc 450SE|16.4|8|275.8|180|3.07|4.07|17.4|0|0|3|3 14 | Merc 450SL|17.3|8|275.8|180|3.07|3.73|17.6|0|0|3|3 15 | Merc 450SLC|15.2|8|275.8|180|3.07|3.78|18|0|0|3|3 16 | Cadillac Fleetwood|10.4|8|472|205|2.93|5.25|17.98|0|0|3|4 17 | Lincoln Continental|10.4|8|460|215|3|5.424|17.82|0|0|3|4 18 | Chrysler Imperial|14.7|8|440|230|3.23|5.345|17.42|0|0|3|4 19 | Fiat 128|32.4|4|78.7|66|4.08|2.2|19.47|1|1|4|1 20 | Honda Civic|30.4|4|75.7|52|4.93|1.615|18.52|1|1|4|2 21 | Toyota Corolla|33.9|4|71.1|65|4.22|1.835|19.9|1|1|4|1 22 | Toyota Corona|21.5|4|120.1|97|3.7|2.465|20.01|1|0|3|1 23 | Dodge Challenger|15.5|8|318|150|2.76|3.52|16.87|0|0|3|2 24 | AMC Javelin|15.2|8|304|150|3.15|3.435|17.3|0|0|3|2 25 | Camaro Z28|13.3|8|350|245|3.73|3.84|15.41|0|0|3|4 26 | Pontiac Firebird|19.2|8|400|175|3.08|3.845|17.05|0|0|3|2 27 | Fiat X1-9|27.3|4|79|66|4.08|1.935|18.9|1|1|4|1 28 | Porsche 914-2|26|4|120.3|91|4.43|2.14|16.7|0|1|5|2 29 | Lotus Europa|30.4|4|95.1|113|3.77|1.513|16.9|1|1|5|2 30 | Ford Pantera L|15.8|8|351|264|4.22|3.17|14.5|0|1|5|4 31 | Ferrari Dino|19.7|6|145|175|3.62|2.77|15.5|0|1|5|6 32 | Maserati Bora|15|8|301|335|3.54|3.57|14.6|0|1|5|8 33 | Volvo 142E|21.4|4|121|109|4.11|2.78|18.6|1|1|4|2 34 | -------------------------------------------------------------------------------- /languages/julia/output/basics.md: -------------------------------------------------------------------------------- 1 | ## Functions 2 | 3 | Functions work basically the way you think they would: 4 | 5 | ````julia 6 | function add(x, y) 7 | return x + y 8 | end 9 | add(1, 2) 10 | ```` 11 | 12 | ```` 13 | 3 14 | ```` 15 | 16 | The return value of a function is the value of the last evaluated expression in 17 | the function, so we could also have written: 18 | 19 | ```julia 20 | function add(x, y) 21 | x + y 22 | end 23 | ``` 24 | 25 | Furthermore, there's a compact form for short function definitions: 26 | 27 | ````julia 28 | add(x, y) = x + y 29 | add(1, 2) 30 | ```` 31 | 32 | ```` 33 | 3 34 | ```` 35 | 36 | Notice that the longer form doesn't use curly brackets. This is also true more 37 | generally -- blocks of code are encapsulated in `begin` and `end` instead of 38 | brackets. For things like functions, loops, and conditionals, the keyword does 39 | the same job as `begin`. 
Also you don't have to use a lot of parentheses that 40 | you'd find in other languages: 41 | 42 | ````julia 43 | myfunc1 = function(x) 44 | if x < 5 45 | println("size: small") 46 | elseif x < 10 47 | println("size: medium") 48 | else 49 | println("size: big") 50 | end 51 | println("counting down...") 52 | while x > 0 53 | println(x) 54 | x -= 1 55 | end 56 | end 57 | myfunc1(7) 58 | ```` 59 | 60 | ```` 61 | size: medium 62 | counting down... 63 | 7 64 | 6 65 | 5 66 | 4 67 | 3 68 | 2 69 | 1 70 | 71 | ```` 72 | 73 | Functions can be composed (notice evaluation is from right to left, as in 74 | mathematics): 75 | 76 | ````julia 77 | f(x) = 2x 78 | g(x) = x + 7 79 | h = f ∘ g 80 | h(10) 81 | ```` 82 | 83 | ```` 84 | 34 85 | ```` 86 | 87 | You can make anonymous functions. For example this: 88 | 89 | ````julia 90 | x -> 2x + 7 91 | ```` 92 | 93 | ```` 94 | #3 (generic function with 1 method) 95 | ```` 96 | 97 | is the same as 98 | 99 | ````julia 100 | function(x) 2x + 7 end 101 | ```` 102 | 103 | ```` 104 | #5 (generic function with 1 method) 105 | ```` 106 | 107 | but can more concisely be passed as an argument to a higher order function: 108 | 109 | ````julia 110 | map(x -> 2x + 7, [1, 2, 3]) 111 | ```` 112 | 113 | ```` 114 | 3-element Vector{Int64}: 115 | 9 116 | 11 117 | 13 118 | ```` 119 | 120 | You can pipe functions together using the `|>` operator: 121 | 122 | ````julia 123 | rand(10) |> sum |> round 124 | ```` 125 | 126 | ```` 127 | 6.0 128 | ```` 129 | 130 | ## Some useful built-in data structures 131 | 132 | - Dicts: `Dict("key1" => 123, "key2" => 473, ...)` 133 | - Arrays: `[1, 2, 3]` 134 | - Tuples: `(1, 2, 3)` 135 | - Named Tuples: `(a = 1, b = 2, c = 3)` 136 | - Sets: `Set([1, 2, 3])` 137 | 138 | ## Python-ish array generators/comprehensions: 139 | 140 | The comprehension syntax is similar to what you would use in Python. This 141 | generates a vector (a 1-dimensional array): 142 | 143 | ````julia 144 | [2x for x in 1:5] 145 | ```` 146 | 147 | ```` 148 | 5-element Vector{Int64}: 149 | 2 150 | 4 151 | 6 152 | 8 153 | 10 154 | ```` 155 | 156 | the same syntax can be used to create other datastructures as well: 157 | 158 | ````julia 159 | mydict = Dict((x => 2x) for x in 1:5) 160 | mydict[5] 161 | ```` 162 | 163 | ```` 164 | 10 165 | ```` 166 | 167 | --- 168 | 169 | *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* 170 | 171 | -------------------------------------------------------------------------------- /languages/julia/output/dfs.md: -------------------------------------------------------------------------------- 1 | # Using data frames in Julia 2 | 3 | ## [Dataframes.jl](https://dataframes.juliadata.org/stable/) 4 | 5 | [DataFrames.jl](https://dataframes.juliadata.org/stable/) implemenets the data 6 | structure most similar to dataframes in R (similar to base R's `data.frame` or 7 | `data.table`) or python/pandas. 
8 | 9 | ````julia 10 | using DataFrames 11 | using CSV 12 | 13 | mtcars = CSV.read("../input/mtcars.csv", DataFrame, delim="|") 14 | describe(mtcars) 15 | ```` 16 | 17 | ```` 18 | 12×7 DataFrame 19 | Row │ variable mean min median max nmissing eltype 20 | │ Symbol Union… Any Union… Any Int64 DataType 21 | ─────┼──────────────────────────────────────────────────────────────────────── 22 | 1 │ car AMC Javelin Volvo 142E 0 String31 23 | 2 │ mpg 20.0906 10.4 19.2 33.9 0 Float64 24 | 3 │ cyl 6.1875 4 6.0 8 0 Int64 25 | 4 │ disp 230.722 71.1 196.3 472.0 0 Float64 26 | 5 │ hp 146.688 52 123.0 335 0 Int64 27 | 6 │ drat 3.59656 2.76 3.695 4.93 0 Float64 28 | 7 │ wt 3.21725 1.513 3.325 5.424 0 Float64 29 | 8 │ qsec 17.8488 14.5 17.71 22.9 0 Float64 30 | 9 │ vs 0.4375 0 0.0 1 0 Int64 31 | 10 │ am 0.40625 0 0.0 1 0 Int64 32 | 11 │ gear 3.6875 3 4.0 5 0 Int64 33 | 12 │ carb 2.8125 1 2.0 8 0 Int64 34 | ```` 35 | 36 | ### Indexing 37 | 38 | Indexing in DataFrames is pretty natural if you've used data frames in R: 39 | 40 | ````julia 41 | mtcars[1:3, [:mpg, :wt, :drat]] 42 | ```` 43 | 44 | ```` 45 | 3×3 DataFrame 46 | Row │ mpg wt drat 47 | │ Float64 Float64 Float64 48 | ─────┼─────────────────────────── 49 | 1 │ 21.0 2.62 3.9 50 | 2 │ 21.0 2.875 3.9 51 | 3 │ 22.8 2.32 3.85 52 | ```` 53 | 54 | or via column indices, not recommended: 55 | 56 | ````julia 57 | mtcars[1:3, [1,6,5]] 58 | ```` 59 | 60 | ```` 61 | 3×3 DataFrame 62 | Row │ car drat hp 63 | │ String31 Float64 Int64 64 | ─────┼─────────────────────────────── 65 | 1 │ Mazda RX4 3.9 110 66 | 2 │ Mazda RX4 Wag 3.9 110 67 | 3 │ Datsun 710 3.85 93 68 | ```` 69 | 70 | If you want to get a dataframe with a single column, make sure you still use 71 | the array brackets: 72 | 73 | ````julia 74 | mtcars[:, [:mpg]] 75 | ```` 76 | 77 | ```` 78 | 32×1 DataFrame 79 | Row │ mpg 80 | │ Float64 81 | ─────┼───────── 82 | 1 │ 21.0 83 | 2 │ 21.0 84 | 3 │ 22.8 85 | 4 │ 21.4 86 | 5 │ 18.7 87 | 6 │ 18.1 88 | 7 │ 14.3 89 | 8 │ 24.4 90 | 9 │ 22.8 91 | 10 │ 19.2 92 | 11 │ 17.8 93 | 12 │ 16.4 94 | 13 │ 17.3 95 | 14 │ 15.2 96 | 15 │ 10.4 97 | 16 │ 10.4 98 | 17 │ 14.7 99 | 18 │ 32.4 100 | 19 │ 30.4 101 | 20 │ 33.9 102 | 21 │ 21.5 103 | 22 │ 15.5 104 | 23 │ 15.2 105 | 24 │ 13.3 106 | 25 │ 19.2 107 | 26 │ 27.3 108 | 27 │ 26.0 109 | 28 │ 30.4 110 | 29 │ 15.8 111 | 30 │ 19.7 112 | 31 │ 15.0 113 | 32 │ 21.4 114 | ```` 115 | 116 | compare to: 117 | 118 | ````julia 119 | mtcars[:, :mpg] 120 | ```` 121 | 122 | ```` 123 | 32-element Vector{Float64}: 124 | 21.0 125 | 21.0 126 | 22.8 127 | 21.4 128 | 18.7 129 | 18.1 130 | 14.3 131 | 24.4 132 | 22.8 133 | 19.2 134 | 17.8 135 | 16.4 136 | 17.3 137 | 15.2 138 | 10.4 139 | 10.4 140 | 14.7 141 | 32.4 142 | 30.4 143 | 33.9 144 | 21.5 145 | 15.5 146 | 15.2 147 | 13.3 148 | 19.2 149 | 27.3 150 | 26.0 151 | 30.4 152 | 15.8 153 | 19.7 154 | 15.0 155 | 21.4 156 | ```` 157 | 158 | you can also return a vector this way: 159 | 160 | ````julia 161 | mtcars.mpg 162 | ```` 163 | 164 | ```` 165 | 32-element Vector{Float64}: 166 | 21.0 167 | 21.0 168 | 22.8 169 | 21.4 170 | 18.7 171 | 18.1 172 | 14.3 173 | 24.4 174 | 22.8 175 | 19.2 176 | 17.8 177 | 16.4 178 | 17.3 179 | 15.2 180 | 10.4 181 | 10.4 182 | 14.7 183 | 32.4 184 | 30.4 185 | 33.9 186 | 21.5 187 | 15.5 188 | 15.2 189 | 13.3 190 | 19.2 191 | 27.3 192 | 26.0 193 | 30.4 194 | 15.8 195 | 19.7 196 | 15.0 197 | 21.4 198 | ```` 199 | 200 | ### Joins 201 | 202 | `innerjoin`, `leftjoin`, and `antijoin` work the way you would expect: 203 | 204 | ````julia 205 | main_data = mtcars[:, [:car, :mpg, :wt]] 206 
| additional_data = DataFrame(car = ["Volvo 142E", "Datsun 710", "Ferrari Dino"], 207 | variable = [13, 42, 17]) 208 | 209 | innerjoin(main_data, additional_data, on = :car) 210 | leftjoin(main_data, additional_data, on = :car) 211 | antijoin(main_data, additional_data, on = :car) 212 | ```` 213 | 214 | ```` 215 | 29×3 DataFrame 216 | Row │ car mpg wt 217 | │ String31 Float64 Float64 218 | ─────┼─────────────────────────────────────── 219 | 1 │ Mazda RX4 21.0 2.62 220 | 2 │ Mazda RX4 Wag 21.0 2.875 221 | 3 │ Hornet 4 Drive 21.4 3.215 222 | 4 │ Hornet Sportabout 18.7 3.44 223 | 5 │ Valiant 18.1 3.46 224 | 6 │ Duster 360 14.3 3.57 225 | 7 │ Merc 240D 24.4 3.19 226 | 8 │ Merc 230 22.8 3.15 227 | 9 │ Merc 280 19.2 3.44 228 | 10 │ Merc 280C 17.8 3.44 229 | 11 │ Merc 450SE 16.4 4.07 230 | 12 │ Merc 450SL 17.3 3.73 231 | 13 │ Merc 450SLC 15.2 3.78 232 | 14 │ Cadillac Fleetwood 10.4 5.25 233 | 15 │ Lincoln Continental 10.4 5.424 234 | 16 │ Chrysler Imperial 14.7 5.345 235 | 17 │ Fiat 128 32.4 2.2 236 | 18 │ Honda Civic 30.4 1.615 237 | 19 │ Toyota Corolla 33.9 1.835 238 | 20 │ Toyota Corona 21.5 2.465 239 | 21 │ Dodge Challenger 15.5 3.52 240 | 22 │ AMC Javelin 15.2 3.435 241 | 23 │ Camaro Z28 13.3 3.84 242 | 24 │ Pontiac Firebird 19.2 3.845 243 | 25 │ Fiat X1-9 27.3 1.935 244 | 26 │ Porsche 914-2 26.0 2.14 245 | 27 │ Lotus Europa 30.4 1.513 246 | 28 │ Ford Pantera L 15.8 3.17 247 | 29 │ Maserati Bora 15.0 3.57 248 | ```` 249 | 250 | ## [Query.jl](http://www.queryverse.org/Query.jl/stable/) 251 | 252 | [Query.jl](http://www.queryverse.org/Query.jl/stable/) allows you to build 253 | dplyr-like query pipelines for dataframe manipulation: 254 | 255 | ````julia 256 | using Query 257 | using Statistics 258 | 259 | mtcars |> 260 | @select(:car, :cyl, :mpg, :wt) |> 261 | @mutate(car = uppercase(_.car), 262 | mpg_wt = _.mpg/_.wt) |> 263 | @groupby(_.cyl) |> 264 | @map({cyl = key(_), 265 | mean_mpg = mean(_.mpg), 266 | max_wt = maximum(_.wt), 267 | median_mpg_wt = median(_.mpg_wt)}) 268 | ```` 269 | 270 | ```` 271 | 3x4 query result 272 | cyl │ mean_mpg │ max_wt │ median_mpg_wt 273 | ────┼──────────┼────────┼────────────── 274 | 6 │ 19.7429 │ 3.46 │ 6.6563 275 | 4 │ 26.6636 │ 3.19 │ 12.1495 276 | 8 │ 15.1 │ 5.424 │ 4.11558 277 | ```` 278 | 279 | ## Reading and writing 280 | 281 | The [`CSV`](https://csv.juliadata.org/stable/) package is for reading and 282 | writing any delimited files, the delimiter does not have to be a comma. The 283 | syntax above already shows how to read in a CSV file as a dataframe. 
284 | Alternately, if you need to process the file in a streaming manner, `CSV.File` 285 | returns an iterator over rows: 286 | 287 | ````julia 288 | mtcars_streaming = CSV.File("../input/mtcars.csv", delim ="|"); 289 | 290 | for row in mtcars_streaming 291 | println(row.car, ": ", row.mpg) 292 | end 293 | ```` 294 | 295 | ```` 296 | Mazda RX4: 21.0 297 | Mazda RX4 Wag: 21.0 298 | Datsun 710: 22.8 299 | Hornet 4 Drive: 21.4 300 | Hornet Sportabout: 18.7 301 | Valiant: 18.1 302 | Duster 360: 14.3 303 | Merc 240D: 24.4 304 | Merc 230: 22.8 305 | Merc 280: 19.2 306 | Merc 280C: 17.8 307 | Merc 450SE: 16.4 308 | Merc 450SL: 17.3 309 | Merc 450SLC: 15.2 310 | Cadillac Fleetwood: 10.4 311 | Lincoln Continental: 10.4 312 | Chrysler Imperial: 14.7 313 | Fiat 128: 32.4 314 | Honda Civic: 30.4 315 | Toyota Corolla: 33.9 316 | Toyota Corona: 21.5 317 | Dodge Challenger: 15.5 318 | AMC Javelin: 15.2 319 | Camaro Z28: 13.3 320 | Pontiac Firebird: 19.2 321 | Fiat X1-9: 27.3 322 | Porsche 914-2: 26.0 323 | Lotus Europa: 30.4 324 | Ford Pantera L: 15.8 325 | Ferrari Dino: 19.7 326 | Maserati Bora: 15.0 327 | Volvo 142E: 21.4 328 | 329 | ```` 330 | 331 | ## Parquet files 332 | 333 | Use [`Parquet`]() 334 | 335 | ## Abstract interface: Tables.jl 336 | 337 | --- 338 | 339 | *This page was generated using [Literate.jl](https://github.com/fredrikekre/Literate.jl).* 340 | 341 | -------------------------------------------------------------------------------- /languages/julia/src/basics.jl: -------------------------------------------------------------------------------- 1 | #= 2 | ## Functions 3 | 4 | Functions work basically the way you think they would: 5 | =# 6 | 7 | function add(x, y) 8 | return x + y 9 | end 10 | add(1, 2) 11 | 12 | #= 13 | 14 | The return value of a function is the value of the last evaluated expression in 15 | the function, so we could also have written: 16 | 17 | ```julia 18 | function add(x, y) 19 | x + y 20 | end 21 | ``` 22 | 23 | Furthermore, there's a compact form for short function definitions: 24 | 25 | =# 26 | 27 | 28 | add(x, y) = x + y 29 | add(1, 2) 30 | 31 | #= 32 | 33 | Notice that the longer form doesn't use curly brackets. This is also true more 34 | generally -- blocks of code are encapsulated in `begin` and `end` instead of 35 | brackets. For things like functions, loops, and conditionals, the keyword does 36 | the same job as `begin`. Also you don't have to use a lot of parentheses that 37 | you'd find in other languages: 38 | 39 | =# 40 | 41 | myfunc1 = function(x) 42 | if x < 5 43 | println("size: small") 44 | elseif x < 10 45 | println("size: medium") 46 | else 47 | println("size: big") 48 | end 49 | println("counting down...") 50 | while x > 0 51 | println(x) 52 | x -= 1 53 | end 54 | end 55 | myfunc1(7) 56 | 57 | # Functions can be composed (notice evaluation is from right to left, as in 58 | # mathematics): 59 | 60 | f(x) = 2x 61 | g(x) = x + 7 62 | h = f ∘ g 63 | h(10) 64 | 65 | # You can make anonymous functions. 
For example this: 66 | 67 | x -> 2x + 7 68 | 69 | # is the same as 70 | function(x) 2x + 7 end 71 | 72 | # but can more concisely be passed as an argument to a higher order function: 73 | map(x -> 2x + 7, [1, 2, 3]) 74 | 75 | # You can pipe functions together using the `|>` operator: 76 | rand(10) |> sum |> round 77 | 78 | #= 79 | ## Some useful built-in data structures 80 | 81 | - Dicts: `Dict("key1" => 123, "key2" => 473, ...)` 82 | - Arrays: `[1, 2, 3]` 83 | - Tuples: `(1, 2, 3)` 84 | - Named Tuples: `(a = 1, b = 2, c = 3)` 85 | - Sets: `Set([1, 2, 3])` 86 | 87 | ## Python-ish array generators/comprehensions: 88 | 89 | The comprehension syntax is similar to what you would use in Python. This 90 | generates a vector (a 1-dimensional array): 91 | 92 | =# 93 | [2x for x in 1:5] 94 | 95 | # the same syntax can be used to create other datastructures as well: 96 | mydict = Dict((x => 2x) for x in 1:5) 97 | mydict[5] 98 | -------------------------------------------------------------------------------- /languages/julia/src/compile-md.jl: -------------------------------------------------------------------------------- 1 | using Literate 2 | source, outdir = ARGS[1], ARGS[2] 3 | Literate.markdown(source, outdir, 4 | flavor=Literate.CommonMarkFlavor(); 5 | config=Dict("execute" => true)) 6 | 7 | -------------------------------------------------------------------------------- /languages/julia/src/dfs.jl: -------------------------------------------------------------------------------- 1 | #= 2 | 3 | # Using data frames in Julia 4 | 5 | ## [Dataframes.jl](https://dataframes.juliadata.org/stable/) 6 | 7 | [DataFrames.jl](https://dataframes.juliadata.org/stable/) implemenets the data 8 | structure most similar to dataframes in R (similar to base R's `data.frame` or 9 | `data.table`) or python/pandas. 
10 | 11 | =# 12 | 13 | using DataFrames 14 | using CSV 15 | 16 | mtcars = CSV.read("../input/mtcars.csv", DataFrame, delim="|") 17 | describe(mtcars) 18 | 19 | #= 20 | 21 | ### Indexing 22 | 23 | Indexing in DataFrames is pretty natural if you've used data frames in R: 24 | 25 | =# 26 | 27 | 28 | mtcars[1:3, [:mpg, :wt, :drat]] 29 | 30 | # or via column indices, not recommended: 31 | 32 | mtcars[1:3, [1,6,5]] 33 | 34 | # If you want to get a dataframe with a single column, make sure you still use 35 | # the array brackets: 36 | 37 | mtcars[:, [:mpg]] 38 | 39 | # compare to: 40 | 41 | mtcars[:, :mpg] 42 | 43 | # you can also return a vector this way: 44 | 45 | mtcars.mpg 46 | 47 | #= 48 | 49 | ### Joins 50 | 51 | `innerjoin`, `leftjoin`, and `antijoin` work the way you would expect: 52 | 53 | =# 54 | 55 | main_data = mtcars[:, [:car, :mpg, :wt]] 56 | additional_data = DataFrame(car = ["Volvo 142E", "Datsun 710", "Ferrari Dino"], 57 | variable = [13, 42, 17]) 58 | 59 | innerjoin(main_data, additional_data, on = :car) 60 | leftjoin(main_data, additional_data, on = :car) 61 | antijoin(main_data, additional_data, on = :car) 62 | #= 63 | 64 | ## [Query.jl](http://www.queryverse.org/Query.jl/stable/) 65 | 66 | [Query.jl](http://www.queryverse.org/Query.jl/stable/) allows you to build 67 | dplyr-like query pipelines for dataframe manipulation: 68 | 69 | =# 70 | 71 | using Query 72 | using Statistics 73 | 74 | mtcars |> 75 | @select(:car, :cyl, :mpg, :wt) |> 76 | @mutate(car = uppercase(_.car), 77 | mpg_wt = _.mpg/_.wt) |> 78 | @groupby(_.cyl) |> 79 | @map({cyl = key(_), 80 | mean_mpg = mean(_.mpg), 81 | max_wt = maximum(_.wt), 82 | median_mpg_wt = median(_.mpg_wt)}) 83 | 84 | 85 | #= 86 | 87 | ## Reading and writing 88 | 89 | The [`CSV`](https://csv.juliadata.org/stable/) package is for reading and 90 | writing any delimited files, the delimiter does not have to be a comma. The 91 | syntax above already shows how to read in a CSV file as a dataframe. 92 | Alternately, if you need to process the file in a streaming manner, `CSV.File` 93 | returns an iterator over rows: 94 | 95 | =# 96 | 97 | mtcars_streaming = CSV.File("../input/mtcars.csv", delim ="|"); 98 | 99 | for row in mtcars_streaming 100 | println(row.car, ": ", row.mpg) 101 | end 102 | 103 | #= 104 | 105 | ## Parquet files 106 | 107 | Use [`Parquet`]() 108 | 109 | =# 110 | 111 | #= 112 | 113 | ## Abstract interface: Tables.jl 114 | 115 | =# 116 | 117 | -------------------------------------------------------------------------------- /languages/python/missingness.md: -------------------------------------------------------------------------------- 1 | ### Missing Values in Python 2 | Datasets often have missing values, and different languages handle missingness 3 | in data differently. This doc is intended to be an introduction to how python, 4 | numpy, and pandas handle missing data. A more in depth guide can be found [here](https://jakevdp.github.io/PythonDataScienceHandbook/03.04-missing-values.html). 5 | 6 | #### Pandas 7 | Technically, pandas doesn't have its own implementation of a missing value, 8 | it chooses to use two existing null values as sentinels for missingness. 9 | 10 | 1. numpy's `np.nan` float 11 | 2. Python's `None` object 12 | 13 | By using two different types of sentinel values, a float and an object, pandas 14 | covers missingness of most data types we might encounter in a dataset with little 15 | additional overhead. 
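As a quick illustration of the two sentinels (a minimal sketch; the values here are made up), pandas accepts either one and reports both as missing:

```
import numpy as np
import pandas as pd

s = pd.Series([1.5, np.nan, None])  # the None is coerced to NaN in this numeric column
print(s.isnull())                   # False, True, True: both sentinels count as missing
```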
16 | 17 | #### [`np.nan`](https://numpy.org/doc/stable/user/misc.html) 18 | "Nan" means "not a number", and numpy uses it to label a missing numeric datapoint. 19 | In fact, numpy stores `np.nan` as a floating-point value even when the surrounding values 20 | are integer type. 21 | 22 | ##### Why is that helpful? 23 | Well, we're not always going to store integer type data, and when we do store numeric 24 | data of non-integer type, we can be certain our missing token has been allocated 25 | enough room in our data structure. You probably wouldn't consciously notice this at 26 | runtime, but converting from a smaller dtype that uses little memory to one that needs 27 | more room for each value can create a delay while our structure makes space. On 28 | the other hand, if our placeholder reserves the slightly larger room up front, the operational 29 | cost should be minimal. 30 | 31 | In case that is still too technical, think about it like reserving a meeting room. 32 | Let's say we have a team of 10 people working on a project, and we decide to meet 33 | in-person to discuss it. Initially, only 4 people say they can make it, so the 34 | smaller meeting room is chosen. But when meeting day arrives, 2 more people say they can 35 | make it, and now you need room for 6. Depending on how many meeting rooms are 36 | available, this could be a big problem, but most likely it just takes some shuffling 37 | around and a slight delay to the meeting. _Floating point numbers leave more room for 38 | numeric info than integers do, so the `np.nan` placeholder is read as a float._ 39 | 40 | 41 | #### [`None`](https://docs.python.org/3/c-api/none.html) 42 | `None` is Python's built-in null object; because it is a Python object rather than a float, pandas can only use it as a missing-value sentinel in arrays of dtype `object`. 43 | #### upcasting 44 | In some cases, pandas will switch between the two chosen sentinel values when an 45 | alternate might be more efficient, and this can be helpful to know when we're making 46 | manipulations to our data. 47 | 48 | Let's think about an example of when this might happen. Say we have an array full of 49 | integers, and we want to insert a placeholder for a value we don't have yet. 50 | 51 | At first, we have an array with dtype `integer`. 52 | 53 | But after we insert the placeholder, `np.nan`, and re-evaluate the dtype, we realize it's 54 | been changed to `float`. Remember how np.nan is stored as a float? The type of the data 55 | gets upcast when we do this operation to accommodate the float sentinel. 56 | 57 | Here is a table to summarize some of the upcasting scenarios. 58 | 59 | | arr.dtype before | arr.dtype after | sentinel output | 60 | | --- | --- | --- | 61 | | `float` | no change | `np.nan` | 62 | | `object` | no change | `np.nan`, `None` | 63 | | `integer` | `float` | `np.nan` | 64 | | `boolean` | `object` | `np.nan`, `None` | 65 | 66 | #### operations 67 | (I think these are all referring to `data` as a `pd.DataFrame` object, but I need to double check) 68 | 69 | I. Arithmetic 70 | - arithmetic with a missing value propagates it, e.g. adding `np.nan` to anything yields `np.nan` 71 | II. Boolean detection 72 | - `data.isnull()` 73 | - returns a series of Booleans for all datapoints referring to whether or not they are missing 74 | - `data.notnull()` 75 | - returns a series of Booleans for non-null datapoints 76 | - `data[data.notnull()]` 77 | - uses the `notnull()` series as an index and returns the corresponding non-null datapoints 78 | III. 
Convenience 79 | - `data.dropna()` 80 | - removes rows with null values in any column from structure 81 | - **Note:** there are some neat optional parameters to feed to `dropna()` if the default approach 82 | is not ideal, namely `how` and `thresh` 83 | - `data.fillna()` 84 | - fills missing values with the value passed as an argument 85 | 86 | ##### done. 87 | -------------------------------------------------------------------------------- /languages/python/scalability.md: -------------------------------------------------------------------------------- 1 | ### Python 2 | Because the Python language is dynamically typed, the interpreter has to make inferences about types at runtime. This can result in significant bottlenecks in performance if we aren't careful about exactly how our code should be implemented. 3 | 4 | This document shows some samples from: 5 | - [Python Performance Tuning: 20 Simple Tips](https://stackify.com/20-simple-python-performance-tuning-tips/) 6 | - [High Performance Pandas: eval() and query()](https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html) 7 | 8 | ## *Tips* 9 | - Purge unused dependencies 10 | - Use as few global variables as possible 11 | - [Built-ins](https://docs.python.org/3/library/functions.html) 12 | - Don't write your own version of a built-in method that does exactly the same thing! The built-in version will be faster and include better error handling than a custom implementation. 13 | - Utilize memory profilers to identify bottlenecks in your code 14 | 15 | &nbsp; 16 | 17 | ## *Tricks* 18 | 19 | > Goal: Make a list of integers in a given range 20 | 21 | Possible solution: 22 | ``` 23 | indices = [] 24 | for i in range(len(some_list)): 25 |     indices.append(i) 26 | ``` 27 | Better solution: 28 | ``` 29 | indices = [ i for i in range(len(some_list)) ] 30 | ``` 31 | &nbsp; 32 | 33 | > Goal: Check if an exact match for a value is in a list 34 | 35 | Possible solution: 36 | ``` 37 | target = 5 38 | for val in some_list: 39 |     if val == target: 40 |         (do work) 41 | ``` 42 | OR 43 | ``` 44 | target = 5 45 | if target in set(some_list): 46 |     (do work) 47 | ``` 48 | Better solution: 49 | ``` 50 | target = 5 51 | if target in some_list: 52 |     (do work) 53 | ``` 54 | &nbsp; 55 | 56 | > Goal: Find values in one list that are also present in another 57 | 58 | Possible solution: 59 | ``` 60 | dupes = [] 61 | for x in left_list: 62 |     for y in right_list: 63 |         if x==y: 64 |             dupes.append(x) 65 | ``` 66 | Better solution: 67 | ``` 68 | dupes = set(left_list) & set(right_list) 69 | ``` 70 | &nbsp; 71 | 72 | > Goal: Assign multiple values in one call 73 | 74 | Possible solution: 75 | ``` 76 | def format_full_name( some_name ): 77 |     lower_name = some_name.lower() 78 |     return lower_name.split(" ") 79 | 80 | name_list = format_full_name("Some Guys Name") 81 | first = name_list[0] 82 | middle = name_list[1] 83 | last = name_list[2] 84 | ``` 85 | Better solution: 86 | ``` 87 | def format_full_name( some_name ): 88 |     lower_name = some_name.lower() 89 |     return lower_name.split(" ") 90 | 91 | first, middle, last = format_full_name("Some Guys Name") 92 | ``` 93 | &nbsp; 94 | 95 | > Goal: Swap the contents of two variables 96 | 97 | Possible solution: 98 | ``` 99 | temp = x 100 | x = y 101 | y = temp 102 | ``` 103 | Better solution: 104 | ``` 105 | x, y = y, x 106 | ``` 107 | &nbsp; 108 | 109 | > Goal: Combine multiple string values 110 | 111 | Possible solution: 112 | ``` 113 | full_name = first + " " + middle + " " + last 114 | ``` 115 | OR 116 | ``` 117 | def rebuild_full_name( a_first, a_middle, a_last ): 118 |     return a_first + " " + a_middle + " " + a_last 119 | 120 | full_name = rebuild_full_name(first, middle, last) 121 | ``` 122 | Better solution: 123 | ``` 124 | def rebuild_full_name( a_first, a_middle, a_last ): 125 |     return " ".join([a_first, a_middle, a_last]) 126 | 127 | full_name = rebuild_full_name(first, middle, last) 128 | ``` 129 | 130 | # done. 131 | 
-------------------------------------------------------------------------------- /languages/python/set-operations.md: -------------------------------------------------------------------------------- 1 | ### Set operations in python 2 | Set objects are useful for a number of reasons, and they come with some useful operations that can be handy in exploring relational data. 3 | 4 | #### Features 5 | In the simplest case, let's say we are iterating through some data and we want to capture all unique values we find that meet a particular condition. 6 | 7 | If we use a `list`, we will see the lowest overhead to append a new item, [O(1)](https://wiki.python.org/moin/TimeComplexity), but we will have to deduplicate the list contents after building the collection. 8 | 9 | If we use a `dict`, we will see decent performance appending new items, O(1) on average, but to guard against duplicate keys the worst case for an insertion is O(n). 10 | 11 | **Note**: Because you don't have to use the name of an object to declare it in python, and sets and dicts both use {} to denote themselves, python makes an inference at runtime as to which type is intended, based on the structure of the first insertion. A literal that contains only bare values and no `key: value` pairs is read as the lesser-overhead `set` rather than a `dict` unless it gets specific instructions otherwise. 12 | As an example of this issue, let's say we want to use a dictionary as a template for the info we want to capture. 13 | `info = {'name', 'address', 'phone'}` 14 | What's wrong with this implementation? What is going to happen when we run `info['name'] = 'Kenny'`? 15 | `TypeError: 'set' object does not support item assignment` 16 | Instead, when we want to outline the keys a dictionary should have before we have corresponding values, we need to insert a placeholder value for each key so that the interpreter leaves room for incoming values. 17 | 18 | #### Functions 19 | Several operations are available to use that come with statistical context and make it easy to examine how two collections relate, such as `union`, `intersection`, `difference`, and `symmetric_difference`. 20 | -------------------------------------------------------------------------------- /languages/share/input/string_tests.yaml: -------------------------------------------------------------------------------- 1 | [ 2 | "alpha", 3 | "alba", 4 | "alvarez", 5 | "bravo", 6 | "jose", 7 | "jóse", 8 | "j0se", 9 | "iose" 10 | ] 11 | -------------------------------------------------------------------------------- /languages/share/output/make_test_pairs.log: -------------------------------------------------------------------------------- 1 | 2022-09-16 15:11:39 - INFO - Loading data. 2 | 2022-09-16 15:11:50 - INFO - Loading data. 3 | 2022-09-16 15:11:51 - INFO - done. 
4 | -------------------------------------------------------------------------------- /languages/share/output/test_pairs.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HRDAG/training-docs/b5a9b827a010c21f5e647d761b898c321152fe88/languages/share/output/test_pairs.parquet -------------------------------------------------------------------------------- /languages/share/src/make_test_pairs.py: -------------------------------------------------------------------------------- 1 | # vim: set ts=4 sts=0 sw=4 si fenc=utf-8 et: 2 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 3 | # Authors: BP 4 | # Maintainers: BP 5 | # Copyright: 2022, HRDAG, GPL v2 or later 6 | # ========================================= 7 | # 8 | 9 | # ---- dependencies {{{ 10 | from pathlib import Path 11 | from sys import stdout 12 | import argparse 13 | import logging 14 | import yaml 15 | import pandas as pd 16 | #}}} 17 | 18 | # ---- support methods {{{ 19 | def get_args(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--input", default="input/string_tests.yaml") 22 | parser.add_argument("--output", default="output/test_pairs.parquet") 23 | args = parser.parse_args() 24 | assert Path(args.input).exists() 25 | return args 26 | 27 | 28 | def get_logger(sname, file_name=None): 29 | logger = logging.getLogger(sname) 30 | logger.setLevel(logging.DEBUG) 31 | formatter = logging.Formatter("%(asctime)s - %(levelname)s " + 32 | "- %(message)s", datefmt='%Y-%m-%d %H:%M:%S') 33 | stream_handler = logging.StreamHandler(stdout) 34 | stream_handler.setFormatter(formatter) 35 | logger.addHandler(stream_handler) 36 | if file_name: 37 | file_handler = logging.FileHandler(file_name) 38 | file_handler.setFormatter(formatter) 39 | logger.addHandler(file_handler) 40 | return logger 41 | 42 | 43 | def read_yaml(fname): 44 | with open(fname, 'r') as f_handle: 45 | out = yaml.safe_load(f_handle) 46 | return out 47 | 48 | 49 | def make_pairs(tests): 50 | pairs = [(a, b) for a in tests for b in tests] 51 | assert len(pairs) == (len(tests)**2) 52 | return pairs 53 | #}}} 54 | 55 | # ---- main {{{ 56 | if __name__ == '__main__': 57 | # setup logging 58 | logger = get_logger(__name__, "output/make_test_pairs.log") 59 | 60 | # arg handling 61 | args = get_args() 62 | 63 | # read data, initial verification 64 | logger.info("Loading data.") 65 | tests = read_yaml(args.input) 66 | 67 | # do stuff, more verification 68 | test_pairs = make_pairs(tests) 69 | out = pd.DataFrame(test_pairs, columns=["string_1", "string_2"]) 70 | 71 | # save data, final verification 72 | out.to_parquet(args.output) 73 | 74 | logger.info("done.") 75 | 76 | #}}} 77 | # done. 78 | -------------------------------------------------------------------------------- /notebooks/python/.ipynb_checkpoints/basic-stats-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# dependencies\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import statistics\n", 13 | "from scipy.stats import trimboth, trim_mean" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Techniques\n", 21 | "There are several techniques you've probably heard in a class or an article that represent some statistical method applied to some data. 
The most common of these are:\n", 22 | "\n", 23 | "- count\n", 24 | "- mean \n", 25 | "- standard deviation\n", 26 | "- median\n", 27 | "- mode\n", 28 | "- range\n", 29 | "- percentiles" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 30, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | "
price
house_a198300.0
house_b2385000.0
house_c658200.0
house_dNaN
house_e658200.0
\n", 84 | "
" 85 | ], 86 | "text/plain": [ 87 | " price\n", 88 | "house_a 198300.0\n", 89 | "house_b 2385000.0\n", 90 | "house_c 658200.0\n", 91 | "house_d NaN\n", 92 | "house_e 658200.0" 93 | ] 94 | }, 95 | "execution_count": 30, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "prices = [198300, 2385000, 658200, np.nan, 658200]\n", 102 | "indices = ['house_a', 'house_b', 'house_c', 'house_d', 'house_e']\n", 103 | "\n", 104 | "prices_df = pd.DataFrame(data, index=indices, columns=['price'])\n", 105 | "prices_df" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "### `nan` handling" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "#### different version, different default approach\n", 120 | "Note that different library versions of the same statistical method may handle missing data in different ways. Where one chooses to halt on missing values, others skip over `nan` and continue performing the operation." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 28, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "658200\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "print(mode(prices))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 33, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "nan\n", 150 | "nan\n", 151 | "nan\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "print(np.median(prices))\n", 157 | "print(np.mean(prices))\n", 158 | "print(np.std(prices))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 32, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "658200\n", 171 | "658200\n", 172 | "nan\n", 173 | "nan\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(statistics.mode(prices))\n", 179 | "print(statistics.median(prices))\n", 180 | "print(statistics.mean(prices))\n", 181 | "print(statistics.stdev(prices))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 20, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/html": [ 192 | "
\n", 193 | "\n", 206 | "\n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
price
count3.000
mean1080500.000
std1152895.134
min198300.000
25%428250.000
50%658200.000
75%1521600.000
max2385000.000
\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " price\n", 252 | "count 3.000\n", 253 | "mean 1080500.000\n", 254 | "std 1152895.134\n", 255 | "min 198300.000\n", 256 | "25% 428250.000\n", 257 | "50% 658200.000\n", 258 | "75% 1521600.000\n", 259 | "max 2385000.000" 260 | ] 261 | }, 262 | "execution_count": 20, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "prices_df.describe()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 4, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "\n", 281 | "Index: 3 entries, house_a to house_c\n", 282 | "Data columns (total 1 columns):\n", 283 | " # Column Non-Null Count Dtype\n", 284 | "--- ------ -------------- -----\n", 285 | " 0 price 3 non-null int64\n", 286 | "dtypes: int64(1)\n", 287 | "memory usage: 48.0+ bytes\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "prices_df.info()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### Similar or relevant techniques\n", 300 | "\n", 301 | "In some cases, our data may have a small number of outliers. Depending on how realistic the outliers are in context, it can be useful to perform a modified approach to something like a basic mean when evaluating the descriptive statistics of a dataset.\n", 302 | "\n", 303 | "When our outliers make sense for the data we have, a trimmed mean allows you to cut a specified proportion of the tails of the data off and then evaluate the mean. \n", 304 | "- `trim_mean(arr, 0.01)`: Takes `arr` and sorts the contents, then slices off the left and rightmost tails \n", 305 | "- clip()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 5, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "dataset: [1, 25, 156, 78, 465, 12312, 98, 5651, 75615]\n", 318 | "mean:\t 10489.0\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "data = [1, 25, 156, 78, 465, 12312, 98, 5651, 75615]\n", 324 | "\n", 325 | "print('dataset:', data)\n", 326 | "print('mean:\\t', np.mean(data))" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 6, 332 | "metadata": {}, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "new dataset [ 1 98 25 78 156 465 5651 12312 75615]\n", 339 | "new mean:t 10489.0\n" 340 | ] 341 | } 342 | ], 343 | "source": [ 344 | "trimmed_data = trimboth(data, 0.05)\n", 345 | "\n", 346 | "print('new dataset', trimmed_data)\n", 347 | "print('new mean:t', np.mean(trimmed_data))" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 7, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "2683.5714285714284" 359 | ] 360 | }, 361 | "execution_count": 7, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "trim_mean(data, .2)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [] 376 | } 377 | ], 378 | "metadata": { 379 | "kernelspec": { 380 | "display_name": "Python 3", 381 | "language": "python", 382 | "name": "python3" 383 | }, 384 | "language_info": { 385 | "codemirror_mode": { 386 | "name": "ipython", 387 | "version": 3 388 | }, 389 | "file_extension": ".py", 390 | "mimetype": "text/x-python", 391 | "name": 
"python", 392 | "nbconvert_exporter": "python", 393 | "pygments_lexer": "ipython3", 394 | "version": "3.7.3" 395 | } 396 | }, 397 | "nbformat": 4, 398 | "nbformat_minor": 4 399 | } 400 | -------------------------------------------------------------------------------- /notebooks/python/.ipynb_checkpoints/sets-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# dependencies\n", 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Features\n", 19 | "What are the cases for choosing a `set` over a `list` or a `dict`? What functionality do we get with sets that we don't with others and how is it useful?" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In the simplest case, let's say we are iterating through some data and we want to capture all unique values we find that meet a particular condition.\n", 27 | " \n", 28 | "### [time complexity](https://wiki.python.org/moin/TimeComplexity)\n", 29 | "If we use a `list`, we will see the lowest overhead possible to append a new item, `O(1)` average and worst case, but we will have to deduplicate the list contents after building the collection.\n", 30 | "\n", 31 | "If we use a `dict`, we will make great time with appending new items with an average of `O(1)`, but in order to protect against duplicate keys, we estimate worst case time at `O(n)` to append a new item.\n", 32 | "\n", 33 | "If we use a `set`, we observe the same time complexity for adding a new item to the collection, but we get to save a little bit of memory by only storing keys instead of key-value pairs. In addition, `set` type objects come with statistical operations that can be used to examine relational features of the data.\n", 34 | "- `l.isdisjoint(r)`\n", 35 | "- `l.issubset(r)`\n", 36 | "- `l.issuperset(r)`\n", 37 | "- `l.union(r)`\n", 38 | "- `l.intersection(r)`\n", 39 | "- `l.difference(r)`\n", 40 | "- `l.symmetric_difference(r)`" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 28, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | "
average caseworst case
listO(1)O(1)
dictO(1)O(n)
setO(1)O(n)
\n", 91 | "
" 92 | ], 93 | "text/plain": [ 94 | " average case worst case\n", 95 | "list O(1) O(1)\n", 96 | "dict O(1) O(n)\n", 97 | "set O(1) O(n)" 98 | ] 99 | }, 100 | "execution_count": 28, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "time = {\n", 107 | " 'list': ['O(1)', 'O(1)'],\n", 108 | " 'dict': ['O(1)', 'O(n)'],\n", 109 | " 'set': ['O(1)', 'O(n)']\n", 110 | "}\n", 111 | "time_df = pd.DataFrame.from_dict(time, orient='index', columns=['average case', 'worst case'])\n", 112 | "time_df" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Problem with ambiguous declaration\n", 120 | "- Cause: Partial assignment at initialization\n", 121 | "\n", 122 | "Because you don't have to use the name of an object to declare it in python, and sets and dicts both use curly brackets, `{}`, to denote themselves, python makes an inference at runtime as to which type is intended, based on the initialization.\n", 123 | "\n", 124 | "- Empty initialization: `info = {}`\n", 125 | " - Inferred to be `dict` type to leave the most room upfront\n", 126 | " \n", 127 | "- Partial initialization: `info = {'name', 'address', 'phone'}`\n", 128 | " - Inferred to be `set` type, since it was given keys but no values\n", 129 | " \n", 130 | "- Complete initialization: `info = {'name'=None, 'address'=None, 'phone'=np.nan}`\n", 131 | " - Inferred to be `dict`, since it has both keys and values\n", 132 | "\n", 133 | "Let's say we intend to collect some data that meets some criteria, and we have an idea of the keys we want to save. So, we fill in the keys in our dictionary notation and then proceed to filling values." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "ename": "TypeError", 143 | "evalue": "'set' object does not support item assignment", 144 | "output_type": "error", 145 | "traceback": [ 146 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 147 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 148 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0minfo\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Kenny'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 149 | "\u001b[0;31mTypeError\u001b[0m: 'set' object does not support item assignment" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "info = {'name', 'address', 'phone'}\n", 155 | "info['name'] = 'Kenny'" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### Solution\n", 163 | "Okay, so we can't just throw keys into curly brackets and expect python to know we want a dictionary. But we still don't have the values at that stage, so we find the appropriate placeholder to denote a value that's missing _for now_." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 12, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "{'name': 'Kenny', 'address': None, 'phone': nan}" 175 | ] 176 | }, 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "info = {'name':None, 'address':None, 'phone':np.nan}\n", 184 | "info['name'] = 'Kenny'\n", 185 | "info" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "**Comments about solution**\n", 193 | "You might wonder if a single placeholder is enough to signal to the compiler that you want a dictionary, and the answer is yes and no.\n", 194 | "- Yes: If you only insert one key, then you only need one placeholder.\n", 195 | "- No: For every key you add, you need a placeholder value, or else you get a `SyntaxError`." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 8, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "ename": "SyntaxError", 205 | "evalue": "invalid syntax (, line 1)", 206 | "output_type": "error", 207 | "traceback": [ 208 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m info = {'name':None, 'address', 'phone'}\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "info = {'name':None, 'address', 'phone'}" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "**More comments**\n", 221 | "If you're wondering why we used two different placeholder values, check out `language-tips/python/missingness.md` in the training-docs repo, or read the \"Handling Missing Data\" topic in _Python for Data Science Handbook_." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Set operations\n", 229 | "\n", 230 | "| method | alternative | meaning |\n", 231 | "| :--- | :----: | ---: |\n", 232 | "| `l.isdisjoint(r)` | None | non-empty intersection |\n", 233 | "| `l.issubset(r)` | `l < r` | all elements in l are in r, but `l != r` |\n", 234 | "| `l.issuperset(r)` | `l > r` | all elements in r are in l, but `l != r` |\n", 235 | "| `l.union(r)` | `l \\| r` | all elements in either l or r |\n", 236 | "| `l.intersection(r)` | `l & r` | all elements in both l and r |\n", 237 | "| `l.difference(r)` | `l - r` | all elements in l but not r |\n", 238 | "| `l.symmetric_difference(r)` | `l ^ r` | all elements in l or r but not in both |" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "For a downloadable version, check out this [fancy chart for set ops](input/set-operations.pdf)." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Python 3", 259 | "language": "python", 260 | "name": "python3" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.7.3" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 4 277 | } 278 | -------------------------------------------------------------------------------- /notebooks/python/here.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# dependencies\n", 10 | "import git\n", 11 | "import os\n", 12 | "import subprocess\n", 13 | "from pathlib import Path" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# support methods\n", 23 | "# snippets from this thread:\n", 24 | "# https://stackoverflow.com/questions/22081209/find-the-root-of-the-git-repository-where-the-file-lives\n", 25 | "\n", 26 | "# uses git module, gets top level\n", 27 | "def get_git_root(path):\n", 28 | " git_repo = git.Repo(path, search_parent_directories=True)\n", 29 | " git_root = git_repo.git.rev_parse(\"--show-toplevel\")\n", 30 | " return git_root\n", 31 | "\n", 32 | "\n", 33 | "# uses git module, gets working directory\n", 34 | "def get_git_wd(path):\n", 35 | " git_repo = git.Repo(path, search_parent_directories=True)\n", 36 | " cwd = git_repo.working_tree_dir\n", 37 | " return cwd\n", 38 | "\n", 39 | "\n", 40 | "# uses subprocess module, gets top level\n", 41 | "def get_git_root_sub():\n", 42 | " proc = subprocess.Popen(['git', 'rev-parse', '--show-toplevel'], stdout=subprocess.PIPE).communicate()[0]\n", 43 | " return proc.rstrip().decode('utf-8')\n", 44 | "\n", 45 | "\n", 46 | "# uses path module, gets first '.git' directory\n", 47 | "# This DOES NOT CURRENTLY WORK\n", 48 | "def find_repo(path):\n", 49 | " for path in Path(path).parents:\n", 50 | " git_dir = path / \".git\"\n", 51 | " if git_dir.is_dir():\n", 52 | " return path\n", 53 | "\n", 54 | "\n", 55 | "# should be a way to do it with `os.walk`" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "/home/bailey/training-docs/notebooks/python\r\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "!pwd" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "'/home/bailey/training-docs'" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "get_git_root('.')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "'/home/bailey/training-docs'" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | 
"get_git_wd('.')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "'/home/bailey/training-docs/notebooks/python'" 124 | ] 125 | }, 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "os.getcwd()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "'/home/bailey/training-docs'" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "get_git_root_sub()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "None\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "print(find_repo('.'))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 9, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "PosixPath('.')" 181 | ] 182 | }, 183 | "execution_count": 9, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "Path('.')" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.7.3" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 4 221 | } 222 | -------------------------------------------------------------------------------- /notebooks/python/input/set-operations.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HRDAG/training-docs/b5a9b827a010c21f5e647d761b898c321152fe88/notebooks/python/input/set-operations.pdf -------------------------------------------------------------------------------- /notebooks/python/magics-performance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "LEVEL1_ICACHE_SIZE 32768\r\n", 28 | "LEVEL1_ICACHE_ASSOC 8\r\n", 29 | "LEVEL1_ICACHE_LINESIZE 64\r\n", 30 | "LEVEL1_DCACHE_SIZE 32768\r\n", 31 | "LEVEL1_DCACHE_ASSOC 8\r\n", 32 | "LEVEL1_DCACHE_LINESIZE 64\r\n", 33 | "LEVEL2_CACHE_SIZE 262144\r\n", 34 | "LEVEL2_CACHE_ASSOC 8\r\n", 35 | "LEVEL2_CACHE_LINESIZE 64\r\n", 36 | "LEVEL3_CACHE_SIZE 36700160\r\n", 37 | "LEVEL3_CACHE_ASSOC 20\r\n", 38 | "LEVEL3_CACHE_LINESIZE 64\r\n", 39 | "LEVEL4_CACHE_SIZE 0\r\n", 
40 | "LEVEL4_CACHE_ASSOC 0\r\n", 41 | "LEVEL4_CACHE_LINESIZE 0\r\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "!getconf -a | grep CACHE" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df.values.nbytes" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Performance\n", 63 | "\n", 64 | "The `%time` line command measures the execution time of a given line and returns the CPU and wall time of execution." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "CPU times: user 47.1 ms, sys: 6.46 ms, total: 53.6 ms\n", 77 | "Wall time: 53.1 ms\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "%time out = [i*i for i in range(1000000)]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "`%timeit` executes the command given as input for 7 rounds where each round executes code 10 times totaling 70 times by default. It takes the best of each iteration in each round and gives time measurement with standard deviation. Below are some useful arguments of the command.\n", 90 | "\n", 91 | "- `n `: It accepts integer value specifying number of iteration per round.\n", 92 | "- `r `: It accepts integer value specifying number of rounds to test timer.\n", 93 | "- `t`: This option forces `%timeit` to use time.time to measure time which returns wall time.\n", 94 | "- `c`: This option forces `%timeit` to use time.clock to measure time which returns CPU time.\n", 95 | "- `q`: This option instructs `%timeit` to not print results to the output.\n", 96 | "- `o`: This option returns TimeitResult object.\n", 97 | "\n", 98 | "[Source](https://coderzcolumn.com/tutorials/python/list-of-useful-magic-commands-in-jupyter-notebook-lab)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "57.1 ms ± 481 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "%timeit out = [i*i for i in range(1000000)]" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## Next up...\n", 123 | "- `%doctest_mode`\n", 124 | "- `%prun` (has blob to copy and format), `%%prun`\n", 125 | "- `%lprun`\n", 126 | "- `%memit`, `%mprun`\n", 127 | "- `%%time`, `%%timeit`\n", 128 | "\n", 129 | "\n", 130 | "- [snakeviz](https://coderzcolumn.com/tutorials/python/snakeviz-visualize-profiling-results-in-python)\n", 131 | "- [memory_profiler](https://coderzcolumn.com/tutorials/python/how-to-profile-memory-usage-in-python-using-memory-profiler)\n", 132 | "- [line_profiler](https://coderzcolumn.com/tutorials/python/line-profiler-line-by-line-profiling-of-python-code)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.7.3" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 5 164 | } 165 | -------------------------------------------------------------------------------- /notebooks/python/os.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# dependencies\n", 10 | "from os import listdir\n", 11 | "from os import walk\n", 12 | "from os import scandir\n", 13 | "from os import cpu_count\n", 14 | "\n", 15 | "from os import link" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# support methods\n", 25 | "# stats also offers:\n", 26 | "# - user, group id\n", 27 | "# - times for last access or modification, and creation of item\n", 28 | "def sample_scan(src_dir, sample_n):\n", 29 | " count = 0\n", 30 | " assert sample_n in range(len(listdir(src_dir)))\n", 31 | " print(f'scanning {sample_n} items from {src_dir}')\n", 32 | " print('=========================================================')\n", 33 | " for item in scandir(src_dir):\n", 34 | " if count < sample_n:\n", 35 | " this_name = item.name\n", 36 | " this_path = item.path\n", 37 | " is_dir = item.is_dir()\n", 38 | " is_file = item.is_file()\n", 39 | " is_sym = item.is_symlink()\n", 40 | " stats = item.stat()\n", 41 | " dev = stats.st_dev\n", 42 | " n_hlinks = stats.st_nlink\n", 43 | " n_bytes = stats.st_size # The size of a symbolic link is the length of the pathname it contains, without a terminating null byte\n", 44 | " inode = stats.st_ino()\n", 45 | " assert stats.st_ino() == item.inode()\n", 46 | " print('current:\\t', this_path)\n", 47 | " print('directory:\\t', is_dir)\n", 48 | " print('symlink:\\t', is_sym)\n", 49 | " print('inode:\\t\\t', inode)\n", 50 | " print()\n", 51 | " count += 1\n", 52 | " return 1" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 15, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# main\n", 62 | "src_dir = '.'\n", 63 | "\n", 64 | 
"print('cpus:\\t', cpu_count())\n", 65 | "sample_scan(src_dir, sample_n=10)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## How do `listdir`, `walk`, and `scandir` differ?" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.7.3" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 4 104 | } 105 | -------------------------------------------------------------------------------- /notebooks/python/sets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# dependencies\n", 10 | "import numpy as np\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Features\n", 19 | "What are the cases for choosing a `set` over a `list` or a `dict`? What functionality do we get with sets that we don't with others and how is it useful?" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "In the simplest case, let's say we are iterating through some data and we want to capture all unique values we find that meet a particular condition.\n", 27 | " \n", 28 | "### [time complexity](https://wiki.python.org/moin/TimeComplexity)\n", 29 | "If we use a `list`, we will see the lowest overhead possible to append a new item, `O(1)` average and worst case, but we will have to deduplicate the list contents after building the collection.\n", 30 | "\n", 31 | "If we use a `dict`, we will make great time with appending new items with an average of `O(1)`, but in order to protect against duplicate keys, we estimate worst case time at `O(n)` to append a new item.\n", 32 | "\n", 33 | "If we use a `set`, we observe the same time complexity for adding a new item to the collection, but we get to save a little bit of memory by only storing keys instead of key-value pairs. In addition, `set` type objects come with statistical operations that can be used to examine relational features of the data.\n", 34 | "- `l.isdisjoint(r)`\n", 35 | "- `l.issubset(r)`\n", 36 | "- `l.issuperset(r)`\n", 37 | "- `l.union(r)`\n", 38 | "- `l.intersection(r)`\n", 39 | "- `l.difference(r)`\n", 40 | "- `l.symmetric_difference(r)`" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 28, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/html": [ 51 | "
\n", 52 | "\n", 65 | "\n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | "
average caseworst case
listO(1)O(1)
dictO(1)O(n)
setO(1)O(n)
\n", 91 | "
" 92 | ], 93 | "text/plain": [ 94 | " average case worst case\n", 95 | "list O(1) O(1)\n", 96 | "dict O(1) O(n)\n", 97 | "set O(1) O(n)" 98 | ] 99 | }, 100 | "execution_count": 28, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "time = {\n", 107 | " 'list': ['O(1)', 'O(1)'],\n", 108 | " 'dict': ['O(1)', 'O(n)'],\n", 109 | " 'set': ['O(1)', 'O(n)']\n", 110 | "}\n", 111 | "time_df = pd.DataFrame.from_dict(time, orient='index', columns=['average case', 'worst case'])\n", 112 | "time_df" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## Problem with ambiguous declaration\n", 120 | "- Cause: Partial assignment at initialization\n", 121 | "\n", 122 | "Because you don't have to use the name of an object to declare it in python, and sets and dicts both use curly brackets, `{}`, to denote themselves, python makes an inference at runtime as to which type is intended, based on the initialization.\n", 123 | "\n", 124 | "- Empty initialization: `info = {}`\n", 125 | " - Inferred to be `dict` type to leave the most room upfront\n", 126 | " \n", 127 | "- Partial initialization: `info = {'name', 'address', 'phone'}`\n", 128 | " - Inferred to be `set` type, since it was given keys but no values\n", 129 | " \n", 130 | "- Complete initialization: `info = {'name'=None, 'address'=None, 'phone'=np.nan}`\n", 131 | " - Inferred to be `dict`, since it has both keys and values\n", 132 | "\n", 133 | "Let's say we intend to collect some data that meets some criteria, and we have an idea of the keys we want to save. So, we fill in the keys in our dictionary notation and then proceed to filling values." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 4, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "ename": "TypeError", 143 | "evalue": "'set' object does not support item assignment", 144 | "output_type": "error", 145 | "traceback": [ 146 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 147 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 148 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0minfo\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'Kenny'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 149 | "\u001b[0;31mTypeError\u001b[0m: 'set' object does not support item assignment" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "info = {'name', 'address', 'phone'}\n", 155 | "info['name'] = 'Kenny'" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### Solution\n", 163 | "Okay, so we can't just throw keys into curly brackets and expect python to know we want a dictionary. But we still don't have the values at that stage, so we find the appropriate placeholder to denote a value that's missing _for now_." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 12, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "{'name': 'Kenny', 'address': None, 'phone': nan}" 175 | ] 176 | }, 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "info = {'name':None, 'address':None, 'phone':np.nan}\n", 184 | "info['name'] = 'Kenny'\n", 185 | "info" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "**Comments about solution**\n", 193 | "You might wonder if a single placeholder is enough to signal to the compiler that you want a dictionary, and the answer is yes and no.\n", 194 | "- Yes: If you only insert one key, then you only need one placeholder.\n", 195 | "- No: For every key you add, you need a placeholder value, or else you get a `SyntaxError`." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 8, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "ename": "SyntaxError", 205 | "evalue": "invalid syntax (, line 1)", 206 | "output_type": "error", 207 | "traceback": [ 208 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m info = {'name':None, 'address', 'phone'}\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "info = {'name':None, 'address', 'phone'}" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "**More comments**\n", 221 | "If you're wondering why we used two different placeholder values, check out `language-tips/python/missingness.md` in the training-docs repo, or read the \"Handling Missing Data\" topic in _Python for Data Science Handbook_." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Set operations\n", 229 | "\n", 230 | "| method | alternative | meaning |\n", 231 | "| :--- | :----: | ---: |\n", 232 | "| `l.isdisjoint(r)` | None | non-empty intersection |\n", 233 | "| `l.issubset(r)` | `l < r` | all elements in l are in r, but `l != r` |\n", 234 | "| `l.issuperset(r)` | `l > r` | all elements in r are in l, but `l != r` |\n", 235 | "| `l.union(r)` | `l \\| r` | all elements in either l or r |\n", 236 | "| `l.intersection(r)` | `l & r` | all elements in both l and r |\n", 237 | "| `l.difference(r)` | `l - r` | all elements in l but not r |\n", 238 | "| `l.symmetric_difference(r)` | `l ^ r` | all elements in l or r but not in both |" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "For a downloadable version, check out this [fancy chart for set ops](input/set-operations.pdf)." 
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Python 3", 259 | "language": "python", 260 | "name": "python3" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.7.3" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 4 277 | } 278 | -------------------------------------------------------------------------------- /notebooks/python/src/no_main.py: -------------------------------------------------------------------------------- 1 | # Authors: FL 2 | # Maintainers: FL 3 | # Copyright: YYYY, HRDAG, GPL v2 or later 4 | # ========================================= 5 | # Project-Name/parent-task/core-task/src/script.py 6 | 7 | # dependencies 8 | import argparse 9 | import logging 10 | import pandas as pd 11 | 12 | # support methods 13 | def check_asserts( val ): 14 | assert val 15 | 16 | 17 | def get_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--input") 20 | parser.add_argument("--output") 21 | return parser.parse_args() 22 | 23 | 24 | def get_logging(logname): 25 | logging.basicConfig(level=logging.DEBUG, 26 | format='%(asctime)s %(levelname)s %(message)s', 27 | handlers=[logging.FileHandler(logname), 28 | logging.StreamHandler()]) 29 | 30 | # setup logging 31 | get_logging("output/core-task.log”) 32 | 33 | # arg handling 34 | args = getargs() 35 | input_f = args.input 36 | output_f = args.output 37 | 38 | # read data, initial verification 39 | logging.info("Loading data.") 40 | raw_df = pd.read_ext(input_f) 41 | check_asserts(raw_df) 42 | 43 | logging.info('Summary:') 44 | logging.info('====================') 45 | logging.info('{:50}{}'.format('initial shape:', raw_df.shape )) 46 | logging.info('{:50}{}'.format('initial info:', raw_df.info() )) 47 | logging.info('\n') 48 | 49 | # save data 50 | raw.to_parquet(output_f) 51 | 52 | logging.info("done.") 53 | 54 | # done. 
-------------------------------------------------------------------------------- /notebooks/python/src/with_main.py: -------------------------------------------------------------------------------- 1 | # Authors: FL 2 | # Maintainers: FL 3 | # Copyright: YYYY, HRDAG, GPL v2 or later 4 | # ========================================= 5 | # Project-Name/parent-task/core-task/src/script.py 6 | 7 | # dependencies 8 | import argparse 9 | import logging 10 | import pandas as pd 11 | 12 | # support methods 13 | def check_asserts( val ): 14 | assert val 15 | 16 | 17 | def get_args(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--input") 20 | parser.add_argument("--output") 21 | return parser.parse_args() 22 | 23 | 24 | def get_logging(logname): 25 | logging.basicConfig(level=logging.DEBUG, 26 | format='%(asctime)s %(levelname)s %(message)s', 27 | handlers=[logging.FileHandler(logname), 28 | logging.StreamHandler()]) 29 | 30 | # main 31 | if __name__ == '__main__': 32 | 33 | # setup logging 34 | get_logging("output/core-task.log") 35 | 36 | # arg handling 37 | args = get_args() 38 | input_f = args.input 39 | output_f = args.output 40 | 41 | # read data, initial verification 42 | logging.info("Loading data.") 43 | raw_df = pd.read_ext(input_f)  # "read_ext" is a placeholder; swap in read_csv, read_parquet, etc. 44 | check_asserts(raw_df) 45 | 46 | logging.info('__main__ Summary:') 47 | logging.info('====================') 48 | logging.info('{:50}{}'.format('initial shape:', raw_df.shape )) 49 | logging.info('{:50}{}'.format('initial info:', raw_df.info() )) 50 | logging.info('\n') 51 | 52 | # save data 53 | raw_df.to_parquet(output_f) 54 | 55 | logging.info("done.") 56 | 57 | # done. -------------------------------------------------------------------------------- /notebooks/terminal/basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Terminal basics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# TODO: Add 'grep' examples" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### navigation" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 21, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "/home/bailey/training-docs/notebooks/terminal\r\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "# Where am I?\n", 41 | "! pwd" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 22, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "total 8\r\n", 54 | "-rw-rw-r-- 1 bailey svn 5752 Apr 7 12:56 basics.ipynb\r\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "# What's around me?\n", 60 | "! ls -l" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 23, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "total 8\r\n", 73 | "drwxrwxr-x 6 bailey svn 4096 Apr 7 12:37 python\r\n", 74 | "drwxrwxr-x 3 bailey svn 4096 Apr 7 12:56 terminal\r\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# What task is above me?\n", 80 | "! 
cd ..; ls -l" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 24, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "\u001b[01;34m.\u001b[00m\r\n", 93 | "├── \u001b[01;34mpython\u001b[00m\r\n", 94 | "│   ├── basic-stats.ipynb\r\n", 95 | "│   ├── \u001b[01;34minput\u001b[00m\r\n", 96 | "│   │   └── set-operations.pdf\r\n", 97 | "│   ├── intro_exercises.ipynb\r\n", 98 | "│   ├── intro.ipynb\r\n", 99 | "│   ├── \u001b[01;34moutput\u001b[00m\r\n", 100 | "│   ├── sets.ipynb\r\n", 101 | "│   └── \u001b[01;34msrc\u001b[00m\r\n", 102 | "│   ├── no_main.py\r\n", 103 | "│   └── with_main.py\r\n", 104 | "└── \u001b[01;34mterminal\u001b[00m\r\n", 105 | " └── basics.ipynb\r\n", 106 | "\r\n", 107 | "5 directories, 8 files\r\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# What is the structure of the project one level above me?\n", 113 | "! cd ..; tree" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 25, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "/home/bailey\r\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# Can I go home now?\n", 131 | "! cd; pwd" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 26, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "/home/bailey/training-docs/notebooks/terminal\r\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "# (jk, jupyter knows you're still in this notebook and where it runs from)\n", 149 | "! pwd" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### files and folders" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 27, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "total 12\r\n", 169 | "-rw-rw-r-- 1 bailey svn 5757 Apr 7 12:57 basics.ipynb\r\n", 170 | "drwxrwxr-x 2 bailey svn 4096 Apr 7 12:58 sample_dir\r\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "# I need a new directory here\n", 176 | "! mkdir sample_dir; ls -l" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 28, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "total 12\r\n", 189 | "-rw-rw-r-- 1 bailey svn 5757 Apr 7 12:57 basics.ipynb\r\n", 190 | "drwxrwxr-x 2 bailey svn 4096 Apr 7 12:58 sample_dir\r\n", 191 | "-rw-rw-r-- 1 bailey svn 0 Apr 7 12:58 uh-oh.txt\r\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "# I need a new file\n", 197 | "! touch ./uh-oh.txt; ls -l" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 12, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "total 0\r\n", 210 | "-rw-rw-r-- 1 bailey svn 0 Apr 7 12:45 test.txt\r\n", 211 | "-rw-rw-r-- 1 bailey svn 0 Apr 7 12:47 uh-oh.txt\r\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "# Oops, I wanted that file to go in the new folder\n", 217 | "! 
mv uh-oh.txt sample_dir/; cd sample_dir; ls -l" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 16, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "rmdir: failed to remove 'sample_dir': Directory not empty\r\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "# Nvm, I don't need any of that\n", 235 | "! rmdir sample_dir" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 17, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "total 8\r\n", 248 | "-rw-rw-r-- 1 bailey svn 4414 Apr 7 12:47 basics.ipynb\r\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "# Second attempt to get rid of that stuff\n", 254 | "! rm -r sample_dir; ls -l" 255 | ] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "Python 3", 261 | "language": "python", 262 | "name": "python3" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 3 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython3", 274 | "version": "3.7.3" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 4 279 | } 280 | -------------------------------------------------------------------------------- /onboarding/00-reading-list.md: -------------------------------------------------------------------------------- 1 | # Reading List 2 | Resources and suggested readings for the tools used at HRDAG. 3 | 4 | ## Workflow 5 | At HRDAG we organize our work in tasks (see `06-tasks.md`). If you're ever in doubt about project organization, [The Task Is A Quantum Of Workflow](https://hrdag.org/2016/06/14/the-task-is-a-quantum-of-workflow/) is a good reference. For an explanation about why HRDAG does **not** use the `.Rproj` tool in RStudio, see our post [.Rproj considered harmful](https://hrdag.org/tech-notes/harmful.html). 6 | 7 | ## Markdown 8 | A language for writing (this document is written in Markdown). Markdown files end with `.md`. 9 | 10 | - [Basic writing and formatting syntax](https://help.github.com/en/github/writing-on-github/basic-writing-and-formatting-syntax) has all of the information you need to get started writing and formatting text in no time. 11 | 12 | ## `R` & R Markdown 13 | At HRDAG we tend to use [tidyverse](https://www.tidyverse.org/) tools for data analysis in `R` as much as possible. We like the tidyverse because tidyverse packages are well developed and maintained. These tools use a common grammar that simplifies writing code that is clear to read and easily allows for multiple commands to be chained together. 14 | 15 | - Get started with this [introduction to tidy data principles](https://github.com/jennybc/lotr-tidy/blob/master/01-intro.md) to understand what it means for data to be "tidy". 16 | - [R for Data Science](https://r4ds.had.co.nz/) ([R para Ciencia de Datos](https://es.r4ds.hadley.nz/) en español) is the go-to tidyverse textbook. There's a lot of information covered in the textbook and it would be great to read all of it if you have the time, but we suggest getting started with: 17 | - Chapter 2 18 | - Chapter 4 19 | - Chapter 5 (don't forget the exercises!) 20 | - Chapter 7 (don't forget the exercises!) 21 | - R Markdown is a flavor of Markdown made especially for use with `R`. R Markdown files end with `.Rmd`. 
We use R Markdown because it lets us combine prose and code and we can write reports that automatically update if our results change (rather than hard coding numbers or graphs). [R Markdown: The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) is our go-to textbook for this. R Markdown can be used in many different ways, but Chapters 1 and 2 are sufficient to get started. 22 | 23 | ## Vim and the language of editing 24 | 25 | [_Practical Vim_](https://pragprog.com/titles/dnvim2/) is one of the best programming books I (PB) have ever read. I think there is a PDF that some of the training team colleagues have. I very strongly encourage you to study it closely. 26 | 27 | ## Writing 28 | It's useful to have a reference for the style and caliber of HRDAG's written reporting. Here are a few samples selected by our Executive Director, Megan Price. 29 | >[This](https://chance.amstat.org/2018/02/statistics-of-genocide/) is technically a magazine article, but it’s a statistics magazine (called _Chance_), so the audience is still assumed to be somewhat technical (or at least technically interested). Another example is [To Predict and Serve?](https://rss.onlinelibrary.wiley.com/doi/epdf/10.1111/j.1740-9713.2016.00960.x), the article Kristian and William wrote for Significance magazine. Over the years we’ve placed a number of pieces in both [Chance](https://chance.amstat.org/) and [Significance](https://www.significancemagazine.com/) and they’re pretty friendly outlets for our work (full disclosure: I’m currently on the editorial board for Significance). [Violence in Blue](https://granta.com/violence-in-blue/) is a fairly different example of general audience writing (Granta is also friendly to us, and published a couple of our Covid essays, but it’s technically a literary magazine!) 30 | > 31 | >[This article](https://hrdag.org/wp-content/uploads/2019/09/2019-DemographicResearch-civilian-killings-el-salvador.pdf) in Demographic Research is one example of peer-reviewed academic writing. Every field has its own style, and since we do so much interdisciplinary work, we end up needing to write things up in a variety of styles. This particular blend of political science and demography favors long papers, and this one is 36 pages. 32 | > 33 | >In 2019 Patrick wrote a series of memos for different projects that all use a similar MSE approach. I think these are good examples of the kinds of memos or white papers we sometimes write for partners (and also, when possible, publish on our webpage). [The Philippines](https://hrdag.org/wp-content/uploads/2019/07/2019-HRDAG-killings-philippines.pdf) [Indonesia](https://hrdag.org/wp-content/uploads/2018/12/KP-Palemban-ests.pdf) [Sri Lanka](https://hrdag.org/wp-content/uploads/2018/12/HRDAG-ITJPSL-2018-12-12-1.pdf) 34 | 35 | 36 | -------------------------------------------------------------------------------- /onboarding/01-pkg-manager-setup.md: -------------------------------------------------------------------------------- 1 | # Setting up a package manager 2 | 3 | Package managers are useful for installing and updating command-line tools. Instead of having to go through lots of complicated steps, you can execute one simple command. 
Here's an example: 4 | 5 | Without a package manager: 6 | ``` 7 | curl -LO https://github.com/neovim/neovim/releases/download/nightly/nvim-macos.tar.gz 8 | tar xzf nvim-macos.tar.gz 9 | ./nvim-osx64/bin/nvim 10 | ``` 11 | 12 | With a package manager: 13 | ``` 14 | brew install neovim 15 | ``` 16 | 17 | ## MacOS: Installing homebrew 18 | 19 | Homebrew is the most popular package manager for macOS. To install, copy and paste the following into the terminal and hit enter: 20 | 21 | ``` 22 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" 23 | ``` 24 | 25 | ## Linux: TODO 26 | 27 | ## Windows: TODO 28 | 29 | 30 | -------------------------------------------------------------------------------- /onboarding/02-git-setup.md: -------------------------------------------------------------------------------- 1 | # Setting up git and GitHub 2 | 3 | Git is a way of keeping track of changes inside a repository so that you can revert to previous versions and join your work with others. It's not totally ideal for our sort of data work, but it's the industry standard and so we use it. (We supplement it with our own version control system called `snap`, but that's a different lesson.) 4 | 5 | GitHub is a website that provides a pretty interface for git, and makes collaboration with organizations easier. We use it for most projects at HRDAG. 6 | 7 | ## Set up steps 8 | 9 | Git is likely already installed on your computer, so mostly you'll just need to set up GitHub. 10 | 11 | 1. Get a GitHub account (https://github.com/join). 12 | 2. Connect your GitHub account with an ssh key (https://help.github.com/en/github/authenticating-to-github/connecting-to-github-with-ssh). Your SSH key lets GitHub know that you are who you say you are when asking for files. It's like a more-secure password that you don't have to remember. 13 | 14 | That's it! 15 | -------------------------------------------------------------------------------- /onboarding/03-python-setup.md: -------------------------------------------------------------------------------- 1 | # Setting up python 2 | 3 | Python is one of the two languages we use for most of our data analysis. You likely already have a version on your laptop, but it might be out of date and/or missing packages crucial for data analysis. To solve this, we use Anaconda, which manages python and its packages. 4 | 5 | ## Set up steps 6 | 7 | 1. Visit https://docs.anaconda.com/anaconda/install/ and select your operating system. 8 | 2. Select the most recent version of python supported by Anaconda. As of 3 April 2020, that's python 3.7. 9 | 3. Follow the steps on the installer. 10 | 11 | If you are using a non-POSIX compliant shell such as `fish`, you will have to manually add anaconda into your path. (If you don't know what this means, don't worry about it.) 12 | 13 | ## jupyter 14 | 15 | `jupyter` is an interactive interface for python, which we use for data exploration and prototyping code. It comes with Anaconda—you can just run `jupyter notebook` in a terminal and it'll show up. 16 | -------------------------------------------------------------------------------- /onboarding/04-R-setup.md: -------------------------------------------------------------------------------- 1 | # Setting up R 2 | 3 | R is one of the two languages we use for most of our data analysis. It was built for working with data and it's better than Python for data visualization. 
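Before working through the set up steps below, it can be worth checking what is already on your machine. A quick sketch from the terminal (the exact versions printed will differ on your system):

```
# Do I already have R, and which version?
R --version

# The same check works for the tools from the earlier setup docs
python --version
jupyter --version
git --version
```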
4 | 5 | # Set up steps 6 | For Mac 7 | 8 | If you are using a Mac, make sure you have homebrew installed. 9 | Run `brew install r`. 10 | [Here](https://www.r-bloggers.com/how-to-install-r-on-mac-ubuntu-and-windows/) 11 | you can find a good guide for Mac users. 12 | 13 | # RStudio installation 14 | 15 | 1. Visit https://www.r-project.org/ 16 | 2. Select the most recent version of R. 17 | 3. Follow the steps on the installer. 18 | 19 | Keep in mind that R needs to be installed before RStudio. You can download R 20 | [here](https://cran.rstudio.com/). 21 | 22 | # RStudio thru a browser 23 | 24 | [Here](https://support.rstudio.com/hc/en-us/articles/234653607-Getting-Started-with-RStudio-Server) you can find a good article on RStudio server. 25 | 26 | # Read the book 27 | [_R para Ciencia de Datos_ en castellano](https://es.r4ds.hadley.nz/) 28 | 29 | 30 | -------------------------------------------------------------------------------- /onboarding/05-bash.md: -------------------------------------------------------------------------------- 1 | # Bash 2 | 3 | Bash is a language to interact with the Unix/Linux file system and process tree. It’s a sort of meta-language, within which everything else on your computer lives. 4 | 5 | Here are some things to try to get started with Bash: 6 | 7 | Open up your terminal, and type in `cd`. This will change directories (folders are called directories in Unix) to the 'home' directory, which is unique for each user. Now, if you type `ls`, it will list all the non-hidden files and directories in whatever directory you're in—right now, your home directory. But what about the hidden files? Those files are just the files that start with dots (.). To see them, use `ls -a`. These hidden 'dotfiles' will come up later, when you’re customizing your environment. 8 | 9 | You’ve probably noticed something else by now too, which is the prompt. This is the text that is displayed before your commands. For you it probably looks like: `: $`. This prompt is bad for a couple of reasons: it has no color (color helps your eye pick out the different parts); it's laid out poorly, so it's hard to read; and it doesn't have your complete path. 10 | 11 | With a complete path, I can tell who I am, where I am (eleanor for example) and where I am specifically (`~/git/GT-fingerprints/individual`). I then have my command on its own line so if I have an especially long path, the command I type in will still have room. You should figure out what prompt you'd like to have, and then program it in! Here are some basic tips: 12 | 13 | 1. Assign your prompt formula to the `PS1` variable in bash 14 | 2. Do all your work in `~/.bashrc`. This is one of those dotfiles; it's run automatically so bash can see your preferences. Backslashed characters are your friends! They’re how you can put variables into your prompt. 
Check out this list for some: https://www.tldp.org/HOWTO/Bash-Prompt-HOWTO/bash-prompt-escape-sequences.html 15 | 16 | These links are useful for getting started with Bash: 17 | 18 | - http://tldp.org/HOWTO/Bash-Prog-Intro-HOWTO.html 19 | - http://cs.lmu.edu/~ray/notes/bash/ 20 | - https://programminghistorian.org/en/lessons/intro-to-bash (Spanish: https://programminghistorian.org/es/lecciones/introduccion-a-bash) 21 | - https://www.digitalocean.com/community/tutorials/an-introduction-to-the-linux-terminal 22 | - https://www.tjhsst.edu/~dhyatt/superap/unixcmd.html (essential, valuable, and useful lists are good—the rest are a tad out of date) 23 | - https://softcover.s3.amazonaws.com/636/learn_enough_command_line/images/figures/anatomy.png (an image explaining bash's structure) 24 | - http://tldp.org/LDP/abs/html/ (way too in depth, and focused on scripting, not command line. Could be useful as a 'dictionary') 25 | - https://www.youtube.com/watch?v=oxuRxtrO2Ag&t=3922s (highly recommend 1.5-2x speed) 26 | 27 | 28 | -------------------------------------------------------------------------------- /onboarding/06-tasks.md: -------------------------------------------------------------------------------- 1 | # Tasks 2 | 3 | At HRDAG, our projects have a lot of what we call "more-than-twos." We have more than two programmers working on a project, more than two datasets, or more than two programming languages. To handle all this complexity, we impose structure onto the project: tasks. A task represents one step on the journey from the original datasets to the final analysis. 4 | 5 | Tasks, at a minimum, have three directories: `input/`, `src/`, and `output/`. `input/` contains the initial data, and should be read only. `src/` is how Unix spells "source," and so contains the source code that processes the data. `output/` contains the output files, and should only have files generated from running the code in `src/`. 6 | 7 | There are other directories that might be in a task as well. Here they are: 8 | * `note/` contains prototyping notebooks (e.g., from jupyter or RStudio). 9 | * `hand/` contains hand-made files, like a csv that translates from city/state (or municipio/departamento) to a numerical geocode. 10 | * `frozen/` contains data files that don't fit in `input/` or `output/`. This will happen when the data is so broken that our open source tools can't deal with it, and so we have to hand-edit it or use another program to open and resave it. 11 | * `doc/` contains documentation. 12 | 13 | Tasks generally chain together, so that the output from one task becomes the input of the next. By convention, every branch starts with an `import/` task that converts the file into the correct format for our operations and an `export/` task that converts it to the correct format for whatever the final product is. Importantly, these tasks exist even when the data is already in the correct format. That means that, when you visit the project in six months, you still know where it starts and where it ends. 14 | 15 | Read [The Task Is A Quantum of Workflow](https://hrdag.org/2016/06/14/the-task-is-a-quantum-of-workflow/) and watch [Patrick Ball: Principled Data Processing](https://www.youtube.com/watch?v=ZSunU9GQdcI) for more information on this approach. 
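To make the layout concrete, here is a sketch of how two chained tasks might look on disk for a hypothetical project (all of the file names are invented for illustration):

```
CO-example/
├── import/
│   ├── Makefile
│   ├── input/                 # original data, read-only
│   │   └── raw.csv
│   ├── src/
│   │   └── import.R
│   └── output/
│       └── records.parquet
└── clean/
    ├── Makefile
    ├── input/
    │   └── records.parquet    # often a symlink to ../import/output/records.parquet,
    │                          # or the Makefile passes that path to src/clean.R directly
    ├── src/
    │   └── clean.R
    └── output/
        └── records-clean.parquet
```

The `clean/` task never reaches into `import/src/`; it only reads what `import/` produced in its `output/`.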
16 | -------------------------------------------------------------------------------- /onboarding/07-living-in-the-terminal.md: -------------------------------------------------------------------------------- 1 | # Living in the terminal 2 | 3 | Here's a collection of tips to help you when you spend time in the terminal. 4 | 5 | ## `alias` 6 | 7 | This command lets you easily create shortcuts for other commands. It's especially useful to ensure you don't forget an option. Here are some examples: 8 | 9 | `alias ll="ls -AlFGgh --color='always'"`. This binds `ll` to a more featured version of `ls`. It shows permissions, puts each file or directory on its own line, color codes different kinds of files, and more! 10 | 11 | `ls` output: 12 | ``` 13 | README.md bash setup.md setup.sh vim 14 | ``` 15 | 16 | `ll` output: (pretend there are colors, too) 17 | ``` 18 | total 28K 19 | -rw-r--r-- 1 8.1K Aug 4 2017 .DS_Store 20 | drwxr-xr-x 16 512 Aug 29 2019 .git/ 21 | -rw-r--r-- 1 35 Jun 21 2018 .gitignore 22 | -rw-r--r-- 1 27 Jun 26 2017 README.md 23 | drwxr-xr-x 7 224 Aug 29 2019 bash/ 24 | -rw-r--r-- 1 494 Jul 12 2018 setup.md 25 | -rw-r--r-- 1 3.2K Jul 28 2017 setup.sh 26 | drwxr-xr-x 11 352 Jul 22 2019 vim/ 27 | ``` 28 | 29 | I've found this block especially useful: 30 | ``` 31 | alias gs="git status" 32 | alias gc="git commit -m" 33 | alias gA="git add -A && git status" 34 | ``` 35 | This enforces good git hygiene--I can't commit without a commit message, because if I try to just `gc`, it throws an error! 36 | 37 | Be sure to place these in your bashrc so they are there every time you open your terminal. 38 | 39 | ## `tree` 40 | 41 | `tree` is an awesome command that gives you a more visual look at your file structure. Here's its output on the same directory as above: 42 | ``` 43 | . 44 | ├── README.md 45 | ├── bash 46 | │   ├── bashrc 47 | │   ├── featherhead.py 48 | │   ├── fromproj.py 49 | │   ├── projpath.py 50 | │   └── toproj.py 51 | ├── setup.md 52 | ├── setup.sh 53 | └── vim 54 | ├── UltiSnips 55 | │   ├── make.snippets 56 | │   ├── python.snippets 57 | │   └── yaml.snippets 58 | ├── ftplugin 59 | │   ├── make.vim 60 | │   ├── markdown.vim 61 | │   └── text.vim 62 | ├── hi-output 63 | ├── hi-presets.vim 64 | ├── parens.vim 65 | ├── plugs.vim 66 | ├── process-hi.py 67 | ├── vimconfigs.sh 68 | └── vimrc 69 | 70 | 4 directories, 21 files 71 | ``` 72 | Two great options for `tree` are `-C`, which adds color, and `-L NUMBER`, which makes tree only look NUMBER layers down. So, `tree -L 1` to the same directory gives: 73 | ``` 74 | . 75 | ├── README.md 76 | ├── bash 77 | ├── setup.md 78 | ├── setup.sh 79 | └── vim 80 | 81 | 2 directories, 3 files 82 | ``` 83 | 84 | ## vim keybindings in bash 85 | 86 | Add `set -o vi` to your bashrc. It's vi, not vim, so it's missing some features that you might be used to (the most noticeable for me is the lack of text objects, so `ciw` or `da"` don't work). The one other issue is that there isn't a great way to tell which mode you're in (you default to insert), so sometimes you get confused. 87 | 88 | ## `cd -` 89 | 90 | `cd -` jumps back to the last directory you were in. So, if you're in `~/git/HRDAG-training`, and you `cd /etc`, then `cd -`, you'll be back at `~/git/HRDAG-training`. It's helpful if you want to jump into another place briefly to get something done. 91 | 92 | ## searching past bash commands 93 | 94 | When you're on the command line, typing CTRL-r triggers a search mode. 
Essentially, it takes whatever you type and finds the most recent command you executed that contains that string. It's super useful when you want to re-run a long command from a while ago. 95 | 96 | 97 | -------------------------------------------------------------------------------- /onboarding/08-nvim.md: -------------------------------------------------------------------------------- 1 | # config files, where do they go? 2 | 3 | traditional vim has the config file at `~/.vimrc`. we like to use neovim, 4 | though, and neovim places it in `~/.config/nvim/init.vim`. 5 | 6 | # useful vim options 7 | 8 | These are basic options that we always use. 9 | 10 | you can set these in your `init.vim`. To understand what they do, use the colon 11 | prompt in neovim, for example if you are in normal mode, you can type `:help 12 | hidden` to get a detailed explanation. 13 | 14 | - `set ignorecase` and `set smartcase`: these are both related to the 15 | functionality of search; together they make searches case-insensitive unless your pattern contains an uppercase letter 16 | 17 | - `syntax enable`: enables syntax highlighting, this is huge for making code 18 | easier to read and write, but it'll work best if you find a colorscheme that 19 | is both comfortable to look at, and that clearly distinguishes between 20 | different types of code objects in a way that your eyes can quickly 21 | differentiate. 22 | 23 | - `colorscheme XXXX`: in normal mode, if you type `:colorscheme ` (make sure 24 | there's a blank space at the end) and then hit the `<Tab>` key, the 25 | autocomplete will show you the available color schemes. You should pick one 26 | that works for you. You can also use your plug-in manager to install 27 | additional color schemes, for instance 28 | [monokai](https://github.com/tanvirtin/monokai.nvim) or 29 | [gruvbox](https://github.com/morhetz/gruvbox) 30 | 31 | - `set hidden`: keeps buffers open (including undo history, etc) when you 32 | switch over to another one 33 | 34 | - `set relativenumber` and `set number`: `number` shows the current line number 35 | on the left of the editor. `relativenumber` shows, for each line, the number 36 | of lines up or down from the current cursor position. 37 | 38 | - `set colorcolumn=80`: this adds a line of color at 80 characters wide, which 39 | is helpful when you're coding to help you notice when a line has gotten too 40 | long. 41 | 42 | # nvim plugins 43 | 44 | There are various plugin managers for neovim; 45 | [vim-plug](https://github.com/junegunn/vim-plug) works pretty well 46 | 47 | Here are a couple of useful plugins you can install once you've got vim-plug 48 | set up: 49 | 50 | - [Ultisnips](https://github.com/sirver/UltiSnips) 51 | - [vim-commentary](https://github.com/tpope/vim-commentary) 52 | 53 | -------------------------------------------------------------------------------- /onboarding/09-parquet.md: -------------------------------------------------------------------------------- 1 | Author: LB 2 | Maintainer: BP 3 | 4 | ## Parquet Files 5 | What is Parquet? According to Databricks.com, “Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk”. 6 | 7 | ## What this means 8 | Parquet reads, stores, and writes data by column rather than by row. It turns out that passing data around this way is less expensive than working in a row-oriented direction. 
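As a quick illustration of what column-oriented reading buys you, here is a sketch using pandas (the file and column names are made up; pandas reads and writes Parquet through pyarrow or fastparquet under the hood):

```python
import pandas as pd

# write a small table to Parquet
df = pd.DataFrame({
    "name": ["ana", "luis", "maría"],
    "event_type": ["party", "party", "march"],
    "year": [2017, 2018, 2018],
})
df.to_parquet("events.parquet")

# read back only the columns we need -- the other columns never leave the disk
subset = pd.read_parquet("events.parquet", columns=["event_type", "year"])
```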
9 | 10 | ## Benefits 11 | According to Russell Jurney, column-oriented formats store each column of data together and can load columns one at a time. This leads to two performance optimizations: 12 | 1. You only pay for the columns you load. This is called columnar storage. 
Let m be the total number of columns in a file and n be the number of columns requested by the user. Loading n columns results in just n/m of the raw I/O volume. 13 | 2. The similarity of values within separate columns results in more efficient compression. This is called columnar compression. 14 | 15 | Note the event_type column in both row and column-oriented formats in the diagram below. A compression algorithm will have a much easier time compressing repeats of the value 'party' in this column if they make up the entire value for that row, as in the column-oriented format. By contrast, the row-oriented format requires the compression algorithm to figure out that repeats occur at some offset in the row, which will vary based on the values in the previous columns. This is a much more difficult task. 16 | 17 | (unlinked image representing storage format diagrams) 18 | 19 | The column-oriented storage format can load just the columns of interest. Within these columns, similar or repeated values such as ‘party’ within the ‘event_type’ column compress more efficiently. 20 | Columnar storage combines with columnar compression to produce dramatic performance improvements for most applications that do not require every column in the file. I have often used PySpark to load CSV or JSON data that took a long time to load and converted it to Parquet format, after which using it with PySpark or even on a single computer in Pandas became quick and painless. 21 | 22 | More later... 23 | - Pyarrow 24 | - Fastparquet 25 | 26 | done. 27 | -------------------------------------------------------------------------------- /onboarding/10-auditing.md: -------------------------------------------------------------------------------- 1 | We'd like to catch surprises in our processing pipelines as early as possible, 2 | as that will allow us to avoid more serious problems, and also make it easier 3 | to debug. 4 | 5 | # Assertions 6 | 7 | Assertions are a way to test assumptions or verify things that should be true. 8 | An assertion is made up of a condition (some expression that evaluates to 9 | true/false) and, optionally, a custom error message. When the condition is 10 | false, we get an error. 
11 | 12 | ## Syntax 13 | 14 | In python, we can use the `assert` statement: 15 | 16 | ```python 17 | >>> assert 1 == 1 18 | >>> assert 1 == 2 19 | Traceback (most recent call last): 20 | File "<stdin>", line 1, in <module> 21 | AssertionError 22 | ``` 23 | 24 | A more informative error message will make debugging easier: 25 | 26 | ```python 27 | >>> assert 1 == 2, "input values should be equal" 28 | Traceback (most recent call last): 29 | File "<stdin>", line 1, in <module> 30 | AssertionError: input values should be equal 31 | ``` 32 | 33 | Julia has a similar syntax, using the built-in `@assert` macro: 34 | 35 | ```julia 36 | julia> @assert 1 == 1 37 | 38 | julia> @assert 1 == 2 39 | ERROR: AssertionError: 1 == 2 40 | Stacktrace: 41 | [1] top-level scope 42 | @ REPL[2]:1 43 | ``` 44 | 45 | Once again, we can add an informative error message: 46 | 47 | ```julia 48 | julia> @assert 1 == 2 "input values should be equal" 49 | ERROR: AssertionError: input values should be equal 50 | Stacktrace: 51 | [1] top-level scope 52 | @ REPL[1]:1 53 | ``` 54 | 55 | R, on the other hand, uses `stopifnot`, which we call like a function: 56 | 57 | ```r 58 | > stopifnot(1 == 1) 59 | > stopifnot(1 == 2) 60 | Error: 1 == 2 is not TRUE 61 | ``` 62 | 63 | Use named arguments to specify custom error messages, and you can use one 64 | `stopifnot` to check multiple assumptions: 65 | 66 | ```r 67 | > stopifnot("input values should be equal" = 1 == 2, 68 | "characters should be equal" = 'a' == 'a') 69 | Error: input values should be equal 70 | ``` 71 | 72 | If multiple assertions in a `stopifnot` fail, only the first error is thrown. 73 | 74 | ## Examples of useful assertions 75 | 76 | A helpful assertion will encode some substantive knowledge you have about the 77 | data in the form of an expectation. 78 | 79 | ### we haven't dropped data or created duplicates 80 | 81 | Code that joins data frames typically assumes the join neither drops rows nor creates duplicates. An assertion can verify that: 82 | 83 | 84 | ```r 85 | input <- read(args$input) 86 | supplement <- read(args$supplement) 87 | 88 | output <- input %>% inner_join(supplement, by = "id") 89 | 90 | stopifnot(nrow(output) == nrow(input)) 91 | write_parquet(output, args$output) 92 | ``` 93 | 94 | ### constraints on discrete values a field should take 95 | 96 | R example: 97 | 98 | ```r 99 | clean_date <- function(dates) { 100 | out <- some_parsing_code(dates) 101 | stopifnot( 102 | all(out$month %in% 1:12), 103 | all(out$day %in% 1:31) 104 | ) 105 | return(out) 106 | } 107 | ``` 108 | 109 | python example: 110 | 111 | ```python 112 | def clean_age_group(ages): 113 | out = some_parsing_code(ages) 114 | assert all(grp in ['adult', 'child'] for grp in out) 115 | return out 116 | ``` 117 | 118 | ### fields that have a known prior distribution 119 | 120 | ```r 121 | clean_age <- function(ages) { 122 | clean_ages <- some_parsing_code(ages) 123 | stopifnot( all(clean_ages < 150) ) 124 | return(clean_ages) 125 | } 126 | ``` 127 | 128 | ```python 129 | def extract_year(date_strings, conflict_start, conflict_end): 130 | years = some_extraction_code(date_strings) 131 | assert all(y >= conflict_start and y <= conflict_end for y in years) 132 | return years 133 | ``` 134 | 135 | ### knowledge about the data sources or generating processes 136 | 137 | Some of our background knowledge gets encoded into our models as priors, making 138 | that knowledge unsuitable for this type of test. But, for a variety of reasons, 139 | we often have substantive knowledge or assumptions about the distribution of 140 | data that we do not encode into our models. 
The content of these assertions is 141 | context and/or data-source specific. 142 | 143 | ```r 144 | stopifnot(sum(data$sex == "M") > sum(data$sex == "F")) 145 | stopifnot(sum(data$age_cat == "ADULT") > sum(data$age_cat == "CHILD")) 146 | ``` 147 | 148 | ```python 149 | # e.g. when one data source is known to be an aggregate 150 | # collection that covers the other 151 | assert overlap_rate(source1, source2) > .5 152 | ``` 153 | 154 | # Unit tests 155 | 156 | When code has intricate logic or has to handle a lot of weird edge cases, use 157 | unit tests to make sure it works. These are especially valuable when you need 158 | to make changes to the code, allowing you to confirm that the changes haven't 159 | broken anything. 160 | 161 | ```r 162 | library(testthat) 163 | 164 | samples <- c( 165 | "123 Main Street", 166 | "9866 N. Park Avenue", 167 | " 789 15th", 168 | " 123 Main St." 169 | ) 170 | 171 | hand_cleaned <- c( 172 | "123 MAIN STREET", 173 | "9866 N PARK AVENUE", 174 | "789 15th", 175 | "123 MAIN STREET" 176 | ) 177 | 178 | test_that("address cleaning works as expected", { 179 | expect_equal(clean(samples), hand_cleaned) 180 | expect_type(clean(samples), "character") 181 | }) 182 | ``` 183 | 184 | If there are more than a few examples, you can store them in a YAML or CSV 185 | file. As you encounter bugs or corner cases, add them to the examples. 186 | 187 | # Audits that require some manual intervention 188 | 189 | Assertions and unit tests can automatically stop data processing rather than 190 | outputting incorrect results. Sometimes, we can design audit reports that don't 191 | necessarily throw errors, but can allow a person reviewing them to spot 192 | potential issues. 193 | 194 | example: plot histograms of the length of the person's name for each imported 195 | data source side by side, and look for any that are out of the ordinary. If the 196 | names all came from the same time and place, they should have similar 197 | statistical characteristics (similar lengths, similar distribution of 198 | characters, etc.) 199 | 200 | -------------------------------------------------------------------------------- /onboarding/MakeFiles.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MakeFile" 3 | output: pdf_document 4 | header-includes: 5 | - \usepackage{float} 6 | - \usepackage{hyperref} 7 | - \hypersetup{ 8 | colorlinks=true, 9 | linkcolor=black, 10 | citecolor = black, 11 | urlcolor=blue, 12 | } 13 | fontsize: 12pt 14 | --- 15 | 16 | # What is a MakeFile? 17 | 18 | A MakeFile is a file that tells the `make` program how to build a task's outputs from its code. We have one of them for each of the tasks that we create in our data analysis. We locate it inside of a task but outside of any of the directories (input, output, src, hand, frozen). 19 | 20 | # What should it look like? 21 | 22 | 1. It must always contain a header, like any other text file we create, containing: 23 | 24 | Author: 25 | Maintainers: 26 | Date: Oct. 16 2019 27 | License: 28 | ------ 29 | path 30 | 31 | 2. Then you must follow this structure: 32 | 33 | 2.1 For R 34 | 35 | .PHONY: all clean 36 | 37 | all: path(s) to the output 38 | 39 | clean: 40 | rm output/* 41 | 42 | path to the output1: \ 43 | script for creating that output 44 | Rscript --vanilla $< 45 | 46 | path to the output1: \ 47 | script for creating that output 48 | Rscript --vanilla $< 49 | 50 | 3. Example 51 | 52 | We want to create a MakeFile for our task "append" in our data analysis about killings of social movement leaders in Colombia. 
The output will be a database with all of the different years we have. We will also create a logfile with basic information that we will use later in our report. 53 | 54 | Author: VRA 55 | Maintainers: VRA, PB, CAF 56 | Date: Oct. 16 2019 57 | License: GPL-2 or newer 58 | ------ 59 | CO-leaders/append/Makefile 60 | 61 | .PHONY: all clean 62 | 63 | all: output/allyears.rds \ 64 | output/logfileappend.log 65 | 66 | clean: 67 | rm output/* 68 | 69 | output/allyears.rds: \ 70 | src/appendyears.R 71 | cd ../clean && make 72 | Rscript --vanilla $< 73 | 74 | output/logfileappend.log: \ 75 | src/appendyears.R 76 | cd ../clean && make 77 | Rscript --vanilla $< 78 | 79 | done 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /onboarding/README.md: -------------------------------------------------------------------------------- 1 | # HRDAG Training 2 | 3 | This repo contains training materials for HRDAG colleagues. As you read, you'll likely encounter questions, and as you try things in practice, you'll run into incomprehensible errors. 4 | 5 | If this happens to you, after a few minutes of trying to fix it, you should ask for help! One of your colleagues may have seen the error you're trying to deal with, or be familiar with related problems. 6 | 7 | ## Before you ask for help 8 | 9 | The reason it's good to try to solve the problem on your own (besides the fact that you might actually fix it) is that it will help you clarify your question. A few minutes poking around might reveal that what you thought was an issue with your ssh config is actually a problem with your bashrc. You may still not know *what* the problem is, but it'll be easier for someone to help. 10 | 11 | Here are some tips on bug-fixing: 12 | * Copy and paste the relevant part of the error message into Google 13 | * Search the issues in [this repository](https://github.com/HRDAG/training-docs/issues). As of August 2022, there aren't that many, so you might even glance through all the titles and see if there's anything relevant. 14 | * If it's in a script, try commenting out the offending line and see if you get a different, interesting error 15 | 16 | The amount of time you spend trying to solve your problem before you ask for help should gradually get longer. In the beginning, don't spend more than 5 minutes, but that will naturally become 15, 30, or 120 minutes. 17 | 18 | ## Asking for help 19 | 20 | When you do ask for help, the best place is to [make a new issue](https://github.com/HRDAG/training-docs/issues/new) in this repository. That way, future HRDAG people can benefit from your knowledge. 21 | 22 | Here are some guidelines for asking good questions: 23 | * explain why you are trying to do what you're doing. There might be a tool that fits the problem better. 24 | * if you're trying to fix an error, copy and paste the entire error message into the question. Use markdown's block-code feature (enclose the error message in `` ``` ``s) to retain formatting. 25 | * describe the steps you've already taken, and what the results were. 26 | 27 | Have fun, and ask away! 
28 | 29 | 30 | -------------------------------------------------------------------------------- /onboarding/Reproducibility.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Some thoughts on replicability" 3 | output: pdf_document 4 | header-includes: 5 | - \usepackage{float} 6 | - \usepackage{hyperref} 7 | - \hypersetup{ 8 | colorlinks=true, 9 | linkcolor=black, 10 | citecolor = black, 11 | urlcolor=blue, 12 | } 13 | fontsize: 12pt 14 | --- 15 | 16 | # Why is being able to audit things important? 17 | 18 | Being able to replicate the results that we get by analyzing data is key for our reputation. In general, being able to replicate something means that one can get the same result over and over again using the same input data and script. In science we also talk about *reproducibility*, which involves more experimental variation. Reproducibility/replicability is a best practice in science, and it is one of the reasons we work from the command line rather than inside program environments such as RStudio. 19 | 20 | When we have replicable results we can share our methods with other scientists, which creates two positive externalities. On the one hand, our results can be validated by other researchers, giving them credibility. On the other hand, we can share our methods with other researchers, leading to economies of scale and preventing duplicated effort. 21 | 22 | The world is facing a reproducibility crisis. Nature magazine conducted [a survey](https://www.nature.com/news/1-500-scientists-lift-the-lid-on-reproducibility-1.19970) on this topic and found that "more than 70% of researchers have tried and failed to reproduce another scientist's experiments, and more than half have failed to reproduce their own experiments". 23 | 24 | Although this crisis is faced by many fields, working in human rights puts extra pressure on us to get replicability right. Even though our methods and results are unbiased and we do not manipulate any step of the process for getting the truth, it is very likely that some actors will not like our findings. Therefore, when we publish our results and human rights advocates use them, we need to be sure that the results have no mistakes. Guaranteeing replicability is key for this. 25 | 26 | # Some readings 27 | 28 | You can click [here](https://github.com/ropensci-archive/reproducibility-guide/blob/gh-pages/sections/references/index.md) for some readings on reproducible results. 
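In practice, replicability in our task-based workflow mostly comes down to one test. A sketch, assuming a task laid out as in `06-tasks.md` with a Makefile like the template in `templates/`:

```
cd my-project/clean      # any task in the project (path is hypothetical)
make clean               # delete everything in output/
make                     # rebuild output/ from input/ and src/ alone
```

If the rebuilt files match what was there before, anyone with the same inputs and scripts can get the same results.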
29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /templates/Makefile: -------------------------------------------------------------------------------- 1 | # vim: set ts=8 sts=0 sw=8 si fenc=utf-8 noet: 2 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 3 | # Authors: FL 4 | # Maintainers: FL 5 | # Copyright: YYYY, HRDAG, GPL v2 or later 6 | # ========================================= 7 | # Project-Name/parent-task/core-task/Makefile 8 | 9 | # ---- dependencies {{{ 10 | HERE := $(shell git rev-parse --show-toplevel) 11 | input := $(HERE)/other-parent/other-core/output/filename.ext 12 | subtask_target := $(HERE)/parent-task/sub-task 13 | py_target := $(HERE)/parent-task/core-task/output/py-out.parquet 14 | r_target := $(HERE)/parent-task/core-task/output/r-out.parquet 15 | # }}} 16 | 17 | # ---- standard {{{ 18 | .PHONY: help all clean sym_target 19 | 20 | help: 21 | @echo "---------------HELP-----------------------------" 22 | @echo "To purge existing output, run make clean" 23 | @echo "To rebuild all output, run make all" 24 | @echo "To rebuild a single target, run make target_name" 25 | @echo "------------------------------------------------" 26 | 27 | all: subtask_target sym_target py_target r_target 28 | 29 | clean: 30 | -rm -r output/* 31 | # }}} 32 | 33 | # "$<" tells Make to fill in the variable with the first dependency passed to the target 34 | # "$@" tells Make to fill in the variable with the target name 35 | # ---- task-specific {{{ 36 | subtask_target: dependency 37 | cd $@ && make 38 | 39 | sym_target: dependency 40 | -mkdir output 41 | cd output && ln -s ../$< . 42 | 43 | py_target: \ 44 | src/task.py \ 45 | $(input) 46 | -mkdir output 47 | python $< \ 48 | --input=$(input) \ 49 | --output=$@ 50 | 51 | r_target:\ 52 | src/task.R \ 53 | $(input) 54 | -mkdir output 55 | Rscript --vanilla $< \ 56 | --input=$(input) \ 57 | --output=$@ 58 | # }}} 59 | 60 | # done. 61 | -------------------------------------------------------------------------------- /templates/gitattributes: -------------------------------------------------------------------------------- 1 | # Track all input/, output/, and frozen/ directories for large files 2 | **/input/** filter=lfs diff=lfs merge=lfs -text 3 | **/frozen/** filter=lfs diff=lfs merge=lfs -text 4 | **/output/** filter=lfs diff=lfs merge=lfs -text 5 | -------------------------------------------------------------------------------- /templates/sample.R: -------------------------------------------------------------------------------- 1 | # vim: set ts=4 sts=0 sw=4 si fenc=utf-8 et: 2 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 3 | # Authors: FL 4 | # Maintainers: FL 5 | # Copyright: YYYY, HRDAG, GPL v2 or later 6 | # ========================================= 7 | # Project-Name/parent-task/core-task/src/script.R 8 | 9 | # ---- dependencies {{{ 10 | library(pacman) 11 | pacman::p_load(argparse, assertr, logger, tidyverse) 12 | #}}} 13 | 14 | # ---- support methods {{{ 15 | get_args <- function() { 16 | parser <- ArgumentParser() 17 | parser$add_argument("--input") 18 | parser$add_argument("--output") 19 | args <- parser$parse_args() 20 | args 21 | } 22 | 23 | set_log <- function(logname) { 24 | log_appender(appender_tee(logname)) 25 | } 26 | 27 | initial_asserts <- function(data) { 28 | data %>% 29 | verify(nrow(.) 
> 100) 30 | } 31 | # }}} 32 | 33 | # ---- main {{{ 34 | logname <- "output/core-task.log" 35 | set_log(logname) 36 | 37 | # arg handling 38 | args <- get_args() 39 | input <- args$input 40 | output <- args$output 41 | 42 | # read data, initial verification 43 | log_info("Loading data", logger="") 44 | dat <- read.ext(input) 45 | initial_asserts(dat) 46 | 47 | # save data 48 | write.parquet(dat, output) 49 | 50 | log_info("done", logger="") 51 | #}}} 52 | 53 | # done. 54 | -------------------------------------------------------------------------------- /templates/sample.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: A report 3 | author: 4 | - "[link to author page](https://hrdag.org/people/)" 5 | date: DD month, YYYY 6 | output: html_document 7 | --- 8 | 9 | # An Rmd document 10 | 11 | ```{r} 12 | x <- 7 13 | x + 14 14 | ``` 15 | 16 | The value of x is `r x` 17 | -------------------------------------------------------------------------------- /templates/sample.py: -------------------------------------------------------------------------------- 1 | # vim: set ts=4 sts=0 sw=4 si fenc=utf-8 et: 2 | # vim: set fdm=marker fmr={{{,}}} fdl=0 foldcolumn=4: 3 | # Authors: FL 4 | # Maintainers: FL 5 | # Copyright: YYYY, HRDAG, GPL v2 or later 6 | # ========================================= 7 | # Project-Name/parent-task/core-task/src/script.py 8 | 9 | # ---- dependencies {{{ 10 | from pathlib import Path 11 | from sys import stdout 12 | import argparse 13 | import logging 14 | import pandas as pd 15 | #}}} 16 | 17 | # ---- support methods {{{ 18 | def initial_asserts(): 19 | return 1 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--input", default=None) 25 | parser.add_argument("--output", default=None) 26 | args = parser.parse_args() 27 | assert Path(args.input).exists() 28 | return args 29 | 30 | 31 | def get_logger(sname, file_name=None): 32 | logger = logging.getLogger(sname) 33 | logger.setLevel(logging.DEBUG) 34 | formatter = logging.Formatter("%(asctime)s - %(levelname)s " + 35 | "- %(message)s", datefmt='%Y-%m-%d %H:%M:%S') 36 | stream_handler = logging.StreamHandler(stdout) 37 | stream_handler.setFormatter(formatter) 38 | logger.addHandler(stream_handler) 39 | if file_name: 40 | file_handler = logging.FileHandler(file_name) 41 | file_handler.setFormatter(formatter) 42 | logger.addHandler(file_handler) 43 | return logger 44 | 45 | 46 | def final_asserts(df): 47 | return 1 48 | #}}} 49 | 50 | # ---- main {{{ 51 | if __name__ == '__main__': 52 | # setup logging 53 | logger = get_logger(__name__, "output/script-name.log") 54 | 55 | # arg handling 56 | args = get_args() 57 | input_f = args.input 58 | output_f = args.output 59 | 60 | # read data, initial verification 61 | logger.info("Loading data.") 62 | raw_df = pd.read_ext(input_f) 63 | initial_asserts(raw_df) 64 | 65 | # do stuff, more verification 66 | final_asserts(raw_df) 67 | 68 | # save data, final verification 69 | raw_df.to_parquet(output_f) 70 | 71 | logger.info("done.") 72 | 73 | #}}} 74 | # done. 75 | --------------------------------------------------------------------------------